Initial commit of economic news event scraping logic

2023-10-25 06:29:18 -07:00 · 2023-10-25 06:29:18 -07:00 · 90155a703d
commit 90155a703d
5 changed files with 97 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,3 @@
+*.egg-info/
+__pycache__/
+dist/
--- a/news/__init__.py
+++ b/news/__init__.py
@ -0,0 +1 @@
+from .scrape_news import scrape_news, scrape_news_in_range
--- a/news/scrape_news.py
+++ b/news/scrape_news.py
@ -0,0 +1,78 @@
+import pandas as pd
+import requests
+
+from bs4 import BeautifulSoup
+from datetime import datetime, timedelta
+from time import sleep
+from typing import Generator
+
+def convert_datetime_to_url_format(date: datetime) -> str:
+    """Render *date* as the lowercase ``<mon><dd>.<yyyy>`` calendar URL token.
+
+    For example, 25 October 2023 becomes ``"oct25.2023"``. The month
+    abbreviation comes from ``%b`` and therefore follows the active locale.
+    """
+    formatted = format(date, "%b%d.%Y")
+    return formatted.lower()
+
+def scrape_news(date: datetime) -> pd.DataFrame:
+    """Scrape the Forex Factory calendar page for a single day.
+
+    Args:
+        date: Day whose calendar page is requested.
+
+    Returns:
+        A DataFrame with the columns ``Date``, ``Time``, ``Currency`` and
+        ``Title``, one row per calendar row that could be parsed. ``Date`` is
+        *date* formatted as ``YYYY-MM-DD``; the other columns hold the text
+        scraped from the page. If the server does not answer with HTTP 200,
+        an empty frame with the same columns is returned.
+
+    Raises:
+        requests.exceptions.RequestException: If the request fails or does
+            not complete within the timeout.
+    """
+    columns = ['Date', 'Time', 'Currency', 'Title']
+
+    base_url = "https://www.forexfactory.com/calendar"
+    formatted_date = convert_datetime_to_url_format(date)
+    url = f"{base_url}?day={formatted_date}"
+
+    # Set headers to mimic a web browser.
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+    }
+
+    # Without a timeout a stalled connection would block the scraper forever.
+    response = requests.get(url, headers=headers, timeout=30)
+    if response.status_code != 200:
+        print(f"Failed to retrieve news. Status code: {response.status_code}")
+        # Honour the declared return type instead of returning a bare None.
+        return pd.DataFrame(columns=columns)
+
+    soup = BeautifulSoup(response.text, 'html.parser')
+    rows = soup.find_all("tr", class_="calendar__row")
+
+    previous_time = None
+
+    times = []
+    currencies = []
+    titles = []
+
+    for row in rows:
+        try:
+            # find() returns None when a cell is missing, so the attribute
+            # access below raises AttributeError for rows that are not events.
+            event_time = row.find("td", class_="calendar__cell calendar__time").text.strip()
+            currency = row.find("td", class_="calendar__cell calendar__currency").text.strip()
+            title = row.find("span", class_="calendar__event-title").text.strip()
+        except AttributeError:
+            print("Failed to scrape the row")
+            continue
+
+        # If the time is not present, use the previous time.
+        if not event_time:
+            event_time = previous_time
+        else:
+            previous_time = event_time
+
+        print(f"Time: {event_time}")
+        print(f"Currency: {currency}")
+        print(f"Title: {title}")
+        print("---------------------------")
+
+        times.append(event_time)
+        currencies.append(currency)
+        titles.append(title)
+
+    news_data = pd.DataFrame({"Time": times, "Currency": currencies, "Title": titles})
+    news_data['Date'] = date.strftime('%Y-%m-%d')
+    return news_data[columns]
+
+def scrape_news_in_range(start_date: datetime, end_date: datetime) -> Generator[pd.DataFrame, None, None]:
+    """Yield the scraped calendar for each day from *start_date* to *end_date*.
+
+    Both bounds are inclusive. Days are scraped lazily, one per iteration,
+    with a 3-second pause between consecutive requests.
+
+    Args:
+        start_date: First day to scrape.
+        end_date: Last day to scrape; nothing is yielded if it precedes
+            *start_date*.
+
+    Yields:
+        The result of ``scrape_news`` for each day, in chronological order.
+    """
+    current_date = start_date
+    while current_date <= end_date:
+        # date() must be called; the bare attribute prints a method repr.
+        print(f"Scraping data for {current_date.date()}")
+        yield scrape_news(current_date)
+        current_date += timedelta(days=1)
+        if current_date <= end_date:
+            sleep(3) # Play nice with the server; no need to wait after the last day.
+
+if __name__ == '__main__':
+    # Manual smoke run: scrape a single day and print the resulting frame.
+    start_date = datetime(2023, 10, 25)
+    end_date = datetime(2023, 10, 25)
+    # scrape_news_in_range is a generator, so it only does work when iterated;
+    # a bare call whose result is discarded would be a no-op.
+    for news_data in scrape_news_in_range(start_date, end_date):
+        print(news_data)
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,3 @@
+beautifulsoup4
+pandas
+requests
--- a/setup.py
+++ b/setup.py
@ -0,0 +1,12 @@
+from setuptools import setup, find_packages
+
+# Runtime dependencies (the same three packages listed in requirements.txt).
+_INSTALL_REQUIRES = ["beautifulsoup4", "pandas", "requests"]
+
+# Package metadata; find_packages() discovers the packages to ship.
+setup(
+    name="news",
+    version="1.0",
+    packages=find_packages(),
+    install_requires=_INSTALL_REQUIRES,
+)
				`@ -0,0 +1 @@`
				`from .scrape_news import scrape_news, scrape_news_in_range`