commit 90155a703dd2f1a41407577410f0c57a3144286d
Author: moshferatu
Date:   Wed Oct 25 06:29:18 2023 -0700

    Initial commit of economic news event scraping logic

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e4c188f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+*.egg-info/
+__pycache__/
+dist/
\ No newline at end of file
diff --git a/news/__init__.py b/news/__init__.py
new file mode 100644
index 0000000..308c6b1
--- /dev/null
+++ b/news/__init__.py
@@ -0,0 +1 @@
+from .scrape_news import scrape_news, scrape_news_in_range
\ No newline at end of file
diff --git a/news/scrape_news.py b/news/scrape_news.py
new file mode 100644
index 0000000..5005b20
--- /dev/null
+++ b/news/scrape_news.py
@@ -0,0 +1,77 @@
+import pandas as pd
+import requests
+
+from bs4 import BeautifulSoup
+from datetime import datetime, timedelta
+from time import sleep
+from typing import Generator
+
+def convert_datetime_to_url_format(date: datetime) -> str:
+    return date.strftime("%b%d.%Y").lower()
+
+def scrape_news(date: datetime) -> pd.DataFrame:
+    base_url = "https://www.forexfactory.com/calendar"
+    formatted_date = convert_datetime_to_url_format(date)
+    url = f"{base_url}?day={formatted_date}"
+
+    # Set headers to mimic a web browser.
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+    }
+
+    response = requests.get(url, headers=headers)
+    if response.status_code != 200:
+        print(f"Failed to retrieve news. Status code: {response.status_code}")
+        return
+
+    soup = BeautifulSoup(response.text, 'html.parser')
+    rows = soup.find_all("tr", class_="calendar__row")
+
+    previous_time = None
+
+    times = []
+    currencies = []
+    titles = []
+
+    for row in rows:
+        try:
+            time = row.find("td", class_="calendar__cell calendar__time").text.strip()
+            currency = row.find("td", class_="calendar__cell calendar__currency").text.strip()
+            title = row.find("span", class_="calendar__event-title").text.strip()
+
+            # If the time is not present, use the previous time.
+            if not time:
+                time = previous_time
+            else:
+                previous_time = time
+
+            print(f"Time: {time}")
+            print(f"Currency: {currency}")
+            print(f"Title: {title}")
+            print("---------------------------")
+
+            times.append(time)
+            currencies.append(currency)
+            titles.append(title)
+        except AttributeError:
+            print("Failed to scrape the row")
+            continue
+    news_data = pd.DataFrame({"Time": times, "Currency": currencies, "Title": titles})
+    news_data['Date'] = date.strftime('%Y-%m-%d')
+    news_data = news_data[['Date', 'Time', 'Currency', 'Title']]
+    return news_data
+
+def scrape_news_in_range(start_date: datetime, end_date: datetime) -> Generator[pd.DataFrame, None, None]:
+    current_date = start_date
+    while current_date <= end_date:
+        print(f"Scraping data for {current_date.date()}")
+        news_data = scrape_news(current_date)
+        yield news_data
+        current_date += timedelta(days=1)
+        sleep(3)  # Play nice with the server.
+
+if __name__ == '__main__':
+    start_date = datetime(2023, 10, 25)
+    end_date = datetime(2023, 10, 25)
+    for news_data in scrape_news_in_range(start_date, end_date):
+        print(news_data)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..8d9b348
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+beautifulsoup4
+pandas
+requests
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..55583b1
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,12 @@
+from setuptools import setup, find_packages
+
+setup(
+    name="news",
+    version="1.0",
+    packages=find_packages(),
+    install_requires=[
+        'beautifulsoup4',
+        'pandas',
+        'requests'
+    ]
+)
\ No newline at end of file
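
A possible way to use the package once installed (a sketch, not part of the commit): iterate over the generator returned by scrape_news_in_range, skip days where the request failed, and concatenate the per-day DataFrames into one table. The date range and the "news.csv" output path below are illustrative assumptions, not anything defined by the commit.

    from datetime import datetime

    import pandas as pd

    from news import scrape_news_in_range

    # Illustrative date range; any range accepted by scrape_news_in_range works.
    start_date = datetime(2023, 10, 1)
    end_date = datetime(2023, 10, 7)

    # scrape_news returns None when a request fails, so filter those days out.
    daily_frames = [
        frame for frame in scrape_news_in_range(start_date, end_date)
        if frame is not None and not frame.empty
    ]

    if daily_frames:
        all_news = pd.concat(daily_frames, ignore_index=True)
        all_news.to_csv("news.csv", index=False)  # Hypothetical output path.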