Initial commit of economic news event scraping logic
This commit is contained in:
commit
90155a703d
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
*.egg-info/
|
||||||
|
__pycache__/
|
||||||
|
dist/
|
1
news/__init__.py
Normal file
1
news/__init__.py
Normal file
@ -0,0 +1 @@
|
|||||||
|
from .scrape_news import scrape_news, scrape_news_in_range
|
78
news/scrape_news.py
Normal file
78
news/scrape_news.py
Normal file
@ -0,0 +1,78 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from time import sleep
|
||||||
|
from typing import Generator
|
||||||
|
|
||||||
|
def convert_datetime_to_url_format(date: datetime) -> str:
    """Render *date* as the lowercase day token used in ForexFactory calendar URLs.

    Example: datetime(2023, 10, 25) -> 'oct25.2023'.
    """
    token = date.strftime("%b%d.%Y")
    return token.lower()
|
||||||
|
|
||||||
|
def scrape_news(date: datetime) -> pd.DataFrame:
    """Scrape the ForexFactory economic calendar for a single day.

    Parameters
    ----------
    date : datetime
        The day whose calendar events should be scraped.

    Returns
    -------
    pd.DataFrame
        Columns ['Date', 'Time', 'Currency', 'Title'], one row per event.
        On a failed request, an empty DataFrame with the same columns is
        returned (not None), so callers always receive a DataFrame.
    """
    columns = ['Date', 'Time', 'Currency', 'Title']

    base_url = "https://www.forexfactory.com/calendar"
    formatted_date = convert_datetime_to_url_format(date)
    url = f"{base_url}?day={formatted_date}"

    # Set headers to mimic a web browser.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    # timeout= prevents the scraper from hanging forever on a stalled server;
    # RequestException covers connection errors, DNS failures, and timeouts.
    try:
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve news: {exc}")
        return pd.DataFrame(columns=columns)

    if response.status_code != 200:
        print(f"Failed to retrieve news. Status code: {response.status_code}")
        return pd.DataFrame(columns=columns)

    soup = BeautifulSoup(response.text, 'html.parser')
    rows = soup.find_all("tr", class_="calendar__row")

    # Rows that share a time slot omit the time cell, so we carry the last
    # seen time forward.
    previous_time = None

    times = []
    currencies = []
    titles = []

    for row in rows:
        try:
            time = row.find("td", class_="calendar__cell calendar__time").text.strip()
            currency = row.find("td", class_="calendar__cell calendar__currency").text.strip()
            title = row.find("span", class_="calendar__event-title").text.strip()

            # If the time is not present, use the previous time.
            if not time:
                time = previous_time
            else:
                previous_time = time

            print(f"Time: {time}")
            print(f"Currency: {currency}")
            print(f"Title: {title}")
            print("---------------------------")

            times.append(time)
            currencies.append(currency)
            titles.append(title)
        except AttributeError:
            # Header/spacer rows lack the expected cells; skip them.
            print("Failed to scrape the row")
            continue

    news_data = pd.DataFrame({"Time": times, "Currency": currencies, "Title": titles})
    news_data['Date'] = date.strftime('%Y-%m-%d')
    news_data = news_data[columns]
    return news_data
|
||||||
|
|
||||||
|
def scrape_news_in_range(start_date: datetime, end_date: datetime) -> Generator[pd.DataFrame, None, None]:
    """Yield one day's scraped news DataFrame for each day in [start_date, end_date].

    Parameters
    ----------
    start_date, end_date : datetime
        Inclusive range of days to scrape, one request per day.

    Yields
    ------
    pd.DataFrame
        The result of ``scrape_news`` for each day.
    """
    current_date = start_date
    while current_date <= end_date:
        # Bug fix: .date() must be CALLED — the original interpolated the
        # bound method object into the message instead of the date value.
        print(f"Scraping data for {current_date.date()}")
        news_data = scrape_news(current_date)
        yield news_data
        current_date += timedelta(days=1)
        sleep(3)  # Play nice with the server.
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    start_date = datetime(2023, 10, 25)
    end_date = datetime(2023, 10, 25)

    # Note: the original also called scrape_news_in_range() once without
    # iterating it — generators are lazy, so that call did nothing and has
    # been removed.
    for news_data in scrape_news_in_range(start_date, end_date):
        print(news_data)
|
3
requirements.txt
Normal file
3
requirements.txt
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
beautifulsoup4
|
||||||
|
pandas
|
||||||
|
requests
|
Loading…
Reference in New Issue
Block a user