news-automation/news/scrape_news.py

78 lines
2.7 KiB
Python
Raw Normal View History

import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from time import sleep
from typing import Generator
def convert_datetime_to_url_format(date: datetime) -> str:
    """Format *date* the way forexfactory.com expects in its URL, e.g. 'oct25.2023'."""
    formatted = date.strftime("%b%d.%Y")
    return formatted.lower()
def scrape_news(date: datetime) -> pd.DataFrame:
    """Scrape the Forex Factory economic calendar for a single day.

    Parameters
    ----------
    date : datetime
        The calendar day to scrape.

    Returns
    -------
    pd.DataFrame
        Columns ``['Date', 'Time', 'Currency', 'Title']``, one row per event.
        Empty (but with the same columns) when the HTTP request fails, so
        callers always receive a DataFrame — the original returned ``None``
        on failure despite the annotation.
    """
    base_url = "https://www.forexfactory.com/calendar"
    formatted_date = convert_datetime_to_url_format(date)
    url = f"{base_url}?day={formatted_date}"

    # Set headers to mimic a web browser; the site blocks default clients.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    columns = ['Date', 'Time', 'Currency', 'Title']

    # Timeout added: the original call could hang indefinitely.
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        print(f"Failed to retrieve news. Status code: {response.status_code}")
        # Return an empty frame instead of None so callers get a uniform type.
        return pd.DataFrame(columns=columns)

    soup = BeautifulSoup(response.text, 'html.parser')
    rows = soup.find_all("tr", class_="calendar__row")

    previous_time = None
    times = []
    currencies = []
    titles = []
    for row in rows:
        try:
            # Renamed from `time` to avoid reading like the stdlib module.
            event_time = row.find("td", class_="calendar__cell calendar__time").text.strip()
            currency = row.find("td", class_="calendar__cell calendar__currency").text.strip()
            title = row.find("span", class_="calendar__event-title").text.strip()
            # The site only prints the time on the first event of a group;
            # carry the previous time forward for subsequent rows.
            if not event_time:
                event_time = previous_time
            else:
                previous_time = event_time
            print(f"Time: {event_time}")
            print(f"Currency: {currency}")
            print(f"Title: {title}")
            print("---------------------------")
            times.append(event_time)
            currencies.append(currency)
            titles.append(title)
        except AttributeError:
            # Header/spacer rows lack one of the cells; skip them.
            print("Failed to scrape the row")
            continue

    news_data = pd.DataFrame({"Time": times, "Currency": currencies, "Title": titles})
    news_data['Date'] = date.strftime('%Y-%m-%d')
    news_data = news_data[columns]
    return news_data
def scrape_news_in_range(start_date: datetime, end_date: datetime) -> Generator[pd.DataFrame, None, None]:
    """Yield one DataFrame of calendar events per day from start_date to end_date inclusive.

    Pauses 3 seconds between requests to avoid hammering the server.
    """
    current_date = start_date
    while current_date <= end_date:
        # Bug fix: .date is a method — the original printed the bound-method
        # object (`<built-in method date ...>`) instead of the actual date.
        print(f"Scraping data for {current_date.date()}")
        news_data = scrape_news(current_date)
        yield news_data
        current_date += timedelta(days=1)
        sleep(3)  # Play nice with the server.
if __name__ == '__main__':
    # Scrape a single day and print each day's DataFrame.
    start_date = datetime(2023, 10, 25)
    end_date = datetime(2023, 10, 25)
    # Bug fix: the original called scrape_news_in_range() once and discarded
    # the generator (a dead statement — generators do nothing until iterated),
    # then called it again in the loop. Only the consumed call is kept.
    for news_data in scrape_news_in_range(start_date, end_date):
        print(news_data)