From ca32f51e1a54ffc3db09ffe37b4fa7785dce12ec Mon Sep 17 00:00:00 2001 From: moshferatu Date: Wed, 3 Jul 2024 19:50:08 -0700 Subject: [PATCH] Use cloudscraper module instead of requests in order to bypass Cloudflare bot restrictions --- news/scrape_news.py | 10 +++------- requirements.txt | 4 ++-- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/news/scrape_news.py b/news/scrape_news.py index 0f70a02..e3f9f16 100644 --- a/news/scrape_news.py +++ b/news/scrape_news.py @@ -1,5 +1,5 @@ +import cloudscraper import pandas as pd -import requests from bs4 import BeautifulSoup from datetime import datetime, timedelta @@ -14,12 +14,8 @@ def scrape_news(date: datetime) -> pd.DataFrame: formatted_date = convert_datetime_to_url_format(date) url = f"{base_url}?day={formatted_date}" - # Set headers to mimic a web browser. - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)" - } - - response = requests.get(url, headers=headers) + scraper = cloudscraper.create_scraper() + response = scraper.get(url) if response.status_code != 200: print(f"Failed to retrieve news. Status code: {response.status_code}") return diff --git a/requirements.txt b/requirements.txt index 8d9b348..4d39977 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ beautifulsoup4 -pandas -requests \ No newline at end of file +cloudscraper +pandas \ No newline at end of file