Use cloudscraper module instead of requests in order to bypass Cloudflare bot restrictions

This commit is contained in:
moshferatu 2024-07-03 19:50:08 -07:00
parent eae6f0db24
commit ca32f51e1a
2 changed files with 5 additions and 9 deletions

View File

@@ -1,5 +1,5 @@
import cloudscraper
import pandas as pd import pandas as pd
import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from datetime import datetime, timedelta from datetime import datetime, timedelta
@@ -14,12 +14,8 @@ def scrape_news(date: datetime) -> pd.DataFrame:
formatted_date = convert_datetime_to_url_format(date) formatted_date = convert_datetime_to_url_format(date)
url = f"{base_url}?day={formatted_date}" url = f"{base_url}?day={formatted_date}"
# Set headers to mimic a web browser. scraper = cloudscraper.create_scraper()
headers = { response = scraper.get(url)
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}
response = requests.get(url, headers=headers)
if response.status_code != 200: if response.status_code != 200:
print(f"Failed to retrieve news. Status code: {response.status_code}") print(f"Failed to retrieve news. Status code: {response.status_code}")
return return

View File

@@ -1,3 +1,3 @@
beautifulsoup4 beautifulsoup4
pandas cloudscraper
requests pandas