top of page

Google News Scrapper
Scrape Google News articles for a particular keyword and date range.
You can use the google_news_scraper function by providing the keyword and date range as inputs. For example, google_news_scraper("oil prices", "2023-08-25", "2023-08-31") will fetch articles matching the keyword "oil prices" that were published between August 25 and 31, 2023, and save them to a CSV file.
# Install necessary packages
!pip install selenium
!apt-get update
!apt install chromium-chromedriver
import sys
import pandas as pd
from datetime import datetime, timedelta
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time
def convert_relative_date(text, current_datetime):
    """Convert a Google News timestamp label to a 'YYYY-MM-DD' string.

    Handles relative labels ("5 hours ago", "3 days ago", "10 minutes ago",
    "Yesterday") and absolute labels ("Aug 25", "Aug 25, 2022").

    Args:
        text: The label text taken from the article's <time> element.
        current_datetime: The datetime that relative labels are measured from.

    Returns:
        The date formatted as 'YYYY-MM-DD', or ``text`` unchanged when it
        cannot be interpreted.
    """
    date_format = '%Y-%m-%d'
    lowered = text.lower()

    # "yesterday" must be tested before "day": the word itself contains the
    # substring "day" and would otherwise be treated as "0 days ago".
    if 'yesterday' in lowered:
        return (current_datetime - timedelta(days=1)).strftime(date_format)

    match = re.search(r'\d+', text)
    amount = int(match.group()) if match else 0

    # Subtract the real offset so that e.g. "5 hours ago" seen at 02:00
    # resolves to the previous calendar day.
    if 'hour' in lowered:
        return (current_datetime - timedelta(hours=amount)).strftime(date_format)
    if 'day' in lowered:
        return (current_datetime - timedelta(days=amount)).strftime(date_format)
    if 'minute' in lowered:
        return (current_datetime - timedelta(minutes=amount)).strftime(date_format)

    cleaned = text.strip()

    # Absolute date that already carries its year, e.g. "Dec 5, 2022".
    try:
        return datetime.strptime(cleaned, '%b %d, %Y').strftime(date_format)
    except ValueError:
        pass

    # Absolute date without a year, e.g. "Aug 25". Parse it together with the
    # current year (rather than strptime's default of 1900) so that "Feb 29"
    # is accepted in leap years.
    current_year = current_datetime.year
    try:
        parsed = datetime.strptime(f'{cleaned} {current_year}', '%b %d %Y')
    except ValueError:
        return text  # Return the original text if parsing fails

    # An article cannot be published in the future; a month/day later than
    # today must belong to the previous year.
    if parsed.date() > current_datetime.date():
        try:
            parsed = parsed.replace(year=current_year - 1)
        except ValueError:
            # Feb 29 has no counterpart in the previous (non-leap) year.
            return text
    return parsed.strftime(date_format)
def google_news_scraper(keyword, start_date, end_date):
    """Scrape Google News search results for a keyword and save them to CSV.

    Loads the Google News search page for ``keyword`` in headless Chrome,
    scrolls to load more results, extracts each article's title, link and
    date, keeps the articles dated within ``[start_date, end_date]``
    (inclusive), and writes them to ``google_news_filtered_<query>.csv``.
    When running in Google Colab the file is also offered for download.

    Args:
        keyword: Search phrase, e.g. "oil prices".
        start_date: First accepted date, formatted 'YYYY-MM-DD'.
        end_date: Last accepted date, formatted 'YYYY-MM-DD'.

    Raises:
        ValueError: If ``start_date`` or ``end_date`` is not 'YYYY-MM-DD'.
    """
    # Local import: the file's top-level import block is left untouched.
    from urllib.parse import quote_plus

    # Convert start_date and end_date to datetime objects
    start_date = datetime.strptime(start_date, '%Y-%m-%d')
    end_date = datetime.strptime(end_date, '%Y-%m-%d')

    # Set up Chrome options for Selenium
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    # NOTE(review): kept from the original. sys.path governs Python imports,
    # so this probably does not help Selenium find chromedriver - confirm.
    sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')

    # `query` names the output file; the URL gets a properly percent-encoded
    # copy so characters such as '&', '#' or '?' cannot corrupt the request.
    words = keyword.split()
    query = '+'.join(words)
    url = f"https://news.google.com/search?q={quote_plus(' '.join(words))}"

    # Initialize the Chrome WebDriver with the specified options
    driver = webdriver.Chrome(options=chrome_options)
    try:
        # Fetch the Web Page
        driver.get(url)
        # Scroll the page to load more articles
        for _ in range(5):  # Adjust the range for more or fewer scrolls
            driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
            time.sleep(2)  # Wait for page to load
        html = driver.page_source
    finally:
        # Always release the browser, even if navigation or scrolling fails.
        driver.quit()

    # Parse the Web Page using BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    articles = soup.find_all('article')

    # Extract the Necessary Information
    base_url = 'https://news.google.com'
    current_datetime = datetime.now()
    news_data = []
    for article in articles:
        # NOTE(review): 'JtKRv' looks like a generated class name from the
        # Google News markup and may change - verify against the live page.
        title_link_element = article.find('a', class_='JtKRv', href=True)
        if title_link_element:
            title = title_link_element.text.strip()
            # Drop the first character of the href before joining
            # (presumably the '.' of a './articles/...' relative link).
            link = base_url + title_link_element['href'][1:]
        else:
            title = "No Title"
            link = "No Link"

        time_element = article.find('time')
        date = time_element.text.strip() if time_element else "No Date"
        # Convert dates to a standardized format (empty labels are left as-is)
        if date:
            date = convert_relative_date(date, current_datetime)
        news_data.append([title, link, date])

    # Store the Data in a DataFrame
    df = pd.DataFrame(news_data, columns=['Title', 'Link', 'Date'])

    # Filter the DataFrame by the provided date range
    def is_valid_date(date_str):
        """Return True when date_str is a 'YYYY-MM-DD' date inside the range."""
        try:
            return start_date <= datetime.strptime(date_str, '%Y-%m-%d') <= end_date
        except (TypeError, ValueError):
            return False

    # Coerce to bool so the mask stays a boolean indexer even when there are
    # no rows; the CSV then still gets its header line.
    in_range = df['Date'].map(is_valid_date).astype(bool)
    filtered_df = df[in_range]

    # Save the filtered DataFrame to CSV
    csv_file = f'google_news_filtered_{query}.csv'
    filtered_df.to_csv(csv_file, index=False)
    print(f"Filtered articles saved to {csv_file}")

    # Check if running in an environment that supports file download
    try:
        from google.colab import files
        files.download(csv_file)
    except ImportError:
        print(f"Download not supported in this environment. Please manually retrieve the file: {csv_file}")
# Prompt only when executed directly (script or notebook cell), so importing
# this module does not block on input().
if __name__ == "__main__":
    # Prompt user for input; strip stray whitespace, which would otherwise
    # make the 'YYYY-MM-DD' date parsing fail.
    keyword = input("Enter the search keyword: ").strip()
    start_date = input("Enter the start date (YYYY-MM-DD): ").strip()
    end_date = input("Enter the end date (YYYY-MM-DD): ").strip()
    # Call the function with user input
    google_news_scraper(keyword, start_date, end_date)
Project Github repository: https://github.com/seanxjohn/google_news_scrapper/tree/main
bottom of page