
Google News Scraper

Scrape Google News articles for a particular keyword and date range

You can use the google_news_scraper function by providing a keyword and a date range as inputs. For example, the call below fetches articles matching the keyword "oil prices" published between August 25 and 31, 2023, and saves them to a CSV file:
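
google_news_scraper("oil prices", "2023-08-25", "2023-08-31")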


# Install necessary packages
!pip install selenium
!apt-get update
!apt-get install -y chromium-chromedriver

import re
import time
from datetime import datetime, timedelta
from urllib.parse import quote_plus

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

def convert_relative_date(text, current_datetime):
    """Convert a Google News date string ('2 hours ago', '3 days ago',
    'Yesterday', 'Aug 25') to YYYY-MM-DD."""
    lower = text.lower()
    # Check 'yesterday' first: 'day' is a substring of 'yesterday', so the
    # days-ago branch below would otherwise swallow it and return today's date
    if 'yesterday' in lower:
        return (current_datetime - timedelta(days=1)).strftime('%Y-%m-%d')
    elif 'minute' in lower or 'hour' in lower:
        return current_datetime.strftime('%Y-%m-%d')
    elif 'day' in lower:
        match = re.search(r'\d+', text)
        days_ago = int(match.group()) if match else 0
        return (current_datetime - timedelta(days=days_ago)).strftime('%Y-%m-%d')
    else:
        try:
            # Absolute dates like 'Aug 25' carry no year; assume the current one
            parsed_date = datetime.strptime(text, '%b %d')
            return datetime(current_datetime.year, parsed_date.month,
                            parsed_date.day).strftime('%Y-%m-%d')
        except ValueError:
            return text  # Return the original text if parsing fails
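
# A quick sanity check of the expected conversions (hypothetical inputs,
# assuming the script runs on 2023-08-31, with now = datetime(2023, 8, 31)):
#   convert_relative_date('2 hours ago', now)  -> '2023-08-31'
#   convert_relative_date('3 days ago', now)   -> '2023-08-28'
#   convert_relative_date('Yesterday', now)    -> '2023-08-30'
#   convert_relative_date('Aug 25', now)       -> '2023-08-25'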

def google_news_scraper(keyword, start_date, end_date):
    # Convert start_date and end_date to datetime objects
    start_date = datetime.strptime(start_date, '%Y-%m-%d')
    end_date = datetime.strptime(end_date, '%Y-%m-%d')

    # Set up headless Chrome options for Selenium
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')

    # Initialize the Chrome WebDriver with the specified options;
    # the apt-installed chromedriver is already on PATH, so Selenium finds it
    driver = webdriver.Chrome(options=chrome_options)

    # Build the search URL (quote_plus encodes spaces and special characters)
    query = quote_plus(keyword)
    url = f'https://news.google.com/search?q={query}'
    driver.get(url)

    # Scroll the page to load more articles
    for _ in range(5):  # Adjust the range for more or fewer scrolls
        driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
        time.sleep(2)  # Wait for page to load

    # Get the page source and close the browser
    html = driver.page_source
    driver.quit()

    # Parse the Web Page using BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    articles = soup.find_all('article')

    # Extract the Necessary Information
    news_data = []
    base_url = 'https://news.google.com'
    for article in articles:
        # 'JtKRv' is the obfuscated CSS class Google News currently assigns
        # to title links; it may change without notice and then needs updating
        title_link_element = article.find('a', class_='JtKRv', href=True)
        title = title_link_element.text.strip() if title_link_element else "No Title"
        link = base_url + title_link_element['href'][1:] if title_link_element else "No Link"

        time_element = article.find('time')
        date = time_element.text.strip() if time_element else "No Date"

        news_data.append([title, link, date])

    # Store the Data in a DataFrame
    df = pd.DataFrame(news_data, columns=['Title', 'Link', 'Date'])

    # Convert dates to a standardized YYYY-MM-DD format
    current_datetime = datetime.now()
    df['Date'] = df['Date'].apply(lambda d: convert_relative_date(d, current_datetime))

    # Filter the DataFrame by the provided date range
    def is_valid_date(date_str):
        try:
            return start_date <= datetime.strptime(date_str, '%Y-%m-%d') <= end_date
        except (TypeError, ValueError):
            return False

    filtered_df = df[df['Date'].apply(is_valid_date)]

    # Save the filtered DataFrame to CSV
    csv_file = f'google_news_filtered_{query}.csv'
    filtered_df.to_csv(csv_file, index=False)
    print(f"Filtered articles saved to {csv_file}")

    # Check if running in an environment that supports file download
    try:
        from google.colab import files
        files.download(csv_file)
    except ImportError:
        print(f"Download not supported in this environment. Please manually retrieve the file: {csv_file}")

# Prompt user for input
keyword = input("Enter the search keyword: ")
start_date = input("Enter the start date (YYYY-MM-DD): ")
end_date = input("Enter the end date (YYYY-MM-DD): ")

# Call the function with user input
google_news_scraper(keyword, start_date, end_date)
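
If you would rather skip the interactive prompts, say to collect several keywords in one run, you can also call the function directly. A minimal sketch (the keyword list below is just an illustration):

# Hypothetical batch run: scrape the same date range for several keywords
for kw in ["oil prices", "natural gas", "OPEC"]:
    google_news_scraper(kw, "2023-08-25", "2023-08-31")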

Project GitHub repository: https://github.com/seanxjohn/google_news_scrapper/tree/main
