top of page
Scraping oil-related articles
Run in Python via Google Colab
# Install and set up necessary packages and dependencies
!pip install selenium
!apt-get update
!apt install chromium-chromedriver
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
# Set up Chrome options for Selenium
chrome_options = Options()
chrome_options.add_argument('--headless')  # run without a visible browser window
chrome_options.add_argument('--no-sandbox')  # needed to run Chrome inside the Colab container
chrome_options.add_argument('--disable-dev-shm-usage')  # avoid /dev/shm exhaustion in containers
# Initialize the Chrome WebDriver with the specified options
driver = webdriver.Chrome(options=chrome_options)
# Fetch the Web Page
url = 'https://news.google.com/search?q=oil%20prices'
driver.get(url)
# Get the page source and close the browser
html = driver.page_source
driver.quit()
# Parse the rendered page and collect (title, link, date) for each article card.
soup = BeautifulSoup(html, 'html.parser')
base_url = 'https://news.google.com'

def _extract_card(card):
    """Return [title, link, date] for one <article> tag, with placeholder fallbacks."""
    anchor = card.find('a', class_='JtKRv', href=True)
    if anchor:
        headline = anchor.text.strip()
        # hrefs are relative ('./articles/...'); drop the leading '.' before joining.
        target = base_url + anchor['href'][1:]
    else:
        headline, target = "No Title", "No Link"
    stamp = card.find('time')
    if stamp is None:
        published = "No Date"
    elif 'datetime' in stamp.attrs:
        # Prefer the machine-readable ISO timestamp when present.
        published = stamp['datetime']
    else:
        published = stamp.text.strip()
    return [headline, target, published]

news_data = [_extract_card(card) for card in soup.find_all('article')]
# Store the Data in a DataFrame
df = pd.DataFrame(news_data, columns=['Title', 'Link', 'Date'])
csv_file = 'google_news_oil_prices.csv'
df.to_csv(csv_file, index=False)
# Download the file to your computer (only works in Google Colab)
try:
    # google.colab is only importable inside a Colab runtime.
    from google.colab import files
    files.download(csv_file)
except ImportError:
    # Running outside Colab: skip the browser download, CSV stays on disk.
    print("The files module is not available. This code is not running in Google Colab.")
Future Projects:
Relation between the frequency of oil-related posts and sustainability risks
Relation between the frequency of oil-related posts and stock prices (general & oil-producing/oil-intensive firms)
Updated Code
# Install and set up necessary packages and dependencies
!pip install selenium
!apt-get update
!apt install chromium-chromedriver
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime, timedelta
import re
# Function to convert various date formats to a standardized format
def convert_relative_date(text):
    """Normalize a Google News timestamp string to 'YYYY-MM-DD'.

    Handles relative forms ('45 minutes ago', '2 hours ago', 'Yesterday',
    '3 days ago') and absolute forms like 'Dec 5' (assumed current year).
    Returns the input unchanged when no known format matches.
    """
    current_datetime = datetime.now()
    lowered = text.lower()
    # BUGFIX: 'yesterday' must be tested BEFORE 'day' — the substring 'day'
    # occurs inside 'yesterday', so the generic day-branch used to swallow it
    # and (finding no digits) wrongly return today's date.
    if 'yesterday' in lowered:
        return (current_datetime - timedelta(days=1)).strftime('%Y-%m-%d')
    # Anything under a day old still maps to today's date.
    # ('minute'/'hour' also match their plurals as substrings.)
    if 'minute' in lowered or 'hour' in lowered:
        return current_datetime.strftime('%Y-%m-%d')
    if 'day' in lowered:
        match = re.search(r'\d+', text)
        days_ago = int(match.group()) if match else 0
        return (current_datetime - timedelta(days=days_ago)).strftime('%Y-%m-%d')
    try:
        # Absolute dates like 'Dec 5' carry no year; assume the current one.
        parsed_date = datetime.strptime(text, '%b %d')
        return datetime(current_datetime.year, parsed_date.month, parsed_date.day).strftime('%Y-%m-%d')
    except ValueError:
        return text  # Return the original text if parsing fails
# Set up Chrome options for Selenium
chrome_options = Options()
chrome_options.add_argument('--headless')  # run without a visible browser window
chrome_options.add_argument('--no-sandbox')  # needed to run Chrome inside the Colab container
chrome_options.add_argument('--disable-dev-shm-usage')  # avoid /dev/shm exhaustion in containers
# Initialize the Chrome WebDriver with the specified options
driver = webdriver.Chrome(options=chrome_options)
# Fetch the Web Page
url = 'https://news.google.com/search?q=oil%20prices'
driver.get(url)
# Scroll the page to load more articles (Google News lazy-loads results on scroll)
for _ in range(5): # Adjust the range for more or fewer scrolls
    driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
    time.sleep(2) # Wait for page to load
# Get the page source and close the browser
html = driver.page_source
driver.quit()
# Parse the rendered page and collect one [title, link, date] row per article card.
soup = BeautifulSoup(html, 'html.parser')
base_url = 'https://news.google.com'
news_data = []
for card in soup.find_all('article'):
    anchor = card.find('a', class_='JtKRv', href=True)
    if anchor:
        headline = anchor.text.strip()
        # hrefs are relative ('./articles/...'); drop the leading '.' before joining.
        target = base_url + anchor['href'][1:]
    else:
        headline, target = "No Title", "No Link"
    stamp = card.find('time')
    published = stamp.text.strip() if stamp else "No Date"
    news_data.append([headline, target, published])
# Store the Data in a DataFrame
df = pd.DataFrame(news_data, columns=['Title', 'Link', 'Date'])
# Convert dates to a standardized format.
# Whole-column Series.apply replaces the original iterrows()/df.at per-row
# mutation, which is slow and an anti-pattern for column-wise transforms.
df['Date'] = df['Date'].apply(convert_relative_date)
# Save the DataFrame to CSV
csv_file = 'google_news_oil_prices.csv'
df.to_csv(csv_file, index=False)
# Download the file to your computer (only works in Google Colab)
try:
    # google.colab is only importable inside a Colab runtime.
    from google.colab import files
except ImportError:
    print("The files module is not available. This code is not running in Google Colab.")
else:
    # Inside Colab: trigger a browser download of the CSV.
    files.download(csv_file)
bottom of page