top of page
Web Scraping Box Office Sales
A python code used to web scrape data from Box Office Mojo's Website.
The objective is to obtain weekend box office sales in order to test for a correlation between sci-fi movies and tech stock prices.
Thus, Python code was developed to scrape data from Box Office Mojo's website in a manner that shows the top 10 box office sales for each weekend. Additionally, the genre of each movie was looked up using the OMDb API.
Significant aspects of the code were based on a similar project by Jonathan Bown on Kaggle.
*My actual OMDb API key has been removed from the code below.
import requests
from bs4 import BeautifulSoup
import pandas as pd
from google.colab import files
import re
def scrape_weekend_box_office(weekend_url):
    """Scrape the top movies and their grosses from one weekend chart page.

    Args:
        weekend_url: URL of a Box Office Mojo weekend page.

    Returns:
        A list of ``(movie_name, weekend_gross)`` tuples in table order,
        taken from the first 10 data rows of the first ``<table>`` on the
        page. Both values are raw cell text; the gross is not converted to
        a number. An empty list is returned when the response is not OK or
        the page contains no table.

    Raises:
        requests.RequestException: If the connection fails or times out.
    """
    # A timeout keeps a single stalled connection from hanging the whole run.
    response = requests.get(weekend_url, timeout=30)
    if not response.ok:
        # Treat an error page the same way as a page without a chart.
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table')
    if table is None:
        return []

    # Skip the header row, then keep only the first 10 data rows.
    data_rows = table.find_all('tr')[1:11]

    movies_data = []
    for row in data_rows:
        cols = row.find_all('td')
        # Cell index 2 is read as the title and index 3 as the weekend
        # gross; rows with fewer cells are skipped.
        if len(cols) > 3:
            movie_name = cols[2].get_text(strip=True)
            weekend_gross = cols[3].get_text(strip=True)
            movies_data.append((movie_name, weekend_gross))
    return movies_data
def check_genre(movie_name, api_key):
    """Flag whether OMDb lists a movie as sci-fi, fantasy or action.

    Args:
        movie_name: Title to look up (sent as OMDb's ``t`` parameter).
        api_key: OMDb API key (sent as the ``apikey`` parameter).

    Returns:
        1 if the comma-separated ``Genre`` field of the response contains
        'sci-fi', 'fantasy' or 'action' (compared case-insensitively),
        otherwise 0. Also 0 when the response has no ``Genre`` field or is
        not a JSON object.

    Raises:
        requests.RequestException: If the connection fails or times out.
    """
    target_genres = {'sci-fi', 'fantasy', 'action'}
    params = {'t': movie_name, 'apikey': api_key}

    # HTTPS so the API key is not sent in clear text; the timeout prevents a
    # single slow lookup from blocking the scrape indefinitely.
    response = requests.get(
        'https://www.omdbapi.com/', params=params, timeout=30
    )

    try:
        data = response.json()
    except ValueError:
        # Non-JSON body (e.g. an HTML error page): treat as "genre unknown".
        return 0
    if not isinstance(data, dict):
        return 0

    genre_field = data.get('Genre')
    if not isinstance(genre_field, str):
        return 0

    genres = {genre.strip().lower() for genre in genre_field.split(',')}
    return 1 if genres & target_genres else 0
def scrape_year_weekends(year, api_key):
    """Collect the top weekend movies for every weekend listed for a year.

    Fetches the Box Office Mojo "weekend by year" index, follows each
    distinct weekend link whose URL matches ``<4 digits>W<digits>``, scrapes
    that weekend's chart and tags every movie with a genre indicator from
    OMDb.

    Args:
        year: Year used to build the index URL.
        api_key: OMDb API key forwarded to ``check_genre``.

    Returns:
        A list of dicts, one per movie per weekend, with the keys
        ``weekend_number`` (int parsed from the URL), ``weekend`` (link
        text), ``rank`` (1-based position in the scraped chart),
        ``movie_name``, ``weekend_gross`` (raw text) and
        ``is_action_sci_fi_or_fantasy`` (0 or 1).

    Raises:
        requests.RequestException: If a connection fails or times out.
    """
    base_url = f'https://www.boxofficemojo.com/weekend/by-year/{year}/'
    # A timeout keeps a stalled connection from hanging the run.
    response = requests.get(base_url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.select('td.a-text-left a')

    # Compiled once instead of on every loop iteration.
    weekend_pattern = re.compile(r'(\d{4})W(\d+)')

    all_data = []
    seen_weekends = set()
    # Movies chart for several weekends; cache so each title hits OMDb once.
    genre_cache = {}

    for link in links:
        href = link.get('href')
        if not href:
            # Anchor without a target: nothing to follow.
            continue

        weekend = link.get_text(strip=True)
        # Drop the query string so the same weekend is not visited twice.
        weekend_link = 'https://www.boxofficemojo.com' + href.split('?')[0]

        match = weekend_pattern.search(weekend_link)
        if match is None or weekend_link in seen_weekends:
            continue
        seen_weekends.add(weekend_link)

        weekend_number = int(match.group(2))
        top_movies = scrape_weekend_box_office(weekend_link)
        for rank, (movie_name, weekend_gross) in enumerate(top_movies, start=1):
            if movie_name not in genre_cache:
                genre_cache[movie_name] = check_genre(movie_name, api_key)
            all_data.append({
                'weekend_number': weekend_number,
                'weekend': weekend,
                'rank': rank,
                'movie_name': movie_name,
                'weekend_gross': weekend_gross,
                'is_action_sci_fi_or_fantasy': genre_cache[movie_name],
            })
    return all_data
# Driver script: scrape one year of weekend charts, de-duplicate, sort, write
# a CSV and hand it to the Colab browser download.
omdb_api_key = ' ' # Replace with actual OMDb API key
year = 2019
data = scrape_year_weekends(year, omdb_api_key)

# Naming the columns explicitly keeps the frame well-formed when nothing was
# scraped; otherwise the empty frame has no columns and drop_duplicates /
# sort_values below fail with a KeyError.
df = pd.DataFrame(
    data,
    columns=[
        'weekend_number',
        'weekend',
        'rank',
        'movie_name',
        'weekend_gross',
        'is_action_sci_fi_or_fantasy',
    ],
)
# Keep one row per (weekend number, rank); the first occurrence wins.
df.drop_duplicates(subset=['weekend_number', 'rank'], inplace=True)
df.sort_values(by=['weekend_number', 'rank'], inplace=True)

csv_file = f'weekend_box_office_{year}.csv'
df.to_csv(csv_file, index=False)
# Colab-only helper that triggers a browser download of the file.
files.download(csv_file)
bottom of page