#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Jonathan T. L. Lee
# Contact: jonathan.lee@yale.edu
# Date: 2017-11-18
# Description: A tool to help collect and analyze data from the
# online movie database, IMDB. To do this, we will be using
# the IMDB API, and then returning the data in a JSON
# format. We will be fetching the data for the top 100
# most popular movies, and then writing the data to a CSV
# file.
#
# In addition to this, we will be performing some basic data
# analysis on the movies, such as finding the average
# runtime, and average rating.
#
# This script is intended for educational purposes only, and
# should not be used for any commercial purposes.
#
# Copyright (c) 2017 Jonathan T. L. Lee
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import json
from bs4 import BeautifulSoup
import requests
import sys
import pandas as pd
# Seconds to wait on each HTTP request before giving up. Without a timeout
# requests will block indefinitely on a stalled connection. The value itself
# is a pragmatic choice, not something dictated by the site.
_REQUEST_TIMEOUT = 10


def _find_path(node, *steps):
    """Follow a chain of ``find`` calls and return the final element.

    Each step is a ``(tag_name, criteria)`` pair, where ``criteria`` is a dict
    of keyword arguments forwarded to ``find`` (for example
    ``{'class_': 'title_wrapper'}``). Returns ``None`` as soon as any step of
    the chain has no match, instead of raising ``AttributeError``.
    """
    for tag_name, criteria in steps:
        if node is None:
            return None
        node = node.find(tag_name, **criteria)
    return node


def _text_at(node, *steps):
    """Return the stripped text at the end of a ``find`` chain, or ``None``."""
    element = _find_path(node, *steps)
    if element is None:
        return None
    return element.text.strip()


def _suffixed(text, suffix):
    """Append *suffix* to *text*, passing ``None`` through unchanged."""
    if text is None:
        return None
    return text + suffix


def get_movie_data(movie_title):
    """Search IMDb for *movie_title* and scrape the first result's page.

    The function issues two HTTP GET requests: one to the IMDb ``/find``
    search page and one to the page linked from the first
    ``td.result_text`` search hit. Both responses are parsed with
    BeautifulSoup's ``html.parser``.

    Args:
        movie_title: Title to search for. It is passed to ``requests`` as a
            query parameter, so it is URL-encoded automatically.

    Returns:
        ``None`` (after printing a message) when the search page contains no
        usable result link. Otherwise a dict with the keys ``title``,
        ``release_year``, ``director``, ``genres``, ``runtime``,
        ``main_cast``, ``plot_summary``, ``critical_reception``, ``ratings``
        (a nested dict with ``imdb``, ``rotten_tomatoes``, ``metacritic`` and
        ``google_users``), ``reviews_consensus``, ``google_reviews_summary``,
        ``key_accolades`` and ``fun_fact``. Any scalar field whose element is
        missing from the page is ``None``; list fields are empty lists;
        ``key_accolades`` and ``fun_fact`` fall back to placeholder strings.

    Raises:
        Exceptions raised by ``requests.get`` (connection errors, timeouts)
        propagate to the caller.
    """
    # Let requests build the query string so characters such as '&', '#' or
    # '?' in the title are encoded instead of corrupting the URL. Spaces are
    # still sent as '+', exactly as the previous hand-built URL did.
    response = requests.get(
        'https://www.imdb.com/find',
        params={'q': movie_title},
        timeout=_REQUEST_TIMEOUT
    )
    soup = BeautifulSoup(response.text, 'html.parser')

    # First search hit -> its anchor. A hit without a link is treated the
    # same as no hit at all rather than crashing on a missing attribute.
    result_link = _find_path(soup, ('td', {'class_': 'result_text'}), ('a', {}))
    href = result_link.get('href') if result_link is not None else None
    if not href:
        print('No movie found with that title.')
        return None

    # The href is site-relative, so prefix the host before fetching.
    response = requests.get('https://www.imdb.com' + href,
                            timeout=_REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.text, 'html.parser')

    title = _text_at(soup, ('div', {'class_': 'title_wrapper'}), ('h1', {}))

    # The first and last characters of the year text are dropped
    # (presumably surrounding parentheses -- confirm against the markup).
    release_year = None
    year_text = _text_at(soup, ('span', {'id': 'titleYear'}))
    if year_text is not None:
        try:
            release_year = int(year_text[1:-1])
        except ValueError:
            release_year = None

    rating = _text_at(
        soup,
        ('div', {'class_': 'ratingValue'}),
        ('strong', {}),
        ('span', {'class_': 'rating-bar-value'})
    )
    imdb_rating = _suffixed(rating, '/10')

    director = _text_at(
        soup, ('div', {'class_': 'credit_summary_item'}), ('a', {})
    )

    genres = []
    genre_box = _find_path(soup, ('div', {'class_': 'title_genres'}))
    if genre_box is not None:
        genres = [
            span.text.strip()
            for span in genre_box.find_all('span', class_='itemprop')
        ]

    runtime = _text_at(
        soup, ('div', {'class_': 'technical'}), ('span', {'class_': 'runtime'})
    )

    # Cast names are read from the photo's alt text; only the first four
    # photo cells of the first cast table are considered.
    main_cast = []
    cast_table = _find_path(soup, ('table', {'class_': 'cast_list'}))
    if cast_table is not None:
        for cell in cast_table.find_all('td', class_='primary_photo')[:4]:
            photo = cell.find('img')
            alt_text = photo.get('alt') if photo is not None else None
            if alt_text is not None:
                main_cast.append(alt_text.strip())

    plot_summary = _text_at(
        soup,
        ('div', {'class_': 'plot_summary'}),
        ('p', {'class_': 'plot_summary_item'})
    )

    critical_reception = _text_at(
        soup, ('div', {'class_': 'metacriticScore'}), ('span', {})
    )

    # Third-party scores are optional: each is None when its element is absent.
    rotten_tomatoes_score = _suffixed(
        _text_at(soup, ('span', {'class_': 'tMeterScore'})), '%'
    )
    metacritic_score = _suffixed(
        _text_at(soup, ('a', {'class_': 'metacriticScore'})), '/100'
    )
    google_users_rating = _suffixed(
        _text_at(
            soup,
            ('div', {'class_': 'google-rating-wrapper'}),
            ('span', {'class_': 'rating-bar-value'})
        ),
        '%'
    )

    reviews_consensus = [
        item.text.strip()
        for item in soup.find_all('div', class_='consensusText')
    ]

    # One review per line, each line newline-terminated; empty when the
    # wrapper element is missing.
    reviews_box = _find_path(soup, ('div', {'class_': 'google-reviews-wrapper'}))
    review_items = reviews_box.find_all('li') if reviews_box is not None else []
    google_reviews_summary = ''.join(
        review.text.strip() + '\n' for review in review_items
    )

    # Same newline-terminated layout for awards, with a placeholder when
    # the awards section is missing or lists nothing.
    awards_box = _find_path(soup, ('div', {'class_': 'awards'}))
    award_items = []
    if awards_box is not None:
        award_items = awards_box.find_all('span', class_='award')
    key_accolades = ''.join(
        award.text.strip() + '\n' for award in award_items
    ) or 'None notable'

    fun_fact = _text_at(
        soup,
        ('div', {'id': 'title-extra-item-wrapper'}),
        ('div', {'class_': 'inline'}),
        ('p', {})
    )
    if fun_fact is None:
        fun_fact = 'No fun fact found.'

    return {
        'title': title,
        'release_year': release_year,
        'director': director,
        'genres': genres,
        'runtime': runtime,
        'main_cast': main_cast,
        'plot_summary': plot_summary,
        'critical_reception': critical_reception,
        'ratings': {
            'imdb': imdb_rating,
            'rotten_tomatoes': rotten_tomatoes_score,
            'metacritic': metacritic_score,
            'google_users': google_users_rating
        },
        'reviews_consensus': reviews_consensus,
        'google_reviews_summary': google_reviews_summary,
        'key_accolades': key_accolades,
        'fun_fact': fun_fact
    }
if __name__ == '__main__':
    # Take the title from the command line when one is given (all arguments
    # are joined so an unquoted multi-word title works). With no arguments,
    # fall back to the original example title so a bare run behaves as before.
    movie_title = ' '.join(sys.argv[1:]) or 'Sinterklaas and the Golden Horseshoe'
    movie_data = get_movie_data(movie_title)
    if not movie_data:
        # The lookup already printed why it failed; signal failure to the shell.
        sys.exit(1)
    # Emit the scraped record as pretty-printed JSON on stdout.
    print(json.dumps(movie_data, indent=4))
•Genres: Family, Adventure
When the precious Golden Horseshoe, a symbol of good luck for Sinterklaas and his helpers, is stolen, Sinterklaas and his Pieten must embark on a thrilling adventure to retrieve it before the big Sinterklaas celebration. Their quest takes them through various challenges and intriguing locations as they race against time to recover the stolen artifact and ensure a joyful holiday for all.
This film is part of the long-running Dutch tradition of Sinterklaas films, which are released annually leading up to the Sinterklaas celebration on December 5th.
AI-generated overview · Verify ratings on official sources