Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 167 additions & 0 deletions Project2-WebScraping.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 232,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import sys\n",
"import requests\n",
"from pathlib import Path\n",
"\n",
"from bs4 import BeautifulSoup\n",
"\n",
"from selenium import webdriver\n",
"from selenium.webdriver.common.keys import Keys\n",
"from selenium.webdriver.common.action_chains import ActionChains\n",
"from selenium.common.exceptions import NoSuchElementException"
]
},
{
"cell_type": "code",
"execution_count": 233,
"metadata": {},
"outputs": [],
"source": [
"driver = webdriver.Firefox(executable_path=r'C:\\Program Files\\geckodriver-v0.29.1-win64\\geckodriver.exe')"
]
},
{
"cell_type": "code",
"execution_count": 241,
"metadata": {},
"outputs": [],
"source": [
"def get_movies(name: str):\n",
" \n",
" driver = webdriver.Firefox(executable_path=r'C:\\Program Files\\geckodriver-v0.29.1-win64\\geckodriver.exe')\n",
" driver.get(\"https://www.imdb.com/\")\n",
" \n",
" time.sleep(1)\n",
" elem = driver.find_element_by_name(\"q\")\n",
" elem.clear()\n",
" elem.send_keys(name)\n",
" elem.send_keys(Keys.RETURN)\n",
" \n",
" time.sleep(1)\n",
" try:\n",
" element = driver.find_element_by_partial_link_text(name)\n",
" element.click()\n",
" except NoSuchElementException: \n",
" driver.close()\n",
" return 'Could not find actor/actress'\n",
" \n",
" time.sleep(1)\n",
" page_source = driver.page_source\n",
" \n",
" time.sleep(1)\n",
" driver.close()\n",
" \n",
" world_selection = BeautifulSoup(page_source, 'lxml')\n",
" movie_source = world_selection.find('div', class_='filmo-category-section')\n",
" movie_list = re.findall(r'>(.*?)<', str(movie_source))\n",
" movies = [i for i in movie_list if i != '']\n",
" \n",
" return movies\n"
]
},
{
"cell_type": "code",
"execution_count": 243,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Big Bug Man',\n",
" 'completed',\n",
" 'Superman II: The Richard Donner Cut',\n",
" 'The Godfather',\n",
" 'Michael Jackson: You Rock My World',\n",
" 'The Score',\n",
" 'Free Money',\n",
" 'The Brave',\n",
" 'The Island of Dr. Moreau',\n",
" 'Divine Rapture',\n",
" 'Don Juan DeMarco',\n",
" 'Christopher Columbus: The Discovery',\n",
" 'The Freshman',\n",
" 'A Dry White Season',\n",
" 'The Formula',\n",
" 'Apocalypse Now',\n",
" 'Roots: The Next Generations',\n",
" 'Part VII (1960-1967)',\n",
" 'Superman',\n",
" 'The Godfather Saga',\n",
" 'Episode #1.4',\n",
" 'Episode #1.3',\n",
" 'Episode #1.2',\n",
" 'Episode #1.1',\n",
" 'The Missouri Breaks',\n",
" 'Last Tango in Paris',\n",
" 'The Godfather',\n",
" 'The Nightcomers',\n",
" 'Burn!',\n",
" 'The Night of the Following Day',\n",
" 'Candy',\n",
" 'Reflections in a Golden Eye',\n",
" 'A Countess from Hong Kong',\n",
" 'The Appaloosa',\n",
" 'The Chase',\n",
" 'Morituri',\n",
" 'Bedtime Story',\n",
" 'The Ugly American',\n",
" 'Mutiny on the Bounty',\n",
" 'One-Eyed Jacks',\n",
" 'The Fugitive Kind',\n",
" 'The Young Lions',\n",
" 'Sayonara',\n",
" 'The Teahouse of the August Moon',\n",
" 'Guys and Dolls',\n",
" 'Omnibus',\n",
" 'Advice to Bathers',\n",
" 'Désirée',\n",
" 'On the Waterfront',\n",
" 'The Wild One',\n",
" 'Julius Caesar',\n",
" 'Viva Zapata!',\n",
" 'A Streetcar Named Desire',\n",
" 'The Men',\n",
" \"Actor's Studio\",\n",
" \"I'm No Hero\"]"
]
},
"execution_count": 243,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_movies('Marlon Brando')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}