diff --git a/your-code/.ipynb_checkpoints/main-checkpoint.ipynb b/your-code/.ipynb_checkpoints/main-checkpoint.ipynb
index 812f7a4..256b315 100644
--- a/your-code/.ipynb_checkpoints/main-checkpoint.ipynb
+++ b/your-code/.ipynb_checkpoints/main-checkpoint.ipynb
@@ -40,7 +40,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -48,12 +48,13 @@
"from bs4 import BeautifulSoup\n",
"import pandas as pd\n",
"# from pprint import pprint\n",
- "# from lxml import html\n",
- "# from lxml.html import fromstring\n",
+ "from lxml import html\n",
+ "from lxml.html import fromstring\n",
"# import urllib.request\n",
"# from urllib.request import urlopen\n",
- "# import random\n",
- "# import re\n",
+ "import random\n",
+ "import re\n",
+ "import html5lib\n",
"# import scrapy"
]
},
@@ -64,25 +65,6 @@
"#### Download, parse (using BeautifulSoup), and print the content from the Trending Developers page from GitHub:"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# This is the url you will scrape in this exercise\n",
- "url = 'https://github.com/trending/developers'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#your code"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -134,11 +116,108 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 361,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This is the url you will scrape in this exercise\n",
+ "url = 'https://github.com/trending/developers'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 362,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code\n",
+ "conts = requests.get(url).content"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 363,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "git_soup = BeautifulSoup(conts, 'html5lib')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 364,
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "names = git_soup.find_all('h1', {\"class\":\"h3 lh-condensed\"})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 365,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nick = git_soup.find_all('a', {'class':\"link-gray\"})\n",
+ "nick = nick[19:44]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 366,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "nicks = [i.text for i in nick]\n",
+ "names_s = [i.text for i in names]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 367,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Eric Ma (ericmjl)',\n",
+ " 'Federico Brigante (bfred-it)',\n",
+ " 'Kyle Roach (iRoachie)',\n",
+ " 'Olle Jonsson (olleolleolle)',\n",
+ " 'Nikita Sobolev (sobolevn)',\n",
+ " 'Frank S. Thomas (fthomas)',\n",
+ " 'syuilo (syuilo)',\n",
+ " 'Ives van Hoorne (CompuIves)',\n",
+ " 'Paulus Schoutsen (balloob)',\n",
+ " 'Sarah Drasner (sdras)',\n",
+ " 'Stefanos Kornilios Mitsis Poiitidis (skmp)',\n",
+ " 'Jan Hovancik (hovancik)',\n",
+ " 'Andreas Mueller (amueller)',\n",
+ " 'Guillaume Gomez (GuillaumeGomez)',\n",
+ " 'Matt Holt (mholt)',\n",
+ " 'Clifford Wolf (cliffordwolf)',\n",
+ " 'Franck Nijhof (frenck)',\n",
+ " 'Joe Block (unixorn)',\n",
+ " 'Andrei Neagoie (aneagoie)',\n",
+ " 'Jack Lloyd (randombit)',\n",
+ " 'Guillermo Rauch (rauchg)',\n",
+ " 'Tim Griesser (tgriesser)',\n",
+ " 'Jameson Nash (vtjnash)',\n",
+ " 'Anderson Banihirwe (andersy005)',\n",
+ " 'Danny Ryan (djrtwo)']"
+ ]
+ },
+ "execution_count": 367,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "for i in range(len(names_s)):\n",
+ " names_s[i] = names_s[i] +\" \"+ \"(\" + nicks[i] + \")\"\n",
+ "names_s"
]
},
{
@@ -152,7 +231,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 368,
"metadata": {},
"outputs": [],
"source": [
@@ -162,11 +241,81 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 369,
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "#your code\n",
+ "repos = requests.get(url).content\n",
+ "repo_soup = BeautifulSoup(repos, 'html5lib')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 370,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "reposi = repo_soup.find_all('h1', {\"class\":\"h3 lh-condensed\"})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 371,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "repos = [i.text for i in reposi]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 372,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['gto76/python-cheatsheet',\n",
+ " 'j3ssie/Osmedeus',\n",
+ " 'tangzixiang0304/Shielded_detector',\n",
+ " 'uber/ludwig',\n",
+ " 'xinshuoweng/AB3DMOT',\n",
+ " 'NVlabs/stylegan',\n",
+ " 'dagster-io/dagster',\n",
+ " 'tensorflow/models',\n",
+ " 'eragonruan/text-detection-ctpn',\n",
+ " 'sherlock-project/sherlock',\n",
+ " 'deepfakes/faceswap',\n",
+ " 'nbei/Deep-Flow-Guided-Video-Inpainting',\n",
+ " 'iovisor/bcc',\n",
+ " 'Roibal/Cryptocurrency-Trading-Bots-Python-Beginner-Advance',\n",
+ " 'NVIDIA/DeepLearningExamples',\n",
+ " 'BlackHC/tfpyth',\n",
+ " 'clovaai/deep-text-recognition-benchmark',\n",
+ " 'tkat0/PyTorch_BlazeFace',\n",
+ " 'OpenMined/PySyft',\n",
+ " 'CoreyMSchafer/code_snippets',\n",
+ " 'public-apis/public-apis',\n",
+ " 'd2l-ai/d2l-zh',\n",
+ " 'apache/airflow',\n",
+ " 'beecost/bee-university',\n",
+ " 'sundowndev/PhoneInfoga']"
+ ]
+ },
+ "execution_count": 372,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "repos = [i.replace('\\n', '') for i in repos]\n",
+ "repos = [i.replace(' ', '') for i in repos]\n",
+ "repos"
]
},
{
@@ -178,7 +327,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 373,
"metadata": {},
"outputs": [],
"source": [
@@ -188,11 +337,87 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 374,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "#your code\n",
+ "Walt = requests.get(url).content\n",
+ "walt_soup = BeautifulSoup(Walt, 'html')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 375,
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "imgs = walt_soup.find_all('div', {\"class\":\"thumbinner\"})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 376,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['/wiki/File:Walt_Disney_envelope_ca._1921.jpg',\n",
+ " '/wiki/File:Walt_Disney_envelope_ca._1921.jpg',\n",
+ " '//upload.wikimedia.org/wikipedia/commons/4/4d/Newman_Laugh-O-Gram_%281921%29.webm',\n",
+ " '/wiki/File:Newman_Laugh-O-Gram_(1921).webm',\n",
+ " '/wiki/File:Trolley_Troubles_poster.jpg',\n",
+ " '/wiki/File:Trolley_Troubles_poster.jpg',\n",
+ " '/wiki/Trolley_Troubles',\n",
+ " '/wiki/File:Walt_Disney_and_his_cartoon_creation_%22Mickey_Mouse%22_-_National_Board_of_Review_Magazine.jpg',\n",
+ " '/wiki/File:Walt_Disney_and_his_cartoon_creation_%22Mickey_Mouse%22_-_National_Board_of_Review_Magazine.jpg',\n",
+ " '/wiki/Mickey_Mouse',\n",
+ " '/wiki/File:Steamboat-willie.jpg',\n",
+ " '/wiki/File:Steamboat-willie.jpg',\n",
+ " '/wiki/Mickey_Mouse',\n",
+ " '/wiki/Steamboat_Willie',\n",
+ " '/wiki/File:Walt_Disney_1935.jpg',\n",
+ " '/wiki/File:Walt_Disney_1935.jpg',\n",
+ " '/wiki/File:Walt_Disney_Snow_white_1937_trailer_screenshot_(13).jpg',\n",
+ " '/wiki/File:Walt_Disney_Snow_white_1937_trailer_screenshot_(13).jpg',\n",
+ " '/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)',\n",
+ " '/wiki/File:Disney_drawing_goofy.jpg',\n",
+ " '/wiki/File:Disney_drawing_goofy.jpg',\n",
+ " '/wiki/Goofy',\n",
+ " '/wiki/File:DisneySchiphol1951.jpg',\n",
+ " '/wiki/File:DisneySchiphol1951.jpg',\n",
+ " '/wiki/File:WaltDisneyplansDisneylandDec1954.jpg',\n",
+ " '/wiki/File:WaltDisneyplansDisneylandDec1954.jpg',\n",
+ " '/wiki/Disneyland',\n",
+ " '/wiki/Orange_County,_California',\n",
+ " '/wiki/File:Walt_disney_portrait_right.jpg',\n",
+ " '/wiki/File:Walt_disney_portrait_right.jpg',\n",
+ " '/wiki/File:Walt_Disney_Grave.JPG',\n",
+ " '/wiki/File:Walt_Disney_Grave.JPG',\n",
+ " '/wiki/File:Roy_O._Disney_with_Company_at_Press_Conference.jpg',\n",
+ " '/wiki/File:Roy_O._Disney_with_Company_at_Press_Conference.jpg',\n",
+ " '/wiki/Roy_O._Disney',\n",
+ " '/wiki/File:Disney_Display_Case.JPG',\n",
+ " '/wiki/File:Disney_Display_Case.JPG',\n",
+ " '/wiki/The_Walt_Disney_Family_Museum',\n",
+ " '/wiki/File:Disney1968.jpg',\n",
+ " '/wiki/File:Disney1968.jpg']"
+ ]
+ },
+ "execution_count": 376,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "l = []\n",
+ "for i in range(len(imgs)):\n",
+ " for img in (imgs[i].find_all('a', href=True)):\n",
+ " l.append(img['href'])\n",
+ "l"
]
},
{
@@ -209,7 +434,20 @@
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url ='https://en.wikipedia.org/wiki/Python' "
+ "url ='https://en.wikipedia.org/wiki/Python' \n",
+ "html = requests.get(url).content"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code\n",
+ "soup = BeautifulSoup(html, 'lxml')\n",
+ "ul = soup.find_all('ul')\n",
+ "ul= ul[2:14]"
]
},
{
@@ -218,7 +456,8 @@
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "l = [img['href'] for i in range(len(ul)) for img in (ul[i].find_all('a', href=True)) ] \n",
+ "l"
]
},
{
@@ -235,7 +474,8 @@
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url = 'http://uscode.house.gov/download/download.shtml'"
+ "url = 'http://uscode.house.gov/download/download.shtml'\n",
+ "html = requests.get(url).content"
]
},
{
@@ -244,7 +484,10 @@
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "#your code\n",
+ "soup = BeautifulSoup(html, 'lxml')\n",
+ "#soup\n",
+ "titles = soup.find_all('div', {'class':'usctitlechanged'})"
]
},
{
@@ -261,7 +504,19 @@
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url = 'https://www.fbi.gov/wanted/topten'"
+ "url = 'https://www.fbi.gov/wanted/topten'\n",
+ "html = requests.get(url).content"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code \n",
+ "soup = BeautifulSoup(html, 'lxml')\n",
+ "titles = soup.find_all('h3', {'class':'title'})"
]
},
{
@@ -270,7 +525,8 @@
"metadata": {},
"outputs": [],
"source": [
- "#your code "
+ "buscados = [nombre.text.replace(\"\\n\", \"\") for nombre in titles]\n",
+ "buscados"
]
},
{
@@ -287,7 +543,60 @@
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url = 'https://www.emsc-csem.org/Earthquake/'"
+ "url = 'https://www.emsc-csem.org/Earthquake/'\n",
+ "html = requests.get(url).content"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code\n",
+ "soup = BeautifulSoup(html, 'lxml')\n",
+ "\n",
+ "datetime = soup.find_all('td', {'class':'tabev6'})\n",
+ "lyl = soup.find_all('td', {'class':'tabev1'})\n",
+ "region = soup.find_all('td', {'class':'tb_region'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "limpdate =[str(fecha.text.replace('\\xa0', ' ')) for fecha in datetime]\n",
+ "datefull = []\n",
+ "for fecha in limpdate:\n",
+ " fecha = fecha.replace('earthquake', '')\n",
+ " fecha =fecha.replace('ago', '')\n",
+ " datefull.append(fecha)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lat =[]\n",
+ "lon = []\n",
+ "for latlon in range(len(lyl)):\n",
+ " if latlon %2 == 0:\n",
+ " lat.append(lyl[latlon].text.replace('\\xa0', ''))\n",
+ " else:\n",
+ " lon.append(lyl[latlon].text.replace('\\xa0', ''))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lugar =[fecha.text for fecha in region]"
]
},
{
@@ -296,7 +605,8 @@
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "df = pd.DataFrame({'Date and time':datefull, 'Latitud':lat, 'Longitud':lon, 'Region':lugar})\n",
+ "df.head(20)"
]
},
{
@@ -308,7 +618,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 291,
"metadata": {},
"outputs": [],
"source": [
@@ -318,11 +628,125 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 304,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code\n",
+ "hackatons_soup = requests.get(url).content\n",
+ "hackatons_soup = BeautifulSoup(hackatons_soup, 'html5lib')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 342,
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "hackis = hackatons_soup.find_all('div', {'class':'card-body'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 343,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Code Geist Hackathon by SefrWahed\\u20037/29/2019\\n\\n\\u2003Cairo, Egypt',\n",
+ " 'The Code Factor\\u20035/21/2019\\n\\n\\u2003Milano, Italy',\n",
+ " 'TECHFEST MUNICH\\u20039/6/2019\\n\\n\\u2003Munich, Germany',\n",
+ " 'Galileo App Competition\\u20031/31/2019\\n\\n\\u2003Prague, Czech Republic']"
+ ]
+ },
+ "execution_count": 343,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "hacks = [hackis[i].text for i in range(len(hackis))]\n",
+ "hacks = [i.split(\"\\u2003\") for i in hacks]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 348,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Code Geist Hackathon by SefrWahed | \n",
+ " 7/29/2019 | \n",
+ " Cairo, Egypt | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " The Code Factor | \n",
+ " 5/21/2019 | \n",
+ " Milano, Italy | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " TECHFEST MUNICH | \n",
+ " 9/6/2019 | \n",
+ " Munich, Germany | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Galileo App Competition | \n",
+ " 1/31/2019 | \n",
+ " Prague, Czech Republic | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2\n",
+ "0 Code Geist Hackathon by SefrWahed 7/29/2019 Cairo, Egypt\n",
+ "1 The Code Factor 5/21/2019 Milano, Italy\n",
+ "2 TECHFEST MUNICH 9/6/2019 Munich, Germany\n",
+ "3 Galileo App Competition 1/31/2019 Prague, Czech Republic"
+ ]
+ },
+ "execution_count": 348,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "hacks = pd.DataFrame(hacks)\n",
+ "hacks[1] = hacks[1].apply(lambda x:x.replace('\\n', ''))\n",
+ "hacks"
]
},
{
@@ -353,11 +777,39 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 300,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Check the tweets of an account:shiroiusagi4486\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'76'"
+ ]
+ },
+ "execution_count": 300,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "#your code"
+ "#your code\n",
+ "usr_input = input('Check the tweets of an account:')\n",
+ "try:\n",
+ " twitter_follows = requests.get('https://twitter.com/'+usr_input).content\n",
+ "except:\n",
+ " print('The account does not exist')\n",
+ "twitter_follows = BeautifulSoup(twitter_follows, 'lxml')\n",
+ "twitter_follows = twitter_follows.find_all('span', {'class': 'ProfileNav-value'})\n",
+ "twitter_follows[0]['data-count']\n",
+ " \n",
+ " "
]
},
{
@@ -388,11 +840,41 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 298,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Check the followers of an account:uaquiro\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'302'"
+ ]
+ },
+ "execution_count": 298,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "#your code"
+ "#your code\n",
+ "usr_input = input('Check the followers of an account:')\n",
+ "try:\n",
+ " twitter_follows = requests.get('https://twitter.com/'+usr_input).content\n",
+ "except:\n",
+ " print('The account does not exist')\n",
+ "twitter_follows = BeautifulSoup(twitter_follows, 'lxml')\n",
+ "twitter_follows = twitter_follows.find_all('span', {'class': 'ProfileNav-value'})\n",
+ "twitter_follows[2]['data-count']\n",
+ " \n",
+ " \n",
+ " \n",
+ " "
]
},
{
@@ -404,7 +886,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 207,
"metadata": {},
"outputs": [],
"source": [
@@ -414,11 +896,243 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 208,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code\n",
+ "wikissup = requests.get(url).content\n",
+ "wikissup = BeautifulSoup(wikissup, 'lxml')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 211,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "langs = wikissup.find_all('div', {'class':'central-featured'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 268,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['English',\n",
+ " '日本語',\n",
+ " 'Español',\n",
+ " 'Deutsch',\n",
+ " 'Русский',\n",
+ " 'Français',\n",
+ " 'Italiano',\n",
+ " '中文',\n",
+ " 'Português',\n",
+ " 'Polski']"
+ ]
+ },
+ "execution_count": 268,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "langu = []\n",
+ "for i in range(len(langs)):\n",
+ " langu.append(langs[i].find_all('strong'))\n",
+ "langu = [i for i in langu[0]]\n",
+ "langu = [i.text for i in langu]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 245,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[['English5\\xa0892\\xa0000',\n",
+ " ' articles日本語1\\xa0159\\xa0000',\n",
+ " ' 記事Español1\\xa0532\\xa0000',\n",
+ " ' artículosDeutsch2\\xa0323\\xa0000',\n",
+ " ' ArtikelРусский1\\xa0556\\xa0000',\n",
+ " ' статейFrançais2\\xa0123\\xa0000',\n",
+ " ' articlesItaliano1\\xa0541\\xa0000',\n",
+ " ' voci中文1\\xa0065\\xa0000',\n",
+ " ' 條目Português1\\xa0010\\xa0000',\n",
+ " ' artigosPolski1\\xa0346\\xa0000',\n",
+ " ' haseł']]"
+ ]
+ },
+ "execution_count": 245,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "numbers = [] \n",
+ "for i in range(len(langs)):\n",
+ " numbers.append(langs[i].find_all('small'))\n",
+ "numbers = [i.text for i in langs]\n",
+ "numbers = [i.replace('\\n', '') for i in numbers]\n",
+ "numbers = [i.split('+') for i in numbers]\n",
+ "numbers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 257,
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "numbers = [re.findall('\\d', i) for i in liss]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 258,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['5892000',\n",
+ " '1159000',\n",
+ " '1532000',\n",
+ " '2323000',\n",
+ " '1556000',\n",
+ " '2123000',\n",
+ " '1541000',\n",
+ " '1065000',\n",
+ " '1010000',\n",
+ " '1346000']"
+ ]
+ },
+ "execution_count": 258,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "numberss = [\"\".join(numbers[i]) for i in range(len(numbers))]\n",
+ "numbers = numberss[:-1]\n",
+ "numbers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 269,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Language | \n",
+ " Articles | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " English | \n",
+ " 5892000 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 日本語 | \n",
+ " 1159000 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Español | \n",
+ " 1532000 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Deutsch | \n",
+ " 2323000 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Русский | \n",
+ " 1556000 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " Français | \n",
+ " 2123000 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " Italiano | \n",
+ " 1541000 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 中文 | \n",
+ " 1065000 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " Português | \n",
+ " 1010000 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " Polski | \n",
+ " 1346000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Language Articles\n",
+ "0 English 5892000\n",
+ "1 日本語 1159000\n",
+ "2 Español 1532000\n",
+ "3 Deutsch 2323000\n",
+ "4 Русский 1556000\n",
+ "5 Français 2123000\n",
+ "6 Italiano 1541000\n",
+ "7 中文 1065000\n",
+ "8 Português 1010000\n",
+ "9 Polski 1346000"
+ ]
+ },
+ "execution_count": 269,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "d = {'Language':langu, 'Articles':numbers}\n",
+ "Wiki_df = pd.DataFrame(d)\n",
+ "Wiki_df"
]
},
{
@@ -430,7 +1144,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 191,
"metadata": {},
"outputs": [],
"source": [
@@ -440,11 +1154,166 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 192,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code \n",
+ "datasets_soup = requests.get(url).content\n",
+ "datasets_soup = BeautifulSoup(datasets_soup, 'lxml')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 195,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['Business and economy', 'Crime and justice', 'Defence', 'Education', 'Environment', 'Government', 'Government spending', 'Health', 'Mapping', 'Society', 'Towns and cities', 'Transport']\n"
+ ]
+ }
+ ],
+ "source": [
+ "names = datasets_soup.find_all('h2')\n",
+ "names_s1 = [i.text for i in names]\n",
+ "\n",
+ "print(names_s1)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 204,
"metadata": {},
"outputs": [],
"source": [
- "#your code "
+ "descp = datasets_soup.find_all('p')\n",
+ "descp = descp[5:-2]\n",
+ "descp = [i.text for i in descp]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 206,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Dataset | \n",
+ " Description | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Business and economy | \n",
+ " Small businesses, industry, imports, exports a... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Crime and justice | \n",
+ " Courts, police, prison, offenders, borders and... | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Defence | \n",
+ " Armed forces, health and safety, search and re... | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Education | \n",
+ " Students, training, qualifications and the Nat... | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Environment | \n",
+ " Weather, flooding, rivers, air quality, geolog... | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " Government | \n",
+ " Staff numbers and pay, local councillors and d... | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " Government spending | \n",
+ " Includes all payments by government department... | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " Health | \n",
+ " Includes smoking, drugs, alcohol, medicine per... | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " Mapping | \n",
+ " Addresses, boundaries, land ownership, aerial ... | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " Society | \n",
+ " Employment, benefits, household finances, pove... | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " Towns and cities | \n",
+ " Includes housing, urban planning, leisure, was... | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " Transport | \n",
+ " Airports, roads, freight, electric vehicles, p... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Dataset Description\n",
+ "0 Business and economy Small businesses, industry, imports, exports a...\n",
+ "1 Crime and justice Courts, police, prison, offenders, borders and...\n",
+ "2 Defence Armed forces, health and safety, search and re...\n",
+ "3 Education Students, training, qualifications and the Nat...\n",
+ "4 Environment Weather, flooding, rivers, air quality, geolog...\n",
+ "5 Government Staff numbers and pay, local councillors and d...\n",
+ "6 Government spending Includes all payments by government department...\n",
+ "7 Health Includes smoking, drugs, alcohol, medicine per...\n",
+ "8 Mapping Addresses, boundaries, land ownership, aerial ...\n",
+ "9 Society Employment, benefits, household finances, pove...\n",
+ "10 Towns and cities Includes housing, urban planning, leisure, was...\n",
+ "11 Transport Airports, roads, freight, electric vehicles, p..."
+ ]
+ },
+ "execution_count": 206,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "d = {'Dataset':names_s1, 'Description': descp}\n",
+ "DataSets = pd.DataFrame(d)\n",
+ "DataSets"
]
},
{
@@ -456,7 +1325,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 349,
"metadata": {},
"outputs": [],
"source": [
@@ -466,11 +1335,283 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 350,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code\n",
+ "table_lang = requests.get(url).content\n",
+ "table_soup = BeautifulSoup(table_lang, 'html')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 351,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tables = table_soup.find_all('tr')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 352,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tabless = []\n",
+ "for i in range(len(tables)):\n",
+ " tabless.append(tables[i])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 353,
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "tal = [tabless[i].text.split(\"\\n\\n\") for i in range(len(tabless))]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 354,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Rank | \n",
+ " Rank | \n",
+ " Language | \n",
+ " Primary Country | \n",
+ " TotalCountries[a] | \n",
+ " Speakers(millions) | \n",
+ " % of the World population\\n(March 2019)[7] | \n",
+ " \\nMacrolanguage | \n",
+ " Language familyBranch | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " — | \n",
+ " 1 | \n",
+ " Chinese (macrolanguage) | \n",
+ " China | \n",
+ " 39 | \n",
+ " 1,311 | \n",
+ " 17.026 | \n",
+ " | \n",
+ " Sino-TibetanSinitic | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1 | \n",
+ " — | \n",
+ " Mandarin | \n",
+ " China | \n",
+ " 13 | \n",
+ " 918 | \n",
+ " 11.922 | \n",
+ " Chinese | \n",
+ " Sino-TibetanSinitic | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " Spanish | \n",
+ " Spain | \n",
+ " 31 | \n",
+ " 460 | \n",
+ " 5.974 | \n",
+ " | \n",
+ " Indo-EuropeanRomance | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " English | \n",
+ " United Kingdom | \n",
+ " 137 | \n",
+ " 379 | \n",
+ " 4.922 | \n",
+ " | \n",
+ " Indo-EuropeanGermanic | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 4 | \n",
+ " 4 | \n",
+ " Hindi | \n",
+ " India | \n",
+ " 4 | \n",
+ " 341 | \n",
+ " 4.429 | \n",
+ " | \n",
+ " Indo-EuropeanIndo-Aryan | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " — | \n",
+ " 5 | \n",
+ " Arabic (macrolanguage) | \n",
+ " Saudi Arabia | \n",
+ " 59 | \n",
+ " 319 | \n",
+ " 4.143 | \n",
+ " | \n",
+ " AfroasiaticSemitic | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " Bengali | \n",
+ " Bangladesh | \n",
+ " 4 | \n",
+ " 228 | \n",
+ " 2.961 | \n",
+ " | \n",
+ " Indo-EuropeanIndo-Aryan | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " Portuguese | \n",
+ " Portugal | \n",
+ " 15 | \n",
+ " 221 | \n",
+ " 2.870 | \n",
+ " | \n",
+ " Indo-EuropeanRomance | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " Russian | \n",
+ " Russian Federation | \n",
+ " 19 | \n",
+ " 154 | \n",
+ " 2.000 | \n",
+ " | \n",
+ " Indo-EuropeanBalto-Slavic | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " Japanese | \n",
+ " Japan | \n",
+ " 2 | \n",
+ " 128 | \n",
+ " 1.662 | \n",
+ " | \n",
+ " JaponicJapanese | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 3 \\\n",
+ "0 Rank Rank Language Primary Country \n",
+ "1 — 1 Chinese (macrolanguage) China \n",
+ "2 1 — Mandarin China \n",
+ "3 2 2 Spanish Spain \n",
+ "4 3 3 English United Kingdom \n",
+ "5 4 4 Hindi India \n",
+ "6 — 5 Arabic (macrolanguage) Saudi Arabia \n",
+ "7 5 6 Bengali Bangladesh \n",
+ "8 6 7 Portuguese Portugal \n",
+ "9 7 8 Russian Russian Federation \n",
+ "10 8 9 Japanese Japan \n",
+ "\n",
+ " 4 5 \\\n",
+ "0 TotalCountries[a] Speakers(millions) \n",
+ "1 39 1,311 \n",
+ "2 13 918 \n",
+ "3 31 460 \n",
+ "4 137 379 \n",
+ "5 4 341 \n",
+ "6 59 319 \n",
+ "7 4 228 \n",
+ "8 15 221 \n",
+ "9 19 154 \n",
+ "10 2 128 \n",
+ "\n",
+ " 6 7 \\\n",
+ "0 % of the World population\\n(March 2019)[7] \\nMacrolanguage \n",
+ "1 17.026 \n",
+ "2 11.922 Chinese \n",
+ "3 5.974 \n",
+ "4 4.922 \n",
+ "5 4.429 \n",
+ "6 4.143 \n",
+ "7 2.961 \n",
+ "8 2.870 \n",
+ "9 2.000 \n",
+ "10 1.662 \n",
+ "\n",
+ " 8 \n",
+ "0 Language familyBranch \n",
+ "1 Sino-TibetanSinitic \n",
+ "2 Sino-TibetanSinitic \n",
+ "3 Indo-EuropeanRomance \n",
+ "4 Indo-EuropeanGermanic \n",
+ "5 Indo-EuropeanIndo-Aryan \n",
+ "6 AfroasiaticSemitic \n",
+ "7 Indo-EuropeanIndo-Aryan \n",
+ "8 Indo-EuropeanRomance \n",
+ "9 Indo-EuropeanBalto-Slavic \n",
+ "10 JaponicJapanese "
+ ]
+ },
+ "execution_count": 354,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "talcual = pd.DataFrame(tal)\n",
+ "talcual = talcual[:11]\n",
+ "talcual[0] = talcual[0].apply(lambda x: x.replace('\\n', ''))\n",
+ "talcual[8] = talcual[8].apply(lambda x: x.replace('\\n', ''))\n",
+ "talcual"
]
},
{
@@ -516,7 +1657,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 155,
"metadata": {},
"outputs": [],
"source": [
@@ -530,7 +1671,10 @@
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "#your code\n",
+ "fims_get = requests.get(url).content\n",
+ "films_soup = BeautifulSoup(films_get, 'lxml')\n",
+ "films_names = films_soup.find_all(<"
]
},
{
@@ -569,7 +1713,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 355,
"metadata": {},
"outputs": [],
"source": [
@@ -580,11 +1724,135 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 356,
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "#your code\n",
+ "books = requests.get(url).content\n",
+ "books_soup = BeautifulSoup(books, 'html')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 357,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "names = books_soup.find_all('h3')\n",
+ "names_list = []\n",
+ "for i in range(len(names)):\n",
+ " for name in (names[i].find_all('a', title=True)):\n",
+ " names_list.append(name['title'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 358,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "prices = books_soup.find_all('p', {'class':'price_color'})\n",
+ "\n",
+ "prices_list = [prices[i].text for i in range(len(prices))]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 359,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "stock = books_soup.find_all('p', {'class':'instock availability'})\n",
+ "stock = [stock[i].text for i in range(len(stock))]\n",
+ "stock = [i.replace('\\n', \"\") for i in stock]\n",
+ "stock = [re.sub('\\s\\s+', '', stock[i]) for i in range(len(stock))] "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 360,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Book Name | \n",
+ " Price | \n",
+ " Stock Availability | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " A Light in the Attic | \n",
+ " £51.77 | \n",
+ " In stock | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Tipping the Velvet | \n",
+ " £53.74 | \n",
+ " In stock | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Soumission | \n",
+ " £50.10 | \n",
+ " In stock | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Sharp Objects | \n",
+ " £47.82 | \n",
+ " In stock | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Sapiens: A Brief History of Humankind | \n",
+ " £54.23 | \n",
+ " In stock | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Book Name Price Stock Availability\n",
+ "0 A Light in the Attic £51.77 In stock\n",
+ "1 Tipping the Velvet £53.74 In stock\n",
+ "2 Soumission £50.10 In stock\n",
+ "3 Sharp Objects £47.82 In stock\n",
+ "4 Sapiens: A Brief History of Humankind £54.23 In stock"
+ ]
+ },
+ "execution_count": 360,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "d = {\"Book Name\":names_list, \"Price\":prices_list, \"Stock Availability\": stock}\n",
+ "books = pd.DataFrame(d)\n",
+ "books.head()"
]
}
],
diff --git a/your-code/.ipynb_checkpoints/main_fer-checkpoint.ipynb b/your-code/.ipynb_checkpoints/main_fer-checkpoint.ipynb
new file mode 100644
index 0000000..0e85fa9
--- /dev/null
+++ b/your-code/.ipynb_checkpoints/main_fer-checkpoint.ipynb
@@ -0,0 +1,1224 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Web Scraping Lab\n",
+ "\n",
+ "You will find in this notebook some scrapy exercises to practise your scraping skills.\n",
+ "\n",
+ "**Tips:**\n",
+ "\n",
+ "- Check the response status code for each request to ensure you have obtained the intended contennt.\n",
+ "- Print the response text in each request to understand the kind of info you are getting and its format.\n",
+ "- Check for patterns in the response text to extract the data/info requested in each question.\n",
+ "- Visit each url and take a look at its source through Chrome DevTools. You'll need to identify the html tags, special class names etc. used for the html content you are expected to extract."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "- [Requests library](http://docs.python-requests.org/en/master/#the-user-guide) documentation \n",
+ "- [Beautiful Soup Doc](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)\n",
+ "- [Urllib](https://docs.python.org/3/library/urllib.html#module-urllib)\n",
+ "- [re lib](https://docs.python.org/3/library/re.html)\n",
+ "- [lxml lib](https://lxml.de/)\n",
+ "- [Scrapy](https://scrapy.org/)\n",
+ "- [List of HTTP status codes](https://en.wikipedia.org/wiki/List_of_HTTP_status_codes)\n",
+ "- [HTML basics](http://www.simplehtmlguide.com/cheatsheet.php)\n",
+ "- [CSS basics](https://www.cssbasics.com/#page_start)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Below are the libraries and modules you may need. `requests`, `BeautifulSoup` and `pandas` are imported for you. If you prefer to use additional libraries feel free to uncomment them."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import requests\n",
+ "from bs4 import BeautifulSoup\n",
+ "import pandas as pd\n",
+ "\n",
+ "# from pprint import pprint\n",
+ "from lxml import html\n",
+ "# from lxml.html import fromstring\n",
+ "# import urllib.request\n",
+ "# from urllib.request import urlopen\n",
+ "# import random\n",
+ "import re\n",
+ "# import scrapy"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Download, parse (using BeautifulSoup), and print the content from the Trending Developers page from GitHub:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This is the url you will scrape in this exercise\n",
+ "url = 'https://github.com/trending/developers'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code\n",
+ "html = requests.get(url).content"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Display the names of the trending developers retrieved in the previous step.\n",
+ "\n",
+ "Your output should be a Python list of developer names. Each name should not contain any html tag.\n",
+ "\n",
+ "**Instructions:**\n",
+ "\n",
+ "1. Find out the html tag and class names used for the developer names. You can achieve this using Chrome DevTools.\n",
+ "\n",
+ "1. Use BeautifulSoup to extract all the html elements that contain the developer names.\n",
+ "\n",
+ "1. Use string manipulation techniques to replace whitespaces and linebreaks (i.e. `\\n`) in the *text* of each html element. Use a list to store the clean names.\n",
+ "\n",
+ "1. Print the list of names.\n",
+ "\n",
+ "Your output should look like below:\n",
+ "\n",
+ "```\n",
+ "['trimstray (@trimstray)',\n",
+ " 'joewalnes (JoeWalnes)',\n",
+ " 'charlax (Charles-AxelDein)',\n",
+ " 'ForrestKnight (ForrestKnight)',\n",
+ " 'revery-ui (revery-ui)',\n",
+ " 'alibaba (Alibaba)',\n",
+ " 'Microsoft (Microsoft)',\n",
+ " 'github (GitHub)',\n",
+ " 'facebook (Facebook)',\n",
+ " 'boazsegev (Bo)',\n",
+ " 'google (Google)',\n",
+ " 'cloudfetch',\n",
+ " 'sindresorhus (SindreSorhus)',\n",
+ " 'tensorflow',\n",
+ " 'apache (TheApacheSoftwareFoundation)',\n",
+ " 'DevonCrawford (DevonCrawford)',\n",
+ " 'ARMmbed (ArmMbed)',\n",
+ " 'vuejs (vuejs)',\n",
+ " 'fastai (fast.ai)',\n",
+ " 'QiShaoXuan (Qi)',\n",
+ " 'joelparkerhenderson (JoelParkerHenderson)',\n",
+ " 'torvalds (LinusTorvalds)',\n",
+ " 'CyC2018',\n",
+ " 'komeiji-satori (神楽坂覚々)',\n",
+ " 'script-8']\n",
+ " ```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code\n",
+ "soup = BeautifulSoup(html, 'lxml')\n",
+ "h1 = soup.find_all('h1', {'class':'h3 lh-condensed'})\n",
+ "a = soup.find_all('a', {'class':'link-gray'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nombres = [link.string for link in h1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "a = a[19:44]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nicks = [elemento.string for elemento in a]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "todo = [nombres[i] + \" \"+ '('+ nicks[i]+')' for i in range(len(nombres))]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Eric Ma (ericmjl)',\n",
+ " 'Federico Brigante (bfred-it)',\n",
+ " 'Kyle Roach (iRoachie)',\n",
+ " 'Olle Jonsson (olleolleolle)',\n",
+ " 'Nikita Sobolev (sobolevn)',\n",
+ " 'Frank S. Thomas (fthomas)',\n",
+ " 'syuilo (syuilo)',\n",
+ " 'Ives van Hoorne (CompuIves)',\n",
+ " 'Paulus Schoutsen (balloob)',\n",
+ " 'Sarah Drasner (sdras)',\n",
+ " 'Stefanos Kornilios Mitsis Poiitidis (skmp)',\n",
+ " 'Jan Hovancik (hovancik)',\n",
+ " 'Andreas Mueller (amueller)',\n",
+ " 'Guillaume Gomez (GuillaumeGomez)',\n",
+ " 'Matt Holt (mholt)',\n",
+ " 'Clifford Wolf (cliffordwolf)',\n",
+ " 'Franck Nijhof (frenck)',\n",
+ " 'Joe Block (unixorn)',\n",
+ " 'Andrei Neagoie (aneagoie)',\n",
+ " 'Jack Lloyd (randombit)',\n",
+ " 'Guillermo Rauch (rauchg)',\n",
+ " 'Tim Griesser (tgriesser)',\n",
+ " 'Jameson Nash (vtjnash)',\n",
+ " 'Anderson Banihirwe (andersy005)',\n",
+ " 'Danny Ryan (djrtwo)']"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "todo"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Display the trending Python repositories in GitHub\n",
+ "\n",
+ "The steps to solve this problem is similar to the previous one except that you need to find out the repository names instead of developer names."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This is the url you will scrape in this exercise\n",
+ "url = 'https://github.com/trending/python?since=daily'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code\n",
+ "html = requests.get(url).content"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "soup = BeautifulSoup(html, 'lxml')\n",
+ "h1 = soup.find_all('h1', {'class':'h3 lh-condensed'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "proyectos = [algo.text.replace(\"\\n\", \"\") for algo in h1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['gto76 / python-cheatsheet ',\n",
+ " 'j3ssie / Osmedeus ',\n",
+ " 'tangzixiang0304 / Shielded_detector ',\n",
+ " 'uber / ludwig ',\n",
+ " 'xinshuoweng / AB3DMOT ',\n",
+ " 'NVlabs / stylegan ',\n",
+ " 'dagster-io / dagster ',\n",
+ " 'tensorflow / models ',\n",
+ " 'eragonruan / text-detection-ctpn ',\n",
+ " 'sherlock-project / sherlock ',\n",
+ " 'deepfakes / faceswap ',\n",
+ " 'nbei / Deep-Flow-Guided-Video-Inpainting ',\n",
+ " 'iovisor / bcc ',\n",
+ " 'Roibal / Cryptocurrency-Trading-Bots-Python-Beginner-Advance ',\n",
+ " 'NVIDIA / DeepLearningExamples ',\n",
+ " 'BlackHC / tfpyth ',\n",
+ " 'clovaai / deep-text-recognition-benchmark ',\n",
+ " 'tkat0 / PyTorch_BlazeFace ',\n",
+ " 'OpenMined / PySyft ',\n",
+ " 'CoreyMSchafer / code_snippets ',\n",
+ " 'public-apis / public-apis ',\n",
+ " 'd2l-ai / d2l-zh ',\n",
+ " 'apache / airflow ',\n",
+ " 'beecost / bee-university ',\n",
+ " 'sundowndev / PhoneInfoga ']"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "proyectos"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Display all the image links from Walt Disney wikipedia page"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This is the url you will scrape in this exercise\n",
+ "url = 'https://en.wikipedia.org/wiki/Walt_Disney'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code"
+ ]
+ },
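+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# A minimal sketch mirroring the approach in main.ipynb: every thumbnail on\n",
+    "# the Walt Disney article sits inside a div.thumbinner, so collecting the\n",
+    "# href of each anchor in those divs yields the image links.\n",
+    "disney_html = requests.get(url).content\n",
+    "disney_soup = BeautifulSoup(disney_html, 'lxml')\n",
+    "thumbs = disney_soup.find_all('div', {'class':'thumbinner'})\n",
+    "links = [a['href'] for div in thumbs for a in div.find_all('a', href=True)]\n",
+    "links"
+   ]
+  },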
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Retrieve an arbitary Wikipedia page of \"Python\" and create a list of links on that page"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This is the url you will scrape in this exercise\n",
+ "url ='https://en.wikipedia.org/wiki/Python'\n",
+ "html = requests.get(url).content"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[- Pythonidae, a family of nonvenomous snakes found in Africa, Asia, and Australia known as pythons\n",
+ "
,\n",
+ " ,\n",
+ " - Python (mythology), a serpent, the earth-dragon of Delphi
\n",
+ " - Python of Aenus (4th-century BCE), student of Plato
\n",
+ " - Python (painter), (ca. 360-320 BCE) vase painter in Poseidonia
\n",
+ " - Python of Byzantium, orator, diplomat of Philip II of Macedon
\n",
+ " - Python of Catana, poet who accompanied Alexander the Great
,\n",
+ " - Python (film), a 2000 horror film by Richard Clabaugh\n",
+ "
- Pythons 2, or Python II, a 2002 sequel to Python
\n",
+ " - The Pythons, or Monty Python, a British comedy group\n",
+ "
,\n",
+ " - Pythons 2, or Python II, a 2002 sequel to Python
,\n",
+ " ,\n",
+ " ,\n",
+ " - CPython, the reference implementation of the Python programming language
,\n",
+ " ,\n",
+ " ,\n",
+ " ,\n",
+ " ]"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#your code\n",
+ "soup = BeautifulSoup(html, 'lxml')\n",
+ "ul = soup.find_all('ul')\n",
+ "ul= ul[2:14]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['/wiki/Pythonidae',\n",
+ " '/wiki/Python_(genus)',\n",
+ " '/wiki/Python_(genus)',\n",
+ " '/wiki/Python_(mythology)',\n",
+ " '/wiki/Python_of_Aenus',\n",
+ " '/wiki/Python_(painter)',\n",
+ " '/wiki/Python_of_Byzantium',\n",
+ " '/wiki/Python_of_Catana',\n",
+ " '/wiki/Python_(film)',\n",
+ " '/wiki/Pythons_2',\n",
+ " '/wiki/Monty_Python',\n",
+ " '/wiki/Python_(Monty)_Pictures',\n",
+ " '/wiki/Pythons_2',\n",
+ " '/wiki/Python_(Monty)_Pictures',\n",
+ " '/wiki/Python_(programming_language)',\n",
+ " '/wiki/CPython',\n",
+ " '/wiki/CMU_Common_Lisp',\n",
+ " '/wiki/PERQ#PERQ_3',\n",
+ " '/wiki/CPython',\n",
+ " '/wiki/Python_(Busch_Gardens_Tampa_Bay)',\n",
+ " '/wiki/Python_(Coney_Island,_Cincinnati,_Ohio)',\n",
+ " '/wiki/Python_(Efteling)',\n",
+ " '/wiki/Python_(automobile_maker)',\n",
+ " '/wiki/Python_(Ford_prototype)',\n",
+ " '/wiki/Colt_Python',\n",
+ " '/wiki/Python_(missile)',\n",
+ " '/wiki/Python_(nuclear_primary)',\n",
+ " '/wiki/Python_Anghelo']"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "l = [img['href'] for i in range(len(ul)) for img in (ul[i].find_all('a', href=True)) ] \n",
+ "l"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Number of Titles that have changed in the United States Code since its last release point "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This is the url you will scrape in this exercise\n",
+ "url = 'http://uscode.house.gov/download/download.shtml'\n",
+ "html = requests.get(url).content"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code\n",
+ "soup = BeautifulSoup(html, 'lxml')\n",
+ "#soup\n",
+ "titles = soup.find_all('div', {'class':'usctitlechanged'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "15"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(titles)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### A Python list with the top ten FBI's Most Wanted names "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This is the url you will scrape in this exercise\n",
+ "url = 'https://www.fbi.gov/wanted/topten'\n",
+ "html = requests.get(url).content"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code \n",
+ "soup = BeautifulSoup(html, 'lxml')\n",
+ "titles = soup.find_all('h3', {'class':'title'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['ALEJANDRO ROSALES CASTILLO',\n",
+ " 'YASER ABDEL SAID',\n",
+ " 'JASON DEREK BROWN',\n",
+ " 'RAFAEL CARO-QUINTERO',\n",
+ " 'ALEXIS FLORES',\n",
+ " 'EUGENE PALMER',\n",
+ " 'SANTIAGO VILLALBA MEDEROS',\n",
+ " 'ROBERT WILLIAM FISHER',\n",
+ " 'BHADRESHKUMAR CHETANBHAI PATEL',\n",
+ " 'ARNOLDO JIMENEZ']"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "buscados = [nombre.text.replace(\"\\n\", \"\") for nombre in titles]\n",
+ "buscados"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 20 latest earthquakes info (date, time, latitude, longitude and region name) by the EMSC as a pandas dataframe"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This is the url you will scrape in this exercise\n",
+ "url = 'https://www.emsc-csem.org/Earthquake/'\n",
+ "html = requests.get(url).content"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code\n",
+ "soup = BeautifulSoup(html, 'lxml')\n",
+ "\n",
+ "datetime = soup.find_all('td', {'class':'tabev6'})\n",
+ "lyl = soup.find_all('td', {'class':'tabev1'})\n",
+ "region = soup.find_all('td', {'class':'tb_region'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "limpdate =[str(fecha.text.replace('\\xa0', ' ')) for fecha in datetime]\n",
+ "datefull = []\n",
+ "for fecha in limpdate:\n",
+ " fecha = fecha.replace('earthquake', '')\n",
+ " fecha =fecha.replace('ago', '')\n",
+ " datefull.append(fecha)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lat =[]\n",
+ "lon = []\n",
+ "for latlon in range(len(lyl)):\n",
+ " if latlon %2 == 0:\n",
+ " lat.append(lyl[latlon].text.replace('\\xa0', ''))\n",
+ " else:\n",
+ " lon.append(lyl[latlon].text.replace('\\xa0', ''))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lugar =[fecha.text for fecha in region]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = pd.DataFrame({'Date and time':datefull, 'Latitud':lat, 'Longitud':lon, 'Region':lugar})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Date and time | \n",
+ " Latitud | \n",
+ " Longitud | \n",
+ " Region | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 2019-07-16 21:06:21.515min | \n",
+ " 35.88 | \n",
+ " 117.69 | \n",
+ " CENTRAL CALIFORNIA | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2019-07-16 21:03:36.718min | \n",
+ " 37.81 | \n",
+ " 121.76 | \n",
+ " SAN FRANCISCO BAY AREA, CALIF. | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2019-07-16 20:52:18.629min | \n",
+ " 36.07 | \n",
+ " 117.84 | \n",
+ " CENTRAL CALIFORNIA | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2019-07-16 20:49:09.132min | \n",
+ " 36.07 | \n",
+ " 117.65 | \n",
+ " CENTRAL CALIFORNIA | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2019-07-16 20:41:11.040min | \n",
+ " 16.85 | \n",
+ " 100.25 | \n",
+ " OFFSHORE GUERRERO, MEXICO | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 2019-07-16 20:33:52.948min | \n",
+ " 40.09 | \n",
+ " 19.91 | \n",
+ " ALBANIA | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 2019-07-16 20:31:33.350min | \n",
+ " 23.45 | \n",
+ " 66.86 | \n",
+ " JUJUY, ARGENTINA | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 2019-07-16 20:29:07.752min | \n",
+ " 35.86 | \n",
+ " 117.69 | \n",
+ " CENTRAL CALIFORNIA | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 2019-07-16 20:23:34.758min | \n",
+ " 36.07 | \n",
+ " 117.84 | \n",
+ " CENTRAL CALIFORNIA | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 2019-07-16 20:19:00.11hr 02min | \n",
+ " 33.10 | \n",
+ " 12.42 | \n",
+ " MADEIRA ISLANDS, PORTUGAL REGION | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " 2019-07-16 20:17:51.61hr 04min | \n",
+ " 35.55 | \n",
+ " 117.43 | \n",
+ " SOUTHERN CALIFORNIA | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " 2019-07-16 20:15:59.01hr 05min | \n",
+ " 35.68 | \n",
+ " 117.52 | \n",
+ " SOUTHERN CALIFORNIA | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " 2019-07-16 20:11:01.51hr 10min | \n",
+ " 37.82 | \n",
+ " 121.77 | \n",
+ " SAN FRANCISCO BAY AREA, CALIF. | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " 2019-07-16 19:51:06.51hr 30min | \n",
+ " 6.26 | \n",
+ " 148.65 | \n",
+ " NEW BRITAIN REGION, P.N.G. | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " 2019-07-16 19:42:25.91hr 39min | \n",
+ " 35.61 | \n",
+ " 117.47 | \n",
+ " SOUTHERN CALIFORNIA | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " 2019-07-16 19:35:57.01hr 46min | \n",
+ " 35.62 | \n",
+ " 117.45 | \n",
+ " SOUTHERN CALIFORNIA | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " 2019-07-16 19:23:50.11hr 58min | \n",
+ " 36.19 | \n",
+ " 117.89 | \n",
+ " CENTRAL CALIFORNIA | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " 2019-07-16 19:20:21.42hr 01min | \n",
+ " 38.39 | \n",
+ " 16.94 | \n",
+ " SOUTHERN ITALY | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " 2019-07-16 19:16:53.82hr 05min | \n",
+ " 38.45 | \n",
+ " 16.91 | \n",
+ " SOUTHERN ITALY | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " 2019-07-16 19:16:15.92hr 05min | \n",
+ " 61.27 | \n",
+ " 152.44 | \n",
+ " SOUTHERN ALASKA | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Date and time Latitud Longitud \\\n",
+ "0 2019-07-16 21:06:21.515min 35.88 117.69 \n",
+ "1 2019-07-16 21:03:36.718min 37.81 121.76 \n",
+ "2 2019-07-16 20:52:18.629min 36.07 117.84 \n",
+ "3 2019-07-16 20:49:09.132min 36.07 117.65 \n",
+ "4 2019-07-16 20:41:11.040min 16.85 100.25 \n",
+ "5 2019-07-16 20:33:52.948min 40.09 19.91 \n",
+ "6 2019-07-16 20:31:33.350min 23.45 66.86 \n",
+ "7 2019-07-16 20:29:07.752min 35.86 117.69 \n",
+ "8 2019-07-16 20:23:34.758min 36.07 117.84 \n",
+ "9 2019-07-16 20:19:00.11hr 02min 33.10 12.42 \n",
+ "10 2019-07-16 20:17:51.61hr 04min 35.55 117.43 \n",
+ "11 2019-07-16 20:15:59.01hr 05min 35.68 117.52 \n",
+ "12 2019-07-16 20:11:01.51hr 10min 37.82 121.77 \n",
+ "13 2019-07-16 19:51:06.51hr 30min 6.26 148.65 \n",
+ "14 2019-07-16 19:42:25.91hr 39min 35.61 117.47 \n",
+ "15 2019-07-16 19:35:57.01hr 46min 35.62 117.45 \n",
+ "16 2019-07-16 19:23:50.11hr 58min 36.19 117.89 \n",
+ "17 2019-07-16 19:20:21.42hr 01min 38.39 16.94 \n",
+ "18 2019-07-16 19:16:53.82hr 05min 38.45 16.91 \n",
+ "19 2019-07-16 19:16:15.92hr 05min 61.27 152.44 \n",
+ "\n",
+ " Region \n",
+ "0 CENTRAL CALIFORNIA \n",
+ "1 SAN FRANCISCO BAY AREA, CALIF. \n",
+ "2 CENTRAL CALIFORNIA \n",
+ "3 CENTRAL CALIFORNIA \n",
+ "4 OFFSHORE GUERRERO, MEXICO \n",
+ "5 ALBANIA \n",
+ "6 JUJUY, ARGENTINA \n",
+ "7 CENTRAL CALIFORNIA \n",
+ "8 CENTRAL CALIFORNIA \n",
+ "9 MADEIRA ISLANDS, PORTUGAL REGION \n",
+ "10 SOUTHERN CALIFORNIA \n",
+ "11 SOUTHERN CALIFORNIA \n",
+ "12 SAN FRANCISCO BAY AREA, CALIF. \n",
+ "13 NEW BRITAIN REGION, P.N.G. \n",
+ "14 SOUTHERN CALIFORNIA \n",
+ "15 SOUTHERN CALIFORNIA \n",
+ "16 CENTRAL CALIFORNIA \n",
+ "17 SOUTHERN ITALY \n",
+ "18 SOUTHERN ITALY \n",
+ "19 SOUTHERN ALASKA "
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head(20)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Display the date, and title of upcoming hackathon events as a Pandas dataframe table"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This is the url you will scrape in this exercise\n",
+ "url ='https://hackevents.co/hackathons'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code"
+ ]
+ },
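+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# A sketch following the solution in main.ipynb: each event is rendered as\n",
+    "# a div.card-body whose text joins title, date and place with em-space\n",
+    "# (\\u2003) separators.\n",
+    "hack_soup = BeautifulSoup(requests.get(url).content, 'html5lib')\n",
+    "cards = hack_soup.find_all('div', {'class':'card-body'})\n",
+    "rows = [[part.replace('\\n', '') for part in card.text.split('\\u2003')] for card in cards]\n",
+    "pd.DataFrame(rows, columns=['Title', 'Date', 'Place'])"
+   ]
+  },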
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Count number of tweets by a given Twitter account."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You will need to include a ***try/except block*** for account names not found. \n",
+ "
***Hint:*** the program should count the number of tweets for any provided account"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This is the url you will scrape in this exercise \n",
+ "# You will need to add the account credentials to this url\n",
+ "url = 'https://twitter.com/'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code"
+ ]
+ },
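+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# A sketch following main.ipynb; it relies on the legacy Twitter profile\n",
+    "# markup, where the counters are span.ProfileNav-value elements carrying a\n",
+    "# data-count attribute (tweets first, then following, then followers).\n",
+    "account = input('Check the tweets of an account:')\n",
+    "try:\n",
+    "    page = BeautifulSoup(requests.get(url + account).content, 'lxml')\n",
+    "    counters = page.find_all('span', {'class': 'ProfileNav-value'})\n",
+    "    print(counters[0]['data-count'])\n",
+    "except:\n",
+    "    print('The account does not exist')"
+   ]
+  },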
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Number of followers of a given twitter account"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You will need to include a ***try/except block*** in case account/s name not found. \n",
+ "
***Hint:*** the program should count the followers for any provided account"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This is the url you will scrape in this exercise \n",
+ "# You will need to add the account credentials to this url\n",
+ "url = 'https://twitter.com/'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### List all language names and number of related articles in the order they appear in wikipedia.org"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This is the url you will scrape in this exercise\n",
+ "url = 'https://www.wikipedia.org/'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### A list with the different kind of datasets available in data.gov.uk "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This is the url you will scrape in this exercise\n",
+ "url = 'https://data.gov.uk/'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Top 10 languages by number of native speakers stored in a Pandas Dataframe"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This is the url you will scrape in this exercise\n",
+ "url = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### BONUS QUESTIONS"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Scrape a certain number of tweets of a given Twitter account."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This is the url you will scrape in this exercise \n",
+ "# You will need to add the account credentials to this url\n",
+ "url = 'https://twitter.com/'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# your code"
+ ]
+ },
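+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "*One possible sketch (untested): it assumes the 2019 server-rendered twitter.com markup, where each tweet's text sat in a `p` tag with class `tweet-text`; that class name and the `n_tweets` limit are assumptions, not part of the exercise.*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: 'tweet-text' is an assumption about the 2019 page markup\n",
+ "n_tweets = 5  # illustrative parameter: how many tweets to keep\n",
+ "account = input('Account to scrape:')\n",
+ "try:\n",
+ "    page = requests.get('https://twitter.com/' + account).content\n",
+ "    tweet_tags = BeautifulSoup(page, 'lxml').find_all('p', {'class': 'tweet-text'})\n",
+ "    tweets = [t.text for t in tweet_tags[:n_tweets]]\n",
+ "except requests.RequestException:\n",
+ "    tweets = []\n",
+ "    print('The account does not exist')\n",
+ "tweets"
+ ]
+ },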
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### IMDB's Top 250 data (movie name, Initial release, director name and stars) as a pandas dataframe"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This is the url you will scrape in this exercise \n",
+ "url = 'https://www.imdb.com/chart/top'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Find the live weather report (temperature, wind speed, description and weather) of a given city."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#https://openweathermap.org/current\n",
+ "city = city=input('Enter the city:')\n",
+ "url = 'http://api.openweathermap.org/data/2.5/weather?'+'q='+city+'&APPID=b35975e18dc93725acb092f7272cc6b8&units=metric'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# your code"
+ ]
+ },
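+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "*A sketch of one way to finish this: the endpoint returns JSON, so no HTML parsing is needed. The field names (`main.temp`, `wind.speed`, `weather[0].description`, `weather[0].main`) follow the OpenWeatherMap current-weather schema linked above, and with `units=metric` the temperature is in °C and the wind speed in m/s.*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: parse the JSON response instead of scraping HTML\n",
+ "resp = requests.get(url).json()\n",
+ "report = {'city': city,\n",
+ "          'temperature (C)': resp['main']['temp'],\n",
+ "          'wind speed (m/s)': resp['wind']['speed'],\n",
+ "          'description': resp['weather'][0]['description'],\n",
+ "          'weather': resp['weather'][0]['main']}\n",
+ "report"
+ ]
+ },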
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Book name, price and stock availability as a pandas dataframe."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This is the url you will scrape in this exercise. \n",
+ "# It is a fictional bookstore created to be scraped. \n",
+ "url = 'http://books.toscrape.com/'\n",
+ "html = requests.get(url).content"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code\n",
+ "soup = BeautifulSoup(html, 'lxml')\n",
+ "\n",
+ "datetime = soup.find_all('td', {'class':'tabev6'})\n",
+ "lyl = soup.find_all('td', {'class':'tabev1'})\n",
+ "region = soup.find_all('td', {'class':'tb_region'})\n",
+ "\n",
+ "limpdate =[str(fecha.text.replace('\\xa0', ' ')) for fecha in datetime]\n",
+ "datefull = []\n",
+ "for fecha in limpdate:\n",
+ " fecha = fecha.replace('earthquake', '')\n",
+ " fecha =fecha.replace('ago', '')\n",
+ " datefull.append(fecha)\n",
+ " \n",
+ "lat =[]\n",
+ "lon = []\n",
+ "for latlon in range(len(lyl)):\n",
+ " if latlon %2 == 0:\n",
+ " lat.append(lyl[latlon].text.replace('\\xa0', ''))\n",
+ " else:\n",
+ " lon.append(lyl[latlon].text.replace('\\xa0', ''))\n",
+ " \n",
+ "lugar =[fecha.text for fecha in region]\n",
+ "df = pd.DataFrame({'Date and time':datefull, 'Latitud':lat, 'Longitud':lon, 'Region':lugar})"
+ ]
+ },
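+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "*The cell above only covers the first page. A sketch of how the same parsing could walk the paginated catalogue; the `catalogue/page-N.html` URL pattern is an assumption about books.toscrape.com.*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: repeat the title extraction over a few catalogue pages\n",
+ "all_names = []\n",
+ "for n in range(1, 4):  # first few pages only, for illustration\n",
+ "    page = requests.get('http://books.toscrape.com/catalogue/page-%d.html' % n).content\n",
+ "    page_soup = BeautifulSoup(page, 'lxml')\n",
+ "    all_names += [a['title'] for h3 in page_soup.find_all('h3')\n",
+ "                  for a in h3.find_all('a', title=True)]\n",
+ "len(all_names)"
+ ]
+ },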
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/your-code/main.ipynb b/your-code/main.ipynb
index 812f7a4..256b315 100644
--- a/your-code/main.ipynb
+++ b/your-code/main.ipynb
@@ -40,7 +40,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -48,12 +48,13 @@
"from bs4 import BeautifulSoup\n",
"import pandas as pd\n",
"# from pprint import pprint\n",
- "# from lxml import html\n",
- "# from lxml.html import fromstring\n",
+ "from lxml import html\n",
+ "from lxml.html import fromstring\n",
"# import urllib.request\n",
"# from urllib.request import urlopen\n",
- "# import random\n",
- "# import re\n",
+ "import random\n",
+ "import re\n",
+ "import html5lib\n",
"# import scrapy"
]
},
@@ -64,25 +65,6 @@
"#### Download, parse (using BeautifulSoup), and print the content from the Trending Developers page from GitHub:"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# This is the url you will scrape in this exercise\n",
- "url = 'https://github.com/trending/developers'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#your code"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -134,11 +116,108 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 361,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This is the url you will scrape in this exercise\n",
+ "url = 'https://github.com/trending/developers'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 362,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code\n",
+ "conts = requests.get(url).content"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 363,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "git_soup = BeautifulSoup(conts, 'html5lib')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 364,
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "names = git_soup.find_all('h1', {\"class\":\"h3 lh-condensed\"})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 365,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nick = git_soup.find_all('a', {'class':\"link-gray\"})\n",
+ "nick = nick[19:44]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 366,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "nicks = [i.text for i in nick]\n",
+ "names_s = [i.text for i in names]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 367,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Eric Ma (ericmjl)',\n",
+ " 'Federico Brigante (bfred-it)',\n",
+ " 'Kyle Roach (iRoachie)',\n",
+ " 'Olle Jonsson (olleolleolle)',\n",
+ " 'Nikita Sobolev (sobolevn)',\n",
+ " 'Frank S. Thomas (fthomas)',\n",
+ " 'syuilo (syuilo)',\n",
+ " 'Ives van Hoorne (CompuIves)',\n",
+ " 'Paulus Schoutsen (balloob)',\n",
+ " 'Sarah Drasner (sdras)',\n",
+ " 'Stefanos Kornilios Mitsis Poiitidis (skmp)',\n",
+ " 'Jan Hovancik (hovancik)',\n",
+ " 'Andreas Mueller (amueller)',\n",
+ " 'Guillaume Gomez (GuillaumeGomez)',\n",
+ " 'Matt Holt (mholt)',\n",
+ " 'Clifford Wolf (cliffordwolf)',\n",
+ " 'Franck Nijhof (frenck)',\n",
+ " 'Joe Block (unixorn)',\n",
+ " 'Andrei Neagoie (aneagoie)',\n",
+ " 'Jack Lloyd (randombit)',\n",
+ " 'Guillermo Rauch (rauchg)',\n",
+ " 'Tim Griesser (tgriesser)',\n",
+ " 'Jameson Nash (vtjnash)',\n",
+ " 'Anderson Banihirwe (andersy005)',\n",
+ " 'Danny Ryan (djrtwo)']"
+ ]
+ },
+ "execution_count": 367,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "for i in range(len(names_s)):\n",
+ " names_s[i] = names_s[i] +\" \"+ \"(\" + nicks[i] + \")\"\n",
+ "names_s"
]
},
{
@@ -152,7 +231,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 368,
"metadata": {},
"outputs": [],
"source": [
@@ -162,11 +241,81 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 369,
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "#your code\n",
+ "repos = requests.get(url).content\n",
+ "repo_soup = BeautifulSoup(repos, 'html5lib')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 370,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "reposi = repo_soup.find_all('h1', {\"class\":\"h3 lh-condensed\"})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 371,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "repos = [i.text for i in reposi]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 372,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['gto76/python-cheatsheet',\n",
+ " 'j3ssie/Osmedeus',\n",
+ " 'tangzixiang0304/Shielded_detector',\n",
+ " 'uber/ludwig',\n",
+ " 'xinshuoweng/AB3DMOT',\n",
+ " 'NVlabs/stylegan',\n",
+ " 'dagster-io/dagster',\n",
+ " 'tensorflow/models',\n",
+ " 'eragonruan/text-detection-ctpn',\n",
+ " 'sherlock-project/sherlock',\n",
+ " 'deepfakes/faceswap',\n",
+ " 'nbei/Deep-Flow-Guided-Video-Inpainting',\n",
+ " 'iovisor/bcc',\n",
+ " 'Roibal/Cryptocurrency-Trading-Bots-Python-Beginner-Advance',\n",
+ " 'NVIDIA/DeepLearningExamples',\n",
+ " 'BlackHC/tfpyth',\n",
+ " 'clovaai/deep-text-recognition-benchmark',\n",
+ " 'tkat0/PyTorch_BlazeFace',\n",
+ " 'OpenMined/PySyft',\n",
+ " 'CoreyMSchafer/code_snippets',\n",
+ " 'public-apis/public-apis',\n",
+ " 'd2l-ai/d2l-zh',\n",
+ " 'apache/airflow',\n",
+ " 'beecost/bee-university',\n",
+ " 'sundowndev/PhoneInfoga']"
+ ]
+ },
+ "execution_count": 372,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "repos = [i.replace('\\n', '') for i in repos]\n",
+ "repos = [i.replace(' ', '') for i in repos]\n",
+ "repos"
]
},
{
@@ -178,7 +327,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 373,
"metadata": {},
"outputs": [],
"source": [
@@ -188,11 +337,87 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 374,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "#your code\n",
+ "Walt = requests.get(url).content\n",
+ "walt_soup = BeautifulSoup(Walt, 'html')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 375,
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "imgs = walt_soup.find_all('div', {\"class\":\"thumbinner\"})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 376,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['/wiki/File:Walt_Disney_envelope_ca._1921.jpg',\n",
+ " '/wiki/File:Walt_Disney_envelope_ca._1921.jpg',\n",
+ " '//upload.wikimedia.org/wikipedia/commons/4/4d/Newman_Laugh-O-Gram_%281921%29.webm',\n",
+ " '/wiki/File:Newman_Laugh-O-Gram_(1921).webm',\n",
+ " '/wiki/File:Trolley_Troubles_poster.jpg',\n",
+ " '/wiki/File:Trolley_Troubles_poster.jpg',\n",
+ " '/wiki/Trolley_Troubles',\n",
+ " '/wiki/File:Walt_Disney_and_his_cartoon_creation_%22Mickey_Mouse%22_-_National_Board_of_Review_Magazine.jpg',\n",
+ " '/wiki/File:Walt_Disney_and_his_cartoon_creation_%22Mickey_Mouse%22_-_National_Board_of_Review_Magazine.jpg',\n",
+ " '/wiki/Mickey_Mouse',\n",
+ " '/wiki/File:Steamboat-willie.jpg',\n",
+ " '/wiki/File:Steamboat-willie.jpg',\n",
+ " '/wiki/Mickey_Mouse',\n",
+ " '/wiki/Steamboat_Willie',\n",
+ " '/wiki/File:Walt_Disney_1935.jpg',\n",
+ " '/wiki/File:Walt_Disney_1935.jpg',\n",
+ " '/wiki/File:Walt_Disney_Snow_white_1937_trailer_screenshot_(13).jpg',\n",
+ " '/wiki/File:Walt_Disney_Snow_white_1937_trailer_screenshot_(13).jpg',\n",
+ " '/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)',\n",
+ " '/wiki/File:Disney_drawing_goofy.jpg',\n",
+ " '/wiki/File:Disney_drawing_goofy.jpg',\n",
+ " '/wiki/Goofy',\n",
+ " '/wiki/File:DisneySchiphol1951.jpg',\n",
+ " '/wiki/File:DisneySchiphol1951.jpg',\n",
+ " '/wiki/File:WaltDisneyplansDisneylandDec1954.jpg',\n",
+ " '/wiki/File:WaltDisneyplansDisneylandDec1954.jpg',\n",
+ " '/wiki/Disneyland',\n",
+ " '/wiki/Orange_County,_California',\n",
+ " '/wiki/File:Walt_disney_portrait_right.jpg',\n",
+ " '/wiki/File:Walt_disney_portrait_right.jpg',\n",
+ " '/wiki/File:Walt_Disney_Grave.JPG',\n",
+ " '/wiki/File:Walt_Disney_Grave.JPG',\n",
+ " '/wiki/File:Roy_O._Disney_with_Company_at_Press_Conference.jpg',\n",
+ " '/wiki/File:Roy_O._Disney_with_Company_at_Press_Conference.jpg',\n",
+ " '/wiki/Roy_O._Disney',\n",
+ " '/wiki/File:Disney_Display_Case.JPG',\n",
+ " '/wiki/File:Disney_Display_Case.JPG',\n",
+ " '/wiki/The_Walt_Disney_Family_Museum',\n",
+ " '/wiki/File:Disney1968.jpg',\n",
+ " '/wiki/File:Disney1968.jpg']"
+ ]
+ },
+ "execution_count": 376,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "l = []\n",
+ "for i in range(len(imgs)):\n",
+ " for img in (imgs[i].find_all('a', href=True)):\n",
+ " l.append(img['href'])\n",
+ "l"
]
},
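+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "*The list above holds the `href` targets of the thumbnails. If the image files themselves are wanted, a small variation (sketch) reads each thumbnail's `img` tag instead:*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: pull the image sources from the same thumbnail divs\n",
+ "srcs = [img['src'] for div in imgs for img in div.find_all('img', src=True)]\n",
+ "srcs[:5]"
+ ]
+ },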
{
@@ -209,7 +434,20 @@
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url ='https://en.wikipedia.org/wiki/Python' "
+ "url ='https://en.wikipedia.org/wiki/Python' \n",
+ "html = requests.get(url).content"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code\n",
+ "soup = BeautifulSoup(html, 'lxml')\n",
+ "ul = soup.find_all('ul')\n",
+ "ul= ul[2:14]"
]
},
{
@@ -218,7 +456,8 @@
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "l = [img['href'] for i in range(len(ul)) for img in (ul[i].find_all('a', href=True)) ] \n",
+ "l"
]
},
{
@@ -235,7 +474,8 @@
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url = 'http://uscode.house.gov/download/download.shtml'"
+ "url = 'http://uscode.house.gov/download/download.shtml'\n",
+ "html = requests.get(url).content"
]
},
{
@@ -244,7 +484,10 @@
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "#your code\n",
+ "soup = BeautifulSoup(html, 'lxml')\n",
+ "#soup\n",
+ "titles = soup.find_all('div', {'class':'usctitlechanged'})"
]
},
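+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "*A short sketch to actually read out which titles changed: each flagged `div` carries the title's name in its text.*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: strip the whitespace around each flagged title\n",
+ "changed = [t.text.strip() for t in titles]\n",
+ "changed"
+ ]
+ },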
{
@@ -261,7 +504,19 @@
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url = 'https://www.fbi.gov/wanted/topten'"
+ "url = 'https://www.fbi.gov/wanted/topten'\n",
+ "html = requests.get(url).content"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code \n",
+ "soup = BeautifulSoup(html, 'lxml')\n",
+ "titles = soup.find_all('h3', {'class':'title'})"
]
},
{
@@ -270,7 +525,8 @@
"metadata": {},
"outputs": [],
"source": [
- "#your code "
+ "buscados = [nombre.text.replace(\"\\n\", \"\") for nombre in titles]\n",
+ "buscados"
]
},
{
@@ -287,7 +543,60 @@
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url = 'https://www.emsc-csem.org/Earthquake/'"
+ "url = 'https://www.emsc-csem.org/Earthquake/'\n",
+ "html = requests.get(url).content"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code\n",
+ "soup = BeautifulSoup(html, 'lxml')\n",
+ "\n",
+ "datetime = soup.find_all('td', {'class':'tabev6'})\n",
+ "lyl = soup.find_all('td', {'class':'tabev1'})\n",
+ "region = soup.find_all('td', {'class':'tb_region'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "limpdate =[str(fecha.text.replace('\\xa0', ' ')) for fecha in datetime]\n",
+ "datefull = []\n",
+ "for fecha in limpdate:\n",
+ " fecha = fecha.replace('earthquake', '')\n",
+ " fecha =fecha.replace('ago', '')\n",
+ " datefull.append(fecha)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lat =[]\n",
+ "lon = []\n",
+ "for latlon in range(len(lyl)):\n",
+ " if latlon %2 == 0:\n",
+ " lat.append(lyl[latlon].text.replace('\\xa0', ''))\n",
+ " else:\n",
+ " lon.append(lyl[latlon].text.replace('\\xa0', ''))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lugar =[fecha.text for fecha in region]"
]
},
{
@@ -296,7 +605,8 @@
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "df = pd.DataFrame({'Date and time':datefull, 'Latitud':lat, 'Longitud':lon, 'Region':lugar})\n",
+ "df.head(20)"
]
},
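+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "*A follow-up sketch: the scraped coordinates arrive as strings, so converting them with `pd.to_numeric` makes the dataframe usable for sorting or plotting; `errors='coerce'` is a defensive choice for any stray non-numeric cell.*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: make the coordinate columns numeric\n",
+ "df['Latitud'] = pd.to_numeric(df['Latitud'], errors='coerce')\n",
+ "df['Longitud'] = pd.to_numeric(df['Longitud'], errors='coerce')\n",
+ "df.dtypes"
+ ]
+ },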
{
@@ -308,7 +618,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 291,
"metadata": {},
"outputs": [],
"source": [
@@ -318,11 +628,125 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 304,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code\n",
+ "hackatons_soup = requests.get(url).content\n",
+ "hackatons_soup = BeautifulSoup(hackatons_soup, 'html5lib')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 342,
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "hackis = hackatons_soup.find_all('div', {'class':'card-body'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 343,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Code Geist Hackathon by SefrWahed\\u20037/29/2019\\n\\n\\u2003Cairo, Egypt',\n",
+ " 'The Code Factor\\u20035/21/2019\\n\\n\\u2003Milano, Italy',\n",
+ " 'TECHFEST MUNICH\\u20039/6/2019\\n\\n\\u2003Munich, Germany',\n",
+ " 'Galileo App Competition\\u20031/31/2019\\n\\n\\u2003Prague, Czech Republic']"
+ ]
+ },
+ "execution_count": 343,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "hacks = [hackis[i].text for i in range(len(hackis))]\n",
+ "hacks = [i.split(\"\\u2003\") for i in hacks]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 348,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Code Geist Hackathon by SefrWahed | \n",
+ " 7/29/2019 | \n",
+ " Cairo, Egypt | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " The Code Factor | \n",
+ " 5/21/2019 | \n",
+ " Milano, Italy | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " TECHFEST MUNICH | \n",
+ " 9/6/2019 | \n",
+ " Munich, Germany | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Galileo App Competition | \n",
+ " 1/31/2019 | \n",
+ " Prague, Czech Republic | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2\n",
+ "0 Code Geist Hackathon by SefrWahed 7/29/2019 Cairo, Egypt\n",
+ "1 The Code Factor 5/21/2019 Milano, Italy\n",
+ "2 TECHFEST MUNICH 9/6/2019 Munich, Germany\n",
+ "3 Galileo App Competition 1/31/2019 Prague, Czech Republic"
+ ]
+ },
+ "execution_count": 348,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "hacks = pd.DataFrame(hacks)\n",
+ "hacks[1] = hacks[1].apply(lambda x:x.replace('\\n', ''))\n",
+ "hacks"
]
},
{
@@ -353,11 +777,39 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 300,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Check the tweets of an account:shiroiusagi4486\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'76'"
+ ]
+ },
+ "execution_count": 300,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "#your code"
+ "#your code\n",
+ "usr_input = input('Check the tweets of an account:')\n",
+ "try:\n",
+ " twitter_follows = requests.get('https://twitter.com/'+usr_input).content\n",
+ "except:\n",
+ " print('The account does not exist')\n",
+ "twitter_follows = BeautifulSoup(twitter_follows, 'lxml')\n",
+ "twitter_follows = twitter_follows.find_all('span', {'class': 'ProfileNav-value'})\n",
+ "twitter_follows[0]['data-count']\n",
+ " \n",
+ " "
]
},
{
@@ -388,11 +840,41 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 298,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Check the followers of an account:uaquiro\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'302'"
+ ]
+ },
+ "execution_count": 298,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "#your code"
+ "#your code\n",
+ "usr_input = input('Check the followers of an account:')\n",
+ "try:\n",
+ " twitter_follows = requests.get('https://twitter.com/'+usr_input).content\n",
+ "except:\n",
+ " print('The account does not exist')\n",
+ "twitter_follows = BeautifulSoup(twitter_follows, 'lxml')\n",
+ "twitter_follows = twitter_follows.find_all('span', {'class': 'ProfileNav-value'})\n",
+ "twitter_follows[2]['data-count']\n",
+ " \n",
+ " \n",
+ " \n",
+ " "
]
},
{
@@ -404,7 +886,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 207,
"metadata": {},
"outputs": [],
"source": [
@@ -414,11 +896,243 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 208,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code\n",
+ "wikissup = requests.get(url).content\n",
+ "wikissup = BeautifulSoup(wikissup, 'lxml')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 211,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "langs = wikissup.find_all('div', {'class':'central-featured'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 268,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['English',\n",
+ " '日本語',\n",
+ " 'Español',\n",
+ " 'Deutsch',\n",
+ " 'Русский',\n",
+ " 'Français',\n",
+ " 'Italiano',\n",
+ " '中文',\n",
+ " 'Português',\n",
+ " 'Polski']"
+ ]
+ },
+ "execution_count": 268,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "langu = []\n",
+ "for i in range(len(langs)):\n",
+ " langu.append(langs[i].find_all('strong'))\n",
+ "langu = [i for i in langu[0]]\n",
+ "langu = [i.text for i in langu]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 245,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[['English5\\xa0892\\xa0000',\n",
+ " ' articles日本語1\\xa0159\\xa0000',\n",
+ " ' 記事Español1\\xa0532\\xa0000',\n",
+ " ' artículosDeutsch2\\xa0323\\xa0000',\n",
+ " ' ArtikelРусский1\\xa0556\\xa0000',\n",
+ " ' статейFrançais2\\xa0123\\xa0000',\n",
+ " ' articlesItaliano1\\xa0541\\xa0000',\n",
+ " ' voci中文1\\xa0065\\xa0000',\n",
+ " ' 條目Português1\\xa0010\\xa0000',\n",
+ " ' artigosPolski1\\xa0346\\xa0000',\n",
+ " ' haseł']]"
+ ]
+ },
+ "execution_count": 245,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "numbers = [] \n",
+ "for i in range(len(langs)):\n",
+ " numbers.append(langs[i].find_all('small'))\n",
+ "numbers = [i.text for i in langs]\n",
+ "numbers = [i.replace('\\n', '') for i in numbers]\n",
+ "numbers = [i.split('+') for i in numbers]\n",
+ "numbers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 257,
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "numbers = [re.findall('\\d', i) for i in liss]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 258,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['5892000',\n",
+ " '1159000',\n",
+ " '1532000',\n",
+ " '2323000',\n",
+ " '1556000',\n",
+ " '2123000',\n",
+ " '1541000',\n",
+ " '1065000',\n",
+ " '1010000',\n",
+ " '1346000']"
+ ]
+ },
+ "execution_count": 258,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "numberss = [\"\".join(numbers[i]) for i in range(len(numbers))]\n",
+ "numbers = numberss[:-1]\n",
+ "numbers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 269,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Language | \n",
+ " Articles | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " English | \n",
+ " 5892000 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 日本語 | \n",
+ " 1159000 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Español | \n",
+ " 1532000 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Deutsch | \n",
+ " 2323000 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Русский | \n",
+ " 1556000 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " Français | \n",
+ " 2123000 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " Italiano | \n",
+ " 1541000 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 中文 | \n",
+ " 1065000 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " Português | \n",
+ " 1010000 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " Polski | \n",
+ " 1346000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Language Articles\n",
+ "0 English 5892000\n",
+ "1 日本語 1159000\n",
+ "2 Español 1532000\n",
+ "3 Deutsch 2323000\n",
+ "4 Русский 1556000\n",
+ "5 Français 2123000\n",
+ "6 Italiano 1541000\n",
+ "7 中文 1065000\n",
+ "8 Português 1010000\n",
+ "9 Polski 1346000"
+ ]
+ },
+ "execution_count": 269,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "d = {'Language':langu, 'Articles':numbers}\n",
+ "Wiki_df = pd.DataFrame(d)\n",
+ "Wiki_df"
]
},
{
@@ -430,7 +1144,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 191,
"metadata": {},
"outputs": [],
"source": [
@@ -440,11 +1154,166 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 192,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code \n",
+ "datasets_soup = requests.get(url).content\n",
+ "datasets_soup = BeautifulSoup(datasets_soup, 'lxml')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 195,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['Business and economy', 'Crime and justice', 'Defence', 'Education', 'Environment', 'Government', 'Government spending', 'Health', 'Mapping', 'Society', 'Towns and cities', 'Transport']\n"
+ ]
+ }
+ ],
+ "source": [
+ "names = datasets_soup.find_all('h2')\n",
+ "names_s1 = [i.text for i in names]\n",
+ "\n",
+ "print(names_s1)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 204,
"metadata": {},
"outputs": [],
"source": [
- "#your code "
+ "descp = datasets_soup.find_all('p')\n",
+ "descp = descp[5:-2]\n",
+ "descp = [i.text for i in descp]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 206,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Dataset | \n",
+ " Description | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Business and economy | \n",
+ " Small businesses, industry, imports, exports a... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Crime and justice | \n",
+ " Courts, police, prison, offenders, borders and... | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Defence | \n",
+ " Armed forces, health and safety, search and re... | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Education | \n",
+ " Students, training, qualifications and the Nat... | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Environment | \n",
+ " Weather, flooding, rivers, air quality, geolog... | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " Government | \n",
+ " Staff numbers and pay, local councillors and d... | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " Government spending | \n",
+ " Includes all payments by government department... | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " Health | \n",
+ " Includes smoking, drugs, alcohol, medicine per... | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " Mapping | \n",
+ " Addresses, boundaries, land ownership, aerial ... | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " Society | \n",
+ " Employment, benefits, household finances, pove... | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " Towns and cities | \n",
+ " Includes housing, urban planning, leisure, was... | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " Transport | \n",
+ " Airports, roads, freight, electric vehicles, p... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Dataset Description\n",
+ "0 Business and economy Small businesses, industry, imports, exports a...\n",
+ "1 Crime and justice Courts, police, prison, offenders, borders and...\n",
+ "2 Defence Armed forces, health and safety, search and re...\n",
+ "3 Education Students, training, qualifications and the Nat...\n",
+ "4 Environment Weather, flooding, rivers, air quality, geolog...\n",
+ "5 Government Staff numbers and pay, local councillors and d...\n",
+ "6 Government spending Includes all payments by government department...\n",
+ "7 Health Includes smoking, drugs, alcohol, medicine per...\n",
+ "8 Mapping Addresses, boundaries, land ownership, aerial ...\n",
+ "9 Society Employment, benefits, household finances, pove...\n",
+ "10 Towns and cities Includes housing, urban planning, leisure, was...\n",
+ "11 Transport Airports, roads, freight, electric vehicles, p..."
+ ]
+ },
+ "execution_count": 206,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "d = {'Dataset':names_s1, 'Description': descp}\n",
+ "DataSets = pd.DataFrame(d)\n",
+ "DataSets"
]
},
{
@@ -456,7 +1325,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 349,
"metadata": {},
"outputs": [],
"source": [
@@ -466,11 +1335,283 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 350,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#your code\n",
+ "table_lang = requests.get(url).content\n",
+ "table_soup = BeautifulSoup(table_lang, 'html')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 351,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tables = table_soup.find_all('tr')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 352,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tabless = []\n",
+ "for i in range(len(tables)):\n",
+ " tabless.append(tables[i])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 353,
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "tal = [tabless[i].text.split(\"\\n\\n\") for i in range(len(tabless))]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 354,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Rank | \n",
+ " Rank | \n",
+ " Language | \n",
+ " Primary Country | \n",
+ " TotalCountries[a] | \n",
+ " Speakers(millions) | \n",
+ " % of the World population\\n(March 2019)[7] | \n",
+ " \\nMacrolanguage | \n",
+ " Language familyBranch | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " — | \n",
+ " 1 | \n",
+ " Chinese (macrolanguage) | \n",
+ " China | \n",
+ " 39 | \n",
+ " 1,311 | \n",
+ " 17.026 | \n",
+ " | \n",
+ " Sino-TibetanSinitic | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1 | \n",
+ " — | \n",
+ " Mandarin | \n",
+ " China | \n",
+ " 13 | \n",
+ " 918 | \n",
+ " 11.922 | \n",
+ " Chinese | \n",
+ " Sino-TibetanSinitic | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " Spanish | \n",
+ " Spain | \n",
+ " 31 | \n",
+ " 460 | \n",
+ " 5.974 | \n",
+ " | \n",
+ " Indo-EuropeanRomance | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " English | \n",
+ " United Kingdom | \n",
+ " 137 | \n",
+ " 379 | \n",
+ " 4.922 | \n",
+ " | \n",
+ " Indo-EuropeanGermanic | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 4 | \n",
+ " 4 | \n",
+ " Hindi | \n",
+ " India | \n",
+ " 4 | \n",
+ " 341 | \n",
+ " 4.429 | \n",
+ " | \n",
+ " Indo-EuropeanIndo-Aryan | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " — | \n",
+ " 5 | \n",
+ " Arabic (macrolanguage) | \n",
+ " Saudi Arabia | \n",
+ " 59 | \n",
+ " 319 | \n",
+ " 4.143 | \n",
+ " | \n",
+ " AfroasiaticSemitic | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " Bengali | \n",
+ " Bangladesh | \n",
+ " 4 | \n",
+ " 228 | \n",
+ " 2.961 | \n",
+ " | \n",
+ " Indo-EuropeanIndo-Aryan | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " Portuguese | \n",
+ " Portugal | \n",
+ " 15 | \n",
+ " 221 | \n",
+ " 2.870 | \n",
+ " | \n",
+ " Indo-EuropeanRomance | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " Russian | \n",
+ " Russian Federation | \n",
+ " 19 | \n",
+ " 154 | \n",
+ " 2.000 | \n",
+ " | \n",
+ " Indo-EuropeanBalto-Slavic | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " Japanese | \n",
+ " Japan | \n",
+ " 2 | \n",
+ " 128 | \n",
+ " 1.662 | \n",
+ " | \n",
+ " JaponicJapanese | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 3 \\\n",
+ "0 Rank Rank Language Primary Country \n",
+ "1 — 1 Chinese (macrolanguage) China \n",
+ "2 1 — Mandarin China \n",
+ "3 2 2 Spanish Spain \n",
+ "4 3 3 English United Kingdom \n",
+ "5 4 4 Hindi India \n",
+ "6 — 5 Arabic (macrolanguage) Saudi Arabia \n",
+ "7 5 6 Bengali Bangladesh \n",
+ "8 6 7 Portuguese Portugal \n",
+ "9 7 8 Russian Russian Federation \n",
+ "10 8 9 Japanese Japan \n",
+ "\n",
+ " 4 5 \\\n",
+ "0 TotalCountries[a] Speakers(millions) \n",
+ "1 39 1,311 \n",
+ "2 13 918 \n",
+ "3 31 460 \n",
+ "4 137 379 \n",
+ "5 4 341 \n",
+ "6 59 319 \n",
+ "7 4 228 \n",
+ "8 15 221 \n",
+ "9 19 154 \n",
+ "10 2 128 \n",
+ "\n",
+ " 6 7 \\\n",
+ "0 % of the World population\\n(March 2019)[7] \\nMacrolanguage \n",
+ "1 17.026 \n",
+ "2 11.922 Chinese \n",
+ "3 5.974 \n",
+ "4 4.922 \n",
+ "5 4.429 \n",
+ "6 4.143 \n",
+ "7 2.961 \n",
+ "8 2.870 \n",
+ "9 2.000 \n",
+ "10 1.662 \n",
+ "\n",
+ " 8 \n",
+ "0 Language familyBranch \n",
+ "1 Sino-TibetanSinitic \n",
+ "2 Sino-TibetanSinitic \n",
+ "3 Indo-EuropeanRomance \n",
+ "4 Indo-EuropeanGermanic \n",
+ "5 Indo-EuropeanIndo-Aryan \n",
+ "6 AfroasiaticSemitic \n",
+ "7 Indo-EuropeanIndo-Aryan \n",
+ "8 Indo-EuropeanRomance \n",
+ "9 Indo-EuropeanBalto-Slavic \n",
+ "10 JaponicJapanese "
+ ]
+ },
+ "execution_count": 354,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "talcual = pd.DataFrame(tal)\n",
+ "talcual = talcual[:11]\n",
+ "talcual[0] = talcual[0].apply(lambda x: x.replace('\\n', ''))\n",
+ "talcual[8] = talcual[8].apply(lambda x: x.replace('\\n', ''))\n",
+ "talcual"
]
},
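+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "*Since the target is a plain HTML `<table>`, `pandas.read_html` can do the row/column splitting in one call; a sketch, where taking index `[0]` assumes the ranking table is the first table on the page.*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: let pandas parse the table directly\n",
+ "langs_df = pd.read_html(url)[0]\n",
+ "langs_df.head(10)"
+ ]
+ },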
{
@@ -516,7 +1657,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 155,
"metadata": {},
"outputs": [],
"source": [
@@ -530,7 +1671,10 @@
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "#your code\n",
+ "fims_get = requests.get(url).content\n",
+ "films_soup = BeautifulSoup(films_get, 'lxml')\n",
+ "films_names = films_soup.find_all(<"
]
},
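+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "*A possible continuation (sketch): on the 2019 chart each `td.titleColumn` holds the film's link, whose `title` attribute lists the director and stars, plus the year in a `span.secondaryInfo`; those selectors are assumptions about IMDB's markup at the time.*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: build the dataframe from the titleColumn cells found above\n",
+ "rows = []\n",
+ "for td in films_names:\n",
+ "    a = td.find('a')\n",
+ "    year = td.find('span', {'class': 'secondaryInfo'}).text.strip('()')\n",
+ "    rows.append({'Movie': a.text, 'Initial release': year,\n",
+ "                 'Director and stars': a.get('title', '')})\n",
+ "imdb_df = pd.DataFrame(rows)\n",
+ "imdb_df.head()"
+ ]
+ },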
{
@@ -569,7 +1713,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 355,
"metadata": {},
"outputs": [],
"source": [
@@ -580,11 +1724,135 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 356,
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "#your code\n",
+ "books = requests.get(url).content\n",
+ "books_soup = BeautifulSoup(books, 'html')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 357,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "names = books_soup.find_all('h3')\n",
+ "names_list = []\n",
+ "for i in range(len(names)):\n",
+ " for name in (names[i].find_all('a', title=True)):\n",
+ " names_list.append(name['title'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 358,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "prices = books_soup.find_all('p', {'class':'price_color'})\n",
+ "\n",
+ "prices_list = [prices[i].text for i in range(len(prices))]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 359,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "stock = books_soup.find_all('p', {'class':'instock availability'})\n",
+ "stock = [stock[i].text for i in range(len(stock))]\n",
+ "stock = [i.replace('\\n', \"\") for i in stock]\n",
+ "stock = [re.sub('\\s\\s+', '', stock[i]) for i in range(len(stock))] "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 360,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Book Name | \n",
+ " Price | \n",
+ " Stock Availability | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " A Light in the Attic | \n",
+ " £51.77 | \n",
+ " In stock | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Tipping the Velvet | \n",
+ " £53.74 | \n",
+ " In stock | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Soumission | \n",
+ " £50.10 | \n",
+ " In stock | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Sharp Objects | \n",
+ " £47.82 | \n",
+ " In stock | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Sapiens: A Brief History of Humankind | \n",
+ " £54.23 | \n",
+ " In stock | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Book Name Price Stock Availability\n",
+ "0 A Light in the Attic £51.77 In stock\n",
+ "1 Tipping the Velvet £53.74 In stock\n",
+ "2 Soumission £50.10 In stock\n",
+ "3 Sharp Objects £47.82 In stock\n",
+ "4 Sapiens: A Brief History of Humankind £54.23 In stock"
+ ]
+ },
+ "execution_count": 360,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "d = {\"Book Name\":names_list, \"Price\":prices_list, \"Stock Availability\": stock}\n",
+ "books = pd.DataFrame(d)\n",
+ "books.head()"
]
}
],