diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..12d120e Binary files /dev/null and b/.DS_Store differ diff --git a/your-code/.DS_Store b/your-code/.DS_Store new file mode 100644 index 0000000..a870c7a Binary files /dev/null and b/your-code/.DS_Store differ diff --git a/your-code/.ipynb_checkpoints/main-checkpoint.ipynb b/your-code/.ipynb_checkpoints/main-checkpoint.ipynb index 812f7a4..256b315 100644 --- a/your-code/.ipynb_checkpoints/main-checkpoint.ipynb +++ b/your-code/.ipynb_checkpoints/main-checkpoint.ipynb @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -48,12 +48,13 @@ "from bs4 import BeautifulSoup\n", "import pandas as pd\n", "# from pprint import pprint\n", - "# from lxml import html\n", - "# from lxml.html import fromstring\n", + "from lxml import html\n", + "from lxml.html import fromstring\n", "# import urllib.request\n", "# from urllib.request import urlopen\n", - "# import random\n", - "# import re\n", + "import random\n", + "import re\n", + "import html5lib\n", "# import scrapy" ] }, @@ -64,25 +65,6 @@ "#### Download, parse (using BeautifulSoup), and print the content from the Trending Developers page from GitHub:" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# This is the url you will scrape in this exercise\n", - "url = 'https://github.com/trending/developers'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#your code" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -134,11 +116,108 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 361, + "metadata": {}, + "outputs": [], + "source": [ + "# This is the url you will scrape in this exercise\n", + "url = 'https://github.com/trending/developers'" + ] + }, + { + "cell_type": "code", + "execution_count": 362, + "metadata": {}, + "outputs": [], + "source": [ + "#your code\n", + "conts = requests.get(url).content" + ] + }, + { + "cell_type": "code", + "execution_count": 363, + "metadata": {}, + "outputs": [], + "source": [ + "git_soup = BeautifulSoup(conts, 'html5lib')" + ] + }, + { + "cell_type": "code", + "execution_count": 364, "metadata": {}, "outputs": [], "source": [ - "#your code" + "names = git_soup.find_all('h1', {\"class\":\"h3 lh-condensed\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 365, + "metadata": {}, + "outputs": [], + "source": [ + "nick = git_soup.find_all('a', {'class':\"link-gray\"})\n", + "nick = nick[19:44]" + ] + }, + { + "cell_type": "code", + "execution_count": 366, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "nicks = [i.text for i in nick]\n", + "names_s = [i.text for i in names]" + ] + }, + { + "cell_type": "code", + "execution_count": 367, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Eric Ma (ericmjl)',\n", + " 'Federico Brigante (bfred-it)',\n", + " 'Kyle Roach (iRoachie)',\n", + " 'Olle Jonsson (olleolleolle)',\n", + " 'Nikita Sobolev (sobolevn)',\n", + " 'Frank S. 
Thomas (fthomas)',\n", + " 'syuilo (syuilo)',\n", + " 'Ives van Hoorne (CompuIves)',\n", + " 'Paulus Schoutsen (balloob)',\n", + " 'Sarah Drasner (sdras)',\n", + " 'Stefanos Kornilios Mitsis Poiitidis (skmp)',\n", + " 'Jan Hovancik (hovancik)',\n", + " 'Andreas Mueller (amueller)',\n", + " 'Guillaume Gomez (GuillaumeGomez)',\n", + " 'Matt Holt (mholt)',\n", + " 'Clifford Wolf (cliffordwolf)',\n", + " 'Franck Nijhof (frenck)',\n", + " 'Joe Block (unixorn)',\n", + " 'Andrei Neagoie (aneagoie)',\n", + " 'Jack Lloyd (randombit)',\n", + " 'Guillermo Rauch (rauchg)',\n", + " 'Tim Griesser (tgriesser)',\n", + " 'Jameson Nash (vtjnash)',\n", + " 'Anderson Banihirwe (andersy005)',\n", + " 'Danny Ryan (djrtwo)']" + ] + }, + "execution_count": 367, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "for i in range(len(names_s)):\n", + " names_s[i] = names_s[i] +\" \"+ \"(\" + nicks[i] + \")\"\n", + "names_s" ] }, { @@ -152,7 +231,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 368, "metadata": {}, "outputs": [], "source": [ @@ -162,11 +241,81 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 369, "metadata": {}, "outputs": [], "source": [ - "#your code" + "#your code\n", + "repos = requests.get(url).content\n", + "repo_soup = BeautifulSoup(repos, 'html5lib')" + ] + }, + { + "cell_type": "code", + "execution_count": 370, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "reposi = repo_soup.find_all('h1', {\"class\":\"h3 lh-condensed\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 371, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "repos = [i.text for i in reposi]" + ] + }, + { + "cell_type": "code", + "execution_count": 372, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['gto76/python-cheatsheet',\n", + " 'j3ssie/Osmedeus',\n", + " 'tangzixiang0304/Shielded_detector',\n", + " 'uber/ludwig',\n", + " 'xinshuoweng/AB3DMOT',\n", + " 'NVlabs/stylegan',\n", + " 'dagster-io/dagster',\n", + " 'tensorflow/models',\n", + " 'eragonruan/text-detection-ctpn',\n", + " 'sherlock-project/sherlock',\n", + " 'deepfakes/faceswap',\n", + " 'nbei/Deep-Flow-Guided-Video-Inpainting',\n", + " 'iovisor/bcc',\n", + " 'Roibal/Cryptocurrency-Trading-Bots-Python-Beginner-Advance',\n", + " 'NVIDIA/DeepLearningExamples',\n", + " 'BlackHC/tfpyth',\n", + " 'clovaai/deep-text-recognition-benchmark',\n", + " 'tkat0/PyTorch_BlazeFace',\n", + " 'OpenMined/PySyft',\n", + " 'CoreyMSchafer/code_snippets',\n", + " 'public-apis/public-apis',\n", + " 'd2l-ai/d2l-zh',\n", + " 'apache/airflow',\n", + " 'beecost/bee-university',\n", + " 'sundowndev/PhoneInfoga']" + ] + }, + "execution_count": 372, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "repos = [i.replace('\\n', '') for i in repos]\n", + "repos = [i.replace(' ', '') for i in repos]\n", + "repos" ] }, { @@ -178,7 +327,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 373, "metadata": {}, "outputs": [], "source": [ @@ -188,11 +337,87 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 374, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "#your code\n", + "Walt = requests.get(url).content\n", + "walt_soup = BeautifulSoup(Walt, 'html')" + ] + }, + { + "cell_type": "code", + "execution_count": 375, "metadata": {}, "outputs": [], "source": [ - "#your code" + "imgs = walt_soup.find_all('div', 
{\"class\":\"thumbinner\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 376, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['/wiki/File:Walt_Disney_envelope_ca._1921.jpg',\n", + " '/wiki/File:Walt_Disney_envelope_ca._1921.jpg',\n", + " '//upload.wikimedia.org/wikipedia/commons/4/4d/Newman_Laugh-O-Gram_%281921%29.webm',\n", + " '/wiki/File:Newman_Laugh-O-Gram_(1921).webm',\n", + " '/wiki/File:Trolley_Troubles_poster.jpg',\n", + " '/wiki/File:Trolley_Troubles_poster.jpg',\n", + " '/wiki/Trolley_Troubles',\n", + " '/wiki/File:Walt_Disney_and_his_cartoon_creation_%22Mickey_Mouse%22_-_National_Board_of_Review_Magazine.jpg',\n", + " '/wiki/File:Walt_Disney_and_his_cartoon_creation_%22Mickey_Mouse%22_-_National_Board_of_Review_Magazine.jpg',\n", + " '/wiki/Mickey_Mouse',\n", + " '/wiki/File:Steamboat-willie.jpg',\n", + " '/wiki/File:Steamboat-willie.jpg',\n", + " '/wiki/Mickey_Mouse',\n", + " '/wiki/Steamboat_Willie',\n", + " '/wiki/File:Walt_Disney_1935.jpg',\n", + " '/wiki/File:Walt_Disney_1935.jpg',\n", + " '/wiki/File:Walt_Disney_Snow_white_1937_trailer_screenshot_(13).jpg',\n", + " '/wiki/File:Walt_Disney_Snow_white_1937_trailer_screenshot_(13).jpg',\n", + " '/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)',\n", + " '/wiki/File:Disney_drawing_goofy.jpg',\n", + " '/wiki/File:Disney_drawing_goofy.jpg',\n", + " '/wiki/Goofy',\n", + " '/wiki/File:DisneySchiphol1951.jpg',\n", + " '/wiki/File:DisneySchiphol1951.jpg',\n", + " '/wiki/File:WaltDisneyplansDisneylandDec1954.jpg',\n", + " '/wiki/File:WaltDisneyplansDisneylandDec1954.jpg',\n", + " '/wiki/Disneyland',\n", + " '/wiki/Orange_County,_California',\n", + " '/wiki/File:Walt_disney_portrait_right.jpg',\n", + " '/wiki/File:Walt_disney_portrait_right.jpg',\n", + " '/wiki/File:Walt_Disney_Grave.JPG',\n", + " '/wiki/File:Walt_Disney_Grave.JPG',\n", + " '/wiki/File:Roy_O._Disney_with_Company_at_Press_Conference.jpg',\n", + " '/wiki/File:Roy_O._Disney_with_Company_at_Press_Conference.jpg',\n", + " '/wiki/Roy_O._Disney',\n", + " '/wiki/File:Disney_Display_Case.JPG',\n", + " '/wiki/File:Disney_Display_Case.JPG',\n", + " '/wiki/The_Walt_Disney_Family_Museum',\n", + " '/wiki/File:Disney1968.jpg',\n", + " '/wiki/File:Disney1968.jpg']" + ] + }, + "execution_count": 376, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "l = []\n", + "for i in range(len(imgs)):\n", + " for img in (imgs[i].find_all('a', href=True)):\n", + " l.append(img['href'])\n", + "l" ] }, { @@ -209,7 +434,20 @@ "outputs": [], "source": [ "# This is the url you will scrape in this exercise\n", - "url ='https://en.wikipedia.org/wiki/Python' " + "url ='https://en.wikipedia.org/wiki/Python' \n", + "html = requests.get(url).content" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#your code\n", + "soup = BeautifulSoup(html, 'lxml')\n", + "ul = soup.find_all('ul')\n", + "ul= ul[2:14]" ] }, { @@ -218,7 +456,8 @@ "metadata": {}, "outputs": [], "source": [ - "#your code" + "l = [img['href'] for i in range(len(ul)) for img in (ul[i].find_all('a', href=True)) ] \n", + "l" ] }, { @@ -235,7 +474,8 @@ "outputs": [], "source": [ "# This is the url you will scrape in this exercise\n", - "url = 'http://uscode.house.gov/download/download.shtml'" + "url = 'http://uscode.house.gov/download/download.shtml'\n", + "html = requests.get(url).content" ] }, { @@ -244,7 +484,10 @@ "metadata": {}, "outputs": [], "source": [ - "#your code" + "#your code\n", + "soup = 
BeautifulSoup(html, 'lxml')\n", + "#soup\n", + "titles = soup.find_all('div', {'class':'usctitlechanged'})" ] }, { @@ -261,7 +504,19 @@ "outputs": [], "source": [ "# This is the url you will scrape in this exercise\n", - "url = 'https://www.fbi.gov/wanted/topten'" + "url = 'https://www.fbi.gov/wanted/topten'\n", + "html = requests.get(url).content" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#your code \n", + "soup = BeautifulSoup(html, 'lxml')\n", + "titles = soup.find_all('h3', {'class':'title'})" ] }, { @@ -270,7 +525,8 @@ "metadata": {}, "outputs": [], "source": [ - "#your code " + "buscados = [nombre.text.replace(\"\\n\", \"\") for nombre in titles]\n", + "buscados" ] }, { @@ -287,7 +543,60 @@ "outputs": [], "source": [ "# This is the url you will scrape in this exercise\n", - "url = 'https://www.emsc-csem.org/Earthquake/'" + "url = 'https://www.emsc-csem.org/Earthquake/'\n", + "html = requests.get(url).content" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#your code\n", + "soup = BeautifulSoup(html, 'lxml')\n", + "\n", + "datetime = soup.find_all('td', {'class':'tabev6'})\n", + "lyl = soup.find_all('td', {'class':'tabev1'})\n", + "region = soup.find_all('td', {'class':'tb_region'})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "limpdate =[str(fecha.text.replace('\\xa0', ' ')) for fecha in datetime]\n", + "datefull = []\n", + "for fecha in limpdate:\n", + " fecha = fecha.replace('earthquake', '')\n", + " fecha =fecha.replace('ago', '')\n", + " datefull.append(fecha)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lat =[]\n", + "lon = []\n", + "for latlon in range(len(lyl)):\n", + " if latlon %2 == 0:\n", + " lat.append(lyl[latlon].text.replace('\\xa0', ''))\n", + " else:\n", + " lon.append(lyl[latlon].text.replace('\\xa0', ''))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lugar =[fecha.text for fecha in region]" ] }, { @@ -296,7 +605,8 @@ "metadata": {}, "outputs": [], "source": [ - "#your code" + "df = pd.DataFrame({'Date and time':datefull, 'Latitud':lat, 'Longitud':lon, 'Region':lugar})\n", + "df.head(20)" ] }, { @@ -308,7 +618,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 291, "metadata": {}, "outputs": [], "source": [ @@ -318,11 +628,125 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 304, + "metadata": {}, + "outputs": [], + "source": [ + "#your code\n", + "hackatons_soup = requests.get(url).content\n", + "hackatons_soup = BeautifulSoup(hackatons_soup, 'html5lib')" + ] + }, + { + "cell_type": "code", + "execution_count": 342, "metadata": {}, "outputs": [], "source": [ - "#your code" + "hackis = hackatons_soup.find_all('div', {'class':'card-body'})" + ] + }, + { + "cell_type": "code", + "execution_count": 343, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Code Geist Hackathon by SefrWahed\\u20037/29/2019\\n\\n\\u2003Cairo, Egypt',\n", + " 'The Code Factor\\u20035/21/2019\\n\\n\\u2003Milano, Italy',\n", + " 'TECHFEST MUNICH\\u20039/6/2019\\n\\n\\u2003Munich, Germany',\n", + " 'Galileo App Competition\\u20031/31/2019\\n\\n\\u2003Prague, Czech Republic']" + ] + }, + "execution_count": 343, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "hacks = [hackis[i].text for i in range(len(hackis))]\n", + "hacks = [i.split(\"\\u2003\") for i in hacks]" + ] + }, + { + "cell_type": "code", + "execution_count": 348, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012
0Code Geist Hackathon by SefrWahed7/29/2019Cairo, Egypt
1The Code Factor5/21/2019Milano, Italy
2TECHFEST MUNICH9/6/2019Munich, Germany
3Galileo App Competition1/31/2019Prague, Czech Republic
\n", + "
" + ], + "text/plain": [ + " 0 1 2\n", + "0 Code Geist Hackathon by SefrWahed 7/29/2019 Cairo, Egypt\n", + "1 The Code Factor 5/21/2019 Milano, Italy\n", + "2 TECHFEST MUNICH 9/6/2019 Munich, Germany\n", + "3 Galileo App Competition 1/31/2019 Prague, Czech Republic" + ] + }, + "execution_count": 348, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hacks = pd.DataFrame(hacks)\n", + "hacks[1] = hacks[1].apply(lambda x:x.replace('\\n', ''))\n", + "hacks" ] }, { @@ -353,11 +777,39 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 300, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Check the tweets of an account:shiroiusagi4486\n" + ] + }, + { + "data": { + "text/plain": [ + "'76'" + ] + }, + "execution_count": 300, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "#your code" + "#your code\n", + "usr_input = input('Check the tweets of an account:')\n", + "try:\n", + " twitter_follows = requests.get('https://twitter.com/'+usr_input).content\n", + "except:\n", + " print('The account does not exist')\n", + "twitter_follows = BeautifulSoup(twitter_follows, 'lxml')\n", + "twitter_follows = twitter_follows.find_all('span', {'class': 'ProfileNav-value'})\n", + "twitter_follows[0]['data-count']\n", + " \n", + " " ] }, { @@ -388,11 +840,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 298, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Check the followers of an account:uaquiro\n" + ] + }, + { + "data": { + "text/plain": [ + "'302'" + ] + }, + "execution_count": 298, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "#your code" + "#your code\n", + "usr_input = input('Check the followers of an account:')\n", + "try:\n", + " twitter_follows = requests.get('https://twitter.com/'+usr_input).content\n", + "except:\n", + " print('The account does not exist')\n", + "twitter_follows = BeautifulSoup(twitter_follows, 'lxml')\n", + "twitter_follows = twitter_follows.find_all('span', {'class': 'ProfileNav-value'})\n", + "twitter_follows[2]['data-count']\n", + " \n", + " \n", + " \n", + " " ] }, { @@ -404,7 +886,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 207, "metadata": {}, "outputs": [], "source": [ @@ -414,11 +896,243 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 208, + "metadata": {}, + "outputs": [], + "source": [ + "#your code\n", + "wikissup = requests.get(url).content\n", + "wikissup = BeautifulSoup(wikissup, 'lxml')" + ] + }, + { + "cell_type": "code", + "execution_count": 211, + "metadata": {}, + "outputs": [], + "source": [ + "langs = wikissup.find_all('div', {'class':'central-featured'})" + ] + }, + { + "cell_type": "code", + "execution_count": 268, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['English',\n", + " '日本語',\n", + " 'Español',\n", + " 'Deutsch',\n", + " 'Русский',\n", + " 'Français',\n", + " 'Italiano',\n", + " '中文',\n", + " 'Português',\n", + " 'Polski']" + ] + }, + "execution_count": 268, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "langu = []\n", + "for i in range(len(langs)):\n", + " langu.append(langs[i].find_all('strong'))\n", + "langu = [i for i in langu[0]]\n", + "langu = [i.text for i in langu]" + ] + }, + { + "cell_type": "code", + "execution_count": 245, + "metadata": {}, + "outputs": [ + { 
+ "data": { + "text/plain": [ + "[['English5\\xa0892\\xa0000',\n", + " ' articles日本語1\\xa0159\\xa0000',\n", + " ' 記事Español1\\xa0532\\xa0000',\n", + " ' artículosDeutsch2\\xa0323\\xa0000',\n", + " ' ArtikelРусский1\\xa0556\\xa0000',\n", + " ' статейFrançais2\\xa0123\\xa0000',\n", + " ' articlesItaliano1\\xa0541\\xa0000',\n", + " ' voci中文1\\xa0065\\xa0000',\n", + " ' 條目Português1\\xa0010\\xa0000',\n", + " ' artigosPolski1\\xa0346\\xa0000',\n", + " ' haseł']]" + ] + }, + "execution_count": 245, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numbers = [] \n", + "for i in range(len(langs)):\n", + " numbers.append(langs[i].find_all('small'))\n", + "numbers = [i.text for i in langs]\n", + "numbers = [i.replace('\\n', '') for i in numbers]\n", + "numbers = [i.split('+') for i in numbers]\n", + "numbers" + ] + }, + { + "cell_type": "code", + "execution_count": 257, "metadata": {}, "outputs": [], "source": [ - "#your code" + "numbers = [re.findall('\\d', i) for i in liss]" + ] + }, + { + "cell_type": "code", + "execution_count": 258, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['5892000',\n", + " '1159000',\n", + " '1532000',\n", + " '2323000',\n", + " '1556000',\n", + " '2123000',\n", + " '1541000',\n", + " '1065000',\n", + " '1010000',\n", + " '1346000']" + ] + }, + "execution_count": 258, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numberss = [\"\".join(numbers[i]) for i in range(len(numbers))]\n", + "numbers = numberss[:-1]\n", + "numbers" + ] + }, + { + "cell_type": "code", + "execution_count": 269, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LanguageArticles
0English5892000
1日本語1159000
2Español1532000
3Deutsch2323000
4Русский1556000
5Français2123000
6Italiano1541000
7中文1065000
8Português1010000
9Polski1346000
\n", + "
" + ], + "text/plain": [ + " Language Articles\n", + "0 English 5892000\n", + "1 日本語 1159000\n", + "2 Español 1532000\n", + "3 Deutsch 2323000\n", + "4 Русский 1556000\n", + "5 Français 2123000\n", + "6 Italiano 1541000\n", + "7 中文 1065000\n", + "8 Português 1010000\n", + "9 Polski 1346000" + ] + }, + "execution_count": 269, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "d = {'Language':langu, 'Articles':numbers}\n", + "Wiki_df = pd.DataFrame(d)\n", + "Wiki_df" ] }, { @@ -430,7 +1144,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 191, "metadata": {}, "outputs": [], "source": [ @@ -440,11 +1154,166 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 192, + "metadata": {}, + "outputs": [], + "source": [ + "#your code \n", + "datasets_soup = requests.get(url).content\n", + "datasets_soup = BeautifulSoup(datasets_soup, 'lxml')" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Business and economy', 'Crime and justice', 'Defence', 'Education', 'Environment', 'Government', 'Government spending', 'Health', 'Mapping', 'Society', 'Towns and cities', 'Transport']\n" + ] + } + ], + "source": [ + "names = datasets_soup.find_all('h2')\n", + "names_s1 = [i.text for i in names]\n", + "\n", + "print(names_s1)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 204, "metadata": {}, "outputs": [], "source": [ - "#your code " + "descp = datasets_soup.find_all('p')\n", + "descp = descp[5:-2]\n", + "descp = [i.text for i in descp]" + ] + }, + { + "cell_type": "code", + "execution_count": 206, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DatasetDescription
0Business and economySmall businesses, industry, imports, exports a...
1Crime and justiceCourts, police, prison, offenders, borders and...
2DefenceArmed forces, health and safety, search and re...
3EducationStudents, training, qualifications and the Nat...
4EnvironmentWeather, flooding, rivers, air quality, geolog...
5GovernmentStaff numbers and pay, local councillors and d...
6Government spendingIncludes all payments by government department...
7HealthIncludes smoking, drugs, alcohol, medicine per...
8MappingAddresses, boundaries, land ownership, aerial ...
9SocietyEmployment, benefits, household finances, pove...
10Towns and citiesIncludes housing, urban planning, leisure, was...
11TransportAirports, roads, freight, electric vehicles, p...
\n", + "
" + ], + "text/plain": [ + " Dataset Description\n", + "0 Business and economy Small businesses, industry, imports, exports a...\n", + "1 Crime and justice Courts, police, prison, offenders, borders and...\n", + "2 Defence Armed forces, health and safety, search and re...\n", + "3 Education Students, training, qualifications and the Nat...\n", + "4 Environment Weather, flooding, rivers, air quality, geolog...\n", + "5 Government Staff numbers and pay, local councillors and d...\n", + "6 Government spending Includes all payments by government department...\n", + "7 Health Includes smoking, drugs, alcohol, medicine per...\n", + "8 Mapping Addresses, boundaries, land ownership, aerial ...\n", + "9 Society Employment, benefits, household finances, pove...\n", + "10 Towns and cities Includes housing, urban planning, leisure, was...\n", + "11 Transport Airports, roads, freight, electric vehicles, p..." + ] + }, + "execution_count": 206, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "d = {'Dataset':names_s1, 'Description': descp}\n", + "DataSets = pd.DataFrame(d)\n", + "DataSets" ] }, { @@ -456,7 +1325,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 349, "metadata": {}, "outputs": [], "source": [ @@ -466,11 +1335,283 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 350, + "metadata": {}, + "outputs": [], + "source": [ + "#your code\n", + "table_lang = requests.get(url).content\n", + "table_soup = BeautifulSoup(table_lang, 'html')" + ] + }, + { + "cell_type": "code", + "execution_count": 351, + "metadata": {}, + "outputs": [], + "source": [ + "tables = table_soup.find_all('tr')" + ] + }, + { + "cell_type": "code", + "execution_count": 352, + "metadata": {}, + "outputs": [], + "source": [ + "tabless = []\n", + "for i in range(len(tables)):\n", + " tabless.append(tables[i])" + ] + }, + { + "cell_type": "code", + "execution_count": 353, "metadata": {}, "outputs": [], "source": [ - "#your code" + "tal = [tabless[i].text.split(\"\\n\\n\") for i in range(len(tabless))]" + ] + }, + { + "cell_type": "code", + "execution_count": 354, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012345678
0RankRankLanguagePrimary CountryTotalCountries[a]Speakers(millions)% of the World population\\n(March 2019)[7]\\nMacrolanguageLanguage familyBranch
11Chinese (macrolanguage)China391,31117.026Sino-TibetanSinitic
21MandarinChina1391811.922ChineseSino-TibetanSinitic
322SpanishSpain314605.974Indo-EuropeanRomance
433EnglishUnited Kingdom1373794.922Indo-EuropeanGermanic
544HindiIndia43414.429Indo-EuropeanIndo-Aryan
65Arabic (macrolanguage)Saudi Arabia593194.143AfroasiaticSemitic
756BengaliBangladesh42282.961Indo-EuropeanIndo-Aryan
867PortuguesePortugal152212.870Indo-EuropeanRomance
978RussianRussian Federation191542.000Indo-EuropeanBalto-Slavic
1089JapaneseJapan21281.662JaponicJapanese
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 \\\n", + "0 Rank Rank Language Primary Country \n", + "1 — 1 Chinese (macrolanguage) China \n", + "2 1 — Mandarin China \n", + "3 2 2 Spanish Spain \n", + "4 3 3 English United Kingdom \n", + "5 4 4 Hindi India \n", + "6 — 5 Arabic (macrolanguage) Saudi Arabia \n", + "7 5 6 Bengali Bangladesh \n", + "8 6 7 Portuguese Portugal \n", + "9 7 8 Russian Russian Federation \n", + "10 8 9 Japanese Japan \n", + "\n", + " 4 5 \\\n", + "0 TotalCountries[a] Speakers(millions) \n", + "1 39 1,311 \n", + "2 13 918 \n", + "3 31 460 \n", + "4 137 379 \n", + "5 4 341 \n", + "6 59 319 \n", + "7 4 228 \n", + "8 15 221 \n", + "9 19 154 \n", + "10 2 128 \n", + "\n", + " 6 7 \\\n", + "0 % of the World population\\n(March 2019)[7] \\nMacrolanguage \n", + "1 17.026 \n", + "2 11.922 Chinese \n", + "3 5.974 \n", + "4 4.922 \n", + "5 4.429 \n", + "6 4.143 \n", + "7 2.961 \n", + "8 2.870 \n", + "9 2.000 \n", + "10 1.662 \n", + "\n", + " 8 \n", + "0 Language familyBranch \n", + "1 Sino-TibetanSinitic \n", + "2 Sino-TibetanSinitic \n", + "3 Indo-EuropeanRomance \n", + "4 Indo-EuropeanGermanic \n", + "5 Indo-EuropeanIndo-Aryan \n", + "6 AfroasiaticSemitic \n", + "7 Indo-EuropeanIndo-Aryan \n", + "8 Indo-EuropeanRomance \n", + "9 Indo-EuropeanBalto-Slavic \n", + "10 JaponicJapanese " + ] + }, + "execution_count": 354, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "talcual = pd.DataFrame(tal)\n", + "talcual = talcual[:11]\n", + "talcual[0] = talcual[0].apply(lambda x: x.replace('\\n', ''))\n", + "talcual[8] = talcual[8].apply(lambda x: x.replace('\\n', ''))\n", + "talcual" ] }, { @@ -516,7 +1657,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 155, "metadata": {}, "outputs": [], "source": [ @@ -530,7 +1671,10 @@ "metadata": {}, "outputs": [], "source": [ - "#your code" + "#your code\n", + "fims_get = requests.get(url).content\n", + "films_soup = BeautifulSoup(films_get, 'lxml')\n", + "films_names = films_soup.find_all(<" ] }, { @@ -569,7 +1713,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 355, "metadata": {}, "outputs": [], "source": [ @@ -580,11 +1724,135 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 356, "metadata": {}, "outputs": [], "source": [ - "#your code" + "#your code\n", + "books = requests.get(url).content\n", + "books_soup = BeautifulSoup(books, 'html')" + ] + }, + { + "cell_type": "code", + "execution_count": 357, + "metadata": {}, + "outputs": [], + "source": [ + "names = books_soup.find_all('h3')\n", + "names_list = []\n", + "for i in range(len(names)):\n", + " for name in (names[i].find_all('a', title=True)):\n", + " names_list.append(name['title'])" + ] + }, + { + "cell_type": "code", + "execution_count": 358, + "metadata": {}, + "outputs": [], + "source": [ + "prices = books_soup.find_all('p', {'class':'price_color'})\n", + "\n", + "prices_list = [prices[i].text for i in range(len(prices))]" + ] + }, + { + "cell_type": "code", + "execution_count": 359, + "metadata": {}, + "outputs": [], + "source": [ + "stock = books_soup.find_all('p', {'class':'instock availability'})\n", + "stock = [stock[i].text for i in range(len(stock))]\n", + "stock = [i.replace('\\n', \"\") for i in stock]\n", + "stock = [re.sub('\\s\\s+', '', stock[i]) for i in range(len(stock))] " + ] + }, + { + "cell_type": "code", + "execution_count": 360, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Book NamePriceStock Availability
0A Light in the Attic£51.77In stock
1Tipping the Velvet£53.74In stock
2Soumission£50.10In stock
3Sharp Objects£47.82In stock
4Sapiens: A Brief History of Humankind£54.23In stock
\n", + "
" + ], + "text/plain": [ + " Book Name Price Stock Availability\n", + "0 A Light in the Attic £51.77 In stock\n", + "1 Tipping the Velvet £53.74 In stock\n", + "2 Soumission £50.10 In stock\n", + "3 Sharp Objects £47.82 In stock\n", + "4 Sapiens: A Brief History of Humankind £54.23 In stock" + ] + }, + "execution_count": 360, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "d = {\"Book Name\":names_list, \"Price\":prices_list, \"Stock Availability\": stock}\n", + "books = pd.DataFrame(d)\n", + "books.head()" ] } ], diff --git a/your-code/.ipynb_checkpoints/main_fer-checkpoint.ipynb b/your-code/.ipynb_checkpoints/main_fer-checkpoint.ipynb new file mode 100644 index 0000000..0e85fa9 --- /dev/null +++ b/your-code/.ipynb_checkpoints/main_fer-checkpoint.ipynb @@ -0,0 +1,1224 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Web Scraping Lab\n", + "\n", + "You will find in this notebook some scrapy exercises to practise your scraping skills.\n", + "\n", + "**Tips:**\n", + "\n", + "- Check the response status code for each request to ensure you have obtained the intended contennt.\n", + "- Print the response text in each request to understand the kind of info you are getting and its format.\n", + "- Check for patterns in the response text to extract the data/info requested in each question.\n", + "- Visit each url and take a look at its source through Chrome DevTools. You'll need to identify the html tags, special class names etc. used for the html content you are expected to extract." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- [Requests library](http://docs.python-requests.org/en/master/#the-user-guide) documentation \n", + "- [Beautiful Soup Doc](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)\n", + "- [Urllib](https://docs.python.org/3/library/urllib.html#module-urllib)\n", + "- [re lib](https://docs.python.org/3/library/re.html)\n", + "- [lxml lib](https://lxml.de/)\n", + "- [Scrapy](https://scrapy.org/)\n", + "- [List of HTTP status codes](https://en.wikipedia.org/wiki/List_of_HTTP_status_codes)\n", + "- [HTML basics](http://www.simplehtmlguide.com/cheatsheet.php)\n", + "- [CSS basics](https://www.cssbasics.com/#page_start)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Below are the libraries and modules you may need. `requests`, `BeautifulSoup` and `pandas` are imported for you. If you prefer to use additional libraries feel free to uncomment them." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from bs4 import BeautifulSoup\n", + "import pandas as pd\n", + "\n", + "# from pprint import pprint\n", + "from lxml import html\n", + "# from lxml.html import fromstring\n", + "# import urllib.request\n", + "# from urllib.request import urlopen\n", + "# import random\n", + "import re\n", + "# import scrapy" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Download, parse (using BeautifulSoup), and print the content from the Trending Developers page from GitHub:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# This is the url you will scrape in this exercise\n", + "url = 'https://github.com/trending/developers'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "#your code\n", + "html = requests.get(url).content" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Display the names of the trending developers retrieved in the previous step.\n", + "\n", + "Your output should be a Python list of developer names. Each name should not contain any html tag.\n", + "\n", + "**Instructions:**\n", + "\n", + "1. Find out the html tag and class names used for the developer names. You can achieve this using Chrome DevTools.\n", + "\n", + "1. Use BeautifulSoup to extract all the html elements that contain the developer names.\n", + "\n", + "1. Use string manipulation techniques to replace whitespaces and linebreaks (i.e. `\\n`) in the *text* of each html element. Use a list to store the clean names.\n", + "\n", + "1. Print the list of names.\n", + "\n", + "Your output should look like below:\n", + "\n", + "```\n", + "['trimstray (@trimstray)',\n", + " 'joewalnes (JoeWalnes)',\n", + " 'charlax (Charles-AxelDein)',\n", + " 'ForrestKnight (ForrestKnight)',\n", + " 'revery-ui (revery-ui)',\n", + " 'alibaba (Alibaba)',\n", + " 'Microsoft (Microsoft)',\n", + " 'github (GitHub)',\n", + " 'facebook (Facebook)',\n", + " 'boazsegev (Bo)',\n", + " 'google (Google)',\n", + " 'cloudfetch',\n", + " 'sindresorhus (SindreSorhus)',\n", + " 'tensorflow',\n", + " 'apache (TheApacheSoftwareFoundation)',\n", + " 'DevonCrawford (DevonCrawford)',\n", + " 'ARMmbed (ArmMbed)',\n", + " 'vuejs (vuejs)',\n", + " 'fastai (fast.ai)',\n", + " 'QiShaoXuan (Qi)',\n", + " 'joelparkerhenderson (JoelParkerHenderson)',\n", + " 'torvalds (LinusTorvalds)',\n", + " 'CyC2018',\n", + " 'komeiji-satori (神楽坂覚々)',\n", + " 'script-8']\n", + " ```" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "#your code\n", + "soup = BeautifulSoup(html, 'lxml')\n", + "h1 = soup.find_all('h1', {'class':'h3 lh-condensed'})\n", + "a = soup.find_all('a', {'class':'link-gray'})" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "nombres = [link.string for link in h1]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "a = a[19:44]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "nicks = [elemento.string for elemento in a]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "todo = [nombres[i] + \" \"+ '('+ nicks[i]+')' for i in range(len(nombres))]" 
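+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The same pairing, sketched with `zip` so it cannot raise an `IndexError` if `nombres` and `nicks` ever differ in length:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# zip stops at the shorter list, so a mismatched slice fails soft instead of crashing\n",
+ "todo = [f'{nombre} ({nick})' for nombre, nick in zip(nombres, nicks)]"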
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Eric Ma (ericmjl)',\n", + " 'Federico Brigante (bfred-it)',\n", + " 'Kyle Roach (iRoachie)',\n", + " 'Olle Jonsson (olleolleolle)',\n", + " 'Nikita Sobolev (sobolevn)',\n", + " 'Frank S. Thomas (fthomas)',\n", + " 'syuilo (syuilo)',\n", + " 'Ives van Hoorne (CompuIves)',\n", + " 'Paulus Schoutsen (balloob)',\n", + " 'Sarah Drasner (sdras)',\n", + " 'Stefanos Kornilios Mitsis Poiitidis (skmp)',\n", + " 'Jan Hovancik (hovancik)',\n", + " 'Andreas Mueller (amueller)',\n", + " 'Guillaume Gomez (GuillaumeGomez)',\n", + " 'Matt Holt (mholt)',\n", + " 'Clifford Wolf (cliffordwolf)',\n", + " 'Franck Nijhof (frenck)',\n", + " 'Joe Block (unixorn)',\n", + " 'Andrei Neagoie (aneagoie)',\n", + " 'Jack Lloyd (randombit)',\n", + " 'Guillermo Rauch (rauchg)',\n", + " 'Tim Griesser (tgriesser)',\n", + " 'Jameson Nash (vtjnash)',\n", + " 'Anderson Banihirwe (andersy005)',\n", + " 'Danny Ryan (djrtwo)']" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "todo" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Display the trending Python repositories in GitHub\n", + "\n", + "The steps to solve this problem is similar to the previous one except that you need to find out the repository names instead of developer names." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# This is the url you will scrape in this exercise\n", + "url = 'https://github.com/trending/python?since=daily'" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "#your code\n", + "html = requests.get(url).content" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "soup = BeautifulSoup(html, 'lxml')\n", + "h1 = soup.find_all('h1', {'class':'h3 lh-condensed'})" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "proyectos = [algo.text.replace(\"\\n\", \"\") for algo in h1]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['gto76 / python-cheatsheet ',\n", + " 'j3ssie / Osmedeus ',\n", + " 'tangzixiang0304 / Shielded_detector ',\n", + " 'uber / ludwig ',\n", + " 'xinshuoweng / AB3DMOT ',\n", + " 'NVlabs / stylegan ',\n", + " 'dagster-io / dagster ',\n", + " 'tensorflow / models ',\n", + " 'eragonruan / text-detection-ctpn ',\n", + " 'sherlock-project / sherlock ',\n", + " 'deepfakes / faceswap ',\n", + " 'nbei / Deep-Flow-Guided-Video-Inpainting ',\n", + " 'iovisor / bcc ',\n", + " 'Roibal / Cryptocurrency-Trading-Bots-Python-Beginner-Advance ',\n", + " 'NVIDIA / DeepLearningExamples ',\n", + " 'BlackHC / tfpyth ',\n", + " 'clovaai / deep-text-recognition-benchmark ',\n", + " 'tkat0 / PyTorch_BlazeFace ',\n", + " 'OpenMined / PySyft ',\n", + " 'CoreyMSchafer / code_snippets ',\n", + " 'public-apis / public-apis ',\n", + " 'd2l-ai / d2l-zh ',\n", + " 'apache / airflow ',\n", + " 'beecost / bee-university ',\n", + " 'sundowndev / PhoneInfoga ']" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "proyectos" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Display all the image links from Walt Disney wikipedia 
page" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# This is the url you will scrape in this exercise\n", + "url = 'https://en.wikipedia.org/wiki/Walt_Disney'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#your code" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Retrieve an arbitary Wikipedia page of \"Python\" and create a list of links on that page" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# This is the url you will scrape in this exercise\n", + "url ='https://en.wikipedia.org/wiki/Python'\n", + "html = requests.get(url).content" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#your code\n", + "soup = BeautifulSoup(html, 'lxml')\n", + "ul = soup.find_all('ul')\n", + "ul= ul[2:14]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['/wiki/Pythonidae',\n", + " '/wiki/Python_(genus)',\n", + " '/wiki/Python_(genus)',\n", + " '/wiki/Python_(mythology)',\n", + " '/wiki/Python_of_Aenus',\n", + " '/wiki/Python_(painter)',\n", + " '/wiki/Python_of_Byzantium',\n", + " '/wiki/Python_of_Catana',\n", + " '/wiki/Python_(film)',\n", + " '/wiki/Pythons_2',\n", + " '/wiki/Monty_Python',\n", + " '/wiki/Python_(Monty)_Pictures',\n", + " '/wiki/Pythons_2',\n", + " '/wiki/Python_(Monty)_Pictures',\n", + " '/wiki/Python_(programming_language)',\n", + " '/wiki/CPython',\n", + " '/wiki/CMU_Common_Lisp',\n", + " '/wiki/PERQ#PERQ_3',\n", + " '/wiki/CPython',\n", + " '/wiki/Python_(Busch_Gardens_Tampa_Bay)',\n", + " '/wiki/Python_(Coney_Island,_Cincinnati,_Ohio)',\n", + " '/wiki/Python_(Efteling)',\n", + " '/wiki/Python_(automobile_maker)',\n", + " '/wiki/Python_(Ford_prototype)',\n", + " '/wiki/Colt_Python',\n", + " '/wiki/Python_(missile)',\n", + " '/wiki/Python_(nuclear_primary)',\n", + " '/wiki/Python_Anghelo']" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "l = [img['href'] for i in range(len(ul)) for img in (ul[i].find_all('a', href=True)) ] \n", + "l" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Number of Titles that have changed in the United States Code since its last release point " + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# This is the url you will scrape in this exercise\n", + "url = 'http://uscode.house.gov/download/download.shtml'\n", + "html = requests.get(url).content" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "#your code\n", + "soup = BeautifulSoup(html, 'lxml')\n", + "#soup\n", + "titles = soup.find_all('div', {'class':'usctitlechanged'})" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "15" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(titles)" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "#### A Python list with the top ten FBI's Most Wanted names " + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# This is the url you will scrape in this exercise\n", + "url = 'https://www.fbi.gov/wanted/topten'\n", + "html = requests.get(url).content" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "#your code \n", + "soup = BeautifulSoup(html, 'lxml')\n", + "titles = soup.find_all('h3', {'class':'title'})" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ALEJANDRO ROSALES CASTILLO',\n", + " 'YASER ABDEL SAID',\n", + " 'JASON DEREK BROWN',\n", + " 'RAFAEL CARO-QUINTERO',\n", + " 'ALEXIS FLORES',\n", + " 'EUGENE PALMER',\n", + " 'SANTIAGO VILLALBA MEDEROS',\n", + " 'ROBERT WILLIAM FISHER',\n", + " 'BHADRESHKUMAR CHETANBHAI PATEL',\n", + " 'ARNOLDO JIMENEZ']" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "buscados = [nombre.text.replace(\"\\n\", \"\") for nombre in titles]\n", + "buscados" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 20 latest earthquakes info (date, time, latitude, longitude and region name) by the EMSC as a pandas dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "# This is the url you will scrape in this exercise\n", + "url = 'https://www.emsc-csem.org/Earthquake/'\n", + "html = requests.get(url).content" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "#your code\n", + "soup = BeautifulSoup(html, 'lxml')\n", + "\n", + "datetime = soup.find_all('td', {'class':'tabev6'})\n", + "lyl = soup.find_all('td', {'class':'tabev1'})\n", + "region = soup.find_all('td', {'class':'tb_region'})" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "limpdate =[str(fecha.text.replace('\\xa0', ' ')) for fecha in datetime]\n", + "datefull = []\n", + "for fecha in limpdate:\n", + " fecha = fecha.replace('earthquake', '')\n", + " fecha =fecha.replace('ago', '')\n", + " datefull.append(fecha)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "lat =[]\n", + "lon = []\n", + "for latlon in range(len(lyl)):\n", + " if latlon %2 == 0:\n", + " lat.append(lyl[latlon].text.replace('\\xa0', ''))\n", + " else:\n", + " lon.append(lyl[latlon].text.replace('\\xa0', ''))" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "lugar =[fecha.text for fecha in region]" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame({'Date and time':datefull, 'Latitud':lat, 'Longitud':lon, 'Region':lugar})" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Date and timeLatitudLongitudRegion
02019-07-16 21:06:21.515min35.88117.69CENTRAL CALIFORNIA
12019-07-16 21:03:36.718min37.81121.76SAN FRANCISCO BAY AREA, CALIF.
22019-07-16 20:52:18.629min36.07117.84CENTRAL CALIFORNIA
32019-07-16 20:49:09.132min36.07117.65CENTRAL CALIFORNIA
42019-07-16 20:41:11.040min16.85100.25OFFSHORE GUERRERO, MEXICO
52019-07-16 20:33:52.948min40.0919.91ALBANIA
62019-07-16 20:31:33.350min23.4566.86JUJUY, ARGENTINA
72019-07-16 20:29:07.752min35.86117.69CENTRAL CALIFORNIA
82019-07-16 20:23:34.758min36.07117.84CENTRAL CALIFORNIA
92019-07-16 20:19:00.11hr 02min33.1012.42MADEIRA ISLANDS, PORTUGAL REGION
102019-07-16 20:17:51.61hr 04min35.55117.43SOUTHERN CALIFORNIA
112019-07-16 20:15:59.01hr 05min35.68117.52SOUTHERN CALIFORNIA
122019-07-16 20:11:01.51hr 10min37.82121.77SAN FRANCISCO BAY AREA, CALIF.
132019-07-16 19:51:06.51hr 30min6.26148.65NEW BRITAIN REGION, P.N.G.
142019-07-16 19:42:25.91hr 39min35.61117.47SOUTHERN CALIFORNIA
152019-07-16 19:35:57.01hr 46min35.62117.45SOUTHERN CALIFORNIA
162019-07-16 19:23:50.11hr 58min36.19117.89CENTRAL CALIFORNIA
172019-07-16 19:20:21.42hr 01min38.3916.94SOUTHERN ITALY
182019-07-16 19:16:53.82hr 05min38.4516.91SOUTHERN ITALY
192019-07-16 19:16:15.92hr 05min61.27152.44SOUTHERN ALASKA
\n", + "
" + ], + "text/plain": [ + " Date and time Latitud Longitud \\\n", + "0 2019-07-16 21:06:21.515min 35.88 117.69 \n", + "1 2019-07-16 21:03:36.718min 37.81 121.76 \n", + "2 2019-07-16 20:52:18.629min 36.07 117.84 \n", + "3 2019-07-16 20:49:09.132min 36.07 117.65 \n", + "4 2019-07-16 20:41:11.040min 16.85 100.25 \n", + "5 2019-07-16 20:33:52.948min 40.09 19.91 \n", + "6 2019-07-16 20:31:33.350min 23.45 66.86 \n", + "7 2019-07-16 20:29:07.752min 35.86 117.69 \n", + "8 2019-07-16 20:23:34.758min 36.07 117.84 \n", + "9 2019-07-16 20:19:00.11hr 02min 33.10 12.42 \n", + "10 2019-07-16 20:17:51.61hr 04min 35.55 117.43 \n", + "11 2019-07-16 20:15:59.01hr 05min 35.68 117.52 \n", + "12 2019-07-16 20:11:01.51hr 10min 37.82 121.77 \n", + "13 2019-07-16 19:51:06.51hr 30min 6.26 148.65 \n", + "14 2019-07-16 19:42:25.91hr 39min 35.61 117.47 \n", + "15 2019-07-16 19:35:57.01hr 46min 35.62 117.45 \n", + "16 2019-07-16 19:23:50.11hr 58min 36.19 117.89 \n", + "17 2019-07-16 19:20:21.42hr 01min 38.39 16.94 \n", + "18 2019-07-16 19:16:53.82hr 05min 38.45 16.91 \n", + "19 2019-07-16 19:16:15.92hr 05min 61.27 152.44 \n", + "\n", + " Region \n", + "0  CENTRAL CALIFORNIA \n", + "1  SAN FRANCISCO BAY AREA, CALIF. \n", + "2  CENTRAL CALIFORNIA \n", + "3  CENTRAL CALIFORNIA \n", + "4  OFFSHORE GUERRERO, MEXICO \n", + "5  ALBANIA \n", + "6  JUJUY, ARGENTINA \n", + "7  CENTRAL CALIFORNIA \n", + "8  CENTRAL CALIFORNIA \n", + "9  MADEIRA ISLANDS, PORTUGAL REGION \n", + "10  SOUTHERN CALIFORNIA \n", + "11  SOUTHERN CALIFORNIA \n", + "12  SAN FRANCISCO BAY AREA, CALIF. \n", + "13  NEW BRITAIN REGION, P.N.G. \n", + "14  SOUTHERN CALIFORNIA \n", + "15  SOUTHERN CALIFORNIA \n", + "16  CENTRAL CALIFORNIA \n", + "17  SOUTHERN ITALY \n", + "18  SOUTHERN ITALY \n", + "19  SOUTHERN ALASKA " + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(20)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Display the date, and title of upcoming hackathon events as a Pandas dataframe table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This is the url you will scrape in this exercise\n", + "url ='https://hackevents.co/hackathons'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#your code" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Count number of tweets by a given Twitter account." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You will need to include a ***try/except block*** for account names not found. \n", + "
***Hint:*** the program should count the number of tweets for any provided account" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This is the url you will scrape in this exercise \n", + "# You will need to add the account credentials to this url\n", + "url = 'https://twitter.com/'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#your code" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Number of followers of a given twitter account" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You will need to include a ***try/except block*** in case account/s name not found. \n", + "
***Hint:*** the program should count the followers for any provided account" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This is the url you will scrape in this exercise \n", + "# You will need to add the account credentials to this url\n", + "url = 'https://twitter.com/'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#your code" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### List all language names and number of related articles in the order they appear in wikipedia.org" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This is the url you will scrape in this exercise\n", + "url = 'https://www.wikipedia.org/'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#your code" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### A list with the different kind of datasets available in data.gov.uk " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This is the url you will scrape in this exercise\n", + "url = 'https://data.gov.uk/'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#your code " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Top 10 languages by number of native speakers stored in a Pandas Dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This is the url you will scrape in this exercise\n", + "url = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#your code" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### BONUS QUESTIONS" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Scrape a certain number of tweets of a given Twitter account." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This is the url you will scrape in this exercise \n", + "# You will need to add the account credentials to this url\n", + "url = 'https://twitter.com/'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# your code" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### IMDB's Top 250 data (movie name, Initial release, director name and stars) as a pandas dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This is the url you will scrape in this exercise \n", + "url = 'https://www.imdb.com/chart/top'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#your code" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Find the live weather report (temperature, wind speed, description and weather) of a given city." 
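+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A minimal sketch for reading the answer out of the response to the request built in the next cell, assuming the standard OpenWeatherMap current-weather JSON shape (`main.temp`, `wind.speed`, `weather[0]`):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def weather_report(url):\n",
+ "    # 'url' is the /data/2.5/weather endpoint assembled in the next cell\n",
+ "    data = requests.get(url).json()\n",
+ "    return {'temperature': data['main']['temp'],  # metric, per units=metric in the query\n",
+ "            'wind speed': data['wind']['speed'],\n",
+ "            'description': data['weather'][0]['description'],\n",
+ "            'weather': data['weather'][0]['main']}"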
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#https://openweathermap.org/current\n", + "city = city=input('Enter the city:')\n", + "url = 'http://api.openweathermap.org/data/2.5/weather?'+'q='+city+'&APPID=b35975e18dc93725acb092f7272cc6b8&units=metric'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# your code" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Book name, price and stock availability as a pandas dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "# This is the url you will scrape in this exercise. \n", + "# It is a fictional bookstore created to be scraped. \n", + "url = 'http://books.toscrape.com/'\n", + "html = requests.get(url).content" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#your code\n", + "soup = BeautifulSoup(html, 'lxml')\n", + "\n", + "datetime = soup.find_all('td', {'class':'tabev6'})\n", + "lyl = soup.find_all('td', {'class':'tabev1'})\n", + "region = soup.find_all('td', {'class':'tb_region'})\n", + "\n", + "limpdate =[str(fecha.text.replace('\\xa0', ' ')) for fecha in datetime]\n", + "datefull = []\n", + "for fecha in limpdate:\n", + " fecha = fecha.replace('earthquake', '')\n", + " fecha =fecha.replace('ago', '')\n", + " datefull.append(fecha)\n", + " \n", + "lat =[]\n", + "lon = []\n", + "for latlon in range(len(lyl)):\n", + " if latlon %2 == 0:\n", + " lat.append(lyl[latlon].text.replace('\\xa0', ''))\n", + " else:\n", + " lon.append(lyl[latlon].text.replace('\\xa0', ''))\n", + " \n", + "lugar =[fecha.text for fecha in region]\n", + "df = pd.DataFrame({'Date and time':datefull, 'Latitud':lat, 'Longitud':lon, 'Region':lugar})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/your-code/main.ipynb b/your-code/main.ipynb index 812f7a4..256b315 100644 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -48,12 +48,13 @@ "from bs4 import BeautifulSoup\n", "import pandas as pd\n", "# from pprint import pprint\n", - "# from lxml import html\n", - "# from lxml.html import fromstring\n", + "from lxml import html\n", + "from lxml.html import fromstring\n", "# import urllib.request\n", "# from urllib.request import urlopen\n", - "# import random\n", - "# import re\n", + "import random\n", + "import re\n", + "import html5lib\n", "# import scrapy" ] }, @@ -64,25 +65,6 @@ "#### Download, parse (using BeautifulSoup), and print the content from the Trending Developers page from GitHub:" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# This is the url you will scrape in this exercise\n", - "url = 'https://github.com/trending/developers'" - ] - }, - { - "cell_type": "code", 
- "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#your code" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -134,11 +116,108 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 361, + "metadata": {}, + "outputs": [], + "source": [ + "# This is the url you will scrape in this exercise\n", + "url = 'https://github.com/trending/developers'" + ] + }, + { + "cell_type": "code", + "execution_count": 362, + "metadata": {}, + "outputs": [], + "source": [ + "#your code\n", + "conts = requests.get(url).content" + ] + }, + { + "cell_type": "code", + "execution_count": 363, + "metadata": {}, + "outputs": [], + "source": [ + "git_soup = BeautifulSoup(conts, 'html5lib')" + ] + }, + { + "cell_type": "code", + "execution_count": 364, "metadata": {}, "outputs": [], "source": [ - "#your code" + "names = git_soup.find_all('h1', {\"class\":\"h3 lh-condensed\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 365, + "metadata": {}, + "outputs": [], + "source": [ + "nick = git_soup.find_all('a', {'class':\"link-gray\"})\n", + "nick = nick[19:44]" + ] + }, + { + "cell_type": "code", + "execution_count": 366, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "nicks = [i.text for i in nick]\n", + "names_s = [i.text for i in names]" + ] + }, + { + "cell_type": "code", + "execution_count": 367, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Eric Ma (ericmjl)',\n", + " 'Federico Brigante (bfred-it)',\n", + " 'Kyle Roach (iRoachie)',\n", + " 'Olle Jonsson (olleolleolle)',\n", + " 'Nikita Sobolev (sobolevn)',\n", + " 'Frank S. Thomas (fthomas)',\n", + " 'syuilo (syuilo)',\n", + " 'Ives van Hoorne (CompuIves)',\n", + " 'Paulus Schoutsen (balloob)',\n", + " 'Sarah Drasner (sdras)',\n", + " 'Stefanos Kornilios Mitsis Poiitidis (skmp)',\n", + " 'Jan Hovancik (hovancik)',\n", + " 'Andreas Mueller (amueller)',\n", + " 'Guillaume Gomez (GuillaumeGomez)',\n", + " 'Matt Holt (mholt)',\n", + " 'Clifford Wolf (cliffordwolf)',\n", + " 'Franck Nijhof (frenck)',\n", + " 'Joe Block (unixorn)',\n", + " 'Andrei Neagoie (aneagoie)',\n", + " 'Jack Lloyd (randombit)',\n", + " 'Guillermo Rauch (rauchg)',\n", + " 'Tim Griesser (tgriesser)',\n", + " 'Jameson Nash (vtjnash)',\n", + " 'Anderson Banihirwe (andersy005)',\n", + " 'Danny Ryan (djrtwo)']" + ] + }, + "execution_count": 367, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "for i in range(len(names_s)):\n", + " names_s[i] = names_s[i] +\" \"+ \"(\" + nicks[i] + \")\"\n", + "names_s" ] }, { @@ -152,7 +231,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 368, "metadata": {}, "outputs": [], "source": [ @@ -162,11 +241,81 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 369, "metadata": {}, "outputs": [], "source": [ - "#your code" + "#your code\n", + "repos = requests.get(url).content\n", + "repo_soup = BeautifulSoup(repos, 'html5lib')" + ] + }, + { + "cell_type": "code", + "execution_count": 370, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "reposi = repo_soup.find_all('h1', {\"class\":\"h3 lh-condensed\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 371, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "repos = [i.text for i in reposi]" + ] + }, + { + "cell_type": "code", + "execution_count": 372, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['gto76/python-cheatsheet',\n", + " 
'j3ssie/Osmedeus',\n", + " 'tangzixiang0304/Shielded_detector',\n", + " 'uber/ludwig',\n", + " 'xinshuoweng/AB3DMOT',\n", + " 'NVlabs/stylegan',\n", + " 'dagster-io/dagster',\n", + " 'tensorflow/models',\n", + " 'eragonruan/text-detection-ctpn',\n", + " 'sherlock-project/sherlock',\n", + " 'deepfakes/faceswap',\n", + " 'nbei/Deep-Flow-Guided-Video-Inpainting',\n", + " 'iovisor/bcc',\n", + " 'Roibal/Cryptocurrency-Trading-Bots-Python-Beginner-Advance',\n", + " 'NVIDIA/DeepLearningExamples',\n", + " 'BlackHC/tfpyth',\n", + " 'clovaai/deep-text-recognition-benchmark',\n", + " 'tkat0/PyTorch_BlazeFace',\n", + " 'OpenMined/PySyft',\n", + " 'CoreyMSchafer/code_snippets',\n", + " 'public-apis/public-apis',\n", + " 'd2l-ai/d2l-zh',\n", + " 'apache/airflow',\n", + " 'beecost/bee-university',\n", + " 'sundowndev/PhoneInfoga']" + ] + }, + "execution_count": 372, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "repos = [i.replace('\\n', '') for i in repos]\n", + "repos = [i.replace(' ', '') for i in repos]\n", + "repos" ] }, { @@ -178,7 +327,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 373, "metadata": {}, "outputs": [], "source": [ @@ -188,11 +337,87 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 374, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "#your code\n", + "Walt = requests.get(url).content\n", + "walt_soup = BeautifulSoup(Walt, 'html')" + ] + }, + { + "cell_type": "code", + "execution_count": 375, "metadata": {}, "outputs": [], "source": [ - "#your code" + "imgs = walt_soup.find_all('div', {\"class\":\"thumbinner\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 376, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['/wiki/File:Walt_Disney_envelope_ca._1921.jpg',\n", + " '/wiki/File:Walt_Disney_envelope_ca._1921.jpg',\n", + " '//upload.wikimedia.org/wikipedia/commons/4/4d/Newman_Laugh-O-Gram_%281921%29.webm',\n", + " '/wiki/File:Newman_Laugh-O-Gram_(1921).webm',\n", + " '/wiki/File:Trolley_Troubles_poster.jpg',\n", + " '/wiki/File:Trolley_Troubles_poster.jpg',\n", + " '/wiki/Trolley_Troubles',\n", + " '/wiki/File:Walt_Disney_and_his_cartoon_creation_%22Mickey_Mouse%22_-_National_Board_of_Review_Magazine.jpg',\n", + " '/wiki/File:Walt_Disney_and_his_cartoon_creation_%22Mickey_Mouse%22_-_National_Board_of_Review_Magazine.jpg',\n", + " '/wiki/Mickey_Mouse',\n", + " '/wiki/File:Steamboat-willie.jpg',\n", + " '/wiki/File:Steamboat-willie.jpg',\n", + " '/wiki/Mickey_Mouse',\n", + " '/wiki/Steamboat_Willie',\n", + " '/wiki/File:Walt_Disney_1935.jpg',\n", + " '/wiki/File:Walt_Disney_1935.jpg',\n", + " '/wiki/File:Walt_Disney_Snow_white_1937_trailer_screenshot_(13).jpg',\n", + " '/wiki/File:Walt_Disney_Snow_white_1937_trailer_screenshot_(13).jpg',\n", + " '/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)',\n", + " '/wiki/File:Disney_drawing_goofy.jpg',\n", + " '/wiki/File:Disney_drawing_goofy.jpg',\n", + " '/wiki/Goofy',\n", + " '/wiki/File:DisneySchiphol1951.jpg',\n", + " '/wiki/File:DisneySchiphol1951.jpg',\n", + " '/wiki/File:WaltDisneyplansDisneylandDec1954.jpg',\n", + " '/wiki/File:WaltDisneyplansDisneylandDec1954.jpg',\n", + " '/wiki/Disneyland',\n", + " '/wiki/Orange_County,_California',\n", + " '/wiki/File:Walt_disney_portrait_right.jpg',\n", + " '/wiki/File:Walt_disney_portrait_right.jpg',\n", + " '/wiki/File:Walt_Disney_Grave.JPG',\n", + " '/wiki/File:Walt_Disney_Grave.JPG',\n", + " 
'/wiki/File:Roy_O._Disney_with_Company_at_Press_Conference.jpg',\n", + " '/wiki/File:Roy_O._Disney_with_Company_at_Press_Conference.jpg',\n", + " '/wiki/Roy_O._Disney',\n", + " '/wiki/File:Disney_Display_Case.JPG',\n", + " '/wiki/File:Disney_Display_Case.JPG',\n", + " '/wiki/The_Walt_Disney_Family_Museum',\n", + " '/wiki/File:Disney1968.jpg',\n", + " '/wiki/File:Disney1968.jpg']" + ] + }, + "execution_count": 376, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "l = []\n", + "for i in range(len(imgs)):\n", + " for img in (imgs[i].find_all('a', href=True)):\n", + " l.append(img['href'])\n", + "l" ] }, { @@ -209,7 +434,20 @@ "outputs": [], "source": [ "# This is the url you will scrape in this exercise\n", - "url ='https://en.wikipedia.org/wiki/Python' " + "url ='https://en.wikipedia.org/wiki/Python' \n", + "html = requests.get(url).content" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#your code\n", + "soup = BeautifulSoup(html, 'lxml')\n", + "ul = soup.find_all('ul')\n", + "ul= ul[2:14]" ] }, { @@ -218,7 +456,8 @@ "metadata": {}, "outputs": [], "source": [ - "#your code" + "l = [img['href'] for i in range(len(ul)) for img in (ul[i].find_all('a', href=True)) ] \n", + "l" ] }, { @@ -235,7 +474,8 @@ "outputs": [], "source": [ "# This is the url you will scrape in this exercise\n", - "url = 'http://uscode.house.gov/download/download.shtml'" + "url = 'http://uscode.house.gov/download/download.shtml'\n", + "html = requests.get(url).content" ] }, { @@ -244,7 +484,10 @@ "metadata": {}, "outputs": [], "source": [ - "#your code" + "#your code\n", + "soup = BeautifulSoup(html, 'lxml')\n", + "#soup\n", + "titles = soup.find_all('div', {'class':'usctitlechanged'})" ] }, { @@ -261,7 +504,19 @@ "outputs": [], "source": [ "# This is the url you will scrape in this exercise\n", - "url = 'https://www.fbi.gov/wanted/topten'" + "url = 'https://www.fbi.gov/wanted/topten'\n", + "html = requests.get(url).content" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#your code \n", + "soup = BeautifulSoup(html, 'lxml')\n", + "titles = soup.find_all('h3', {'class':'title'})" ] }, { @@ -270,7 +525,8 @@ "metadata": {}, "outputs": [], "source": [ - "#your code " + "buscados = [nombre.text.replace(\"\\n\", \"\") for nombre in titles]\n", + "buscados" ] }, { @@ -287,7 +543,60 @@ "outputs": [], "source": [ "# This is the url you will scrape in this exercise\n", - "url = 'https://www.emsc-csem.org/Earthquake/'" + "url = 'https://www.emsc-csem.org/Earthquake/'\n", + "html = requests.get(url).content" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#your code\n", + "soup = BeautifulSoup(html, 'lxml')\n", + "\n", + "datetime = soup.find_all('td', {'class':'tabev6'})\n", + "lyl = soup.find_all('td', {'class':'tabev1'})\n", + "region = soup.find_all('td', {'class':'tb_region'})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "limpdate =[str(fecha.text.replace('\\xa0', ' ')) for fecha in datetime]\n", + "datefull = []\n", + "for fecha in limpdate:\n", + " fecha = fecha.replace('earthquake', '')\n", + " fecha =fecha.replace('ago', '')\n", + " datefull.append(fecha)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lat =[]\n", + "lon = []\n", + "for latlon in 
range(len(lyl)):\n", + " if latlon %2 == 0:\n", + " lat.append(lyl[latlon].text.replace('\\xa0', ''))\n", + " else:\n", + " lon.append(lyl[latlon].text.replace('\\xa0', ''))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lugar =[fecha.text for fecha in region]" ] }, { @@ -296,7 +605,8 @@ "metadata": {}, "outputs": [], "source": [ - "#your code" + "df = pd.DataFrame({'Date and time':datefull, 'Latitud':lat, 'Longitud':lon, 'Region':lugar})\n", + "df.head(20)" ] }, { @@ -308,7 +618,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 291, "metadata": {}, "outputs": [], "source": [ @@ -318,11 +628,125 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 304, + "metadata": {}, + "outputs": [], + "source": [ + "#your code\n", + "hackatons_soup = requests.get(url).content\n", + "hackatons_soup = BeautifulSoup(hackatons_soup, 'html5lib')" + ] + }, + { + "cell_type": "code", + "execution_count": 342, "metadata": {}, "outputs": [], "source": [ - "#your code" + "hackis = hackatons_soup.find_all('div', {'class':'card-body'})" + ] + }, + { + "cell_type": "code", + "execution_count": 343, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Code Geist Hackathon by SefrWahed\\u20037/29/2019\\n\\n\\u2003Cairo, Egypt',\n", + " 'The Code Factor\\u20035/21/2019\\n\\n\\u2003Milano, Italy',\n", + " 'TECHFEST MUNICH\\u20039/6/2019\\n\\n\\u2003Munich, Germany',\n", + " 'Galileo App Competition\\u20031/31/2019\\n\\n\\u2003Prague, Czech Republic']" + ] + }, + "execution_count": 343, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hacks = [hackis[i].text for i in range(len(hackis))]\n", + "hacks = [i.split(\"\\u2003\") for i in hacks]" + ] + }, + { + "cell_type": "code", + "execution_count": 348, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012
0Code Geist Hackathon by SefrWahed7/29/2019Cairo, Egypt
1The Code Factor5/21/2019Milano, Italy
2TECHFEST MUNICH9/6/2019Munich, Germany
3Galileo App Competition1/31/2019Prague, Czech Republic
\n", + "
" + ], + "text/plain": [ + " 0 1 2\n", + "0 Code Geist Hackathon by SefrWahed 7/29/2019 Cairo, Egypt\n", + "1 The Code Factor 5/21/2019 Milano, Italy\n", + "2 TECHFEST MUNICH 9/6/2019 Munich, Germany\n", + "3 Galileo App Competition 1/31/2019 Prague, Czech Republic" + ] + }, + "execution_count": 348, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hacks = pd.DataFrame(hacks)\n", + "hacks[1] = hacks[1].apply(lambda x:x.replace('\\n', ''))\n", + "hacks" ] }, { @@ -353,11 +777,39 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 300, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Check the tweets of an account:shiroiusagi4486\n" + ] + }, + { + "data": { + "text/plain": [ + "'76'" + ] + }, + "execution_count": 300, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "#your code" + "#your code\n", + "usr_input = input('Check the tweets of an account:')\n", + "try:\n", + " twitter_follows = requests.get('https://twitter.com/'+usr_input).content\n", + "except:\n", + " print('The account does not exist')\n", + "twitter_follows = BeautifulSoup(twitter_follows, 'lxml')\n", + "twitter_follows = twitter_follows.find_all('span', {'class': 'ProfileNav-value'})\n", + "twitter_follows[0]['data-count']\n", + " \n", + " " ] }, { @@ -388,11 +840,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 298, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Check the followers of an account:uaquiro\n" + ] + }, + { + "data": { + "text/plain": [ + "'302'" + ] + }, + "execution_count": 298, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "#your code" + "#your code\n", + "usr_input = input('Check the followers of an account:')\n", + "try:\n", + " twitter_follows = requests.get('https://twitter.com/'+usr_input).content\n", + "except:\n", + " print('The account does not exist')\n", + "twitter_follows = BeautifulSoup(twitter_follows, 'lxml')\n", + "twitter_follows = twitter_follows.find_all('span', {'class': 'ProfileNav-value'})\n", + "twitter_follows[2]['data-count']\n", + " \n", + " \n", + " \n", + " " ] }, { @@ -404,7 +886,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 207, "metadata": {}, "outputs": [], "source": [ @@ -414,11 +896,243 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 208, + "metadata": {}, + "outputs": [], + "source": [ + "#your code\n", + "wikissup = requests.get(url).content\n", + "wikissup = BeautifulSoup(wikissup, 'lxml')" + ] + }, + { + "cell_type": "code", + "execution_count": 211, + "metadata": {}, + "outputs": [], + "source": [ + "langs = wikissup.find_all('div', {'class':'central-featured'})" + ] + }, + { + "cell_type": "code", + "execution_count": 268, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['English',\n", + " '日本語',\n", + " 'Español',\n", + " 'Deutsch',\n", + " 'Русский',\n", + " 'Français',\n", + " 'Italiano',\n", + " '中文',\n", + " 'Português',\n", + " 'Polski']" + ] + }, + "execution_count": 268, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "langu = []\n", + "for i in range(len(langs)):\n", + " langu.append(langs[i].find_all('strong'))\n", + "langu = [i for i in langu[0]]\n", + "langu = [i.text for i in langu]" + ] + }, + { + "cell_type": "code", + "execution_count": 245, + "metadata": {}, + "outputs": [ + { 
+ "data": { + "text/plain": [ + "[['English5\\xa0892\\xa0000',\n", + " ' articles日本語1\\xa0159\\xa0000',\n", + " ' 記事Español1\\xa0532\\xa0000',\n", + " ' artículosDeutsch2\\xa0323\\xa0000',\n", + " ' ArtikelРусский1\\xa0556\\xa0000',\n", + " ' статейFrançais2\\xa0123\\xa0000',\n", + " ' articlesItaliano1\\xa0541\\xa0000',\n", + " ' voci中文1\\xa0065\\xa0000',\n", + " ' 條目Português1\\xa0010\\xa0000',\n", + " ' artigosPolski1\\xa0346\\xa0000',\n", + " ' haseł']]" + ] + }, + "execution_count": 245, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numbers = [] \n", + "for i in range(len(langs)):\n", + " numbers.append(langs[i].find_all('small'))\n", + "numbers = [i.text for i in langs]\n", + "numbers = [i.replace('\\n', '') for i in numbers]\n", + "numbers = [i.split('+') for i in numbers]\n", + "numbers" + ] + }, + { + "cell_type": "code", + "execution_count": 257, "metadata": {}, "outputs": [], "source": [ - "#your code" + "numbers = [re.findall('\\d', i) for i in liss]" + ] + }, + { + "cell_type": "code", + "execution_count": 258, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['5892000',\n", + " '1159000',\n", + " '1532000',\n", + " '2323000',\n", + " '1556000',\n", + " '2123000',\n", + " '1541000',\n", + " '1065000',\n", + " '1010000',\n", + " '1346000']" + ] + }, + "execution_count": 258, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numberss = [\"\".join(numbers[i]) for i in range(len(numbers))]\n", + "numbers = numberss[:-1]\n", + "numbers" + ] + }, + { + "cell_type": "code", + "execution_count": 269, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LanguageArticles
0English5892000
1日本語1159000
2Español1532000
3Deutsch2323000
4Русский1556000
5Français2123000
6Italiano1541000
7中文1065000
8Português1010000
9Polski1346000
\n", + "
" + ], + "text/plain": [ + " Language Articles\n", + "0 English 5892000\n", + "1 日本語 1159000\n", + "2 Español 1532000\n", + "3 Deutsch 2323000\n", + "4 Русский 1556000\n", + "5 Français 2123000\n", + "6 Italiano 1541000\n", + "7 中文 1065000\n", + "8 Português 1010000\n", + "9 Polski 1346000" + ] + }, + "execution_count": 269, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "d = {'Language':langu, 'Articles':numbers}\n", + "Wiki_df = pd.DataFrame(d)\n", + "Wiki_df" ] }, { @@ -430,7 +1144,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 191, "metadata": {}, "outputs": [], "source": [ @@ -440,11 +1154,166 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 192, + "metadata": {}, + "outputs": [], + "source": [ + "#your code \n", + "datasets_soup = requests.get(url).content\n", + "datasets_soup = BeautifulSoup(datasets_soup, 'lxml')" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Business and economy', 'Crime and justice', 'Defence', 'Education', 'Environment', 'Government', 'Government spending', 'Health', 'Mapping', 'Society', 'Towns and cities', 'Transport']\n" + ] + } + ], + "source": [ + "names = datasets_soup.find_all('h2')\n", + "names_s1 = [i.text for i in names]\n", + "\n", + "print(names_s1)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 204, "metadata": {}, "outputs": [], "source": [ - "#your code " + "descp = datasets_soup.find_all('p')\n", + "descp = descp[5:-2]\n", + "descp = [i.text for i in descp]" + ] + }, + { + "cell_type": "code", + "execution_count": 206, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DatasetDescription
0Business and economySmall businesses, industry, imports, exports a...
1Crime and justiceCourts, police, prison, offenders, borders and...
2DefenceArmed forces, health and safety, search and re...
3EducationStudents, training, qualifications and the Nat...
4EnvironmentWeather, flooding, rivers, air quality, geolog...
5GovernmentStaff numbers and pay, local councillors and d...
6Government spendingIncludes all payments by government department...
7HealthIncludes smoking, drugs, alcohol, medicine per...
8MappingAddresses, boundaries, land ownership, aerial ...
9SocietyEmployment, benefits, household finances, pove...
10Towns and citiesIncludes housing, urban planning, leisure, was...
11TransportAirports, roads, freight, electric vehicles, p...
\n", + "
" + ], + "text/plain": [ + " Dataset Description\n", + "0 Business and economy Small businesses, industry, imports, exports a...\n", + "1 Crime and justice Courts, police, prison, offenders, borders and...\n", + "2 Defence Armed forces, health and safety, search and re...\n", + "3 Education Students, training, qualifications and the Nat...\n", + "4 Environment Weather, flooding, rivers, air quality, geolog...\n", + "5 Government Staff numbers and pay, local councillors and d...\n", + "6 Government spending Includes all payments by government department...\n", + "7 Health Includes smoking, drugs, alcohol, medicine per...\n", + "8 Mapping Addresses, boundaries, land ownership, aerial ...\n", + "9 Society Employment, benefits, household finances, pove...\n", + "10 Towns and cities Includes housing, urban planning, leisure, was...\n", + "11 Transport Airports, roads, freight, electric vehicles, p..." + ] + }, + "execution_count": 206, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "d = {'Dataset':names_s1, 'Description': descp}\n", + "DataSets = pd.DataFrame(d)\n", + "DataSets" ] }, { @@ -456,7 +1325,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 349, "metadata": {}, "outputs": [], "source": [ @@ -466,11 +1335,283 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 350, + "metadata": {}, + "outputs": [], + "source": [ + "#your code\n", + "table_lang = requests.get(url).content\n", + "table_soup = BeautifulSoup(table_lang, 'html')" + ] + }, + { + "cell_type": "code", + "execution_count": 351, + "metadata": {}, + "outputs": [], + "source": [ + "tables = table_soup.find_all('tr')" + ] + }, + { + "cell_type": "code", + "execution_count": 352, + "metadata": {}, + "outputs": [], + "source": [ + "tabless = []\n", + "for i in range(len(tables)):\n", + " tabless.append(tables[i])" + ] + }, + { + "cell_type": "code", + "execution_count": 353, "metadata": {}, "outputs": [], "source": [ - "#your code" + "tal = [tabless[i].text.split(\"\\n\\n\") for i in range(len(tabless))]" + ] + }, + { + "cell_type": "code", + "execution_count": 354, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012345678
0RankRankLanguagePrimary CountryTotalCountries[a]Speakers(millions)% of the World population\\n(March 2019)[7]\\nMacrolanguageLanguage familyBranch
11Chinese (macrolanguage)China391,31117.026Sino-TibetanSinitic
21MandarinChina1391811.922ChineseSino-TibetanSinitic
322SpanishSpain314605.974Indo-EuropeanRomance
433EnglishUnited Kingdom1373794.922Indo-EuropeanGermanic
544HindiIndia43414.429Indo-EuropeanIndo-Aryan
65Arabic (macrolanguage)Saudi Arabia593194.143AfroasiaticSemitic
756BengaliBangladesh42282.961Indo-EuropeanIndo-Aryan
867PortuguesePortugal152212.870Indo-EuropeanRomance
978RussianRussian Federation191542.000Indo-EuropeanBalto-Slavic
1089JapaneseJapan21281.662JaponicJapanese
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 \\\n", + "0 Rank Rank Language Primary Country \n", + "1 — 1 Chinese (macrolanguage) China \n", + "2 1 — Mandarin China \n", + "3 2 2 Spanish Spain \n", + "4 3 3 English United Kingdom \n", + "5 4 4 Hindi India \n", + "6 — 5 Arabic (macrolanguage) Saudi Arabia \n", + "7 5 6 Bengali Bangladesh \n", + "8 6 7 Portuguese Portugal \n", + "9 7 8 Russian Russian Federation \n", + "10 8 9 Japanese Japan \n", + "\n", + " 4 5 \\\n", + "0 TotalCountries[a] Speakers(millions) \n", + "1 39 1,311 \n", + "2 13 918 \n", + "3 31 460 \n", + "4 137 379 \n", + "5 4 341 \n", + "6 59 319 \n", + "7 4 228 \n", + "8 15 221 \n", + "9 19 154 \n", + "10 2 128 \n", + "\n", + " 6 7 \\\n", + "0 % of the World population\\n(March 2019)[7] \\nMacrolanguage \n", + "1 17.026 \n", + "2 11.922 Chinese \n", + "3 5.974 \n", + "4 4.922 \n", + "5 4.429 \n", + "6 4.143 \n", + "7 2.961 \n", + "8 2.870 \n", + "9 2.000 \n", + "10 1.662 \n", + "\n", + " 8 \n", + "0 Language familyBranch \n", + "1 Sino-TibetanSinitic \n", + "2 Sino-TibetanSinitic \n", + "3 Indo-EuropeanRomance \n", + "4 Indo-EuropeanGermanic \n", + "5 Indo-EuropeanIndo-Aryan \n", + "6 AfroasiaticSemitic \n", + "7 Indo-EuropeanIndo-Aryan \n", + "8 Indo-EuropeanRomance \n", + "9 Indo-EuropeanBalto-Slavic \n", + "10 JaponicJapanese " + ] + }, + "execution_count": 354, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "talcual = pd.DataFrame(tal)\n", + "talcual = talcual[:11]\n", + "talcual[0] = talcual[0].apply(lambda x: x.replace('\\n', ''))\n", + "talcual[8] = talcual[8].apply(lambda x: x.replace('\\n', ''))\n", + "talcual" ] }, { @@ -516,7 +1657,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 155, "metadata": {}, "outputs": [], "source": [ @@ -530,7 +1671,10 @@ "metadata": {}, "outputs": [], "source": [ - "#your code" + "#your code\n", + "fims_get = requests.get(url).content\n", + "films_soup = BeautifulSoup(films_get, 'lxml')\n", + "films_names = films_soup.find_all(<" ] }, { @@ -569,7 +1713,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 355, "metadata": {}, "outputs": [], "source": [ @@ -580,11 +1724,135 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 356, "metadata": {}, "outputs": [], "source": [ - "#your code" + "#your code\n", + "books = requests.get(url).content\n", + "books_soup = BeautifulSoup(books, 'html')" + ] + }, + { + "cell_type": "code", + "execution_count": 357, + "metadata": {}, + "outputs": [], + "source": [ + "names = books_soup.find_all('h3')\n", + "names_list = []\n", + "for i in range(len(names)):\n", + " for name in (names[i].find_all('a', title=True)):\n", + " names_list.append(name['title'])" + ] + }, + { + "cell_type": "code", + "execution_count": 358, + "metadata": {}, + "outputs": [], + "source": [ + "prices = books_soup.find_all('p', {'class':'price_color'})\n", + "\n", + "prices_list = [prices[i].text for i in range(len(prices))]" + ] + }, + { + "cell_type": "code", + "execution_count": 359, + "metadata": {}, + "outputs": [], + "source": [ + "stock = books_soup.find_all('p', {'class':'instock availability'})\n", + "stock = [stock[i].text for i in range(len(stock))]\n", + "stock = [i.replace('\\n', \"\") for i in stock]\n", + "stock = [re.sub('\\s\\s+', '', stock[i]) for i in range(len(stock))] " + ] + }, + { + "cell_type": "code", + "execution_count": 360, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Book NamePriceStock Availability
0A Light in the Attic£51.77In stock
1Tipping the Velvet£53.74In stock
2Soumission£50.10In stock
3Sharp Objects£47.82In stock
4Sapiens: A Brief History of Humankind£54.23In stock
\n", + "
" + ], + "text/plain": [ + " Book Name Price Stock Availability\n", + "0 A Light in the Attic £51.77 In stock\n", + "1 Tipping the Velvet £53.74 In stock\n", + "2 Soumission £50.10 In stock\n", + "3 Sharp Objects £47.82 In stock\n", + "4 Sapiens: A Brief History of Humankind £54.23 In stock" + ] + }, + "execution_count": 360, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "d = {\"Book Name\":names_list, \"Price\":prices_list, \"Stock Availability\": stock}\n", + "books = pd.DataFrame(d)\n", + "books.head()" ] } ],