diff --git a/your-code/main.ipynb b/your-code/main.ipynb index 1fe9046..2e507fd 100755 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -54,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -64,11 +64,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "# your code here" + "# your code here\n", + "html = requests.get(url).content\n", + "soup = BeautifulSoup(html)\n", + "soup1 = soup.find_all(\"div\", attrs = {\"class\": \"d-sm-flex flex-auto\"})" ] }, { @@ -126,11 +129,92 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "25" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# your code here" + "len(soup1)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Jerry Liu'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "soup1[5].find_all(\"a\", attrs = {\"data-view-component\": \"true\"})[0].get_text().strip()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Johannes Köster',\n", + " 'Patrick von Platen',\n", + " 'Leonid Bugaev',\n", + " 'Stephen Celis',\n", + " 'anxdpanic',\n", + " 'Jerry Liu',\n", + " 'Sridhar Ratnakumar',\n", + " 'lijianan',\n", + " 'Thomas Eizinger',\n", + " 'Alan Shaw',\n", + " 'Luca Palmieri',\n", + " 'Michael Bui',\n", + " 'Harrison Chase',\n", + " 'HeYunfei',\n", + " 'Paul Beusterien',\n", + " 'bcoles',\n", + " 'pilcrowOnPaper',\n", + " 'Bjorn 
Lu',\n", + " 'Alessandro Ros',\n", + " 'yihong',\n", + " 'Erik Bernhardsson',\n", + " 'Hamish Willee',\n", + " 'Stefan Prodan',\n", + " 'Drew Powers',\n", + " 'Sebastian Muszynski']" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "names = []\n", + "for i in range(len(soup1)):\n", + " x = soup1[i].find_all(\"a\", attrs = {\"data-view-component\": \"true\"})[0].get_text().strip()\n", + " names.append(x)\n", + "names" ] }, { @@ -144,7 +228,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -154,11 +238,56 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# your code here" + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Fxtekky/Fgpt4free',\n", + " 'FAIGC-Audio/FAudioGPT',\n", + " 'Falaeddine-13/Fthinkgpt',\n", + " 'Fdeep-floyd/FIF',\n", + " 'FNVIDIA/FNeMo-Guardrails',\n", + " 'Fgaomingqi/FTrack-Anything',\n", + " 'Fnlpxucan/FWizardLM',\n", + " 'FCVI-SZU/FLinly',\n", + " 'Fdeforum-art/Fdeforum-stable-diffusion',\n", + " 'FGFW-knocker/Fgfw_resist_tls_proxy',\n", + " 'FX-PLUG/FmPLUG-Owl',\n", + " 'FUX-Decoder/FSegment-Everything-Everywhere-All-At-Once',\n", + " 'Fbhaskatripathi/FpdfGPT',\n", + " 'Fopenai/Fplugins-quickstart',\n", + " '/sponsors/freedmand',\n", + " 'Fopenai/Fchatgpt-retrieval-plugin',\n", + " 'Faniskoubaa/Frosgpt',\n", + " 'Fultralytics/Fultralytics',\n", + " 'Ffarshadz1997/FMicrosoft-Rewards-bot',\n", + " '/sponsors/521xueweihan',\n", + " 'FJun-CEN/FSegmentAnyRGBD',\n", + " 'Fopen-mmlab/Fmmagic',\n", + " 'Flocalstack/Flocalstack',\n", + " 'Fxtekky/Fchatgpt-clone',\n", + " 'Fbookfere/FEbook-Translator-Calibre-Plugin']" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "html = requests.get(url).content\n", + 
"soup = BeautifulSoup(html)\n", + "soup1 = soup.find_all(\"div\", attrs = {\"class\": \"position-relative container-lg p-responsive pt-6\"})\n", + "soup2 = soup1[0].find_all(\"article\", attrs = {\"class\": \"Box-row\"})\n", + "soup3 = soup2[0].a[\"href\"].strip()\n", + "names2 = []\n", + "for i in range(len(soup2)):\n", + " x = soup2[i].a[\"href\"].strip().replace(\"/login?return_to=%2F\",\"\").replace(\"%2F\",\"/\")\n", + " names2.append(x)\n", + "names2\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ @@ -171,7 +300,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -181,11 +310,50 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# your code here" + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['/wiki/File:Walt_Disney_1946.JPG',\n", + " '/wiki/File:Walt_Disney_1942_signature.svg',\n", + " '/wiki/File:Walt_Disney_Birthplace_Exterior_Hermosa_Chicago_Illinois.jpg',\n", + " '/wiki/File:Walt_Disney_envelope_ca._1921.jpg',\n", + " '/wiki/File:Trolley_Troubles_poster.jpg',\n", + " '/wiki/File:Steamboat-willie.jpg',\n", + " '/wiki/File:Walt_Disney_1935.jpg',\n", + " '/wiki/File:Walt_Disney_Snow_white_1937_trailer_screenshot_(13).jpg',\n", + " '/wiki/File:Disney_drawing_goofy.jpg',\n", + " '/wiki/File:WaltDisneyplansDisneylandDec1954.jpg',\n", + " '/wiki/File:Walt_disney_portrait_right.jpg',\n", + " '/wiki/File:Walt_Disney_Grave.JPG',\n", + " '/wiki/File:Roy_O._Disney_with_Company_at_Press_Conference.jpg',\n", + " '/wiki/File:DisneySchiphol1951.jpg',\n", + " '/wiki/File:Disney1968.jpg',\n", + " '/wiki/File:Disney_Oscar_1953_(cropped).jpg',\n", + " '/wiki/File:Disneyland_Resort_logo.svg',\n", + " '/wiki/File:Animation_disc.svg',\n", + " '/wiki/File:Magic_Kingdom_castle.jpg',\n", + " '/wiki/File:Blank_television_set.svg']" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"# your code here\n", + "link_list = []\n", + "html = requests.get(url).content\n", + "soup = BeautifulSoup(html)\n", + "\n", + "soup2 =soup.find_all(\"a\", attrs= {\"class\":\"image\"})\n", + "for item in range(len(soup2)):\n", + " x =soup2[item].get(\"href\")\n", + " link_list.append(x)\n", + "link_list" ] }, { @@ -197,7 +365,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -207,11 +375,44 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# your code here" + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'English': '6644000+',\n", + " 'Русский': '1909000+',\n", + " '日本語': '1370000+',\n", + " 'Deutsch': '2792000+',\n", + " 'Español': '1854000+',\n", + " 'Français': '2514000+',\n", + " 'Italiano': '1806000+',\n", + " '中文': '1347000+',\n", + " 'فارسی': 'فارسی',\n", + " 'Português': '1101000+'}" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "language = []\n", + "number = []\n", + "html = requests.get(url).content\n", + "soup = BeautifulSoup(html)\n", + "soup2 = soup.find_all(\"div\", attrs = {\"class\": \"central-featured\"})\n", + "soup3 = soup2[0].find_all(\"div\")\n", + "\n", + "for item in soup3:\n", + " language.append(item.find_all(\"strong\")[0].get_text())\n", + " number.append((item.find_all(\"bdi\")[0].get_text().replace('\\xa0', '')))\n", + "final = zip(language,number)\n", + "final =dict(final)\n", + "final" ] }, { @@ -224,7 +425,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -234,11 +435,146 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# your code here" + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ 
+ "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LanguageNative speakers (millions)Language familyBranch
0Mandarin Chinese (incl. Standard Chinese, but ...939.0Sino-TibetanSinitic
1Spanish485.0Indo-EuropeanRomance
2English380.0Indo-EuropeanGermanic
3Hindi (excl. Urdu, and other languages)345.0Indo-EuropeanIndo-Aryan
4Portuguese236.0Indo-EuropeanRomance
5Bengali234.0Indo-EuropeanIndo-Aryan
6Russian147.0Indo-EuropeanBalto-Slavic
7Japanese123.0JaponicJapanese
8Yue Chinese (incl. Cantonese)86.1Sino-TibetanSinitic
9Vietnamese85.0AustroasiaticVietic
\n", + "
" + ], + "text/plain": [ + " Language \n", + "0 Mandarin Chinese (incl. Standard Chinese, but ... \\\n", + "1 Spanish \n", + "2 English \n", + "3 Hindi (excl. Urdu, and other languages) \n", + "4 Portuguese \n", + "5 Bengali \n", + "6 Russian \n", + "7 Japanese \n", + "8 Yue Chinese (incl. Cantonese) \n", + "9 Vietnamese \n", + "\n", + " Native speakers (millions) Language family Branch \n", + "0 939.0 Sino-Tibetan Sinitic \n", + "1 485.0 Indo-European Romance \n", + "2 380.0 Indo-European Germanic \n", + "3 345.0 Indo-European Indo-Aryan \n", + "4 236.0 Indo-European Romance \n", + "5 234.0 Indo-European Indo-Aryan \n", + "6 147.0 Indo-European Balto-Slavic \n", + "7 123.0 Japonic Japanese \n", + "8 86.1 Sino-Tibetan Sinitic \n", + "9 85.0 Austroasiatic Vietic " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "html = pd.read_html(url)\n", + "html[0].head(10)" ] }, { @@ -251,7 +587,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 78, "metadata": {}, "outputs": [], "source": [ @@ -261,11 +597,37 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# your code here" + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'(1994)'" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "Movies = {}\n", + "html = requests.get(url).content\n", + "soup = BeautifulSoup(html)\n", + "soup2 = soup.find_all(\"tbody\", attrs = {\"class\": \"lister-list\"})\n", + "soup3 = soup2[0].find_all(\"a\")#, attrs = {\"class\": \"lister-list\"})\n", + "Name = soup3[0].img[\"alt\"]\n", + "year = soup2[0].find_all(\"span\", attrs={\"class\": \"secondaryInfo\"})[0].get_text()\n", + "\n", + "#soup3[1].a[\"title\"]\n", + "\n", + "#for item in soup3:\n", + " #name = item.img[\"alt\"] \n", + "\n", + "\n", + 
" " ] }, { @@ -277,21 +639,238 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 128, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Pesadelo em Elm Street'" + ] + }, + "execution_count": 128, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#This is the url you will scrape in this exercise\n", + "url = 'https://www.imdb.com/list/ls009796553/'\n", + "# your code here\n", + "html = requests.get(url).content\n", + "soup = BeautifulSoup(html)\n", + "\n", + "soup2 = soup.find_all(\"div\", attrs = {\"class\":\"lister-item mode-detail\"})\n", + "soup3 = soup2[0].find_all(\"a\")\n", + "film_name = soup3[0].img[\"alt\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'(1984)'" + ] + }, + "execution_count": 133, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "year = soup2[0].find_all(\"span\", attrs = {\"class\":\"lister-item-year text-muted unbold\"})[0].get_text()" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Teenager Nancy Thompson must uncover the dark truth concealed by her parents after she and her friends become targets of the spirit of a serial killer with a bladed glove in their dreams, in which if they die, it kills them in real life.'" + ] + }, + "execution_count": 141, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Description = soup2[0].find_all(\"p\", attrs = {\"class\":\"\"})[0].get_text().strip()" + ] + }, + { + "cell_type": "code", + "execution_count": 153, "metadata": {}, "outputs": [], "source": [ - "#This is the url you will scrape in this exercise\n", - "url = 'https://www.imdb.com/list/ls009796553/'" + "movies = {}\n", + "index = 0\n", + "for item in range(11):\n", + " soup3 =soup2[item].find_all(\"a\")\n", + " film_name = 
soup3[0].img[\"alt\"]\n", + " year =soup2[item].find_all(\"span\", attrs = {\"class\":\"lister-item-year text-muted unbold\"})[0].get_text()\n", + " Description =soup2[item].find_all(\"p\", attrs = {\"class\":\"\"})[0].get_text().strip()\n", + " movies[index] = {\"movie\": film_name,\n", + " \"year\":year,\n", + " \"Description\":Description}\n", + " index +=1" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 154, "metadata": {}, "outputs": [], "source": [ - "# your code here" + "movies_df = pd.DataFrame(movies).T" + ] + }, + { + "cell_type": "code", + "execution_count": 155, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
movieyearDescription
0Pesadelo em Elm Street(1984)Teenager Nancy Thompson must uncover the dark ...
1Despertares(1990)The victims of an encephalitis epidemic many y...
2Liga de Mulheres(1992)Two sisters join the first female professional...
3Um Bairro em Nova Iorque(1993)A father becomes worried when a local gangster...
4Anjos em Campo(1994)When a boy prays for a chance to have a family...
5Tempo de Matar(1996)In Clanton, Mississippi, a fearless young lawy...
6Amistad(1997)In 1839, the revolt of Mende captives aboard a...
7Anaconda(1997)A \"National Geographic\" film crew is taken hos...
8A Cool, Dry Place(1998)Russell, single father balances his work as a ...
9América Proibida(1998)A former neo-nazi skinhead tries to prevent hi...
10Uma Questão de Nervos(1999)A comedy about a psychiatrist whose number-one...
\n", + "
" + ], + "text/plain": [ + " movie year \n", + "0 Pesadelo em Elm Street (1984) \\\n", + "1 Despertares (1990) \n", + "2 Liga de Mulheres (1992) \n", + "3 Um Bairro em Nova Iorque (1993) \n", + "4 Anjos em Campo (1994) \n", + "5 Tempo de Matar (1996) \n", + "6 Amistad (1997) \n", + "7 Anaconda (1997) \n", + "8 A Cool, Dry Place (1998) \n", + "9 América Proibida (1998) \n", + "10 Uma Questão de Nervos (1999) \n", + "\n", + " Description \n", + "0 Teenager Nancy Thompson must uncover the dark ... \n", + "1 The victims of an encephalitis epidemic many y... \n", + "2 Two sisters join the first female professional... \n", + "3 A father becomes worried when a local gangster... \n", + "4 When a boy prays for a chance to have a family... \n", + "5 In Clanton, Mississippi, a fearless young lawy... \n", + "6 In 1839, the revolt of Mende captives aboard a... \n", + "7 A \"National Geographic\" film crew is taken hos... \n", + "8 Russell, single father balances his work as a ... \n", + "9 A former neo-nazi skinhead tries to prevent hi... \n", + "10 A comedy about a psychiatrist whose number-one... 
" + ] + }, + "execution_count": 155, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "movies_df" ] }, { @@ -310,7 +889,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -321,7 +900,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -337,7 +916,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -348,7 +927,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -365,9 +944,24 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['https://www.emsc-csem.org/Earthquake/?view=1',\n", + " 'https://www.emsc-csem.org/Earthquake/?view=2',\n", + " 'https://www.emsc-csem.org/Earthquake/?view=3',\n", + " 'https://www.emsc-csem.org/Earthquake/?view=4',\n", + " 'https://www.emsc-csem.org/Earthquake/?view=5']" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# This is the url you will scrape in this exercise\n", "url = 'https://www.emsc-csem.org/Earthquake/?view='\n", @@ -385,7 +979,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -409,7 +1003,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.11.2" } }, "nbformat": 4,