diff --git a/your-code/main.ipynb b/your-code/main.ipynb index 1fe9046..769c4f1 100755 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -54,21 +54,1334 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# This is the url you will scrape in this exercise\n", - "url = 'https://github.com/trending/developers'" + "url = 'https://github.com/trending/developers/'" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[
\n", + " \n", + " 1\n", + " \n", + "
\n", + " \n", + " \"@emilk\"\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

\n", + " \n", + " Emil Ernerfeldt\n", + "

\n", + "

\n", + " \n", + " emilk\n", + "

\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " Follow\n", + " \n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
,\n", + "
\n", + " \n", + " 2\n", + " \n", + "
\n", + " \n", + " \"@pilcrowOnPaper\"\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

\n", + " \n", + " pilcrowOnPaper\n", + "

\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " Follow\n", + " \n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
,\n", + "
\n", + " \n", + " 3\n", + " \n", + "
\n", + " \n", + " \"@lllyasviel\"\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

\n", + " \n", + " lllyasviel\n", + "

\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " Follow\n", + " \n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
,\n", + "
\n", + " \n", + " 4\n", + " \n", + "
\n", + " \n", + " \"@aler9\"\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

\n", + " \n", + " Alessandro Ros\n", + "

\n", + "

\n", + " \n", + " aler9\n", + "

\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " Follow\n", + " \n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
,\n", + "
\n", + " \n", + " 5\n", + " \n", + "
\n", + " \n", + " \"@stefanprodan\"\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

\n", + " \n", + " Stefan Prodan\n", + "

\n", + "

\n", + " \n", + " stefanprodan\n", + "

\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " Follow\n", + " \n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
,\n", + "
\n", + " \n", + " 6\n", + " \n", + "
\n", + " \n", + " \"@Jarred-Sumner\"\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

\n", + " \n", + " Jarred Sumner\n", + "

\n", + "

\n", + " \n", + " Jarred-Sumner\n", + "

\n", + "
\n", + "
\n", + "
\n", + "

\n", + " \n", + " San Francisco, CA

\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " Follow\n", + " \n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
,\n", + "
\n", + " \n", + " 7\n", + " \n", + "
\n", + " \n", + " \"@castrojo\"\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

\n", + " \n", + " Jorge O. Castro\n", + "

\n", + "

\n", + " \n", + " castrojo\n", + "

\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " Follow\n", + " \n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
,\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + " \n", + " \"@TooTallNate\"\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

\n", + " \n", + " Nathan Rajlich\n", + "

\n", + "

\n", + " \n", + " TooTallNate\n", + "

\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " Follow\n", + " \n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
,\n", + "
\n", + " \n", + " 9\n", + " \n", + "
\n", + " \n", + " \"@d1onys1us\"\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

\n", + " \n", + " dave | d1onys1us\n", + "

\n", + "

\n", + " \n", + " d1onys1us\n", + "

\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " Follow\n", + " \n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
,\n", + "
\n", + " \n", + " 10\n", + " \n", + "
\n", + " \n", + " \"@jcrist\"\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

\n", + " \n", + " Jim Crist-Harif\n", + "

\n", + "

\n", + " \n", + " jcrist\n", + "

\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " Follow\n", + " \n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
,\n", + "
\n", + " \n", + " 11\n", + " \n", + "
\n", + " \n", + " \"@chronark\"\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

\n", + " \n", + " Andreas Thomas\n", + "

\n", + "

\n", + " \n", + " chronark\n", + "

\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " Follow\n", + " \n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
,\n", + "
\n", + " \n", + " 12\n", + " \n", + "
\n", + " \n", + " \"@mzz2017\"\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

\n", + " \n", + " mzz\n", + "

\n", + "

\n", + " \n", + " mzz2017\n", + "

\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " Popular repo
\n", + "

\n", + " \n", + " \n", + " \n", + " \n", + " gg\n", + "

\n", + "
\n", + " 一个支持节点与订阅链接的 Linux 命令行代理工具 | A command-line tool for one-click proxy in your research and development without installing v2ray or anythin…\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " Follow\n", + " \n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
,\n", + "
\n", + " \n", + " 13\n", + " \n", + "
\n", + " \n", + " \"@homanp\"\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

\n", + " \n", + " Ismail Pelaseyed\n", + "

\n", + "

\n", + " \n", + " homanp\n", + "

\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " Follow\n", + " \n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
,\n", + "
\n", + " \n", + " 14\n", + " \n", + "
\n", + " \n", + " \"@purcell\"\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

\n", + " \n", + " Steve Purcell\n", + "

\n", + "

\n", + " \n", + " purcell\n", + "

\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " Follow\n", + " \n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
,\n", + "
\n", + " \n", + " 15\n", + " \n", + "
\n", + " \n", + " \"@L-M-Sherlock\"\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

\n", + " \n", + " Jarrett Ye\n", + "

\n", + "

\n", + " \n", + " L-M-Sherlock\n", + "

\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " Follow\n", + " \n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
,\n", + "
\n", + " \n", + " 16\n", + " \n", + "
\n", + " \n", + " \"@guibranco\"\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " Follow\n", + " \n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
,\n", + "
\n", + " \n", + " 17\n", + " \n", + "
\n", + " \n", + " \"@wader\"\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

\n", + " \n", + " Mattias Wadman\n", + "

\n", + "

\n", + " \n", + " wader\n", + "

\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " Follow\n", + " \n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
,\n", + "
\n", + " \n", + " 18\n", + " \n", + "
\n", + " \n", + " \"@JerBouma\"\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

\n", + " \n", + " Jeroen Bouma\n", + "

\n", + "

\n", + " \n", + " JerBouma\n", + "

\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " Follow\n", + " \n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
,\n", + "
\n", + " \n", + " 19\n", + " \n", + "
\n", + " \n", + " \"@vanhauser-thc\"\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

\n", + " \n", + " van Hauser\n", + "

\n", + "

\n", + " \n", + " vanhauser-thc\n", + "

\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " Follow\n", + " \n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
,\n", + "
\n", + " \n", + " 20\n", + " \n", + "
\n", + " \n", + " \"@taranjeet\"\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

\n", + " \n", + " Taranjeet Singh\n", + "

\n", + "

\n", + " \n", + " taranjeet\n", + "

\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " Follow\n", + " \n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
,\n", + "
\n", + " \n", + " 21\n", + " \n", + "
\n", + " \n", + " \"@liamdebeasi\"\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

\n", + " \n", + " Liam DeBeasi\n", + "

\n", + "

\n", + " \n", + " liamdebeasi\n", + "

\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " Follow\n", + " \n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
,\n", + "
\n", + " \n", + " 22\n", + " \n", + "
\n", + " \n", + " \"@steven-tey\"\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

\n", + " \n", + " Steven Tey\n", + "

\n", + "

\n", + " \n", + " steven-tey\n", + "

\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " Follow\n", + " \n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
,\n", + "
\n", + " \n", + " 23\n", + " \n", + "
\n", + " \n", + " \"@str4d\"\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

\n", + " \n", + " str4d\n", + "

\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " Popular repo
\n", + "

\n", + " \n", + " \n", + " \n", + " \n", + " rage\n", + "

\n", + "
\n", + " A simple, secure and modern file encryption tool (and Rust library) with small explicit keys, no config options, and UNIX-style composabi…\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " Follow\n", + " \n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
,\n", + "
\n", + " \n", + " 24\n", + " \n", + "
\n", + " \n", + " \"@kripken\"\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

\n", + " \n", + " Alon Zakai\n", + "

\n", + "

\n", + " \n", + " kripken\n", + "

\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " Follow\n", + " \n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
,\n", + "
\n", + " \n", + " 25\n", + " \n", + "
\n", + " \n", + " \"@martinvonz\"\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

\n", + " \n", + " Martin von Zweigbergk\n", + "

\n", + "

\n", + " \n", + " martinvonz\n", + "

\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " Follow\n", + " \n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# your code here" + "# your code here\n", + "response = requests.get(url)\n", + "soup=BeautifulSoup(response.content)\n", + "table= soup.find_all(\"div\", attrs={\"class\":\"position-relative container-lg p-responsive pt-6\"})\n", + "table_row = table[0].find_all(\"article\",attrs= {\"class\":\"Box-row d-flex\"})\n", + "table_row" ] }, { @@ -126,11 +1439,141 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[\n", + " emilk\n", + " ,\n", + " \n", + " aler9\n", + " ,\n", + " \n", + " stefanprodan\n", + " ,\n", + " \n", + " Jarred-Sumner\n", + " ,\n", + " \n", + " castrojo\n", + " ,\n", + " \n", + " TooTallNate\n", + " ,\n", + " \n", + " d1onys1us\n", + " ,\n", + " \n", + " jcrist\n", + " ,\n", + " \n", + " chronark\n", + " ,\n", + " \n", + " mzz2017\n", + " ,\n", + " \n", + " homanp\n", + " ,\n", + " \n", + " purcell\n", + " ,\n", + " \n", + " L-M-Sherlock\n", + " ,\n", + " \n", + " guibranco\n", + " ,\n", + " \n", + " wader\n", + " ,\n", + " \n", + " JerBouma\n", + " ,\n", + " \n", + " vanhauser-thc\n", + " ,\n", + " \n", + " taranjeet\n", + " ,\n", + " \n", + " liamdebeasi\n", + " ,\n", + " \n", + " steven-tey\n", + " ,\n", + " \n", + " kripken\n", + " ,\n", + " \n", + " martinvonz\n", + " ]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# your code here" + "table[0].find_all(\"a\",attrs={\"class\":\"Link--secondary Link\"})\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Emil Ernerfeldt (Emil Ernerfeldt)',\n", + " 'pilcrowOnPaper',\n", + " 'lllyasviel',\n", + " 'Alessandro Ros (Alessandro Ros)',\n", + " 'Stefan Prodan (Stefan Prodan)',\n", + " 'Jarred Sumner (Jarred Sumner)',\n", + " 'Jorge O. Castro (Jorge O. Castro)',\n", + " 'Nathan Rajlich (Nathan Rajlich)',\n", + " 'dave | d1onys1us (dave | d1onys1us)',\n", + " 'Jim Crist-Harif (Jim Crist-Harif)',\n", + " 'Andreas Thomas (Andreas Thomas)',\n", + " 'mzz (mzz)',\n", + " 'Ismail Pelaseyed (Ismail Pelaseyed)',\n", + " 'Steve Purcell (Steve Purcell)',\n", + " 'Jarrett Ye (Jarrett Ye)',\n", + " 'Guilherme Branco Stracini (Guilherme Branco Stracini)',\n", + " 'Mattias Wadman (Mattias Wadman)',\n", + " 'Jeroen Bouma (Jeroen Bouma)',\n", + " 'van Hauser (van Hauser)',\n", + " 'Taranjeet Singh (Taranjeet Singh)',\n", + " 'Liam DeBeasi (Liam DeBeasi)',\n", + " 'Steven Tey (Steven Tey)',\n", + " 'str4d',\n", + " 'Alon Zakai (Alon Zakai)',\n", + " 'Martin von Zweigbergk (Martin von Zweigbergk)']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "my_list= []\n", + "for info in table_row:\n", + " name_dev = info.find_all(\"h1\", attrs = {\"class\": \"h3 lh-condensed\"})#[0].get_text().strip()\n", + " user_dev = info.find_all(\"p\", attrs = {\"class\": \"f4 text-normal mb-1\"})#[0].get_text().strip()\n", + " if len(user_dev)>0:\n", + " my_list.append(info.find_all(\"h1\", attrs = {\"class\": \"h3 lh-condensed\"})[0].get_text().strip()+\" (\"+info.find_all(\"h1\", attrs = {\"class\": \"h3 lh-condensed\"})[0].get_text().strip()+\")\")\n", + " else:\n", + " my_list.append(info.find_all(\"h1\", attrs = {\"class\": \"h3 lh-condensed\"})[0].get_text().strip())\n", + " # developers_dict[counter] = my_list.append(info.find_all(\"p\", attrs = {\"class\": \"f4 text-normal mb-1\"}))#[0].get_text().strip())\n", + "\n", + "my_list" ] }, { @@ -144,7 +1587,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -154,11 +1597,64 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ - "# your code here" + "# your code here\n", + "r = requests.get(url)\n", + "soup=BeautifulSoup(r.content)\n", + "table=soup.find_all(\"div\", attrs={\"class\":\"position-relative container-lg p-responsive pt-6\"})\n", + "table_row = table[0].find_all(\"h2\",attrs={\"class\":\"h3 lh-condensed\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[['modelscope', 'facechain'],\n", + " ['yt-dlp', 'yt-dlp'],\n", + " ['TheAlgorithms', 'Python'],\n", + " ['Plachtaa', 'VITS-fast-fine-tuning'],\n", + " ['graphdeco-inria', 'gaussian-splatting'],\n", + " ['521xueweihan', 'HelloGitHub'],\n", + " ['ytdl-org', 'youtube-dl'],\n", + " ['AUTOMATIC1111', 'stable-diffusion-webui'],\n", + " ['unifyai', 'ivy'],\n", + " ['Stability-AI', 'stablediffusion'],\n", + " ['modelscope', 'modelscope'],\n", + " ['sherlock-project', 'sherlock'],\n", + " ['bmaltais', 'kohya_ss'],\n", + " ['CHNZYX', 'Auto_Simulated_Universe'],\n", + " ['tiangolo', 'full-stack-fastapi-postgresql'],\n", + " ['CorentinJ', 'Real-Time-Voice-Cloning'],\n", + " ['zanfranceschi', 'rinha-de-backend-2023-q3'],\n", + " ['morph-labs', 'rift'],\n", + " ['sybrenjansen', 'mpire'],\n", + " ['MiuLab', 'Taiwan-LLaMa'],\n", + " ['AzatAI', 'cs_books'],\n", + " ['Jack-Cherish', 'dsi'],\n", + " ['D3Ext', 'AORT'],\n", + " ['pytube', 'pytube'],\n", + " ['nlpxucan', 'WizardLM']]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "repo = table_row[0].find_all(\"a\",href=True)[0].get_text().split()\n", + "my_list=[]\n", + "for i in table_row:\n", + " repo = i.find_all(\"a\",href=True)[0]\n", + " my_list.append(i.find_all(\"a\",href=True)[0].get_text().replace('/', '').replace('\\n', '').strip().split())\n", + "my_list" ] }, { @@ -171,21 +1667,67 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# This is the url you will scrape in this exercise\n", - "url = 'https://en.wikipedia.org/wiki/Walt_Disney'" + "url = 'https://en.wikipedia.org/wiki/Walt_Disney'\n", + "r = requests.get(url)\n", + "soup=BeautifulSoup(r.content)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "https//upload.wikimedia.org/wikipedia/en/thumb/e/e7/Cscr-featured.svg/20px-Cscr-featured.svg.png\n", + "https//upload.wikimedia.org/wikipedia/en/thumb/8/8c/Extended-protection-shackle.svg/20px-Extended-protection-shackle.svg.png\n", + "https//upload.wikimedia.org/wikipedia/commons/thumb/d/df/Walt_Disney_1946.JPG/220px-Walt_Disney_1946.JPG\n", + "https//upload.wikimedia.org/wikipedia/commons/thumb/8/87/Walt_Disney_1942_signature.svg/150px-Walt_Disney_1942_signature.svg.png\n", + "https//upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Walt_Disney_Birthplace_Exterior_Hermosa_Chicago_Illinois.jpg/220px-Walt_Disney_Birthplace_Exterior_Hermosa_Chicago_Illinois.jpg\n", + "https//upload.wikimedia.org/wikipedia/commons/thumb/c/c4/Walt_Disney_envelope_ca._1921.jpg/220px-Walt_Disney_envelope_ca._1921.jpg\n", + "https//upload.wikimedia.org/wikipedia/en/thumb/4/4e/Steamboat-willie.jpg/220px-Steamboat-willie.jpg\n", + "https//upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Walt_Disney_Snow_white_1937_trailer_screenshot_%2813%29.jpg/220px-Walt_Disney_Snow_white_1937_trailer_screenshot_%2813%29.jpg\n", + "https//upload.wikimedia.org/wikipedia/commons/thumb/1/15/Disney_drawing_goofy.jpg/170px-Disney_drawing_goofy.jpg\n", + "https//upload.wikimedia.org/wikipedia/commons/thumb/8/8c/WaltDisneyplansDisneylandDec1954.jpg/220px-WaltDisneyplansDisneylandDec1954.jpg\n", + "https//upload.wikimedia.org/wikipedia/commons/thumb/f/ff/Walt_disney_portrait_right.jpg/170px-Walt_disney_portrait_right.jpg\n", + "https//upload.wikimedia.org/wikipedia/commons/thumb/1/1a/Walt_Disney_Grave.JPG/170px-Walt_Disney_Grave.JPG\n", + "https//upload.wikimedia.org/wikipedia/commons/thumb/1/13/DisneySchiphol1951.jpg/220px-DisneySchiphol1951.jpg\n", + "https//upload.wikimedia.org/wikipedia/commons/thumb/6/6c/Disney1968.jpg/170px-Disney1968.jpg\n", + "https//upload.wikimedia.org/wikipedia/commons/thumb/b/b0/Disney_Oscar_1953_%28cropped%29.jpg/170px-Disney_Oscar_1953_%28cropped%29.jpg\n", + "https//upload.wikimedia.org/wikipedia/en/thumb/4/4a/Commons-logo.svg/30px-Commons-logo.svg.png\n", + "https//upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Wikisource-logo.svg/38px-Wikisource-logo.svg.png\n", + "https//upload.wikimedia.org/wikipedia/commons/thumb/f/fa/Wikiquote-logo.svg/34px-Wikiquote-logo.svg.png\n", + "https//upload.wikimedia.org/wikipedia/en/thumb/8/8a/OOjs_UI_icon_edit-ltr-progressive.svg/10px-OOjs_UI_icon_edit-ltr-progressive.svg.png\n", + "https//upload.wikimedia.org/wikipedia/en/thumb/8/8a/OOjs_UI_icon_edit-ltr-progressive.svg/10px-OOjs_UI_icon_edit-ltr-progressive.svg.png\n", + "https//upload.wikimedia.org/wikipedia/commons/thumb/e/e3/Disneyland_Resort_logo.svg/135px-Disneyland_Resort_logo.svg.png\n", + "https//upload.wikimedia.org/wikipedia/commons/thumb/d/da/Animation_disc.svg/20px-Animation_disc.svg.png\n", + "https//upload.wikimedia.org/wikipedia/en/thumb/6/69/P_vip.svg/19px-P_vip.svg.png\n", + "https//upload.wikimedia.org/wikipedia/commons/thumb/1/1a/Magic_Kingdom_castle.jpg/15px-Magic_Kingdom_castle.jpg\n", + "https//upload.wikimedia.org/wikipedia/en/thumb/e/e7/Video-x-generic.svg/19px-Video-x-generic.svg.png\n", + "https//upload.wikimedia.org/wikipedia/commons/thumb/a/a3/Flag_of_Los_Angeles_County%2C_California.svg/21px-Flag_of_Los_Angeles_County%2C_California.svg.png\n", + "https//upload.wikimedia.org/wikipedia/commons/thumb/8/8c/Blank_television_set.svg/21px-Blank_television_set.svg.png\n", + "https//upload.wikimedia.org/wikipedia/en/thumb/a/a4/Flag_of_the_United_States.svg/21px-Flag_of_the_United_States.svg.png\n", + "https//upload.wikimedia.org/wikipedia/en/thumb/4/4a/Commons-logo.svg/14px-Commons-logo.svg.png\n", + "https//upload.wikimedia.org/wikipedia/commons/thumb/f/fa/Wikiquote-logo.svg/16px-Wikiquote-logo.svg.png\n", + "https//upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Wikisource-logo.svg/18px-Wikisource-logo.svg.png\n", + "https//upload.wikimedia.org/wikipedia/commons/thumb/f/ff/Wikidata-logo.svg/21px-Wikidata-logo.svg.png\n", + "https//upload.wikimedia.org/wikipedia/en/thumb/8/8a/OOjs_UI_icon_edit-ltr-progressive.svg/10px-OOjs_UI_icon_edit-ltr-progressive.svg.png\n" + ] + } + ], "source": [ - "# your code here" + "# your code here\n", + "img = soup.find_all(\"img\", {\"class\":\"mw-file-element\"})\n", + "for image in img:\n", + " # Print image source\n", + " print(f\"https{image['src']}\")" ] }, { @@ -197,21 +1739,46 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# This is the url you will scrape in this exercise\n", - "url = 'https://www.wikipedia.org/'" + "url = 'https://www.wikipedia.org/'\n", + "r = requests.get(url)\n", + "soup=BeautifulSoup(r.content)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "English: 6691000\n", + "日本語: 1382000\n", + "Español: 1881000\n", + "Русский: 1930000\n", + "Deutsch: 2822000\n", + "Français: 2540000\n", + "Italiano: 1820000\n", + "中文: 1369000\n", + "Português: 1105000\n", + "فارسی: فارسی\n" + ] + } + ], "source": [ - "# your code here" + "# your code here\n", + "language=soup.find_all(\"div\",attrs={\"class\",\"central-featured-lang\"})\n", + "\n", + "for lan in language:\n", + " language_name= lan.find(\"strong\").text\n", + " num_article = lan.find(\"bdi\").get_text().strip('+').replace('\\xa0', '')\n", + " print(language_name + ': ' + num_article) " ] }, { @@ -224,7 +1791,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -234,11 +1801,117 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LanguageNative Speakers
0Mandarin Chinese (incl. Standard Chinese, but ...939.0
1Spanish485.0
2English380.0
3Hindi (excl. Urdu, and other languages)345.0
4Portuguese236.0
5Bengali234.0
6Russian147.0
7Japanese123.0
8Yue Chinese (incl. Cantonese)86.1
9Vietnamese85.0
\n", + "
" + ], + "text/plain": [ + " Language Native Speakers\n", + "0 Mandarin Chinese (incl. Standard Chinese, but ... 939.0\n", + "1 Spanish 485.0\n", + "2 English 380.0\n", + "3 Hindi (excl. Urdu, and other languages) 345.0\n", + "4 Portuguese 236.0\n", + "5 Bengali 234.0\n", + "6 Russian 147.0\n", + "7 Japanese 123.0\n", + "8 Yue Chinese (incl. Cantonese) 86.1\n", + "9 Vietnamese 85.0" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# your code here" + "# your code here\n", + "tables = pd.read_html(url)\n", + "\n", + "df = tables[0]\n", + "df.columns = ['Language','Native Speakers','Language family', 'Branch']\n", + "\n", + "top_10 = df.iloc[:10, 0:2]\n", + "top_10" ] }, { @@ -251,21 +1924,147 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# This is the url you will scrape in this exercise \n", - "url = 'https://www.imdb.com/chart/top'" + "url = 'https://www.imdb.com/chart/top'\n", + "r= requests.get(url, headers={'User-Agent':'Slurp'})\n", + "soup=BeautifulSoup(r.content)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "row_director = soup.find_all('body', attrs={'class':'ipc-promptable-base--body-locked'})\n", + "# irector_name = row_director[1].find_all(\"a\", attrs={\"class\": \"ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link\"})\n", + "row_director" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ - "# your code here" + "# your code here\n", + "rows = soup.find_all('li',attrs={'class': 'ipc-metadata-list-summary-item sc-bca49391-0 eypSaE cli-parent'})\n", + "title = rows[0].find_all(\"h3\", attrs={\"class\": \"ipc-title__text\"})\n", + "year = rows[0].find_all('span')[1]\n", + "rate = rows[0].find_all('span')[5]\n", + "movie_name = []\n", + "years = []\n", + "rates = []\n", + "for i, row in enumerate(rows):\n", + " title = row.find_all('h3', attrs={'class': 'ipc-title__text'})[0].get_text().strip()\n", + " year = row.find_all('span')[1].get_text().strip()\n", + " rate = row.find_all('span')[5].get_text().strip()\n", + " movie_name.append(title)\n", + " years.append(year)\n", + " rates.append(rate)\n", + "data = {'title': movie_name, \n", + " 'year': years, \n", + " 'rate': rates}" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titleyearrate
01. Os Condenados de Shawshank19949.3
12. O Padrinho19729.2
23. O Cavaleiro das Trevas20089.0
34. O Padrinho: Parte II19749.0
45. Doze Homens em Fúria19579.0
\n", + "
" + ], + "text/plain": [ + " title year rate\n", + "0 1. Os Condenados de Shawshank 1994 9.3\n", + "1 2. O Padrinho 1972 9.2\n", + "2 3. O Cavaleiro das Trevas 2008 9.0\n", + "3 4. O Padrinho: Parte II 1974 9.0\n", + "4 5. Doze Homens em Fúria 1957 9.0" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame(data)\n", + "df.head()" ] }, { @@ -277,21 +2076,193 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "#This is the url you will scrape in this exercise\n", - "url = 'https://www.imdb.com/list/ls009796553/'" + "url = 'https://www.imdb.com/list/ls009796553/'\n", + "r= requests.get(url, headers={'User-Agent':'Slurp'})\n", + "soup=BeautifulSoup(r.content)\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'1.Pesadelo em Elm Street(1984)'" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "rows = soup.find_all('h3', attrs={'class': 'lister-item-header'})\n", + "name = rows[0].get_text().strip().replace('\\n', '')\n", + "name" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'(1984)'" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "year = soup.find_all('span', attrs={'class': 'lister-item-year text-muted unbold'})\n", + "year[0].get_text().strip().replace('\\n', '')" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Teenager Nancy Thompson must uncover the dark truth concealed by her parents after she and her friends become targets of the spirit of a serial killer with a bladed glove in their dreams, in which if they die, it kills them in real life.'" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "summary_rows = soup.find_all('p',{'class':''})\n", + "summary_rows[0].get_text().strip()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ - "# your code here" + "names = []\n", + "years = []\n", + "summarys = []\n", + "for i, row in enumerate(rows):\n", + " name = row.get_text().strip().replace('\\n', '')\n", + " year = row.find_all('span', attrs={'class': 'lister-item-year text-muted unbold'})[0].get_text().strip().replace('\\n', '')\n", + " names.append(name[2:-6])\n", + " years.append(year)\n", + "for i, row in enumerate(summary_rows):\n", + " summary = row.get_text().strip().replace('\\n', '')\n", + " summarys.append(summary)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namesyearssummary
0Pesadelo em Elm Street(1984)Teenager Nancy Thompson must uncover the dark ...
1Despertares(1990)The victims of an encephalitis epidemic many y...
2Liga de Mulheres(1992)Two sisters join the first female professional...
3Um Bairro em Nova Iorque(1993)A father becomes worried when a local gangster...
4Anjos em Campo(1994)When a boy prays for a chance to have a family...
\n", + "
" + ], + "text/plain": [ + " names years \\\n", + "0 Pesadelo em Elm Street (1984) \n", + "1 Despertares (1990) \n", + "2 Liga de Mulheres (1992) \n", + "3 Um Bairro em Nova Iorque (1993) \n", + "4 Anjos em Campo (1994) \n", + "\n", + " summary \n", + "0 Teenager Nancy Thompson must uncover the dark ... \n", + "1 The victims of an encephalitis epidemic many y... \n", + "2 Two sisters join the first female professional... \n", + "3 A father becomes worried when a local gangster... \n", + "4 When a boy prays for a chance to have a family... " + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = {'names': names, \n", + " 'years': years, \n", + " 'summary': summarys}\n", + "data=pd.DataFrame(data)\n", + "data.head()" ] }, { @@ -310,7 +2281,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -321,7 +2292,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -337,7 +2308,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -348,7 +2319,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -365,9 +2336,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['https://www.emsc-csem.org/Earthquake/?view=1',\n", + " 'https://www.emsc-csem.org/Earthquake/?view=2',\n", + " 'https://www.emsc-csem.org/Earthquake/?view=3',\n", + " 'https://www.emsc-csem.org/Earthquake/?view=4',\n", + " 'https://www.emsc-csem.org/Earthquake/?view=5']" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# This is the url you will scrape in this exercise\n", "url = 'https://www.emsc-csem.org/Earthquake/?view='\n", @@ -385,7 +2371,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -409,7 +2395,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.11.4" } }, "nbformat": 4,