diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..cd5a51f
Binary files /dev/null and b/.DS_Store differ
diff --git a/your-code/.DS_Store b/your-code/.DS_Store
new file mode 100644
index 0000000..4503468
Binary files /dev/null and b/your-code/.DS_Store differ
diff --git a/your-code/.ipynb_checkpoints/main-checkpoint.ipynb b/your-code/.ipynb_checkpoints/main-checkpoint.ipynb
index 812f7a4..5fa6fc3 100644
--- a/your-code/.ipynb_checkpoints/main-checkpoint.ipynb
+++ b/your-code/.ipynb_checkpoints/main-checkpoint.ipynb
@@ -1,5 +1,29 @@
{
"cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621\u001b[0m\n",
+ "Requirement already satisfied: html5lib in /usr/local/lib/python3.9/site-packages (1.1)\n",
+ "Requirement already satisfied: webencodings in /usr/local/lib/python3.9/site-packages (from html5lib) (0.5.1)\n",
+ "Requirement already satisfied: six>=1.9 in /usr/local/lib/python3.9/site-packages (from html5lib) (1.16.0)\n",
+ "\u001b[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621\u001b[0m\n",
+ "Requirement already satisfied: selenium in /usr/local/lib/python3.9/site-packages (3.141.0)\n",
+ "Requirement already satisfied: urllib3 in /usr/local/lib/python3.9/site-packages (from selenium) (1.26.6)\n"
+ ]
+ }
+ ],
+ "source": [
+ "!pip3 install html5lib\n",
+ "!pip3 install selenium"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -10,7 +34,7 @@
"\n",
"**Tips:**\n",
"\n",
- "- Check the response status code for each request to ensure you have obtained the intended contennt.\n",
+ "- Check the response status code for each request to ensure you have obtained the intended content.\n",
"- Print the response text in each request to understand the kind of info you are getting and its format.\n",
"- Check for patterns in the response text to extract the data/info requested in each question.\n",
"- Visit each url and take a look at its source through Chrome DevTools. You'll need to identify the html tags, special class names etc. used for the html content you are expected to extract."
@@ -40,21 +64,25 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import pandas as pd\n",
- "# from pprint import pprint\n",
- "# from lxml import html\n",
- "# from lxml.html import fromstring\n",
- "# import urllib.request\n",
- "# from urllib.request import urlopen\n",
- "# import random\n",
- "# import re\n",
- "# import scrapy"
+ "import re\n",
+ "import numpy as np\n",
+ "from pprint import pprint\n",
+ "from lxml import html\n",
+ "from lxml.html import fromstring\n",
+ "import urllib.request\n",
+ "from urllib.request import urlopen\n",
+ "import random\n",
+ "import time\n",
+ "from bs4 import BeautifulSoup\n",
+ "from selenium import webdriver\n",
+ "from selenium.webdriver.common.keys import Keys"
]
},
{
@@ -66,7 +94,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -76,11 +104,5492 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "200\n"
+ ]
+ }
+ ],
"source": [
- "#your code"
+ "# Your code\n",
+ "response = requests.get(url)\n",
+ "print(response.status_code)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "Trending developers on GitHub today · GitHub \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ "\n",
+ "\n",
+ "\n",
+ "
\n",
+ "\n",
+ "
\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ "
{{ message }}
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ " \n",
+ "\n",
+ "
\n",
+ "\n",
+ "
\n",
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ "
Trending \n",
+ "
\n",
+ " These are the developers building the hot tools today.\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "\n",
+ " 1\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " 哔哩漫游,解除B站客户端番剧区域限制的Xposed模块,并且提供其他小功能。An Xposed module that unblocks bangumi area limit of BILIBILI with miscellaneous features.\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 2\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " Screenshots with JavaScript\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 3\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " A PSP emulator for Android, Windows, Mac and Linux, written in C++. Want to contribute? Join us on Discord at
https://discord.gg/5NJB6dD or just send pull requests / issues. For discussion use the forums on ppsspp.org.\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 4\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "🔥 Stay motivated and show off your contribution streak! 🌟 Display your total contributions, current streak, and longest streak on your GitHub profile README\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 5\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "🎯 A .NET library for running a target dependency graph.\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 6\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " QUIC interop runner\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 7\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " A fast reverse proxy to help you expose a local server behind a NAT or firewall to the internet.\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 8\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "✅ The Node.js best practices list (August 2021)\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 9\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "💢 A list of breaking changes to the web platform\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 10\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " A simple way to access state while robust and testable.\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 11\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
EdgeDB Inc.
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 12\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " Beautiful multilingual API documentation theme for Hugo\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 13\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
@streetteam
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 14\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " This template helps you setup julia on google colab\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 15\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " This is a repository for the code posted on my blog\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 16\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " Making packages work faster with more extensive precompilation\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 17\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " react-native native module for audio recorder and player.\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 18\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " Run scheduled tasks\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 19\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " Composable nonblocking and synchronization programming framework\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 20\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "🤖 Just a command runner\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 21\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " Tivi is a work-in-progress TV show tracking Android app, which connects to Trakt.tv. It is still in its early stages of development and currently only contains two pieces of UI. It is under heavy development.\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 22\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " backup a github user or organization\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 23\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " Web component server-side rendering\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 24\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " a lock-free concurrent slab (experimental)\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 25\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " Open screens/snackbars/dialogs/bottomSheets without context, manage states and inject dependencies easily with Get.\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " You can’t perform that action at this time.\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
You signed in with another tab or window. Reload to refresh your session. \n",
+ "
You signed out in another tab or window. Reload to refresh your session. \n",
+ "
\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ "\n",
+ "\n",
+ "\n",
+ "
\n",
+ "\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ " \n",
+ "\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "html = response.content\n",
+ "soup = BeautifulSoup(html, 'lxml')\n",
+ "print(soup)"
]
},
{
@@ -134,11 +5643,13 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "# Your code\n",
+ "usernames = soup.select('p[class=\"f4 text-normal mb-1\"]')\n",
+ "usernames_clean = [username.text.strip() for username in usernames]"
]
},
{
@@ -152,21 +5663,35 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url = 'https://github.com/trending/python?since=daily'"
+ "url = 'https://github.com/trending/python?since=daily'\n",
+ "response = requests.get(url)\n",
+ "html = response.content\n",
+ "soup = BeautifulSoup(html, 'lxml')"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['mrlt8/docker-wyze-bridge', 'kingoflolz/mesh-transformer-jax', 'willmcgugan/textual', 'shuup/shuup', '3b1b/manim', 'iperov/DeepFaceLab', 'bitcoin/bips', 'breakdowns/slam-tg-mirror-bot', 'RasaHQ/rasa', 'RustPython/RustPython', 'blakeblackshear/frigate', 'byt3bl33d3r/CrackMapExec', 'keras-team/keras', 'PyCQA/bandit', 'jackfrued/Python-100-Days', 'hwkxk/HeytapTask', 'sammchardy/python-binance', 'ermongroup/SDEdit', 'swisskyrepo/PayloadsAllTheThings', 'ManimCommunity/manim', 'optuna/optuna', 'eriklindernoren/PyTorch-GAN', 'dortania/OpenCore-Legacy-Patcher', 'espressif/esptool', 'public-apis/public-apis']\n"
+ ]
+ }
+ ],
"source": [
- "#your code"
+ "# Your code\n",
+ "repositories = soup.select('h1[class=\"h3 lh-condensed\"]')\n",
+ "repositories_clean = [repository.text.strip().replace(' /\\n\\n ', '/') for repository in repositories]\n",
+ "print(repositories_clean)"
]
},
{
@@ -178,21 +5703,35 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url = 'https://en.wikipedia.org/wiki/Walt_Disney'"
+ "url = 'https://en.wikipedia.org/wiki/Walt_Disney'\n",
+ "response = requests.get(url)\n",
+ "html = response.content\n",
+ "soup = BeautifulSoup(html, 'lxml')"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['/wiki/File:Walt_Disney_1946.JPG', '/wiki/File:Walt_Disney_1942_signature.svg', '/wiki/File:Walt_Disney_envelope_ca._1921.jpg', '/wiki/File:Trolley_Troubles_poster.jpg', '/wiki/File:Steamboat-willie.jpg', '/wiki/File:Walt_Disney_1935.jpg', '/wiki/File:Walt_Disney_Snow_white_1937_trailer_screenshot_(13).jpg', '/wiki/File:Disney_drawing_goofy.jpg', '/wiki/File:DisneySchiphol1951.jpg', '/wiki/File:WaltDisneyplansDisneylandDec1954.jpg', '/wiki/File:Walt_disney_portrait_right.jpg', '/wiki/File:Walt_Disney_Grave.JPG', '/wiki/File:Roy_O._Disney_with_Company_at_Press_Conference.jpg', '/wiki/File:Disney_Display_Case.JPG', '/wiki/File:Disney1968.jpg', '/wiki/File:Disneyland_Resort_logo.svg', '/wiki/File:Animation_disc.svg', '/wiki/File:P_vip.svg', '/wiki/File:Magic_Kingdom_castle.jpg', '/wiki/File:Video-x-generic.svg', '/wiki/File:Flag_of_Los_Angeles_County,_California.svg', '/wiki/File:Blank_television_set.svg', '/wiki/File:Flag_of_the_United_States.svg']\n"
+ ]
+ }
+ ],
"source": [
- "#your code"
+ "# Your code\n",
+ "images = soup.select('a[class=\"image\"]')\n",
+ "images_clean = [image['href'] for image in images]\n",
+ "print(images_clean)"
]
},
{
@@ -204,21 +5743,26 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url ='https://en.wikipedia.org/wiki/Python' "
+ "url ='https://en.wikipedia.org/wiki/Python'\n",
+ "response = requests.get(url)\n",
+ "html = response.content\n",
+ "soup = BeautifulSoup(html, 'lxml')"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "# Your code\n",
+ "links = soup.select('li > a')\n",
+ "links_clean = [link['href'] for link in links]"
]
},
{
@@ -230,21 +5774,38 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url = 'http://uscode.house.gov/download/download.shtml'"
+ "url = 'http://uscode.house.gov/download/download.shtml'\n",
+ "response = requests.get(url)\n",
+ "html = response.content\n",
+ "soup = BeautifulSoup(html, 'lxml')"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of Titles that have changed in the United States Code since its last release point: 53\n"
+ ]
+ }
+ ],
"source": [
- "#your code"
+ "# Your code\n",
+ "titles_changed = soup.select('div[class=\"usctitle\"]')\n",
+ "titles_changed_clean = [title_changed.text.strip().replace('\\n\\n ', '').replace(' ٭', '') \\\n",
+ " for title_changed in titles_changed]\n",
+ "titles_changed_clean = titles_changed_clean[2:]\n",
+ "print('Number of Titles that have changed in the United States Code since its last release point:', \\\n",
+ " len(titles_changed_clean))"
]
},
{
@@ -256,21 +5817,47 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url = 'https://www.fbi.gov/wanted/topten'"
+ "url = 'https://www.fbi.gov/wanted/topten'\n",
+ "response = requests.get(url)\n",
+ "html = response.content\n",
+ "soup = BeautifulSoup(html, 'lxml')"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['ROBERT WILLIAM FISHER',\n",
+ " 'ARNOLDO JIMENEZ',\n",
+ " 'JASON DEREK BROWN',\n",
+ " 'ALEXIS FLORES',\n",
+ " 'JOSE RODOLFO VILLARREAL-HERNANDEZ',\n",
+ " 'EUGENE PALMER',\n",
+ " 'RAFAEL CARO-QUINTERO',\n",
+ " 'BHADRESHKUMAR CHETANBHAI PATEL',\n",
+ " 'ALEJANDRO ROSALES CASTILLO',\n",
+ " 'YASER ABDEL SAID']"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "#your code "
+ "# Your code\n",
+ "top_10_most_wanted = soup.select('h3 > a')\n",
+ "top_10_most_wanted_clean = [most_wanted.text.strip() for most_wanted in top_10_most_wanted]\n",
+ "top_10_most_wanted_clean"
]
},
{
@@ -282,82 +5869,305 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url = 'https://www.emsc-csem.org/Earthquake/'"
+ "url = 'https://www.emsc-csem.org/Earthquake/'\n",
+ "response = requests.get(url)\n",
+ "html = response.content"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "# Your code\n",
+ "table = pd.read_html(html)\n",
+ "table = table[3]\n",
+ "\n",
+ "date = pd.DataFrame(table['Date & Time UTC', '12345678910\\x9b»'])\n",
+ "latitude = pd.DataFrame(table['Latitude degrees', '12345678910\\x9b»'])\n",
+ "longitude = pd.DataFrame(table['Longitude degrees', '12345678910\\x9b»'])\n",
+ "region = pd.DataFrame(table['Region name [+]', '12345678910\\x9b»'])\n",
+ "\n",
+ "earthquakes = pd.concat([date, latitude, longitude, region], axis=1, \\\n",
+ " names=['date', 'latitude', 'longitude', 'region']).head(20)"
]
},
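+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Hedged follow-up, not part of the original solution: pd.concat's names=\n",
+ "# argument only labels hierarchical index levels (when keys= is used), so\n",
+ "# the frame above still carries the site's two-level column headers.\n",
+ "# One way to flatten them into readable names:\n",
+ "earthquakes.columns = ['date', 'latitude', 'longitude', 'region']\n",
+ "earthquakes.head()"
+ ]
+ },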
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "#### Display the date, and title of upcoming hackathon events as a Pandas dataframe table"
+ "#### Count number of tweets by a given Twitter account."
]
},
{
- "cell_type": "code",
- "execution_count": null,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "# This is the url you will scrape in this exercise\n",
- "url ='https://hackevents.co/hackathons'"
+ "You will need to include a ***try/except block*** for account names not found. \n",
+ " ***Hint:*** the program should count the number of tweets for any provided account"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "# This is the url you will scrape in this exercise \n",
+ "# You will need to add the account credentials to this url\n",
+ "url = 'https://twitter.com/elonmusk'\n",
+ "to_driver = '/Applications/chromedriver 2'\n",
+ "driver = webdriver.Chrome(to_driver)\n",
+ "driver.get(url)\n",
+ "time.sleep(10)\n",
+ "page_source = driver.page_source\n",
+ "driver.quit()"
]
},
{
- "cell_type": "markdown",
+ "cell_type": "code",
+ "execution_count": 20,
"metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ "Perfil / Twitter \n",
+ "\n",
+ "\n",
+ "\n",
+ "
\n",
+ "
JavaScript no está disponible. \n",
+ "
Detectamos que JavaScript está desactivado en este navegador. Activa JavaScript o cambia a un navegador compatible para seguir usando twitter.com. Puedes ver una lista de navegadores compatibles en nuestro Centro de Ayuda.
\n",
+ "
Centro de ayuda
\n",
+ "\n",
+ "
\n",
+ "
No te pierdas lo que está pasando
Los usuarios de Twitter son los primeros en enterarse.
Regístrate ahora para obtener tu propia cronología personalizada.
"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "#### Count number of tweets by a given Twitter account."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You will need to include a ***try/except block*** for account names not found. \n",
- " ***Hint:*** the program should count the number of tweets for any provided account"
+ "soup = BeautifulSoup(page_source, 'html.parser')\n",
+ "soup"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 26,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "ename": "TypeError",
+ "evalue": "select() missing 1 required positional argument: 'selector'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mn_of_tweets\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mselect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;31mTypeError\u001b[0m: select() missing 1 required positional argument: 'selector'"
+ ]
+ }
+ ],
"source": [
- "# This is the url you will scrape in this exercise \n",
- "# You will need to add the account credentials to this url\n",
- "url = 'https://twitter.com/'"
+ "n_of_tweets[0].select('div[class=\"css-901oao css-bfa6kz r-9ilb82 r-1qd0xha r-n6v787 r-16dba41 r-1cwl3u0 r-bcqeeo r-qvutc0\"]')"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 24,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "ename": "AttributeError",
+ "evalue": "ResultSet object has no attribute 'select'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mn_of_tweets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msoup\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mselect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'div[class=\"css-1dbjc4n r-1habvwh\"]'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mn_of_tweets_clean\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mn_of_tweets\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mselect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'div[class=\"css-901oao css-bfa6kz r-9ilb82 r-1qd0xha r-n6v787 r-16dba41 r-1cwl3u0 r-bcqeeo r-qvutc0\"]'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mn_of_tweets_clean\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mn_of_tweets_clean\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'\\xa0'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m' '\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mn_of_tweets_clean\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/usr/local/lib/python3.9/site-packages/bs4/element.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2171\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__getattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2172\u001b[0m \u001b[0;34m\"\"\"Raise a helpful exception to explain a common code fix.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2173\u001b[0;31m raise AttributeError(\n\u001b[0m\u001b[1;32m 2174\u001b[0m \u001b[0;34m\"ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2175\u001b[0m )\n",
+ "\u001b[0;31mAttributeError\u001b[0m: ResultSet object has no attribute 'select'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?"
+ ]
+ }
+ ],
"source": [
- "#your code"
+ "n_of_tweets = soup.select('div[class=\"css-1dbjc4n r-1habvwh\"]')\n",
+ "n_of_tweets_clean = n_of_tweets[0].select('div[class=\"css-901oao css-bfa6kz r-9ilb82 r-1qd0xha r-n6v787 r-16dba41 r-1cwl3u0 r-bcqeeo r-qvutc0\"]')\n",
+ "n_of_tweets_clean = n_of_tweets_clean[0].text.replace('\\xa0', ' ')\n",
+ "n_of_tweets_clean"
]
},
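+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Hedged sketch, not part of the original solution: one way to meet the\n",
+ "# try/except requirement so that a missing or unreachable account does not\n",
+ "# crash the program. The CSS class below is copied from the cells above and\n",
+ "# is an assumption; Twitter's generated class names change frequently, and\n",
+ "# the header only carries the '<N> Tweets' count once JavaScript has rendered.\n",
+ "def count_tweets(account):\n",
+ "    try:\n",
+ "        driver = webdriver.Chrome(to_driver)\n",
+ "        driver.get(f'https://twitter.com/{account}')\n",
+ "        time.sleep(10)\n",
+ "        source = driver.page_source\n",
+ "        driver.quit()\n",
+ "        account_soup = BeautifulSoup(source, 'html.parser')\n",
+ "        header = account_soup.select('div[class=\"css-1dbjc4n r-1habvwh\"]')\n",
+ "        return header[0].text.replace('\\xa0', ' ')\n",
+ "    except Exception as error:\n",
+ "        return f'Could not count tweets for {account}: {error}'"
+ ]
+ },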
{
@@ -383,7 +6193,13 @@
"source": [
"# This is the url you will scrape in this exercise \n",
"# You will need to add the account credentials to this url\n",
- "url = 'https://twitter.com/'"
+ "url = 'https://twitter.com/elonmusk'\n",
+ "to_driver = '/Applications/chromedriver 2'\n",
+ "driver = webdriver.Chrome(to_driver)\n",
+ "driver.get(url)\n",
+ "time.sleep(15)\n",
+ "page_source = driver.page_source\n",
+ "driver.quit()"
]
},
{
@@ -392,7 +6208,21 @@
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "soup = BeautifulSoup(page_source, 'html.parser')\n",
+ "soup"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Your code\n",
+ "followers = soup.select('div[class=\"css-1dbjc4n r-13awgt0 r-18u37iz r-1w6e6rj\"]')\n",
+ "followers_clean = followers[0].select('span[class=\"css-901oao css-16my406 r-poiln3 r-bcqeeo r-qvutc0\"]')\n",
+ "followers_clean = [item.text.strip().replace('\\xa0', ' ') for item in followers_clean]\n",
+ "followers_clean[2:4]"
]
},
{
@@ -409,7 +6239,10 @@
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url = 'https://www.wikipedia.org/'"
+ "url = 'https://www.wikipedia.org/'\n",
+ "response = requests.get(url)\n",
+ "html = response.content\n",
+ "soup = BeautifulSoup(html, 'lxml')"
]
},
{
@@ -418,7 +6251,18 @@
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "# Your code\n",
+ "languages_html = []\n",
+ "n = 1\n",
+ "\n",
+ "while n < 11:\n",
+ " x = soup.select(f'div[class=\"central-featured-lang lang{n}\"]')\n",
+ " languages_html.append(x[0])\n",
+ " n = n + 1\n",
+ " \n",
+ "languages = [language.text.strip() for language in languages_html]\n",
+ "languages_clean = [re.sub(r'\\n.*', '', language) for language in languages]\n",
+ "languages_clean"
]
},
{
@@ -435,7 +6279,10 @@
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url = 'https://data.gov.uk/'"
+ "url = 'https://data.gov.uk/'\n",
+ "response = requests.get(url)\n",
+ "html = response.content\n",
+ "soup = BeautifulSoup(html, 'lxml')"
]
},
{
@@ -444,7 +6291,10 @@
"metadata": {},
"outputs": [],
"source": [
- "#your code "
+ "# Your code\n",
+ "datasets = soup.select('h3 > a')\n",
+ "datasets_clean = [dataset.text.strip() for dataset in datasets]\n",
+ "datasets_clean"
]
},
{
@@ -461,7 +6311,10 @@
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers'"
+ "url = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers'\n",
+ "response = requests.get(url)\n",
+ "html = response.content\n",
+ "soup = BeautifulSoup(html, 'lxml')"
]
},
{
@@ -470,7 +6323,11 @@
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "# Your code\n",
+ "table = pd.read_html(html)\n",
+ "table = table[3]\n",
+ "table = table.drop(['Percentageof worldpopulation(2007)'], axis=1).dropna().head(10)\n",
+ "table"
]
},
{
@@ -504,7 +6361,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# your code"
+ "# Your code\n"
]
},
{
@@ -530,7 +6387,7 @@
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "# Your code\n"
]
},
{
@@ -557,7 +6414,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# your code"
+ "# Your code\n"
]
},
{
@@ -584,7 +6441,7 @@
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "# Your code\n"
]
}
],
@@ -604,7 +6461,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.3"
+ "version": "3.9.5"
}
},
"nbformat": 4,
diff --git a/your-code/main.ipynb b/your-code/main.ipynb
index 812f7a4..0330a7c 100644
--- a/your-code/main.ipynb
+++ b/your-code/main.ipynb
@@ -1,5 +1,29 @@
{
"cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621\u001b[0m\n",
+ "Requirement already satisfied: html5lib in /usr/local/lib/python3.9/site-packages (1.1)\n",
+ "Requirement already satisfied: webencodings in /usr/local/lib/python3.9/site-packages (from html5lib) (0.5.1)\n",
+ "Requirement already satisfied: six>=1.9 in /usr/local/lib/python3.9/site-packages (from html5lib) (1.16.0)\n",
+ "\u001b[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621\u001b[0m\n",
+ "Requirement already satisfied: selenium in /usr/local/lib/python3.9/site-packages (3.141.0)\n",
+ "Requirement already satisfied: urllib3 in /usr/local/lib/python3.9/site-packages (from selenium) (1.26.6)\n"
+ ]
+ }
+ ],
+ "source": [
+ "!pip3 install html5lib\n",
+ "!pip3 install selenium"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -10,7 +34,7 @@
"\n",
"**Tips:**\n",
"\n",
- "- Check the response status code for each request to ensure you have obtained the intended contennt.\n",
+ "- Check the response status code for each request to ensure you have obtained the intended content.\n",
"- Print the response text in each request to understand the kind of info you are getting and its format.\n",
"- Check for patterns in the response text to extract the data/info requested in each question.\n",
"- Visit each url and take a look at its source through Chrome DevTools. You'll need to identify the html tags, special class names etc. used for the html content you are expected to extract."
@@ -40,21 +64,25 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import pandas as pd\n",
- "# from pprint import pprint\n",
- "# from lxml import html\n",
- "# from lxml.html import fromstring\n",
- "# import urllib.request\n",
- "# from urllib.request import urlopen\n",
- "# import random\n",
- "# import re\n",
- "# import scrapy"
+ "import re\n",
+ "import numpy as np\n",
+ "from pprint import pprint\n",
+ "from lxml import html\n",
+ "from lxml.html import fromstring\n",
+ "import urllib.request\n",
+ "from urllib.request import urlopen\n",
+ "import random\n",
+ "import time\n",
+ "from bs4 import BeautifulSoup\n",
+ "from selenium import webdriver\n",
+ "from selenium.webdriver.common.keys import Keys"
]
},
{
@@ -66,7 +94,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -76,11 +104,5492 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "200\n"
+ ]
+ }
+ ],
"source": [
- "#your code"
+ "# Your code\n",
+ "response = requests.get(url)\n",
+ "print(response.status_code)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "Trending developers on GitHub today · GitHub \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ "\n",
+ "\n",
+ "\n",
+ "
\n",
+ "\n",
+ "
\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ "
{{ message }}
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ " \n",
+ "\n",
+ "
\n",
+ "\n",
+ "
\n",
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ "
Trending \n",
+ "
\n",
+ " These are the developers building the hot tools today.\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "\n",
+ " 1\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " 哔哩漫游,解除B站客户端番剧区域限制的Xposed模块,并且提供其他小功能。An Xposed module that unblocks bangumi area limit of BILIBILI with miscellaneous features.\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 2\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " Screenshots with JavaScript\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 3\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " A PSP emulator for Android, Windows, Mac and Linux, written in C++. Want to contribute? Join us on Discord at
https://discord.gg/5NJB6dD or just send pull requests / issues. For discussion use the forums on ppsspp.org.\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 4\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "🔥 Stay motivated and show off your contribution streak! 🌟 Display your total contributions, current streak, and longest streak on your GitHub profile README\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 5\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "🎯 A .NET library for running a target dependency graph.\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 6\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " QUIC interop runner\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 7\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " A fast reverse proxy to help you expose a local server behind a NAT or firewall to the internet.\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 8\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "✅ The Node.js best practices list (August 2021)\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 9\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "💢 A list of breaking changes to the web platform\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 10\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " A simple way to access state while robust and testable.\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 11\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
EdgeDB Inc.
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 12\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " Beautiful multilingual API documentation theme for Hugo\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 13\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
@streetteam
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 14\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " This template helps you setup julia on google colab\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 15\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " This is a repository for the code posted on my blog\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 16\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " Making packages work faster with more extensive precompilation\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 17\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " react-native native module for audio recorder and player.\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 18\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " Run scheduled tasks\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 19\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " Composable nonblocking and synchronization programming framework\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 20\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "🤖 Just a command runner\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 21\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " Tivi is a work-in-progress TV show tracking Android app, which connects to Trakt.tv. It is still in its early stages of development and currently only contains two pieces of UI. It is under heavy development.\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 22\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " backup a github user or organization\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 23\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " Web component server-side rendering\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 24\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " a lock-free concurrent slab (experimental)\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ " 25\n",
+ " \n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " Open screens/snackbars/dialogs/bottomSheets without context, manage states and inject dependencies easily with Get.\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "Follow \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " You can’t perform that action at this time.\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
You signed in with another tab or window. Reload to refresh your session. \n",
+ "
You signed out in another tab or window. Reload to refresh your session. \n",
+ "
\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ "\n",
+ "\n",
+ "\n",
+ "
\n",
+ "\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ " \n",
+ "\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "html = response.content\n",
+ "soup = BeautifulSoup(html, 'lxml')\n",
+ "print(soup)"
]
},
{
@@ -134,11 +5643,13 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "# Your code\n",
+ "usernames = soup.select('p[class=\"f4 text-normal mb-1\"]')\n",
+ "usernames_clean = [username.text.strip() for username in usernames]"
]
},
{
@@ -152,21 +5663,35 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url = 'https://github.com/trending/python?since=daily'"
+ "url = 'https://github.com/trending/python?since=daily'\n",
+ "response = requests.get(url)\n",
+ "html = response.content\n",
+ "soup = BeautifulSoup(html, 'lxml')"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['mrlt8/docker-wyze-bridge', 'kingoflolz/mesh-transformer-jax', 'willmcgugan/textual', 'shuup/shuup', '3b1b/manim', 'iperov/DeepFaceLab', 'bitcoin/bips', 'breakdowns/slam-tg-mirror-bot', 'RasaHQ/rasa', 'RustPython/RustPython', 'blakeblackshear/frigate', 'byt3bl33d3r/CrackMapExec', 'keras-team/keras', 'PyCQA/bandit', 'jackfrued/Python-100-Days', 'hwkxk/HeytapTask', 'sammchardy/python-binance', 'ermongroup/SDEdit', 'swisskyrepo/PayloadsAllTheThings', 'ManimCommunity/manim', 'optuna/optuna', 'eriklindernoren/PyTorch-GAN', 'dortania/OpenCore-Legacy-Patcher', 'espressif/esptool', 'public-apis/public-apis']\n"
+ ]
+ }
+ ],
"source": [
- "#your code"
+ "# Your code\n",
+ "repositories = soup.select('h1[class=\"h3 lh-condensed\"]')\n",
+ "repositories_clean = [repository.text.strip().replace(' /\\n\\n ', '/') for repository in repositories]\n",
+ "print(repositories_clean)"
]
},
{
@@ -178,21 +5703,35 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url = 'https://en.wikipedia.org/wiki/Walt_Disney'"
+ "url = 'https://en.wikipedia.org/wiki/Walt_Disney'\n",
+ "response = requests.get(url)\n",
+ "html = response.content\n",
+ "soup = BeautifulSoup(html, 'lxml')"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['/wiki/File:Walt_Disney_1946.JPG', '/wiki/File:Walt_Disney_1942_signature.svg', '/wiki/File:Walt_Disney_envelope_ca._1921.jpg', '/wiki/File:Trolley_Troubles_poster.jpg', '/wiki/File:Steamboat-willie.jpg', '/wiki/File:Walt_Disney_1935.jpg', '/wiki/File:Walt_Disney_Snow_white_1937_trailer_screenshot_(13).jpg', '/wiki/File:Disney_drawing_goofy.jpg', '/wiki/File:DisneySchiphol1951.jpg', '/wiki/File:WaltDisneyplansDisneylandDec1954.jpg', '/wiki/File:Walt_disney_portrait_right.jpg', '/wiki/File:Walt_Disney_Grave.JPG', '/wiki/File:Roy_O._Disney_with_Company_at_Press_Conference.jpg', '/wiki/File:Disney_Display_Case.JPG', '/wiki/File:Disney1968.jpg', '/wiki/File:Disneyland_Resort_logo.svg', '/wiki/File:Animation_disc.svg', '/wiki/File:P_vip.svg', '/wiki/File:Magic_Kingdom_castle.jpg', '/wiki/File:Video-x-generic.svg', '/wiki/File:Flag_of_Los_Angeles_County,_California.svg', '/wiki/File:Blank_television_set.svg', '/wiki/File:Flag_of_the_United_States.svg']\n"
+ ]
+ }
+ ],
"source": [
- "#your code"
+ "# Your code\n",
+ "images = soup.select('a[class=\"image\"]')\n",
+ "images_clean = [image['href'] for image in images]\n",
+ "print(images_clean)"
]
},
{
@@ -204,21 +5743,26 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url ='https://en.wikipedia.org/wiki/Python' "
+ "url ='https://en.wikipedia.org/wiki/Python'\n",
+ "response = requests.get(url)\n",
+ "html = response.content\n",
+ "soup = BeautifulSoup(html, 'lxml')"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "# Your code\n",
+ "links = soup.select('li > a')\n",
+ "links_clean = [link['href'] for link in links]"
]
},
{
@@ -230,21 +5774,38 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url = 'http://uscode.house.gov/download/download.shtml'"
+ "url = 'http://uscode.house.gov/download/download.shtml'\n",
+ "response = requests.get(url)\n",
+ "html = response.content\n",
+ "soup = BeautifulSoup(html, 'lxml')"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of Titles that have changed in the United States Code since its last release point: 53\n"
+ ]
+ }
+ ],
"source": [
- "#your code"
+ "# Your code\n",
+ "titles_changed = soup.select('div[class=\"usctitle\"]')\n",
+ "titles_changed_clean = [title_changed.text.strip().replace('\\n\\n ', '').replace(' ٭', '') \\\n",
+ " for title_changed in titles_changed]\n",
+ "titles_changed_clean = titles_changed_clean[2:]\n",
+ "print('Number of Titles that have changed in the United States Code since its last release point:', \\\n",
+ " len(titles_changed_clean))"
]
},
{
@@ -256,21 +5817,47 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url = 'https://www.fbi.gov/wanted/topten'"
+ "url = 'https://www.fbi.gov/wanted/topten'\n",
+ "response = requests.get(url)\n",
+ "html = response.content\n",
+ "soup = BeautifulSoup(html, 'lxml')"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['ROBERT WILLIAM FISHER',\n",
+ " 'ARNOLDO JIMENEZ',\n",
+ " 'JASON DEREK BROWN',\n",
+ " 'ALEXIS FLORES',\n",
+ " 'JOSE RODOLFO VILLARREAL-HERNANDEZ',\n",
+ " 'EUGENE PALMER',\n",
+ " 'RAFAEL CARO-QUINTERO',\n",
+ " 'BHADRESHKUMAR CHETANBHAI PATEL',\n",
+ " 'ALEJANDRO ROSALES CASTILLO',\n",
+ " 'YASER ABDEL SAID']"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "#your code "
+ "# Your code\n",
+ "top_10_most_wanted = soup.select('h3 > a')\n",
+ "top_10_most_wanted_clean = [most_wanted.text.strip() for most_wanted in top_10_most_wanted]\n",
+ "top_10_most_wanted_clean"
]
},
{
@@ -282,82 +5869,303 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url = 'https://www.emsc-csem.org/Earthquake/'"
+ "url = 'https://www.emsc-csem.org/Earthquake/'\n",
+ "response = requests.get(url)\n",
+ "html = response.content"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "# Your code\n",
+ "table = pd.read_html(html)\n",
+ "table = table[3]\n",
+ "\n",
+ "date = pd.DataFrame(table['Date & Time UTC', '12345678910\\x9b»'])\n",
+ "latitude = pd.DataFrame(table['Latitude degrees', '12345678910\\x9b»'])\n",
+ "longitude = pd.DataFrame(table['Longitude degrees', '12345678910\\x9b»'])\n",
+ "region = pd.DataFrame(table['Region name [+]', '12345678910\\x9b»'])\n",
+ "\n",
+ "earthquakes = pd.concat([date, latitude, longitude, region], axis=1, \\\n",
+ " names=['date', 'latitude', 'longitude', 'region']).head(20)"
]
},
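+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Hedged follow-up, not part of the original solution: pd.concat's names=\n",
+ "# argument only labels hierarchical index levels (when keys= is used), so\n",
+ "# the frame above still carries the site's two-level column headers.\n",
+ "# One way to flatten them into readable names:\n",
+ "earthquakes.columns = ['date', 'latitude', 'longitude', 'region']\n",
+ "earthquakes.head()"
+ ]
+ },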
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "#### Display the date, and title of upcoming hackathon events as a Pandas dataframe table"
+ "#### Count number of tweets by a given Twitter account."
]
},
{
- "cell_type": "code",
- "execution_count": null,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "# This is the url you will scrape in this exercise\n",
- "url ='https://hackevents.co/hackathons'"
+ "You will need to include a ***try/except block*** for account names not found. \n",
+ " ***Hint:*** the program should count the number of tweets for any provided account"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "# This is the url you will scrape in this exercise \n",
+ "# You will need to add the account credentials to this url\n",
+ "url = 'https://twitter.com/elonmusk'\n",
+ "to_driver = '/Applications/chromedriver 2'\n",
+ "driver = webdriver.Chrome(to_driver)\n",
+ "driver.get(url)\n",
+ "time.sleep(10)\n",
+ "page_source = driver.page_source\n",
+ "driver.quit()"
]
},
{
- "cell_type": "markdown",
+ "cell_type": "code",
+ "execution_count": 20,
"metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ "Perfil / Twitter \n",
+ "\n",
+ "\n",
+ "\n",
+ "
\n",
+ "
JavaScript no está disponible. \n",
+ "
Detectamos que JavaScript está desactivado en este navegador. Activa JavaScript o cambia a un navegador compatible para seguir usando twitter.com. Puedes ver una lista de navegadores compatibles en nuestro Centro de Ayuda.
\n",
+ "
Centro de ayuda
\n",
+ "\n",
+ "
\n",
+ "
No te pierdas lo que está pasando
Los usuarios de Twitter son los primeros en enterarse.
Regístrate ahora para obtener tu propia cronología personalizada.
"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "#### Count number of tweets by a given Twitter account."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You will need to include a ***try/except block*** for account names not found. \n",
- " ***Hint:*** the program should count the number of tweets for any provided account"
+ "soup = BeautifulSoup(page_source, 'html.parser')\n",
+ "soup"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 29,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[
Perfil ]"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# This is the url you will scrape in this exercise \n",
- "# You will need to add the account credentials to this url\n",
- "url = 'https://twitter.com/'"
+ "soup.select('div[class=\"css-1dbjc4n r-1habvwh\"]')"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 28,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "ename": "IndexError",
+ "evalue": "list index out of range",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mn_of_tweets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msoup\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mselect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'div[class=\"css-1dbjc4n r-1habvwh\"]'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mn_of_tweets_clean\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mn_of_tweets\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mselect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'div[class=\"css-901oao css-bfa6kz r-9ilb82 r-1qd0xha r-n6v787 r-16dba41 r-1cwl3u0 r-bcqeeo r-qvutc0\"]'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mn_of_tweets_clean\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mn_of_tweets_clean\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'\\xa0'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m' '\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mn_of_tweets_clean\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mIndexError\u001b[0m: list index out of range"
+ ]
+ }
+ ],
"source": [
- "#your code"
+ "n_of_tweets = soup.select('div[class=\"css-1dbjc4n r-1habvwh\"]')\n",
+ "n_of_tweets_clean = n_of_tweets[0].select('div[class=\"css-901oao css-bfa6kz r-9ilb82 r-1qd0xha r-n6v787 r-16dba41 r-1cwl3u0 r-bcqeeo r-qvutc0\"]')\n",
+ "n_of_tweets_clean = n_of_tweets_clean[0].text.replace('\\xa0', ' ')\n",
+ "n_of_tweets_clean"
]
},
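+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "*Hedged sketch of the try/except flow the hint asks for.* The chromedriver path and the CSS class names are copied from the attempts above and are assumptions about Twitter's current markup; when Twitter serves the JavaScript-disabled interstitial (as in the captured page source above), no count is present and the `except` branch reports that instead of raising."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hedged sketch: wrap the whole scrape so an unknown account or a missing\n",
+    "# tweet-count element is reported instead of raising an IndexError.\n",
+    "def count_tweets(account):\n",
+    "    try:\n",
+    "        driver = webdriver.Chrome('/Applications/chromedriver 2')  # local path used above\n",
+    "        driver.get(f'https://twitter.com/{account}')\n",
+    "        time.sleep(10)\n",
+    "        page = BeautifulSoup(driver.page_source, 'html.parser')\n",
+    "        driver.quit()\n",
+    "        # Class names copied from the attempt above; Twitter changes them often.\n",
+    "        header = page.select('div[class=\"css-1dbjc4n r-1habvwh\"]')\n",
+    "        counts = header[0].select('div[class=\"css-901oao css-bfa6kz r-9ilb82 r-1qd0xha r-n6v787 r-16dba41 r-1cwl3u0 r-bcqeeo r-qvutc0\"]')\n",
+    "        return counts[0].text.replace('\\xa0', ' ')\n",
+    "    except IndexError:\n",
+    "        return f'No tweet count found for @{account} (account missing or markup changed)'\n",
+    "\n",
+    "count_tweets('elonmusk')"
+   ]
+  },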
{
@@ -383,7 +6191,13 @@
"source": [
"# This is the url you will scrape in this exercise \n",
"# You will need to add the account credentials to this url\n",
- "url = 'https://twitter.com/'"
+ "url = 'https://twitter.com/elonmusk'\n",
+ "to_driver = '/Applications/chromedriver 2'\n",
+ "driver = webdriver.Chrome(to_driver)\n",
+ "driver.get(url)\n",
+ "time.sleep(15)\n",
+ "page_source = driver.page_source\n",
+ "driver.quit()"
]
},
{
@@ -392,7 +6206,21 @@
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "soup = BeautifulSoup(page_source, 'html.parser')\n",
+ "soup"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Your code\n",
+ "followers = soup.select('div[class=\"css-1dbjc4n r-13awgt0 r-18u37iz r-1w6e6rj\"]')\n",
+ "followers_clean = followers[0].select('span[class=\"css-901oao css-16my406 r-poiln3 r-bcqeeo r-qvutc0\"]')\n",
+ "followers_clean = [item.text.strip().replace('\\xa0', ' ') for item in followers_clean]\n",
+ "followers_clean[2:4]"
]
},
{
@@ -409,7 +6237,10 @@
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url = 'https://www.wikipedia.org/'"
+ "url = 'https://www.wikipedia.org/'\n",
+ "response = requests.get(url)\n",
+ "html = response.content\n",
+ "soup = BeautifulSoup(html, 'lxml')"
]
},
{
@@ -418,7 +6249,18 @@
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "# Your code\n",
+ "languages_html = []\n",
+ "n = 1\n",
+ "\n",
+ "while n < 11:\n",
+ " x = soup.select(f'div[class=\"central-featured-lang lang{n}\"]')\n",
+ " languages_html.append(x[0])\n",
+ " n = n + 1\n",
+ " \n",
+ "languages = [language.text.strip() for language in languages_html]\n",
+ "languages_clean = [re.sub(r'\\n.*', '', language) for language in languages]\n",
+ "languages_clean"
]
},
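+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "*Hedged follow-up:* the same blocks can also yield the article counts, assuming each `central-featured-lang` div contains a `<strong>` language name and a `<small>` article count (the layout targeted above)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hedged sketch: pair each language name with its article count. Assumes each\n",
+    "# block has a <strong> (name) and a <small> (count) child.\n",
+    "blocks = soup.select('div[class*=\"central-featured-lang\"]')\n",
+    "[(b.strong.text.strip(), b.small.text.replace('\\xa0', ' ').strip()) for b in blocks]"
+   ]
+  },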
{
@@ -435,7 +6277,10 @@
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url = 'https://data.gov.uk/'"
+ "url = 'https://data.gov.uk/'\n",
+ "response = requests.get(url)\n",
+ "html = response.content\n",
+ "soup = BeautifulSoup(html, 'lxml')"
]
},
{
@@ -444,7 +6289,10 @@
"metadata": {},
"outputs": [],
"source": [
- "#your code "
+ "# Your code\n",
+ "datasets = soup.select('h3 > a')\n",
+ "datasets_clean = [dataset.text.strip() for dataset in datasets]\n",
+ "datasets_clean"
]
},
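+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "*Optional, presentation only:* the scraped dataset categories can be shown as a one-column DataFrame."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional presentation step: list the scraped dataset categories in a DataFrame.\n",
+    "pd.DataFrame(datasets_clean, columns=['dataset_type'])"
+   ]
+  },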
{
@@ -461,7 +6309,10 @@
"outputs": [],
"source": [
"# This is the url you will scrape in this exercise\n",
- "url = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers'"
+ "url = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers'\n",
+ "response = requests.get(url)\n",
+ "html = response.content\n",
+ "soup = BeautifulSoup(html, 'lxml')"
]
},
{
@@ -470,7 +6321,11 @@
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "# Your code\n",
+ "table = pd.read_html(html)\n",
+ "table = table[3]\n",
+ "table = table.drop(['Percentageof worldpopulation(2007)'], axis=1).dropna().head(10)\n",
+ "table"
]
},
{
@@ -504,7 +6359,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# your code"
+ "# Your code\n"
]
},
{
@@ -530,7 +6385,7 @@
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "# Your code\n"
]
},
{
@@ -557,7 +6412,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# your code"
+ "# Your code\n"
]
},
{
@@ -584,7 +6439,7 @@
"metadata": {},
"outputs": [],
"source": [
- "#your code"
+ "# Your code\n"
]
}
],
@@ -604,7 +6459,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.3"
+ "version": "3.9.5"
}
},
"nbformat": 4,