-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
93 lines (74 loc) · 3.28 KB
/
app.py
File metadata and controls
93 lines (74 loc) · 3.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import os
# Page we scrape for the demo table and for document links.
url = "https://en.wikipedia.org/wiki/Transformer_(deep_learning)"

# Wikimedia rejects generic/default user-agents, so identify the bot and
# include a contact address, per the Wikimedia robot policy.
headers = {
    "User-Agent": "PyScrapeBot/1.0 (https://example.org/pyscrape; contact@example.org)",
    "Accept-Encoding": "gzip, deflate",  # recommended by Wikimedia robot policy
}

# Fetch the page and fail fast on HTTP errors — 403/429 are the usual
# responses when the UA is missing or rate limits are hit.
response = requests.get(url, headers=headers, timeout=15)
response.raise_for_status()

# Build the parse tree from the raw response bytes.
soup = BeautifulSoup(response.content, "html.parser")

# Echo the document title as a quick sanity check that we got real HTML.
title_tag = soup.title
if title_tag is not None:
    print("Title: " + title_tag.text)
else:
    print("No title found in the document")
# Find the main data table (prefer a "wikitable", fall back to the first table).
table = soup.find("table", class_="wikitable") or soup.find("table")
if not table:
    raise RuntimeError("No table found in the page HTML; structure may have changed or access was blocked.")

# Extract headers and data rows in one pass.
first_row = table.find("tr")
if first_row:
    table_headers = [th.text.strip() for th in first_row.find_all("th")]
    if not table_headers:  # If no <th>, treat every row (including the first) as data
        data_rows = table.find_all("tr")
        data = [[td.text.strip() for td in row.find_all("td")] for row in data_rows]
        table_headers = [f"Column_{i+1}" for i in range(len(data[0]))] if data else []
    else:
        data = [[td.text.strip() for td in row.find_all("td")] for row in table.find_all("tr")[1:]]
    # FIX: pd.DataFrame raises ValueError when any row's cell count differs
    # from len(table_headers) — common in Wikipedia tables (rowspan/colspan,
    # and all-<th> sub-header rows which produce empty [] rows above).
    # Drop empty rows, then pad short rows with "" and trim long ones so
    # every row matches the header width exactly.
    if table_headers:
        width = len(table_headers)
        data = [(row + [""] * (width - len(row)))[:width] for row in data if row]
else:
    raise RuntimeError("No rows found in the table.")

# Convert to DataFrame and display the first few rows.
if table_headers and data:
    df = pd.DataFrame(data, columns=table_headers)
    print(df.head())
else:
    print("No data or headers extracted from the table.")
# Persist the scraped table to CSV. Note that df is only bound when the
# extraction step above actually produced headers and data, so guard the
# unbound-name case explicitly instead of letting the generic handler print
# the misleading "Error saving CSV: name 'df' is not defined".
try:
    df.to_csv("scraped_data.csv", index=False)
    print("Data saved to scraped_data.csv")
except NameError:
    # df was never created because no usable table data was extracted.
    print("No DataFrame available; skipping CSV export.")
except Exception as e:
    print(f"Error saving CSV: {e}")
# --- Document downloads ---------------------------------------------------
# Collect anchor tags whose href looks like a downloadable document
# (PDF/DOC/TXT, matched case-insensitively on the extension substring).
document_links = soup.find_all(
    'a',
    href=lambda href: href and ('.pdf' in href.lower() or '.doc' in href.lower() or '.txt' in href.lower()),
)

# Create a directory for downloads if it doesn't exist.
download_dir = "downloads"
os.makedirs(download_dir, exist_ok=True)

# Download each linked document, resolving relative hrefs against the page URL.
for i, link in enumerate(document_links):
    document_href = link['href']
    full_url = urljoin(url, document_href)  # handles relative URLs
    try:
        document_response = requests.get(full_url, headers=headers, timeout=15)
        document_response.raise_for_status()
        # Derive a filename from the URL; fall back to a numbered default when
        # the URL ends in "/" and basename comes back empty.
        filename = os.path.basename(full_url) or f"document_{i+1}.pdf"
        filepath = os.path.join(download_dir, filename)
        # Save the document bytes to disk.
        with open(filepath, 'wb') as file:
            file.write(document_response.content)
        # BUG FIX: the original printed the literal "(unknown)" instead of the
        # saved filename — the f-string placeholder was lost.
        print(f"Downloaded: {filename}")
    except Exception as e:
        print(f"Error downloading {full_url}: {e}")