-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
93 lines (74 loc) · 3.28 KB
/
app.py
File metadata and controls
93 lines (74 loc) · 3.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import os
# Page we scrape for the demo table and for document links.
url = "https://en.wikipedia.org/wiki/Transformer_(deep_learning)"

# Wikimedia rejects generic/default user-agents, so identify the bot and
# include a contact address, per the Wikimedia robot policy.
headers = {
    "User-Agent": "PyScrapeBot/1.0 (https://example.org/pyscrape; contact@example.org)",
    "Accept-Encoding": "gzip, deflate",  # recommended by Wikimedia robot policy
}

# Fetch the page and fail fast on HTTP errors — 403/429 are the usual
# responses when the UA is missing or rate limits are hit.
response = requests.get(url, headers=headers, timeout=15)
response.raise_for_status()

# Build the parse tree from the raw response bytes.
soup = BeautifulSoup(response.content, "html.parser")

# Echo the document title as a quick sanity check that we got real HTML.
title_tag = soup.title
if title_tag is not None:
    print("Title: " + title_tag.text)
else:
    print("No title found in the document")
# Find the main data table (prefer a "wikitable", fall back to the first table).
table = soup.find("table", class_="wikitable") or soup.find("table")
if not table:
    raise RuntimeError("No table found in the page HTML; structure may have changed or access was blocked.")

# Extract headers and data rows in one pass.
first_row = table.find("tr")
if first_row:
    table_headers = [th.text.strip() for th in first_row.find_all("th")]
    if not table_headers:  # If no <th>, treat every row (including the first) as data
        data_rows = table.find_all("tr")
        data = [[td.text.strip() for td in row.find_all("td")] for row in data_rows]
        table_headers = [f"Column_{i+1}" for i in range(len(data[0]))] if data else []
    else:
        data = [[td.text.strip() for td in row.find_all("td")] for row in table.find_all("tr")[1:]]
    # FIX: pd.DataFrame raises ValueError when any row's cell count differs
    # from len(table_headers) — common in Wikipedia tables (rowspan/colspan,
    # and all-<th> sub-header rows which produce empty [] rows above).
    # Drop empty rows, then pad short rows with "" and trim long ones so
    # every row matches the header width exactly.
    if table_headers:
        width = len(table_headers)
        data = [(row + [""] * (width - len(row)))[:width] for row in data if row]
else:
    raise RuntimeError("No rows found in the table.")

# Convert to DataFrame and display the first few rows.
if table_headers and data:
    df = pd.DataFrame(data, columns=table_headers)
    print(df.head())
else:
    print("No data or headers extracted from the table.")
# Persist the scraped table to CSV. Note that df is only bound when the
# extraction step above actually produced headers and data, so guard the
# unbound-name case explicitly instead of letting the generic handler print
# the misleading "Error saving CSV: name 'df' is not defined".
try:
    df.to_csv("scraped_data.csv", index=False)
    print("Data saved to scraped_data.csv")
except NameError:
    # df was never created because no usable table data was extracted.
    print("No DataFrame available; skipping CSV export.")
except Exception as e:
    print(f"Error saving CSV: {e}")
# --- Document downloads ---------------------------------------------------
# Collect anchor tags whose href looks like a downloadable document
# (PDF/DOC/TXT, matched case-insensitively on the extension substring).
document_links = soup.find_all(
    'a',
    href=lambda href: href and ('.pdf' in href.lower() or '.doc' in href.lower() or '.txt' in href.lower()),
)

# Create a directory for downloads if it doesn't exist.
download_dir = "downloads"
os.makedirs(download_dir, exist_ok=True)

# Download each linked document, resolving relative hrefs against the page URL.
for i, link in enumerate(document_links):
    document_href = link['href']
    full_url = urljoin(url, document_href)  # handles relative URLs
    try:
        document_response = requests.get(full_url, headers=headers, timeout=15)
        document_response.raise_for_status()
        # Derive a filename from the URL; fall back to a numbered default when
        # the URL ends in "/" and basename comes back empty.
        filename = os.path.basename(full_url) or f"document_{i+1}.pdf"
        filepath = os.path.join(download_dir, filename)
        # Save the document bytes to disk.
        with open(filepath, 'wb') as file:
            file.write(document_response.content)
        # BUG FIX: the original printed the literal "(unknown)" instead of the
        # saved filename — the f-string placeholder was lost.
        print(f"Downloaded: {filename}")
    except Exception as e:
        print(f"Error downloading {full_url}: {e}")