-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
96 lines (80 loc) · 3.42 KB
/
main.py
File metadata and controls
96 lines (80 loc) · 3.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os
import ssl
from fastapi import FastAPI, Query, HTTPException
from fastapi.responses import JSONResponse
import certifi
import requests
import urllib3
from requests.adapters import HTTPAdapter
from bs4 import BeautifulSoup
# FastAPI application instance; the /scrape route below is registered on it.
app = FastAPI(title="Web Scraper API")
def get_verify_path():
    """Return the CA-bundle path to use for TLS verification.

    Environment overrides win, checked in order: REQUESTS_CA_BUNDLE,
    then SSL_CERT_FILE. When neither is set (or both are empty),
    fall back to certifi's bundled certificates.
    """
    for env_var in ("REQUESTS_CA_BUNDLE", "SSL_CERT_FILE"):
        override = os.environ.get(env_var)
        if override:
            return override
    return certifi.where()
class CertifiHTTPSAdapter(HTTPAdapter):
    """Transport adapter that pins HTTPS verification to the certifi bundle.

    Builds the urllib3 pool manager with an explicit SSLContext loaded from
    get_verify_path(), which works around hosts where passing a bare
    ``verify=<path>`` fails (notably broken Windows cert stores).
    """

    def init_poolmanager(self, *args, **kwargs):
        # Inject the certifi-backed context before the pool is constructed.
        context = ssl.create_default_context(cafile=get_verify_path())
        merged = {**kwargs, "ssl_context": context}
        return super().init_poolmanager(*args, **merged)
# Session that uses certifi for HTTPS verification (works when verify=path fails on Windows)
# Shared module-level session: connection pooling + the certifi adapter on every HTTPS call.
_session = requests.Session()
_session.mount("https://", CertifiHTTPSAdapter())
# At startup: point env to certifi if not set (helps Python/OpenSSL on Windows)
# setdefault() preserves any value the operator already exported.
_ca_path = get_verify_path()
os.environ.setdefault("SSL_CERT_FILE", _ca_path)
os.environ.setdefault("REQUESTS_CA_BUNDLE", _ca_path)
print("Using CA bundle:", _ca_path)
# Optional: skip SSL verification for /scrape only (e.g. corporate proxy, broken Windows store)
# Set SCRAPER_SKIP_SSL_VERIFY=1 to enable. Per-request verify_ssl param overrides this when provided.
# NOTE(review): with a custom ssl_context mounted above, some requests versions may not
# fully honor verify=False on that adapter — confirm against the pinned requests release.
_SKIP_SSL_VERIFY = os.environ.get("SCRAPER_SKIP_SSL_VERIFY", "").strip().lower() in ("1", "true", "yes")
if _SKIP_SSL_VERIFY:
    # Suppress per-request InsecureRequestWarning noise once verification is globally off.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    print("WARNING: SCRAPER_SKIP_SSL_VERIFY=1 — SSL verification disabled by default for /scrape requests.")
@app.get("/scrape")
def scrape(
    url: str = Query(..., description="The URL of the page to scrape"),
    verify_ssl: bool = Query(True, description="Verify SSL certificate for HTTPS URLs"),
):
    """Fetch a webpage and return its URL, <title> text, and anchor count.

    Declared as a plain ``def`` (not ``async def``) on purpose: ``requests``
    performs blocking I/O, and FastAPI runs sync endpoints in its threadpool,
    so the event loop is never blocked while the remote page downloads.

    Query params:
        url: target page URL.
        verify_ssl: per-request TLS verification toggle (default True);
            the SCRAPER_SKIP_SSL_VERIFY env flag forces it off globally.

    Raises:
        HTTPException(400): timeout, SSL failure, malformed URL, non-2xx
            status, or any other fetch error.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    # Per-request verify_ssl; env SCRAPER_SKIP_SSL_VERIFY can force skip globally
    verify = False if _SKIP_SSL_VERIFY else verify_ssl
    try:
        response = _session.get(
            url,
            headers=headers,
            timeout=15,
            allow_redirects=True,
            verify=verify,
        )
        response.raise_for_status()
    except requests.exceptions.Timeout:
        raise HTTPException(status_code=400, detail="Timeout fetching URL")
    except requests.exceptions.SSLError:
        raise HTTPException(
            status_code=400,
            detail="SSL verification failed (try verify_ssl=false)",
        )
    # MissingSchema and InvalidURL share one handler: both mean a bad URL.
    except (requests.exceptions.MissingSchema, requests.exceptions.InvalidURL):
        raise HTTPException(status_code=400, detail="Invalid URL format")
    except requests.RequestException as e:
        raise HTTPException(status_code=400, detail=f"Failed to fetch URL: {e}")

    soup = BeautifulSoup(response.text, "html.parser")
    title_tag = soup.find("title")
    title = title_tag.get_text(strip=True) if title_tag else ""
    # Count only anchors that actually carry an href attribute.
    links_found = len(soup.find_all("a", href=True))
    return JSONResponse(
        content={
            "url": url,
            "title": title,
            "links_found": links_found,
        }
    )