-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
96 lines (80 loc) · 3.42 KB
/
main.py
File metadata and controls
96 lines (80 loc) · 3.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os
import ssl
from fastapi import FastAPI, Query, HTTPException
from fastapi.responses import JSONResponse
import certifi
import requests
import urllib3
from requests.adapters import HTTPAdapter
from bs4 import BeautifulSoup
# FastAPI application instance; the /scrape route below is registered on it.
app = FastAPI(title="Web Scraper API")
def get_verify_path():
    """Return the CA-bundle path to use for TLS verification.

    Environment overrides win, checked in order: REQUESTS_CA_BUNDLE,
    then SSL_CERT_FILE. When neither is set (or both are empty),
    fall back to certifi's bundled certificates.
    """
    for env_var in ("REQUESTS_CA_BUNDLE", "SSL_CERT_FILE"):
        override = os.environ.get(env_var)
        if override:
            return override
    return certifi.where()
class CertifiHTTPSAdapter(HTTPAdapter):
    """Transport adapter that pins HTTPS verification to the certifi bundle.

    Builds the urllib3 pool manager with an explicit SSLContext loaded from
    get_verify_path(), which works around hosts where passing a bare
    ``verify=<path>`` fails (notably broken Windows cert stores).
    """

    def init_poolmanager(self, *args, **kwargs):
        # Inject the certifi-backed context before the pool is constructed.
        context = ssl.create_default_context(cafile=get_verify_path())
        merged = {**kwargs, "ssl_context": context}
        return super().init_poolmanager(*args, **merged)
# Session that uses certifi for HTTPS verification (works when verify=path fails on Windows)
# Shared module-level session: connection pooling + the certifi adapter on every HTTPS call.
_session = requests.Session()
_session.mount("https://", CertifiHTTPSAdapter())
# At startup: point env to certifi if not set (helps Python/OpenSSL on Windows)
# setdefault() preserves any value the operator already exported.
_ca_path = get_verify_path()
os.environ.setdefault("SSL_CERT_FILE", _ca_path)
os.environ.setdefault("REQUESTS_CA_BUNDLE", _ca_path)
print("Using CA bundle:", _ca_path)
# Optional: skip SSL verification for /scrape only (e.g. corporate proxy, broken Windows store)
# Set SCRAPER_SKIP_SSL_VERIFY=1 to enable. Per-request verify_ssl param overrides this when provided.
# NOTE(review): with a custom ssl_context mounted above, some requests versions may not
# fully honor verify=False on that adapter — confirm against the pinned requests release.
_SKIP_SSL_VERIFY = os.environ.get("SCRAPER_SKIP_SSL_VERIFY", "").strip().lower() in ("1", "true", "yes")
if _SKIP_SSL_VERIFY:
    # Suppress per-request InsecureRequestWarning noise once verification is globally off.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    print("WARNING: SCRAPER_SKIP_SSL_VERIFY=1 — SSL verification disabled by default for /scrape requests.")
@app.get("/scrape")
def scrape(
    url: str = Query(..., description="The URL of the page to scrape"),
    verify_ssl: bool = Query(True, description="Verify SSL certificate for HTTPS URLs"),
):
    """Fetch a webpage and return its URL, <title> text, and anchor count.

    Declared as a plain ``def`` (not ``async def``) on purpose: ``requests``
    performs blocking I/O, and FastAPI runs sync endpoints in its threadpool,
    so the event loop is never blocked while the remote page downloads.

    Query params:
        url: target page URL.
        verify_ssl: per-request TLS verification toggle (default True);
            the SCRAPER_SKIP_SSL_VERIFY env flag forces it off globally.

    Raises:
        HTTPException(400): timeout, SSL failure, malformed URL, non-2xx
            status, or any other fetch error.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    # Per-request verify_ssl; env SCRAPER_SKIP_SSL_VERIFY can force skip globally
    verify = False if _SKIP_SSL_VERIFY else verify_ssl
    try:
        response = _session.get(
            url,
            headers=headers,
            timeout=15,
            allow_redirects=True,
            verify=verify,
        )
        response.raise_for_status()
    except requests.exceptions.Timeout:
        raise HTTPException(status_code=400, detail="Timeout fetching URL")
    except requests.exceptions.SSLError:
        raise HTTPException(
            status_code=400,
            detail="SSL verification failed (try verify_ssl=false)",
        )
    # MissingSchema and InvalidURL share one handler: both mean a bad URL.
    except (requests.exceptions.MissingSchema, requests.exceptions.InvalidURL):
        raise HTTPException(status_code=400, detail="Invalid URL format")
    except requests.RequestException as e:
        raise HTTPException(status_code=400, detail=f"Failed to fetch URL: {e}")

    soup = BeautifulSoup(response.text, "html.parser")
    title_tag = soup.find("title")
    title = title_tag.get_text(strip=True) if title_tag else ""
    # Count only anchors that actually carry an href attribute.
    links_found = len(soup.find_all("a", href=True))
    return JSONResponse(
        content={
            "url": url,
            "title": title,
            "links_found": links_found,
        }
    )