diff --git a/tests/test_cli_scout_url.py b/tests/test_cli_scout_url.py
index 29a40c8..9b5c641 100644
--- a/tests/test_cli_scout_url.py
+++ b/tests/test_cli_scout_url.py
@@ -176,3 +176,31 @@ def _fake_get(*args, **kwargs):
     assert calls[1]["timeout"] == 7
     assert responses[0].text_reads == 0
     assert responses[1].text_reads == 1
+
+
+def test_probe_url_passes_custom_user_agent(monkeypatch) -> None:
+    calls: list[dict[str, Any]] = []
+
+    class _FakeResponse:
+        def __init__(self) -> None:
+            self.headers = {"Content-Type": "application/octet-stream"}
+            self.url = "https://example.org/resource"
+            self.status_code = 200
+
+        def __enter__(self) -> "_FakeResponse":
+            return self
+
+        def __exit__(self, exc_type, exc, tb) -> None:
+            return None
+
+    def _fake_get(*args, **kwargs):
+        calls.append(kwargs)
+        return _FakeResponse()
+
+    monkeypatch.setattr("toolkit.cli.cmd_scout_url.requests.get", _fake_get)
+
+    custom_ua = "Mozilla/5.0 (DataCivicLab Custom)"
+    probe_url("https://example.org/test", user_agent=custom_ua)
+
+    assert len(calls) == 1
+    assert calls[0]["headers"] == {"User-Agent": custom_ua}
diff --git a/toolkit/cli/cmd_scout_url.py b/toolkit/cli/cmd_scout_url.py
index a1ff41f..22e44e0 100644
--- a/toolkit/cli/cmd_scout_url.py
+++ b/toolkit/cli/cmd_scout_url.py
@@ -72,8 +72,8 @@ def _candidate_links(base_url: str, html_text: str) -> list[str]:
     return links
 
 
-def probe_url(url: str, *, timeout: int = _DEFAULT_TIMEOUT) -> dict[str, Any]:
-    headers = {"User-Agent": _DEFAULT_USER_AGENT}
+def probe_url(url: str, *, timeout: int = _DEFAULT_TIMEOUT, user_agent: str = _DEFAULT_USER_AGENT) -> dict[str, Any]:
+    headers = {"User-Agent": user_agent}
     with requests.get(url, allow_redirects=True, timeout=timeout, headers=headers, stream=True) as response:
         content_type = response.headers.get("Content-Type")
         content_disposition = response.headers.get("Content-Disposition")
@@ -105,12 +105,13 @@ def probe_url(url: str, *, timeout: int = _DEFAULT_TIMEOUT) -> dict[str, Any]:
 def scout_url(
     url: str = typer.Argument(..., help="URL da ispezionare"),
     timeout: int = typer.Option(_DEFAULT_TIMEOUT, "--timeout", min=1, help="Timeout HTTP in secondi"),
+    user_agent: str = typer.Option(_DEFAULT_USER_AGENT, "--user-agent", help="User-Agent da usare per la richiesta"),
 ) -> None:
     """
     Ispeziona un URL per dataset scouting minimale.
     """
     try:
-        result = probe_url(url, timeout=timeout)
+        result = probe_url(url, timeout=timeout, user_agent=user_agent)
     except requests.RequestException as exc:
         typer.echo(f"error: {type(exc).__name__}: {exc}")
         raise typer.Exit(code=1) from exc