From b49a9fa78838c9d213383bbd7f22d83881ab8a23 Mon Sep 17 00:00:00 2001
From: Zio Gabber <78922322+Gabrymi93@users.noreply.github.com>
Date: Tue, 24 Mar 2026 18:39:11 +0000
Subject: [PATCH 1/2] feat(scout-url): add --user-agent CLI parameter (#67)

---
Review notes (below the three-dash line, so not part of the commit message):
- probe_url gains a keyword-only `user_agent` parameter whose default is
  _DEFAULT_USER_AGENT; the value is sent as the "User-Agent" request header.
  Calls that omit it build the same header as before this patch.
- scout_url gains a `--user-agent` option with the same default and passes
  it through to probe_url.

 toolkit/cli/cmd_scout_url.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/toolkit/cli/cmd_scout_url.py b/toolkit/cli/cmd_scout_url.py
index a1ff41f..22e44e0 100644
--- a/toolkit/cli/cmd_scout_url.py
+++ b/toolkit/cli/cmd_scout_url.py
@@ -72,8 +72,8 @@ def _candidate_links(base_url: str, html_text: str) -> list[str]:
     return links
 
 
-def probe_url(url: str, *, timeout: int = _DEFAULT_TIMEOUT) -> dict[str, Any]:
-    headers = {"User-Agent": _DEFAULT_USER_AGENT}
+def probe_url(url: str, *, timeout: int = _DEFAULT_TIMEOUT, user_agent: str = _DEFAULT_USER_AGENT) -> dict[str, Any]:
+    headers = {"User-Agent": user_agent}
     with requests.get(url, allow_redirects=True, timeout=timeout, headers=headers, stream=True) as response:
         content_type = response.headers.get("Content-Type")
         content_disposition = response.headers.get("Content-Disposition")
@@ -105,12 +105,13 @@ def probe_url(url: str, *, timeout: int = _DEFAULT_TIMEOUT) -> dict[str, Any]:
 def scout_url(
     url: str = typer.Argument(..., help="URL da ispezionare"),
     timeout: int = typer.Option(_DEFAULT_TIMEOUT, "--timeout", min=1, help="Timeout HTTP in secondi"),
+    user_agent: str = typer.Option(_DEFAULT_USER_AGENT, "--user-agent", help="User-Agent da usare per la richiesta"),
 ) -> None:
     """
     Ispeziona un URL per dataset scouting minimale. 
     """
     try:
-        result = probe_url(url, timeout=timeout)
+        result = probe_url(url, timeout=timeout, user_agent=user_agent)
     except requests.RequestException as exc:
         typer.echo(f"error: {type(exc).__name__}: {exc}")
         raise typer.Exit(code=1) from exc

From d3964d5272101f2dd9a6bc0a533ae7f60daa2a08 Mon Sep 17 00:00:00 2001
From: Zio Gabber <78922322+Gabrymi93@users.noreply.github.com>
Date: Tue, 24 Mar 2026 19:35:53 +0000
Subject: [PATCH 2/2] test(scout-url): add explicit user-agent header test (#67)

---
Review notes (below the three-dash line, so not part of the commit message):
- The new test swaps toolkit.cli.cmd_scout_url.requests.get for a fake that
  records its keyword arguments, calls probe_url with a custom user agent,
  and asserts there was exactly one request whose headers equal
  {"User-Agent": custom_ua}.
- The fake response only provides headers, url, status_code and the
  context-manager methods. NOTE(review): presumably the
  "application/octet-stream" content type keeps probe_url from reading the
  body -- confirm against the full probe_url implementation.
- The test calls probe_url directly; the `--user-agent` option on scout_url
  is not exercised here.

 tests/test_cli_scout_url.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/tests/test_cli_scout_url.py b/tests/test_cli_scout_url.py
index 29a40c8..9b5c641 100644
--- a/tests/test_cli_scout_url.py
+++ b/tests/test_cli_scout_url.py
@@ -176,3 +176,31 @@ def _fake_get(*args, **kwargs):
     assert calls[1]["timeout"] == 7
     assert responses[0].text_reads == 0
     assert responses[1].text_reads == 1
+
+
+def test_probe_url_passes_custom_user_agent(monkeypatch) -> None:
+    calls: list[dict[str, Any]] = []
+
+    class _FakeResponse:
+        def __init__(self) -> None:
+            self.headers = {"Content-Type": "application/octet-stream"}
+            self.url = "https://example.org/resource"
+            self.status_code = 200
+
+        def __enter__(self) -> "_FakeResponse":
+            return self
+
+        def __exit__(self, exc_type, exc, tb) -> None:
+            return None
+
+    def _fake_get(*args, **kwargs):
+        calls.append(kwargs)
+        return _FakeResponse()
+
+    monkeypatch.setattr("toolkit.cli.cmd_scout_url.requests.get", _fake_get)
+
+    custom_ua = "Mozilla/5.0 (DataCivicLab Custom)"
+    probe_url("https://example.org/test", user_agent=custom_ua)
+
+    assert len(calls) == 1
+    assert calls[0]["headers"] == {"User-Agent": custom_ua}