diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 172c6d1..5a8af59 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -2,7 +2,6 @@ name: Tests on: push: - branches: [main, master] pull_request: branches: [main, master] workflow_dispatch: diff --git a/.gitignore b/.gitignore index c9454d6..462dbe2 100644 --- a/.gitignore +++ b/.gitignore @@ -42,3 +42,6 @@ htmlcov/ # Old marker files (now stored in XDG data dir) *_marker *_results_*.out + +# Claude Code temp files +tmpclaude-* diff --git a/CLAUDE.md b/CLAUDE.md index 90e34f9..e353c33 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -34,6 +34,42 @@ ruff check src/ pytest ``` +## Editable Installs + +When installed with `pip install -e .` (editable mode), the `cetus` command runs directly from source code in `src/cetus/`. Any changes to the source files are immediately reflected without reinstalling. + +**How it works:** +- The installed package contains a `.pth` file pointing to the source directory +- Python imports modules directly from `src/cetus/` at runtime +- Entry point scripts (like `cetus.exe`) invoke `cetus.cli:main` from source + +**When to use:** +- Development and testing - changes are instant +- Debugging - breakpoints and print statements work immediately + +**Note:** The venv at the repo root (`alerting_app/.venv`) is shared with the Django app. Install cetus-client from the repo root: +```bash +pip install -e "./cetus-client[dev]" +``` + +## Version Management + +Version is defined in **one place only**: `pyproject.toml` + +The `src/cetus/__init__.py` uses `importlib.metadata.version()` to read it at runtime: +```python +from importlib.metadata import version +__version__ = version("cetus-client") +``` + +**When bumping version:** Only update `version = "X.Y.Z"` in `pyproject.toml` + +**In tests:** Never hardcode version strings. 
Import from the package: +```python +from cetus import __version__ +assert f"cetus-client/{__version__}" in user_agent +``` + ## Architecture The CLI is built with Click and uses httpx for HTTP requests. All source code is in `src/cetus/`. diff --git a/README.md b/README.md index d5f3b5c..6520448 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,91 @@ cetus query "A:192.168.1.1" --format table cetus alerts list ``` +## Operating Modes + +Cetus has two primary operating modes designed for different use cases: + +### Direct Mode (stdout) + +**For:** Interactive exploration, piping to other tools, one-off queries + +Direct mode outputs results to stdout with no state tracking. Each query is independent - you get exactly what you ask for, nothing more. + +```bash +# Interactive exploration +cetus query "host:*.example.com" --format table + +# Pipe to jq for processing +cetus query "host:*.example.com" | jq '.[].host' + +# Chain with other tools +cetus query "A:192.168.1.*" | jq -r '.[].host' | sort -u +``` + +**Characteristics:** +- Results go to stdout (terminal or pipe) +- No markers - queries are stateless +- Full query results returned every time +- Default format: `json` + +### Collector Mode (file output) + +**For:** Data collection, scheduled exports, building datasets over time + +Collector mode writes to files and tracks your position using markers. Subsequent runs fetch only new records since the last query, making it efficient for ongoing data collection. + +```bash +# First run: fetches last 7 days, creates file +cetus query "host:*.example.com" -o results.jsonl +# Output: Wrote 1,523 records to results.jsonl + +# Later runs: fetches only NEW records, appends to file +cetus query "host:*.example.com" -o results.jsonl +# Output: Resuming from: 2025-01-14T10:30:00 +# Appended 47 records to results.jsonl + +# No new data? 
File unchanged +cetus query "host:*.example.com" -o results.jsonl +# Output: Resuming from: 2025-01-14T15:42:18 +# No new records (file unchanged) +``` + +**Characteristics:** +- Results written to file (`-o` or `-p`) +- Markers track last-seen record per query +- Incremental updates - only fetches new data +- Appends to existing files (or creates timestamped files with `-p`) +- Default format: `json` (recommended: `jsonl`) + +**Two file output options:** + +| Option | Behavior | Use Case | +|--------|----------|----------| +| `-o FILE` | Appends to same file | Cumulative dataset | +| `-p PREFIX` | Creates timestamped files | Export pipelines, archival | + +**Important:** `-o` and `-p` maintain separate markers. You can use both modes +for the same query without data gaps - each tracks its own position independently. + +```bash +# -o: Single cumulative file +cetus query "host:*.example.com" -o dns_data.jsonl +# Always writes to: dns_data.jsonl + +# -p: Timestamped files per run +cetus query "host:*.example.com" -p exports/dns +# Creates: exports/dns_2025-01-14_10-30-00.jsonl +# Next run: exports/dns_2025-01-14_14-45-00.jsonl +``` + +**Switching modes:** Use `--no-marker` to run a collector-mode query without markers (full re-query, overwrites file): + +```bash +cetus query "host:*.example.com" --no-marker --since-days 30 -o full_export.jsonl +``` + +--- + ## Commands ### query @@ -68,59 +153,71 @@ cetus query SEARCH [OPTIONS] | `-i, --index` | Index: `dns`, `certstream`, `alerting` (default: dns) | | `-m, --media` | Storage tier: `nvme` (fast), `all` (complete) | | `-f, --format` | Output: `json`, `jsonl`, `csv`, `table` | -| `-o, --output FILE` | Write to file instead of stdout | -| `-d, --since-days N` | Look back N days (default: 7) | -| `--stream` | Stream results as they arrive | -| `--no-marker` | Disable incremental query tracking | +| `-o, --output FILE` | Collector mode: write to file (enables markers) | +| `-p, --output-prefix PREFIX` | Collector mode: 
timestamped files (e.g., `prefix_2025-01-14_10-30-00.jsonl`) | +| `-d, --since-days N` | Look back N days (default: 7, ignored if marker exists) | +| `--stream` | Stream results as they arrive (large queries) | +| `--no-marker` | Disable incremental tracking (full re-query) | **Examples:** ```bash -# Basic query -cetus query "host:*.example.com" - -# Pipe to jq for processing -cetus query "host:*.example.com" | jq '.[].host' +# Direct mode - interactive queries +cetus query "host:*.example.com" # JSON to stdout +cetus query "host:*.example.com" --format table # Human-readable +cetus query "host:*.example.com" | jq '.[].host' # Pipe to tools -# Table format for human reading -cetus query "A:10.0.0.1" --format table +# Collector mode - data collection +cetus query "host:*.example.com" -o results.jsonl # Incremental collection +cetus query "host:*.example.com" -p exports/dns # Timestamped exports -# Save to file -cetus query "host:*.example.com" -o results.json - -# Stream large results (uses jsonl format) +# Stream large results cetus query "host:*" --stream -o all_records.jsonl -# Query certificate transparency logs +# Query other indices cetus query "leaf_cert.subject.CN:*.example.com" --index certstream +cetus query "alert_type:dns_match" --index alerting -# Look back 30 days -cetus query "host:example.com" --since-days 30 +# Full re-query (ignore markers) +cetus query "host:*.example.com" --no-marker --since-days 30 -o full.jsonl ``` -### Incremental Queries +### Collector Mode Details -The client tracks your queries using markers. First run fetches N days of data; subsequent runs fetch only new records. 
+**Markers** track your position so subsequent queries fetch only new records: ```bash -# First run: fetches last 7 days -cetus query "host:*.example.com" -o results.jsonl +cetus markers list # Show all markers +cetus markers clear # Clear all markers +cetus markers clear --index dns # Clear only DNS markers +``` -# Later runs: fetches only new data -cetus query "host:*.example.com" -o results.jsonl +**Console feedback** shows what's happening: -# Skip markers for a full query -cetus query "host:*.example.com" --no-marker --since-days 30 ``` +# Starting incremental query with existing marker: +Resuming from: 2025-01-14T10:30:00 +Fetched 1,523 records (page 2)... +Appended 47 records to results.jsonl in 2.34s -Manage markers: +# No new records (file exists): +Resuming from: 2025-01-14T15:42:18 +No new records (file unchanged) in 0.45s -```bash -cetus markers list # Show all markers -cetus markers clear # Clear all markers -cetus markers clear --index dns # Clear only DNS markers +# No new records (marker exists, output file missing or empty): +No new records since last query (no file written) in 0.38s ``` +**Recommended format:** `jsonl` (JSON Lines) +- Efficient append operations +- Easy to process: `wc -l`, `grep`, `jq -s` +- No rewriting of existing data + +Other formats: +- `csv`: Appends rows without repeating header +- `json`: Merges into existing array (requires rewriting file) +- `table`: Not recommended for file output + ### alerts list List alert definitions. 
diff --git a/pyproject.toml b/pyproject.toml index 23688c8..b0e714c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "cetus-client" -version = "0.0.2" +version = "0.0.3" description = "CLI client for the Cetus threat intelligence alerting API" readme = "README.md" requires-python = ">=3.10" diff --git a/src/cetus/__init__.py b/src/cetus/__init__.py index 5c06242..996434d 100644 --- a/src/cetus/__init__.py +++ b/src/cetus/__init__.py @@ -1,3 +1,5 @@ """Cetus CLI - Client for the Cetus threat intelligence alerting API.""" -__version__ = "0.0.1" +from importlib.metadata import version + +__version__ = version("cetus-client") diff --git a/src/cetus/cli.py b/src/cetus/cli.py index ff7c993..a6f5938 100644 --- a/src/cetus/cli.py +++ b/src/cetus/cli.py @@ -3,10 +3,13 @@ from __future__ import annotations import asyncio +import csv import io +import json import logging import sys import time +from datetime import datetime from pathlib import Path import click @@ -24,6 +27,194 @@ console = Console(stderr=True) +def _generate_timestamped_filename(prefix: str, output_format: str) -> Path: + """Generate a filename with current timestamp. 
+ + Args: + prefix: File path prefix (can include directory) + output_format: Format extension (json, jsonl, csv, table) + + Returns: + Path with timestamp and appropriate extension + """ + timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + # Map format to extension + ext_map = {"json": "json", "jsonl": "jsonl", "csv": "csv", "table": "txt"} + ext = ext_map.get(output_format, output_format) + return Path(f"{prefix}_{timestamp}.{ext}") + + +def _file_has_content(path: Path) -> bool: + """Check if file exists and has content.""" + return path.exists() and path.stat().st_size > 0 + + +def _append_jsonl(data: list[dict], output_file: Path) -> int: + """Append records to a JSONL file.""" + with open(output_file, "a", encoding="utf-8") as f: + for item in data: + f.write(json.dumps(item)) + f.write("\n") + return len(data) + + +def _append_csv(data: list[dict], output_file: Path) -> int: + """Append records to a CSV file (without repeating header).""" + if not data: + return 0 + + # Get fieldnames from existing file or from data + if _file_has_content(output_file): + # Read existing header + with open(output_file, encoding="utf-8", newline="") as f: + reader = csv.reader(f) + fieldnames = next(reader, None) + if not fieldnames: + fieldnames = list(data[0].keys()) + # Append without header + with open(output_file, "a", encoding="utf-8", newline="") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore") + for row in data: + writer.writerow(row) + else: + # New file, write with header + fieldnames = list(data[0].keys()) + with open(output_file, "w", encoding="utf-8", newline="") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore") + writer.writeheader() + for row in data: + writer.writerow(row) + return len(data) + + +def _append_json(data: list[dict], output_file: Path) -> int: + """Append records to a JSON array file (reads, extends, and rewrites the file).""" + if not data: + return 0 + + if _file_has_content(output_file): + 
# Read existing data, extend, rewrite + with open(output_file, encoding="utf-8") as f: + try: + existing = json.load(f) + if not isinstance(existing, list): + existing = [existing] + except json.JSONDecodeError: + existing = [] + existing.extend(data) + with open(output_file, "w", encoding="utf-8") as f: + json.dump(existing, f, indent=2) + f.write("\n") + else: + # New file + with open(output_file, "w", encoding="utf-8") as f: + json.dump(data, f, indent=2) + f.write("\n") + return len(data) + + +def _append_table(data: list[dict], output_file: Path) -> int: + """Append records to a table file (rewrites entire file).""" + if not data and not _file_has_content(output_file): + return 0 + + # Table format requires full rewrite - read existing, merge, rewrite + # Note: Table format is not ideal for file accumulation + if _file_has_content(output_file): + # For table format, we can't easily parse Rich tables back + # Just warn and overwrite with new data only + console.print( + "[yellow]Warning: table format cannot append to existing file. " + "Use jsonl or csv for incremental queries.[/yellow]" + ) + + # Write new data (or empty table) + formatter = get_formatter("table") + with open(output_file, "w", encoding="utf-8") as f: + formatter.format_stream(data, f) + return len(data) + + +def _write_or_append( + data: list[dict], + output_file: Path, + output_format: str, + is_incremental: bool, +) -> int: + """Write data to file, appending if incremental mode and file exists. 
+ + Args: + data: Records to write + output_file: Target file path + output_format: Format (json, jsonl, csv, table) + is_incremental: True if using markers (incremental query mode) + + Returns: + Number of records written, or -1 if no file was written (incremental, no data) + """ + # If no data in incremental mode, don't touch the file at all + # (neither create nor modify) + if not data and is_incremental: + return -1 + + # If incremental and file exists, append + if is_incremental and _file_has_content(output_file): + if output_format == "jsonl": + return _append_jsonl(data, output_file) + elif output_format == "csv": + return _append_csv(data, output_file) + elif output_format == "json": + return _append_json(data, output_file) + elif output_format == "table": + return _append_table(data, output_file) + + # Fresh query or new file - overwrite + formatter = get_formatter(output_format) + # Use newline="" for CSV to let csv module handle line endings + newline = "" if output_format == "csv" else None + with open(output_file, "w", encoding="utf-8", newline=newline) as f: + formatter.format_stream(data, f) + return len(data) + + +def _output_formatted_data( + data: list[dict], + output_format: str, + output_file: Path | None, + item_name: str = "records", +) -> None: + """Output data in the specified format to file or stdout. + + This is a common helper for commands that output formatted data + (alerts list, alerts results, etc.). 
+ + Args: + data: List of dicts to output + output_format: Format (json, jsonl, csv, table) + output_file: Optional file path, or None for stdout + item_name: Name for the items in success message (e.g., "alerts", "results") + """ + formatter = get_formatter(output_format) + + if output_file: + newline = "" if output_format == "csv" else None + with open(output_file, "w", encoding="utf-8", newline=newline) as f: + formatter.format_stream(data, f) + console.print(f"[green]Wrote {len(data)} {item_name} to {output_file}[/green]") + else: + # Write to stdout with UTF-8 encoding + if output_format == "table": + # Table format uses Rich console + stdout_console = Console(force_terminal=sys.stdout.isatty()) + formatter.format_stream(data, stdout_console.file) + else: + newline = "" if output_format == "csv" else None + stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", newline=newline) + formatter.format_stream(data, stdout) + stdout.flush() + stdout.detach() + + def execute_query_and_output( ctx: click.Context, search: str, @@ -35,6 +226,7 @@ def execute_query_and_output( no_marker: bool, api_key: str | None, host: str | None, + output_prefix: str | None = None, ) -> None: """Common query execution logic used by both 'query' and 'alerts backtest' commands. 
@@ -51,6 +243,7 @@ def execute_query_and_output( no_marker: If True, don't use or save markers api_key: Optional API key override host: Optional host override + output_prefix: Optional prefix for timestamped output files """ from .client import QueryResult @@ -58,9 +251,31 @@ def execute_query_and_output( if since_days is None: since_days = config.since_days + # Validate since_days is non-negative + if since_days is not None and since_days < 0: + raise ValueError("since-days cannot be negative") + + # Handle output_prefix: generate timestamped filename + # output_prefix mode creates new files each run, still uses markers + use_prefix_mode = output_prefix is not None + if use_prefix_mode: + output_file = _generate_timestamped_filename(output_prefix, output_format) + marker_store = MarkerStore() - # Only use markers in file mode, not stdout mode - marker = None if (no_marker or not output_file) else marker_store.get(search, index) + # Use markers in file mode (both -o and --output-prefix), not stdout mode + # Different modes have separate markers to prevent data gaps + marker_mode = "prefix" if use_prefix_mode else "file" if output_file else None + marker = None + if not no_marker and output_file: + marker = marker_store.get(search, index, marker_mode) + + # Show marker/file info before query starts + if use_prefix_mode: + console.print(f"[dim]Output file: {output_file}[/dim]") + if marker: + # Show marker info so user knows we're resuming + ts_display = marker.last_timestamp[:19] + console.print(f"[dim]Resuming from: {ts_display}[/dim]") formatter = get_formatter(output_format) @@ -112,32 +327,64 @@ async def run_query() -> QueryResult: elapsed = time.perf_counter() - start_time # Output results + is_incremental = marker is not None if output_file: - with open(output_file, "w", encoding="utf-8") as f: - formatter.format_stream(result.data, f) - console.print( - f"[green]Wrote {result.total_fetched} records to {output_file} " - f"in {elapsed:.2f}s[/green]" - ) - 
else: - # Write to stdout - use stream for proper encoding handling - # For table format, use Rich console directly to handle Unicode - if output_format == "table": - stdout_console = Console(force_terminal=sys.stdout.isatty()) - formatter.format_stream(result.data, stdout_console.file) + # In prefix mode, we always create new files (never append) + # but still use markers to only fetch new records + if use_prefix_mode: + if not result.data: + # No new records - don't create empty file + console.print(f"[dim]No new records (no file created) in {elapsed:.2f}s[/dim]") + else: + # Write to new timestamped file + formatter = get_formatter(output_format) + newline = "" if output_format == "csv" else None + with open(output_file, "w", encoding="utf-8", newline=newline) as f: + formatter.format_stream(result.data, f) + console.print( + f"[green]Wrote {len(result.data)} records to {output_file} " + f"in {elapsed:.2f}s[/green]" + ) else: - # For other formats, use UTF-8 wrapper on stdout - stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8") - formatter.format_stream(result.data, stdout) - stdout.flush() - stdout.detach() # Detach so wrapper doesn't close sys.stdout.buffer + # Standard -o mode with append support + file_existed = _file_has_content(output_file) + records_written = _write_or_append( + result.data, output_file, output_format, is_incremental + ) + if records_written == -1: + # Incremental mode with no new data - no file written/changed + if file_existed: + console.print(f"[dim]No new records (file unchanged) in {elapsed:.2f}s[/dim]") + else: + console.print( + f"[dim]No new records since last query (no file written) " + f"in {elapsed:.2f}s[/dim]" + ) + elif is_incremental and file_existed: + console.print( + f"[green]Appended {records_written} records to {output_file} " + f"in {elapsed:.2f}s[/green]" + ) + else: + console.print( + f"[green]Wrote {records_written} records to {output_file} " + f"in {elapsed:.2f}s[/green]" + ) + else: + # Write to stdout 
- use UTF-8 wrapper on Windows (cp1252 default can't handle Unicode) + # Use newline="" for CSV to let csv module handle line endings, None for others + newline = "" if output_format == "csv" else None + stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", newline=newline) + formatter.format_stream(result.data, stdout) + stdout.flush() + stdout.detach() # Detach so wrapper doesn't close sys.stdout.buffer console.print( f"\n[dim]{result.total_fetched} records in {elapsed:.2f}s[/dim]", highlight=False ) # Save marker for next incremental query (only in file mode, not stdout) if output_file and not no_marker and result.last_uuid and result.last_timestamp: - marker_store.save(search, index, result.last_timestamp, result.last_uuid) + marker_store.save(search, index, result.last_timestamp, result.last_uuid, marker_mode) if ctx.obj.get("verbose"): console.print("[dim]Saved marker for next incremental query[/dim]") @@ -153,6 +400,7 @@ def execute_streaming_query( no_marker: bool, api_key: str | None, host: str | None, + output_prefix: str | None = None, ) -> None: """Execute a streaming query, outputting results as they arrive. 
@@ -170,24 +418,53 @@ def execute_streaming_query( no_marker: If True, don't use or save markers api_key: Optional API key override host: Optional host override + output_prefix: Optional prefix for timestamped output files """ - import csv - import json - config = Config.load(api_key=api_key, host=host) if since_days is None: since_days = config.since_days + # Validate since_days is non-negative + if since_days is not None and since_days < 0: + raise ValueError("since-days cannot be negative") + # --stream implies jsonl format unless explicitly specified if output_format is None: output_format = "jsonl" + # Handle output_prefix: generate timestamped filename + use_prefix_mode = output_prefix is not None + if use_prefix_mode: + output_file = _generate_timestamped_filename(output_prefix, output_format) + marker_store = MarkerStore() - # Only use markers in file mode, not stdout mode - marker = None if (no_marker or not output_file) else marker_store.get(search, index) + # Use markers in file mode (both -o and --output-prefix), not stdout mode + # Different modes have separate markers to prevent data gaps + marker_mode = "prefix" if use_prefix_mode else "file" if output_file else None + marker = None + if not no_marker and output_file: + marker = marker_store.get(search, index, marker_mode) + is_incremental = marker is not None + + # Show marker/file info before query starts + if use_prefix_mode: + console.print(f"[dim]Output file: {output_file}[/dim]") + if marker: + # Show marker info so user knows we're resuming + ts_display = marker.last_timestamp[:19] + console.print(f"[dim]Resuming from: {ts_display}[/dim]") timestamp_field = f"{index}_timestamp" + # Check if file exists before we start (for append detection) + # In prefix mode, file_existed is always False since we just generated a new filename + file_existed = (not use_prefix_mode) and output_file and _file_has_content(output_file) + + # Determine if we can truly stream or need to buffer + # json and table 
formats require buffering; jsonl and csv can truly stream + # In prefix mode, we also buffer to avoid creating empty files when there's no data + needs_buffering = output_format in ("json", "table") or use_prefix_mode + # Table format requires buffering for column width calculation if output_format == "table": console.print( @@ -195,27 +472,47 @@ def execute_streaming_query( "Use --format csv or jsonl for true streaming.[/yellow]" ) - async def stream_results() -> tuple[int, str | None, str | None, bool]: - """Async inner function for streaming with responsive interrupt handling.""" + async def stream_results() -> tuple[int, str | None, str | None, bool, list[dict]]: + """Async inner function for streaming with responsive interrupt handling. + + Returns: (count, last_uuid, last_timestamp, interrupted, buffered_data) + buffered_data is only populated for json/table formats or when we need to merge. + """ count = 0 last_uuid = None last_timestamp = None interrupted = False + buffered_data: list[dict] = [] client = CetusClient.from_config(config) - # Set up output destination - if output_file: - out_file = open(output_file, "w", encoding="utf-8", newline="") - else: - out_file = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", newline="") + # For formats that need buffering (json, table), we buffer all data + # jsonl and csv can truly stream, even in append mode + buffer_all = needs_buffering + # Set up output destination for streaming formats + out_file = None csv_writer = None - table_buffer = [] # Only used for table format + csv_fieldnames = None + + if not buffer_all: + if output_file: + # For incremental jsonl/csv with existing file, use append mode + if is_incremental and file_existed: + if output_format == "csv": + # Read existing header for CSV append + with open(output_file, encoding="utf-8", newline="") as f: + reader = csv.reader(f) + csv_fieldnames = next(reader, None) + out_file = open(output_file, "a", encoding="utf-8", newline="") + else: + 
out_file = open(output_file, "w", encoding="utf-8", newline="") + else: + out_file = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", newline="") try: - if output_format == "json": - # JSON array format - stream but need wrapper + if not buffer_all and output_format == "json": + # JSON array format - stream but need wrapper (fresh file only) out_file.write("[\n") first = True else: @@ -235,7 +532,9 @@ async def stream_results() -> tuple[int, str | None, str | None, bool]: last_uuid = record.get("uuid") last_timestamp = record.get(timestamp_field) - if output_format == "jsonl": + if buffer_all: + buffered_data.append(record) + elif output_format == "jsonl": out_file.write(json.dumps(record) + "\n") out_file.flush() elif output_format == "json": @@ -246,26 +545,24 @@ async def stream_results() -> tuple[int, str | None, str | None, bool]: elif output_format == "csv": # Initialize CSV writer with headers from first record if csv_writer is None: - fieldnames = list(record.keys()) + if csv_fieldnames is None: + csv_fieldnames = list(record.keys()) + # Write header only for new files + if not (is_incremental and file_existed): + temp_writer = csv.DictWriter( + out_file, fieldnames=csv_fieldnames, extrasaction="ignore" + ) + temp_writer.writeheader() + out_file.flush() csv_writer = csv.DictWriter( - out_file, fieldnames=fieldnames, extrasaction="ignore" + out_file, fieldnames=csv_fieldnames, extrasaction="ignore" ) - csv_writer.writeheader() - out_file.flush() csv_writer.writerow(record) out_file.flush() - elif output_format == "table": - # Buffer for table format - table_buffer.append(record) - if output_format == "json": + if not buffer_all and output_format == "json": out_file.write("\n]\n") - # Handle table format - output buffered data - if output_format == "table" and table_buffer: - formatter = get_formatter("table") - formatter.format_stream(table_buffer, out_file) - except asyncio.CancelledError: interrupted = True 
console.print("\n[yellow]Interrupted[/yellow]") @@ -273,27 +570,81 @@ async def stream_results() -> tuple[int, str | None, str | None, bool]: interrupted = True console.print("\n[yellow]Interrupted[/yellow]") finally: - if output_file: - out_file.close() - else: - out_file.flush() - out_file.detach() # Detach so wrapper doesn't close sys.stdout.buffer + if out_file is not None: + if output_file: + out_file.close() + else: + out_file.flush() + out_file.detach() # Detach so wrapper doesn't close sys.stdout.buffer client.close() - return count, last_uuid, last_timestamp, interrupted + return count, last_uuid, last_timestamp, interrupted, buffered_data # Run the async streaming function start_time = time.perf_counter() try: - count, last_uuid, last_timestamp, interrupted = asyncio.run(stream_results()) + count, last_uuid, last_timestamp, interrupted, buffered_data = asyncio.run(stream_results()) except KeyboardInterrupt: console.print("\n[yellow]Interrupted[/yellow]") sys.exit(130) elapsed = time.perf_counter() - start_time + # Handle buffered data (for json/table formats) + if buffered_data and output_file: + if use_prefix_mode: + # In prefix mode, always create new file (never append) + formatter = get_formatter(output_format) + newline = "" if output_format == "csv" else None + with open(output_file, "w", encoding="utf-8", newline=newline) as f: + formatter.format_stream(buffered_data, f) + else: + _write_or_append(buffered_data, output_file, output_format, is_incremental) + elif buffered_data and not output_file: + # Stdout with buffered format - use UTF-8 wrapper for all formats + formatter = get_formatter(output_format) + newline = "" if output_format == "csv" else None + stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", newline=newline) + formatter.format_stream(buffered_data, stdout) + stdout.flush() + stdout.detach() + + # Clean up empty files created in incremental mode with no results + # (streaming formats open the file before knowing if there 
will be data) + created_empty_file = ( + count == 0 + and is_incremental + and not file_existed + and output_file + and output_file.exists() + and output_file.stat().st_size == 0 + ) + if created_empty_file: + output_file.unlink() + # Report results if output_file: - console.print(f"[green]Streamed {count} records to {output_file} in {elapsed:.2f}s[/green]") + if use_prefix_mode: + if count == 0: + console.print(f"[dim]No new records (no file created) in {elapsed:.2f}s[/dim]") + else: + console.print( + f"[green]Streamed {count} records to {output_file} in {elapsed:.2f}s[/green]" + ) + elif count == 0 and is_incremental and not file_existed: + # First incremental run with no results - no file written + console.print( + f"[dim]No new records since last query (no file written) in {elapsed:.2f}s[/dim]" + ) + elif count == 0 and is_incremental and file_existed: + console.print(f"[dim]No new records (file unchanged) in {elapsed:.2f}s[/dim]") + elif is_incremental and file_existed: + console.print( + f"[green]Appended {count} records to {output_file} in {elapsed:.2f}s[/green]" + ) + else: + console.print( + f"[green]Streamed {count} records to {output_file} in {elapsed:.2f}s[/green]" + ) elif not interrupted: console.print(f"\n[dim]Streamed {count} records in {elapsed:.2f}s[/dim]", highlight=False) @@ -302,7 +653,7 @@ async def stream_results() -> tuple[int, str | None, str | None, bool]: # Save marker for next incremental query (only in file mode, not stdout) if output_file and not no_marker and last_uuid and last_timestamp: - marker_store.save(search, index, last_timestamp, last_uuid) + marker_store.save(search, index, last_timestamp, last_uuid, marker_mode) if ctx.obj.get("verbose"): console.print("[dim]Saved marker for next incremental query[/dim]") @@ -374,7 +725,14 @@ def main(ctx: click.Context, verbose: bool, version: bool) -> None: "-o", "output_file", type=click.Path(dir_okay=False, writable=True, path_type=Path), - help="Write output to file instead of 
stdout", + help="Collector mode: write to file (enables incremental markers)", +) +@click.option( + "--output-prefix", + "-p", + "output_prefix", + type=str, + help="Collector mode: timestamped files (e.g., -p results -> results_.jsonl)", ) @click.option( "--since-days", @@ -411,6 +769,7 @@ def query( media: str, output_format: str | None, output_file: Path | None, + output_prefix: str | None, since_days: int | None, no_marker: bool, stream: bool, @@ -426,16 +785,22 @@ def query( A:192.168.1.1 # DNS A record lookup host:example.com AND A:* # Combined conditions - By default, results are written to stdout as JSON. Use --output to - write to a file, or --format to change the output format. + \b + OPERATING MODES: + Direct mode (default): Results to stdout, no state tracking. + Collector mode (-o/-p): Results to file with incremental markers. - Incremental queries are supported via markers. On first run, data - from the last 7 days is fetched. Subsequent runs fetch only new - data since the last query. Use --no-marker to disable this behavior. + In collector mode, markers track your position so subsequent runs + fetch only new records. First run fetches the last 7 days (or + --since-days). Use --no-marker for a full re-query. - Use --stream for large queries to see results as they arrive rather - than waiting for all data to be fetched. Streaming defaults to jsonl format. + Use --stream for large queries to see results as they arrive. 
""" + # Validate mutually exclusive options + if output_file and output_prefix: + console.print("[red]Error:[/red] --output and --output-prefix are mutually exclusive") + sys.exit(1) + try: if stream: # --stream implies jsonl unless format explicitly specified @@ -450,6 +815,7 @@ def query( no_marker=no_marker, api_key=api_key, host=host, + output_prefix=output_prefix, ) else: # Default to json for non-streaming @@ -464,10 +830,17 @@ def query( no_marker=no_marker, api_key=api_key, host=host, + output_prefix=output_prefix, ) + except ValueError as e: + console.print(f"[red]Error:[/red] {e}") + sys.exit(1) except CetusError as e: console.print(f"[red]Error:[/red] {e}") sys.exit(1) + except OSError as e: + console.print(f"[red]Error:[/red] Cannot write to output file: {e}") + sys.exit(1) except KeyboardInterrupt: console.print("\n[yellow]Interrupted[/yellow]") sys.exit(130) @@ -515,7 +888,10 @@ def config_set(key: str, value: str) -> None: elif key == "timeout": cfg.timeout = int(value) elif key == "since-days": - cfg.since_days = int(value) + days = int(value) + if days < 0: + raise ValueError("since-days cannot be negative") + cfg.since_days = days cfg.save() console.print(f"[green]Set {key} successfully[/green]") @@ -596,12 +972,29 @@ def alerts() -> None: type=click.Choice(["raw", "terms", "structured"]), help="Filter by alert type", ) +@click.option( + "--format", + "-f", + "output_format", + type=click.Choice(["json", "jsonl", "csv", "table"]), + default="table", + help="Output format (default: table)", +) +@click.option( + "--output", + "-o", + "output_file", + type=click.Path(dir_okay=False, writable=True, path_type=Path), + help="Write output to file instead of stdout", +) @click.option("--api-key", envvar="CETUS_API_KEY", help="API key") @click.option("--host", envvar="CETUS_HOST", help="API host") def alerts_list( owned: bool, shared: bool, alert_type: str | None, + output_format: str, + output_file: Path | None, api_key: str | None, host: str | None, ) -> 
None: @@ -612,10 +1005,12 @@ def alerts_list( \b Examples: - cetus alerts list # Your alerts - cetus alerts list --shared # Your alerts + shared - cetus alerts list --no-owned --shared # Only shared alerts - cetus alerts list --type raw # Only raw query alerts + cetus alerts list # Your alerts (table) + cetus alerts list --shared # Your alerts + shared + cetus alerts list --no-owned --shared # Only shared alerts + cetus alerts list --type raw # Only raw query alerts + cetus alerts list --format json # JSON output + cetus alerts list -f csv -o alerts.csv # Export to CSV """ try: config = Config.load(api_key=api_key, host=host) @@ -633,37 +1028,59 @@ def alerts_list( console.print("[dim]No alerts found[/dim]") return - from rich.table import Table - - table = Table(show_header=True, header_style="bold cyan") - table.add_column("ID", style="dim") - table.add_column("Type") - table.add_column("Title") - table.add_column("Description") - table.add_column("Owner/Shared By") - - type_colors = {"raw": "green", "terms": "blue", "structured": "cyan"} - - for alert in alerts_data: - type_color = type_colors.get(alert.alert_type, "white") - owner_col = "You" if alert.owned else f"[dim]{alert.shared_by}[/dim]" - desc = alert.description - if len(desc) > 40: - desc = desc[:40] + "..." 
- table.add_row( - str(alert.id), - f"[{type_color}]{alert.alert_type}[/{type_color}]", - alert.title, - desc, - owner_col, - ) - - console.print(table) - console.print(f"\n[dim]Total: {len(alerts_data)} alert(s)[/dim]") + # Convert Alert objects to dicts for output + alerts_as_dicts = [ + { + "id": alert.id, + "type": alert.alert_type, + "title": alert.title, + "description": alert.description, + "owned": alert.owned, + "shared_by": alert.shared_by, + "query_preview": alert.query_preview, + } + for alert in alerts_data + ] + + if output_format == "table" and not output_file: + # Special handling for table to stdout - use Rich table with colors + from rich.table import Table + + table = Table(show_header=True, header_style="bold cyan") + table.add_column("ID", style="dim") + table.add_column("Type") + table.add_column("Title") + table.add_column("Description") + table.add_column("Owner/Shared By") + + type_colors = {"raw": "green", "terms": "blue", "structured": "cyan"} + + for alert in alerts_data: + type_color = type_colors.get(alert.alert_type, "white") + owner_col = "You" if alert.owned else f"[dim]{alert.shared_by}[/dim]" + desc = alert.description + if len(desc) > 40: + desc = desc[:40] + "..." 
+ table.add_row( + str(alert.id), + f"[{type_color}]{alert.alert_type}[/{type_color}]", + alert.title, + desc, + owner_col, + ) + + console.print(table) + console.print(f"\n[dim]Total: {len(alerts_data)} alert(s)[/dim]") + else: + # Use common helper for all other cases + _output_formatted_data(alerts_as_dicts, output_format, output_file, "alerts") except CetusError as e: console.print(f"[red]Error:[/red] {e}") sys.exit(1) + except OSError as e: + console.print(f"[red]Error:[/red] Cannot write to output file: {e}") + sys.exit(1) @alerts.command("results") @@ -711,7 +1128,6 @@ def alerts_results( """ try: config = Config.load(api_key=api_key, host=host) - formatter = get_formatter(output_format) with CetusClient.from_config(config) as client: with Progress( @@ -727,25 +1143,15 @@ def alerts_results( console.print("[dim]No results found for this alert[/dim]") return - # Output results - if output_file: - with open(output_file, "w", encoding="utf-8") as f: - formatter.format_stream(results, f) - console.print(f"[green]Wrote {len(results)} results to {output_file}[/green]") - else: - # Write to stdout - if output_format == "table": - stdout_console = Console(force_terminal=sys.stdout.isatty()) - formatter.format_stream(results, stdout_console.file) - else: - stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8") - formatter.format_stream(results, stdout) - stdout.flush() - stdout.detach() # Detach so wrapper doesn't close sys.stdout.buffer + # Output results using common helper + _output_formatted_data(results, output_format, output_file, "results") except CetusError as e: console.print(f"[red]Error:[/red] {e}") sys.exit(1) + except OSError as e: + console.print(f"[red]Error:[/red] Cannot write to output file: {e}") + sys.exit(1) @alerts.command("backtest") @@ -779,6 +1185,13 @@ def alerts_results( type=click.Path(dir_okay=False, writable=True, path_type=Path), help="Write output to file instead of stdout", ) +@click.option( + "--output-prefix", + "-p", + 
"output_prefix", + type=str, + help="Timestamped files (e.g., -p results -> results_.jsonl)", +) @click.option( "--since-days", "-d", @@ -806,6 +1219,7 @@ def alerts_backtest( media: str, output_format: str | None, output_file: Path | None, + output_prefix: str | None, since_days: int | None, no_marker: bool, stream: bool, @@ -826,8 +1240,14 @@ def alerts_backtest( cetus alerts backtest 123 --index dns cetus alerts backtest 123 --format table cetus alerts backtest 123 -o results.json --since-days 30 + cetus alerts backtest 123 -p results --since-days 30 cetus alerts backtest 123 --stream """ + # Validate mutually exclusive options + if output_file and output_prefix: + console.print("[red]Error:[/red] --output and --output-prefix are mutually exclusive") + sys.exit(1) + try: config = Config.load(api_key=api_key, host=host) @@ -868,6 +1288,7 @@ def alerts_backtest( no_marker=no_marker, api_key=api_key, host=host, + output_prefix=output_prefix, ) else: # Default to json for non-streaming @@ -882,11 +1303,18 @@ def alerts_backtest( no_marker=no_marker, api_key=api_key, host=host, + output_prefix=output_prefix, ) + except ValueError as e: + console.print(f"[red]Error:[/red] {e}") + sys.exit(1) except CetusError as e: console.print(f"[red]Error:[/red] {e}") sys.exit(1) + except OSError as e: + console.print(f"[red]Error:[/red] Cannot write to output file: {e}") + sys.exit(1) except KeyboardInterrupt: console.print("\n[yellow]Interrupted[/yellow]") sys.exit(130) diff --git a/src/cetus/client.py b/src/cetus/client.py index fbf836d..844b228 100644 --- a/src/cetus/client.py +++ b/src/cetus/client.py @@ -192,13 +192,29 @@ def __enter__(self) -> CetusClient: def __exit__(self, *args) -> None: self.close() + def _is_dsl_query(self, query: str) -> bool: + """Check if a query is Elasticsearch DSL (JSON) rather than Lucene. + + DSL queries start with '{' and are valid JSON objects. 
+ """ + import json + + stripped = query.strip() + if not stripped.startswith("{"): + return False + try: + parsed = json.loads(stripped) + return isinstance(parsed, dict) + except (json.JSONDecodeError, ValueError): + return False + def _build_time_filter( self, index: Index, since_days: int | None, marker: Marker | None, ) -> str: - """Build the timestamp filter suffix for the query.""" + """Build the timestamp filter suffix for Lucene queries.""" timestamp_field = f"{index}_timestamp" if marker: @@ -211,10 +227,60 @@ def _build_time_filter( else: return "" + def _build_full_query( + self, + search: str, + index: Index, + since_days: int | None, + marker: Marker | None, + ) -> str: + """Build the full query with time filter. + + For Lucene queries: wraps with parentheses and appends time filter. + For DSL queries: wraps the DSL in a bool query with time filter. + """ + import json + + timestamp_field = f"{index}_timestamp" + + if self._is_dsl_query(search): + # DSL query - need to wrap in bool with time filter + parsed_query = json.loads(search.strip()) + + # If the query has a top-level "query" key (full ES query format), + # extract the inner query body. ES expects just the query body. 
+ if "query" in parsed_query and len(parsed_query) == 1: + parsed_query = parsed_query["query"] + + # Determine the time constraint + if marker: + time_value = marker.last_timestamp + elif since_days: + since_date = (datetime.today() - timedelta(days=since_days)).replace(microsecond=0) + time_value = since_date.isoformat() + else: + # No time filter needed, return unwrapped query + return json.dumps(parsed_query) + + # Build a DSL query with both the original query and time filter + wrapped_query = { + "bool": { + "must": [parsed_query], + "filter": [{"range": {timestamp_field: {"gte": time_value}}}], + } + } + return json.dumps(wrapped_query) + else: + # Lucene query - use string concatenation + time_filter = self._build_time_filter(index, since_days, marker) + return f"({search}){time_filter}" + def _handle_error_response(self, response: httpx.Response) -> None: """Handle error responses, sanitizing error messages. Raises appropriate exceptions for error status codes. + For 400 errors (bad request), attempts to extract the error detail + from the response to provide helpful feedback. 
""" if response.status_code == 401: raise AuthenticationError( @@ -223,6 +289,18 @@ def _handle_error_response(self, response: httpx.Response) -> None: ) elif response.status_code == 403: raise AuthenticationError("Access denied - check your permissions") + elif response.status_code == 400: + # For 400 errors, try to extract the error detail from JSON response + # DRF returns {"detail": "error message"} for ParseError + logger.debug("API error response: %s", response.text[:500]) + try: + error_data = response.json() + detail = error_data.get("detail", "") + if detail: + raise APIError(detail, status_code=400) + except (ValueError, KeyError): + pass + raise APIError("Bad request", status_code=400) elif response.status_code >= 400: # Log full error for debugging, but don't expose to user logger.debug("API error response: %s", response.text[:500]) @@ -325,8 +403,7 @@ def query( timestamp_field = f"{index}_timestamp" # Build initial query with time filter (only needed for first request) - time_filter = self._build_time_filter(index, since_days, marker) - full_query = f"({search}){time_filter}" + full_query = self._build_full_query(search, index, since_days, marker) while True: response = self._fetch_page(full_query, index, media, pit_id, search_after) @@ -492,8 +569,7 @@ async def query_async( timestamp_field = f"{index}_timestamp" # Build initial query with time filter (only needed for first request) - time_filter = self._build_time_filter(index, since_days, marker) - full_query = f"({search}){time_filter}" + full_query = self._build_full_query(search, index, since_days, marker) async with httpx.AsyncClient(timeout=self.timeout) as client: while True: @@ -574,8 +650,7 @@ def query_iter( search_after: list | None = None marker_uuid = marker.last_uuid if marker else None - time_filter = self._build_time_filter(index, since_days, marker) - full_query = f"({search}){time_filter}" + full_query = self._build_full_query(search, index, since_days, marker) while True: response 
def get_data_dir() -> Path:
    """Get the platform-appropriate data directory (for markers).

    Resolution order:

    1. The ``CETUS_DATA_DIR`` environment variable, when set to a non-empty
       value. A leading ``~`` is expanded to the user's home directory.
    2. The per-user data directory reported by ``platformdirs`` for
       ``APP_NAME``.

    Returns:
        The directory path. This function does not create the directory.
    """
    if env_dir := os.environ.get("CETUS_DATA_DIR"):
        # Values coming from env files, service units or quoted shell
        # assignments are not tilde-expanded by the shell; without
        # expanduser() a literal "~" directory would be used.
        return Path(env_dir).expanduser()
    return Path(platformdirs.user_data_dir(APP_NAME))
+ + Args: + query: The search query string + index: The index being queried (dns, certstream, alerting) + mode: Optional output mode ("file" or "prefix") to differentiate markers """ content = f"{index}:{query}" + if mode: + content = f"{content}:{mode}" return hashlib.sha256(content.encode()).hexdigest()[:32] @@ -87,17 +94,22 @@ class MarkerStore: def __init__(self, markers_dir: Path | None = None): self.markers_dir = markers_dir or get_markers_dir() - def _marker_path(self, query: str, index: str) -> Path: + def _marker_path(self, query: str, index: str, mode: str | None = None) -> Path: """Get the file path for a specific marker.""" - hash_id = _query_hash(query, index) + hash_id = _query_hash(query, index, mode) return self.markers_dir / f"{index}_{hash_id}.json" - def get(self, query: str, index: str) -> Marker | None: + def get(self, query: str, index: str, mode: str | None = None) -> Marker | None: """Retrieve a marker for the given query and index. + Args: + query: The search query string + index: The index being queried + mode: Output mode ("file" or "prefix") - different modes have separate markers + Validates file size before reading to prevent memory exhaustion. """ - path = self._marker_path(query, index) + path = self._marker_path(query, index, mode) if not path.exists(): return None @@ -114,9 +126,18 @@ def get(self, query: str, index: str) -> Marker | None: # Corrupted marker file, treat as missing return None - def save(self, query: str, index: str, last_timestamp: str, last_uuid: str) -> Marker: + def save( + self, query: str, index: str, last_timestamp: str, last_uuid: str, mode: str | None = None + ) -> Marker: """Save or update a marker. 
+ Args: + query: The search query string + index: The index being queried + last_timestamp: Timestamp of the last record + last_uuid: UUID of the last record + mode: Output mode ("file" or "prefix") - different modes have separate markers + The marker file is created with secure permissions (0o600 on Unix) to protect query patterns from other users on the system. """ @@ -130,14 +151,14 @@ def save(self, query: str, index: str, last_timestamp: str, last_uuid: str) -> M updated_at=datetime.now().isoformat(), ) - path = self._marker_path(query, index) + path = self._marker_path(query, index, mode) path.write_text(json.dumps(marker.to_dict(), indent=2)) _set_secure_permissions(path) return marker - def delete(self, query: str, index: str) -> bool: + def delete(self, query: str, index: str, mode: str | None = None) -> bool: """Delete a marker. Returns True if it existed.""" - path = self._marker_path(query, index) + path = self._marker_path(query, index, mode) if path.exists(): path.unlink() return True diff --git a/tests/test_async.py b/tests/test_async.py index b8455e3..5354c70 100644 --- a/tests/test_async.py +++ b/tests/test_async.py @@ -5,8 +5,7 @@ import pytest from cetus.client import CetusClient, QueryResult -from cetus.exceptions import APIError, AuthenticationError, ConnectionError -from cetus.markers import Marker +from cetus.exceptions import AuthenticationError class TestQueryAsync: @@ -23,9 +22,7 @@ async def test_query_async_returns_query_result(self, client: CetusClient, httpx method="POST", url="http://localhost/api/query/", json={ - "data": [ - {"uuid": "1", "host": "a.com", "dns_timestamp": "2025-01-01T00:00:00Z"} - ], + "data": [{"uuid": "1", "host": "a.com", "dns_timestamp": "2025-01-01T00:00:00Z"}], "has_more": False, "pit_id": "pit123", }, @@ -149,9 +146,7 @@ async def test_fetch_page_async_makes_request(self, client: CetusClient, httpx_m ) async with httpx_lib.AsyncClient(timeout=60) as async_client: - result = await client._fetch_page_async( - 
async_client, "host:*", "dns", "nvme" - ) + result = await client._fetch_page_async(async_client, "host:*", "dns", "nvme") assert result == {"data": [], "has_more": False} client.close() @@ -193,9 +188,7 @@ async def test_fetch_page_async_rate_limit_retry(self, client: CetusClient, http ) async with httpx_lib.AsyncClient(timeout=60) as async_client: - result = await client._fetch_page_async( - async_client, "host:*", "dns", "nvme" - ) + result = await client._fetch_page_async(async_client, "host:*", "dns", "nvme") assert result["data"] == [{"id": 1}] assert len(httpx_mock.get_requests()) == 2 @@ -293,7 +286,9 @@ def test_sync_client_includes_user_agent(self, client: CetusClient, httpx_mock): request = httpx_mock.get_requests()[0] assert "User-Agent" in request.headers user_agent = request.headers["User-Agent"] - assert "cetus-client/0.0.1" in user_agent + from cetus import __version__ + + assert f"cetus-client/{__version__}" in user_agent assert "Python" in user_agent client.close() diff --git a/tests/test_cli.py b/tests/test_cli.py index 08e998c..c71398b 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -216,7 +216,9 @@ def mock_query_result(self) -> QueryResult: return QueryResult( data=[ { - "uuid": "1", "host": "example.com", "A": "1.1.1.1", + "uuid": "1", + "host": "example.com", + "A": "1.1.1.1", "dns_timestamp": "2025-01-01T00:00:00Z", } ], @@ -439,9 +441,7 @@ def test_invalid_media(self, runner: CliRunner): class TestCLIIntegration: """Integration tests for CLI commands.""" - def test_full_workflow_config_then_query( - self, runner: CliRunner, tmp_path: Path - ): + def test_full_workflow_config_then_query(self, runner: CliRunner, tmp_path: Path): """Test setting config then using it in query.""" config_dir = tmp_path / "config" config_dir.mkdir() @@ -494,3 +494,649 @@ def test_markers_workflow(self, runner: CliRunner, tmp_path: Path): # Empty again result = runner.invoke(main, ["markers", "list"]) assert "No markers" in result.output + + +class 
TestIncrementalQueryAppend: + """Tests for incremental query file append behavior.""" + + @pytest.fixture + def mock_query_result_batch1(self) -> QueryResult: + """First batch of query results.""" + return QueryResult( + data=[ + { + "uuid": "1", + "host": "a.example.com", + "A": "1.1.1.1", + "dns_timestamp": "2025-01-01T00:00:00Z", + }, + { + "uuid": "2", + "host": "b.example.com", + "A": "2.2.2.2", + "dns_timestamp": "2025-01-01T01:00:00Z", + }, + ], + total_fetched=2, + last_uuid="2", + last_timestamp="2025-01-01T01:00:00Z", + pages_fetched=1, + ) + + @pytest.fixture + def mock_query_result_batch2(self) -> QueryResult: + """Second batch of query results (new records).""" + return QueryResult( + data=[ + { + "uuid": "3", + "host": "c.example.com", + "A": "3.3.3.3", + "dns_timestamp": "2025-01-02T00:00:00Z", + }, + ], + total_fetched=1, + last_uuid="3", + last_timestamp="2025-01-02T00:00:00Z", + pages_fetched=1, + ) + + @pytest.fixture + def mock_query_result_empty(self) -> QueryResult: + """Empty result (no new records).""" + return QueryResult( + data=[], + total_fetched=0, + last_uuid=None, + last_timestamp=None, + pages_fetched=1, + ) + + def test_incremental_jsonl_preserves_file_on_zero_results( + self, + runner: CliRunner, + tmp_path: Path, + mock_query_result_batch1: QueryResult, + mock_query_result_empty: QueryResult, + ): + """When incremental query returns 0 records, existing file should be unchanged.""" + config_dir = tmp_path / "config" + config_dir.mkdir() + data_dir = tmp_path / "data" + data_dir.mkdir() + output_file = tmp_path / "results.jsonl" + + async def mock_query_async_batch1(*args, **kwargs): + return mock_query_result_batch1 + + async def mock_query_async_empty(*args, **kwargs): + return mock_query_result_empty + + with ( + patch("cetus.config.get_config_dir", return_value=config_dir), + patch("cetus.config.get_data_dir", return_value=data_dir), + ): + # First run - write initial data + with patch("cetus.client.CetusClient.query_async", 
mock_query_async_batch1): + result = runner.invoke( + main, + [ + "query", + "host:*", + "-o", + str(output_file), + "--format", + "jsonl", + "--api-key", + "test-key", + ], + ) + assert result.exit_code == 0 + + # Verify file has initial data + initial_content = output_file.read_text() + assert '"uuid": "1"' in initial_content + assert '"uuid": "2"' in initial_content + lines_before = len(initial_content.strip().split("\n")) + assert lines_before == 2 + + # Second run with 0 results - file should be unchanged + with patch("cetus.client.CetusClient.query_async", mock_query_async_empty): + result = runner.invoke( + main, + [ + "query", + "host:*", + "-o", + str(output_file), + "--format", + "jsonl", + "--api-key", + "test-key", + ], + ) + assert result.exit_code == 0 + assert "No new records" in result.output or "unchanged" in result.output + + # Verify file is unchanged + final_content = output_file.read_text() + assert final_content == initial_content + + def test_incremental_jsonl_appends_new_records( + self, + runner: CliRunner, + tmp_path: Path, + mock_query_result_batch1: QueryResult, + mock_query_result_batch2: QueryResult, + ): + """When incremental query returns new records, they should be appended.""" + config_dir = tmp_path / "config" + config_dir.mkdir() + data_dir = tmp_path / "data" + data_dir.mkdir() + output_file = tmp_path / "results.jsonl" + + async def mock_query_async_batch1(*args, **kwargs): + return mock_query_result_batch1 + + async def mock_query_async_batch2(*args, **kwargs): + return mock_query_result_batch2 + + with ( + patch("cetus.config.get_config_dir", return_value=config_dir), + patch("cetus.config.get_data_dir", return_value=data_dir), + ): + # First run + with patch("cetus.client.CetusClient.query_async", mock_query_async_batch1): + result = runner.invoke( + main, + [ + "query", + "host:*", + "-o", + str(output_file), + "--format", + "jsonl", + "--api-key", + "test-key", + ], + ) + assert result.exit_code == 0 + + # Second run with 
new data + with patch("cetus.client.CetusClient.query_async", mock_query_async_batch2): + result = runner.invoke( + main, + [ + "query", + "host:*", + "-o", + str(output_file), + "--format", + "jsonl", + "--api-key", + "test-key", + ], + ) + assert result.exit_code == 0 + assert "Appended" in result.output + + # Verify all 3 records are in file + final_content = output_file.read_text() + assert '"uuid": "1"' in final_content + assert '"uuid": "2"' in final_content + assert '"uuid": "3"' in final_content + lines = final_content.strip().split("\n") + assert len(lines) == 3 + + def test_incremental_csv_appends_without_repeating_header( + self, + runner: CliRunner, + tmp_path: Path, + mock_query_result_batch1: QueryResult, + mock_query_result_batch2: QueryResult, + ): + """CSV append should not repeat the header row.""" + config_dir = tmp_path / "config" + config_dir.mkdir() + data_dir = tmp_path / "data" + data_dir.mkdir() + output_file = tmp_path / "results.csv" + + async def mock_query_async_batch1(*args, **kwargs): + return mock_query_result_batch1 + + async def mock_query_async_batch2(*args, **kwargs): + return mock_query_result_batch2 + + with ( + patch("cetus.config.get_config_dir", return_value=config_dir), + patch("cetus.config.get_data_dir", return_value=data_dir), + ): + # First run + with patch("cetus.client.CetusClient.query_async", mock_query_async_batch1): + result = runner.invoke( + main, + [ + "query", + "host:*", + "-o", + str(output_file), + "--format", + "csv", + "--api-key", + "test-key", + ], + ) + assert result.exit_code == 0 + + # Second run + with patch("cetus.client.CetusClient.query_async", mock_query_async_batch2): + result = runner.invoke( + main, + [ + "query", + "host:*", + "-o", + str(output_file), + "--format", + "csv", + "--api-key", + "test-key", + ], + ) + assert result.exit_code == 0 + + # Verify only one header row + final_content = output_file.read_text() + lines = final_content.strip().split("\n") + # 1 header + 3 data rows + 
assert len(lines) == 4 + # First line should be header + assert lines[0].startswith("uuid,") + # Verify all 3 uuids are present in data rows + assert "1,a.example.com" in final_content + assert "2,b.example.com" in final_content + assert "3,c.example.com" in final_content + + def test_incremental_json_merges_arrays( + self, + runner: CliRunner, + tmp_path: Path, + mock_query_result_batch1: QueryResult, + mock_query_result_batch2: QueryResult, + ): + """JSON format should merge new records into existing array.""" + config_dir = tmp_path / "config" + config_dir.mkdir() + data_dir = tmp_path / "data" + data_dir.mkdir() + output_file = tmp_path / "results.json" + + async def mock_query_async_batch1(*args, **kwargs): + return mock_query_result_batch1 + + async def mock_query_async_batch2(*args, **kwargs): + return mock_query_result_batch2 + + with ( + patch("cetus.config.get_config_dir", return_value=config_dir), + patch("cetus.config.get_data_dir", return_value=data_dir), + ): + # First run + with patch("cetus.client.CetusClient.query_async", mock_query_async_batch1): + result = runner.invoke( + main, + [ + "query", + "host:*", + "-o", + str(output_file), + "--format", + "json", + "--api-key", + "test-key", + ], + ) + assert result.exit_code == 0 + + # Verify initial state + initial_data = json.loads(output_file.read_text()) + assert len(initial_data) == 2 + + # Second run + with patch("cetus.client.CetusClient.query_async", mock_query_async_batch2): + result = runner.invoke( + main, + [ + "query", + "host:*", + "-o", + str(output_file), + "--format", + "json", + "--api-key", + "test-key", + ], + ) + assert result.exit_code == 0 + + # Verify merged array + final_data = json.loads(output_file.read_text()) + assert len(final_data) == 3 + uuids = [r["uuid"] for r in final_data] + assert "1" in uuids + assert "2" in uuids + assert "3" in uuids + + def test_no_marker_flag_overwrites_file( + self, + runner: CliRunner, + tmp_path: Path, + mock_query_result_batch1: 
QueryResult, + mock_query_result_batch2: QueryResult, + ): + """With --no-marker, file should be overwritten not appended.""" + config_dir = tmp_path / "config" + config_dir.mkdir() + data_dir = tmp_path / "data" + data_dir.mkdir() + output_file = tmp_path / "results.jsonl" + + async def mock_query_async_batch1(*args, **kwargs): + return mock_query_result_batch1 + + async def mock_query_async_batch2(*args, **kwargs): + return mock_query_result_batch2 + + with ( + patch("cetus.config.get_config_dir", return_value=config_dir), + patch("cetus.config.get_data_dir", return_value=data_dir), + ): + # First run + with patch("cetus.client.CetusClient.query_async", mock_query_async_batch1): + result = runner.invoke( + main, + [ + "query", + "host:*", + "-o", + str(output_file), + "--format", + "jsonl", + "--no-marker", + "--api-key", + "test-key", + ], + ) + assert result.exit_code == 0 + + # Second run with --no-marker should overwrite + with patch("cetus.client.CetusClient.query_async", mock_query_async_batch2): + result = runner.invoke( + main, + [ + "query", + "host:*", + "-o", + str(output_file), + "--format", + "jsonl", + "--no-marker", + "--api-key", + "test-key", + ], + ) + assert result.exit_code == 0 + assert "Wrote" in result.output # Not "Appended" + + # Should only have batch2 data + final_content = output_file.read_text() + assert '"uuid": "1"' not in final_content + assert '"uuid": "2"' not in final_content + assert '"uuid": "3"' in final_content + lines = final_content.strip().split("\n") + assert len(lines) == 1 + + +class TestOutputPrefix: + """Tests for --output-prefix timestamped file output.""" + + @pytest.fixture + def mock_query_result(self) -> QueryResult: + """Sample query results.""" + return QueryResult( + data=[ + { + "uuid": "1", + "host": "a.example.com", + "A": "1.1.1.1", + "dns_timestamp": "2025-01-01T00:00:00Z", + }, + { + "uuid": "2", + "host": "b.example.com", + "A": "2.2.2.2", + "dns_timestamp": "2025-01-01T01:00:00Z", + }, + ], + 
total_fetched=2, + last_uuid="2", + last_timestamp="2025-01-01T01:00:00Z", + pages_fetched=1, + ) + + @pytest.fixture + def mock_query_result_empty(self) -> QueryResult: + """Empty result.""" + return QueryResult( + data=[], + total_fetched=0, + last_uuid=None, + last_timestamp=None, + pages_fetched=1, + ) + + def test_output_prefix_creates_timestamped_file( + self, + runner: CliRunner, + tmp_path: Path, + mock_query_result: QueryResult, + ): + """--output-prefix should create a timestamped file.""" + config_dir = tmp_path / "config" + config_dir.mkdir() + data_dir = tmp_path / "data" + data_dir.mkdir() + prefix = str(tmp_path / "results") + + async def mock_query_async(*args, **kwargs): + return mock_query_result + + with ( + patch("cetus.config.get_config_dir", return_value=config_dir), + patch("cetus.config.get_data_dir", return_value=data_dir), + patch("cetus.client.CetusClient.query_async", mock_query_async), + ): + result = runner.invoke( + main, + ["query", "host:*", "-p", prefix, "--format", "jsonl", "--api-key", "test-key"], + ) + assert result.exit_code == 0 + assert "Wrote 2 records" in result.output + + # Check that a timestamped file was created + files = list(tmp_path.glob("results_*.jsonl")) + assert len(files) == 1 + assert files[0].name.startswith("results_") + assert files[0].suffix == ".jsonl" + + # Verify content + content = files[0].read_text() + assert '"uuid": "1"' in content + assert '"uuid": "2"' in content + + def test_output_prefix_no_file_on_zero_results( + self, + runner: CliRunner, + tmp_path: Path, + mock_query_result_empty: QueryResult, + ): + """--output-prefix should not create file when there are no records.""" + config_dir = tmp_path / "config" + config_dir.mkdir() + data_dir = tmp_path / "data" + data_dir.mkdir() + prefix = str(tmp_path / "results") + + async def mock_query_async(*args, **kwargs): + return mock_query_result_empty + + with ( + patch("cetus.config.get_config_dir", return_value=config_dir), + 
patch("cetus.config.get_data_dir", return_value=data_dir), + patch("cetus.client.CetusClient.query_async", mock_query_async), + ): + result = runner.invoke( + main, + ["query", "host:*", "-p", prefix, "--format", "jsonl", "--api-key", "test-key"], + ) + assert result.exit_code == 0 + assert "No new records" in result.output + + # No file should be created + files = list(tmp_path.glob("results_*.jsonl")) + assert len(files) == 0 + + def test_output_prefix_uses_markers( + self, + runner: CliRunner, + tmp_path: Path, + mock_query_result: QueryResult, + ): + """--output-prefix should save markers for incremental queries.""" + config_dir = tmp_path / "config" + config_dir.mkdir() + data_dir = tmp_path / "data" + data_dir.mkdir() + markers_dir = data_dir / "markers" + markers_dir.mkdir() + prefix = str(tmp_path / "results") + + async def mock_query_async(*args, **kwargs): + return mock_query_result + + with ( + patch("cetus.config.get_config_dir", return_value=config_dir), + patch("cetus.config.get_data_dir", return_value=data_dir), + patch("cetus.markers.get_markers_dir", return_value=markers_dir), + patch("cetus.client.CetusClient.query_async", mock_query_async), + ): + result = runner.invoke( + main, + ["query", "host:*", "-p", prefix, "--format", "jsonl", "--api-key", "test-key"], + ) + assert result.exit_code == 0 + + # Check that a marker was saved + marker_files = list(markers_dir.glob("*.json")) + assert len(marker_files) == 1 + + def test_output_prefix_format_determines_extension( + self, + runner: CliRunner, + tmp_path: Path, + mock_query_result: QueryResult, + ): + """--output-prefix should use format to determine file extension.""" + config_dir = tmp_path / "config" + config_dir.mkdir() + data_dir = tmp_path / "data" + data_dir.mkdir() + prefix = str(tmp_path / "results") + + async def mock_query_async(*args, **kwargs): + return mock_query_result + + with ( + patch("cetus.config.get_config_dir", return_value=config_dir), + patch("cetus.config.get_data_dir", 
return_value=data_dir), + patch("cetus.client.CetusClient.query_async", mock_query_async), + ): + result = runner.invoke( + main, ["query", "host:*", "-p", prefix, "--format", "csv", "--api-key", "test-key"] + ) + assert result.exit_code == 0 + + # Check CSV extension + files = list(tmp_path.glob("results_*.csv")) + assert len(files) == 1 + + def test_output_and_output_prefix_mutually_exclusive( + self, + runner: CliRunner, + tmp_path: Path, + ): + """--output and --output-prefix cannot be used together.""" + output_file = tmp_path / "results.jsonl" + prefix = str(tmp_path / "results") + + result = runner.invoke( + main, ["query", "host:*", "-o", str(output_file), "-p", prefix, "--api-key", "test-key"] + ) + assert result.exit_code == 1 + assert "mutually exclusive" in result.output + + +class TestOutputDirectoryErrorHandling: + """Tests for error handling when output directory doesn't exist.""" + + def test_query_output_nonexistent_directory_shows_clean_error( + self, + runner: CliRunner, + tmp_path: Path, + ): + """query -o to non-existent directory should show clean error, not traceback.""" + config_dir = tmp_path / "config" + config_dir.mkdir() + data_dir = tmp_path / "data" + data_dir.mkdir() + + # Use a path where the parent directory doesn't exist + nonexistent_dir = tmp_path / "nonexistent_subdir" / "results.json" + + async def mock_query_async(*args, **kwargs): + return QueryResult( + data=[{"uuid": "1", "host": "test.com", "dns_timestamp": "2025-01-01T00:00:00Z"}], + total_fetched=1, + last_uuid="1", + last_timestamp="2025-01-01T00:00:00Z", + pages_fetched=1, + ) + + with ( + patch("cetus.config.get_config_dir", return_value=config_dir), + patch("cetus.config.get_data_dir", return_value=data_dir), + patch("cetus.client.CetusClient.query_async", mock_query_async), + ): + result = runner.invoke( + main, + [ + "query", + "host:*", + "-o", + str(nonexistent_dir), + "--format", + "json", + "--api-key", + "test-key", + ], + ) + + # Should fail with exit code 1 
+ assert result.exit_code == 1 + # Should have clean error message + assert "Error" in result.output or "error" in result.output.lower() + # Should NOT have Python traceback + assert "Traceback" not in result.output + assert 'File "' not in result.output diff --git a/tests/test_client.py b/tests/test_client.py index cb2024a..157e161 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -355,6 +355,43 @@ def test_fetch_page_raises_api_error_on_other_errors(self, client: CetusClient, assert exc_info.value.status_code == 500 client.close() + def test_fetch_page_extracts_error_detail_on_400(self, client: CetusClient, httpx_mock): + """_fetch_page should extract error detail from 400 response JSON. + + When server returns 400 with {"detail": "error message"}, the client + should include that message in the APIError for helpful user feedback. + """ + httpx_mock.add_response( + method="POST", + url="http://localhost/api/query/", + status_code=400, + json={"detail": "Invalid query syntax: Cannot parse 'host:['"}, + ) + + with pytest.raises(APIError) as exc_info: + client._fetch_page("host:[", "dns", "nvme") + + # Should include the detail from the response + assert "Invalid query syntax" in str(exc_info.value) + assert exc_info.value.status_code == 400 + client.close() + + def test_fetch_page_handles_400_without_json(self, client: CetusClient, httpx_mock): + """_fetch_page should handle 400 response without JSON body gracefully.""" + httpx_mock.add_response( + method="POST", + url="http://localhost/api/query/", + status_code=400, + text="Bad Request", + ) + + with pytest.raises(APIError) as exc_info: + client._fetch_page("host:*", "dns", "nvme") + + assert exc_info.value.status_code == 400 + assert "Bad request" in str(exc_info.value) + client.close() + def test_fetch_page_raises_connection_error_on_connect_failure( self, client: CetusClient, httpx_mock ): @@ -369,9 +406,7 @@ def test_fetch_page_raises_connection_error_on_connect_failure( 
client._fetch_page("host:*", "dns", "nvme") client.close() - def test_fetch_page_raises_connection_error_on_timeout( - self, client: CetusClient, httpx_mock - ): + def test_fetch_page_raises_connection_error_on_timeout(self, client: CetusClient, httpx_mock): """_fetch_page should raise ConnectionError on timeout.""" httpx_mock.add_exception( httpx.TimeoutException("Timeout"), @@ -397,9 +432,7 @@ def test_query_returns_query_result(self, client: CetusClient, httpx_mock): method="POST", url="http://localhost/api/query/", json={ - "data": [ - {"uuid": "1", "host": "a.com", "dns_timestamp": "2025-01-01T00:00:00Z"} - ], + "data": [{"uuid": "1", "host": "a.com", "dns_timestamp": "2025-01-01T00:00:00Z"}], "has_more": False, }, ) diff --git a/tests/test_config.py b/tests/test_config.py index a5e9a93..565818c 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -251,10 +251,7 @@ def test_load_from_file(self, config_dir: Path): """Config should be loaded from TOML file.""" config_file = config_dir / "config.toml" config_file.write_text( - 'api_key = "file-api-key"\n' - 'host = "file.example.com"\n' - "timeout = 45\n" - "since_days = 14\n" + 'api_key = "file-api-key"\nhost = "file.example.com"\ntimeout = 45\nsince_days = 14\n' ) with patch("cetus.config.get_config_dir", return_value=config_dir): @@ -350,10 +347,7 @@ def test_full_priority_chain(self, config_dir: Path): # Set up file config_file = config_dir / "config.toml" config_file.write_text( - 'api_key = "file-key"\n' - 'host = "file.com"\n' - "timeout = 10\n" - "since_days = 10\n" + 'api_key = "file-key"\nhost = "file.com"\ntimeout = 10\nsince_days = 10\n' ) # Set up env (overrides some file values) diff --git a/tests/test_e2e.py b/tests/test_e2e.py index 0e0ed5d..7d75fc8 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -11,12 +11,79 @@ Run with: CETUS_E2E_TEST=1 CETUS_API_KEY=your-key pytest tests/test_e2e.py -v -Expected duration: ~45-60 seconds for all 15 tests +Expected duration: ~7-8 minutes for 
all tests (--media all tests skipped) Query optimization: - Uses host:microsoft.com which has frequent data and returns quickly - Uses since_days=7 (same speed as 1 day for targeted queries) - Streaming tests break early after a few records + +Test categories (134 total): +- Query endpoints: 4 tests (dns, certstream, alerting indices, invalid index) +- Streaming: 2 tests +- Alerts API: 2 tests +- Async methods: 2 tests +- Authentication: 1 test +- CLI commands: 4 tests +- File output: 4 tests +- Incremental queries: 2 tests +- Version/markers/config: 5 tests +- Alert results/backtest: 2 tests +- Error handling: 4 tests +- Format/verbose: 3 tests +- Since-days edge cases: 3 tests (zero, negative rejected, config set negative) +- Alert type filtering: 2 tests +- Alert operations with real data: 3 tests +- Completion scripts: 3 tests (bash, zsh, fish) +- Alerts edge cases: 2 tests +- Verbose mode extended: 2 tests +- Config validation: 2 tests +- Mutually exclusive options: 1 test +- Markers clear by index: 1 test +- Get alert endpoint: 2 tests +- Streaming CSV: 1 test +- Media all option: 2 tests (API and CLI with extended timeout) +- Backtest streaming: 1 test +- Empty query handling: 1 test +- Output directory errors: 2 tests (regular and streaming) +- Alerts list combined flags: 1 test (--owned and --shared) +- Verbose mode with markers: 1 test +- Query edge cases: 6 tests (whitespace, unicode, large pagination xfail, + special chars, long queries) +- Alert access permissions: 2 tests +- Output prefix formats: 2 tests (JSON, CSV with -p option) +- Streaming table warning: 1 test +- Backtest with indices: 2 tests (certstream, alerting) +- Unicode output handling: 2 tests (table format encoding fix) +- Markers mode separation: 1 test (-o and -p have separate markers) +- Alert results --since filter: 2 tests (valid and invalid timestamp) +- Verbose mode streaming: 1 test (debug output with streaming) +- Large since-days values: 1 test (365 day lookback) +- Streaming 
with --no-marker: 2 tests (stdout and file output) +- Marker/since-days interaction: 1 test (marker takes precedence) +- Help text completeness: 5 tests (query, alerts, results, list format, backtest prefix documented) +- Alerts list formats: 4 tests (json, jsonl, csv, file output) +- Backtest output prefix: 2 tests (creates file, mutually exclusive with -o) +- DSL/JSON queries: 3 tests (CLI, API, and streaming with DSL syntax) +- Backtest terms alerts: 1 test (terms alert expansion) +- API key masking: 1 test (verbose mode security) +- Marker file corruption: 1 test (graceful recovery) +- User-Agent header: 1 test (version in header) +- Timeout behavior: 1 test (short timeout handling) +- Config environment variables: 1 test (CETUS_SINCE_DAYS) +- Query result count: 2 tests (buffered count, file output count) +- Alert results output formats: 2 tests (CSV and JSONL to file) +- Streaming alerting index: 2 tests (stdout and file output) +- Config file corruption: 2 tests (malformed TOML, empty file) +- Alerts list to file: 2 tests (CSV and JSONL export) +- Output prefix with --no-marker: 1 test +- Streaming with --media all: 1 test +- Backtest streaming with output prefix: 1 test +- Table format incremental append: 1 test (warning when table can't append) +- Backtest verbose mode: 1 test (shows alert details with -v) +- Shared alert operations: 2 tests (results and backtest on shared alerts) +- Config forward compatibility: 2 tests (unknown keys, empty values) +- Lucene query operators: 6 tests (AND, OR, NOT, wildcard suffix, quoted, grouping) """ from __future__ import annotations @@ -39,6 +106,7 @@ def api_key() -> str: if not key: # Fall back to config file from cetus.config import Config + config = Config.load() key = config.api_key if not key: @@ -349,11 +417,16 @@ def test_cli_query_command(self, api_key: str, host: str) -> None: [ "query", self.DATA_QUERY, - "--index", "dns", - "--since-days", "7", - "--format", "json", - "--api-key", api_key, - "--host", 
host, + "--index", + "dns", + "--since-days", + "7", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, ], ) # Should succeed with results @@ -373,12 +446,17 @@ def test_cli_query_streaming(self, api_key: str, host: str) -> None: [ "query", self.DATA_QUERY, - "--index", "dns", - "--since-days", "7", + "--index", + "dns", + "--since-days", + "7", "--stream", - "--format", "jsonl", - "--api-key", api_key, - "--host", host, + "--format", + "jsonl", + "--api-key", + api_key, + "--host", + host, ], ) assert result.exit_code == 0 @@ -393,9 +471,12 @@ def test_cli_alerts_list_command(self, api_key: str, host: str) -> None: result = runner.invoke( main, [ - "alerts", "list", - "--api-key", api_key, - "--host", host, + "alerts", + "list", + "--api-key", + api_key, + "--host", + host, ], ) # Should succeed (may show "No alerts found" which is fine) @@ -411,3 +492,4174 @@ def test_cli_config_show_command(self) -> None: result = runner.invoke(main, ["config", "show"]) # Should succeed even without config assert result.exit_code in (0, 1) + + +class TestFileOutputModes: + """E2E tests for file output modes (-o and -p). + + Tests the incremental query functionality with real data. 
+ """ + + DATA_QUERY = "host:microsoft.com" + + def test_cli_output_file_creates_file(self, api_key: str, host: str, tmp_path) -> None: + """Test -o creates output file with real data.""" + from click.testing import CliRunner + + from cetus.cli import main + + output_file = tmp_path / "results.jsonl" + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + self.DATA_QUERY, + "--index", + "dns", + "--since-days", + "7", + "--format", + "jsonl", + "-o", + str(output_file), + "--no-marker", # Don't save marker for this test + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + assert output_file.exists() + content = output_file.read_text() + assert len(content) > 0 + # Should have JSONL content (one JSON object per line) + lines = content.strip().split("\n") + assert len(lines) > 0 + + def test_cli_output_prefix_creates_timestamped_file( + self, api_key: str, host: str, tmp_path + ) -> None: + """Test -p creates timestamped output file.""" + from click.testing import CliRunner + + from cetus.cli import main + + prefix = str(tmp_path / "results") + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + self.DATA_QUERY, + "--index", + "dns", + "--since-days", + "7", + "--format", + "jsonl", + "-p", + prefix, + "--no-marker", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + + # Should have created a timestamped file + files = list(tmp_path.glob("results_*.jsonl")) + assert len(files) == 1 + assert files[0].stat().st_size > 0 + + def test_cli_output_csv_format(self, api_key: str, host: str, tmp_path) -> None: + """Test CSV output format works correctly.""" + from click.testing import CliRunner + + from cetus.cli import main + + output_file = tmp_path / "results.csv" + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + self.DATA_QUERY, + "--index", + "dns", + "--since-days", + "7", + "--format", + "csv", + "-o", + str(output_file), + 
"--no-marker", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + assert output_file.exists() + + content = output_file.read_text() + lines = content.strip().split("\n") + # Should have header + at least one data row + assert len(lines) >= 2 + # First line should be CSV header + assert "uuid" in lines[0] or "host" in lines[0] + + def test_cli_streaming_with_output_file(self, api_key: str, host: str, tmp_path) -> None: + """Test --stream with -o creates file.""" + from click.testing import CliRunner + + from cetus.cli import main + + output_file = tmp_path / "streamed.jsonl" + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + self.DATA_QUERY, + "--index", + "dns", + "--since-days", + "7", + "--stream", + "-o", + str(output_file), + "--no-marker", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + assert output_file.exists() + assert output_file.stat().st_size > 0 + + +class TestIncrementalQueries: + """E2E tests for incremental query behavior with markers. + + Tests that markers work correctly across multiple query runs. 
+ """ + + DATA_QUERY = "host:microsoft.com" + + def test_marker_saved_and_used(self, api_key: str, host: str, tmp_path) -> None: + """Test that markers are saved and affect subsequent queries.""" + + from click.testing import CliRunner + + from cetus.cli import main + + # Use isolated marker directory + markers_dir = tmp_path / "markers" + markers_dir.mkdir() + + output_file = tmp_path / "results.jsonl" + + runner = CliRunner(env={"CETUS_DATA_DIR": str(tmp_path)}) + + # First run - should fetch data and save marker + result1 = runner.invoke( + main, + [ + "query", + self.DATA_QUERY, + "--index", + "dns", + "--since-days", + "7", + "--format", + "jsonl", + "-o", + str(output_file), + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result1.exit_code == 0 + assert "Wrote" in result1.output + + first_size = output_file.stat().st_size + assert first_size > 0 + + # Check marker was saved + marker_files = list(tmp_path.glob("markers/*.json")) + assert len(marker_files) == 1 + + # Second run - should use marker (may append or show "No new records") + result2 = runner.invoke( + main, + [ + "query", + self.DATA_QUERY, + "--index", + "dns", + "--since-days", + "7", + "--format", + "jsonl", + "-o", + str(output_file), + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result2.exit_code == 0 + # Should either append or report no new records + assert "Appended" in result2.output or "No new records" in result2.output + + def test_output_prefix_with_markers(self, api_key: str, host: str, tmp_path) -> None: + """Test -p mode saves markers for incremental queries.""" + from click.testing import CliRunner + + from cetus.cli import main + + prefix = str(tmp_path / "export") + + runner = CliRunner(env={"CETUS_DATA_DIR": str(tmp_path)}) + + # First run + result1 = runner.invoke( + main, + [ + "query", + self.DATA_QUERY, + "--index", + "dns", + "--since-days", + "7", + "--format", + "jsonl", + "-p", + prefix, + "--api-key", + api_key, + "--host", + host, + ], + 
) + assert result1.exit_code == 0 + + # Should have created one timestamped file + files1 = list(tmp_path.glob("export_*.jsonl")) + assert len(files1) == 1 + + # Marker should be saved + marker_files = list(tmp_path.glob("markers/*.json")) + assert len(marker_files) == 1 + + # Second run (immediately after - likely no new data) + import time + + time.sleep(1) # Ensure different timestamp + + result2 = runner.invoke( + main, + [ + "query", + self.DATA_QUERY, + "--index", + "dns", + "--since-days", + "7", + "--format", + "jsonl", + "-p", + prefix, + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result2.exit_code == 0 + + # If no new data, no new file created + # If new data, a second file is created + files2 = list(tmp_path.glob("export_*.jsonl")) + # Should have 1 or 2 files depending on whether new data arrived + assert len(files2) >= 1 + + +class TestCLIVersion: + """Test CLI version and help commands.""" + + def test_version_flag(self) -> None: + """Test --version shows version string.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke(main, ["--version"]) + assert result.exit_code == 0 + assert "cetus" in result.output.lower() + # Should contain a version number pattern + import re + + assert re.search(r"\d+\.\d+\.\d+", result.output) + + +class TestCLIMarkers: + """E2E tests for marker management commands.""" + + def test_markers_list_empty(self, tmp_path) -> None: + """Test markers list when no markers exist.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner(env={"CETUS_DATA_DIR": str(tmp_path)}) + result = runner.invoke(main, ["markers", "list"]) + assert result.exit_code == 0 + assert "No markers" in result.output or "0" in result.output or result.output.strip() == "" + + def test_markers_list_shows_markers(self, api_key: str, host: str, tmp_path) -> None: + """Test markers list shows saved markers after a query.""" + from 
click.testing import CliRunner + + from cetus.cli import main + + output_file = tmp_path / "results.jsonl" + + runner = CliRunner(env={"CETUS_DATA_DIR": str(tmp_path)}) + + # Run a query to create a marker + runner.invoke( + main, + [ + "query", + "host:microsoft.com", + "--index", + "dns", + "--since-days", + "1", + "--format", + "jsonl", + "-o", + str(output_file), + "--api-key", + api_key, + "--host", + host, + ], + ) + + # List markers + result = runner.invoke(main, ["markers", "list"]) + assert result.exit_code == 0 + # Should show the dns index marker + assert "dns" in result.output.lower() + + def test_markers_clear(self, api_key: str, host: str, tmp_path) -> None: + """Test markers clear removes markers.""" + from click.testing import CliRunner + + from cetus.cli import main + + output_file = tmp_path / "results.jsonl" + + runner = CliRunner(env={"CETUS_DATA_DIR": str(tmp_path)}) + + # Run a query to create a marker + runner.invoke( + main, + [ + "query", + "host:microsoft.com", + "--index", + "dns", + "--since-days", + "1", + "--format", + "jsonl", + "-o", + str(output_file), + "--api-key", + api_key, + "--host", + host, + ], + ) + + # Clear markers + result = runner.invoke(main, ["markers", "clear", "-y"]) + assert result.exit_code == 0 + assert "Cleared" in result.output + + # Verify cleared + runner.invoke(main, ["markers", "list"]) # Check command runs + # Should be empty now + marker_files = list(tmp_path.glob("markers/*.json")) + assert len(marker_files) == 0 + + +class TestCLIConfig: + """E2E tests for config management commands.""" + + def test_config_path(self) -> None: + """Test config path shows file location.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke(main, ["config", "path"]) + assert result.exit_code == 0 + assert "config" in result.output.lower() + # Should be a file path + assert "/" in result.output or "\\" in result.output + + +class TestAlertResults: + """E2E 
tests for alert results command.""" + + def test_alert_results_not_found(self, api_key: str, host: str) -> None: + """Test alert results with non-existent alert ID returns error.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "alerts", + "results", + "999999", # Non-existent ID + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should fail with 404 or similar error + assert ( + result.exit_code != 0 + or "not found" in result.output.lower() + or "error" in result.output.lower() + ) + + +class TestAlertBacktest: + """E2E tests for alert backtest command.""" + + def test_alert_backtest_not_found(self, api_key: str, host: str) -> None: + """Test alert backtest with non-existent alert ID returns error.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "alerts", + "backtest", + "999999", # Non-existent ID + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should fail with 404 or similar error + assert ( + result.exit_code != 0 + or "not found" in result.output.lower() + or "error" in result.output.lower() + ) + + +class TestConnectionErrors: + """E2E tests for connection error handling.""" + + def test_invalid_host_error(self) -> None: + """Test that invalid host gives clear error message.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + "host:test.com", + "--host", + "nonexistent.invalid.host.example", + "--api-key", + "test-key", + ], + ) + assert result.exit_code != 0 + # Should have a connection error message + assert "connect" in result.output.lower() or "error" in result.output.lower() + + +class TestEmptyResults: + """E2E tests for queries that return no results. 
+ + Note: We use alerting index with a specific non-matching query because + DNS queries without matches still scan all shards and can timeout. + The alerting index is smaller and faster for empty result tests. + """ + + def test_query_no_results(self, api_key: str, host: str) -> None: + """Test query that returns empty results handles gracefully.""" + from cetus.client import CetusClient + + # Use alerting index which is smaller - query for non-existent UUID + client = CetusClient(api_key=api_key, host=host, timeout=60) + try: + result = client.query( + search="uuid:00000000-0000-0000-0000-000000000000", + index="alerting", + media="nvme", + since_days=1, + marker=None, + ) + assert result is not None + assert isinstance(result.data, list) + # Should return empty or very few results + assert len(result.data) < 10 + finally: + client.close() + + def test_cli_query_no_results(self, api_key: str, host: str) -> None: + """Test CLI query with no results shows appropriate message.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + "uuid:00000000-0000-0000-0000-000000000000", + "--index", + "alerting", + "--since-days", + "1", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + # Should complete without error (may have [] or few results) + assert "[" in result.output # Valid JSON array + + +class TestQuerySyntaxErrors: + """E2E tests for query syntax error handling.""" + + def test_invalid_lucene_syntax_returns_error(self, api_key: str, host: str) -> None: + """Test that invalid Lucene syntax returns an error. + + The server should return a 400 Bad Request with a helpful error message + explaining the syntax issue, rather than a generic 500 error. 
+ """ + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + "host:[", # Invalid Lucene syntax - unclosed bracket + "--index", + "dns", + "--since-days", + "1", + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should fail with an error + assert result.exit_code != 0 + output_lower = result.output.lower() + # Should have helpful error message about syntax (sanitized, no internal details) + assert "invalid query syntax" in output_lower + assert "brackets" in output_lower or "quotes" in output_lower + + def test_invalid_field_name_handled(self, api_key: str, host: str) -> None: + """Test that invalid field names are handled gracefully.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + "nonexistent_field:value", + "--index", + "dns", + "--since-days", + "1", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should succeed (ES allows querying non-existent fields, returns empty) + assert result.exit_code == 0 + + +class TestTableFormat: + """E2E tests for table format output.""" + + DATA_QUERY = "host:microsoft.com" + + def test_query_table_format(self, api_key: str, host: str) -> None: + """Test that table format output works for queries.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + self.DATA_QUERY, + "--index", + "dns", + "--since-days", + "1", + "--format", + "table", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + # Table output should have formatting characters + assert "+" in result.output or "|" in result.output or "host" in result.output + + +class TestVerboseMode: + """E2E tests for verbose/debug output.""" + + def test_verbose_flag_shows_debug_info(self, api_key: str, host: str) -> None: + 
"""Test that -v flag produces debug output.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "-v", # Verbose flag + "query", + "host:microsoft.com", + "--index", + "dns", + "--since-days", + "1", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + # Verbose mode should show DEBUG or HTTP request info + assert "DEBUG" in result.output or "HTTP" in result.output or "200" in result.output + + +class TestSinceDaysEdgeCases: + """E2E tests for since-days edge cases.""" + + def test_since_days_zero(self, api_key: str, host: str) -> None: + """Test that since-days=0 works (queries for today only).""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + "host:microsoft.com", + "--index", + "dns", + "--since-days", + "0", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should succeed (may or may not have results for today) + assert result.exit_code == 0 + assert "[" in result.output # Valid JSON array + + def test_since_days_negative_rejected(self, api_key: str, host: str) -> None: + """Test that negative since-days is rejected.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + "host:microsoft.com", + "--index", + "dns", + "--since-days", + "-1", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should fail with error + assert result.exit_code != 0 + # Error message is printed to output + assert "negative" in result.output.lower() + + def test_config_set_since_days_negative_rejected(self, tmp_path) -> None: + """Test that config set rejects negative since-days.""" + from unittest.mock import patch + + from click.testing import CliRunner + + from cetus.cli import main + + 
config_dir = tmp_path / "config" + config_dir.mkdir() + + with patch("cetus.config.get_config_dir", return_value=config_dir): + runner = CliRunner() + # Use -- to prevent -5 being parsed as an option + result = runner.invoke(main, ["config", "set", "since-days", "--", "-5"]) + assert result.exit_code != 0 + assert "negative" in result.output.lower() + + +class TestAlertTypeFiltering: + """E2E tests for alert type filtering.""" + + def test_list_alerts_filter_by_type_raw(self, api_key: str, host: str) -> None: + """Test listing alerts filtered by type=raw.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "alerts", + "list", + "--type", + "raw", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + # Output should only show raw alerts (or "No alerts found") + if "raw" in result.output.lower(): + # If we have raw alerts, verify no other types shown + assert "terms" not in result.output.lower() or "raw" in result.output.lower() + + def test_list_alerts_filter_by_type_terms(self, api_key: str, host: str) -> None: + """Test listing alerts filtered by type=terms.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "alerts", + "list", + "--type", + "terms", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + + +class TestAlertOperationsWithRealData: + """E2E tests for alert operations with real alert data. + + These tests verify that alert results and backtest work with + actual alerts, not just 404 cases. 
+ """ + + def test_alert_results_with_existing_alert(self, api_key: str, host: str) -> None: + """Test alert results with an alert that exists.""" + from cetus.client import CetusClient + + client = CetusClient(api_key=api_key, host=host, timeout=60) + try: + # First, get list of owned alerts + alerts = client.list_alerts(owned=True, shared=False) + if not alerts: + pytest.skip("No owned alerts to test with") + + alert = alerts[0] + # Get results for this alert - should succeed even if empty + results = client.get_alert_results(alert.id) + assert isinstance(results, list) + # Results may be empty if alert hasn't matched anything + finally: + client.close() + + def test_cli_alert_results_with_existing_alert(self, api_key: str, host: str) -> None: + """Test CLI alert results with an existing alert.""" + from click.testing import CliRunner + + from cetus.cli import main + + # First get an alert ID + runner = CliRunner() + list_result = runner.invoke( + main, + ["alerts", "list", "--owned", "--api-key", api_key, "--host", host], + ) + if "No alerts" in list_result.output: + pytest.skip("No owned alerts to test with") + + # Extract first alert ID from table output (handles both ASCII | and Unicode │) + import re + + match = re.search(r"[│|]\s*(\d+)\s*[│|]", list_result.output) + assert match, f"Could not parse alert ID from output: {list_result.output[:200]}" + + alert_id = match.group(1) + + # Now test results command + result = runner.invoke( + main, + [ + "alerts", + "results", + alert_id, + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + # Should either have JSON array or "No results" message + assert "[" in result.output or "No results" in result.output + + def test_cli_alert_backtest_with_existing_alert(self, api_key: str, host: str) -> None: + """Test CLI alert backtest with an existing alert.""" + from click.testing import CliRunner + + from cetus.cli import main + + # First get an alert ID + runner = 
CliRunner() + list_result = runner.invoke( + main, + ["alerts", "list", "--owned", "--api-key", api_key, "--host", host], + ) + if "No alerts" in list_result.output: + pytest.skip("No owned alerts to test with") + + # Extract first alert ID from table output (handles both ASCII | and Unicode │) + import re + + match = re.search(r"[│|]\s*(\d+)\s*[│|]", list_result.output) + assert match, f"Could not parse alert ID from output: {list_result.output[:200]}" + + alert_id = match.group(1) + + # Now test backtest command + result = runner.invoke( + main, + [ + "alerts", + "backtest", + alert_id, + "--since-days", + "1", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + # Should have JSON output and timing info + assert "[" in result.output or "records" in result.output.lower() + + +class TestCompletionScripts: + """E2E tests for shell completion script generation.""" + + def test_completion_bash_generates_script(self) -> None: + """Test that bash completion script is generated.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke(main, ["completion", "bash"]) + assert result.exit_code == 0 + # Bash completion script should contain function definition + assert "_cetus_completion" in result.output or "COMP_WORDS" in result.output + + def test_completion_zsh_generates_script(self) -> None: + """Test that zsh completion script is generated.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke(main, ["completion", "zsh"]) + assert result.exit_code == 0 + # Zsh completion script should contain function or compdef + assert "compdef" in result.output or "_cetus" in result.output + + def test_completion_fish_generates_script(self) -> None: + """Test that fish completion script is generated.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = 
CliRunner() + result = runner.invoke(main, ["completion", "fish"]) + assert result.exit_code == 0 + # Fish completion script should contain complete command + assert "complete" in result.output + + +class TestAlertsListEdgeCases: + """E2E tests for alerts list edge cases.""" + + def test_alerts_list_no_owned_no_shared_warning(self) -> None: + """Test warning when both --no-owned and --no-shared are specified.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke(main, ["alerts", "list", "--no-owned", "--no-shared"]) + assert result.exit_code == 0 + assert "warning" in result.output.lower() or "no alerts" in result.output.lower() + + def test_alerts_list_filter_by_type_structured(self, api_key: str, host: str) -> None: + """Test listing alerts filtered by type=structured.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "alerts", + "list", + "--type", + "structured", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + # Should either show structured alerts or "No alerts found" + + +class TestVerboseModeExtended: + """E2E tests for verbose mode with various commands.""" + + def test_verbose_alerts_list(self, api_key: str, host: str) -> None: + """Test verbose mode with alerts list command. + + Note: Debug output goes to stderr which Click runner captures separately. + We verify the command succeeds and returns alert data - verbose logging + is already tested in TestVerboseMode.test_verbose_flag_shows_debug_info. 
+ """ + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "-v", + "alerts", + "list", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + # Verify we get alert data (table output) + assert "ID" in result.output or "No alerts" in result.output + + def test_verbose_config_show(self) -> None: + """Test verbose mode with config show command.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke(main, ["-v", "config", "show"]) + assert result.exit_code == 0 + + +class TestConfigSetValidation: + """E2E tests for config set value validation.""" + + def test_config_set_since_days_invalid(self, tmp_path) -> None: + """Test that invalid since-days value is rejected.""" + from unittest.mock import patch + + from click.testing import CliRunner + + from cetus.cli import main + + config_dir = tmp_path / "config" + config_dir.mkdir() + + with patch("cetus.config.get_config_dir", return_value=config_dir): + runner = CliRunner() + result = runner.invoke(main, ["config", "set", "since-days", "not-a-number"]) + assert result.exit_code != 0 + assert "invalid" in result.output.lower() + + def test_config_set_since_days_valid(self, tmp_path) -> None: + """Test that valid since-days value is accepted.""" + from unittest.mock import patch + + from click.testing import CliRunner + + from cetus.cli import main + + config_dir = tmp_path / "config" + config_dir.mkdir() + + with patch("cetus.config.get_config_dir", return_value=config_dir): + runner = CliRunner() + result = runner.invoke(main, ["config", "set", "since-days", "30"]) + assert result.exit_code == 0 + assert "success" in result.output.lower() + + +class TestMutuallyExclusiveOptions: + """E2E tests for mutually exclusive CLI options.""" + + def test_output_and_output_prefix_mutually_exclusive(self, tmp_path) -> None: + """Test that -o and -p cannot 
be used together.""" + from click.testing import CliRunner + + from cetus.cli import main + + output_file = tmp_path / "results.json" + prefix = str(tmp_path / "results") + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + "host:test.com", + "-o", + str(output_file), + "-p", + prefix, + "--api-key", + "test-key", + ], + ) + assert result.exit_code == 1 + assert "mutually exclusive" in result.output.lower() + + +class TestMarkersClearByIndex: + """E2E tests for markers clear with index filtering.""" + + def test_markers_clear_by_index(self, api_key: str, host: str, tmp_path) -> None: + """Test that markers clear --index only clears that index.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner(env={"CETUS_DATA_DIR": str(tmp_path)}) + markers_dir = tmp_path / "markers" + markers_dir.mkdir(exist_ok=True) + + # Run queries on different indices to create markers + for idx in ["dns", "certstream"]: + runner.invoke( + main, + [ + "query", + "host:microsoft.com", + "--index", + idx, + "--since-days", + "1", + "-o", + str(tmp_path / f"{idx}_results.jsonl"), + "--format", + "jsonl", + "--api-key", + api_key, + "--host", + host, + ], + ) + + # Check markers exist + marker_files_before = list(markers_dir.glob("*.json")) + dns_markers_before = [f for f in marker_files_before if "dns" in f.name] + + # Only proceed if we have markers to clear + if dns_markers_before: + # Clear only dns markers + result = runner.invoke(main, ["markers", "clear", "--index", "dns", "-y"]) + assert result.exit_code == 0 + + # Check that certstream markers still exist + marker_files_after = list(markers_dir.glob("*.json")) + dns_markers_after = [f for f in marker_files_after if "dns" in f.name] + + # DNS markers should be cleared + assert len(dns_markers_after) < len(dns_markers_before) or len(dns_markers_before) == 0 + + +class TestGetAlertEndpoint: + """E2E tests for the get_alert endpoint.""" + + def test_get_alert_by_id(self, 
api_key: str, host: str) -> None: + """Test getting a specific alert by ID.""" + from cetus.client import CetusClient + + client = CetusClient(api_key=api_key, host=host, timeout=60) + try: + # First, get list of owned alerts + alerts = client.list_alerts(owned=True, shared=False) + if not alerts: + pytest.skip("No owned alerts to test with") + + # Get the first alert by ID + alert = client.get_alert(alerts[0].id) + assert alert is not None + assert alert.id == alerts[0].id + assert hasattr(alert, "title") + assert hasattr(alert, "alert_type") + finally: + client.close() + + def test_get_alert_not_found(self, api_key: str, host: str) -> None: + """Test getting a non-existent alert returns None.""" + from cetus.client import CetusClient + + client = CetusClient(api_key=api_key, host=host, timeout=60) + try: + alert = client.get_alert(999999) # Non-existent ID + assert alert is None + finally: + client.close() + + +class TestStreamingCSVFormat: + """E2E tests for streaming with CSV format.""" + + DATA_QUERY = "host:microsoft.com" + + def test_streaming_csv_to_file(self, api_key: str, host: str, tmp_path) -> None: + """Test streaming query with CSV format writes valid CSV file.""" + from click.testing import CliRunner + + from cetus.cli import main + + output_file = tmp_path / "results.csv" + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + self.DATA_QUERY, + "--index", + "dns", + "--since-days", + "1", + "--stream", + "--format", + "csv", + "-o", + str(output_file), + "--no-marker", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + assert output_file.exists() + + # Verify it's valid CSV with header + content = output_file.read_text() + lines = content.strip().split("\n") + if len(lines) > 1: # Has data + # First line should be header + assert "uuid" in lines[0] or "host" in lines[0] + + +@pytest.mark.skip(reason="Media 'all' queries timeout - needs server-side optimization") +class TestMediaAllOption: + 
"""E2E tests for --media all option which queries all storage tiers. + + These tests use longer timeouts because 'all' media scans more data. + Currently disabled due to timeout issues with full index scans. + """ + + DATA_QUERY = "host:microsoft.com" + + def test_query_media_all(self, api_key: str, host: str) -> None: + """Test query with --media all option. + + Note: This queries all storage tiers and may take longer than nvme-only. + Uses a 3-minute timeout to accommodate full index scans. + """ + from cetus.client import CetusClient + + # Use extended timeout for 'all' media queries + client = CetusClient(api_key=api_key, host=host, timeout=180) + try: + result = client.query( + search=self.DATA_QUERY, + index="dns", + media="all", + since_days=1, # Keep short timeframe to limit data + marker=None, + ) + assert result is not None + assert isinstance(result.data, list) + # 'all' should return at least as many results as 'nvme' + finally: + client.close() + + def test_cli_query_media_all(self, api_key: str, host: str) -> None: + """Test CLI query with --media all option.""" + from click.testing import CliRunner + + from cetus.cli import main + + # Use extended timeout via environment variable + runner = CliRunner(env={"CETUS_TIMEOUT": "180"}) + result = runner.invoke( + main, + [ + "query", + self.DATA_QUERY, + "--index", + "dns", + "--media", + "all", + "--since-days", + "1", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should succeed with extended timeout + assert result.exit_code == 0 + assert "[" in result.output # Valid JSON array + + +class TestBacktestWithStreaming: + """E2E tests for backtest command with streaming mode.""" + + def test_backtest_streaming_mode(self, api_key: str, host: str, tmp_path) -> None: + """Test backtest command with --stream flag.""" + from click.testing import CliRunner + + from cetus.cli import main + + # First get an alert ID + runner = CliRunner() + list_result = runner.invoke( + main, + 
["alerts", "list", "--owned", "--api-key", api_key, "--host", host], + ) + if "No alerts" in list_result.output: + pytest.skip("No owned alerts to test with") + + # Extract first alert ID from table output + import re + + match = re.search(r"[│|]\s*(\d+)\s*[│|]", list_result.output) + if not match: + pytest.skip("Could not parse alert ID") + + alert_id = match.group(1) + output_file = tmp_path / "backtest.jsonl" + + # Test backtest with streaming + result = runner.invoke( + main, + [ + "alerts", + "backtest", + alert_id, + "--stream", + "--since-days", + "1", + "-o", + str(output_file), + "--no-marker", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + # Should either create a file or report no records + assert output_file.exists() or "no" in result.output.lower() + + +class TestEmptyQueryHandling: + """E2E tests for empty query string handling.""" + + def test_empty_query_returns_error(self, api_key: str, host: str) -> None: + """Test that empty query string returns appropriate error.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + "", + "--since-days", + "1", + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should fail with an error about invalid query syntax + assert result.exit_code != 0 + assert "invalid" in result.output.lower() or "error" in result.output.lower() + + +class TestOutputDirectoryErrors: + """E2E tests for output directory error handling.""" + + def test_output_to_nonexistent_directory(self, api_key: str, host: str) -> None: + """Test that output to non-existent directory returns clean error.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + "host:microsoft.com", + "--since-days", + "1", + "-o", + "nonexistent_directory_xyz/results.json", + "--no-marker", # Force write attempt (don't skip due to marker) 
+ "--api-key", + api_key, + "--host", + host, + ], + ) + # Should fail with a clean error message (not a traceback) + assert result.exit_code == 1 + # Should have error message about file/directory + output_lower = result.output.lower() + assert "error" in output_lower + # Should NOT have traceback indicators + assert "traceback" not in output_lower + assert 'file "' not in output_lower # Python traceback pattern + + def test_streaming_output_to_nonexistent_directory(self, api_key: str, host: str) -> None: + """Test that streaming output to non-existent directory returns clean error.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + "host:microsoft.com", + "--since-days", + "1", + "--stream", + "-o", + "nonexistent_directory_xyz/results.jsonl", + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should fail with a clean error message + assert result.exit_code == 1 + output_lower = result.output.lower() + assert "error" in output_lower + assert "traceback" not in output_lower + + +class TestAlertsListCombinedFlags: + """E2E tests for alerts list with combined owned and shared flags.""" + + def test_alerts_list_owned_and_shared_together(self, api_key: str, host: str) -> None: + """Test alerts list with both --owned and --shared flags.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "alerts", + "list", + "--owned", + "--shared", + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should succeed and show combined results + assert result.exit_code == 0 + # Output should have table headers or "No alerts" + assert "ID" in result.output or "No alerts" in result.output + + +class TestVerboseModeWithMarkers: + """E2E tests for verbose mode with file output and markers.""" + + def test_verbose_mode_shows_marker_saved(self, api_key: str, host: str, tmp_path) -> None: + """Test 
that verbose mode shows marker saved message.""" + from click.testing import CliRunner + + from cetus.cli import main + + output_file = tmp_path / "results.jsonl" + + runner = CliRunner(env={"CETUS_DATA_DIR": str(tmp_path)}) + result = runner.invoke( + main, + [ + "-v", # Verbose flag + "query", + "host:microsoft.com", + "--index", + "dns", + "--since-days", + "1", + "--format", + "jsonl", + "-o", + str(output_file), + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + # In verbose mode, should show that marker was saved + assert "marker" in result.output.lower() + + +class TestQueryEdgeCases: + """E2E tests for query edge cases not covered elsewhere.""" + + def test_whitespace_only_query_returns_error(self, api_key: str, host: str) -> None: + """Test that whitespace-only query returns appropriate error.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + " ", # Whitespace-only query + "--since-days", + "1", + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should fail with an error about invalid query syntax + assert result.exit_code != 0 + assert "invalid" in result.output.lower() + + def test_unicode_characters_in_query(self, api_key: str, host: str) -> None: + """Test that Unicode characters in queries are handled correctly.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + # Query with German umlaut - should work (may return empty results) + result = runner.invoke( + main, + [ + "query", + "host:münchen.de", + "--index", + "dns", + "--since-days", + "1", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should succeed (may have empty results, but no error) + assert result.exit_code == 0 + assert "[" in result.output # Valid JSON array + + def test_unicode_japanese_in_query(self, api_key: str, host: str) -> None: + """Test that Japanese 
Unicode characters in queries work.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + "host:日本.jp", + "--index", + "dns", + "--since-days", + "1", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should succeed (may have empty results, but no error) + assert result.exit_code == 0 + assert "[" in result.output # Valid JSON array + + @pytest.mark.xfail( + reason="Pagination timeout for large result sets - PIT expiration. " + "Fix deployed to server will resolve this.", + strict=False, # Allow test to pass once fix is deployed + ) + def test_unicode_chinese_in_query(self, api_key: str, host: str) -> None: + """Test that Chinese (simplified Han) characters in queries work. + + Root cause: NOT about Chinese characters. The query returns 300k+ records + requiring ~40 pages. The PIT (point-in-time) keep_alive was set to 1 minute, + which caused expiration during pagination. Japanese/German queries returned + fewer records and completed before PIT expired. + + Server-side fix: Increased PIT keep_alive from 1m to 5m, and added proper + error handling for PIT expiration errors. + + This test will pass once the server-side fix is deployed. + """ + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + "host:微软.com", + "--index", + "dns", + "--since-days", + "1", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should succeed (may have empty results, but no error) + # Currently fails with "Server returned error 500" + assert result.exit_code == 0 + assert "[" in result.output # Valid JSON array + + def test_lucene_special_chars_escaped(self, api_key: str, host: str) -> None: + """Test query with escaped Lucene special characters. + + Lucene special chars: + - && || ! ( ) { } [ ] ^ " ~ * ? 
: \\ / + These need to be escaped with backslash to search literally. + """ + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + # Query with parentheses - use targeted domain for speed + # Testing that special chars are handled without crashing + result = runner.invoke( + main, + [ + "query", + r"host:microsoft.com AND host:\(test\)", + "--index", + "alerting", # Use smaller index + "--since-days", + "1", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should succeed or fail gracefully (not crash) + # The query may return empty results or error depending on ES handling + # Main thing is it shouldn't cause a 500 error or crash + assert result.exit_code in (0, 1) + + def test_very_long_query_string(self, api_key: str, host: str) -> None: + """Test that very long query strings are handled. + + This tests the client and server can handle queries approaching + reasonable limits without crashing. + """ + from click.testing import CliRunner + + from cetus.cli import main + + # Create a long query with many OR conditions + # This simulates a user searching for many domains at once + domains = [f"domain{i}.example.com" for i in range(50)] + long_query = " OR ".join(f"host:{d}" for d in domains) + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + long_query, + "--index", + "alerting", # Use smaller alerting index for speed + "--since-days", + "1", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should complete without error (likely empty results) + assert result.exit_code == 0 + assert "[" in result.output # Valid JSON array + + +class TestAlertAccessPermissions: + """E2E tests for alert access permission scenarios.""" + + def test_alert_get_nonexistent_returns_none(self, api_key: str, host: str) -> None: + """Test that getting non-existent alert returns gracefully.""" + from cetus.client import CetusClient + + client = 
CetusClient(api_key=api_key, host=host, timeout=60) + try: + # Very high ID that shouldn't exist + alert = client.get_alert(99999999) + assert alert is None + finally: + client.close() + + def test_cli_backtest_nonexistent_alert(self, api_key: str, host: str) -> None: + """Test CLI backtest with non-existent alert ID.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "alerts", + "backtest", + "99999999", + "--since-days", + "1", + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should fail with clear error + assert result.exit_code != 0 + assert "not found" in result.output.lower() + + +class TestOutputPrefixFormats: + """E2E tests for output prefix mode with different formats.""" + + DATA_QUERY = "host:microsoft.com" + + def test_output_prefix_json_format(self, api_key: str, host: str, tmp_path) -> None: + """Test -p with --format json creates JSON file.""" + from click.testing import CliRunner + + from cetus.cli import main + + prefix = str(tmp_path / "results") + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + self.DATA_QUERY, + "--index", + "dns", + "--since-days", + "1", + "--format", + "json", + "-p", + prefix, + "--no-marker", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + + # Should have created a timestamped JSON file + files = list(tmp_path.glob("results_*.json")) + assert len(files) == 1 + assert files[0].stat().st_size > 0 + + # Verify it's valid JSON + import json + + content = files[0].read_text() + data = json.loads(content) + assert isinstance(data, list) + + def test_output_prefix_csv_format(self, api_key: str, host: str, tmp_path) -> None: + """Test -p with --format csv creates CSV file.""" + from click.testing import CliRunner + + from cetus.cli import main + + prefix = str(tmp_path / "results") + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + self.DATA_QUERY, 
+ "--index", + "dns", + "--since-days", + "1", + "--format", + "csv", + "-p", + prefix, + "--no-marker", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + + # Should have created a timestamped CSV file + files = list(tmp_path.glob("results_*.csv")) + assert len(files) == 1 + assert files[0].stat().st_size > 0 + + # Verify it's valid CSV with header + content = files[0].read_text() + lines = content.strip().split("\n") + assert len(lines) >= 2 # Header + at least one data row + assert "uuid" in lines[0] or "host" in lines[0] + + +class TestStreamingTableWarning: + """E2E tests for streaming with table format warning.""" + + def test_streaming_table_shows_warning(self, api_key: str, host: str) -> None: + """Test that --stream with --format table shows buffering warning.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + "host:microsoft.com", + "--index", + "dns", + "--since-days", + "1", + "--stream", + "--format", + "table", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + # Should show warning about buffering + assert "warning" in result.output.lower() + assert "buffer" in result.output.lower() + + +class TestBacktestWithDifferentIndices: + """E2E tests for backtest with certstream and alerting indices.""" + + def test_backtest_certstream_index(self, api_key: str, host: str) -> None: + """Test backtest command with --index certstream.""" + from click.testing import CliRunner + + from cetus.cli import main + + # First get an alert ID + runner = CliRunner() + list_result = runner.invoke( + main, + ["alerts", "list", "--owned", "--api-key", api_key, "--host", host], + ) + if "No alerts" in list_result.output: + pytest.skip("No owned alerts to test with") + + import re + + match = re.search(r"[│|]\s*(\d+)\s*[│|]", list_result.output) + if not match: + pytest.skip("Could not parse alert ID") + + 
alert_id = match.group(1) + + # Test backtest with certstream index + result = runner.invoke( + main, + [ + "alerts", + "backtest", + alert_id, + "--index", + "certstream", + "--since-days", + "1", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + # Should return valid JSON (may be empty array) + assert "[" in result.output + + def test_backtest_alerting_index(self, api_key: str, host: str) -> None: + """Test backtest command with --index alerting.""" + from click.testing import CliRunner + + from cetus.cli import main + + # First get an alert ID + runner = CliRunner() + list_result = runner.invoke( + main, + ["alerts", "list", "--owned", "--api-key", api_key, "--host", host], + ) + if "No alerts" in list_result.output: + pytest.skip("No owned alerts to test with") + + import re + + match = re.search(r"[│|]\s*(\d+)\s*[│|]", list_result.output) + if not match: + pytest.skip("Could not parse alert ID") + + alert_id = match.group(1) + + # Test backtest with alerting index + result = runner.invoke( + main, + [ + "alerts", + "backtest", + alert_id, + "--index", + "alerting", + "--since-days", + "1", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + # Should return valid JSON (may be empty array) + assert "[" in result.output + + +class TestBacktestStructuredAlerts: + """E2E tests for backtest with structured (DSL) alerts. + + Structured alerts use Elasticsearch DSL (JSON) queries instead of Lucene. + These require special handling because time filters cannot be concatenated + as Lucene strings - they must be incorporated into the DSL structure. + """ + + def test_backtest_structured_alert(self, api_key: str, host: str) -> None: + """Test backtest with a structured (DSL) alert. + + Structured alerts have queries like: + {"bool": {"must": [{"prefix": {"host": "bloomberg"}}]}} + + The client must handle these without breaking the JSON structure. 
+ """ + import json + + from click.testing import CliRunner + + from cetus.cli import main + + # Get the list of alerts in JSON format to find a structured one + runner = CliRunner() + list_result = runner.invoke( + main, + ["alerts", "list", "--owned", "--format", "json", "--api-key", api_key, "--host", host], + ) + + if list_result.exit_code != 0: + pytest.skip(f"Could not list alerts: {list_result.output}") + + alerts = json.loads(list_result.output) + + # Find a structured alert + structured_alerts = [a for a in alerts if a.get("type") == "structured"] + if not structured_alerts: + pytest.skip("No structured alerts available to test") + + alert_id = str(structured_alerts[0]["id"]) + + # Test backtest with the structured alert + result = runner.invoke( + main, + [ + "alerts", + "backtest", + alert_id, + "--index", + "dns", + "--since-days", + "1", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + + # Should succeed without "Invalid query syntax" error + # (may return empty array if no matches) + assert result.exit_code == 0, f"Backtest failed: {result.output}" + assert "Invalid query syntax" not in result.output + assert "[" in result.output # Valid JSON array + + +class TestUnicodeOutputHandling: + """E2E tests for Unicode/emoji handling in output. + + These tests verify that Unicode characters (including emoji) are + handled correctly on all platforms, particularly Windows where + the default console encoding is cp1252. + """ + + def test_table_format_handles_unicode(self, api_key: str, host: str) -> None: + """Test that table format handles Unicode characters without crashing. + + This test verifies the fix for Windows cp1252 encoding issues + where emoji/Unicode characters would cause 'charmap' codec errors. 
+ """ + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + # Query data that may contain Unicode (fingerprints can have emoji) + result = runner.invoke( + main, + [ + "query", + "host:*.google.com", + "--index", + "dns", + "--since-days", + "1", + "--format", + "table", + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should succeed without encoding error + assert result.exit_code == 0 + # Should have table formatting + assert "│" in result.output or "|" in result.output + + def test_streaming_table_handles_unicode(self, api_key: str, host: str) -> None: + """Test that streaming table format handles Unicode without crashing.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + "host:*.google.com", + "--index", + "dns", + "--since-days", + "1", + "--stream", + "--format", + "table", + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should succeed without encoding error + assert result.exit_code == 0 + + +class TestMarkersModeSeparation: + """E2E tests for marker mode separation between -o and -p modes.""" + + DATA_QUERY = "host:microsoft.com" + + def test_output_and_prefix_have_separate_markers( + self, api_key: str, host: str, tmp_path + ) -> None: + """Test that -o and -p modes maintain separate markers. + + Running a query with -o should not affect markers for -p mode, + and vice versa. This allows users to run both modes independently. 
+ """ + from click.testing import CliRunner + + from cetus.cli import main + + output_file = tmp_path / "output.jsonl" + prefix = str(tmp_path / "prefix") + markers_dir = tmp_path / "markers" + markers_dir.mkdir(exist_ok=True) + + runner = CliRunner(env={"CETUS_DATA_DIR": str(tmp_path)}) + + # Run with -o to create file mode marker + result1 = runner.invoke( + main, + [ + "query", + self.DATA_QUERY, + "--index", + "dns", + "--since-days", + "1", + "--format", + "jsonl", + "-o", + str(output_file), + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result1.exit_code == 0 + + # Run with -p to create prefix mode marker + result2 = runner.invoke( + main, + [ + "query", + self.DATA_QUERY, + "--index", + "dns", + "--since-days", + "1", + "--format", + "jsonl", + "-p", + prefix, + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result2.exit_code == 0 + + # Should have two marker files (one for each mode) + marker_files = list(markers_dir.glob("*.json")) + # May have 1 or 2 depending on timing, but should work + assert len(marker_files) >= 1 + + +class TestAlertResultsSinceFilter: + """E2E tests for alerts results --since filter.""" + + def test_alerts_results_with_since_filter(self, api_key: str, host: str) -> None: + """Test alerts results with --since timestamp filter.""" + from click.testing import CliRunner + + from cetus.cli import main + + # First get an alert ID + runner = CliRunner() + list_result = runner.invoke( + main, + ["alerts", "list", "--owned", "--api-key", api_key, "--host", host], + ) + if "No alerts" in list_result.output: + pytest.skip("No owned alerts to test with") + + # Extract first alert ID from table output + import re + + match = re.search(r"[│|]\s*(\d+)\s*[│|]", list_result.output) + if not match: + pytest.skip("Could not parse alert ID") + + alert_id = match.group(1) + + # Test with --since filter (past timestamp) + result = runner.invoke( + main, + [ + "alerts", + "results", + alert_id, + "--since", + 
"2025-01-01T00:00:00Z", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should succeed (may return empty array or results) + assert result.exit_code == 0 + assert "[" in result.output or "No results" in result.output + + def test_alerts_results_since_invalid_format(self, api_key: str, host: str) -> None: + """Test alerts results with invalid --since timestamp format.""" + from click.testing import CliRunner + + from cetus.cli import main + + # First get an alert ID + runner = CliRunner() + list_result = runner.invoke( + main, + ["alerts", "list", "--owned", "--api-key", api_key, "--host", host], + ) + if "No alerts" in list_result.output: + pytest.skip("No owned alerts to test with") + + # Extract first alert ID from table output + import re + + match = re.search(r"[│|]\s*(\d+)\s*[│|]", list_result.output) + if not match: + pytest.skip("Could not parse alert ID") + + alert_id = match.group(1) + + # Test with invalid --since format + result = runner.invoke( + main, + [ + "alerts", + "results", + alert_id, + "--since", + "not-a-timestamp", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should either fail or handle gracefully + # Server may return error or empty results depending on implementation + assert result.exit_code in (0, 1) + + +class TestVerboseModeStreaming: + """E2E tests for verbose mode with streaming queries.""" + + DATA_QUERY = "host:microsoft.com" + + def test_verbose_streaming_shows_debug_output(self, api_key: str, host: str) -> None: + """Test that verbose mode with streaming shows debug information.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "-v", # Verbose flag + "query", + self.DATA_QUERY, + "--index", + "dns", + "--since-days", + "1", + "--stream", + "--format", + "jsonl", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + # Should show streaming 
indicator + assert "Streaming" in result.output or "stream" in result.output.lower() + + +class TestLargeSinceDaysValues: + """E2E tests for very large since-days values.""" + + DATA_QUERY = "host:microsoft.com" + + def test_since_days_365(self, api_key: str, host: str) -> None: + """Test query with since-days=365 (one year lookback).""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + self.DATA_QUERY, + "--index", + "dns", + "--since-days", + "365", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should succeed (large lookback is valid) + assert result.exit_code == 0 + assert "[" in result.output # Valid JSON array + + +class TestStreamingWithNoMarker: + """E2E tests for streaming mode with --no-marker flag.""" + + DATA_QUERY = "host:microsoft.com" + + def test_streaming_no_marker_to_stdout(self, api_key: str, host: str) -> None: + """Test streaming with --no-marker outputs to stdout correctly.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + self.DATA_QUERY, + "--index", + "dns", + "--since-days", + "1", + "--stream", + "--no-marker", + "--format", + "jsonl", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + # Should show streaming indicator and have JSONL output + assert "Streaming" in result.output or "stream" in result.output.lower() + + def test_streaming_no_marker_to_file(self, api_key: str, host: str, tmp_path) -> None: + """Test streaming with --no-marker writes to file without saving marker.""" + from click.testing import CliRunner + + from cetus.cli import main + + output_file = tmp_path / "results.jsonl" + markers_dir = tmp_path / "markers" + markers_dir.mkdir(exist_ok=True) + + runner = CliRunner(env={"CETUS_DATA_DIR": str(tmp_path)}) + result = runner.invoke( + main, + [ + "query", + 
self.DATA_QUERY, + "--index", + "dns", + "--since-days", + "1", + "--stream", + "--no-marker", + "-o", + str(output_file), + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + assert output_file.exists() + + # No marker should be saved + marker_files = list(markers_dir.glob("*.json")) + assert len(marker_files) == 0 + + +class TestMarkerSinceDaysInteraction: + """E2E tests for marker and since-days interaction. + + When a marker exists, since-days should be ignored and the query + should resume from the marker position. + """ + + DATA_QUERY = "host:microsoft.com" + + def test_marker_takes_precedence_over_since_days( + self, api_key: str, host: str, tmp_path + ) -> None: + """Test that marker timestamp takes precedence over --since-days.""" + from click.testing import CliRunner + + from cetus.cli import main + + output_file = tmp_path / "results.jsonl" + + runner = CliRunner(env={"CETUS_DATA_DIR": str(tmp_path)}) + + # First run with since-days=1 to create a marker + result1 = runner.invoke( + main, + [ + "query", + self.DATA_QUERY, + "--index", + "dns", + "--since-days", + "1", + "--format", + "jsonl", + "-o", + str(output_file), + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result1.exit_code == 0 + assert "Wrote" in result1.output + + # Second run with since-days=365 (much larger lookback) + # Should still use marker (recent timestamp), not the 365 day lookback + result2 = runner.invoke( + main, + [ + "query", + self.DATA_QUERY, + "--index", + "dns", + "--since-days", + "365", # This should be ignored + "--format", + "jsonl", + "-o", + str(output_file), + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result2.exit_code == 0 + # Should either append (if new data) or report no new records + # Key is that it doesn't re-fetch 365 days of data + assert "Appended" in result2.output or "No new records" in result2.output + + +class TestHelpTextCompleteness: + """E2E tests for help text completeness.""" + 
+ def test_all_query_options_documented(self) -> None: + """Test that query command help documents all options.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke(main, ["query", "--help"]) + assert result.exit_code == 0 + + # Check all options are documented + expected_options = [ + "--index", + "--media", + "--format", + "--output", + "--output-prefix", + "--since-days", + "--no-marker", + "--stream", + "--api-key", + "--host", + ] + for opt in expected_options: + assert opt in result.output, f"Option {opt} not documented in query help" + + def test_all_alerts_subcommands_documented(self) -> None: + """Test that alerts command documents all subcommands.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke(main, ["alerts", "--help"]) + assert result.exit_code == 0 + + # Check all subcommands are documented + expected_commands = ["list", "results", "backtest"] + for cmd in expected_commands: + assert cmd in result.output, f"Subcommand {cmd} not documented in alerts help" + + def test_alerts_results_options_documented(self) -> None: + """Test that alerts results help documents --since option.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke(main, ["alerts", "results", "--help"]) + assert result.exit_code == 0 + + assert "--since" in result.output + # Format hint may be wrapped across lines, so normalize whitespace + normalized = " ".join(result.output.split()) + assert "ISO 8601" in normalized # Format hint + + def test_alerts_list_format_option_documented(self) -> None: + """Test that alerts list help documents --format option.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke(main, ["alerts", "list", "--help"]) + assert result.exit_code == 0 + + assert "--format" in result.output + 
assert "json" in result.output + assert "csv" in result.output + assert "table" in result.output + + def test_alerts_backtest_output_prefix_documented(self) -> None: + """Test that alerts backtest help documents --output-prefix option.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke(main, ["alerts", "backtest", "--help"]) + assert result.exit_code == 0 + + assert "--output-prefix" in result.output + assert "-p" in result.output + + +class TestAlertsListFormats: + """E2E tests for alerts list --format option.""" + + def test_alerts_list_json_format(self, api_key: str, host: str) -> None: + """Test alerts list with --format json.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + ["alerts", "list", "--owned", "--format", "json", "--api-key", api_key, "--host", host], + ) + assert result.exit_code == 0, f"Command failed: {result.output}" + # Should be valid JSON array + import json + + data = json.loads(result.output) + assert isinstance(data, list) + if data: # If there are alerts + assert "id" in data[0] + assert "type" in data[0] + assert "title" in data[0] + + def test_alerts_list_jsonl_format(self, api_key: str, host: str) -> None: + """Test alerts list with --format jsonl.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "alerts", + "list", + "--owned", + "--format", + "jsonl", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + # Should be one JSON object per line + import json + + lines = [line for line in result.output.strip().split("\n") if line] + if lines: + for line in lines: + obj = json.loads(line) + assert "id" in obj + + def test_alerts_list_csv_format(self, api_key: str, host: str) -> None: + """Test alerts list with --format csv.""" + from click.testing import CliRunner + + 
from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + ["alerts", "list", "--owned", "--format", "csv", "--api-key", api_key, "--host", host], + ) + assert result.exit_code == 0 + # Should have header row + lines = result.output.strip().split("\n") + assert len(lines) >= 1 + assert "id" in lines[0].lower() + assert "type" in lines[0].lower() + + def test_alerts_list_to_file(self, api_key: str, host: str, tmp_path) -> None: + """Test alerts list with --output to file.""" + from click.testing import CliRunner + + from cetus.cli import main + + output_file = tmp_path / "alerts.json" + runner = CliRunner() + result = runner.invoke( + main, + [ + "alerts", + "list", + "--owned", + "--format", + "json", + "-o", + str(output_file), + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + assert output_file.exists() + + import json + + data = json.loads(output_file.read_text()) + assert isinstance(data, list) + + +class TestBacktestOutputPrefix: + """E2E tests for alerts backtest --output-prefix option.""" + + def test_backtest_output_prefix_creates_file(self, api_key: str, host: str, tmp_path) -> None: + """Test that backtest with --output-prefix creates timestamped file.""" + import json + + from click.testing import CliRunner + + from cetus.cli import main + + # First get an alert ID using JSON format (more reliable than parsing table) + runner = CliRunner() + list_result = runner.invoke( + main, + ["alerts", "list", "--owned", "--format", "json", "--api-key", api_key, "--host", host], + ) + if list_result.exit_code != 0: + pytest.skip(f"Could not list alerts: {list_result.output}") + + try: + alerts = json.loads(list_result.output) + except json.JSONDecodeError: + pytest.skip("Could not parse alerts JSON") + + if not alerts: + pytest.skip("No owned alerts to test with") + + alert_id = str(alerts[0]["id"]) + prefix = str(tmp_path / "backtest_results") + + result = runner.invoke( + main, + [ + "alerts", + 
"backtest", + alert_id, + "-p", + prefix, + "--since-days", + "1", + "--no-marker", # Use --no-marker for consistent test behavior + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should succeed (may have 0 results, but command should work) + assert result.exit_code == 0, f"Backtest failed: {result.output}" + + # Check that a timestamped file was created (or no file if no results) + files = list(tmp_path.glob("backtest_results_*.json")) + # Either file exists with data, or no file created (no results message) + if "No new records" not in result.output: + assert len(files) == 1 + assert files[0].name.startswith("backtest_results_") + + def test_backtest_output_and_prefix_mutually_exclusive( + self, api_key: str, host: str, tmp_path + ) -> None: + """Test that --output and --output-prefix are mutually exclusive.""" + from click.testing import CliRunner + + from cetus.cli import main + + output_file = tmp_path / "results.json" + prefix = str(tmp_path / "results") + + runner = CliRunner() + result = runner.invoke( + main, + [ + "alerts", + "backtest", + "1", + "-o", + str(output_file), + "-p", + prefix, + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 1 + assert "mutually exclusive" in result.output + + +class TestDSLQueries: + """E2E tests for Elasticsearch DSL (JSON) queries. + + The CLI can accept Elasticsearch DSL queries directly as JSON strings, + not just Lucene query syntax. This tests that code path. 
+ """ + + def test_dsl_query_via_cli(self, api_key: str, host: str) -> None: + """Test that DSL/JSON queries work via CLI.""" + from click.testing import CliRunner + + from cetus.cli import main + + # DSL query_string equivalent of "host:microsoft.com" + dsl_query = '{"query_string": {"query": "host:microsoft.com"}}' + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + dsl_query, + "--index", + "dns", + "--since-days", + "1", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + # Should have JSON array output + assert "[" in result.output + # Should contain data (microsoft.com is a common domain) + assert "uuid" in result.output or "[]" in result.output + + def test_dsl_query_via_api(self, api_key: str, host: str) -> None: + """Test DSL query through the client API.""" + from cetus.client import CetusClient + + # DSL query with bool must clause + dsl_query = '{"bool": {"must": [{"query_string": {"query": "host:microsoft.com"}}]}}' + + client = CetusClient(api_key=api_key, host=host, timeout=120) + try: + result = client.query( + search=dsl_query, + index="dns", + media="nvme", + since_days=1, + marker=None, + ) + assert result is not None + assert isinstance(result.data, list) + finally: + client.close() + + def test_dsl_query_streaming(self, api_key: str, host: str) -> None: + """Test DSL query with streaming mode.""" + from click.testing import CliRunner + + from cetus.cli import main + + dsl_query = '{"query_string": {"query": "host:microsoft.com"}}' + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + dsl_query, + "--index", + "dns", + "--since-days", + "1", + "--stream", + "--format", + "jsonl", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + + +class TestBacktestTermsAlerts: + """E2E tests for backtest with terms alerts. 
+ + Terms alerts expand to multiple term combinations which requires + different query handling than raw or structured alerts. + """ + + def test_backtest_terms_alert(self, api_key: str, host: str) -> None: + """Test backtest with a terms alert. + + Terms alerts have queries like: + host.raw:"*reuters.com" + + These expand to multiple term combinations when evaluated. + """ + import json + + from click.testing import CliRunner + + from cetus.cli import main + + # Get the list of alerts in JSON format to find a terms alert + runner = CliRunner() + list_result = runner.invoke( + main, + ["alerts", "list", "--owned", "--format", "json", "--api-key", api_key, "--host", host], + ) + + if list_result.exit_code != 0: + pytest.skip(f"Could not list alerts: {list_result.output}") + + alerts = json.loads(list_result.output) + + # Find a terms alert + terms_alerts = [a for a in alerts if a.get("type") == "terms"] + if not terms_alerts: + pytest.skip("No terms alerts available to test") + + alert_id = str(terms_alerts[0]["id"]) + + # Test backtest with the terms alert + result = runner.invoke( + main, + [ + "alerts", + "backtest", + alert_id, + "--index", + "dns", + "--since-days", + "1", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + + # Should succeed without error (may return empty array if no matches) + assert result.exit_code == 0, f"Backtest failed: {result.output}" + assert "Invalid query syntax" not in result.output + assert "[" in result.output # Valid JSON array + + +class TestAPIKeyMasking: + """E2E tests for API key security in verbose output. + + API keys should never be exposed in verbose/debug output. 
+ """ + + def test_verbose_mode_masks_api_key(self, api_key: str, host: str) -> None: + """Test that API key is not exposed in verbose output.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "-v", # Verbose mode + "query", + "host:microsoft.com", + "--index", + "dns", + "--since-days", + "1", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + + # API key should not appear in output + # The key might be partially shown (e.g., ***abc) but never in full + if len(api_key) > 8: + # Full key should never appear + assert api_key not in result.output + + +class TestMarkerFileCorruption: + """E2E tests for marker file corruption recovery. + + The client should handle corrupted marker files gracefully. + """ + + def test_corrupted_marker_file_recovery(self, api_key: str, host: str, tmp_path) -> None: + """Test that corrupted marker file is handled gracefully.""" + from click.testing import CliRunner + + from cetus.cli import main + + output_file = tmp_path / "results.jsonl" + markers_dir = tmp_path / "markers" + markers_dir.mkdir(exist_ok=True) + + # Create a corrupted marker file + corrupted_marker = markers_dir / "dns_test.json" + corrupted_marker.write_text("{ this is not valid json }") + + runner = CliRunner(env={"CETUS_DATA_DIR": str(tmp_path)}) + + # Query should still work (treating marker as invalid/missing) + result = runner.invoke( + main, + [ + "query", + "host:microsoft.com", + "--index", + "dns", + "--since-days", + "1", + "--format", + "jsonl", + "-o", + str(output_file), + "--api-key", + api_key, + "--host", + host, + ], + ) + + # Should succeed (ignore or reset corrupted marker) + # May show warning about invalid marker but shouldn't crash + assert result.exit_code == 0 + + +class TestUserAgentHeader: + """E2E tests for User-Agent header. + + The client should send a proper User-Agent header with version info. 
+ """ + + def test_user_agent_contains_version(self) -> None: + """Test that User-Agent header constant contains client version.""" + from cetus import __version__ + from cetus.client import USER_AGENT + + # Check that the USER_AGENT constant has correct format + assert "cetus-client" in USER_AGENT + assert __version__ in USER_AGENT + # Should also include Python version and platform + import platform + + assert platform.python_version() in USER_AGENT + assert platform.system() in USER_AGENT + + +class TestTimeoutBehavior: + """E2E tests for timeout behavior. + + The client should respect timeout settings and fail gracefully. + """ + + def test_very_short_timeout_fails_gracefully(self, api_key: str, host: str) -> None: + """Test that very short timeout produces a clean error.""" + from click.testing import CliRunner + + from cetus.cli import main + + # Use an extremely short timeout that will likely fail + runner = CliRunner(env={"CETUS_TIMEOUT": "0.001"}) + result = runner.invoke( + main, + [ + "query", + "host:microsoft.com", + "--index", + "dns", + "--since-days", + "1", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should fail with timeout or connection error (not crash) + # Could succeed if connection is very fast, so we just check it doesn't crash + assert result.exit_code in (0, 1) + # If it failed, should have clean error message + if result.exit_code == 1: + output_lower = result.output.lower() + assert "error" in output_lower or "timeout" in output_lower + + +class TestConfigEnvironmentVariables: + """E2E tests for environment variable configuration. + + Tests that all config environment variables work correctly. 
+ """ + + def test_cetus_since_days_env_var(self, api_key: str, host: str) -> None: + """Test that CETUS_SINCE_DAYS environment variable is respected.""" + from click.testing import CliRunner + + from cetus.cli import main + + # Set since-days via environment + runner = CliRunner(env={"CETUS_SINCE_DAYS": "3"}) + result = runner.invoke( + main, + [ + "-v", # Verbose to see the query + "query", + "host:microsoft.com", + "--index", + "dns", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should succeed and use the env var for since-days + assert result.exit_code == 0 + + +class TestQueryResultCount: + """E2E tests for query result count reporting. + + Tests that the CLI correctly reports the number of records returned. + """ + + def test_buffered_query_reports_count(self, api_key: str, host: str) -> None: + """Test that buffered query reports total record count.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + "host:microsoft.com", + "--index", + "dns", + "--since-days", + "1", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + # Should report count in output (e.g., "130 records in 2.5s") + assert "record" in result.output.lower() + + def test_file_output_reports_wrote_count(self, api_key: str, host: str, tmp_path) -> None: + """Test that file output reports 'Wrote X records'.""" + from click.testing import CliRunner + + from cetus.cli import main + + output_file = tmp_path / "results.jsonl" + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + "host:microsoft.com", + "--index", + "dns", + "--since-days", + "1", + "--format", + "jsonl", + "-o", + str(output_file), + "--no-marker", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + # Should report "Wrote X records to " + assert "Wrote" in result.output + assert 
"records" in result.output.lower() + + +class TestAlertResultsOutputFormats: + """E2E tests for alert results with different output formats to file. + + Verifies that alert results can be exported to CSV, JSONL, and JSON files. + """ + + def test_alert_results_csv_to_file(self, api_key: str, host: str, tmp_path) -> None: + """Test alert results exported to CSV file.""" + from click.testing import CliRunner + + from cetus.cli import main + + # First get an alert ID + runner = CliRunner() + list_result = runner.invoke( + main, + ["alerts", "list", "--owned", "--format", "json", "--api-key", api_key, "--host", host], + ) + if list_result.exit_code != 0 or list_result.output.strip() == "[]": + pytest.skip("No owned alerts to test with") + + import json + + alerts = json.loads(list_result.output) + if not alerts: + pytest.skip("No owned alerts to test with") + + alert_id = str(alerts[0]["id"]) + output_file = tmp_path / "results.csv" + + result = runner.invoke( + main, + [ + "alerts", + "results", + alert_id, + "--format", + "csv", + "-o", + str(output_file), + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should succeed even if no results (writes empty file or reports no results) + assert result.exit_code == 0 + + def test_alert_results_jsonl_to_file(self, api_key: str, host: str, tmp_path) -> None: + """Test alert results exported to JSONL file.""" + from click.testing import CliRunner + + from cetus.cli import main + + # First get an alert ID + runner = CliRunner() + list_result = runner.invoke( + main, + ["alerts", "list", "--owned", "--format", "json", "--api-key", api_key, "--host", host], + ) + if list_result.exit_code != 0 or list_result.output.strip() == "[]": + pytest.skip("No owned alerts to test with") + + import json + + alerts = json.loads(list_result.output) + if not alerts: + pytest.skip("No owned alerts to test with") + + alert_id = str(alerts[0]["id"]) + output_file = tmp_path / "results.jsonl" + + result = runner.invoke( + main, + [ + 
"alerts", + "results", + alert_id, + "--format", + "jsonl", + "-o", + str(output_file), + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + + +class TestStreamingAlertingIndex: + """E2E tests for streaming queries on alerting index. + + Verifies streaming mode works correctly with the alerting index, + which may have different data patterns than dns/certstream. + """ + + def test_streaming_alerting_index_works(self, api_key: str, host: str) -> None: + """Test streaming query on alerting index.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + "uuid:*", # Match any UUID in alerting index + "--index", + "alerting", + "--since-days", + "7", + "--stream", + "--format", + "jsonl", + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should complete without error (may have 0 results) + assert result.exit_code == 0 + assert "Streaming" in result.output or "Streamed" in result.output + + def test_streaming_alerting_index_to_file(self, api_key: str, host: str, tmp_path) -> None: + """Test streaming query on alerting index with file output.""" + from click.testing import CliRunner + + from cetus.cli import main + + output_file = tmp_path / "alerting_results.jsonl" + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + "uuid:*", + "--index", + "alerting", + "--since-days", + "7", + "--stream", + "-o", + str(output_file), + "--no-marker", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + # File may or may not exist depending on results + + +class TestConfigFileCorruption: + """E2E tests for config file corruption recovery. + + Verifies that corrupted or invalid config files are handled gracefully. 
+ """ + + def test_malformed_config_toml_handled_gracefully(self, tmp_path) -> None: + """Test that malformed config.toml produces clear error, not traceback.""" + from unittest.mock import patch + + from click.testing import CliRunner + + from cetus.cli import main + + config_dir = tmp_path / "config" + config_dir.mkdir() + + # Create a malformed TOML file + config_file = config_dir / "config.toml" + config_file.write_text("this is not [valid toml\napi_key = ") + + with patch("cetus.config.get_config_dir", return_value=config_dir): + runner = CliRunner() + result = runner.invoke(main, ["config", "show"]) + + # Should either handle gracefully or show clean error (not Python traceback) + output_lower = result.output.lower() + assert "traceback" not in output_lower + + def test_empty_config_file_handled(self, tmp_path) -> None: + """Test that empty config file is handled gracefully.""" + from unittest.mock import patch + + from click.testing import CliRunner + + from cetus.cli import main + + config_dir = tmp_path / "config" + config_dir.mkdir() + + # Create an empty config file + config_file = config_dir / "config.toml" + config_file.write_text("") + + with patch("cetus.config.get_config_dir", return_value=config_dir): + runner = CliRunner() + result = runner.invoke(main, ["config", "show"]) + + # Should work (use defaults) + assert result.exit_code == 0 + # Should show default host + assert "alerting.sparkits.ca" in result.output or "host" in result.output.lower() + + +class TestAlertsListToFile: + """E2E tests for alerts list output to file. + + Verifies that alerts list can be exported to files in various formats. 
+ """ + + def test_alerts_list_csv_to_file(self, api_key: str, host: str, tmp_path) -> None: + """Test alerts list exported to CSV file.""" + from click.testing import CliRunner + + from cetus.cli import main + + output_file = tmp_path / "alerts.csv" + + runner = CliRunner() + result = runner.invoke( + main, + [ + "alerts", + "list", + "--format", + "csv", + "-o", + str(output_file), + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + + if output_file.exists(): + content = output_file.read_text() + # CSV should have header row + assert "id" in content.lower() or "type" in content.lower() + + def test_alerts_list_jsonl_to_file(self, api_key: str, host: str, tmp_path) -> None: + """Test alerts list exported to JSONL file.""" + from click.testing import CliRunner + + from cetus.cli import main + + output_file = tmp_path / "alerts.jsonl" + + runner = CliRunner() + result = runner.invoke( + main, + [ + "alerts", + "list", + "--format", + "jsonl", + "-o", + str(output_file), + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + + if output_file.exists(): + content = output_file.read_text() + # Each line should be valid JSON + import json + + lines = [line for line in content.strip().split("\n") if line] + for line in lines[:3]: # Check first few lines + obj = json.loads(line) + assert "id" in obj or "type" in obj + + +class TestOutputPrefixWithNoMarker: + """E2E tests for --output-prefix combined with --no-marker. + + Verifies that -p and --no-marker work correctly together. 
+ """ + + def test_output_prefix_with_no_marker_creates_file( + self, api_key: str, host: str, tmp_path + ) -> None: + """Test that -p with --no-marker creates timestamped file without marker.""" + from click.testing import CliRunner + + from cetus.cli import main + + prefix = str(tmp_path / "results") + + runner = CliRunner(env={"CETUS_DATA_DIR": str(tmp_path)}) + result = runner.invoke( + main, + [ + "query", + "host:microsoft.com", + "--index", + "dns", + "--since-days", + "1", + "-p", + prefix, + "--format", + "jsonl", + "--no-marker", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + + # Should create timestamped file + files = list(tmp_path.glob("results_*.jsonl")) + assert len(files) == 1 + + # Should NOT create marker (because --no-marker) + marker_files = list(tmp_path.glob("markers/*.json")) + assert len(marker_files) == 0 + + +class TestStreamingMediaAll: + """E2E tests for streaming with --media all option. + + The --media all option routes to all storage tiers, not just NVMe. + This can return more results but takes longer. 
+ """ + + @pytest.mark.skip(reason="--media all is slow, skip by default") + def test_streaming_media_all(self, api_key: str, host: str) -> None: + """Test streaming query with --media all option.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + "host:microsoft.com", + "--index", + "dns", + "--since-days", + "1", + "--media", + "all", + "--stream", + "--format", + "jsonl", + "--api-key", + api_key, + "--host", + host, + ], + catch_exceptions=False, + ) + # Should complete (may take longer than nvme) + assert result.exit_code == 0 + assert "Streaming" in result.output or "Streamed" in result.output + + +class TestBacktestStreamingWithOutputPrefix: + """E2E tests for backtest with streaming and output prefix combined.""" + + def test_backtest_streaming_output_prefix(self, api_key: str, host: str, tmp_path) -> None: + """Test backtest with --stream and -p options together.""" + from click.testing import CliRunner + + from cetus.cli import main + + # First get an alert ID + runner = CliRunner() + list_result = runner.invoke( + main, + ["alerts", "list", "--owned", "--format", "json", "--api-key", api_key, "--host", host], + ) + if list_result.exit_code != 0: + pytest.skip("Could not list alerts") + + import json + + try: + alerts = json.loads(list_result.output) + except json.JSONDecodeError: + pytest.skip("Could not parse alerts list") + + if not alerts: + pytest.skip("No owned alerts to test with") + + alert_id = str(alerts[0]["id"]) + prefix = str(tmp_path / "backtest") + + result = runner.invoke( + main, + [ + "alerts", + "backtest", + alert_id, + "--index", + "dns", + "--since-days", + "1", + "--stream", + "-p", + prefix, + "--format", + "jsonl", + "--no-marker", + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result.exit_code == 0 + # May or may not create file depending on results + # But should not crash + + +class 
TestTableFormatIncrementalAppendWarning: + """E2E tests for table format warning when used with incremental mode.""" + + DATA_QUERY = "host:microsoft.com" + + def test_table_format_append_shows_warning(self, api_key: str, host: str, tmp_path) -> None: + """Test that table format with existing file shows cannot-append warning. + + Table format cannot truly append to an existing file (Rich tables + require full content to calculate column widths). When used in + incremental mode with an existing file, a warning should be shown. + """ + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + output_file = tmp_path / "results.txt" + + # First run - create initial file with table format + # Use longer lookback to ensure we get results + result1 = runner.invoke( + main, + [ + "query", + self.DATA_QUERY, + "--index", + "dns", + "--since-days", + "7", + "-o", + str(output_file), + "--format", + "table", + "--no-marker", # Don't save marker, we'll test append behavior manually + "--api-key", + api_key, + "--host", + host, + ], + ) + assert result1.exit_code == 0 + + # If no results were found, we can't test the append warning + if not output_file.exists() or output_file.stat().st_size == 0: + pytest.skip("No results found to test table append warning") + + initial_size = output_file.stat().st_size + + # Create a marker file manually to trigger incremental mode + from cetus.markers import MarkerStore + + marker_store = MarkerStore() + # Save marker from a past timestamp to ensure second run has "new" data + marker_store.save( + query=self.DATA_QUERY, + index="dns", + last_timestamp="2020-01-01T00:00:00Z", + last_uuid="test-uuid", + mode="file", + ) + + # Second run - incremental mode with marker, existing file should trigger warning + result2 = runner.invoke( + main, + [ + "query", + self.DATA_QUERY, + "--index", + "dns", + "--since-days", + "7", + "-o", + str(output_file), + "--format", + "table", + "--api-key", + api_key, + "--host", + 
host, + ], + ) + assert result2.exit_code == 0 + + # Should show the warning about table format not being able to append + # Note: The warning appears in stderr which Click captures in output + assert "table format cannot append" in result2.output or "Warning" in result2.output + + # Clean up marker + marker_store.clear("dns") + + +class TestBacktestVerboseMode: + """E2E tests for backtest command verbose output.""" + + def test_backtest_verbose_shows_alert_details(self, api_key: str, host: str) -> None: + """Test that verbose mode with backtest shows alert title and query. + + When running backtest with -v flag, it should display: + - The alert title + - The query being executed + """ + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + + # First get an alert ID + list_result = runner.invoke( + main, + ["alerts", "list", "--owned", "--format", "json", "--api-key", api_key, "--host", host], + ) + if list_result.exit_code != 0: + pytest.skip("Could not list alerts") + + import json + + try: + alerts = json.loads(list_result.output) + except json.JSONDecodeError: + pytest.skip("Could not parse alerts list") + + if not alerts: + pytest.skip("No owned alerts to test with") + + # Find an alert with a query + alert = None + for a in alerts: + if a.get("query_preview"): + alert = a + break + + if not alert: + pytest.skip("No alerts with query_preview found") + + alert_id = str(alert["id"]) + + # Run backtest with verbose mode + result = runner.invoke( + main, + [ + "-v", # Verbose flag before subcommand + "alerts", + "backtest", + alert_id, + "--index", + "dns", + "--since-days", + "1", + "--api-key", + api_key, + "--host", + host, + ], + ) + + # Should succeed + assert result.exit_code == 0 + + # Verbose output should show alert details + # The exact format is: "Backtesting alert: {title}" and "Query: {query}" + assert "Backtesting alert" in result.output or "Query:" in result.output + + +class TestSharedAlertOperations: + 
"""E2E tests for operations on shared alerts. + + Tests that users can access results and backtest alerts that have been + shared with them but which they don't own. + """ + + def test_alert_results_for_shared_alert(self, api_key: str, host: str) -> None: + """Test that alert results can be retrieved for a shared alert.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + + # First, find a shared alert + list_result = runner.invoke( + main, + [ + "alerts", + "list", + "--shared", + "--no-owned", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + if list_result.exit_code != 0: + pytest.skip("Could not list shared alerts") + + import json + + try: + alerts = json.loads(list_result.output) + except json.JSONDecodeError: + pytest.skip("Could not parse shared alerts list") + + if not alerts: + pytest.skip("No shared alerts to test with") + + alert_id = str(alerts[0]["id"]) + + # Get results for the shared alert + result = runner.invoke( + main, + [ + "alerts", + "results", + alert_id, + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + + # Should succeed (even if no results, should not error on permissions) + assert result.exit_code == 0 + # Output should be valid JSON (empty array or array of results) + output = result.output.strip() + if output: + # Remove any status messages before JSON + if "[" in output: + json_start = output.index("[") + json_output = output[json_start:] + data = json.loads(json_output) + assert isinstance(data, list) + + def test_backtest_shared_alert_access(self, api_key: str, host: str) -> None: + """Test that backtest works on alerts shared with the user.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + + # First, find a shared alert + list_result = runner.invoke( + main, + [ + "alerts", + "list", + "--shared", + "--no-owned", + "--format", + "json", + "--api-key", + api_key, + "--host", + 
host, + ], + ) + if list_result.exit_code != 0: + pytest.skip("Could not list shared alerts") + + import json + + try: + alerts = json.loads(list_result.output) + except json.JSONDecodeError: + pytest.skip("Could not parse shared alerts list") + + if not alerts: + pytest.skip("No shared alerts to test with") + + alert_id = str(alerts[0]["id"]) + + # Run backtest with a very short time window to avoid timeout + result = runner.invoke( + main, + [ + "alerts", + "backtest", + alert_id, + "--index", + "dns", + "--since-days", + "1", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + catch_exceptions=False, + ) + + # Should either succeed or timeout - but not fail with permission error + # We check that it doesn't fail with "permission" or "not found" errors + output_lower = result.output.lower() + assert "permission denied" not in output_lower + assert "not allowed" not in output_lower + # Note: timeout is acceptable for complex shared alerts + + +class TestConfigForwardCompatibility: + """E2E tests for config forward compatibility. + + Tests that unknown keys in config files are handled gracefully, + allowing older clients to work with config files from newer versions. 
+ """ + + def test_config_with_unknown_keys_handled_gracefully(self, tmp_path) -> None: + """Test that config files with unknown keys don't cause errors.""" + from unittest.mock import patch + + from click.testing import CliRunner + + from cetus.cli import main + + config_dir = tmp_path / "config" + config_dir.mkdir() + + # Create a config file with unknown keys (simulating future version) + config_file = config_dir / "config.toml" + config_file.write_text( + 'api_key = "test-key"\n' + 'host = "custom.example.com"\n' + "timeout = 30\n" + "since_days = 14\n" + "# Unknown keys from future version\n" + 'new_feature_flag = true\n' + 'experimental_mode = "beta"\n' + "max_retries = 5\n" + ) + + with patch("cetus.config.get_config_dir", return_value=config_dir): + runner = CliRunner() + result = runner.invoke(main, ["config", "show"]) + + # Should work without error + assert result.exit_code == 0 + # Should show the known settings + assert "custom.example.com" in result.output or "host" in result.output.lower() + # Should not show Python traceback + assert "Traceback" not in result.output + + def test_config_with_empty_values(self, tmp_path) -> None: + """Test that config files with empty string values are handled.""" + from unittest.mock import patch + + from click.testing import CliRunner + + from cetus.cli import main + + config_dir = tmp_path / "config" + config_dir.mkdir() + + # Create a config file with empty values + config_file = config_dir / "config.toml" + config_file.write_text( + 'api_key = ""\n' # Empty API key + 'host = ""\n' # Empty host + ) + + with patch("cetus.config.get_config_dir", return_value=config_dir): + runner = CliRunner() + result = runner.invoke(main, ["config", "show"]) + + # Should handle gracefully - either use defaults or show empty + # Main thing is no crash/traceback + assert "Traceback" not in result.output + + +class TestQueryLuceneOperators: + """E2E tests for complex Lucene query syntax. 
+ + Tests that various Lucene operators are handled correctly. + """ + + DATA_QUERY = "host:microsoft.com" + + def test_query_with_and_operator(self, api_key: str, host: str) -> None: + """Test query with explicit AND operator.""" + from cetus.client import CetusClient + + client = CetusClient(api_key=api_key, host=host, timeout=60) + try: + result = client.query( + search="host:microsoft.com AND A:*", + index="dns", + media="nvme", + since_days=1, + marker=None, + ) + assert result is not None + assert isinstance(result.data, list) + finally: + client.close() + + def test_query_with_or_operator(self, api_key: str, host: str) -> None: + """Test query with OR operator.""" + from cetus.client import CetusClient + + client = CetusClient(api_key=api_key, host=host, timeout=60) + try: + result = client.query( + search="host:microsoft.com OR host:google.com", + index="dns", + media="nvme", + since_days=1, + marker=None, + ) + assert result is not None + assert isinstance(result.data, list) + finally: + client.close() + + def test_query_with_not_operator(self, api_key: str, host: str) -> None: + """Test query with NOT operator.""" + from cetus.client import CetusClient + + client = CetusClient(api_key=api_key, host=host, timeout=60) + try: + result = client.query( + search="host:microsoft.com AND NOT A:1.1.1.1", + index="dns", + media="nvme", + since_days=1, + marker=None, + ) + assert result is not None + assert isinstance(result.data, list) + finally: + client.close() + + def test_query_with_wildcard_suffix(self, api_key: str, host: str) -> None: + """Test query with wildcard suffix match (trailing wildcard is fast).""" + from cetus.client import CetusClient + + # Note: Leading wildcards (*.example.com) are slow as they scan all data. + # Trailing wildcards (example.*) use the index efficiently. 
+ client = CetusClient(api_key=api_key, host=host, timeout=60) + try: + result = client.query( + search="host:microsoft.*", + index="dns", + media="nvme", + since_days=1, + marker=None, + ) + assert result is not None + assert isinstance(result.data, list) + finally: + client.close() + + def test_query_with_quoted_phrase(self, api_key: str, host: str) -> None: + """Test query with quoted exact phrase.""" + from click.testing import CliRunner + + from cetus.cli import main + + runner = CliRunner() + result = runner.invoke( + main, + [ + "query", + 'host:"www.microsoft.com"', + "--index", + "dns", + "--since-days", + "3", + "--format", + "json", + "--api-key", + api_key, + "--host", + host, + ], + ) + # Should execute without error + assert result.exit_code == 0 + + def test_query_with_field_grouping(self, api_key: str, host: str) -> None: + """Test query with field grouping using parentheses.""" + from cetus.client import CetusClient + + client = CetusClient(api_key=api_key, host=host, timeout=60) + try: + result = client.query( + search="(host:microsoft.com OR host:azure.com) AND A:*", + index="dns", + media="nvme", + since_days=1, + marker=None, + ) + assert result is not None + assert isinstance(result.data, list) + finally: + client.close() diff --git a/tests/test_formatters.py b/tests/test_formatters.py index 5b99146..d147765 100644 --- a/tests/test_formatters.py +++ b/tests/test_formatters.py @@ -73,9 +73,7 @@ def test_format_with_custom_indent(self, sample_data: list[dict]): result = formatter.format(sample_data) assert " " in result # 4-space indent - def test_format_stream_writes_to_file( - self, formatter: JSONFormatter, sample_data: list[dict] - ): + def test_format_stream_writes_to_file(self, formatter: JSONFormatter, sample_data: list[dict]): """format_stream should write JSON to file object.""" output = io.StringIO() count = formatter.format_stream(sample_data, output) @@ -359,10 +357,20 @@ def test_truncate_short_string(self, formatter: 
TableFormatter): def test_limits_columns(self, formatter: TableFormatter): """Table should limit number of columns.""" - data = [{ - "f1": 1, "f2": 2, "f3": 3, "f4": 4, "f5": 5, - "f6": 6, "f7": 7, "f8": 8, "f9": 9, "f10": 10, - }] + data = [ + { + "f1": 1, + "f2": 2, + "f3": 3, + "f4": 4, + "f5": 5, + "f6": 6, + "f7": 7, + "f8": 8, + "f9": 9, + "f10": 10, + } + ] result = formatter.format(data) # Should only show up to 8 columns # Count occurrences of column values is tricky, but format should work diff --git a/tests/test_security.py b/tests/test_security.py index 1c756a8..010a616 100644 --- a/tests/test_security.py +++ b/tests/test_security.py @@ -12,15 +12,11 @@ from __future__ import annotations -import json import logging -import stat import sys import time from pathlib import Path -from unittest.mock import patch -import httpx import pytest from cetus.client import ( @@ -30,7 +26,7 @@ VALID_MEDIA, CetusClient, ) -from cetus.config import Config, _set_secure_permissions, get_config_file +from cetus.config import Config, _set_secure_permissions from cetus.exceptions import APIError, ConfigurationError from cetus.markers import MAX_MARKER_FILE_SIZE, MarkerStore, _query_hash @@ -202,7 +198,7 @@ def test_normal_size_marker_file_works(self, tmp_path: Path): store = MarkerStore(markers_dir=markers_dir) # Save a normal marker - marker = store.save("test", "dns", "2025-01-01T00:00:00Z", "uuid") + store.save("test", "dns", "2025-01-01T00:00:00Z", "uuid") # Should be retrievable result = store.get("test", "dns")