diff --git a/README.md b/README.md
index d0bcf87..ef1d9d6 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,13 @@
+
+
# Cordon
[](https://pypi.org/project/cordon/)
-[](https://github.com/calebevans/cordon/blob/main/LICENSE)
[](https://pepy.tech/projects/cordon)
+[](https://sonarcloud.io/summary/new_code?id=calebevans_cordon)
+[](https://sonarcloud.io/summary/new_code?id=calebevans_cordon)
+
+
Cordon uses transformer embeddings and density scoring to identify semantically unusual patterns in large log files, reducing massive logs to the most anomalous sections for analysis. Repetitive patterns (even errors) are considered "normal background." Cordon surfaces unusual, rare, or clustered events that stand out semantically from the bulk of the logs.
diff --git a/src/cordon/analysis/thresholder.py b/src/cordon/analysis/thresholder.py
index bd153d6..bd8b218 100644
--- a/src/cordon/analysis/thresholder.py
+++ b/src/cordon/analysis/thresholder.py
@@ -1,4 +1,5 @@
from collections.abc import Sequence
+from math import isclose
import numpy as np
@@ -59,11 +60,11 @@ def select_significant(
# Single percentile mode (original behavior)
# all windows, sorted by score descending
- if config.anomaly_percentile == 1.0:
+ if isclose(config.anomaly_percentile, 1.0):
return sorted(scored_windows, key=lambda window: window.score, reverse=True)
# no windows requested
- if config.anomaly_percentile == 0.0:
+    if isclose(config.anomaly_percentile, 0.0, abs_tol=1e-9):
return []
# calculate percentile threshold
diff --git a/src/cordon/cli.py b/src/cordon/cli.py
index c1ec002..20870e0 100644
--- a/src/cordon/cli.py
+++ b/src/cordon/cli.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python3
import argparse
import sys
+from math import isclose
from pathlib import Path
from cordon import AnalysisConfig, SemanticLogAnalyzer
@@ -207,6 +208,36 @@ def analyze_file(
print()
+def _print_backend_info(config: AnalysisConfig) -> None:
+ """Print backend configuration details."""
+ print(f"Backend: {config.backend}")
+ if config.backend == "sentence-transformers":
+ print(f"Model: {config.model_name}")
+ print(f"Device: {config.device or 'auto'}")
+ elif config.backend == "llama-cpp":
+ print(f"Model path: {config.model_path}")
+ print(f"GPU layers: {config.n_gpu_layers}")
+ if config.n_threads:
+ print(f"Threads: {config.n_threads}")
+ elif config.backend == "remote":
+ print(f"Model: {config.model_name}")
+ if config.endpoint:
+ print(f"Endpoint: {config.endpoint}")
+ print(f"Timeout: {config.request_timeout}s")
+
+
+def _print_filtering_mode(config: AnalysisConfig) -> None:
+ """Print filtering mode configuration."""
+ if config.anomaly_range_min is not None:
+ # Type narrowing: if min is not None, max is also not None (enforced in config)
+ assert config.anomaly_range_max is not None
+ print(
+ f"Filtering mode: Range (exclude top {config.anomaly_range_min*100:.1f}%, keep up to {config.anomaly_range_max*100:.1f}%)"
+ )
+ else:
+ print(f"Filtering mode: Percentile (top {config.anomaly_percentile*100:.1f}%)")
+
+
def main() -> None:
"""Main entry point for the CLI."""
args = parse_args()
@@ -221,7 +252,7 @@ def main() -> None:
anomaly_range_min = args.anomaly_range[0]
anomaly_range_max = args.anomaly_range[1]
# Keep default percentile value (not used in range mode)
- if args.anomaly_percentile != 0.1:
+ if not isclose(args.anomaly_percentile, 0.1):
print(
"Warning: --anomaly-percentile is ignored when using --anomaly-range",
file=sys.stderr,
@@ -253,30 +284,8 @@ def main() -> None:
# create analyzer
print("Initializing analyzer...")
- print(f"Backend: {config.backend}")
- if config.backend == "sentence-transformers":
- print(f"Model: {config.model_name}")
- print(f"Device: {config.device or 'auto'}")
- elif config.backend == "llama-cpp":
- print(f"Model path: {config.model_path}")
- print(f"GPU layers: {config.n_gpu_layers}")
- if config.n_threads:
- print(f"Threads: {config.n_threads}")
- elif config.backend == "remote":
- print(f"Model: {config.model_name}")
- if config.endpoint:
- print(f"Endpoint: {config.endpoint}")
- print(f"Timeout: {config.request_timeout}s")
-
- # Display filtering mode
- if config.anomaly_range_min is not None:
- # Type narrowing: if min is not None, max is also not None (enforced in config)
- assert config.anomaly_range_max is not None
- print(
- f"Filtering mode: Range (exclude top {config.anomaly_range_min*100:.1f}%, keep up to {config.anomaly_range_max*100:.1f}%)"
- )
- else:
- print(f"Filtering mode: Percentile (top {config.anomaly_percentile*100:.1f}%)")
+ _print_backend_info(config)
+ _print_filtering_mode(config)
print()
try:
diff --git a/src/cordon/core/config.py b/src/cordon/core/config.py
index 492c266..11a3e63 100644
--- a/src/cordon/core/config.py
+++ b/src/cordon/core/config.py
@@ -30,14 +30,27 @@ class AnalysisConfig:
def __post_init__(self) -> None:
"""Validate configuration parameters."""
+ self._validate_core_params()
+ self._validate_anomaly_range()
+ self._validate_backend()
+
+ def _validate_core_params(self) -> None:
+ """Validate core analysis parameters."""
if self.window_size < 1:
raise ValueError("window_size must be >= 1")
if self.k_neighbors < 1:
raise ValueError("k_neighbors must be >= 1")
if not 0.0 <= self.anomaly_percentile <= 1.0:
raise ValueError("anomaly_percentile must be between 0.0 and 1.0")
+ if self.batch_size < 1:
+ raise ValueError("batch_size must be >= 1")
+ if self.scoring_batch_size is not None and self.scoring_batch_size < 1:
+ raise ValueError("scoring_batch_size must be >= 1 or None for auto-detect")
+ if self.device is not None and self.device not in ("cuda", "mps", "cpu"):
+ raise ValueError("device must be 'cuda', 'mps', 'cpu', or None")
- # Validate anomaly range parameters
+ def _validate_anomaly_range(self) -> None:
+ """Validate anomaly range parameters."""
if (self.anomaly_range_min is None) != (self.anomaly_range_max is None):
raise ValueError(
"anomaly_range_min and anomaly_range_max must both be set or both be None"
@@ -54,40 +67,30 @@ def __post_init__(self) -> None:
if self.anomaly_range_min >= self.anomaly_range_max:
raise ValueError("anomaly_range_min must be less than anomaly_range_max")
- if self.batch_size < 1:
- raise ValueError("batch_size must be >= 1")
- if self.scoring_batch_size is not None and self.scoring_batch_size < 1:
- raise ValueError("scoring_batch_size must be >= 1 or None for auto-detect")
- if self.device is not None and self.device not in ("cuda", "mps", "cpu"):
- raise ValueError("device must be 'cuda', 'mps', 'cpu', or None")
-
- # Backend validation
+ def _validate_backend(self) -> None:
+ """Validate backend and backend-specific parameters."""
if self.backend not in ("sentence-transformers", "llama-cpp", "remote"):
raise ValueError(
f"backend must be 'sentence-transformers', 'llama-cpp', or 'remote', got '{self.backend}'"
)
- # llama-cpp specific validation
if self.backend == "llama-cpp" and self.model_path is not None:
- # If model_path is provided, validate it exists and has correct extension
- # If None, LlamaCppVectorizer will auto-download default model
- model_file = Path(self.model_path)
- if not model_file.exists():
- raise ValueError(f"GGUF model file not found: {self.model_path}")
-
- if model_file.suffix != ".gguf":
- raise ValueError(f"model_path must be a .gguf file, got: {model_file.suffix}")
+ self._validate_llama_cpp_model_path()
- # llama.cpp parameter validation
if self.n_ctx < 1:
raise ValueError("n_ctx must be >= 1")
-
if self.n_gpu_layers < -1:
raise ValueError("n_gpu_layers must be >= -1 (-1 for all layers, 0 for CPU-only)")
-
if self.n_threads is not None and self.n_threads < 1:
raise ValueError("n_threads must be >= 1 or None for auto-detect")
-
- # remote backend validation
if self.request_timeout <= 0:
raise ValueError("request_timeout must be > 0")
+
+ def _validate_llama_cpp_model_path(self) -> None:
+ """Validate llama.cpp model path exists and has correct extension."""
+ assert self.model_path is not None
+ model_file = Path(self.model_path)
+ if not model_file.exists():
+ raise ValueError(f"GGUF model file not found: {self.model_path}")
+ if model_file.suffix != ".gguf":
+ raise ValueError(f"model_path must be a .gguf file, got: {model_file.suffix}")