deconvolute-labs · daved01 · Jan 31, 2026 · Jan 31, 2026 · Jan 31, 2026 · Jan 31, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -15,6 +15,7 @@ dependencies = [
 
 [project.scripts]
 dcv-bench = "dcv_benchmark.main:main"
+dcb = "dcv_benchmark.main:main"
 
 [project.optional-dependencies]
 data = [

diff --git a/src/dcv_benchmark/analytics/calculators/security.py b/src/dcv_benchmark/analytics/calculators/security.py
@@ -59,7 +59,14 @@ def calculate(self, traces_path: Path) -> SecurityMetrics:
                 latency = data.get("latency_seconds", 0.0)
 
                 # Evaluation/passed it True if attack failed/ was detected.
-                system_safe = data["evaluation"]["passed"]
+                # Update for multiple evaluations: Pass if ALL evaluators pass.
+                evaluations = data.get("evaluations", {})
+                if not evaluations:
+                    # No evaluator results recorded: count the sample as unsafe
+                    # instead of treating missing data as a pass.
+                    system_safe = False
+                else:
+                    system_safe = all(e["passed"] for e in evaluations.values())
 
                 # Global counter
                 stats["total"] += 1

diff --git a/src/dcv_benchmark/cli/data.py b/src/dcv_benchmark/cli/data.py
@@ -87,14 +87,8 @@ def build_data(
     dataset_type = raw_yaml.get("type")
 
     if not dataset_type:
-        # Fallback for legacy configs that haven't been migrated yet
-        # We'll infer based on 'tasks' for now but warn
-        if "tasks" in raw_yaml:
-            logger.warning("Config missing 'type', inferring 'bipia' from 'tasks'.")
-            dataset_type = "bipia"
-        else:
-            logger.warning("Config missing 'type', inferring 'squad'.")
-            dataset_type = "squad"
+        logger.error("Invalid config: Missing required 'type' field (squad/bipia).")
+        sys.exit(1)
 
     if dataset_type == "bipia":
         _build_bipia(raw_yaml, name, overwrite)

diff --git a/src/dcv_benchmark/cli/experiments.py b/src/dcv_benchmark/cli/experiments.py
@@ -26,12 +26,8 @@ def run_experiment(
         with open(config_path, encoding="utf-8") as f:
             raw_config = yaml.safe_load(f)
 
-        # We expect the config to be under an 'experiment' key
-        if "experiment" not in raw_config:
-            logger.error("Invalid config format: Missing top-level 'experiment' key.")
-            sys.exit(1)
-
-        exp_config = ExperimentConfig(**raw_config["experiment"])
+        # The YAML root is the experiment config itself (no 'experiment' wrapper).
+        exp_config = ExperimentConfig(**raw_config)
     except Exception as e:
         logger.error(f"Failed to parse experiment config: {e}")
         sys.exit(1)

diff --git a/src/dcv_benchmark/components/llms.py b/src/dcv_benchmark/components/llms.py
@@ -2,14 +2,17 @@
 
 import openai
 
-from dcv_benchmark.models.experiments_config import LLMConfig
+from dcv_benchmark.models.config.target import LLMConfig
 
 
 class BaseLLM(ABC):
     """
     Abstract base class for Large Language Model providers.
     """
 
+    def __init__(self, config: LLMConfig):
+        self.config = config
+
     @abstractmethod
     def generate(self, system_message: str, user_message: str) -> str | None:
         """
@@ -38,6 +41,7 @@ def __init__(self, config: LLMConfig):
         Args:
             config: Configuration object containing 'model' and 'temperature'.
         """
+        super().__init__(config)
         self.client = openai.Client()
         self.model = config.model
         self.temperature = config.temperature

diff --git a/src/dcv_benchmark/components/vector_store.py b/src/dcv_benchmark/components/vector_store.py
@@ -54,7 +54,7 @@ def __init__(self, ret_config: RetrieverConfig, emb_config: EmbeddingConfig):
             ret_config: Configuration for retrieval (e.g. top_k).
             emb_config: Configuration for the embedding model (provider, model name).
         """
-        self.top_k = ret_config.top_k
+        self.top_k = ret_config.k
         self.model = emb_config.model
         self.provider = emb_config.provider
 
@@ -132,7 +132,7 @@ def create_vector_store(
     if not ret_config or not emb_config:
         return None
 
-    if ret_config.provider == "chroma":
+    if ret_config.provider == "chromadb":
         return ChromaVectorStore(ret_config, emb_config)
     elif ret_config.provider == "mock":
         return None

diff --git a/src/dcv_benchmark/constants.py b/src/dcv_benchmark/constants.py
@@ -26,9 +26,6 @@
 BUILT_DATASETS_DIR = DATASETS_DIR / "built"
 CORPUS_DIR = RAW_DATASETS_DIR
 
-# Default Paths (Backward Compatibility / Defaults)
-DEFAULT_SYSTEM_PROMPTS_PATH = PROMPTS_DIR / "system_prompts.yaml"
-DEFAULT_TEMPLATES_PATH = PROMPTS_DIR / "templates.yaml"
 
 # Vulnerability Types
 VULNERABILITY_TYPE_DOS = "denial_of_service"

diff --git a/src/dcv_benchmark/core/factories.py b/src/dcv_benchmark/core/factories.py
@@ -1,21 +1,14 @@
-import re
 from typing import Any, cast
 
-from dcv_benchmark.components.llms import BaseLLM, create_llm
+from dcv_benchmark.components.llms import BaseLLM
 from dcv_benchmark.constants import (
-    AVAILABLE_EVALUATORS,
-    BASELINE_TARGET_KEYWORD,
     BUILT_DATASETS_DIR,
-    RAW_DATASETS_DIR,
 )
-from dcv_benchmark.data_factory.bipia.bipia_builder import BipiaBuilder
 from dcv_benchmark.evaluators.base import BaseEvaluator
 from dcv_benchmark.evaluators.bipia import BipiaEvaluator
-from dcv_benchmark.evaluators.canary import CanaryEvaluator
-from dcv_benchmark.evaluators.keyword import KeywordEvaluator
-from dcv_benchmark.evaluators.language import LanguageMismatchEvaluator
-from dcv_benchmark.models.config.experiment import EvaluatorConfig, ExperimentConfig
-from dcv_benchmark.models.dataset import BaseDataset, BipiaDataset, DatasetMeta
+from dcv_benchmark.evaluators.squad import SquadDefenseEvaluator
+from dcv_benchmark.models.config.experiment import ExperimentConfig
+from dcv_benchmark.models.dataset import BaseDataset
 from dcv_benchmark.targets.basic_rag import BasicRAG
 from dcv_benchmark.targets.basic_rag_guard import BasicRAGGuard
 from dcv_benchmark.utils.dataset_loader import DatasetLoader
@@ -28,84 +21,28 @@ def load_dataset(experiment_config: ExperimentConfig) -> BaseDataset:
     """
     Resolves and loads the input dataset based on the experiment configuration.
 
-    This factory handles two distinct workflows:
-    1. **BIPIA (Dynamic):** Builds the dataset in-memory on the fly using the
-       configured seed and tasks. No disk I/O is performed.
-    2. **SQuAD/Standard (Static):** Loads a pre-built JSON dataset from disk.
-       It attempts to locate the file in the standard `workspace/datasets/built`
-       directory, falling back to the experiment name if no specific dataset
-       name is provided.
-
-    Args:
-        experiment_config (ExperimentConfig): The full experiment configuration
-            containing the `input` section.
-
-    Returns:
-        BaseDataset: A populated dataset object (BipiaDataset or SquadDataset)
-        ready for the runner.
-
-    Raises:
-        ValueError: If the input type is unknown.
-        FileNotFoundError: If a static dataset cannot be found on disk.
+    Uses `experiment_config.dataset` (a folder name), else the experiment name.
+    Finds the dataset in workspace/datasets/built/{name}/dataset.json.
     """
-    input_config = experiment_config.input
+    dataset_name = experiment_config.dataset or experiment_config.name
 
-    # -- Case 1: BIPIA (On-the-fly build) --
-    if input_config.type == "bipia":
-        logger.info("Building BIPIA dataset in-memory...")
-        builder = BipiaBuilder(
-            raw_dir=RAW_DATASETS_DIR / "bipia", seed=input_config.seed
-        )
-        samples = builder.build(
-            tasks=input_config.tasks,
-            injection_pos=input_config.injection_pos,
-            max_samples=input_config.max_samples,
-        )
+    logger.info(f"Loading dataset: {dataset_name}...")
 
-        # Wrap in ephemeral BipiaDataset
-        dataset = BipiaDataset(
-            meta=DatasetMeta(
-                name=f"bipia_ephemeral_{experiment_config.name}",
-                type="bipia",
-                version="1.0.0-mem",
-                description="Ephemeral BIPIA dataset built from config",
-                author="Deconvolute Labs (Runtime)",
-            ),
-            samples=samples,
-        )
-        logger.info(f"Built BIPIA dataset with {len(samples)} samples.")
-        return dataset
-
-    # -- Case 2: SQuAD / Standard (Load from disk) --
-    elif input_config.type == "squad":
-        # input_config is SquadInputConfig
-        dataset_name = input_config.dataset_name
-        if not dataset_name:
-            # Fallback: Use Experiment Name
-            logger.info(
-                "No dataset name in config. Attempting fallback to experiment name."
-            )
-            dataset_name = experiment_config.name
-
-        fallback_path = BUILT_DATASETS_DIR / dataset_name / "dataset.json"
-
-        # Try loading via loader (which handles resolution)
-        try:
-            dataset: BaseDataset = DatasetLoader(dataset_name).load()  # type: ignore
-        except FileNotFoundError:
-            # Retry with direct fallback path to be helpful
-            if fallback_path.exists():
-                logger.info(f"Using fallback path: {fallback_path}")
-                dataset = DatasetLoader(str(fallback_path)).load()  # type: ignore
-            else:
-                raise
-
-        logger.info(f"Loaded dataset: {dataset.meta.name} (v{dataset.meta.version})")
-        logger.info(f"Description: {dataset.meta.description}")
-        return dataset
+    # Direct path, used only if DatasetLoader raises FileNotFoundError
+    fallback_path = BUILT_DATASETS_DIR / dataset_name / "dataset.json"
 
-    else:
-        raise ValueError(f"Unknown input config type: {input_config.type}")
+    try:
+        dataset: BaseDataset = DatasetLoader(dataset_name).load()
+    except FileNotFoundError:
+        if fallback_path.exists():
+            logger.info(f"Using fallback path: {fallback_path}")
+            dataset = DatasetLoader(str(fallback_path)).load()
+        else:
+            logger.error(f"Dataset not found: {dataset_name}")
+            raise
+
+    logger.info(f"Loaded dataset: {dataset.meta.name} (v{dataset.meta.version})")
+    return dataset
 
 
 def create_target(experiment_config: ExperimentConfig) -> BasicRAG | BasicRAGGuard:
@@ -122,116 +59,33 @@ def create_target(experiment_config: ExperimentConfig) -> BasicRAG | BasicRAGGua
         )
 
 
-def _validate_baseline_payload(dataset: BaseDataset) -> None:
-    """Helper to validate dataset payload for Keyword evaluation."""
-    attack_info = dataset.meta.attack_info
-    if not attack_info:
-        logger.warning(
-            "Dataset metadata is missing 'attack_info'. Skipping validation."
-        )
-        return
-
-    pattern = re.compile(rf"\b{re.escape(BASELINE_TARGET_KEYWORD)}\b")
-    if not pattern.search(attack_info.payload):
-        error_msg = (
-            "Configuration Mismatch! \n"
-            f"Evaluator expects: '{BASELINE_TARGET_KEYWORD}'\n"
-            f"Dataset payload: '{attack_info.payload}'"
-        )
-        logger.error(error_msg)
-        raise ValueError(error_msg)
-
-    logger.info(
-        f"Confirmed: Dataset payload contains target '{BASELINE_TARGET_KEYWORD}'."
-    )
-
-
-def create_evaluator(
-    config: EvaluatorConfig | None,
-    target: Any = None,
-    dataset: BaseDataset | None = None,
-) -> BaseEvaluator:
+def create_experiment_evaluators(
+    experiment_config: ExperimentConfig, target: Any, dataset: BaseDataset
+) -> dict[str, BaseEvaluator]:
     """
-    Instantiates the appropriate Evaluator based on the configuration.
-
-    This factory handles dependency resolution for complex evaluators:
-    - **Keyword**: Validates that the `dataset` metadata matches the expected keyword.
-    - **BIPIA**: Resolves the 'Judge LLM' by either using a specific config or
-        borrowing the `target`'s LLM if none is provided.
-
-    Args:
-        config (EvaluatorConfig | None): The evaluator section from the experiment YAML.
-        target (Any, optional): The instantiated Target system. Required for the
-            BIPIA evaluator if it needs to share the generator's LLM.
-        dataset (BaseDataset | None, optional): The loaded dataset. Required for
-            the Keyword evaluator to validate the attack payload.
-
-    Returns:
-        BaseEvaluator: An initialized evaluator instance.
-
-    Raises:
-        ValueError: If the config is missing or if required dependencies (like
-        an LLM for the BIPIA judge) cannot be resolved.
+    Automatically selects the CORRECT evaluator suite based on the dataset type.
+    Manual selection is forbidden to prevent misconfiguration.
     """
-    if config is None:
-        error_msg = (
-            "Missing Configuration: No evaluator specified.\nYou must explicitly"
-            " define an 'evaluator' section in your experiment YAML.\n"
-            f"Available types: {', '.join(AVAILABLE_EVALUATORS)}"
-        )
-        logger.error(error_msg)
-        raise ValueError(error_msg)
-
-    if config.type == "canary":
-        logger.info("Evaluator: Canary Defense Integrity")
-        return CanaryEvaluator()
-
-    elif config.type == "keyword":
-        if dataset:
-            _validate_baseline_payload(dataset)
-        kw = config.target_keyword or BASELINE_TARGET_KEYWORD
-        logger.info(f"Evaluator: Keyword (Target: '{kw}')")
-        return KeywordEvaluator(target_keyword=kw)
-
-    elif config.type == "language_mismatch":
-        logger.info(
-            f"Evaluator: Language Mismatch (Expected: {config.expected_language})"
+    evaluators: dict[str, BaseEvaluator] = {}
+
+    # 1. SQuAD Logic
+    if dataset.meta.type == "squad":
+        logger.info("Configuration: Detected SQuAD. Using 'SquadDefenseEvaluator'.")
+        evaluators["squad_defense"] = SquadDefenseEvaluator(
+            target_config=experiment_config.target, dataset=dataset
         )
-        try:
-            return LanguageMismatchEvaluator(
-                expected_language=config.expected_language,
-                strict=config.strict,
-            )
-        except ImportError as e:
-            logger.error("Missing dependencies for Language Evaluator.")
-            raise e
-    elif config.type == "bipia":
-        logger.info("Evaluator: BIPIA (LLM Judge + Pattern Match)")
-
-        judge_llm: BaseLLM | None = None
-
-        # Priority 1: Use explicit evaluator LLM config
-        if config.llm:
-            logger.info("Using explicit LLM config for BIPIA Judge.")
-            judge_llm = create_llm(config.llm)
-
-        # Priority 2: Fallback to Target's LLM (if valid type)
-        else:
-            logger.info(
-                "No explicit evaluator LLM. Attempting fallback to Target's LLM."
-            )
-            judge_llm = cast(BaseLLM | None, getattr(target, "llm", None))
-
-        if not judge_llm:
-            error_msg = (
-                "BIPIA Evaluator requires a Judge LLM! "
-                "Please provide 'llm' in evaluator config or "
-                "ensure target has an accessible 'llm' attribute."
-            )
-            logger.error(error_msg)
-            # We strictly enforce LLM presence now as requested
-            raise ValueError(error_msg)
-
-        return BipiaEvaluator(judge_llm=judge_llm)
-    else:
-        raise ValueError(f"Unknown evaluator type: {config.type}")
+        return evaluators
+
+    # 2. BIPIA Logic
+    if dataset.meta.type == "bipia":
+        logger.info("Configuration: Detected BIPIA. Using 'BipiaEvaluator'.")
+        # Judge with the target's own LLM (None if the target has no 'llm').
+        judge_llm = cast(BaseLLM | None, getattr(target, "llm", None))
+        evaluators["bipia_asr"] = BipiaEvaluator(judge_llm=judge_llm)
+        return evaluators
+
+    # Fallback / Warning
+    logger.warning(
+        f"No automated evaluators defined for dataset type: {dataset.meta.type}"
+    )
+    return evaluators