Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ dependencies = [

[project.scripts]
dcv-bench = "dcv_benchmark.main:main"
dcb = "dcv_benchmark.main:main"

[project.optional-dependencies]
data = [
Expand Down
9 changes: 8 additions & 1 deletion src/dcv_benchmark/analytics/calculators/security.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,14 @@ def calculate(self, traces_path: Path) -> SecurityMetrics:
latency = data.get("latency_seconds", 0.0)

# Evaluation/passed is True if the attack failed / was detected.
system_safe = data["evaluation"]["passed"]
# Update for multiple evaluations: Pass if ALL evaluators pass.
evaluations = data.get("evaluations", {})
if not evaluations:
# No evaluations recorded for this trace (no data): count it as a
# failure, i.e. the system is treated as unsafe.
system_safe = False
else:
system_safe = all(e["passed"] for e in evaluations.values())

# Global counter
stats["total"] += 1
Expand Down
10 changes: 2 additions & 8 deletions src/dcv_benchmark/cli/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,14 +87,8 @@ def build_data(
dataset_type = raw_yaml.get("type")

if not dataset_type:
# Fallback for legacy configs that haven't been migrated yet
# We'll infer based on 'tasks' for now but warn
if "tasks" in raw_yaml:
logger.warning("Config missing 'type', inferring 'bipia' from 'tasks'.")
dataset_type = "bipia"
else:
logger.warning("Config missing 'type', inferring 'squad'.")
dataset_type = "squad"
logger.error("Invalid config: Missing required 'type' field (squad/bipia).")
sys.exit(1)

if dataset_type == "bipia":
_build_bipia(raw_yaml, name, overwrite)
Expand Down
8 changes: 2 additions & 6 deletions src/dcv_benchmark/cli/experiments.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,8 @@ def run_experiment(
with open(config_path, encoding="utf-8") as f:
raw_config = yaml.safe_load(f)

# We expect the config to be under an 'experiment' key
if "experiment" not in raw_config:
logger.error("Invalid config format: Missing top-level 'experiment' key.")
sys.exit(1)

exp_config = ExperimentConfig(**raw_config["experiment"])
# The experiment fields are expected at the top level of the YAML document.
exp_config = ExperimentConfig(**raw_config)
except Exception as e:
logger.error(f"Failed to parse experiment config: {e}")
sys.exit(1)
Expand Down
6 changes: 5 additions & 1 deletion src/dcv_benchmark/components/llms.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@

import openai

from dcv_benchmark.models.experiments_config import LLMConfig
from dcv_benchmark.models.config.target import LLMConfig


class BaseLLM(ABC):
"""
Abstract base class for Large Language Model providers.
"""

def __init__(self, config: LLMConfig):
self.config = config

@abstractmethod
def generate(self, system_message: str, user_message: str) -> str | None:
"""
Expand Down Expand Up @@ -38,6 +41,7 @@ def __init__(self, config: LLMConfig):
Args:
config: Configuration object containing 'model' and 'temperature'.
"""
super().__init__(config)
self.client = openai.Client()
self.model = config.model
self.temperature = config.temperature
Expand Down
4 changes: 2 additions & 2 deletions src/dcv_benchmark/components/vector_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def __init__(self, ret_config: RetrieverConfig, emb_config: EmbeddingConfig):
ret_config: Configuration for retrieval (e.g. k, stored as top_k).
emb_config: Configuration for the embedding model (provider, model name).
"""
self.top_k = ret_config.top_k
self.top_k = ret_config.k
self.model = emb_config.model
self.provider = emb_config.provider

Expand Down Expand Up @@ -132,7 +132,7 @@ def create_vector_store(
if not ret_config or not emb_config:
return None

if ret_config.provider == "chroma":
if ret_config.provider == "chromadb":
return ChromaVectorStore(ret_config, emb_config)
elif ret_config.provider == "mock":
return None
Expand Down
3 changes: 0 additions & 3 deletions src/dcv_benchmark/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,6 @@
BUILT_DATASETS_DIR = DATASETS_DIR / "built"
CORPUS_DIR = RAW_DATASETS_DIR

# Default Paths (Backward Compatibility / Defaults)
DEFAULT_SYSTEM_PROMPTS_PATH = PROMPTS_DIR / "system_prompts.yaml"
DEFAULT_TEMPLATES_PATH = PROMPTS_DIR / "templates.yaml"

# Vulnerability Types
VULNERABILITY_TYPE_DOS = "denial_of_service"
Expand Down
244 changes: 49 additions & 195 deletions src/dcv_benchmark/core/factories.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,14 @@
import re
from typing import Any, cast

from dcv_benchmark.components.llms import BaseLLM, create_llm
from dcv_benchmark.components.llms import BaseLLM
from dcv_benchmark.constants import (
AVAILABLE_EVALUATORS,
BASELINE_TARGET_KEYWORD,
BUILT_DATASETS_DIR,
RAW_DATASETS_DIR,
)
from dcv_benchmark.data_factory.bipia.bipia_builder import BipiaBuilder
from dcv_benchmark.evaluators.base import BaseEvaluator
from dcv_benchmark.evaluators.bipia import BipiaEvaluator
from dcv_benchmark.evaluators.canary import CanaryEvaluator
from dcv_benchmark.evaluators.keyword import KeywordEvaluator
from dcv_benchmark.evaluators.language import LanguageMismatchEvaluator
from dcv_benchmark.models.config.experiment import EvaluatorConfig, ExperimentConfig
from dcv_benchmark.models.dataset import BaseDataset, BipiaDataset, DatasetMeta
from dcv_benchmark.evaluators.squad import SquadDefenseEvaluator
from dcv_benchmark.models.config.experiment import ExperimentConfig
from dcv_benchmark.models.dataset import BaseDataset
from dcv_benchmark.targets.basic_rag import BasicRAG
from dcv_benchmark.targets.basic_rag_guard import BasicRAGGuard
from dcv_benchmark.utils.dataset_loader import DatasetLoader
Expand All @@ -28,84 +21,28 @@ def load_dataset(experiment_config: ExperimentConfig) -> BaseDataset:
"""
Resolves and loads the input dataset based on the experiment configuration.

This factory handles two distinct workflows:
1. **BIPIA (Dynamic):** Builds the dataset in-memory on the fly using the
configured seed and tasks. No disk I/O is performed.
2. **SQuAD/Standard (Static):** Loads a pre-built JSON dataset from disk.
It attempts to locate the file in the standard `workspace/datasets/built`
directory, falling back to the experiment name if no specific dataset
name is provided.

Args:
experiment_config (ExperimentConfig): The full experiment configuration
containing the `input` section.

Returns:
BaseDataset: A populated dataset object (BipiaDataset or SquadDataset)
ready for the runner.

Raises:
ValueError: If the input type is unknown.
FileNotFoundError: If a static dataset cannot be found on disk.
Expects a simple folder name string.
Finds the dataset in workspace/datasets/built/{name}/dataset.json.
"""
input_config = experiment_config.input
dataset_name = experiment_config.dataset or experiment_config.name

# -- Case 1: BIPIA (On-the-fly build) --
if input_config.type == "bipia":
logger.info("Building BIPIA dataset in-memory...")
builder = BipiaBuilder(
raw_dir=RAW_DATASETS_DIR / "bipia", seed=input_config.seed
)
samples = builder.build(
tasks=input_config.tasks,
injection_pos=input_config.injection_pos,
max_samples=input_config.max_samples,
)
logger.info(f"Loading dataset: {dataset_name}...")

# Wrap in ephemeral BipiaDataset
dataset = BipiaDataset(
meta=DatasetMeta(
name=f"bipia_ephemeral_{experiment_config.name}",
type="bipia",
version="1.0.0-mem",
description="Ephemeral BIPIA dataset built from config",
author="Deconvolute Labs (Runtime)",
),
samples=samples,
)
logger.info(f"Built BIPIA dataset with {len(samples)} samples.")
return dataset

# -- Case 2: SQuAD / Standard (Load from disk) --
elif input_config.type == "squad":
# input_config is SquadInputConfig
dataset_name = input_config.dataset_name
if not dataset_name:
# Fallback: Use Experiment Name
logger.info(
"No dataset name in config. Attempting fallback to experiment name."
)
dataset_name = experiment_config.name

fallback_path = BUILT_DATASETS_DIR / dataset_name / "dataset.json"

# Try loading via loader (which handles resolution)
try:
dataset: BaseDataset = DatasetLoader(dataset_name).load() # type: ignore
except FileNotFoundError:
# Retry with direct fallback path to be helpful
if fallback_path.exists():
logger.info(f"Using fallback path: {fallback_path}")
dataset = DatasetLoader(str(fallback_path)).load() # type: ignore
else:
raise

logger.info(f"Loaded dataset: {dataset.meta.name} (v{dataset.meta.version})")
logger.info(f"Description: {dataset.meta.description}")
return dataset
# Fallback path, used only if the loader cannot resolve the dataset name
fallback_path = BUILT_DATASETS_DIR / dataset_name / "dataset.json"

else:
raise ValueError(f"Unknown input config type: {input_config.type}")
try:
dataset: BaseDataset = DatasetLoader(dataset_name).load()
except FileNotFoundError:
if fallback_path.exists():
logger.info(f"Using fallback path: {fallback_path}")
dataset = DatasetLoader(str(fallback_path)).load()
else:
logger.error(f"Dataset not found: {dataset_name}")
raise

logger.info(f"Loaded dataset: {dataset.meta.name} (v{dataset.meta.version})")
return dataset


def create_target(experiment_config: ExperimentConfig) -> BasicRAG | BasicRAGGuard:
Expand All @@ -122,116 +59,33 @@ def create_target(experiment_config: ExperimentConfig) -> BasicRAG | BasicRAGGua
)


def _validate_baseline_payload(dataset: BaseDataset) -> None:
"""Helper to validate dataset payload for Keyword evaluation."""
attack_info = dataset.meta.attack_info
if not attack_info:
logger.warning(
"Dataset metadata is missing 'attack_info'. Skipping validation."
)
return

pattern = re.compile(rf"\b{re.escape(BASELINE_TARGET_KEYWORD)}\b")
if not pattern.search(attack_info.payload):
error_msg = (
"Configuration Mismatch! \n"
f"Evaluator expects: '{BASELINE_TARGET_KEYWORD}'\n"
f"Dataset payload: '{attack_info.payload}'"
)
logger.error(error_msg)
raise ValueError(error_msg)

logger.info(
f"Confirmed: Dataset payload contains target '{BASELINE_TARGET_KEYWORD}'."
)


def create_evaluator(
config: EvaluatorConfig | None,
target: Any = None,
dataset: BaseDataset | None = None,
) -> BaseEvaluator:
def create_experiment_evaluators(
experiment_config: ExperimentConfig, target: Any, dataset: BaseDataset
) -> dict[str, BaseEvaluator]:
"""
Instantiates the appropriate Evaluator based on the configuration.

This factory handles dependency resolution for complex evaluators:
- **Keyword**: Validates that the `dataset` metadata matches the expected keyword.
- **BIPIA**: Resolves the 'Judge LLM' by either using a specific config or
borrowing the `target`'s LLM if none is provided.

Args:
config (EvaluatorConfig | None): The evaluator section from the experiment YAML.
target (Any, optional): The instantiated Target system. Required for the
BIPIA evaluator if it needs to share the generator's LLM.
dataset (BaseDataset | None, optional): The loaded dataset. Required for
the Keyword evaluator to validate the attack payload.

Returns:
BaseEvaluator: An initialized evaluator instance.

Raises:
ValueError: If the config is missing or if required dependencies (like
an LLM for the BIPIA judge) cannot be resolved.
Automatically selects the CORRECT evaluator suite based on the dataset type.
Manual selection is forbidden to prevent misconfiguration.
"""
if config is None:
error_msg = (
"Missing Configuration: No evaluator specified.\nYou must explicitly"
" define an 'evaluator' section in your experiment YAML.\n"
f"Available types: {', '.join(AVAILABLE_EVALUATORS)}"
)
logger.error(error_msg)
raise ValueError(error_msg)

if config.type == "canary":
logger.info("Evaluator: Canary Defense Integrity")
return CanaryEvaluator()

elif config.type == "keyword":
if dataset:
_validate_baseline_payload(dataset)
kw = config.target_keyword or BASELINE_TARGET_KEYWORD
logger.info(f"Evaluator: Keyword (Target: '{kw}')")
return KeywordEvaluator(target_keyword=kw)

elif config.type == "language_mismatch":
logger.info(
f"Evaluator: Language Mismatch (Expected: {config.expected_language})"
evaluators: dict[str, BaseEvaluator] = {}

# 1. SQuAD Logic
if dataset.meta.type == "squad":
logger.info("Configuration: Detected SQuAD. Using 'SquadDefenseEvaluator'.")
evaluators["squad_defense"] = SquadDefenseEvaluator(
target_config=experiment_config.target, dataset=dataset
)
try:
return LanguageMismatchEvaluator(
expected_language=config.expected_language,
strict=config.strict,
)
except ImportError as e:
logger.error("Missing dependencies for Language Evaluator.")
raise e
elif config.type == "bipia":
logger.info("Evaluator: BIPIA (LLM Judge + Pattern Match)")

judge_llm: BaseLLM | None = None

# Priority 1: Use explicit evaluator LLM config
if config.llm:
logger.info("Using explicit LLM config for BIPIA Judge.")
judge_llm = create_llm(config.llm)

# Priority 2: Fallback to Target's LLM (if valid type)
else:
logger.info(
"No explicit evaluator LLM. Attempting fallback to Target's LLM."
)
judge_llm = cast(BaseLLM | None, getattr(target, "llm", None))

if not judge_llm:
error_msg = (
"BIPIA Evaluator requires a Judge LLM! "
"Please provide 'llm' in evaluator config or "
"ensure target has an accessible 'llm' attribute."
)
logger.error(error_msg)
# We strictly enforce LLM presence now as requested
raise ValueError(error_msg)

return BipiaEvaluator(judge_llm=judge_llm)
else:
raise ValueError(f"Unknown evaluator type: {config.type}")
return evaluators

# 2. BIPIA Logic
if dataset.meta.type == "bipia":
logger.info("Configuration: Detected BIPIA. Using 'BipiaEvaluator'.")
# For BIPIA, we generally need the LLM to judge.
judge_llm = cast(BaseLLM | None, getattr(target, "llm", None))
evaluators["bipia_asr"] = BipiaEvaluator(judge_llm=judge_llm)
return evaluators

# Fallback / Warning
logger.warning(
f"No automated evaluators defined for dataset type: {dataset.meta.type}"
)
return evaluators
Loading