Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/dcv_benchmark/cli/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@
import yaml

from dcv_benchmark.constants import BUILT_DATASETS_DIR, RAW_DATASETS_DIR
from dcv_benchmark.data_factory.bipia.bipia import BipiaBuilder
from dcv_benchmark.data_factory.bipia.bipia_builder import BipiaBuilder
from dcv_benchmark.data_factory.downloader import download_bipia, download_squad
from dcv_benchmark.data_factory.injector import AttackInjector
from dcv_benchmark.data_factory.loaders import SquadLoader
from dcv_benchmark.data_factory.squad.injector import AttackInjector
from dcv_benchmark.data_factory.squad.squad_builder import SquadBuilder
from dcv_benchmark.models.bipia_config import BipiaConfig
from dcv_benchmark.models.config.bipia import BipiaConfig
from dcv_benchmark.models.data_factory import DataFactoryConfig
from dcv_benchmark.models.dataset import (
BipiaDataset,
Expand Down
152 changes: 128 additions & 24 deletions src/dcv_benchmark/core/factories.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
import re
from typing import Any
from typing import Any, cast

from dcv_benchmark.components.llms import BaseLLM, create_llm
from dcv_benchmark.constants import (
AVAILABLE_EVALUATORS,
BASELINE_TARGET_KEYWORD,
BUILT_DATASETS_DIR,
RAW_DATASETS_DIR,
)
from dcv_benchmark.data_factory.bipia.bipia_builder import BipiaBuilder
from dcv_benchmark.evaluators.base import BaseEvaluator
from dcv_benchmark.evaluators.bipia import BipiaEvaluator
from dcv_benchmark.evaluators.canary import CanaryEvaluator
from dcv_benchmark.evaluators.keyword import KeywordEvaluator
from dcv_benchmark.evaluators.language import LanguageMismatchEvaluator
from dcv_benchmark.models.config.experiment import EvaluatorConfig, ExperimentConfig
from dcv_benchmark.models.dataset import BaseDataset
from dcv_benchmark.models.dataset import BaseDataset, BipiaDataset, DatasetMeta
from dcv_benchmark.targets.basic_rag import BasicRAG
from dcv_benchmark.targets.basic_rag_guard import BasicRAGGuard
from dcv_benchmark.utils.dataset_loader import DatasetLoader
Expand All @@ -22,27 +25,87 @@


def load_dataset(experiment_config: ExperimentConfig) -> BaseDataset:
"""Loads dataset based on config or default path."""
dataset_path_or_name = experiment_config.input.dataset_name
if not dataset_path_or_name:
fallback_path = BUILT_DATASETS_DIR / experiment_config.name / "dataset.json"
if not fallback_path.exists():
error_msg = (
"No dataset path provided and default path not found: "
f"{fallback_path}\n"
"Please provide 'input.dataset_name' in config or ensure the "
"default dataset exists."
"""
Resolves and loads the input dataset based on the experiment configuration.

This factory handles two distinct workflows:
1. **BIPIA (Dynamic):** Builds the dataset in-memory on the fly using the
configured seed and tasks. The built dataset is returned directly and is
not written to disk by this function.
2. **SQuAD/Standard (Static):** Loads a pre-built JSON dataset from disk.
It attempts to locate the file in the standard `workspace/datasets/built`
directory, falling back to the experiment name if no specific dataset
name is provided.

Args:
experiment_config (ExperimentConfig): The full experiment configuration
containing the `input` section.

Returns:
BaseDataset: A populated dataset object (BipiaDataset or SquadDataset)
ready for the runner.

Raises:
ValueError: If the input type is unknown.
FileNotFoundError: If a static dataset cannot be found on disk.
"""
input_config = experiment_config.input

# -- Case 1: BIPIA (On-the-fly build) --
if input_config.type == "bipia":
logger.info("Building BIPIA dataset in-memory...")
builder = BipiaBuilder(
raw_dir=RAW_DATASETS_DIR / "bipia", seed=input_config.seed
)
samples = builder.build(
tasks=input_config.tasks,
injection_pos=input_config.injection_pos,
max_samples=input_config.max_samples,
)

# Wrap in ephemeral BipiaDataset
dataset = BipiaDataset(
meta=DatasetMeta(
name=f"bipia_ephemeral_{experiment_config.name}",
type="bipia",
version="1.0.0-mem",
description="Ephemeral BIPIA dataset built from config",
author="Deconvolute Labs (Runtime)",
),
samples=samples,
)
logger.info(f"Built BIPIA dataset with {len(samples)} samples.")
return dataset

# -- Case 2: SQuAD / Standard (Load from disk) --
elif input_config.type == "squad":
# input_config is SquadInputConfig
dataset_name = input_config.dataset_name
if not dataset_name:
# Fallback: Use Experiment Name
logger.info(
"No dataset name in config. Attempting fallback to experiment name."
)
logger.error(error_msg)
raise ValueError(error_msg)
dataset_name = experiment_config.name

logger.info(f"No dataset provided. Using default path: {fallback_path}")
dataset_path_or_name = str(fallback_path)
fallback_path = BUILT_DATASETS_DIR / dataset_name / "dataset.json"

dataset: BaseDataset = DatasetLoader(dataset_path_or_name).load()
logger.info(f"Loaded dataset: {dataset.meta.name} (v{dataset.meta.version})")
logger.info(f"Description: {dataset.meta.description}")
return dataset
# Try loading via loader (which handles resolution)
try:
dataset: BaseDataset = DatasetLoader(dataset_name).load() # type: ignore
except FileNotFoundError:
# Retry with direct fallback path to be helpful
if fallback_path.exists():
logger.info(f"Using fallback path: {fallback_path}")
dataset = DatasetLoader(str(fallback_path)).load() # type: ignore
else:
raise

logger.info(f"Loaded dataset: {dataset.meta.name} (v{dataset.meta.version})")
logger.info(f"Description: {dataset.meta.description}")
return dataset

else:
raise ValueError(f"Unknown input config type: {input_config.type}")


def create_target(experiment_config: ExperimentConfig) -> BasicRAG | BasicRAGGuard:
Expand Down Expand Up @@ -88,7 +151,28 @@ def create_evaluator(
target: Any = None,
dataset: BaseDataset | None = None,
) -> BaseEvaluator:
"""Creates the evaluator instance."""
"""
Instantiates the appropriate Evaluator based on the configuration.

This factory handles dependency resolution for complex evaluators:
- **Keyword**: Validates that the `dataset` metadata matches the expected keyword.
- **BIPIA**: Resolves the 'Judge LLM' by either using a specific config or
borrowing the `target`'s LLM if none is provided.

Args:
config (EvaluatorConfig | None): The evaluator section from the experiment YAML.
target (Any, optional): The instantiated Target system. Required for the
BIPIA evaluator if it needs to share the generator's LLM.
dataset (BaseDataset | None, optional): The loaded dataset. Required for
the Keyword evaluator to validate the attack payload.

Returns:
BaseEvaluator: An initialized evaluator instance.

Raises:
ValueError: If the config is missing or if required dependencies (like
an LLM for the BIPIA judge) cannot be resolved.
"""
if config is None:
error_msg = (
"Missing Configuration: No evaluator specified.\nYou must explicitly"
Expand Down Expand Up @@ -123,11 +207,31 @@ def create_evaluator(
raise e
elif config.type == "bipia":
logger.info("Evaluator: BIPIA (LLM Judge + Pattern Match)")
judge_llm = getattr(target, "llm", None)

judge_llm: BaseLLM | None = None

# Priority 1: Use explicit evaluator LLM config
if config.llm:
logger.info("Using explicit LLM config for BIPIA Judge.")
judge_llm = create_llm(config.llm)

# Priority 2: Fall back to the Target's LLM (if it exposes an `llm` attribute)
else:
logger.info(
"No explicit evaluator LLM. Attempting fallback to Target's LLM."
)
judge_llm = cast(BaseLLM | None, getattr(target, "llm", None))

if not judge_llm:
logger.warning(
"BIPIA Evaluator initialized without an LLM! Text tasks will fail."
error_msg = (
"BIPIA Evaluator requires a Judge LLM! "
"Please provide 'llm' in evaluator config or "
"ensure target has an accessible 'llm' attribute."
)
logger.error(error_msg)
# A Judge LLM is strictly required: fail fast instead of only logging a warning
raise ValueError(error_msg)

return BipiaEvaluator(judge_llm=judge_llm)
else:
raise ValueError(f"Unknown evaluator type: {config.type}")
33 changes: 30 additions & 3 deletions src/dcv_benchmark/core/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,12 @@
)
from dcv_benchmark.models.responses import TargetResponse
from dcv_benchmark.models.traces import TraceItem
from dcv_benchmark.utils.logger import get_logger, print_run_summary
from dcv_benchmark.utils.logger import (
get_logger,
print_dataset_header,
print_experiment_header,
print_run_summary,
)

logger = get_logger(__name__)

Expand All @@ -26,18 +31,40 @@ def run(
debug_traces: bool = False,
) -> Path:
"""
Executes the experiment loop.
Returns the path to the run directory.
Executes the full experiment loop for a given configuration.

Orchestrates the loading of the dataset, initialization of the target system
(including defenses), and the evaluation of every sample. It records detailed
execution traces to JSONL and generates a final summary report.

Args:
experiment_config (ExperimentConfig): The complete configuration object
defining the input dataset, target system, and evaluator settings.
limit (int | None, optional): If provided, stops the experiment after
processing this many samples. Useful for "smoke testing" a config.
Defaults to None (process all samples).
debug_traces (bool, optional): If True, includes full user queries and
raw response content in the `traces.jsonl` output. If False, sensitive
content is redacted to save space and reduce noise. Defaults to False.

Returns:
Path: Directory path where the run artifacts (results.json, traces, plots)
have been saved.

Raises:
ValueError: If the dataset fails to load or the target cannot be initialized.
"""
start_time = datetime.datetime.now()
run_id = start_time.strftime(TIMESTAMP_FORMAT)
run_dir = self.output_dir / f"run_{run_id}"

print_experiment_header(experiment_config.model_dump())
logger.info(f"Starting Run: {run_id}")
logger.info("Initializing components ...")

# 1. Load Dataset
dataset = load_dataset(experiment_config)
print_dataset_header(experiment_config.input.model_dump())

# 2. Create Target
target = create_target(experiment_config)
Expand Down
20 changes: 16 additions & 4 deletions src/dcv_benchmark/data_factory/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,22 @@ class BaseDatasetBuilder(ABC):

@abstractmethod
def build(self, **kwargs: Any) -> Any:
# TODO: The return type should ideally be `Dataset` but we need to
# TODO: resolve circular imports
# or use ForwardRef / 'Dataset'. For now `Any` is permissive.
"""
Builds and returns the dataset.
Constructs a complete dataset based on the builder's configuration.

Implementations should handle loading raw data, applying injection strategies
(if applicable), and formatting the result into a standardized Dataset object.

Args:
**kwargs (Any): Dynamic arguments specific to the builder implementation.
For example, BIPIA might accept `tasks` and `injection_pos`, while
SQuAD might accept `attack_rate`.

Returns:
Any: The constructed dataset object (typically a subclass of `BaseDataset`).

Note: The return type is currently `Any` to avoid circular import issues
with `dcv_benchmark.models.dataset`, but implementations should return
a valid Dataset instance.
"""
pass
Loading