From fda8d39516590507110d260c43b515e4f7eac27f Mon Sep 17 00:00:00 2001
From: Tom Tseng
Date: Fri, 6 Feb 2026 20:46:19 -0800
Subject: [PATCH] Remove unnecessary pyright ignores

---
 .../compute_mmlu_accuracy.py                  |   2 +-
 .../eval_mmlu_pro_zeroshot.py                 |   2 +-
 src/tamperbench/whitebox/attacks/base.py      |  10 +-
 .../embedding_attack/embedding_attack.py      |   6 +-
 .../whitebox/evals/ifeval/ifeval.py           |   8 +-
 src/tamperbench/whitebox/evals/mbpp/mbpp.py   |   4 +-
 .../evals/minerva_math/minerva_math.py        |   4 +-
 .../whitebox/evals/mmlu_pro/mmlu_pro.py       |   4 +-
 .../whitebox/utils/analysis/builders.py       |   6 +-
 .../whitebox/utils/analysis/config.py         | 108 +++++++++---------
 .../whitebox/utils/analysis/filters.py        |   8 +-
 src/tamperbench/whitebox/utils/analysis/io.py |   2 +-
 .../whitebox/utils/analysis/metrics.py        |  12 +-
 .../whitebox/utils/analysis/pipeline.py       |  18 ++-
 .../whitebox/utils/analysis/validation.py     |   6 +-
 .../whitebox/utils/benchmark/config.py        |   6 +-
 .../whitebox/utils/benchmark/io.py            |   8 +-
 .../whitebox/utils/benchmark/sweep.py         |   8 +-
 .../whitebox/utils/benchmark/trial_manager.py |  13 +--
 .../whitebox/utils/models/config.py           |   4 +-
 20 files changed, 117 insertions(+), 122 deletions(-)

diff --git a/scripts/user/tomtseng/mmlu_pro_tar_251203/compute_mmlu_accuracy.py b/scripts/user/tomtseng/mmlu_pro_tar_251203/compute_mmlu_accuracy.py
index 35eb6548..5bd0d011 100644
--- a/scripts/user/tomtseng/mmlu_pro_tar_251203/compute_mmlu_accuracy.py
+++ b/scripts/user/tomtseng/mmlu_pro_tar_251203/compute_mmlu_accuracy.py
@@ -61,7 +61,7 @@ def build_ground_truth() -> list[dict[str, Any]]:
     """Load and preprocess MMLU-Pro test split, same as tamperbench."""
     ds = datasets.load_dataset(DATASET_NAME, split="test")
     # HuggingFace datasets library doesn't properly type Dataset iteration
-    rows = [dict(item) for item in ds]  # pyright: ignore[reportCallIssue, reportArgumentType]
+    rows = [dict(item) for item in ds]
     rows = preprocess(rows)
     return limit_examples_per_subject(rows, VAL_MAX_EXAMPLES_PER_SUBJECT)

diff --git a/scripts/user/tomtseng/mmlu_pro_tar_251203/eval_mmlu_pro_zeroshot.py b/scripts/user/tomtseng/mmlu_pro_tar_251203/eval_mmlu_pro_zeroshot.py
index 4d795f20..02af959b 100644
--- a/scripts/user/tomtseng/mmlu_pro_tar_251203/eval_mmlu_pro_zeroshot.py
+++ b/scripts/user/tomtseng/mmlu_pro_tar_251203/eval_mmlu_pro_zeroshot.py
@@ -50,7 +50,7 @@ def build_ground_truth() -> list[dict[str, Any]]:
     """Load and preprocess MMLU-Pro test split, returning the same 140 questions."""
     ds = datasets.load_dataset(DATASET_NAME, split="test")
     # HuggingFace datasets library doesn't properly type Dataset iteration
-    rows = [dict(item) for item in ds]  # pyright: ignore[reportCallIssue, reportArgumentType]
+    rows = [dict(item) for item in ds]
     rows = preprocess(rows)
     return limit_examples_per_subject(rows, VAL_MAX_EXAMPLES_PER_SUBJECT)

diff --git a/src/tamperbench/whitebox/attacks/base.py b/src/tamperbench/whitebox/attacks/base.py
index 61318027..76043d72 100644
--- a/src/tamperbench/whitebox/attacks/base.py
+++ b/src/tamperbench/whitebox/attacks/base.py
@@ -55,7 +55,7 @@ class TamperAttackConfig:
     random_seed: int

     @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> Self:  # pyright: ignore[reportExplicitAny]
+    def from_dict(cls, data: dict[str, Any]) -> Self:
         """All subclasses must implement a `from_dict` constructor.

         Args:
@@ -70,7 +70,7 @@ def from_dict(cls, data: dict[str, Any]) -> Self:  # pyright: ignore[reportExpli
         return cls(**data)

     @classmethod
-    def prepare_dict_for_init(cls, data: dict[str, Any]) -> dict[str, Any]:  # pyright: ignore[reportExplicitAny]
+    def prepare_dict_for_init(cls, data: dict[str, Any]) -> dict[str, Any]:
         """Transform dictionary data into proper types for dataclass initialization.

         Converts nested dictionaries into their corresponding dataclass instances
@@ -82,14 +82,14 @@ def prepare_dict_for_init(cls, data: dict[str, Any]) -> dict[str, Any]:  # pyrig
         Returns:
             dict[str, Any]: Dictionary with nested objects properly instantiated.
         """
-        model_config_dict = data.pop("model_config")  # pyright: ignore[reportAny]
-        model_config = ModelConfig.from_dict(model_config_dict)  # pyright: ignore[reportAny]
+        model_config_dict = data.pop("model_config")
+        model_config = ModelConfig.from_dict(model_config_dict)

         data.update({"model_config": model_config})
         return data

     @classmethod
-    def _validate_dict_keys(cls, data: dict[str, Any]) -> None:  # pyright: ignore[reportExplicitAny]
+    def _validate_dict_keys(cls, data: dict[str, Any]) -> None:
         """Check that the dictionary to be used to construct dataclass has correct fields.

         Args:
diff --git a/src/tamperbench/whitebox/attacks/embedding_attack/embedding_attack.py b/src/tamperbench/whitebox/attacks/embedding_attack/embedding_attack.py
index 305be780..d7ae1e98 100644
--- a/src/tamperbench/whitebox/attacks/embedding_attack/embedding_attack.py
+++ b/src/tamperbench/whitebox/attacks/embedding_attack/embedding_attack.py
@@ -31,7 +31,7 @@ class EmbeddingAttackConfig(TamperAttackConfig):
     soft_opt_config: SoftOptConfig

     @classmethod
-    def prepare_dict_for_init(cls, data: dict[str, Any]) -> dict[str, Any]:  # pyright: ignore[reportExplicitAny]
+    def prepare_dict_for_init(cls, data: dict[str, Any]) -> dict[str, Any]:
         """Transform dictionary data into proper types for dataclass initialization.

         Args:
@@ -42,11 +42,11 @@ def prepare_dict_for_init(cls, data: dict[str, Any]) -> dict[str, Any]:  # pyrig
         """
         data = super().prepare_dict_for_init(data)

-        soft_opt_config_dict = data.pop("soft_opt_config")  # pyright: ignore[reportAny]
+        soft_opt_config_dict = data.pop("soft_opt_config")
         # Pass random_seed from attack config to SoftOptConfig
         soft_opt_config_dict["seed"] = data["random_seed"]
-        soft_opt_config = SoftOptConfig(**soft_opt_config_dict)  # pyright: ignore[reportAny]
+        soft_opt_config = SoftOptConfig(**soft_opt_config_dict)

         data.update({"soft_opt_config": soft_opt_config})
         return data

diff --git a/src/tamperbench/whitebox/evals/ifeval/ifeval.py b/src/tamperbench/whitebox/evals/ifeval/ifeval.py
index 18e26da2..22e686b9 100644
--- a/src/tamperbench/whitebox/evals/ifeval/ifeval.py
+++ b/src/tamperbench/whitebox/evals/ifeval/ifeval.py
@@ -90,7 +90,7 @@ def compute_inferences(self) -> DataFrame[InferenceSchema]:
             models like Qwen, Llama, etc.
         """
         dataset = datasets.load_dataset(DATASET_NAME, split=TEST_SPLIT)
-        test_docs = [dict(item) for item in dataset]  # pyright: ignore[reportArgumentType]
+        test_docs = [dict(item) for item in dataset]

         if self.eval_config.max_samples is not None:
             test_docs = test_docs[: self.eval_config.max_samples]
@@ -114,7 +114,7 @@ def compute_inferences(self) -> DataFrame[InferenceSchema]:
     def compute_scores(self, inferences: DataFrame[InferenceSchema]) -> DataFrame[ScoreSchema]:
         """Apply instruction-following checks (utils.process_results)."""
         dataset = datasets.load_dataset(DATASET_NAME, split=TEST_SPLIT)
-        test_docs = [dict(item) for item in dataset][: len(inferences)]  # pyright: ignore[reportArgumentType]
+        test_docs = [dict(item) for item in dataset][: len(inferences)]

         self._prompt_level_strict = []
         self._prompt_level_loose = []
@@ -132,8 +132,8 @@ def compute_scores(self, inferences: DataFrame[InferenceSchema]) -> DataFrame[Sc
             metrics = utils.process_results(doc, response)
             strict_prompt = float(metrics["prompt_level_strict_acc"])
             loose_prompt = float(metrics["prompt_level_loose_acc"])
-            inst_strict = list(metrics["inst_level_strict_acc"])  # pyright: ignore[reportArgumentType]
-            inst_loose = list(metrics["inst_level_loose_acc"])  # pyright: ignore[reportArgumentType]
+            inst_strict = list(metrics["inst_level_strict_acc"])
+            inst_loose = list(metrics["inst_level_loose_acc"])

             self._prompt_level_strict.append(strict_prompt)
             self._prompt_level_loose.append(loose_prompt)
diff --git a/src/tamperbench/whitebox/evals/mbpp/mbpp.py b/src/tamperbench/whitebox/evals/mbpp/mbpp.py
index 58da911b..07f236ab 100644
--- a/src/tamperbench/whitebox/evals/mbpp/mbpp.py
+++ b/src/tamperbench/whitebox/evals/mbpp/mbpp.py
@@ -225,7 +225,7 @@ class MBPPEvaluation(WhiteBoxEvaluation[MBPPEvaluationConfig]):
     def compute_inferences(self) -> DataFrame[InferenceSchema]:
         """Generate code completions for MBPP problems."""
         dataset = datasets.load_dataset(DATASET_PATH, split=TEST_SPLIT)
-        test_docs = [dict(item) for item in dataset]  # pyright: ignore[reportArgumentType]
+        test_docs = [dict(item) for item in dataset]

         # Validate dataset size
         assert len(test_docs) == 500, (
@@ -261,7 +261,7 @@ def _format_with_chat_template(self, prompt: str) -> str:
     def compute_scores(self, inferences: DataFrame[InferenceSchema]) -> DataFrame[ScoreSchema]:
         """Execute generated code and compute pass/fail scores."""
         dataset = datasets.load_dataset(DATASET_PATH, split=TEST_SPLIT)
-        test_docs = [dict(item) for item in dataset][: len(inferences)]  # pyright: ignore[reportArgumentType]
+        test_docs = [dict(item) for item in dataset][: len(inferences)]

         # Get prompts and references
         prompts = [get_prompt(doc) for doc in test_docs]
diff --git a/src/tamperbench/whitebox/evals/minerva_math/minerva_math.py b/src/tamperbench/whitebox/evals/minerva_math/minerva_math.py
index 066abb8c..a67d9a8d 100644
--- a/src/tamperbench/whitebox/evals/minerva_math/minerva_math.py
+++ b/src/tamperbench/whitebox/evals/minerva_math/minerva_math.py
@@ -83,7 +83,7 @@ def compute_inferences(self) -> DataFrame[InferenceSchema]:
         for subset in DATASET_SUBSETS:
             dataset = datasets.load_dataset(DATASET_NAME, subset, split=TEST_SPLIT)
             processed = utils.process_docs(dataset)
-            all_test_docs.extend([dict(item) for item in processed])  # pyright: ignore[reportArgumentType]
+            all_test_docs.extend([dict(item) for item in processed])

         # Get few-shot examples
         fewshot_examples = utils.list_fewshot_samples()
@@ -111,7 +111,7 @@ def compute_scores(self, inferences: DataFrame[InferenceSchema]) -> DataFrame[Sc
         for subset in DATASET_SUBSETS:
             dataset = datasets.load_dataset(DATASET_NAME, subset, split=TEST_SPLIT)
             processed = utils.process_docs(dataset)
-            all_test_docs.extend([dict(item) for item in processed])  # pyright: ignore[reportArgumentType]
+            all_test_docs.extend([dict(item) for item in processed])

         # Extract and normalize answers from responses
         responses_list = list(inferences[InferenceSchema.response])
diff --git a/src/tamperbench/whitebox/evals/mmlu_pro/mmlu_pro.py b/src/tamperbench/whitebox/evals/mmlu_pro/mmlu_pro.py
index 58bdba4d..61507476 100644
--- a/src/tamperbench/whitebox/evals/mmlu_pro/mmlu_pro.py
+++ b/src/tamperbench/whitebox/evals/mmlu_pro/mmlu_pro.py
@@ -111,7 +111,7 @@ def compute_scores(self, inferences: DataFrame[InferenceSchema]) -> DataFrame[Sc

     @override
     def compute_results(self, scores: DataFrame[ScoreSchema]) -> DataFrame[EvaluationSchema]:
-        acc = float(pl.Series(scores[ScoreSchema.score]).mean())  # pyright: ignore[reportArgumentType]
+        acc = float(pl.Series(scores[ScoreSchema.score]).mean())
         df = pl.from_dict(
             {
                 EvaluationSchema.metric_name: [str(MetricName.MMLU_PRO_ACCURACY)],
@@ -126,7 +126,7 @@ def _load_split(self, split_name: str) -> list[dict]:
             if SPLIT_SAMPLE_LIMIT is None or SPLIT_SAMPLE_LIMIT <= 0
             else f"{split_name}[:{SPLIT_SAMPLE_LIMIT}]"
         )
-        ds = datasets.load_dataset(DATASET_NAME, split=split)  # pyright: ignore[reportArgumentType]
+        ds = datasets.load_dataset(DATASET_NAME, split=split)
         rows = [dict(item) for item in ds]
         return mmlu_api.preprocess(rows)

diff --git a/src/tamperbench/whitebox/utils/analysis/builders.py b/src/tamperbench/whitebox/utils/analysis/builders.py
index a1c32a5e..cd5d6cf7 100644
--- a/src/tamperbench/whitebox/utils/analysis/builders.py
+++ b/src/tamperbench/whitebox/utils/analysis/builders.py
@@ -43,9 +43,9 @@ class HeatmapData:
     utility_delta: NDArray[np.float32]
     models: list[str]
     attacks: list[AttackName]
-    metadata: dict[str, Any] = field(default_factory=dict)  # pyright: ignore[reportExplicitAny]
+    metadata: dict[str, Any] = field(default_factory=dict)

-    def to_dict(self) -> dict[str, Any]:  # pyright: ignore[reportExplicitAny]
+    def to_dict(self) -> dict[str, Any]:
         """Convert to dictionary for JSON serialization.

         Converts numpy arrays to lists and attack names to strings for JSON compatibility.
@@ -68,7 +68,7 @@ def to_dict(self) -> dict[str, Any]:  # pyright: ignore[reportExplicitAny]
             "metadata": self.metadata,
         }

-    def _to_list(self, array: NDArray[np.float32]) -> list[Any] | Any:  # pyright: ignore[reportExplicitAny]
+    def _to_list(self, array: NDArray[np.float32]) -> list[Any] | Any:
         """Convert numpy array to list, or return as-is if not an array."""
         return array.tolist() if hasattr(array, "tolist") else array

diff --git a/src/tamperbench/whitebox/utils/analysis/config.py b/src/tamperbench/whitebox/utils/analysis/config.py
index 8b4d58cc..f19eff3a 100644
--- a/src/tamperbench/whitebox/utils/analysis/config.py
+++ b/src/tamperbench/whitebox/utils/analysis/config.py
@@ -25,10 +25,10 @@ class AttackCategory:
     attacks: list[AttackName]

     @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> AttackCategory:  # pyright: ignore[reportExplicitAny]
+    def from_dict(cls, data: dict[str, Any]) -> AttackCategory:
         """Create from dictionary with validation."""
-        attacks = [AttackName(a) for a in data["attacks"]]  # pyright: ignore[reportAny]
-        return cls(display_name=data["display_name"], attacks=attacks)  # pyright: ignore[reportAny]
+        attacks = [AttackName(a) for a in data["attacks"]]
+        return cls(display_name=data["display_name"], attacks=attacks)


 @dataclass
@@ -39,7 +39,7 @@ class AttackConfig:
     categories: dict[str, AttackCategory]

     @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> AttackConfig:  # pyright: ignore[reportExplicitAny]
+    def from_dict(cls, data: dict[str, Any]) -> AttackConfig:
         """Create from dictionary with validation.

         Validates that all attack names exist in the registry and parses attack categories.
@@ -62,8 +62,8 @@ def from_dict(cls, data: dict[str, Any]) -> AttackConfig:  # pyright: ignore[rep
         >>> config.preferred_order[0]
         AttackName.NO_WEIGHT_MODIFICATION
         """
-        preferred_order = cls._validate_attacks(data["preferred_order"])  # pyright: ignore[reportAny]
-        categories = cls._parse_categories(data["categories"])  # pyright: ignore[reportAny]
+        preferred_order = cls._validate_attacks(data["preferred_order"])
+        categories = cls._parse_categories(data["categories"])
         return cls(preferred_order=preferred_order, categories=categories)

     @classmethod
@@ -83,12 +83,12 @@ def _validate_attacks(cls, attack_strings: list[str]) -> list[AttackName]:
     @classmethod
     def _parse_categories(
         cls,
-        categories_data: dict[str, Any],  # pyright: ignore[reportExplicitAny]
+        categories_data: dict[str, Any],
     ) -> dict[str, AttackCategory]:
         """Parse category dict into AttackCategory instances."""
         categories: dict[str, AttackCategory] = {}
-        for cat_key, cat_data in categories_data.items():  # pyright: ignore[reportAny]
-            categories[cat_key] = AttackCategory.from_dict(cat_data)  # pyright: ignore[reportAny]
+        for cat_key, cat_data in categories_data.items():
+            categories[cat_key] = AttackCategory.from_dict(cat_data)
         return categories


@@ -100,11 +100,11 @@ class ModelGroup:
     models: list[str]

     @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> ModelGroup:  # pyright: ignore[reportExplicitAny]
+    def from_dict(cls, data: dict[str, Any]) -> ModelGroup:
         """Create from dictionary."""
         return cls(
-            display_name=data["display_name"],  # pyright: ignore[reportAny]
-            models=data["models"],  # pyright: ignore[reportAny]
+            display_name=data["display_name"],
+            models=data["models"],
         )


@@ -121,7 +121,7 @@ class ModelConfig:
     groups: dict[str, ModelGroup] | None = None  # Optional: model groups for visual separation

     @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> ModelConfig:  # pyright: ignore[reportExplicitAny]
+    def from_dict(cls, data: dict[str, Any]) -> ModelConfig:
         """Create from dictionary.

         Parses model configuration including required model list and optional groups.
@@ -139,13 +139,13 @@ def from_dict(cls, data: dict[str, Any]) -> ModelConfig:  # pyright: ignore[repo
         groups = cls._parse_model_groups(data)

         return cls(
-            preferred_order=data["preferred_order"],  # pyright: ignore[reportAny]
+            preferred_order=data["preferred_order"],
             include_models=include_models,
             groups=groups,
         )

     @classmethod
-    def _validate_include_models(cls, data: dict[str, Any]) -> list[str]:  # pyright: ignore[reportExplicitAny]
+    def _validate_include_models(cls, data: dict[str, Any]) -> list[str]:
         """Extract and validate the required include_models list."""
         include_models: list[str] | None = data.get("include_models")
         if not include_models:
@@ -155,14 +155,14 @@ def _validate_include_models(cls, data: dict[str, Any]) -> list[str]:  # pyright
         return include_models

     @classmethod
-    def _parse_model_groups(cls, data: dict[str, Any]) -> dict[str, ModelGroup] | None:  # pyright: ignore[reportExplicitAny]
+    def _parse_model_groups(cls, data: dict[str, Any]) -> dict[str, ModelGroup] | None:
         """Parse optional model group configurations into ModelGroup instances."""
         if "groups" not in data:
             return None

         groups: dict[str, ModelGroup] = {}
-        for group_key, group_data in data["groups"].items():  # pyright: ignore[reportAny]
-            groups[group_key] = ModelGroup.from_dict(group_data)  # pyright: ignore[reportAny]
+        for group_key, group_data in data["groups"].items():
+            groups[group_key] = ModelGroup.from_dict(group_data)
         return groups

     def should_include_model(self, model_name: str) -> bool:
@@ -210,13 +210,13 @@ class MetricConfig:
     direction: OptimizationDirection

     @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> MetricConfig:  # pyright: ignore[reportExplicitAny]
+    def from_dict(cls, data: dict[str, Any]) -> MetricConfig:
         """Create from dictionary."""
-        direction_str: str = data["direction"]  # pyright: ignore[reportAny]
+        direction_str: str = data["direction"]
         direction = OptimizationDirection.MAXIMIZE if direction_str == "maximize" else OptimizationDirection.MINIMIZE
         return cls(
-            display_name=data["display_name"],  # pyright: ignore[reportAny]
-            aliases=data["aliases"],  # pyright: ignore[reportAny]
+            display_name=data["display_name"],
+            aliases=data["aliases"],
             direction=direction,
         )

@@ -230,12 +230,12 @@ class FigureConfig:
     height_formula: str

     @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> FigureConfig:  # pyright: ignore[reportExplicitAny]
+    def from_dict(cls, data: dict[str, Any]) -> FigureConfig:
         """Create from dictionary."""
         return cls(
-            dpi=data["dpi"],  # pyright: ignore[reportAny]
-            width_formula=data["width_formula"],  # pyright: ignore[reportAny]
-            height_formula=data["height_formula"],  # pyright: ignore[reportAny]
+            dpi=data["dpi"],
+            width_formula=data["width_formula"],
+            height_formula=data["height_formula"],
         )

     def calculate_width(self, n_columns: int) -> float:
@@ -250,7 +250,7 @@ def calculate_width(self, n_columns: int) -> float:
         Returns:
             Calculated width in inches
         """
-        result: float = eval(self.width_formula, {"n_columns": n_columns, "max": max})  # pyright: ignore[reportAny]
+        result: float = eval(self.width_formula, {"n_columns": n_columns, "max": max})
         return result

     def calculate_height(self, n_models: int) -> float:
@@ -264,7 +264,7 @@ def calculate_height(self, n_models: int) -> float:
         Returns:
             Calculated height in inches
         """
-        result: float = eval(self.height_formula, {"n_models": n_models, "max": max})  # pyright: ignore[reportAny]
+        result: float = eval(self.height_formula, {"n_models": n_models, "max": max})
         return result


@@ -278,15 +278,15 @@ class VisualizationConfig:
     figure: FigureConfig

     @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> VisualizationConfig:  # pyright: ignore[reportExplicitAny]
+    def from_dict(cls, data: dict[str, Any]) -> VisualizationConfig:
         """Create from dictionary."""
-        colormap_range: tuple[float, float] = tuple(data["colormap_range"])  # pyright: ignore[reportAny]
-        bad_color: tuple[float, float, float, float] = tuple(data["bad_color"])  # pyright: ignore[reportAny]
+        colormap_range: tuple[float, float] = tuple(data["colormap_range"])
+        bad_color: tuple[float, float, float, float] = tuple(data["bad_color"])
         return cls(
-            colormaps=data["colormaps"],  # pyright: ignore[reportAny]
+            colormaps=data["colormaps"],
             colormap_range=colormap_range,
             bad_color=bad_color,
-            figure=FigureConfig.from_dict(data["figure"]),  # pyright: ignore[reportAny]
+            figure=FigureConfig.from_dict(data["figure"]),
         )


@@ -299,10 +299,10 @@ class HeatmapMethod:
     formula: str | None = None

     @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> HeatmapMethod:  # pyright: ignore[reportExplicitAny]
+    def from_dict(cls, data: dict[str, Any]) -> HeatmapMethod:
         """Create from dictionary."""
         return cls(
-            display_name=data["display_name"],  # pyright: ignore[reportAny]
+            display_name=data["display_name"],
             priority=data.get("priority"),
             formula=data.get("formula"),
         )
@@ -319,17 +319,17 @@ class EpsilonBoundedConfig:
     default_n_trials: int

     @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> EpsilonBoundedConfig:  # pyright: ignore[reportExplicitAny]
+    def from_dict(cls, data: dict[str, Any]) -> EpsilonBoundedConfig:
         """Create from dictionary."""
         baseline_attack = AttackName(data["baseline_attack"])
         if baseline_attack not in ATTACKS_REGISTRY:
             raise ValueError(f"Baseline attack '{baseline_attack}' not found in ATTACKS_REGISTRY")
         return cls(
             baseline_attack=baseline_attack,
-            utility_metric=data["utility_metric"],  # pyright: ignore[reportAny]
-            safety_metric=data["safety_metric"],  # pyright: ignore[reportAny]
-            default_epsilons=data["default_epsilons"],  # pyright: ignore[reportAny]
-            default_n_trials=data["default_n_trials"],  # pyright: ignore[reportAny]
+            utility_metric=data["utility_metric"],
+            safety_metric=data["safety_metric"],
+            default_epsilons=data["default_epsilons"],
+            default_n_trials=data["default_n_trials"],
         )


@@ -372,12 +372,12 @@ def from_yaml(cls, path: Path) -> AnalysisConfig:
             raise FileNotFoundError(f"Config file not found: {path}")

         with path.open() as f:
-            data: dict[str, object] = yaml.safe_load(f)  # pyright: ignore[reportAny]
+            data: dict[str, object] = yaml.safe_load(f)

         return cls.from_dict(data)

     @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> AnalysisConfig:  # pyright: ignore[reportExplicitAny]
+    def from_dict(cls, data: dict[str, Any]) -> AnalysisConfig:
         """Create from dictionary with full validation.

         Parses all nested configuration sections including attacks, models, metrics,
@@ -389,12 +389,12 @@ def from_dict(cls, data: dict[str, Any]) -> AnalysisConfig:  # pyright: ignore[r
         Returns:
             Validated AnalysisConfig instance
         """
-        attacks = AttackConfig.from_dict(data["attacks"])  # pyright: ignore[reportAny]
-        models = ModelConfig.from_dict(data["models"])  # pyright: ignore[reportAny]
-        metrics = cls._parse_metrics(data["metrics"])  # pyright: ignore[reportAny]
-        visualization = VisualizationConfig.from_dict(data["visualization"])  # pyright: ignore[reportAny]
-        epsilon_bounded = EpsilonBoundedConfig.from_dict(data["epsilon_bounded"])  # pyright: ignore[reportAny]
-        heatmap_methods = cls._parse_heatmap_methods(data["heatmap_methods"])  # pyright: ignore[reportAny]
+        attacks = AttackConfig.from_dict(data["attacks"])
+        models = ModelConfig.from_dict(data["models"])
+        metrics = cls._parse_metrics(data["metrics"])
+        visualization = VisualizationConfig.from_dict(data["visualization"])
+        epsilon_bounded = EpsilonBoundedConfig.from_dict(data["epsilon_bounded"])
+        heatmap_methods = cls._parse_heatmap_methods(data["heatmap_methods"])

         return cls(
             attacks=attacks,
@@ -406,22 +406,22 @@ def from_dict(cls, data: dict[str, Any]) -> AnalysisConfig:  # pyright: ignore[r
         )

     @classmethod
-    def _parse_metrics(cls, metrics_data: dict[str, Any]) -> dict[str, MetricConfig]:  # pyright: ignore[reportExplicitAny]
+    def _parse_metrics(cls, metrics_data: dict[str, Any]) -> dict[str, MetricConfig]:
         """Parse metric dict into MetricConfig instances."""
         metrics: dict[str, MetricConfig] = {}
-        for metric_key, metric_data in metrics_data.items():  # pyright: ignore[reportAny]
-            metrics[metric_key] = MetricConfig.from_dict(metric_data)  # pyright: ignore[reportAny]
+        for metric_key, metric_data in metrics_data.items():
+            metrics[metric_key] = MetricConfig.from_dict(metric_data)
         return metrics

     @classmethod
     def _parse_heatmap_methods(
         cls,
-        methods_data: dict[str, Any],  # pyright: ignore[reportExplicitAny]
+        methods_data: dict[str, Any],
     ) -> dict[str, HeatmapMethod]:
         """Parse heatmap method dict into HeatmapMethod instances."""
         heatmap_methods: dict[str, HeatmapMethod] = {}
-        for method_key, method_data in methods_data.items():  # pyright: ignore[reportAny]
-            heatmap_methods[method_key] = HeatmapMethod.from_dict(method_data)  # pyright: ignore[reportAny]
+        for method_key, method_data in methods_data.items():
+            heatmap_methods[method_key] = HeatmapMethod.from_dict(method_data)
         return heatmap_methods

     def get_metric_by_alias(self, alias: str) -> MetricConfig | None:
diff --git a/src/tamperbench/whitebox/utils/analysis/filters.py b/src/tamperbench/whitebox/utils/analysis/filters.py
index 435a65b2..7222b0af 100644
--- a/src/tamperbench/whitebox/utils/analysis/filters.py
+++ b/src/tamperbench/whitebox/utils/analysis/filters.py
@@ -158,8 +158,8 @@ def extract_utility_metric(
             utility_metric_key = analysis_config.epsilon_bounded.utility_metric
             return get_metric_value(trial, utility_metric_key, analysis_config)

-        values = trial.values  # pyright: ignore[reportAny]
-        return values[1] if values and len(values) >= 2 else None  # pyright: ignore[reportAny]
+        values = trial.values
+        return values[1] if values and len(values) >= 2 else None

     def extract_safety_metric(
         self,
@@ -179,8 +179,8 @@ def extract_safety_metric(
             safety_metric_key = MetricName(analysis_config.epsilon_bounded.safety_metric)
             return get_metric_value(trial, safety_metric_key, analysis_config)

-        values = trial.values  # pyright: ignore[reportAny]
-        return values[0] if values and len(values) >= 1 else None  # pyright: ignore[reportAny]
+        values = trial.values
+        return values[0] if values and len(values) >= 1 else None

     def satisfies_epsilon_constraint(
         self,
diff --git a/src/tamperbench/whitebox/utils/analysis/io.py b/src/tamperbench/whitebox/utils/analysis/io.py
index a0e73360..21db3f89 100644
--- a/src/tamperbench/whitebox/utils/analysis/io.py
+++ b/src/tamperbench/whitebox/utils/analysis/io.py
@@ -17,7 +17,7 @@
 def write_json(
     data: dict[str, Any] | list[Any],
     path: Path,
-    indent: int = 2,  # pyright: ignore[reportExplicitAny]
+    indent: int = 2,
 ) -> None:
     """Write data to JSON file, creating parent directories as needed.

diff --git a/src/tamperbench/whitebox/utils/analysis/metrics.py b/src/tamperbench/whitebox/utils/analysis/metrics.py
index 4e276cd3..504f065e 100644
--- a/src/tamperbench/whitebox/utils/analysis/metrics.py
+++ b/src/tamperbench/whitebox/utils/analysis/metrics.py
@@ -160,7 +160,7 @@ def _extract_from_user_attrs(trial: optuna.trial.FrozenTrial) -> dict[str, float
         return {}

     metrics: dict[str, float] = {}
-    for key, value in eval_metrics.items():  # pyright: ignore[reportUnknownVariableType]
+    for key, value in eval_metrics.items():
         if isinstance(value, int | float):
             metrics[str(key)] = float(value)

@@ -171,19 +171,19 @@ def _extract_from_trial_values(trial: optuna.trial.FrozenTrial, config: Analysis
     """Extract metrics from trial.values by index (0=SR, 1=MMLU)."""
     metrics: dict[str, float] = {}

-    values = trial.values  # pyright: ignore[reportAny]
+    values = trial.values
     if values is None:
         return metrics

-    if len(values) >= 1:  # pyright: ignore[reportAny]
+    if len(values) >= 1:
         sr_metric = config.metrics.get("strong_reject")
         if sr_metric:
-            metrics[sr_metric.aliases[0]] = float(values[0])  # pyright: ignore[reportAny]
+            metrics[sr_metric.aliases[0]] = float(values[0])

-    if len(values) >= 2:  # pyright: ignore[reportAny]
+    if len(values) >= 2:
         mmlu_metric = config.metrics.get("mmlu_pro")
         if mmlu_metric:
-            metrics[mmlu_metric.aliases[1]] = float(values[1])  # pyright: ignore[reportAny]
+            metrics[mmlu_metric.aliases[1]] = float(values[1])

     return metrics

diff --git a/src/tamperbench/whitebox/utils/analysis/pipeline.py b/src/tamperbench/whitebox/utils/analysis/pipeline.py
index a4fc14a0..6f678dd5 100644
--- a/src/tamperbench/whitebox/utils/analysis/pipeline.py
+++ b/src/tamperbench/whitebox/utils/analysis/pipeline.py
@@ -181,7 +181,7 @@ def build_heatmap_matrices(
         epsilon_filter: EpsilonBoundedFilter,
         n_trials: int,
         parent_results_dir: Path,
-    ) -> tuple[HeatmapData, dict[tuple[str, AttackName], dict[str, Any]]]:  # pyright: ignore[reportExplicitAny]
+    ) -> tuple[HeatmapData, dict[tuple[str, AttackName], dict[str, Any]]]:
         """Build heatmap matrices from epsilon-filtered trials, returning data and trial info."""
         harmfulness_metric_name = MetricName(self.config.epsilon_bounded.safety_metric)
         utility_metric_name = MetricName(self.config.epsilon_bounded.utility_metric)
@@ -194,7 +194,7 @@ def build_heatmap_matrices(
         harmfulness_raw = np.full((n_models, n_attacks), np.nan)
         utility_raw = np.full((n_models, n_attacks), np.nan)

-        selected_trials_info: dict[tuple[str, AttackName], dict[str, Any]] = {}  # pyright: ignore[reportExplicitAny]
+        selected_trials_info: dict[tuple[str, AttackName], dict[str, Any]] = {}

         for i, model in enumerate(models):
             baseline_utility = baseline_utility_scores.get(model)
@@ -258,7 +258,7 @@ def build_heatmap_matrices(
             utility_raw=utility_raw,
             utility_delta=utility_delta,
             models=models,
-            attacks=attacks,  # pyright: ignore[reportArgumentType]
+            attacks=attacks,
             metadata={
                 "method": "epsilon_bounded",
                 "epsilon": epsilon,
@@ -323,7 +323,7 @@ def extract_metrics_for_heatmap(

     def copy_trial_artifacts(
         self,
-        selected_trials_info: dict[tuple[str, AttackName], dict[str, Any]],  # pyright: ignore[reportExplicitAny]
+        selected_trials_info: dict[tuple[str, AttackName], dict[str, Any]],
         all_data: dict[str, dict[AttackName, StudyData]],
         output_dir: Path,
     ) -> None:
@@ -333,8 +333,8 @@ def copy_trial_artifacts(
         )

         for (model, attack), trial_info in selected_trials_info.items():
-            trial_dir: Path = trial_info["trial_dir"]  # pyright: ignore[reportAny]
-            trial_number: int = trial_info["trial_number"]  # pyright: ignore[reportAny]
+            trial_dir: Path = trial_info["trial_dir"]
+            trial_number: int = trial_info["trial_number"]

             model_attack_dir = output_dir / model / str(attack)
             ensure_dir(model_attack_dir)
@@ -346,9 +346,7 @@ def copy_trial_artifacts(

             grid_yaml_path = model_attack_dir / ConfigPath.GRID_YAML
             grid_data: dict[str, dict[str, object]] = {
-                f"{model}_params": dict(
-                    trial.user_attrs[OptunaUserAttrs.MERGED_CONFIG]  # pyright: ignore[reportAny]
-                ),
+                f"{model}_params": dict(trial.user_attrs[OptunaUserAttrs.MERGED_CONFIG]),
             }
             with open(grid_yaml_path, "w") as f:
                 yaml.dump(grid_data, f, default_flow_style=False, sort_keys=False)
@@ -398,7 +396,7 @@ def _select_trial_for_embedding_attack(

         return max(
             completed_trials,
-            key=lambda t: t.user_attrs[OptunaUserAttrs.EVAL_METRICS][embedding_attack_eval_key],  # pyright: ignore[reportAny]
+            key=lambda t: t.user_attrs[OptunaUserAttrs.EVAL_METRICS][embedding_attack_eval_key],
         )

     def _select_trial_with_epsilon_filter(
diff --git a/src/tamperbench/whitebox/utils/analysis/validation.py b/src/tamperbench/whitebox/utils/analysis/validation.py
index 3aba99ed..10f80898 100644
--- a/src/tamperbench/whitebox/utils/analysis/validation.py
+++ b/src/tamperbench/whitebox/utils/analysis/validation.py
@@ -398,7 +398,7 @@ def _parse_attack_name(self, attack_name: str) -> AttackName:
         try:
             return AttackName(attack_name)
         except ValueError as e:
-            available = [a.value for a in AttackName]  # pyright: ignore[reportAny]
+            available = [a.value for a in AttackName]
             raise ValueError(f"Invalid attack name: '{attack_name}'. Available attacks: {available}") from e

     def _check_attack_in_registry(self, attack: AttackName, attack_name: str) -> None:
@@ -426,7 +426,7 @@ def _get_study_name(self, study_path: Path, attack_name: AttackName) -> str:
         best_json_path = study_path.parent / OptunaPaths.BEST_JSON
         if best_json_path.exists():
             with best_json_path.open() as f:
-                best_data: dict[str, object] = json.load(f)  # pyright: ignore[reportAny]
+                best_data: dict[str, object] = json.load(f)
             study_name = best_data.get(BestJsonKeys.STUDY_NAME)
             if isinstance(study_name, str):
                 return study_name
@@ -437,7 +437,7 @@ def _validate_best_json_structure(self, best_json_path: Path) -> None:
         """Load and validate best.json structure, raising ValueError if invalid."""
         try:
             with best_json_path.open() as f:
-                best_data: dict[str, object] = json.load(f)  # pyright: ignore[reportAny]
+                best_data: dict[str, object] = json.load(f)

             self._check_required_fields(best_data)
             self._validate_top_trials(best_data)
diff --git a/src/tamperbench/whitebox/utils/benchmark/config.py b/src/tamperbench/whitebox/utils/benchmark/config.py
index c551f7e6..89bd2224 100644
--- a/src/tamperbench/whitebox/utils/benchmark/config.py
+++ b/src/tamperbench/whitebox/utils/benchmark/config.py
@@ -39,7 +39,7 @@ def primary_objective_direction(self) -> str:
             >>> sweep.primary_objective_direction
             'maximize'
         """
-        return EVALS_REGISTRY[self.evals[0]].attacker_direction.value  # pyright: ignore[reportAny]
+        return EVALS_REGISTRY[self.evals[0]].attacker_direction.value

     @classmethod
     def from_yaml(
@@ -124,7 +124,7 @@ def load_attack_base_config(config_root: Path, attack_name: AttackName) -> tuple
         base_cfg = grid_payload[ConfigKeys.BASE]

         if isinstance(base_cfg, dict):
-            cfg_typed: dict[str, object] = base_cfg  # pyright: ignore[reportUnknownVariableType]
+            cfg_typed: dict[str, object] = base_cfg
             return str(ConfigKeys.BASE), cfg_typed

         raise TypeError(f"Expected dict for '{ConfigKeys.BASE}' config, got {type(base_cfg)}")
@@ -133,7 +133,7 @@ def load_attack_base_config(config_root: Path, attack_name: AttackName) -> tuple
     first_cfg = grid_payload[first_key]

     if isinstance(first_cfg, dict):
-        cfg_typed = first_cfg  # pyright: ignore[reportUnknownVariableType]
+        cfg_typed = first_cfg
         return str(first_key), cfg_typed

     raise TypeError(f"Expected dict for config '{first_key}', got {type(first_cfg)}")
diff --git a/src/tamperbench/whitebox/utils/benchmark/io.py b/src/tamperbench/whitebox/utils/benchmark/io.py
index a12f4506..39165f76 100644
--- a/src/tamperbench/whitebox/utils/benchmark/io.py
+++ b/src/tamperbench/whitebox/utils/benchmark/io.py
@@ -24,7 +24,7 @@ def yaml_to_dict(yaml_file_path: Path) -> dict[str, object]:
         0.0001
     """
     with open(yaml_file_path) as file:
-        data: dict[str, object] = yaml.safe_load(file)  # pyright: ignore[reportAny]
+        data: dict[str, object] = yaml.safe_load(file)
     return data


@@ -77,8 +77,8 @@ def deep_merge_dicts(base: Mapping[str, object], overrides: Mapping[str, object]

         if isinstance(base_value, Mapping) and isinstance(value, Mapping):
             result[key] = deep_merge_dicts(
-                base_value,  # pyright: ignore[reportUnknownArgumentType]
-                value,  # pyright: ignore[reportUnknownArgumentType]
+                base_value,
+                value,
             )
         else:
             result[key] = value
@@ -116,7 +116,7 @@ def nest_dotted_params(flat_params: Mapping[str, object]) -> dict[str, object]:
             next_level = {}
             cursor[part] = next_level

-        cursor = next_level  # pyright: ignore[reportUnknownVariableType]
+        cursor = next_level

     cursor[parts[-1]] = value

diff --git a/src/tamperbench/whitebox/utils/benchmark/sweep.py b/src/tamperbench/whitebox/utils/benchmark/sweep.py
index 8c17676c..b190b579 100644
--- a/src/tamperbench/whitebox/utils/benchmark/sweep.py
+++ b/src/tamperbench/whitebox/utils/benchmark/sweep.py
@@ -29,7 +29,7 @@ class CategoricalParam(BaseModel):

     type: Literal[OptunaParamType.CATEGORICAL]
     # Allow arbitrary Python values (including lists/dicts), e.g., LoRA target_modules
-    choices: list[Any]  # pyright: ignore[reportExplicitAny]
+    choices: list[Any]


 class IntParam(BaseModel):
@@ -193,7 +193,7 @@ def _flatten(prefix: str, dct: dict[str, object]) -> dict[str, object]:
         key = f"{prefix}.{k}" if prefix else k
         # Treat inner nodes as nested if they don't look like a param definition
         if isinstance(v, dict) and "type" not in v and "choices" not in v:
-            nested_dict: dict[str, object] = v  # pyright: ignore[reportUnknownVariableType]
+            nested_dict: dict[str, object] = v
             out.update(_flatten(key, nested_dict))
         else:
             out[key] = v
@@ -219,7 +219,7 @@ def _nest(flat: dict[str, object]) -> dict[str, object]:
             if not isinstance(next_level, dict):
                 next_level = {}
                 cur[p] = next_level
-            cur = next_level  # pyright: ignore[reportUnknownVariableType]
+            cur = next_level

         cur[parts[-1]] = value
     return root
@@ -254,7 +254,7 @@ def suggest_nested_params(trial: optuna.Trial, nested_space: dict[str, object])
     for k, v in flat_space.items():
         param_def: object = v
         if isinstance(v, dict) and "type" not in v and "choices" in v:
-            param_def = {  # pyright: ignore[reportUnknownVariableType]
+            param_def = {
                 "type": OptunaParamType.CATEGORICAL,
                 **v,
             }
diff --git a/src/tamperbench/whitebox/utils/benchmark/trial_manager.py b/src/tamperbench/whitebox/utils/benchmark/trial_manager.py
index 9e6fa8a8..7ac22f34 100644
--- a/src/tamperbench/whitebox/utils/benchmark/trial_manager.py
+++ b/src/tamperbench/whitebox/utils/benchmark/trial_manager.py
@@ -150,11 +150,11 @@ def sorted_completed_trials(study: optuna.study.Study, direction: str) -> list[o
     completed_trials = [
         trial
         for trial in study.get_trials(deepcopy=False)
-        if trial.state == optuna.trial.TrialState.COMPLETE and trial.values is not None  # pyright: ignore[reportAny]
+        if trial.state == optuna.trial.TrialState.COMPLETE and trial.values is not None
     ]

     def sort_key(trial: optuna.trial.FrozenTrial) -> float:
-        values: list[float] = trial.values  # pyright: ignore[reportAny]
+        values: list[float] = trial.values
         return -values[0] if direction == OptimizationDirection.MAXIMIZE else values[0]

     return sorted(completed_trials, key=sort_key)
@@ -212,15 +212,12 @@ def build_summaries(

         if not isinstance(eval_metrics, dict):
             primary_eval_key = str(eval_names[0])
-            trial_values: list[float] = trial.values  # pyright: ignore[reportAny]
+            trial_values: list[float] = trial.values
             eval_metrics = {primary_eval_key: float(trial_values[0]) if trial_values else float("nan")}

-        values_list: list[float] = [
-            float(eval_metrics.get(str(e), float("nan")))  # pyright: ignore[reportUnknownMemberType,reportUnknownArgumentType]
-            for e in eval_names
-        ]
+        values_list: list[float] = [float(eval_metrics.get(str(e), float("nan"))) for e in eval_names]

-        trial_values_for_summary: list[float] = trial.values  # pyright: ignore[reportAny]
+        trial_values_for_summary: list[float] = trial.values

         summaries.append(
             {
diff --git a/src/tamperbench/whitebox/utils/models/config.py b/src/tamperbench/whitebox/utils/models/config.py
index d43aba61..8987c05e 100644
--- a/src/tamperbench/whitebox/utils/models/config.py
+++ b/src/tamperbench/whitebox/utils/models/config.py
@@ -32,7 +32,7 @@ class ModelConfig:
     tokenizer_checkpoint: str | None = None

     @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> Self:  # pyright: ignore[reportExplicitAny]
+    def from_dict(cls, data: dict[str, Any]) -> Self:
         """All subclasses must implement a `from_dict` constructor.

         Args:
@@ -63,4 +63,4 @@ def from_dict(cls, data: dict[str, Any]) -> Self:  # pyright: ignore[reportExpli
         if dict_key not in config_keys:
             raise ValueError(f"`{dict_key}` is not a field of {cls.__name__}")

-        return cls(**data)  # pyright: ignore[reportAny]
+        return cls(**data)
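
A note for reviewers: one way to keep stale suppressions from creeping back in
is to let pyright report them itself. The sketch below is illustrative only --
the file name is hypothetical and this patch does not touch the project's
pyright configuration, so enabling the setting is an assumption on my part:

    # stale_ignore_demo.py (hypothetical, not part of this patch)
    # With reportUnnecessaryTypeIgnoreComment set to "error" in
    # pyrightconfig.json, pyright flags the suppression below as unnecessary,
    # since dict(data) produces no reportArgumentType diagnostic to begin with.
    from typing import Any

    def copy_config(data: dict[str, Any]) -> dict[str, Any]:
        # Stale suppression: pyright reports it as an unnecessary ignore comment.
        return dict(data)  # pyright: ignore[reportArgumentType]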