ginkgobioworks · loodvn · Oct 29, 2025
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,23 @@
+exclude: ".*(csv|pdb)$"
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v6.0.0
+    hooks:
+      - id: check-added-large-files
+      - id: check-executables-have-shebangs
+      - id: check-merge-conflict
+      - id: check-toml
+      - id: debug-statements
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+
+  # Fast Python linter and formatter - replaces flake8, isort, and black
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.13.1
+    hooks:
+      # Run the Ruff linter
+      - id: ruff-check
+        args: ["--fix", "--exit-non-zero-on-fix"]
+      # Run the Ruff formatter
+      - id: ruff-format
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -91,56 +91,56 @@ from abdev_core import BaseModel, load_features
 
 class YourModel(BaseModel):
     """Your model description.
-    
+
     This baseline [describe approach].
     """
-    
+
     def train(self, df: pd.DataFrame, run_dir: Path, *, seed: int = 42) -> None:
         """Train model on ALL provided data and save artifacts to run_dir.
-        
+
         Args:
             df: Training dataframe with sequences and labels
             run_dir: Directory to save model artifacts
             seed: Random seed for reproducibility
         """
         run_dir.mkdir(parents=True, exist_ok=True)
-        
+
         # Load features if needed
         features = load_features("YourFeatureSource", dataset="GDPa1")
-        
+
         # Train your model on ALL samples in df
         # The orchestrator handles CV splitting externally
         # ... your training logic here ...
-        
+
         # Save model artifacts
         # model_path = run_dir / "model.pkl"
         # pickle.dump(model, open(model_path, "wb"))
-        
+
         print(f"Model saved to {run_dir}")
-    
+
     def predict(self, df: pd.DataFrame, run_dir: Path) -> pd.DataFrame:
         """Generate predictions for ALL provided samples.
-        
+
         Args:
             df: Input dataframe with sequences
             run_dir: Directory containing saved model artifacts
-            
+
         Returns:
             DataFrame with predictions
         """
         # Load model artifacts
         # model = pickle.load(open(run_dir / "model.pkl", "rb"))
-        
+
         # Load features if needed
         features = load_features("YourFeatureSource")
-        
+
         # Generate predictions for ALL samples
         # ... your prediction logic here ...
-        
+
         # Return predictions
         df_output = df[["antibody_name", "vh_protein_sequence", "vl_protein_sequence"]].copy()
         # df_output["HIC"] = predictions
-        
+
         return df_output
 ```
 
@@ -377,4 +377,3 @@ Add XYZ baseline with feature engineering
 ## Questions?
 
 Open an issue or reach out to maintainers.
-
diff --git a/LICENSE b/LICENSE
@@ -27,19 +27,18 @@ SOFTWARE.
 ### Datasets
 The benchmark datasets included in this repository (under `data/`) may be
 subject to separate terms of use and restrictions. Please refer to the
-documentation of the originating sources of the `data/` directory for specific 
+documentation of the originating sources of the `data/` directory for specific
 dataset licensing information.
 
 ### Baseline Models and External Code
 Individual baseline implementations (under `baselines/`) may incorporate or
 depend on third-party code, models, or methods that are subject to their own
 licenses and terms of use. Each baseline directory may contain its own LICENSE
 or README file specifying applicable terms. Users should review and comply with
-any such terms in addition to those of original origination for models and code when 
+any such terms in addition to those of original origination for models and code when
 using or distributing baseline implementations.
 
 When contributing a new baseline that uses external methods or models, please:
 - Clearly document the source and any licensing requirements in the baseline's README
 - Include appropriate citations and attributions
 - Ensure compatibility with this repository's MIT License or note any restrictions
-
diff --git a/README.md b/README.md
@@ -186,12 +186,12 @@ All baselines must implement the `BaseModel` interface with `train()` and `predi
    version = "0.1.0"
    channels = ["conda-forge"]
    platforms = ["linux-64", "osx-64", "osx-arm64"]
-   
+
    [dependencies]
    python = "3.11.*"
    pandas = ">=2.0"
    typer = ">=0.9"
-   
+
    [pypi-dependencies]
    abdev-core = { path = "../../libs/abdev_core", editable = true }
    your-baseline = { path = ".", editable = true }
@@ -204,17 +204,17 @@ All baselines must implement the `BaseModel` interface with `train()` and `predi
    from pathlib import Path
    import pandas as pd
    from abdev_core import BaseModel
-   
+
    class YourModel(BaseModel):
        def train(self, df: pd.DataFrame, run_dir: Path, *, seed: int = 42) -> None:
            """Train model on ALL provided data and save artifacts to run_dir."""
            # Train on ALL samples in df (no internal CV)
            # Your training logic here
            pass
-       
+
        def predict(self, df: pd.DataFrame, run_dir: Path) -> pd.DataFrame:
            """Generate predictions for ALL provided samples.
-           
+
            Returns:
                DataFrame with predictions. Orchestrator handles saving to file.
            """
@@ -228,9 +228,9 @@ All baselines must implement the `BaseModel` interface with `train()` and `predi
    ```python
    from abdev_core import create_cli_app
    from .model import YourModel
-   
+
    app = create_cli_app(YourModel, "Your Model")
-   
+
    if __name__ == "__main__":
        app()
    ```
@@ -248,7 +248,7 @@ All baselines must implement the `BaseModel` interface with `train()` and `predi
    ```bash
    # From repository root
    python tests/test_baseline_contract.py --baseline your_baseline
-   
+
    # Or test train/predict manually
    cd baselines/your_baseline
    pixi install

diff --git a/baselines/aggrescan3d/README.md b/baselines/aggrescan3d/README.md
@@ -67,4 +67,3 @@ Aggrescan3D: Zambrano R, et al. (2015). "AGGRESCAN3D (A3D): server for predictio
 ## Acknowledgements
 
 Aggrescan3D features computed via Tamarind.bio.
-
diff --git a/baselines/aggrescan3d/pixi.toml b/baselines/aggrescan3d/pixi.toml
@@ -25,4 +25,3 @@ ruff = ">=0.1"
 [feature.dev.tasks]
 lint = "ruff check src && ruff format --check src"
 test = "pytest tests -v"
-
diff --git a/baselines/aggrescan3d/pyproject.toml b/baselines/aggrescan3d/pyproject.toml
@@ -17,4 +17,3 @@ where = ["src"]
 
 [tool.setuptools.package-dir]
 "" = "src"
-
diff --git a/baselines/aggrescan3d/src/aggrescan3d/__init__.py b/baselines/aggrescan3d/src/aggrescan3d/__init__.py
@@ -1,4 +1,3 @@
 """Aggrescan3D baseline - aggregation propensity predictions."""
 
 __version__ = "0.1.0"
-
diff --git a/baselines/aggrescan3d/src/aggrescan3d/__main__.py b/baselines/aggrescan3d/src/aggrescan3d/__main__.py
@@ -4,4 +4,3 @@
 
 if __name__ == "__main__":
     app()
-
diff --git a/baselines/aggrescan3d/src/aggrescan3d/model.py b/baselines/aggrescan3d/src/aggrescan3d/model.py
@@ -19,96 +19,97 @@
 
 class Aggrescan3dModel(BaseModel):
     """Aggrescan3D baseline using pre-computed aggregation propensity features.
-    
+
     This is a non-training baseline that directly maps Aggrescan3D features
     to predicted properties based on known correlations.
-    
+
     Features are loaded from the centralized feature store via abdev_core.
     """
-    
+
     def train(self, df: pd.DataFrame, run_dir: Path, *, seed: int = 42) -> None:
         """No-op training - this baseline uses pre-computed features.
-        
+
         Saves configuration to run_dir for consistency.
-        
+
         Args:
             df: Training dataframe (not used)
             run_dir: Directory to save configuration
             seed: Random seed (not used)
         """
         run_dir.mkdir(parents=True, exist_ok=True)
-        
+
         # Save configuration for reference
         config = {
             "model_type": "aggrescan3d",
             "feature_mappings": FEATURE_MAPPINGS,
-            "note": "Non-training baseline using pre-computed Aggrescan3D features"
+            "note": "Non-training baseline using pre-computed Aggrescan3D features",
         }
-        
+
         config_path = run_dir / "config.json"
         with open(config_path, "w") as f:
             json.dump(config, f, indent=2)
-        
+
         print(f"Saved configuration to {config_path}")
         print("Note: This is a non-training baseline using pre-computed features")
-    
+
     def predict(self, df: pd.DataFrame, run_dir: Path) -> pd.DataFrame:
         """Generate predictions using Aggrescan3D features.
-        
+
         Args:
             df: Input dataframe with sequences
             run_dir: Directory containing configuration (not strictly needed)
-            
+
         Returns:
             DataFrame with predictions for each property
         """
         # Load Aggrescan3D features from centralized feature store (all datasets)
         aggrescan_features = load_features("Aggrescan3D")
-        
+
         # Generate predictions for all mapped features
         all_predictions = []
-        
+
         for feature_name, assay_mappings in FEATURE_MAPPINGS.items():
             if feature_name not in aggrescan_features.columns:
                 print(f"Warning: {feature_name} not found in features, skipping")
                 continue
-            
+
             # Merge sequences with features
             df_merged = df.merge(
                 aggrescan_features[[feature_name]].reset_index(),
                 on="antibody_name",
-                how="left"
+                how="left",
             )
-            
+
             # Apply directionality to create predictions
             for assay_name, directionality in assay_mappings:
                 df_merged[f"{assay_name}_from_{feature_name}"] = (
                     df_merged[feature_name] * directionality
                 )
-            
+
             all_predictions.append(df_merged)
-        
+
         # For now, we'll output the first mapping for each property
         # (This matches the original behavior where multiple features map to same properties)
-        df_output = df[["antibody_name", "vh_protein_sequence", "vl_protein_sequence"]].copy()
-
+        df_output = df[
+            ["antibody_name", "vh_protein_sequence", "vl_protein_sequence"]
+        ].copy()
+
         # Collect predictions from the first available feature for each property
         property_sources = {}
         for feature_name, assay_mappings in FEATURE_MAPPINGS.items():
             for assay_name, _ in assay_mappings:
                 if assay_name not in property_sources:
                     property_sources[assay_name] = feature_name
-        
+
         # Merge predictions
         for assay_name, feature_name in property_sources.items():
             col_name = f"{assay_name}_from_{feature_name}"
             for df_pred in all_predictions:
                 if col_name in df_pred.columns:
                     df_output[assay_name] = df_pred[col_name]
                     break
-        
+
         print(f"Generated predictions for {len(df_output)} samples")
         print(f"  Properties: {', '.join(property_sources.keys())}")
-
-        return df_output
 
+        return df_output
diff --git a/baselines/aggrescan3d/src/aggrescan3d/run.py b/baselines/aggrescan3d/src/aggrescan3d/run.py
@@ -9,4 +9,3 @@
 
 if __name__ == "__main__":
     app()
-
diff --git a/baselines/antifold/README.md b/baselines/antifold/README.md
@@ -66,4 +66,3 @@ AntiFold: Ruffolo JA, et al. (2022). "Antibody structure prediction using interp
 ## Acknowledgements
 
 AntiFold features computed via Tamarind.bio.
-
diff --git a/baselines/antifold/pixi.toml b/baselines/antifold/pixi.toml
@@ -25,4 +25,3 @@ ruff = ">=0.1"
 [feature.dev.tasks]
 lint = "ruff check src && ruff format --check src"
 test = "pytest tests -v"
-
diff --git a/baselines/antifold/pyproject.toml b/baselines/antifold/pyproject.toml
@@ -17,4 +17,3 @@ where = ["src"]
 
 [tool.setuptools.package-dir]
 "" = "src"
-
diff --git a/baselines/antifold/src/antifold/__init__.py b/baselines/antifold/src/antifold/__init__.py
@@ -1,4 +1,3 @@
 """AntiFold baseline - antibody stability predictions."""
 
 __version__ = "0.1.0"
-
diff --git a/baselines/antifold/src/antifold/__main__.py b/baselines/antifold/src/antifold/__main__.py
@@ -4,4 +4,3 @@
 
 if __name__ == "__main__":
     app()
-
Original file line number	Diff line number	Diff line change
Expand Up		@@ -67,4 +67,3 @@ Aggrescan3D: Zambrano R, et al. (2015). "AGGRESCAN3D (A3D): server for predictio
		## Acknowledgements

		Aggrescan3D features computed via Tamarind.bio.
Original file line number	Diff line number	Diff line change
Expand Up		@@ -25,4 +25,3 @@ ruff = ">=0.1"
		[feature.dev.tasks]
		lint = "ruff check src && ruff format --check src"
		test = "pytest tests -v"
Original file line number	Diff line number	Diff line change
Expand Up		@@ -17,4 +17,3 @@ where = ["src"]

		[tool.setuptools.package-dir]
		"" = "src"
Original file line number	Diff line number	Diff line change
		@@ -1,4 +1,3 @@
		"""Aggrescan3D baseline - aggregation propensity predictions."""

		__version__ = "0.1.0"
Original file line number	Diff line number	Diff line change
Expand Up		@@ -66,4 +66,3 @@ AntiFold: Ruffolo JA, et al. (2022). "Antibody structure prediction using interp
		## Acknowledgements

		AntiFold features computed via Tamarind.bio.
Original file line number	Diff line number	Diff line change
		@@ -1,4 +1,3 @@
		"""AntiFold baseline - antibody stability predictions."""

		__version__ = "0.1.0"