Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Skip data artifacts. pre-commit applies this pattern with a regex *search*
# against each file path, so no leading ".*" is needed; the escaped dot makes
# sure only real ".csv" / ".pdb" extensions are excluded (not e.g. "notcsv").
exclude: '\.(csv|pdb)$'

repos:
  # General-purpose hygiene checks
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v6.0.0
    hooks:
      - id: check-added-large-files
      - id: check-executables-have-shebangs
      - id: check-merge-conflict
      - id: check-toml
      - id: debug-statements
      - id: end-of-file-fixer
      - id: trailing-whitespace

  # Fast Python linter and formatter - replaces flake8, isort, and black
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.13.1
    hooks:
      # Run the Ruff linter; fail the commit when it had to fix something so
      # the fixed files get re-staged and reviewed
      - id: ruff-check
        args: ["--fix", "--exit-non-zero-on-fix"]
      # Run the Ruff formatter
      - id: ruff-format
29 changes: 14 additions & 15 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,56 +91,56 @@ from abdev_core import BaseModel, load_features

class YourModel(BaseModel):
    """Your model description.

    This baseline [describe approach].
    """

    def train(self, df: pd.DataFrame, run_dir: Path, *, seed: int = 42) -> None:
        """Train model on ALL provided data and save artifacts to run_dir.

        Args:
            df: Training dataframe with sequences and labels
            run_dir: Directory to save model artifacts
            seed: Random seed for reproducibility
        """
        run_dir.mkdir(parents=True, exist_ok=True)

        # Load features if needed
        features = load_features("YourFeatureSource", dataset="GDPa1")

        # Train your model on ALL samples in df
        # The orchestrator handles CV splitting externally
        # ... your training logic here ...

        # Save model artifacts (use a context manager so the file is always
        # flushed and closed, even if pickling fails)
        # model_path = run_dir / "model.pkl"
        # with open(model_path, "wb") as f:
        #     pickle.dump(model, f)

        print(f"Model saved to {run_dir}")

    def predict(self, df: pd.DataFrame, run_dir: Path) -> pd.DataFrame:
        """Generate predictions for ALL provided samples.

        Args:
            df: Input dataframe with sequences
            run_dir: Directory containing saved model artifacts

        Returns:
            DataFrame with predictions
        """
        # Load model artifacts (only unpickle files written by your own train())
        # with open(run_dir / "model.pkl", "rb") as f:
        #     model = pickle.load(f)

        # Load features if needed
        features = load_features("YourFeatureSource")

        # Generate predictions for ALL samples
        # ... your prediction logic here ...

        # Return predictions: identifying columns plus one column per property
        df_output = df[
            ["antibody_name", "vh_protein_sequence", "vl_protein_sequence"]
        ].copy()
        # df_output["HIC"] = predictions

        return df_output
```

Expand Down Expand Up @@ -377,4 +377,3 @@ Add XYZ baseline with feature engineering
## Questions?

Open an issue or reach out to maintainers.

5 changes: 2 additions & 3 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,18 @@ SOFTWARE.
### Datasets
The benchmark datasets included in this repository (under `data/`) may be
subject to separate terms of use and restrictions. Please refer to the
documentation provided by the original sources of the datasets in `data/` for
specific dataset licensing information.

### Baseline Models and External Code
Individual baseline implementations (under `baselines/`) may incorporate or
depend on third-party code, models, or methods that are subject to their own
licenses and terms of use. Each baseline directory may contain its own LICENSE
or README file specifying applicable terms. Users should review and comply with
any such terms, in addition to the terms set by the original sources of the
models and code, when using or distributing baseline implementations.

When contributing a new baseline that uses external methods or models, please:
- Clearly document the source and any licensing requirements in the baseline's README
- Include appropriate citations and attributions
- Ensure compatibility with this repository's MIT License or note any restrictions

16 changes: 8 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -186,12 +186,12 @@ All baselines must implement the `BaseModel` interface with `train()` and `predi
version = "0.1.0"
channels = ["conda-forge"]
platforms = ["linux-64", "osx-64", "osx-arm64"]

[dependencies]
python = "3.11.*"
pandas = ">=2.0"
typer = ">=0.9"

[pypi-dependencies]
abdev-core = { path = "../../libs/abdev_core", editable = true }
your-baseline = { path = ".", editable = true }
Expand All @@ -204,17 +204,17 @@ All baselines must implement the `BaseModel` interface with `train()` and `predi
from pathlib import Path
import pandas as pd
from abdev_core import BaseModel

class YourModel(BaseModel):
def train(self, df: pd.DataFrame, run_dir: Path, *, seed: int = 42) -> None:
"""Train model on ALL provided data and save artifacts to run_dir."""
# Train on ALL samples in df (no internal CV)
# Your training logic here
pass

def predict(self, df: pd.DataFrame, run_dir: Path) -> pd.DataFrame:
"""Generate predictions for ALL provided samples.

Returns:
DataFrame with predictions. Orchestrator handles saving to file.
"""
Expand All @@ -228,9 +228,9 @@ All baselines must implement the `BaseModel` interface with `train()` and `predi
```python
from abdev_core import create_cli_app
from .model import YourModel

app = create_cli_app(YourModel, "Your Model")

if __name__ == "__main__":
app()
```
Expand All @@ -248,7 +248,7 @@ All baselines must implement the `BaseModel` interface with `train()` and `predi
```bash
# From repository root
python tests/test_baseline_contract.py --baseline your_baseline

# Or test train/predict manually
cd baselines/your_baseline
pixi install
Expand Down
1 change: 0 additions & 1 deletion baselines/aggrescan3d/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,4 +67,3 @@ Aggrescan3D: Zambrano R, et al. (2015). "AGGRESCAN3D (A3D): server for predictio
## Acknowledgements

Aggrescan3D features computed via Tamarind.bio.

1 change: 0 additions & 1 deletion baselines/aggrescan3d/pixi.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,3 @@ ruff = ">=0.1"
[feature.dev.tasks]
lint = "ruff check src && ruff format --check src"
test = "pytest tests -v"

1 change: 0 additions & 1 deletion baselines/aggrescan3d/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,3 @@ where = ["src"]

[tool.setuptools.package-dir]
"" = "src"

1 change: 0 additions & 1 deletion baselines/aggrescan3d/src/aggrescan3d/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
"""Aggrescan3D baseline - aggregation propensity predictions."""

__version__ = "0.1.0"

1 change: 0 additions & 1 deletion baselines/aggrescan3d/src/aggrescan3d/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,3 @@

if __name__ == "__main__":
app()

51 changes: 26 additions & 25 deletions baselines/aggrescan3d/src/aggrescan3d/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,96 +19,97 @@

class Aggrescan3dModel(BaseModel):
    """Aggrescan3D baseline using pre-computed aggregation propensity features.

    This is a non-training baseline that directly maps Aggrescan3D features
    to predicted properties based on known correlations.

    Features are loaded from the centralized feature store via abdev_core.
    """

    def train(self, df: pd.DataFrame, run_dir: Path, *, seed: int = 42) -> None:
        """No-op training - this baseline uses pre-computed features.

        Saves configuration to run_dir for consistency.

        Args:
            df: Training dataframe (not used)
            run_dir: Directory to save configuration
            seed: Random seed (not used)
        """
        run_dir.mkdir(parents=True, exist_ok=True)

        # Save configuration for reference
        config = {
            "model_type": "aggrescan3d",
            "feature_mappings": FEATURE_MAPPINGS,
            "note": "Non-training baseline using pre-computed Aggrescan3D features",
        }

        config_path = run_dir / "config.json"
        with open(config_path, "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2)

        print(f"Saved configuration to {config_path}")
        print("Note: This is a non-training baseline using pre-computed features")

    def predict(self, df: pd.DataFrame, run_dir: Path) -> pd.DataFrame:
        """Generate predictions using Aggrescan3D features.

        Each property is predicted from the first feature in FEATURE_MAPPINGS
        that both maps to it and is present in the loaded feature table; the
        prediction is the feature value multiplied by its directionality.

        Args:
            df: Input dataframe with sequences
            run_dir: Directory containing configuration (not strictly needed)

        Returns:
            DataFrame with the identifying columns of ``df`` (same rows, same
            index) plus one prediction column per property
        """
        # Load Aggrescan3D features from centralized feature store (all datasets)
        aggrescan_features = load_features("Aggrescan3D")

        # Keep only the mapped features that actually exist in the store
        available = {}
        for feature_name, assay_mappings in FEATURE_MAPPINGS.items():
            if feature_name in aggrescan_features.columns:
                available[feature_name] = assay_mappings
            else:
                print(f"Warning: {feature_name} not found in features, skipping")

        # One row per antibody on the right-hand side guarantees that the left
        # merge below returns exactly one row per input row, in input order.
        feature_table = (
            aggrescan_features[list(available)]
            .reset_index()
            .drop_duplicates(subset="antibody_name")
        )
        df_merged = df[["antibody_name"]].merge(
            feature_table,
            on="antibody_name",
            how="left",
        )

        df_output = df[
            ["antibody_name", "vh_protein_sequence", "vl_protein_sequence"]
        ].copy()

        # Use the first available feature for each property
        # (multiple features may map to the same property)
        property_sources = {}
        for feature_name, assay_mappings in available.items():
            for assay_name, directionality in assay_mappings:
                if assay_name in property_sources:
                    continue
                property_sources[assay_name] = feature_name
                # merge() resets the index, so assign positionally; an
                # index-aligned assignment would scramble or blank the values
                # whenever df does not carry a default 0..n-1 index.
                df_output[assay_name] = (
                    df_merged[feature_name] * directionality
                ).to_numpy()

        print(f"Generated predictions for {len(df_output)} samples")
        print(f" Properties: {', '.join(property_sources)}")

        return df_output
1 change: 0 additions & 1 deletion baselines/aggrescan3d/src/aggrescan3d/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,3 @@

if __name__ == "__main__":
app()

1 change: 0 additions & 1 deletion baselines/antifold/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,4 +66,3 @@ AntiFold: Ruffolo JA, et al. (2022). "Antibody structure prediction using interp
## Acknowledgements

AntiFold features computed via Tamarind.bio.

1 change: 0 additions & 1 deletion baselines/antifold/pixi.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,3 @@ ruff = ">=0.1"
[feature.dev.tasks]
lint = "ruff check src && ruff format --check src"
test = "pytest tests -v"

1 change: 0 additions & 1 deletion baselines/antifold/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,3 @@ where = ["src"]

[tool.setuptools.package-dir]
"" = "src"

1 change: 0 additions & 1 deletion baselines/antifold/src/antifold/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
"""AntiFold baseline - antibody stability predictions."""

__version__ = "0.1.0"

1 change: 0 additions & 1 deletion baselines/antifold/src/antifold/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,3 @@

if __name__ == "__main__":
app()

Loading