Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ ESM C | Single sequence | [ESM Team](https://evolutionaryscale.ai/blog/esm-cambr
xTrimoPGLM | Single sequence | [Chen, B., Cheng, X., Li, P., Geng, Y., Gong, J., Li, S., Bei, Z., Tan, X., Wang, B., Zeng, X., Liu, C., Zeng, A., Dong, Y., Tang, J., & Song, L. (2025). xTrimoPGLM: unified 100-billion-parameter pretrained transformer for deciphering the language of proteins. Nature methods.](https://www.nature.com/articles/s41592-025-02636-z)
ProGen3 | Single sequence | [Bhatnagar, A., Jain, S., Beazer, J., Curran, S.C., Hoffnagle, A.M., Ching, K., Martyn, M., Nayfach, S., Ruffolo, J.A., & Madani, A. (2025). Scaling unlocks broader generation and deeper functional understanding of proteins. bioRxiv, 2025.04.15.649055.](https://doi.org/10.1101/2025.04.15.649055)
AIDO | MSA & Structure | [Sun, N., Zou, S., Tao, T., Mahbub, S., Li, D., Zhuang, Y., Wang, H., Cheng, X., Song, L., & Xing, E.P. (2024). Mixture of Experts Enable Efficient and Effective Protein Understanding and Design. bioRxiv.](https://www.biorxiv.org/content/10.1101/2024.11.29.625425v1)
StructureDCA | MSA & Structure | [Tsishyn, M., Talibart, H., Rooman, M., & Pucci, F. (2026). Structure-informed direct coupling analysis improves protein mutational landscape predictions. bioRxiv.](https://www.biorxiv.org/content/10.64898/2026.03.27.714804)

For clinical baselines, we used dbNSFP 4.4a as detailed in the manuscript appendix (and in `proteingym/clinical_benchmark_notebooks/clinical_subs_processing.ipynb`).

Expand Down Expand Up @@ -218,6 +219,7 @@ ESM3 | https://github.com/evolutionaryscale/esm
xTrimoPGLM | https://github.com/biomap-research/xTrimoPGLM
ProGen3 | https://github.com/Profluent-AI/progen3
AIDO | https://github.com/genbio-ai/AIDO
StructureDCA | https://github.com/3BioCompBio/StructureDCA

We would like to thank the GEMME team for providing model scores on an earlier version of the benchmark (ProteinGym v0.1), and the ProtSSN, SaProt, PoET, MULAN, VespaG, ProSST, ESCOTT, VenusREM, RSALOR, SiteRM and AIDO teams for integrating their models into the ProteinGym repo.

Expand Down
14 changes: 14 additions & 0 deletions config.json
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,20 @@
"directionality": 1,
"key": "mutated_sequence",
"model_type": "MSA"
},
"StructureDCA": {
"input_score_name": "StructureDCA",
"location": "StructureDCA",
"directionality": -1,
"key": "mutant",
"model_type": "Structure & MSA"
},
"StructureDCA[RSA]": {
"input_score_name": "StructureDCA[RSA]",
"location": "StructureDCA",
"directionality": -1,
"key": "mutant",
"model_type": "Structure & MSA"
}
},
"model_list_zero_shot_indels_DMS": {
Expand Down
128 changes: 128 additions & 0 deletions proteingym/baselines/StructureDCA/run_structuredca.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@

"""
Run StructureDCA mutational landscape predictions on DMS datasets.
"""

# Imports ----------------------------------------------------------------------
import os
import argparse
import time
# StructureDCA is an optional third-party dependency: turn a failed import into
# an actionable installation hint.
try:
    from structuredca import StructureDCA
    from structuredca.structuredca import CSV
except ImportError as err:
    # Catch ImportError only, so that KeyboardInterrupt / SystemExit and
    # unrelated errors are not masked, and chain the original exception to
    # keep its traceback visible.
    raise ImportError(
        "Import of pip package 'structuredca' failed. Please install the package with 'pip install structuredca'."
    ) from err


# Main -------------------------------------------------------------------------
def main():
    """Run StructureDCA on every DMS dataset listed in the reference file.

    Command-line arguments (all required):
        --reference_file_path: datasets reference '.csv' file.
        --data_folder: folder holding one '<DMS_id>.csv' file per dataset.
        --MSA_folder: folder holding the MSA files named in the reference file.
        --structure_folder: folder holding the PDB files named in the
            reference file.
        --output_scores_folder: folder where '<DMS_id>.csv' score files are
            written (created if missing).

    For each reference entry, a StructureDCA model is built from the MSA and
    the structure, every mutation of the dataset is scored with and without
    RSA reweighting, and the dataset (without its DMS score columns) is written
    to the output folder. Datasets whose output file already exists are
    skipped.
    """

    # Constants
    t1_total = time.time()
    MUTATION_PROPERTY = "mutant"
    PREDICTION_PROPERTY = "StructureDCA"
    PREDICTION_WITH_RSA_PROPERTY = "StructureDCA[RSA]"
    REMOVE_PROPERTIES_IN_OUTPUT = ["DMS_score", "DMS_score_bin"]
    SEP = ","
    CHAIN = "A"
    MAX_CPUS = 16

    # Set number of used CPUs: half of the available ones, capped at MAX_CPUS
    # (os.cpu_count() may return None, in which case a single CPU is assumed)
    n_cpu_total = os.cpu_count() or 1
    n_cpu_used = max(1, min(n_cpu_total // 2, MAX_CPUS))
    print("\n\nStructureDCA Runs Settings -------------------------------------------------")
    print(f" * Used CPUs: {n_cpu_used}")

    # Parse arguments
    # (every argument is used unconditionally below, so all are required)
    parser = argparse.ArgumentParser(description='StructureDCA arguments')
    parser.add_argument("--reference_file_path", type=str, required=True, help="Datasets reference '.csv' file.")
    parser.add_argument("--data_folder", type=str, required=True, help="Initial DMS datasets folder")
    parser.add_argument("--MSA_folder", type=str, required=True, help="MSA ('.a2m' or '.fasta') folder")
    parser.add_argument("--structure_folder", type=str, required=True, help="PDB ('.pdb') folder")
    parser.add_argument("--output_scores_folder", type=str, required=True, help="Output folder")
    args = parser.parse_args()
    print(f" * reference_file_path: '{args.reference_file_path}'")
    print(f" * data_folder: '{args.data_folder}'")
    print(f" * MSA_folder: '{args.MSA_folder}'")
    print(f" * structure_folder: '{args.structure_folder}'")
    print(f" * output_scores_folder: '{args.output_scores_folder}'")

    # Read reference file
    dataset_reference_path = args.reference_file_path
    dataset_reference = CSV.read(dataset_reference_path, sep=SEP, name="DMS reference")
    n_datasets = len(dataset_reference)
    print(f"\nEvaluate StructureDCA on {n_datasets} DMS datasets")
    print(f" * dataset_reference_path: '{dataset_reference_path}'")
    dataset_reference.show()

    # Init output folder if required
    # (makedirs also creates missing parent folders and tolerates a concurrent creation)
    if not os.path.isdir(args.output_scores_folder):
        print(f"\nCreate new output directory '{args.output_scores_folder}'.")
        os.makedirs(args.output_scores_folder, exist_ok=True)

    # Loop on DMS datasets
    print(f"\nRun StructureDCA on datasets from '{args.data_folder}' ...")
    for i, dataset_entry in enumerate(dataset_reference):

        # Init metadata
        dms_name = dataset_entry["DMS_id"]
        msa_name = dataset_entry["MSA_filename"]
        pdb_name = dataset_entry["pdb_file"]
        resid_shift = int(dataset_entry["MSA_start"]) - 1
        print(f"\n * Run StructureDCA {i+1} / {n_datasets}: '{dms_name}'")

        # Set paths (skip datasets that were already scored in a previous run)
        dataset_input_path = os.path.join(args.data_folder, f"{dms_name}.csv")
        dataset_output_path = os.path.join(args.output_scores_folder, f"{dms_name}.csv")
        if os.path.exists(dataset_output_path):
            print(f"Already computed scores for {dms_name}")
            continue
        msa_path = os.path.join(args.MSA_folder, f"{msa_name}")
        pdb_path = os.path.join(args.structure_folder, f"{pdb_name}")

        # Run StructureDCA
        t1 = time.time()
        sdca = StructureDCA(
            msa_path, pdb_path, CHAIN,  # input data
            use_contacts_plddt_filter=True,  # when working with AlphaFold 3D structures that may contain low pLDDT regions
            num_threads=n_cpu_used,  # set number of used threads
            verbose=False, disable_warnings=True,  # disable all logs
        )

        # Read dataset
        dataset = CSV.read(dataset_input_path, sep=SEP, name=dms_name)
        for property_to_remove in REMOVE_PROPERTIES_IN_OUTPUT:
            dataset.remove_col(property_to_remove)

        # Assign predicted values
        dataset.add_empty_col(PREDICTION_PROPERTY, allow_replacement=True)
        dataset.add_empty_col(PREDICTION_WITH_RSA_PROPERTY, allow_replacement=True)
        for mutation_entry in dataset:
            mutations_fasta = mutation_entry[MUTATION_PROPERTY]
            # Map mutation as referenced in the fasta file to its MSA coordinates
            # (sometimes the MSA range is smaller than the full fasta file)
            mutations_msa = _to_msa_numbering(mutations_fasta, resid_shift)
            # Evaluate dE of the mutation according to StructureDCA
            mutation_entry[PREDICTION_PROPERTY] = float(sdca.eval_mutation(mutations_msa, reweight_by_rsa=False))
            # Evaluate dE of the mutation according to StructureDCA[RSA]
            mutation_entry[PREDICTION_WITH_RSA_PROPERTY] = float(sdca.eval_mutation(mutations_msa, reweight_by_rsa=True))
        t2 = time.time()
        print(f" - done in {t2-t1:.1f} sec.")

        # Save output
        print(f" - save output to '{dataset_output_path}'")
        dataset.show()
        dataset.write(dataset_output_path)

    # Log DONE
    t2_total = time.time()
    print(f"\nDONE. Total time: {t2_total - t1_total:.1f} sec.")


def _to_msa_numbering(mutations, resid_shift):
    """Subtract `resid_shift` from the position of each ':'-separated mutation.

    Each single mutation is read as <first char><integer position><last char>,
    e.g. 'A10G:C25D' with a shift of 4 becomes 'A6G:C21D'.
    """
    return ":".join(
        f"{single_mutation[0]}{int(single_mutation[1:-1]) - resid_shift}{single_mutation[-1]}"
        for single_mutation in mutations.split(":")
    )


# Execution --------------------------------------------------------------------
# Only launch the scoring run when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
12 changes: 9 additions & 3 deletions proteingym/constants.json
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,9 @@
"xTrimoPGLM-3B-CLM": "xTrimoPGLM (3B params, CLM)",
"xTrimoPGLM-7B-CLM": "xTrimoPGLM (7B params, CLM)",
"xTrimoPGLM-100B-int4": "xTrimoPGLM (100B params, int4 quantized)",
"AIDO.Protein-RAG-16B": "AIDO Protein-RAG (16B)"
"AIDO.Protein-RAG-16B": "AIDO Protein-RAG (16B)",
"StructureDCA": "Structure-informed DCA",
"StructureDCA[RSA]": "Structure-informed DCA with RSA-based reweighting"
},
"model_references":{
"Tranception_L_no_retrieval":"<a href='https://proceedings.mlr.press/v162/notin22a.html'>Notin, P., Dias, M., Frazer, J., Marchena-Hurtado, J., Gomez, A.N., Marks, D.S., & Gal, Y. (2022). Tranception: Protein Fitness Prediction with Autoregressive Transformers and Inference-time Retrieval. ICML.</a>",
Expand Down Expand Up @@ -201,7 +203,9 @@
"xTrimoPGLM-3B-CLM": "<a href='https://www.nature.com/articles/s41592-025-02636-z'>Chen, B., Cheng, X., Li, P., Geng, Y., Gong, J., Li, S., Bei, Z., Tan, X., Wang, B., Zeng, X., Liu, C., Zeng, A., Dong, Y., Tang, J., & Song, L. (2025). xTrimoPGLM: unified 100-billion-parameter pretrained transformer for deciphering the language of proteins. Nature methods.</a>",
"xTrimoPGLM-7B-CLM": "<a href='https://www.nature.com/articles/s41592-025-02636-z'>Chen, B., Cheng, X., Li, P., Geng, Y., Gong, J., Li, S., Bei, Z., Tan, X., Wang, B., Zeng, X., Liu, C., Zeng, A., Dong, Y., Tang, J., & Song, L. (2025). xTrimoPGLM: unified 100-billion-parameter pretrained transformer for deciphering the language of proteins. Nature methods.</a>",
"xTrimoPGLM-100B-int4": "<a href='https://www.nature.com/articles/s41592-025-02636-z'>Chen, B., Cheng, X., Li, P., Geng, Y., Gong, J., Li, S., Bei, Z., Tan, X., Wang, B., Zeng, X., Liu, C., Zeng, A., Dong, Y., Tang, J., & Song, L. (2025). xTrimoPGLM: unified 100-billion-parameter pretrained transformer for deciphering the language of proteins. Nature methods.</a>",
"AIDO.Protein-RAG-16B": "<a href='https://www.biorxiv.org/content/10.1101/2024.11.29.625425v1'>Sun, N., Zou, S., Tao, T., Mahbub, S., Li, D., Zhuang, Y., Wang, H., Cheng, X., Song, L., & Xing, E.P. (2024). Mixture of Experts Enable Efficient and Effective Protein Understanding and Design. bioRxiv.</a>"
"AIDO.Protein-RAG-16B": "<a href='https://www.biorxiv.org/content/10.1101/2024.11.29.625425v1'>Sun, N., Zou, S., Tao, T., Mahbub, S., Li, D., Zhuang, Y., Wang, H., Cheng, X., Song, L., & Xing, E.P. (2024). Mixture of Experts Enable Efficient and Effective Protein Understanding and Design. bioRxiv.</a>",
"StructureDCA": "<a href='https://www.biorxiv.org/content/10.64898/2026.03.27.714804'>Tsishyn, M., Talibart, H., Rooman, M., & Pucci, F. (2026). Structure-informed direct coupling analysis improves protein mutational landscape predictions. bioRxiv.</a>",
"StructureDCA[RSA]": "<a href='https://www.biorxiv.org/content/10.64898/2026.03.27.714804'>Tsishyn, M., Talibart, H., Rooman, M., & Pucci, F. (2026). Structure-informed direct coupling analysis improves protein mutational landscape predictions. bioRxiv.</a>"
},
"clean_names":{
"Tranception_L_no_retrieval":"Tranception L no retrieval",
Expand Down Expand Up @@ -309,7 +313,9 @@
"xTrimoPGLM-3B-CLM": "xTrimoPGLM-3B-CLM",
"xTrimoPGLM-7B-CLM": "xTrimoPGLM-7B-CLM",
"xTrimoPGLM-100B-int4": "xTrimoPGLM-100B-int4",
"AIDO.Protein-RAG-16B": "AIDO Protein-RAG (16B)"
"AIDO.Protein-RAG-16B": "AIDO Protein-RAG (16B)",
"StructureDCA": "StructureDCA",
"StructureDCA[RSA]": "StructureDCA[RSA]"
},
"supervised_model_details": {
"ProteinNPT":"ProteinNPT Model",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash
# Score the substitution DMS datasets with StructureDCA.
# NOTE: the paths below are relative, so run this script from its own directory.

# The DMS_* variables used below are not set in this script; presumably they
# come from the shared config file sourced here -- verify when relocating it.
source ../zero_shot_config.sh
source activate proteingym_env
# Make sure the StructureDCA package is available in the activated environment.
pip install structuredca

# Variable expansions are quoted so that paths containing spaces or glob
# characters reach the Python script as single arguments.
export output_scores_folder="${DMS_output_score_folder_subs}/StructureDCA"

python ../../proteingym/baselines/StructureDCA/run_structuredca.py \
    --reference_file_path "${DMS_reference_file_path_subs}" \
    --data_folder "${DMS_data_folder_subs}" \
    --MSA_folder "${DMS_MSA_data_folder}" \
    --structure_folder "${DMS_structure_folder}" \
    --output_scores_folder "${output_scores_folder}"