OATML-Markslab · MatsveiTsishyn · Mar 29, 2026 · Mar 29, 2026
diff --git a/README.md b/README.md
@@ -102,6 +102,7 @@ ESM C | Single sequence | [ESM Team](https://evolutionaryscale.ai/blog/esm-cambr
 xTrimoPGLM | Single sequence | [Chen, B., Cheng, X., Li, P., Geng, Y., Gong, J., Li, S., Bei, Z., Tan, X., Wang, B., Zeng, X., Liu, C., Zeng, A., Dong, Y., Tang, J., & Song, L. (2025). xTrimoPGLM: unified 100-billion-parameter pretrained transformer for deciphering the language of proteins. Nature methods.](https://www.nature.com/articles/s41592-025-02636-z)
 ProGen3 | Single sequence | [Bhatnagar, A., Jain, S., Beazer, J., Curran, S.C., Hoffnagle, A.M., Ching, K., Martyn, M., Nayfach, S., Ruffolo, J.A., & Madani, A. (2025). Scaling unlocks broader generation and deeper functional understanding of proteins. bioRxiv, 2025.04.15.649055.](https://doi.org/10.1101/2025.04.15.649055)
 AIDO | MSA & Structure | [Sun, N., Zou, S., Tao, T., Mahbub, S., Li, D., Zhuang, Y., Wang, H., Cheng, X., Song, L., & Xing, E.P. (2024). Mixture of Experts Enable Efficient and Effective Protein Understanding and Design. bioRxiv.](https://www.biorxiv.org/content/10.1101/2024.11.29.625425v1)
+StructureDCA | MSA & Structure | [Matsvei Tsishyn, Hugo Talibart, Marianne Rooman, Fabrizio Pucci. (2026). Structure-informed direct coupling analysis improves protein mutational landscape predictions. bioRxiv.](https://www.biorxiv.org/content/10.64898/2026.03.27.714804)
 
 For clinical baselines, we used dbNSFP 4.4a as detailed in the manuscript appendix (and in `proteingym/clinical_benchmark_notebooks/clinical_subs_processing.ipynb`).
 
@@ -218,6 +219,7 @@ ESM3 | https://github.com/evolutionaryscale/esm
 xTrimoPGLM | https://github.com/biomap-research/xTrimoPGLM
 ProGen3 | https://github.com/Profluent-AI/progen3
 AIDO | https://github.com/genbio-ai/AIDO
+StructureDCA | https://github.com/3BioCompBio/StructureDCA
 
 We would like to thank the GEMME team for providing model scores on an earlier version of the benchmark (ProteinGym v0.1), and the ProtSSN, SaProt, PoET, MULAN, VespaG, ProSST, ESCOTT, VenusREM, RSALOR, SiteRM and AIDO teams for integrating their model in the ProteinGym repo.
 

diff --git a/config.json b/config.json
@@ -679,6 +679,20 @@
             "directionality": 1,
             "key": "mutated_sequence",
             "model_type": "MSA"
+        },
+        "StructureDCA": {
+            "input_score_name": "StructureDCA",
+            "location": "StructureDCA",
+            "directionality": -1,
+            "key": "mutant",
+            "model_type": "Structure & MSA"
+        },
+        "StructureDCA[RSA]": {
+            "input_score_name": "StructureDCA[RSA]",
+            "location": "StructureDCA",
+            "directionality": -1,
+            "key": "mutant",
+            "model_type": "Structure & MSA"
         }
     },
     "model_list_zero_shot_indels_DMS": {

diff --git a/proteingym/baselines/StructureDCA/run_structuredca.py b/proteingym/baselines/StructureDCA/run_structuredca.py
@@ -0,0 +1,128 @@
+
+"""
+Run StructureDCA mutational landscape predictions on DMS datasets.
+"""
+
+# Imports ----------------------------------------------------------------------
+import os
+import argparse
+import time
+try:
+    from structuredca import StructureDCA
+    from structuredca.structuredca import CSV
+except:
+    raise ImportError("Import of pip package 'structuredca' failed. Please install the package with 'pip install structuredca'.")
+
+
+# Main -------------------------------------------------------------------------
+def main():
+
+    # Constants
+    t1_total = time.time()
+    MUTATION_PROPERTY = "mutant"
+    PREDICTION_PROPERTY = "StructureDCA"
+    PREDICTION_WITH_RSA_PROPERTY = "StructureDCA[RSA]"
+    REMOVE_PROPERTIES_IN_OUTPUT = ["DMS_score", "DMS_score_bin"]
+    SEP = ","
+    CHAIN = "A"
+    MAX_CPUS = 16
+
+    # Set number of used CPUs
+    n_cpu_total = os.cpu_count()
+    if n_cpu_total is None:
+        n_cpu_total = 1
+    n_cpu_used = max(1, min(n_cpu_total // 2, MAX_CPUS))
+    print("\n\nStructureDCA Runs Settings -------------------------------------------------")
+    print(f" * Used CPUs: {n_cpu_used}")
+
+    # Parse arguments
+    parser = argparse.ArgumentParser(description='StructureDCA arguments')
+    parser.add_argument("--reference_file_path", type=str, help="Datasets reference '.csv' file.")
+    parser.add_argument("--data_folder", type=str, help="Initial DMS datasets folder")
+    parser.add_argument("--MSA_folder", type=str, help="MSA ('.a2m' or '.fasta') folder")
+    parser.add_argument("--structure_folder", type=str, help="PDB ('.pdb') folder")
+    parser.add_argument("--output_scores_folder", type=str, help="Output folder")
+    args = parser.parse_args()
+    print(f" * reference_file_path: '{args.reference_file_path}'")
+    print(f" * data_folder: '{args.data_folder}'")
+    print(f" * MSA_folder: '{args.MSA_folder}'")
+    print(f" * structure_folder: '{args.structure_folder}'")
+    print(f" * output_scores_folder: '{args.output_scores_folder}'")
+
+    # Read reference file
+    dataset_reference_path = args.reference_file_path
+    dataset_reference = CSV.read(dataset_reference_path, sep=SEP, name="DMS reference")
+    print(f"\nEvaluate StructureDCA on {len(dataset_reference)} DMS datasets")
+    print(f" * dataset_reference_path: '{dataset_reference_path}'")
+    dataset_reference.show()
+
+    # Init output folder if required
+    if not os.path.isdir(args.output_scores_folder):
+        print(f"\nCreate new output directory '{args.output_scores_folder}'.")
+        os.mkdir(args.output_scores_folder)
+
+    # Loop on DMS datasets
+    print(f"\nRun StructureDCA on datasets from '{args.data_folder}' ...")
+    for i, dataset_entry in enumerate(dataset_reference):
+
+        # Init metadata
+        dms_name = dataset_entry["DMS_id"]
+        msa_name = dataset_entry["MSA_filename"]
+        pdb_name = dataset_entry["pdb_file"]
+        resid_shift = int(dataset_entry["MSA_start"]) - 1
+        print(f"\n * Run StructureDCA {i+1} / {len(dataset_reference)}: '{dms_name}'")
+
+        # Set paths
+        dataset_input_path = os.path.join(args.data_folder, f"{dms_name}.csv")
+        dataset_output_path = os.path.join(args.output_scores_folder, f"{dms_name}.csv")
+        if os.path.exists(dataset_output_path):
+            print(f"Already computed scores for {dms_name}")
+            continue
+        msa_path = os.path.join(args.MSA_folder, f"{msa_name}")
+        pdb_path = os.path.join(args.structure_folder, f"{pdb_name}")
+
+        # Run StructureDCA
+        t1 = time.time()
+        sdca = StructureDCA(
+            msa_path, pdb_path, CHAIN, # input data
+            use_contacts_plddt_filter=True, # when working with AlphaFold 3D structures that may contain low pLDDT regions
+            num_threads=n_cpu_used, # set number of used threads
+            verbose=False, disable_warnings=True, # disable all logs
+        )
+
+        # Read dataset
+        dataset = CSV.read(dataset_input_path, sep=SEP, name=dms_name)
+        for property_to_remove in REMOVE_PROPERTIES_IN_OUTPUT:
+            dataset.remove_col(property_to_remove)
+
+        # Assign predicted values
+        dataset.add_empty_col(PREDICTION_PROPERTY, allow_replacement=True)
+        dataset.add_empty_col(PREDICTION_WITH_RSA_PROPERTY, allow_replacement=True)
+        for mutation_entry in dataset:
+            mutations_fasta = mutation_entry[MUTATION_PROPERTY]
+            # map mutation as referenced in the fasta file to its msa coordinates
+            # (sometimes the MSA range is smaller than the full fasta file)
+            mutations_msa = ":".join([
+                sin_mut[0] + str(int(sin_mut[1:-1])-resid_shift) + sin_mut[-1]
+                for sin_mut in mutations_fasta.split(":")
+            ])
+            # Evalut dE of the mutation according to StructureDCA
+            mutation_entry[PREDICTION_PROPERTY] = float(sdca.eval_mutation(mutations_msa, reweight_by_rsa=False))
+            # Evalut dE of the mutation according to StructureDCA[RSA]
+            mutation_entry[PREDICTION_WITH_RSA_PROPERTY] = float(sdca.eval_mutation(mutations_msa, reweight_by_rsa=True))
+        t2 = time.time()
+        print(f"   - done in {t2-t1:.1f} sec.")
+
+        # Save output
+        print(f"   - save output to '{dataset_output_path}'")
+        dataset.show()
+        dataset.write(dataset_output_path)
+
+    # Log DONE
+    t2_total = time.time()
+    print(f"\nDONE. Total time: {t2_total - t1_total:.1f} sec.")
+
+
+# Execution --------------------------------------------------------------------
+if __name__ == "__main__":  # run the pipeline only when executed as a script, not when imported
+    main()
diff --git a/proteingym/constants.json b/proteingym/constants.json
@@ -99,7 +99,9 @@
         "xTrimoPGLM-3B-CLM": "xTrimoPGLM (3B params, CLM)",
         "xTrimoPGLM-7B-CLM": "xTrimoPGLM (7B params, CLM)",
         "xTrimoPGLM-100B-int4": "xTrimoPGLM (100B params, int4 quantized)",
-        "AIDO.Protein-RAG-16B": "AIDO Protein-RAG (16B)"
+        "AIDO.Protein-RAG-16B": "AIDO Protein-RAG (16B)",
+        "StructureDCA": "Structure-informed DCA",
+        "StructureDCA[RSA]": "Structure-informed DCA with RSA-based reweighting"
     },
     "model_references":{
         "Tranception_L_no_retrieval":"<a href='https://proceedings.mlr.press/v162/notin22a.html'>Notin, P., Dias, M., Frazer, J., Marchena-Hurtado, J., Gomez, A.N., Marks, D.S., & Gal, Y. (2022). Tranception: Protein Fitness Prediction with Autoregressive Transformers and Inference-time Retrieval. ICML.</a>",
@@ -201,7 +203,9 @@
         "xTrimoPGLM-3B-CLM": "<a href='https://www.nature.com/articles/s41592-025-02636-z'>Chen, B., Cheng, X., Li, P., Geng, Y., Gong, J., Li, S., Bei, Z., Tan, X., Wang, B., Zeng, X., Liu, C., Zeng, A., Dong, Y., Tang, J., & Song, L. (2025). xTrimoPGLM: unified 100-billion-parameter pretrained transformer for deciphering the language of proteins. Nature methods.</a>",
         "xTrimoPGLM-7B-CLM": "<a href='https://www.nature.com/articles/s41592-025-02636-z'>Chen, B., Cheng, X., Li, P., Geng, Y., Gong, J., Li, S., Bei, Z., Tan, X., Wang, B., Zeng, X., Liu, C., Zeng, A., Dong, Y., Tang, J., & Song, L. (2025). xTrimoPGLM: unified 100-billion-parameter pretrained transformer for deciphering the language of proteins. Nature methods.</a>",
         "xTrimoPGLM-100B-int4": "<a href='https://www.nature.com/articles/s41592-025-02636-z'>Chen, B., Cheng, X., Li, P., Geng, Y., Gong, J., Li, S., Bei, Z., Tan, X., Wang, B., Zeng, X., Liu, C., Zeng, A., Dong, Y., Tang, J., & Song, L. (2025). xTrimoPGLM: unified 100-billion-parameter pretrained transformer for deciphering the language of proteins. Nature methods.</a>",
-        "AIDO.Protein-RAG-16B": "<a href='https://www.biorxiv.org/content/10.1101/2024.11.29.625425v1'>Sun, N., Zou, S., Tao, T., Mahbub, S., Li, D., Zhuang, Y., Wang, H., Cheng, X., Song, L., & Xing, E.P. (2024). Mixture of Experts Enable Efficient and Effective Protein Understanding and Design. bioRxiv.</a>"
+        "AIDO.Protein-RAG-16B": "<a href='https://www.biorxiv.org/content/10.1101/2024.11.29.625425v1'>Sun, N., Zou, S., Tao, T., Mahbub, S., Li, D., Zhuang, Y., Wang, H., Cheng, X., Song, L., & Xing, E.P. (2024). Mixture of Experts Enable Efficient and Effective Protein Understanding and Design. bioRxiv.</a>",
+        "StructureDCA": "<a href='https://www.biorxiv.org/content/10.64898/2026.03.27.714804'>Matsvei Tsishyn, Hugo Talibart, Marianne Rooman, Fabrizio Pucci. (2026). Structure-informed direct coupling analysis improves protein mutational landscape predictions. bioRxiv.</a>",
+        "StructureDCA[RSA]": "<a href='https://www.biorxiv.org/content/10.64898/2026.03.27.714804'>Matsvei Tsishyn, Hugo Talibart, Marianne Rooman, Fabrizio Pucci. (2026). Structure-informed direct coupling analysis improves protein mutational landscape predictions. bioRxiv.</a>"
     },
     "clean_names":{
         "Tranception_L_no_retrieval":"Tranception L no retrieval",
@@ -309,7 +313,9 @@
         "xTrimoPGLM-3B-CLM": "xTrimoPGLM-3B-CLM",
         "xTrimoPGLM-7B-CLM": "xTrimoPGLM-7B-CLM",
         "xTrimoPGLM-100B-int4": "xTrimoPGLM-100B-int4",
-        "AIDO.Protein-RAG-16B": "AIDO Protein-RAG (16B)"
+        "AIDO.Protein-RAG-16B": "AIDO Protein-RAG (16B)",
+        "StructureDCA": "StructureDCA",
+        "StructureDCA[RSA]": "StructureDCA[RSA]"
     },
     "supervised_model_details": {
         "ProteinNPT":"ProteinNPT Model",

diff --git a/scripts/scoring_DMS_zero_shot/scoring_StructureDCA_substitutions.sh b/scripts/scoring_DMS_zero_shot/scoring_StructureDCA_substitutions.sh
@@ -0,0 +1,14 @@
+#!/bin/bash 
+
+source ../zero_shot_config.sh
+source activate proteingym_env
+pip install structuredca
+
+export output_scores_folder=${DMS_output_score_folder_subs}/StructureDCA
+
+python ../../proteingym/baselines/StructureDCA/run_structuredca.py \
+    --reference_file_path ${DMS_reference_file_path_subs} \
+    --data_folder ${DMS_data_folder_subs} \
+    --MSA_folder ${DMS_MSA_data_folder} \
+    --structure_folder ${DMS_structure_folder} \
+    --output_scores_folder ${output_scores_folder}