From f23462ac9a98d6cfff7afdc378d018af8e9011fd Mon Sep 17 00:00:00 2001
From: Lawson <lwoods@tgen.org>
Date: Thu, 12 Jun 2025 17:07:18 -0700
Subject: [PATCH 01/19] rety mechanism

---
 tcrdock/tcrdock_info.py | 160 ++++++++++++++++++++++------------------
 1 file changed, 87 insertions(+), 73 deletions(-)

diff --git a/tcrdock/tcrdock_info.py b/tcrdock/tcrdock_info.py
index 5f6caeb..ee4c1c3 100644
--- a/tcrdock/tcrdock_info.py
+++ b/tcrdock/tcrdock_info.py
@@ -4,8 +4,9 @@
 from . import tcrdist
 import json
 
-class TCRdockInfo():
-    ''' A class to store information about a TCR:peptide:MHC pose
+
+class TCRdockInfo:
+    """A class to store information about a TCR:peptide:MHC pose
 
     CONTENTS:
 
@@ -22,7 +23,7 @@ class TCRdockInfo():
 
     note that the CDR3 sequence and loop position is "extended"
     ie starts at C and ends at "F"
-    '''
+    """
 
     def __init__(self):
         self.organism = None
@@ -42,29 +43,29 @@ def __repr__(self):
         return self.to_string()
 
     def from_sequences(
-            self,
-            organism,
-            mhc_class, # pass None if tcr_only
-            mhc_aseq, # '' if tcr_only
-            mhc_bseq, # ignored if mhc_class==1
-            pep_seq, # '' if tcr_only
-            tcr_aseq,
-            tcr_bseq,
+        self,
+        organism,
+        mhc_class,  # pass None if tcr_only
+        mhc_aseq,  # '' if tcr_only
+        mhc_bseq,  # ignored if mhc_class==1
+        pep_seq,  # '' if tcr_only
+        tcr_aseq,
+        tcr_bseq,
     ):
         self.valid = False
         self.organism = organism
         self.mhc_class = int(mhc_class)
         self.pep_seq = pep_seq
-        #self.mhc_aseq = mhc_aseq
-        #self.mhc_bseq = mhc_bseq
-        #self.tcr_aseq = tcr_aseq
-        #self.tcr_bseq = tcr_bseq
+        # self.mhc_aseq = mhc_aseq
+        # self.mhc_bseq = mhc_bseq
+        # self.tcr_aseq = tcr_aseq
+        # self.tcr_bseq = tcr_bseq
 
         # parse the mhc sequence
-        if mhc_class is None: # tcr only
-            assert mhc_aseq == '' and mhc_bseq == '' and pep_seq == ''
+        if mhc_class is None:  # tcr only
+            assert mhc_aseq == "" and mhc_bseq == "" and pep_seq == ""
             self.mhc_core = []
-            self.mhc_allele = ''
+            self.mhc_allele = ""
 
         elif mhc_class == 1:
             self.mhc_core = mhc_util.get_mhc_core_positions_class1(mhc_aseq)
@@ -73,106 +74,119 @@ def from_sequences(
         else:
             assert mhc_class == 2
             self.mhc_core = mhc_util.get_mhc_core_positions_class2(
-                mhc_aseq, mhc_bseq)
-            self.mhc_allele = (mhc_util.get_mhc_allele(mhc_aseq, organism)+','+
-                               mhc_util.get_mhc_allele(mhc_bseq, organism))
+                mhc_aseq, mhc_bseq
+            )
+            self.mhc_allele = (
+                mhc_util.get_mhc_allele(mhc_aseq, organism)
+                + ","
+                + mhc_util.get_mhc_allele(mhc_bseq, organism)
+            )
 
         if -1 in self.mhc_core:
-            print('TCRdockInfo:: MHC parse fail!')
-            return self ########## early return
+            print("TCRdockInfo:: MHC parse fail!")
+            return self  ########## early return
 
         # this shifts everything to full pose numbering (0-indexed)
         offset = len(mhc_aseq) + len(pep_seq)
-        if mhc_class==2:
+        if mhc_class == 2:
             offset += len(mhc_bseq)
 
         self.tcr_cdrs = []
         self.tcr_core = []
         self.tcr = []
-        for chain, chainseq in zip('AB', [tcr_aseq, tcr_bseq]):
-            res = tcrdist.parsing.parse_tcr_sequence(
-                organism, chain, chainseq)
+        for chain, chainseq in zip("AB", [tcr_aseq, tcr_bseq]):
+            res = tcrdist.parsing.parse_tcr_sequence(organism, chain, chainseq)
             if not res:
-                # parse failure
-                print('TCRdockInfo:: TCR parse fail!')
-                return self ########## early return
-            cdr_loops = res['cdr_loops']
+                other_organism = "human" if organism == "mouse" else "mouse"
+                res = tcrdist.parsing.parse_tcr_sequence(
+                    other_organism, chain, chainseq
+                )
+                if not res:
+                    # parse failure
+                    print("TCRdockInfo:: TCR parse fail!")
+                    return self  ########## early return
+            cdr_loops = res["cdr_loops"]
             cdr3_start, cdr3_end = cdr_loops[-1]
-            cdr3 = chainseq[cdr3_start:cdr3_end+1]
-            self.tcr.append((res['v_gene'], res['j_gene'], cdr3))
+            cdr3 = chainseq[cdr3_start : cdr3_end + 1]
+            self.tcr.append((res["v_gene"], res["j_gene"], cdr3))
             # the ints here are converting from np.int64 to int
             # ran into trouble serializing:
             #  "Object of type 'int64' is not JSON serializable"
-            self.tcr_cdrs.extend([(int(x[0]+offset), int(x[1]+offset))
-                                  for x in cdr_loops])
-            self.tcr_core.extend([int(x+offset) for x in res['core_positions']])
+            self.tcr_cdrs.extend(
+                [(int(x[0] + offset), int(x[1] + offset)) for x in cdr_loops]
+            )
+            self.tcr_core.extend(
+                [int(x + offset) for x in res["core_positions"]]
+            )
 
             offset += len(chainseq)
-        self.valid = True # signal success
+        self.valid = True  # signal success
         return self
 
     def renumber(self, old2new):
-        ''' old2new is a mapping of 0-indexed positions from old to new
+        """old2new is a mapping of 0-indexed positions from old to new
         numbering systems
 
         will be bad if old2new doesn't cover all our positions
-        '''
-        old2new = {int(x):int(y) for x,y in old2new.items()} # no int64 in our data!
+        """
+        old2new = {
+            int(x): int(y) for x, y in old2new.items()
+        }  # no int64 in our data!
         if self.mhc_core is not None:
             self.mhc_core = [old2new[x] for x in self.mhc_core]
         if self.tcr_core is not None:
             self.tcr_core = [old2new[x] for x in self.tcr_core]
         if self.tcr_cdrs is not None:
-            self.tcr_cdrs = [(old2new[x], old2new[y]) for x,y in self.tcr_cdrs]
+            self.tcr_cdrs = [
+                (old2new[x], old2new[y]) for x, y in self.tcr_cdrs
+            ]
         return self
 
     def delete_residue_range(self, start, stop):
-        ''' delete from start to stop , EXCLUSIVE of stop !!!!!!!!!!!!!!!!!!!!!!
+        """delete from start to stop , EXCLUSIVE of stop !!!!!!!!!!!!!!!!!!!!!!
         ie delete [start,stop) or range(start,stop)
 
         puts -1 for residues in [start,stop) range, if there are any
-        '''
-
-        maxpos = self.tcr_cdrs[-1][-1] # end of CDR3beta is biggest
-        numdel = stop-start
-        old2new = {x:x if x<start else -1 if x<stop else x-numdel
-                   for x in range(maxpos+1)}
+        """
+
+        maxpos = self.tcr_cdrs[-1][-1]  # end of CDR3beta is biggest
+        numdel = stop - start
+        old2new = {
+            x: x if x < start else -1 if x < stop else x - numdel
+            for x in range(maxpos + 1)
+        }
         self.renumber(old2new)
         return self
 
-
     def to_dict(self):
-        ''' should be inverse of from_dict
-        '''
+        """should be inverse of from_dict"""
         return dict(
-            organism = self.organism,
-            mhc_class = self.mhc_class,
-            mhc_allele = self.mhc_allele,
-            mhc_core = self.mhc_core,
-            pep_seq = self.pep_seq,
-            tcr = self.tcr,
-            tcr_core = self.tcr_core,
-            tcr_cdrs = self.tcr_cdrs,
-            valid = self.valid,
+            organism=self.organism,
+            mhc_class=self.mhc_class,
+            mhc_allele=self.mhc_allele,
+            mhc_core=self.mhc_core,
+            pep_seq=self.pep_seq,
+            tcr=self.tcr,
+            tcr_core=self.tcr_core,
+            tcr_cdrs=self.tcr_cdrs,
+            valid=self.valid,
         )
 
     def from_dict(self, D):
-        ''' should be inverse of to_dict
-        '''
-        self.organism = D['organism']
-        self.mhc_class = D['mhc_class']
-        self.mhc_allele = D['mhc_allele']
-        self.mhc_core = D['mhc_core']
-        self.pep_seq = D['pep_seq']
-        self.tcr = D['tcr']
-        self.tcr_core = D['tcr_core']
-        self.tcr_cdrs = D['tcr_cdrs']
-        self.valid = D.get('valid', True)
+        """should be inverse of to_dict"""
+        self.organism = D["organism"]
+        self.mhc_class = D["mhc_class"]
+        self.mhc_allele = D["mhc_allele"]
+        self.mhc_core = D["mhc_core"]
+        self.pep_seq = D["pep_seq"]
+        self.tcr = D["tcr"]
+        self.tcr_core = D["tcr_core"]
+        self.tcr_cdrs = D["tcr_cdrs"]
+        self.valid = D.get("valid", True)
         return self
 
     def to_string(self):
-        return json.dumps(self.to_dict(), separators=(',', ':'))
+        return json.dumps(self.to_dict(), separators=(",", ":"))
 
     def from_string(self, info):
         return self.from_dict(json.loads(info))
-

From 5376f90bd7a056a6f2d476a0cd591260a47d0f8c Mon Sep 17 00:00:00 2001
From: Lawson <lwoods@tgen.org>
Date: Thu, 12 Jun 2025 17:40:38 -0700
Subject: [PATCH 02/19] make package installable

---
 .gitignore | 11 ++++++++++-
 setup.py   | 30 ++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)
 create mode 100644 setup.py

diff --git a/.gitignore b/.gitignore
index 184cf46..a99af23 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,9 @@
 *pyc
 __pycache__
 *.fasta.pdb
+*.fasta.blast
+*.fasta
+*.tsv
 *.phr
 *.pin
 *.pot
@@ -9,4 +12,10 @@ __pycache__
 *.ptf
 *.pto
 imgt_prot_blast_db_*fasta
-ncbi-blast-*
\ No newline at end of file
+ncbi-blast-*
+*.egg-info
+**/*.log
+**/*.ipynb
+.vscode
+.nextflow
+build
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..76a9084
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,30 @@
+# setup.py
+import subprocess
+import sys
+from setuptools import setup, find_packages
+from setuptools.command.install import install
+
+
+class CustomInstall(install):
+    """Run our database installer, then fall back to the normal install."""
+
+    def run(self):
+        subprocess.check_call([sys.executable, "download_blast.py"])
+        super().run()
+
+
+setup(
+    name="tcrdock",
+    version="0.1.0",
+    packages=find_packages(),
+    install_requires=[
+        "biopython==1.79",
+        "numpy==1.19.5",
+        "pandas==1.3.4",
+        "scipy==1.7.0",
+        "matplotlib==3.3.4",
+    ],
+    cmdclass={
+        "install": CustomInstall,
+    },
+)

From 8aa337ce827699fa72e85e563e20640dee392a20 Mon Sep 17 00:00:00 2001
From: Lawson <lwoods@tgen.org>
Date: Thu, 12 Jun 2025 17:41:37 -0700
Subject: [PATCH 03/19] update README

---
 README.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/README.md b/README.md
index b56defb..0c56e54 100644
--- a/README.md
+++ b/README.md
@@ -143,8 +143,7 @@ to be installed, which can be done by running the script
 ```
 conda create --name tcrdock_test python=3.8
 source activate tcrdock_test   # or: conda activate tcrdock_test
-pip3 install -r requirements.txt
-python download_blast.py
+pip install <path/to/cloned/repo>
 ```
 
 To run the AlphaFold simulations, you will need a Python environment that satisfies

From 586e327a35febddd79bf22aee98ec06ee843c6f9 Mon Sep 17 00:00:00 2001
From: Lawson <lwoods@tgen.org>
Date: Fri, 13 Jun 2025 10:58:13 -0700
Subject: [PATCH 04/19] dependency relaxation

---
 .github/workflows/gh-ci.yml | 72 +++++++++++++++++++++++++++++++++++++
 setup.py                    | 10 +++---
 2 files changed, 77 insertions(+), 5 deletions(-)
 create mode 100644 .github/workflows/gh-ci.yml

diff --git a/.github/workflows/gh-ci.yml b/.github/workflows/gh-ci.yml
new file mode 100644
index 0000000..8adfdbd
--- /dev/null
+++ b/.github/workflows/gh-ci.yml
@@ -0,0 +1,72 @@
+name: GH Actions CI
+'on':
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+      - develop
+  schedule:
+    # Weekly tests at midnight on Sundays run on main by default:
+    #   Scheduled workflows run on the latest commit on the default or base branch.
+    #   (from https://help.github.com/en/actions/reference/events-that-trigger-workflows#scheduled-events-schedule)
+    - cron: 0 0 * * 0
+
+concurrency:
+  # Specific group naming so CI is only cancelled
+  # within same PR or on merge to main
+  group: ${{ github.ref }}-${{ github.head_ref }}-${{ github.workflow }}
+  cancel-in-progress: true
+
+defaults:
+  run:
+    shell: bash -l {0}
+
+
+  main-tests:
+    if: github.repository == 'ljwoods2/TCRdock'
+    needs: environment-config
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os:
+          - macOS-latest
+          - ubuntu-latest
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Build information
+        run: |
+          uname -a
+          df -h
+          ulimit -a
+
+      # More info on options: https://github.com/conda-incubator/setup-miniconda
+      - name: Create & activate conda env
+        uses: conda-incubator/setup-miniconda@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+          add-pip-as-python-dependency: true
+          environment-name: tcrdock-test
+          auto-activate-base: false
+
+      - name: Install package
+        run: |
+          python --version
+          python -m pip install .
+
+      - name: Python information
+        run: |
+          which python
+          which pip
+          pip list
+          conda info
+          conda list
+
+      - name: Run tests
+        run: |
+          pytest -v -x --color=yes tcrdock/tests
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 76a9084..47ba6a6 100644
--- a/setup.py
+++ b/setup.py
@@ -18,11 +18,11 @@ def run(self):
     version="0.1.0",
     packages=find_packages(),
     install_requires=[
-        "biopython==1.79",
-        "numpy==1.19.5",
-        "pandas==1.3.4",
-        "scipy==1.7.0",
-        "matplotlib==3.3.4",
+        "biopython",
+        "numpy",
+        "pandas",
+        "scipy",
+        "matplotlib",
     ],
     cmdclass={
         "install": CustomInstall,

From 3e4fa5df2367a83ffa3cbb721e7aa1bf7bfa052d Mon Sep 17 00:00:00 2001
From: Lawson <lwoods@tgen.org>
Date: Fri, 13 Jun 2025 11:08:53 -0700
Subject: [PATCH 05/19] add __init__.py to dat dirs

---
 =3.10                               | 30 +++++++++++++++++++++++++++++
 tcrdock/db/__init__.py              |  0
 tcrdock/db/new_imgt_hla/__init__.py |  0
 tcrdock/db/pdb/__init__.py          |  0
 tcrdock/db/pdb/pmhc/__init__.py     |  0
 tcrdock/db/pdb/tcr/__init__.py      |  0
 tcrdock/db/pdb/ternary/__init__.py  |  0
 7 files changed, 30 insertions(+)
 create mode 100644 =3.10
 create mode 100644 tcrdock/db/__init__.py
 create mode 100644 tcrdock/db/new_imgt_hla/__init__.py
 create mode 100644 tcrdock/db/pdb/__init__.py
 create mode 100644 tcrdock/db/pdb/pmhc/__init__.py
 create mode 100644 tcrdock/db/pdb/tcr/__init__.py
 create mode 100644 tcrdock/db/pdb/ternary/__init__.py

diff --git a/=3.10 b/=3.10
new file mode 100644
index 0000000..f4f691d
--- /dev/null
+++ b/=3.10
@@ -0,0 +1,30 @@
+Channels:
+ - conda-forge
+ - defaults
+Platform: linux-64
+Collecting package metadata (repodata.json): ...working... done
+Solving environment: ...working... done
+
+## Package Plan ##
+
+  environment location: /home/lwoods/miniconda3/envs/tmp
+
+  added / updated specs:
+    - python
+
+
+The following packages will be downloaded:
+
+    package                    |            build
+    ---------------------------|-----------------
+    python-3.12.11             |h9e4cc4f_0_cpython        30.0 MB  conda-forge
+    ------------------------------------------------------------
+                                           Total:        30.0 MB
+
+The following packages will be UPDATED:
+
+  python                          3.8.20-h4a871b0_2_cpython --> 3.12.11-h9e4cc4f_0_cpython 
+  python_abi                                     3.8-7_cp38 --> 3.12-7_cp312 
+
+
+Proceed ([y]/n)? 
\ No newline at end of file
diff --git a/tcrdock/db/__init__.py b/tcrdock/db/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tcrdock/db/new_imgt_hla/__init__.py b/tcrdock/db/new_imgt_hla/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tcrdock/db/pdb/__init__.py b/tcrdock/db/pdb/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tcrdock/db/pdb/pmhc/__init__.py b/tcrdock/db/pdb/pmhc/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tcrdock/db/pdb/tcr/__init__.py b/tcrdock/db/pdb/tcr/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tcrdock/db/pdb/ternary/__init__.py b/tcrdock/db/pdb/ternary/__init__.py
new file mode 100644
index 0000000..e69de29

From 1847e01cbbc68262e9f643a8c77e23a90450f4a0 Mon Sep 17 00:00:00 2001
From: Lawson <lwoods@tgen.org>
Date: Fri, 13 Jun 2025 11:40:10 -0700
Subject: [PATCH 06/19] remove __init__, add package_data

---
 =3.10                               | 30 -----------------------------
 setup.py                            | 14 ++++++++++++++
 tcrdock/db/__init__.py              |  0
 tcrdock/db/new_imgt_hla/__init__.py |  0
 tcrdock/db/pdb/__init__.py          |  0
 tcrdock/db/pdb/pmhc/__init__.py     |  0
 tcrdock/db/pdb/tcr/__init__.py      |  0
 7 files changed, 14 insertions(+), 30 deletions(-)
 delete mode 100644 =3.10
 delete mode 100644 tcrdock/db/__init__.py
 delete mode 100644 tcrdock/db/new_imgt_hla/__init__.py
 delete mode 100644 tcrdock/db/pdb/__init__.py
 delete mode 100644 tcrdock/db/pdb/pmhc/__init__.py
 delete mode 100644 tcrdock/db/pdb/tcr/__init__.py

diff --git a/=3.10 b/=3.10
deleted file mode 100644
index f4f691d..0000000
--- a/=3.10
+++ /dev/null
@@ -1,30 +0,0 @@
-Channels:
- - conda-forge
- - defaults
-Platform: linux-64
-Collecting package metadata (repodata.json): ...working... done
-Solving environment: ...working... done
-
-## Package Plan ##
-
-  environment location: /home/lwoods/miniconda3/envs/tmp
-
-  added / updated specs:
-    - python
-
-
-The following packages will be downloaded:
-
-    package                    |            build
-    ---------------------------|-----------------
-    python-3.12.11             |h9e4cc4f_0_cpython        30.0 MB  conda-forge
-    ------------------------------------------------------------
-                                           Total:        30.0 MB
-
-The following packages will be UPDATED:
-
-  python                          3.8.20-h4a871b0_2_cpython --> 3.12.11-h9e4cc4f_0_cpython 
-  python_abi                                     3.8-7_cp38 --> 3.12-7_cp312 
-
-
-Proceed ([y]/n)? 
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 47ba6a6..102c334 100644
--- a/setup.py
+++ b/setup.py
@@ -27,4 +27,18 @@ def run(self):
     cmdclass={
         "install": CustomInstall,
     },
+    package_data={
+        "": [
+            "**/*.alfas",
+            "**/*.fasta",
+            "**/*.txt",
+            "**/*.tsv",
+            "**/tcr/*.pdb",
+            "**/tcr/*.json",
+            "**/ternary/*.pdb",
+            "**/ternary/*.json",
+            "**/pmhc/*.pdb",
+            "**/pmhc/*.json",
+        ]
+    },
 )
diff --git a/tcrdock/db/__init__.py b/tcrdock/db/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tcrdock/db/new_imgt_hla/__init__.py b/tcrdock/db/new_imgt_hla/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tcrdock/db/pdb/__init__.py b/tcrdock/db/pdb/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tcrdock/db/pdb/pmhc/__init__.py b/tcrdock/db/pdb/pmhc/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tcrdock/db/pdb/tcr/__init__.py b/tcrdock/db/pdb/tcr/__init__.py
deleted file mode 100644
index e69de29..0000000

From 683aa6e58f4d1d01ab717a86c7756c54d96e948c Mon Sep 17 00:00:00 2001
From: Lawson <lwoods@tgen.org>
Date: Fri, 13 Jun 2025 11:48:03 -0700
Subject: [PATCH 07/19] add back __init__.py

---
 .github/workflows/gh-ci.yml         | 2 +-
 setup.py                            | 4 ++--
 tcrdock/db/__init__.py              | 0
 tcrdock/db/new_imgt_hla/__init__.py | 0
 tcrdock/db/pdb/__init__.py          | 0
 tcrdock/db/pdb/pmhc/__init__.py     | 0
 tcrdock/db/pdb/tcr/__init__.py      | 0
 7 files changed, 3 insertions(+), 3 deletions(-)
 create mode 100644 tcrdock/db/__init__.py
 create mode 100644 tcrdock/db/new_imgt_hla/__init__.py
 create mode 100644 tcrdock/db/pdb/__init__.py
 create mode 100644 tcrdock/db/pdb/pmhc/__init__.py
 create mode 100644 tcrdock/db/pdb/tcr/__init__.py

diff --git a/.github/workflows/gh-ci.yml b/.github/workflows/gh-ci.yml
index 8adfdbd..859afc6 100644
--- a/.github/workflows/gh-ci.yml
+++ b/.github/workflows/gh-ci.yml
@@ -23,7 +23,7 @@ defaults:
   run:
     shell: bash -l {0}
 
-
+jobs:
   main-tests:
     if: github.repository == 'ljwoods2/TCRdock'
     needs: environment-config
diff --git a/setup.py b/setup.py
index 102c334..75a5d3e 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@ def run(self):
 setup(
     name="tcrdock",
     version="0.1.0",
-    packages=find_packages(),
+    packages=["tcrdock"],
     install_requires=[
         "biopython",
         "numpy",
@@ -28,7 +28,7 @@ def run(self):
         "install": CustomInstall,
     },
     package_data={
-        "": [
+        "tcrdock": [
             "**/*.alfas",
             "**/*.fasta",
             "**/*.txt",
diff --git a/tcrdock/db/__init__.py b/tcrdock/db/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tcrdock/db/new_imgt_hla/__init__.py b/tcrdock/db/new_imgt_hla/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tcrdock/db/pdb/__init__.py b/tcrdock/db/pdb/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tcrdock/db/pdb/pmhc/__init__.py b/tcrdock/db/pdb/pmhc/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tcrdock/db/pdb/tcr/__init__.py b/tcrdock/db/pdb/tcr/__init__.py
new file mode 100644
index 0000000..e69de29

From da4e6d87f208049f3d61e3902b4a30f2d95b5e93 Mon Sep 17 00:00:00 2001
From: Lawson <lwoods@tgen.org>
Date: Fri, 13 Jun 2025 11:49:05 -0700
Subject: [PATCH 08/19] remove dep in workflow

---
 .github/workflows/gh-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/gh-ci.yml b/.github/workflows/gh-ci.yml
index 859afc6..689c530 100644
--- a/.github/workflows/gh-ci.yml
+++ b/.github/workflows/gh-ci.yml
@@ -26,7 +26,7 @@ defaults:
 jobs:
   main-tests:
     if: github.repository == 'ljwoods2/TCRdock'
-    needs: environment-config
+    # needs: environment-config
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false

From 84137615501fa5208c3adc9bd9fc5341b57a3bcd Mon Sep 17 00:00:00 2001
From: Lawson <lwoods@tgen.org>
Date: Fri, 13 Jun 2025 11:52:23 -0700
Subject: [PATCH 09/19] add pyproject.toml

---
 .github/workflows/gh-ci.yml | 1 +
 pyproject.toml              | 3 +++
 2 files changed, 4 insertions(+)
 create mode 100644 pyproject.toml

diff --git a/.github/workflows/gh-ci.yml b/.github/workflows/gh-ci.yml
index 689c530..f684388 100644
--- a/.github/workflows/gh-ci.yml
+++ b/.github/workflows/gh-ci.yml
@@ -53,6 +53,7 @@ jobs:
           add-pip-as-python-dependency: true
           environment-name: tcrdock-test
           auto-activate-base: false
+          miniconda-version: "latest"
 
       - name: Install package
         run: |
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..4e715e0
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,3 @@
+[build-system]
+requires = ["setuptools >= 42.0.0"]
+build-backend = "setuptools.build_meta"
\ No newline at end of file

From bdee8439e6ead4097a3d1bf5f95b2f57b81880da Mon Sep 17 00:00:00 2001
From: Lawson <lwoods@tgen.org>
Date: Fri, 13 Jun 2025 12:02:46 -0700
Subject: [PATCH 10/19] basic test setup

---
 .github/workflows/gh-ci.yml       |  5 ++++-
 devtools/conda-envs/test_env.yaml | 10 ++++++++++
 tcrdock/tests/__init__.py         |  0
 tcrdock/tests/test_tcrdock.py     |  5 +++++
 4 files changed, 19 insertions(+), 1 deletion(-)
 create mode 100644 devtools/conda-envs/test_env.yaml
 create mode 100644 tcrdock/tests/__init__.py
 create mode 100644 tcrdock/tests/test_tcrdock.py

diff --git a/.github/workflows/gh-ci.yml b/.github/workflows/gh-ci.yml
index f684388..ce9c18a 100644
--- a/.github/workflows/gh-ci.yml
+++ b/.github/workflows/gh-ci.yml
@@ -51,9 +51,12 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
           add-pip-as-python-dependency: true
-          environment-name: tcrdock-test
           auto-activate-base: false
           miniconda-version: "latest"
+          environment-file: devtools/conda-envs/test_env.yaml
+          activate-environment: tcrdock-test
+          auto-update-conda: true
+          show-channel-urls: true
 
       - name: Install package
         run: |
diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml
new file mode 100644
index 0000000..2402f01
--- /dev/null
+++ b/devtools/conda-envs/test_env.yaml
@@ -0,0 +1,10 @@
+name: tcrdock-test
+channels:
+  - defaults
+  - conda-forge
+dependencies:
+  ### Base depends ###
+  - python>=3.10.0
+  - pip
+
+  - pytest
\ No newline at end of file
diff --git a/tcrdock/tests/__init__.py b/tcrdock/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tcrdock/tests/test_tcrdock.py b/tcrdock/tests/test_tcrdock.py
new file mode 100644
index 0000000..a490498
--- /dev/null
+++ b/tcrdock/tests/test_tcrdock.py
@@ -0,0 +1,5 @@
+import pytest
+
+
+def test_import():
+    import tcrdock

From f4ff42eb677e365b49c0cc37708a9e806740d737 Mon Sep 17 00:00:00 2001
From: Lawson <lwoods@tgen.org>
Date: Fri, 13 Jun 2025 12:07:04 -0700
Subject: [PATCH 11/19] remove old substitution matrix code

---
 tcrdock/sequtil.py | 2003 ++++++++++++++++++++++++++------------------
 1 file changed, 1204 insertions(+), 799 deletions(-)

diff --git a/tcrdock/sequtil.py b/tcrdock/sequtil.py
index 04169b4..122cc63 100644
--- a/tcrdock/sequtil.py
+++ b/tcrdock/sequtil.py
@@ -6,7 +6,7 @@
 from .tcrdist.all_genes import all_genes
 from .tcrdist.amino_acids import amino_acids
 from Bio import pairwise2
-from Bio.SubsMat import MatrixInfo as matlist
+from Bio.Align import substitution_matrices
 from Bio.pairwise2 import format_alignment
 from .util import path_to_db
 from . import docking_geometry
@@ -22,49 +22,57 @@
 import copy
 from numpy.linalg import norm
 
-CLASS2_PEPLEN = 1+9+1
+CLASS2_PEPLEN = 1 + 9 + 1
 TCR_CORE_LEN = 13
-ALL_GENES_GAP_CHAR = '.'
+ALL_GENES_GAP_CHAR = "."
 
 
 # human only
-human_structure_alignments = pd.read_table(path_to_db/'new_human_vg_alignments_v1.tsv')
-human_structure_alignments.set_index('v_gene', drop=True, inplace=True)
+human_structure_alignments = pd.read_table(
+    path_to_db / "new_human_vg_alignments_v1.tsv"
+)
+human_structure_alignments.set_index("v_gene", drop=True, inplace=True)
 
 # human and mouse
-both_structure_alignments = pd.read_table(path_to_db/'new_both_vg_alignments_v1.tsv')
-both_structure_alignments.set_index(['organism','v_gene'], drop=True, inplace=True)
+both_structure_alignments = pd.read_table(
+    path_to_db / "new_both_vg_alignments_v1.tsv"
+)
+both_structure_alignments.set_index(
+    ["organism", "v_gene"], drop=True, inplace=True
+)
 
-def read_fasta(filename): # helper
-    ''' return OrderedDict indexed by the ">" lines (everything after >)
-    '''
-    data = open(filename, 'rU')
+
+def read_fasta(filename):  # helper
+    """return OrderedDict indexed by the ">" lines (everything after >)"""
+    data = open(filename, "rU")
     fasta = OrderedDict()
     for line in data:
-        if line[0] == '>':
+        if line[0] == ">":
             seqid = line[1:-1]
-            assert seqid not in fasta # no repeats
-            fasta[seqid] = ''
+            assert seqid not in fasta  # no repeats
+            fasta[seqid] = ""
         else:
-            l= line.split()
+            l = line.split()
             if l:
                 fasta[seqid] += l[0]
     data.close()
     return fasta
 
 
-
-mhc_class_1_alfas = read_fasta(path_to_db / 'ClassI_prot.alfas')
+mhc_class_1_alfas = read_fasta(path_to_db / "ClassI_prot.alfas")
 # add HLA-G 2022-04-30
-mhc_class_1_alfas.update(read_fasta(path_to_db / 'new_imgt_hla/G_prot.alfas'))
+mhc_class_1_alfas.update(read_fasta(path_to_db / "new_imgt_hla/G_prot.alfas"))
 # add HLA-E 2022-05-03
-mhc_class_1_alfas.update(read_fasta(path_to_db / 'new_imgt_hla/E_prot.alfas'))
+mhc_class_1_alfas.update(read_fasta(path_to_db / "new_imgt_hla/E_prot.alfas"))
 
 ks = list(mhc_class_1_alfas.keys())
 lencheck = None
 for k in ks:
-    newseq = mhc_class_1_alfas[k].replace('-', ALL_GENES_GAP_CHAR)\
-             .replace('X',ALL_GENES_GAP_CHAR)
+    newseq = (
+        mhc_class_1_alfas[k]
+        .replace("-", ALL_GENES_GAP_CHAR)
+        .replace("X", ALL_GENES_GAP_CHAR)
+    )
     mhc_class_1_alfas[k] = newseq
     if lencheck is None:
         lencheck = len(newseq)
@@ -73,19 +81,25 @@ def read_fasta(filename): # helper
 
 # read mouse sequences (no gaps in them)
 # these mouse seqs are not the same length
-mhc_class_1_alfas.update(read_fasta(path_to_db / 'mouse_class1_align.fasta'))
+mhc_class_1_alfas.update(read_fasta(path_to_db / "mouse_class1_align.fasta"))
 
 # v2 means new imgt hla class 2 human alignments/sequences:
 mhc_class_2_alfas = {
-    'A': read_fasta(path_to_db / 'both_class_2_A_chains_v2.alfas'),
-    'B': read_fasta(path_to_db / 'both_class_2_B_chains_v2.alfas'),
+    "A": read_fasta(path_to_db / "both_class_2_A_chains_v2.alfas"),
+    "B": read_fasta(path_to_db / "both_class_2_B_chains_v2.alfas"),
 }
 
-assert all(all(all(x in amino_acids or x==ALL_GENES_GAP_CHAR
-                   for x in seq)
-               for seq in alfas.values())
-           for alfas in [mhc_class_1_alfas, mhc_class_2_alfas['A'],
-                         mhc_class_2_alfas['B']])
+assert all(
+    all(
+        all(x in amino_acids or x == ALL_GENES_GAP_CHAR for x in seq)
+        for seq in alfas.values()
+    )
+    for alfas in [
+        mhc_class_1_alfas,
+        mhc_class_2_alfas["A"],
+        mhc_class_2_alfas["B"],
+    ]
+)
 
 # the code below is now redundant since we moved the info files and had applied these
 # changes already
@@ -128,24 +142,26 @@ def read_fasta(filename): # helper
 # TCR, PMHC, TERNARY = 'tcr', 'pmhc', 'ternary'
 # all_template_info = {TCR:tcr_info, TERNARY:ternary_info, PMHC:pmhc_info}
 
-TCR, PMHC, TERNARY = 'tcr', 'pmhc', 'ternary'
+TCR, PMHC, TERNARY = "tcr", "pmhc", "ternary"
 
 all_template_info = {}
 for tag in [TCR, PMHC, TERNARY]:
-    info = pd.read_table(path_to_db / f'{tag}_templates_v2.tsv')
+    info = pd.read_table(path_to_db / f"{tag}_templates_v2.tsv")
     if tag == TCR:
-        info.set_index(['pdbid','ab'], drop=False, inplace=True)
+        info.set_index(["pdbid", "ab"], drop=False, inplace=True)
     else:
-        info.set_index('pdbid', drop=False, inplace=True)
+        info.set_index("pdbid", drop=False, inplace=True)
     all_template_info[tag] = info
 tcr_info = all_template_info[TCR]
 pmhc_info = all_template_info[PMHC]
 ternary_info = all_template_info[TERNARY]
 
-all_template_poses = {TCR:{}, PMHC:{}, TERNARY:{}}
+all_template_poses = {TCR: {}, PMHC: {}, TERNARY: {}}
 
-BAD_DGEOM_PDBIDS = '5sws 7jwi 4jry 4nhu 3tjh 4y19 4y1a 1ymm 2wbj 6uz1'.split()
-BAD_PMHC_PDBIDS = '3rgv 4ms8 6v1a 6v19 6v18 6v15 6v13 6v0y 2uwe 2jcc 2j8u 1lp9'.split()
+BAD_DGEOM_PDBIDS = "5sws 7jwi 4jry 4nhu 3tjh 4y19 4y1a 1ymm 2wbj 6uz1".split()
+BAD_PMHC_PDBIDS = (
+    "3rgv 4ms8 6v1a 6v19 6v18 6v15 6v13 6v0y 2uwe 2jcc 2j8u 1lp9".split()
+)
 # (new) 6uz1 is engineered and binds down by B2M
 # 3rgv has cterm of peptide out of groove
 # 4ms8 has chainbreaks (should check for those!)
@@ -170,108 +186,127 @@ def read_fasta(filename): # helper
 ######################################################################################88
 
 
+_cached_tcrdisters = {}
 
 
-_cached_tcrdisters = {}
 def get_tcrdister(organism):
     if organism not in _cached_tcrdisters:
-        _cached_tcrdisters[organism] = tcrdist.tcr_distances.TcrDistCalculator(organism)
+        _cached_tcrdisters[organism] = tcrdist.tcr_distances.TcrDistCalculator(
+            organism
+        )
     return _cached_tcrdisters[organism]
 
 
 def blosum_align(
-        seq1,
-        seq2,
-        gap_open=-11, # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3848038/
-        gap_extend=-1,
-        global_align=False,
-        verbose=False,
+    seq1,
+    seq2,
+    gap_open=-11,  # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3848038/
+    gap_extend=-1,
+    global_align=False,
+    verbose=False,
 ):
-    ''' return 0-indexed dictionary mapping from seq1 to seq2 positions
-    '''
+    """return 0-indexed dictionary mapping from seq1 to seq2 positions"""
 
-    scorematrix = matlist.blosum62
+    scorematrix = substitution_matrix.load("BLOSUM62")
 
     if global_align:
         alignments = pairwise2.align.globalds(
-            seq1, seq2, scorematrix, gap_open, gap_extend)
+            seq1, seq2, scorematrix, gap_open, gap_extend
+        )
     else:
         alignments = pairwise2.align.localds(
-            seq1, seq2, scorematrix, gap_open, gap_extend)
+            seq1, seq2, scorematrix, gap_open, gap_extend
+        )
     # this is annoying, BioPython 1.76 these are vanilla tuples and 1.79 they are named
     try:
-        alignment = max(alignments, key=lambda x:x.score)
+        alignment = max(alignments, key=lambda x: x.score)
     except AttributeError:
-        assert len(alignments[0]) == 5 # seqA, seqB, score, start, end
-        alignment = max(alignments, key=lambda x:x[2])
+        assert len(alignments[0]) == 5  # seqA, seqB, score, start, end
+        alignment = max(alignments, key=lambda x: x[2])
     alseq1, alseq2, score, begin, end = alignment
 
     if verbose:
         print(format_alignment(*alignment))
 
-    assert alseq1.replace('-','') == seq1
-    assert alseq2.replace('-','') == seq2
+    assert alseq1.replace("-", "") == seq1
+    assert alseq2.replace("-", "") == seq2
 
     align = {}
-    for i,(a,b) in enumerate(zip(alseq1, alseq2)):
-        if a!= '-' and b!='-':
-            #assert begin <= i <= end
-            pos1 = i-alseq1[:i].count('-')
-            pos2 = i-alseq2[:i].count('-')
+    for i, (a, b) in enumerate(zip(alseq1, alseq2)):
+        if a != "-" and b != "-":
+            # assert begin <= i <= end
+            pos1 = i - alseq1[:i].count("-")
+            pos2 = i - alseq2[:i].count("-")
             align[pos1] = pos2
     return align
 
 
 # for building pmhc:TCR models from allele/v/j/cdr3 info
 
+
 def get_v_seq_up_to_cys(organism, v_gene):
     vg = all_genes[organism][v_gene]
 
     alseq = vg.alseq
-    cys_alpos = vg.cdr_columns[-1][0]-1 # they were 1-indexed, now 0-indexed
-    if alseq[cys_alpos] != 'C':
-        print('notC', alseq[cys_alpos])
-    geneseq = alseq[:cys_alpos+1].replace(ALL_GENES_GAP_CHAR, '')
+    cys_alpos = vg.cdr_columns[-1][0] - 1  # they were 1-indexed, now 0-indexed
+    if alseq[cys_alpos] != "C":
+        print("notC", alseq[cys_alpos])
+    geneseq = alseq[: cys_alpos + 1].replace(ALL_GENES_GAP_CHAR, "")
     return geneseq
 
+
 def get_j_seq_after_cdr3(organism, j_gene):
     jg = all_genes[organism][j_gene]
     alseq = jg.alseq
     cdr3seq = jg.cdrs[0]
     assert alseq.startswith(cdr3seq)
-    return alseq[len(cdr3seq):].replace(ALL_GENES_GAP_CHAR,'')
+    return alseq[len(cdr3seq) :].replace(ALL_GENES_GAP_CHAR, "")
+
 
-#A02_seq = 'GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRMEPRAPWIEQEGPEYWDGETRKVKAHSQTHRVDLGTLRGYYNQSEAGSHTVQRMYGCDVGSDWRFLRGYHQYAYDGKDYIALKEDLRSWTAADMAAQTTKHKWEAAHVAEQLRAYLEGTCVEWLRRYLENGKETLQR'
+# A02_seq = 'GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRMEPRAPWIEQEGPEYWDGETRKVKAHSQTHRVDLGTLRGYYNQSEAGSHTVQRMYGCDVGSDWRFLRGYHQYAYDGKDYIALKEDLRSWTAADMAAQTTKHKWEAAHVAEQLRAYLEGTCVEWLRRYLENGKETLQR'
 
 
 ## borrowed from cdr3s_human.py
 ## 1-indexed:
-extra_alignment_columns_1x = { 'mouse':{'A':[9,86],'B':[] }, ## 1-indexed
-                               'human':{'A':[],'B':[] } }
+extra_alignment_columns_1x = {
+    "mouse": {"A": [9, 86], "B": []},  ## 1-indexed
+    "human": {"A": [], "B": []},
+}
 
 
 core_positions_generic_1x = [
-    21, 23, 25,   ##  23 is C
-    39, 41,       ##  41 is W
-    53, 54, 55,
-    78,           ##            maybe also 80?
-    89,           ##  89 is L
-    102, 103, 104 ## 104 is C
+    21,
+    23,
+    25,  ##  23 is C
+    39,
+    41,  ##  41 is W
+    53,
+    54,
+    55,
+    78,  ##            maybe also 80?
+    89,  ##  89 is L
+    102,
+    103,
+    104,  ## 104 is C
 ]
 
 all_core_alseq_positions_0x = {}
 for organism in extra_alignment_columns_1x:
-    for ab,xcols in extra_alignment_columns_1x[organism].items():
-        positions = [ x-1+sum(y<=x for y in xcols) # this is not quite right but it wrks
-                      for x in core_positions_generic_1x ]
+    for ab, xcols in extra_alignment_columns_1x[organism].items():
+        positions = [
+            x
+            - 1
+            + sum(y <= x for y in xcols)  # this is not quite right but it wrks
+            for x in core_positions_generic_1x
+        ]
 
         if False:
-            for v,g in all_genes[organism].items():
-                if g.chain == ab and g.region == 'V' and v.endswith('*01'):
-                    coreseq = ''.join([g.alseq[x] for x in positions])
+            for v, g in all_genes[organism].items():
+                if g.chain == ab and g.region == "V" and v.endswith("*01"):
+                    coreseq = "".join([g.alseq[x] for x in positions])
                     print(coreseq, v, organism, ab)
 
-        all_core_alseq_positions_0x[(organism,ab)] = positions
+        all_core_alseq_positions_0x[(organism, ab)] = positions
 
 
 def get_core_positions_0x(organism, v_gene):
@@ -293,99 +328,118 @@ def align_chainseq_to_imgt_msa(organism, chainseq, v_gene):
     geneseq = get_v_seq_up_to_cys(organism, v_gene)
 
     # extends past end of geneseq
-    geneseq_to_alseq = {k-alseq[:k].count(ALL_GENES_GAP_CHAR) : k
-                        for k,a in enumerate(alseq) if a!=ALL_GENES_GAP_CHAR}
+    geneseq_to_alseq = {
+        k - alseq[:k].count(ALL_GENES_GAP_CHAR): k
+        for k, a in enumerate(alseq)
+        if a != ALL_GENES_GAP_CHAR
+    }
 
     chainseq_to_geneseq = blosum_align(chainseq, geneseq)
 
-    chainseq_to_alseq = {i:geneseq_to_alseq[j]
-                         for i,j in chainseq_to_geneseq.items()}
+    chainseq_to_alseq = {
+        i: geneseq_to_alseq[j] for i, j in chainseq_to_geneseq.items()
+    }
 
     return chainseq_to_alseq
 
 
-def align_chainseq_to_structure_msa(organism, chainseq, v_gene, msa_type='both'):
-    assert msa_type in ['both','human']
+def align_chainseq_to_structure_msa(
+    organism, chainseq, v_gene, msa_type="both"
+):
+    assert msa_type in ["both", "human"]
 
-    if msa_type == 'human':
-        assert organism == 'human'
-        #assert v_gene in human_structure_alignments.index
+    if msa_type == "human":
+        assert organism == "human"
+        # assert v_gene in human_structure_alignments.index
         row = human_structure_alignments.loc[v_gene]
     else:
-        row = both_structure_alignments.loc[(organism,v_gene)]
+        row = both_structure_alignments.loc[(organism, v_gene)]
 
     alseq = row.alseq
 
     geneseq = get_v_seq_up_to_cys(organism, v_gene)
-    assert alseq.replace(ALL_GENES_GAP_CHAR,'') == geneseq
+    assert alseq.replace(ALL_GENES_GAP_CHAR, "") == geneseq
 
     #
-    geneseq_to_alseq = {k-alseq[:k].count(ALL_GENES_GAP_CHAR) : k
-                        for k,a in enumerate(alseq) if a!=ALL_GENES_GAP_CHAR}
+    geneseq_to_alseq = {
+        k - alseq[:k].count(ALL_GENES_GAP_CHAR): k
+        for k, a in enumerate(alseq)
+        if a != ALL_GENES_GAP_CHAR
+    }
 
     chainseq_to_geneseq = blosum_align(chainseq, geneseq)
 
-    chainseq_to_alseq = {i:geneseq_to_alseq[j]
-                         for i,j in chainseq_to_geneseq.items()}
+    chainseq_to_alseq = {
+        i: geneseq_to_alseq[j] for i, j in chainseq_to_geneseq.items()
+    }
 
     return chainseq_to_alseq
 
+
 _tcr_alignment_cache = None
+
+
 def align_tcr_info_pdb_chain_to_structure_msa(pdbid, ab, msa_type_in):
-    ''' pdbid,ab has to be in the tcr_info index
-    '''
+    """pdbid,ab has to be in the tcr_info index"""
     global _tcr_alignment_cache
-    assert msa_type_in in ['both','human']
+    assert msa_type_in in ["both", "human"]
     if _tcr_alignment_cache is None:
         _tcr_alignment_cache = {
-            'both' :{'A':{}, 'B':{}},
-            'human':{'A':{}, 'B':{}}
+            "both": {"A": {}, "B": {}},
+            "human": {"A": {}, "B": {}},
         }
-        print('setting up cache for align_tcr_info_pdb_chain_to_structure_msa function')
+        print(
+            "setting up cache for align_tcr_info_pdb_chain_to_structure_msa function"
+        )
         for l in tcr_info.itertuples():
             geneseq = get_v_seq_up_to_cys(l.organism, l.v_gene)
-            for msa_type in ['both', 'human']:
-                if msa_type == 'human':
-                    if l.organism != 'human':
+            for msa_type in ["both", "human"]:
+                if msa_type == "human":
+                    if l.organism != "human":
                         continue
                     row = human_structure_alignments.loc[l.v_gene]
                 else:
                     row = both_structure_alignments.loc[(l.organism, l.v_gene)]
                 alseq = row.alseq
-                assert alseq.replace(ALL_GENES_GAP_CHAR,'') == geneseq
-                geneseq_to_alseq = {k-alseq[:k].count(ALL_GENES_GAP_CHAR) : k
-                                    for k,a in enumerate(alseq)
-                                    if a!=ALL_GENES_GAP_CHAR}
+                assert alseq.replace(ALL_GENES_GAP_CHAR, "") == geneseq
+                geneseq_to_alseq = {
+                    k - alseq[:k].count(ALL_GENES_GAP_CHAR): k
+                    for k, a in enumerate(alseq)
+                    if a != ALL_GENES_GAP_CHAR
+                }
 
                 chainseq_to_geneseq = blosum_align(l.chainseq, geneseq)
 
-                chainseq_to_alseq = {i:geneseq_to_alseq[j]
-                                     for i,j in chainseq_to_geneseq.items()}
-                _tcr_alignment_cache[msa_type][l.ab][l.pdbid] = chainseq_to_alseq
-
-        print('DONE setting up cache for align_tcr_info_pdb_chain_to_structure_msa',
-              'function')
+                chainseq_to_alseq = {
+                    i: geneseq_to_alseq[j]
+                    for i, j in chainseq_to_geneseq.items()
+                }
+                _tcr_alignment_cache[msa_type][l.ab][
+                    l.pdbid
+                ] = chainseq_to_alseq
+
+        print(
+            "DONE setting up cache for align_tcr_info_pdb_chain_to_structure_msa",
+            "function",
+        )
     return _tcr_alignment_cache[msa_type_in][ab][pdbid]
 
 
-
 def align_cdr3s(a, b):
-    shortseq, longseq = (a,b) if len(a)<len(b) else (b,a)
+    shortseq, longseq = (a, b) if len(a) < len(b) else (b, a)
     lenshort, lenlong = len(shortseq), len(longseq)
     assert lenshort >= 5
-    gappos = min( 6, 3 + (lenshort-5)//2 )
-    num_gaps = lenlong-lenshort
-    align = {i:i for i in range(gappos)}
-    align.update({i:i+num_gaps for i in range(gappos,lenshort)})
-    if len(a)>=len(b): # reverse
-        align = {j:i for i,j in align.items()}
+    gappos = min(6, 3 + (lenshort - 5) // 2)
+    num_gaps = lenlong - lenshort
+    align = {i: i for i in range(gappos)}
+    align.update({i: i + num_gaps for i in range(gappos, lenshort)})
+    if len(a) >= len(b):  # reverse
+        align = {j: i for i, j in align.items()}
     return align
 
 
-
 def get_tcr_chain_trim_positions(organism, chainseq, v, j, cdr3):
-    ''' could include None if the position is missing (deleted) for this v gene...
-    '''
+    """could include None if the position is missing (deleted) for this v gene..."""
     assert cdr3 in chainseq
     nterm_segment_alseq_positions_0x = [3, 4, 5]
     # 1. imgt alignment 1-index positions 4-6 ([VIL].Q)
@@ -396,7 +450,7 @@ def get_tcr_chain_trim_positions(organism, chainseq, v, j, cdr3):
     alseq = vg.alseq
 
     geneseq = get_v_seq_up_to_cys(organism, v)
-    assert alseq.replace(ALL_GENES_GAP_CHAR,'').startswith(geneseq)
+    assert alseq.replace(ALL_GENES_GAP_CHAR, "").startswith(geneseq)
 
     geneseq_to_chainseq = blosum_align(geneseq, chainseq)
 
@@ -406,52 +460,56 @@ def get_tcr_chain_trim_positions(organism, chainseq, v, j, cdr3):
     geneseq_positions = []
     for pos in nterm_segment_alseq_positions_0x:
         if alseq[pos] == ALL_GENES_GAP_CHAR:
-            print('N None:', organism, v)
+            print("N None:", organism, v)
             geneseq_positions.append(None)
         else:
-            geneseq_positions.append(pos-alseq[:pos].count(ALL_GENES_GAP_CHAR))
+            geneseq_positions.append(
+                pos - alseq[:pos].count(ALL_GENES_GAP_CHAR)
+            )
 
-    start, stop = core_positions_0x[0], core_positions_0x[9] # inclusive!
-    geneseq_positions.extend(range(start, stop+1))
+    start, stop = core_positions_0x[0], core_positions_0x[9]  # inclusive!
+    geneseq_positions.extend(range(start, stop + 1))
 
-    chainseq_positions = [geneseq_to_chainseq.get(pos,None)
-                          for pos in geneseq_positions]
+    chainseq_positions = [
+        geneseq_to_chainseq.get(pos, None) for pos in geneseq_positions
+    ]
 
-    start = geneseq_to_chainseq[core_positions_0x[10]-1]
-    stop = chainseq.index(cdr3)+len(cdr3)+3 # index of position after GXG
-    chainseq_positions.extend(range(start, stop+1))
+    start = geneseq_to_chainseq[core_positions_0x[10] - 1]
+    stop = chainseq.index(cdr3) + len(cdr3) + 3  # index of position after GXG
+    chainseq_positions.extend(range(start, stop + 1))
 
     if None in chainseq_positions:
-        print(organism, v, [i for i,x in enumerate(chainseq_positions) if x is None])
+        print(
+            organism,
+            v,
+            [i for i, x in enumerate(chainseq_positions) if x is None],
+        )
 
         blosum_align(geneseq, chainseq, verbose=True)
 
-
     return chainseq_positions
 
 
-
-
 def align_vgene_to_template_pdb_chain(
-        ab,
-        trg_organism,
-        trg_v,
-        trg_j,
-        trg_cdr3,
-        trg_msa_alignments,
-        tmp_pdbid,
-        # tmp_organism,
-        # tmp_v,
-        # tmp_j,
-        # tmp_cdr3,
-        # tmp_chainseq,
-        verbose=False,
+    ab,
+    trg_organism,
+    trg_v,
+    trg_j,
+    trg_cdr3,
+    trg_msa_alignments,
+    tmp_pdbid,
+    # tmp_organism,
+    # tmp_v,
+    # tmp_j,
+    # tmp_cdr3,
+    # tmp_chainseq,
+    verbose=False,
 ):
-    ''' Returns trg_to_tmp, trg_chainseq
+    """Returns trg_to_tmp, trg_chainseq
 
     trg_to_tmp is 0-indexed dict aligning trg_chainseq to tmp_chainseq
 
-    '''
+    """
     # align trg_vg to nearest rep (trg_rep) in structure MSA
     # align tmp_vg to nearest rep (tmp_rep) in structure MSA
     # use structure MSA to create alignment between trg_rep and tmp_rep
@@ -459,7 +517,7 @@ def align_vgene_to_template_pdb_chain(
     # align cdr3 with gaps in the middle
     # align jgenes
 
-    tmp_row = tcr_info.loc[(tmp_pdbid,ab)]
+    tmp_row = tcr_info.loc[(tmp_pdbid, ab)]
     tmp_organism = tmp_row.organism
     tmp_chainseq = tmp_row.chainseq
     tmp_cdr3 = tmp_row.cdr3
@@ -470,8 +528,11 @@ def align_vgene_to_template_pdb_chain(
     trg_chainseq = trg_vseq[:-1] + trg_cdr3 + trg_jseq
 
     # align V regions
-    msa_type = ('human' if trg_organism=='human' and tmp_organism=='human' else
-                'both')
+    msa_type = (
+        "human"
+        if trg_organism == "human" and tmp_organism == "human"
+        else "both"
+    )
 
     trg_vseq_to_alseq = trg_msa_alignments[msa_type]
 
@@ -480,9 +541,10 @@ def align_vgene_to_template_pdb_chain(
 
     # assert (sorted(trg_vseq_to_alseq.items()) ==
     #         sorted(trg_vseq_to_alseq_old.items()))
-    #print(tmp_pdbid, tmp_organism, ab, msa_type)
+    # print(tmp_pdbid, tmp_organism, ab, msa_type)
     tmp_vseq_to_alseq = align_tcr_info_pdb_chain_to_structure_msa(
-        tmp_pdbid, ab, msa_type)
+        tmp_pdbid, ab, msa_type
+    )
 
     # tmp_vseq_to_alseq_old = align_chainseq_to_structure_msa(
     #     tmp_organism, tmp_chainseq, tmp_row.v_gene, msa_type=msa_type)
@@ -494,38 +556,42 @@ def align_vgene_to_template_pdb_chain(
     # assert (sorted(tmp_vseq_to_alseq.items()) ==
     #         sorted(tmp_vseq_to_alseq_old.items()))
 
-    alseq_to_tmp_vseq = {j:i for i,j in tmp_vseq_to_alseq.items()}
+    alseq_to_tmp_vseq = {j: i for i, j in tmp_vseq_to_alseq.items()}
 
-    trg_to_tmp = {} # alignment over this TCR chain, 0-indexed
-    for i,j in trg_vseq_to_alseq.items():
+    trg_to_tmp = {}  # alignment over this TCR chain, 0-indexed
+    for i, j in trg_vseq_to_alseq.items():
         if j in alseq_to_tmp_vseq:
-            trg_to_tmp[i]=alseq_to_tmp_vseq[j]
+            trg_to_tmp[i] = alseq_to_tmp_vseq[j]
 
     # align cdr3 regions
-    assert trg_vseq[-1] == 'C'
-    trg_offset = len(trg_vseq)-1 # skip 'C'
+    assert trg_vseq[-1] == "C"
+    trg_offset = len(trg_vseq) - 1  # skip 'C'
     tmp_offset = tmp_chainseq.index(tmp_cdr3)
 
     al = align_cdr3s(trg_cdr3, tmp_cdr3)
-    trg_to_tmp.update({i+trg_offset:j+tmp_offset
-                       for i,j in al.items()})
+    trg_to_tmp.update({i + trg_offset: j + tmp_offset for i, j in al.items()})
     trg_offset += len(trg_cdr3)
     tmp_offset += len(tmp_cdr3)
 
-
     # align j region
-    tmp_jseq= tmp_chainseq[tmp_chainseq.index(tmp_cdr3)+
-                           len(tmp_cdr3):]
-    if (trg_jseq[:3]+tmp_jseq[:3]).count('G') < 4:
-        print('GXGs?', trg_jseq[:3], tmp_jseq[:3])
+    tmp_jseq = tmp_chainseq[tmp_chainseq.index(tmp_cdr3) + len(tmp_cdr3) :]
+    if (trg_jseq[:3] + tmp_jseq[:3]).count("G") < 4:
+        print("GXGs?", trg_jseq[:3], tmp_jseq[:3])
     for i in range(min(len(trg_jseq), len(tmp_jseq))):
-        trg_to_tmp[trg_offset+i] = tmp_offset+i
+        trg_to_tmp[trg_offset + i] = tmp_offset + i
 
-    identities = sum(trg_chainseq[i]==tmp_chainseq[j]
-                     for i,j in trg_to_tmp.items())/len(trg_chainseq)
+    identities = sum(
+        trg_chainseq[i] == tmp_chainseq[j] for i, j in trg_to_tmp.items()
+    ) / len(trg_chainseq)
     if verbose:
-        print(f'align_vgene_to_template_pdb_chain: {identities:6.3f}',
-              trg_v, tmp_v, trg_cdr3, tmp_cdr3, verbose=verbose)
+        print(
+            f"align_vgene_to_template_pdb_chain: {identities:6.3f}",
+            trg_v,
+            tmp_v,
+            trg_cdr3,
+            tmp_cdr3,
+            verbose=verbose,
+        )
 
     return trg_to_tmp, trg_chainseq
 
@@ -610,110 +676,123 @@ def get_mhc_class_1_alseq(allele):
         return mhc_class_1_alfas[allele]
     sortl = []
     for k in mhc_class_1_alfas:
-        if k.startswith(allele) and k[len(allele)] == ':':
-            suffix = [int(x) if x.isdigit() else 100
-                      for x in k[len(allele)+1:].split(':')]
+        if k.startswith(allele) and k[len(allele)] == ":":
+            suffix = [
+                int(x) if x.isdigit() else 100
+                for x in k[len(allele) + 1 :].split(":")
+            ]
             sortl.append((suffix, k))
     if sortl:
         sortl.sort()
         best_allele = sortl[0][1]
-        #print('close allele:', allele, best_allele)
+        # print('close allele:', allele, best_allele)
         return mhc_class_1_alfas[best_allele]
-    elif allele.count(':')>1:
-        trim_allele = ':'.join(allele.split(':')[:2])
+    elif allele.count(":") > 1:
+        trim_allele = ":".join(allele.split(":")[:2])
         return get_mhc_class_1_alseq(trim_allele)
     else:
         return None
 
+
 def get_mhc_class_2_alseq(chain, allele):
     if allele in mhc_class_2_alfas[chain]:
         return mhc_class_2_alfas[chain][allele]
-    assert '*' in allele # human
+    assert "*" in allele  # human
     sortl = []
     for k in mhc_class_2_alfas[chain]:
-        if k.startswith(allele) and k[len(allele)] == ':':
-            suffix = [int(x) if x.isdigit() else 100
-                      for x in k[len(allele)+1:].split(':')]
+        if k.startswith(allele) and k[len(allele)] == ":":
+            suffix = [
+                int(x) if x.isdigit() else 100
+                for x in k[len(allele) + 1 :].split(":")
+            ]
             sortl.append((suffix, k))
     if sortl:
         sortl.sort()
         best_allele = sortl[0][1]
-        #print('close allele:', allele, best_allele)
+        # print('close allele:', allele, best_allele)
         return mhc_class_2_alfas[chain][best_allele]
-    elif allele.count(':')>1:
-        trim_allele = ':'.join(allele.split(':')[:2])
+    elif allele.count(":") > 1:
+        trim_allele = ":".join(allele.split(":")[:2])
         return get_mhc_class_2_alseq(chain, trim_allele)
     else:
         return None
 
 
 def get_template_pose_and_tdinfo(pdbid, complex_type):
-    ''' returns pose, tdinfo
+    """returns pose, tdinfo
     complex_type should be in {TCR, TERNARY, PMHC}
-    '''
-    info, poses = all_template_info[complex_type], all_template_poses[complex_type]
+    """
+    info, poses = (
+        all_template_info[complex_type],
+        all_template_poses[complex_type],
+    )
     if pdbid not in poses:
-        pdbfile = set(info[info.pdbid==pdbid].pdbfile)
+        pdbfile = set(info[info.pdbid == pdbid].pdbfile)
         if not pdbfile:
             # right now we only have class 1 in the special pmhc info...
             assert complex_type == PMHC
-            info = all_template_info[TERNARY] # NOTE NOTE NOTE
-            pdbfile = set(info[info.pdbid==pdbid].pdbfile)
+            info = all_template_info[TERNARY]  # NOTE NOTE NOTE
+            pdbfile = set(info[info.pdbid == pdbid].pdbfile)
         assert len(pdbfile) == 1
-        pdbfile = str(path_to_db) + '/' + pdbfile.pop()
-        #print('read:', pdbfile)
+        pdbfile = str(path_to_db) + "/" + pdbfile.pop()
+        # print('read:', pdbfile)
         pose = pdblite.pose_from_pdb(pdbfile)
-        tdifile = pdbfile+'.tcrdock_info.json'
-        tdinfo = TCRdockInfo().from_string(open(tdifile,'r').read())
-        #print('make tdinfo 0-indexed:', tdifile)
-        #tdinfo.renumber({i+1:i for i in range(len(pose['sequence']))})
+        tdifile = pdbfile + ".tcrdock_info.json"
+        tdinfo = TCRdockInfo().from_string(open(tdifile, "r").read())
+        # print('make tdinfo 0-indexed:', tdifile)
+        # tdinfo.renumber({i+1:i for i in range(len(pose['sequence']))})
         poses[pdbid] = (pose, tdinfo)
     pose, tdinfo = poses[pdbid]
-    return copy.deepcopy(pose), TCRdockInfo().from_dict(tdinfo.to_dict())# return copies
+    return copy.deepcopy(pose), TCRdockInfo().from_dict(
+        tdinfo.to_dict()
+    )  # return copies
+
 
-def count_peptide_mismatches(a,b):
-    if len(a)>len(b):
-        return count_peptide_mismatches(b,a)
-    lendiff = len(b)-len(a)
-    assert lendiff>=0
+def count_peptide_mismatches(a, b):
+    if len(a) > len(b):
+        return count_peptide_mismatches(b, a)
+    lendiff = len(b) - len(a)
+    assert lendiff >= 0
     min_mismatches = len(a)
-    for shift in range(lendiff+1):
-        mismatches = sum(x!=y for x,y in zip(a,b[shift:]))
+    for shift in range(lendiff + 1):
+        mismatches = sum(x != y for x, y in zip(a, b[shift:]))
         if mismatches < min_mismatches:
             min_mismatches = mismatches
     # also allow bulging out in the middle
-    nt = len(a)//2
+    nt = len(a) // 2
     ct = len(a) - nt
-    btrim = b[:nt]+b[-ct:]
+    btrim = b[:nt] + b[-ct:]
     assert len(btrim) == len(a)
-    mismatches = sum(x!=y for x,y in zip(a,btrim))
+    mismatches = sum(x != y for x, y in zip(a, btrim))
     if mismatches < min_mismatches:
         min_mismatches = mismatches
     return min_mismatches
 
-pep1 = 'DSIODJSJD' # sanity checking...
-pep2 = 'DSIXDJSJD'
-assert count_peptide_mismatches(pep1,pep1)==0
-assert count_peptide_mismatches(pep1,pep1[:-1])==0
-assert count_peptide_mismatches(pep1,pep2)==1
+
+pep1 = "DSIODJSJD"  # sanity checking...
+pep2 = "DSIXDJSJD"
+assert count_peptide_mismatches(pep1, pep1) == 0
+assert count_peptide_mismatches(pep1, pep1[:-1]) == 0
+assert count_peptide_mismatches(pep1, pep2) == 1
+
 
 def get_clean_and_nonredundant_ternary_tcrs_df(
-        min_peptide_mismatches = 3,
-        min_tcrdist = 120.5,
-        peptide_tcrdist_logical = 'or', # 'or' or 'and' -- 'or' is nr by either feature
-        drop_HLA_E = True,
-        verbose=False,
-        skip_redundancy_check=False, # just add resol,mismatches and sort
-        filter_BAD_DGEOM_PDBIDS=True,
-        filter_BAD_PMHC_PDBIDS=True,
-        tcrs=None,
+    min_peptide_mismatches=3,
+    min_tcrdist=120.5,
+    peptide_tcrdist_logical="or",  # 'or' or 'and' -- 'or' is nr by either feature
+    drop_HLA_E=True,
+    verbose=False,
+    skip_redundancy_check=False,  # just add resol,mismatches and sort
+    filter_BAD_DGEOM_PDBIDS=True,
+    filter_BAD_PMHC_PDBIDS=True,
+    tcrs=None,
 ):
-    ''' peptide_tcrdist_logical = 'or' is more stringent redundancy filtering
+    """peptide_tcrdist_logical = 'or' is more stringent redundancy filtering
     ie, smaller df returned. A TCR:pMHC is considered redundant by peptide OR
     by TCRdist.
     'and' means redundant only if both peptide and TCRdist redundant
-    '''
-    assert peptide_tcrdist_logical in ['or','and']
+    """
+    assert peptide_tcrdist_logical in ["or", "and"]
 
     if tcrs is None:
         tcrs = ternary_info.copy()
@@ -725,52 +804,61 @@ def get_clean_and_nonredundant_ternary_tcrs_df(
         bad_pdbids.update(set(BAD_DGEOM_PDBIDS))
     if filter_BAD_PMHC_PDBIDS:
         bad_pdbids.update(set(BAD_PMHC_PDBIDS))
-    print('get_clean_and_nonredundant_ternary_tcrs_df: bad_pdbids', bad_pdbids)
+    print("get_clean_and_nonredundant_ternary_tcrs_df: bad_pdbids", bad_pdbids)
 
     tcrs = tcrs[~tcrs.pdbid.isin(bad_pdbids)]
 
-    if drop_HLA_E: # drop HLA-E
-        tcrs = tcrs[~tcrs.mhc_allele.str.startswith('E*')]
+    if drop_HLA_E:  # drop HLA-E
+        tcrs = tcrs[~tcrs.mhc_allele.str.startswith("E*")]
 
     # need to add resolution
     pdbid2resolution = {}
     pdbid2mismatches = {}
-    for organism in 'human mouse'.split():
+    for organism in "human mouse".split():
         # temporary hack...
-        logfile = path_to_db / f'tmp.pdb_tcr_{organism}.2023-06-02.log'
+        logfile = path_to_db / f"tmp.pdb_tcr_{organism}.2023-06-02.log"
         # logfile = path_to_db / f'tmp.pdb_tcr_{organism}.2021-08-05.log'
         assert exists(logfile)
-        for line in os.popen(f'grep ^both {logfile}'):
+        for line in os.popen(f"grep ^both {logfile}"):
             l = line.split()
             pdbid = l[1]
             resolution = float(l[-2])
             v_mismatches = int(l[3]) + int(l[7])
             pdbid2resolution[pdbid] = resolution
-            pdbid2mismatches[pdbid] = min(v_mismatches, pdbid2mismatches.get(pdbid,100))
+            pdbid2mismatches[pdbid] = min(
+                v_mismatches, pdbid2mismatches.get(pdbid, 100)
+            )
 
-    tcrs['resolution'] = tcrs.pdbid.map(pdbid2resolution)
-    tcrs['mismatches'] = tcrs.pdbid.map(pdbid2mismatches)
-    assert tcrs.resolution.isna().sum()==0
+    tcrs["resolution"] = tcrs.pdbid.map(pdbid2resolution)
+    tcrs["mismatches"] = tcrs.pdbid.map(pdbid2mismatches)
+    assert tcrs.resolution.isna().sum() == 0
 
     ## remove redundancy
     # sort by count for peptide, then resolution
 
-    tcrs['org_pep'] = (tcrs.mhc_class.astype(str) + "_" +
-                       tcrs.organism + "_" + tcrs.pep_seq)
-    tcrs['org_pep_count'] = tcrs.org_pep.map(tcrs.org_pep.value_counts())
+    tcrs["org_pep"] = (
+        tcrs.mhc_class.astype(str) + "_" + tcrs.organism + "_" + tcrs.pep_seq
+    )
+    tcrs["org_pep_count"] = tcrs.org_pep.map(tcrs.org_pep.value_counts())
 
-    tcrs['neg_quality'] = -10*tcrs.org_pep_count + tcrs.resolution + 0.5*tcrs.mismatches
+    tcrs["neg_quality"] = (
+        -10 * tcrs.org_pep_count + tcrs.resolution + 0.5 * tcrs.mismatches
+    )
 
-    tcrs.sort_values('neg_quality', inplace=True)
+    tcrs.sort_values("neg_quality", inplace=True)
 
     if not skip_redundancy_check:
-        tdist = {'human': tcrdist.tcr_distances.TcrDistCalculator('human'),
-                 'mouse': tcrdist.tcr_distances.TcrDistCalculator('mouse')}
+        tdist = {
+            "human": tcrdist.tcr_distances.TcrDistCalculator("human"),
+            "mouse": tcrdist.tcr_distances.TcrDistCalculator("mouse"),
+        }
 
-        tcr_tuples = [((l.va, l.ja, l.cdr3a), (l.vb, l.jb, l.cdr3b))
-                      for l in tcrs.itertuples()]
+        tcr_tuples = [
+            ((l.va, l.ja, l.cdr3a), (l.vb, l.jb, l.cdr3b))
+            for l in tcrs.itertuples()
+        ]
 
-        is_redundant = [False]*tcrs.shape[0]
+        is_redundant = [False] * tcrs.shape[0]
 
         for i, irow in enumerate(tcrs.itertuples()):
             # check to see if i is too close to any previous tcr
@@ -780,223 +868,305 @@ def get_clean_and_nonredundant_ternary_tcrs_df(
                     continue
 
                 pep_mms = count_peptide_mismatches(irow.pep_seq, jrow.pep_seq)
-                if (irow.organism == jrow.organism and
-                    irow.mhc_class == jrow.mhc_class):
+                if (
+                    irow.organism == jrow.organism
+                    and irow.mhc_class == jrow.mhc_class
+                ):
                     tcrd = tdist[irow.organism](tcr_tuples[i], tcr_tuples[j])
-                    pep_red = (pep_mms < min_peptide_mismatches)
-                    tcr_red = (tcrd < min_tcrdist)
-                    if ((peptide_tcrdist_logical=='or' and
-                         (pep_red or tcr_red)) or
-                        (peptide_tcrdist_logical=='and' and
-                         (pep_red and tcr_red))):
+                    pep_red = pep_mms < min_peptide_mismatches
+                    tcr_red = tcrd < min_tcrdist
+                    if (
+                        peptide_tcrdist_logical == "or"
+                        and (pep_red or tcr_red)
+                    ) or (
+                        peptide_tcrdist_logical == "and"
+                        and (pep_red and tcr_red)
+                    ):
 
                         # redundant
                         is_redundant[i] = True
                         if verbose:
-                            print('skip:', i, irow.org_pep, irow.neg_quality,
-                                  tcr_tuples[i])
+                            print(
+                                "skip:",
+                                i,
+                                irow.org_pep,
+                                irow.neg_quality,
+                                tcr_tuples[i],
+                            )
                         break
 
-        tcrs['is_redundant'] = is_redundant
+        tcrs["is_redundant"] = is_redundant
 
         tcrs = tcrs[~tcrs.is_redundant].copy()
     return tcrs
 
+
 def filter_templates_by_tcrdist(
-        templates, # dataframe
-        organism, va, cdr3a, vb, cdr3b,
-        min_paired_tcrdist=-1,
-        min_singlechain_tcrdist=-1,
-        verbose=False,
+    templates,  # dataframe
+    organism,
+    va,
+    cdr3a,
+    vb,
+    cdr3b,
+    min_paired_tcrdist=-1,
+    min_singlechain_tcrdist=-1,
+    verbose=False,
 ):
-    ''' returns new templates
+    """returns new templates
 
     will have paired_tcrdist and singlechain_tcrdist cols
-    '''
-    tcrdister = get_tcrdister('human_and_mouse')
+    """
+    tcrdister = get_tcrdister("human_and_mouse")
 
     template_tcrs = [
-        ((l.organism[0]+l.va, None, l.cdr3a), (l.organism[0]+l.vb, None, l.cdr3b))
-        for l in templates.itertuples()]
+        (
+            (l.organism[0] + l.va, None, l.cdr3a),
+            (l.organism[0] + l.vb, None, l.cdr3b),
+        )
+        for l in templates.itertuples()
+    ]
 
-    target_tcr = ((organism[0]+va, None, cdr3a), (organism[0]+vb, None, cdr3b))
+    target_tcr = (
+        (organism[0] + va, None, cdr3a),
+        (organism[0] + vb, None, cdr3b),
+    )
 
-    templates['paired_tcrdist'] = np.array(
-        [tcrdister(target_tcr,x) for x in template_tcrs])
+    templates["paired_tcrdist"] = np.array(
+        [tcrdister(target_tcr, x) for x in template_tcrs]
+    )
 
-    for iab, ab in enumerate('AB'):
-        templates[ab+'_tcrdist'] = np.array(
-            [tcrdister.single_chain_distance(target_tcr[iab], x[iab])
-             for x in template_tcrs])
+    for iab, ab in enumerate("AB"):
+        templates[ab + "_tcrdist"] = np.array(
+            [
+                tcrdister.single_chain_distance(target_tcr[iab], x[iab])
+                for x in template_tcrs
+            ]
+        )
 
-    templates['AB_tcrdist'] = templates.paired_tcrdist
+    templates["AB_tcrdist"] = templates.paired_tcrdist
 
-    templates['singlechain_tcrdist'] = np.minimum(
-        templates.A_tcrdist, templates.B_tcrdist)
+    templates["singlechain_tcrdist"] = np.minimum(
+        templates.A_tcrdist, templates.B_tcrdist
+    )
 
-    too_close_mask = ((templates.organism==organism)&
-                      ((templates.paired_tcrdist < min_paired_tcrdist)|
-                       (templates.singlechain_tcrdist < min_singlechain_tcrdist)))
+    too_close_mask = (templates.organism == organism) & (
+        (templates.paired_tcrdist < min_paired_tcrdist)
+        | (templates.singlechain_tcrdist < min_singlechain_tcrdist)
+    )
     if verbose:
-        print('too close by tcrdist:', np.sum(too_close_mask), organism,
-              va, cdr3a, vb, cdr3b)
+        print(
+            "too close by tcrdist:",
+            np.sum(too_close_mask),
+            organism,
+            va,
+            cdr3a,
+            vb,
+            cdr3b,
+        )
 
     return templates[~too_close_mask].copy()
 
+
 def filter_templates_by_peptide_mismatches(
-        templates,
-        organism,
-        mhc_class,
-        peptides_for_filtering,
-        min_peptide_mismatches,
-        verbose = False,
+    templates,
+    organism,
+    mhc_class,
+    peptides_for_filtering,
+    min_peptide_mismatches,
+    verbose=False,
 ):
     if not peptides_for_filtering:
         return templates.copy()
 
-    templates['filt_peptide_mismatches'] = np.array(
-        [min(count_peptide_mismatches(p, l.pep_seq) for p in peptides_for_filtering)
-             for l in templates.itertuples()])
-    too_close_mask = ((templates.organism==organism) &
-                      (templates.mhc_class==mhc_class) &
-                      (templates.filt_peptide_mismatches<min_peptide_mismatches))
+    templates["filt_peptide_mismatches"] = np.array(
+        [
+            min(
+                count_peptide_mismatches(p, l.pep_seq)
+                for p in peptides_for_filtering
+            )
+            for l in templates.itertuples()
+        ]
+    )
+    too_close_mask = (
+        (templates.organism == organism)
+        & (templates.mhc_class == mhc_class)
+        & (templates.filt_peptide_mismatches < min_peptide_mismatches)
+    )
     if verbose:
-        print('too close by peptide mismatches:', np.sum(too_close_mask), organism,
-              peptides_for_filtering)
+        print(
+            "too close by peptide mismatches:",
+            np.sum(too_close_mask),
+            organism,
+            peptides_for_filtering,
+        )
 
     return templates[~too_close_mask].copy()
 
+
 def pick_dgeom_templates(
-        num_templates, # per chain setting
-        organism,
-        mhc_class,
-        mhc_allele,
-        peptide,
-        va,
-        cdr3a,
-        vb,
-        cdr3b,
-        min_peptide_mismatches=3, # ie, 3 or more is OK
-        min_paired_tcrdist=96.5, # 96 or less is BAD
-        min_singlechain_tcrdist=35.9, # ie 36 or more is OK
-        min_template_template_paired_tcrdist = 35.9, # for redundancy filtering
-        peptides_for_filtering = None,
-        drop_HLA_E = True,
-        exclude_pdbids = None,
+    num_templates,  # per chain setting
+    organism,
+    mhc_class,
+    mhc_allele,
+    peptide,
+    va,
+    cdr3a,
+    vb,
+    cdr3b,
+    min_peptide_mismatches=3,  # ie, 3 or more is OK
+    min_paired_tcrdist=96.5,  # 96 or less is BAD
+    min_singlechain_tcrdist=35.9,  # ie 36 or more is OK
+    min_template_template_paired_tcrdist=35.9,  # for redundancy filtering
+    peptides_for_filtering=None,
+    drop_HLA_E=True,
+    exclude_pdbids=None,
 ):
-    ''' returns dict
+    """returns dict
     {
       'A' : alpha_templates_df,
       'B' : beta_templates_df,
       'AB': paired_templates_df,
     }
 
-    '''
+    """
     if peptides_for_filtering is None:
         peptides_for_filtering = []
     elif peptides_for_filtering:
-        assert peptide in peptides_for_filtering # sanity? seems like should be true
+        assert (
+            peptide in peptides_for_filtering
+        )  # sanity? seems like should be true
 
     if exclude_pdbids is None:
         exclude_pdbids = []
     else:
         exclude_pdbids = frozenset(exclude_pdbids)
 
-    print('pick_dgeom_templates:', min_peptide_mismatches, min_paired_tcrdist,
-          min_singlechain_tcrdist, min_template_template_paired_tcrdist,
-          peptides_for_filtering)
+    print(
+        "pick_dgeom_templates:",
+        min_peptide_mismatches,
+        min_paired_tcrdist,
+        min_singlechain_tcrdist,
+        min_template_template_paired_tcrdist,
+        peptides_for_filtering,
+    )
 
-    tcrdister = get_tcrdister('human_and_mouse')
+    tcrdister = get_tcrdister("human_and_mouse")
 
     # sorts by pep-count, resolution, mismatches
     # excludes BAD_DGEOM_PDBIDS but not BAD_PMHC_PDBIDS
     templates = get_clean_and_nonredundant_ternary_tcrs_df(
-        skip_redundancy_check=True, filter_BAD_PMHC_PDBIDS=False,
-        drop_HLA_E=drop_HLA_E)
+        skip_redundancy_check=True,
+        filter_BAD_PMHC_PDBIDS=False,
+        drop_HLA_E=drop_HLA_E,
+    )
 
     templates = templates[templates.mhc_class == mhc_class]
     templates = templates[~templates.pdbid.isin(exclude_pdbids)]
 
     templates = filter_templates_by_peptide_mismatches(
-        templates, organism, mhc_class, peptides_for_filtering,
-        min_peptide_mismatches)
+        templates,
+        organism,
+        mhc_class,
+        peptides_for_filtering,
+        min_peptide_mismatches,
+    )
 
     templates = filter_templates_by_tcrdist(
-        templates, organism, va, cdr3a, vb, cdr3b,
+        templates,
+        organism,
+        va,
+        cdr3a,
+        vb,
+        cdr3b,
         min_paired_tcrdist=min_paired_tcrdist,
         min_singlechain_tcrdist=min_singlechain_tcrdist,
     )
 
     # currently unused...
-    templates['peptide_mismatches'] = np.array(
-        [count_peptide_mismatches(peptide, l.pep_seq)
-         for l in templates.itertuples()])
+    templates["peptide_mismatches"] = np.array(
+        [
+            count_peptide_mismatches(peptide, l.pep_seq)
+            for l in templates.itertuples()
+        ]
+    )
 
     def calc_chain_tcrdist(row1, row2, chain, tcrdister=tcrdister):
-        tcr1 = ((row1.organism[0]+row1.va, None, row1.cdr3a),
-                (row1.organism[0]+row1.vb, None, row1.cdr3b))
-        tcr2 = ((row2.organism[0]+row2.va, None, row2.cdr3a),
-                (row2.organism[0]+row2.vb, None, row2.cdr3b))
-        if chain == 'AB':
+        tcr1 = (
+            (row1.organism[0] + row1.va, None, row1.cdr3a),
+            (row1.organism[0] + row1.vb, None, row1.cdr3b),
+        )
+        tcr2 = (
+            (row2.organism[0] + row2.va, None, row2.cdr3a),
+            (row2.organism[0] + row2.vb, None, row2.cdr3b),
+        )
+        if chain == "AB":
             return tcrdister(tcr1, tcr2)
         else:
-            assert chain in ['A','B']
-            ich = 'AB'.index(chain)
+            assert chain in ["A", "B"]
+            ich = "AB".index(chain)
             return tcrdister.single_chain_distance(tcr1[ich], tcr2[ich])
 
-
     all_templates = {}
-    for chain in ['A','B','AB']:
+    for chain in ["A", "B", "AB"]:
         # sort by
-        templates.sort_values(chain+'_tcrdist', inplace=True)
+        templates.sort_values(chain + "_tcrdist", inplace=True)
 
         dfl = []
         for _, row1 in templates.iterrows():
             # check for redundancy
             redundant = False
             for row2 in dfl:
-                if (calc_chain_tcrdist(row1, row2, 'AB') <
-                    min_template_template_paired_tcrdist):
+                if (
+                    calc_chain_tcrdist(row1, row2, "AB")
+                    < min_template_template_paired_tcrdist
+                ):
                     redundant = True
-                    print('redundant template:', chain, row1.pdbid, row2.pdbid)
+                    print("redundant template:", chain, row1.pdbid, row2.pdbid)
                     break
             if redundant:
                 continue
             dfl.append(row1.copy())
-            print('new_dgeom_template:', chain, len(dfl), row1[chain+'_tcrdist'],
-                  row1.organism, row1.mhc_class, row1.mhc_allele, row1.pep_seq,
-                  row1.va, row1.cdr3a, row1.vb, row1.cdr3b)
-            if len(dfl)>=num_templates:
+            print(
+                "new_dgeom_template:",
+                chain,
+                len(dfl),
+                row1[chain + "_tcrdist"],
+                row1.organism,
+                row1.mhc_class,
+                row1.mhc_allele,
+                row1.pep_seq,
+                row1.va,
+                row1.cdr3a,
+                row1.vb,
+                row1.cdr3b,
+            )
+            if len(dfl) >= num_templates:
                 break
         all_templates[chain] = pd.DataFrame(dfl)
 
     return all_templates
 
 
-
-
-
-
 def make_templates_for_alphafold_same_pmhc(
-        organism,
-        va,
-        ja,
-        cdr3a,
-        vb,
-        jb,
-        cdr3b,
-        mhc_class,
-        mhc_allele,
-        peptide,
-        outfile_prefix,
-        num_runs=3,
-        num_templates_per_run=4,
-        min_single_chain_tcrdist=-1,
-        next_best_identity_threshold_mhc=0.98,
-        next_best_identity_threshold_tcr=0.98,
-        verbose=False,
+    organism,
+    va,
+    ja,
+    cdr3a,
+    vb,
+    jb,
+    cdr3b,
+    mhc_class,
+    mhc_allele,
+    peptide,
+    outfile_prefix,
+    num_runs=3,
+    num_templates_per_run=4,
+    min_single_chain_tcrdist=-1,
+    next_best_identity_threshold_mhc=0.98,
+    next_best_identity_threshold_tcr=0.98,
+    verbose=False,
 ):
-    ''' Only use ternary templates: no franken templates
+    """Only use ternary templates: no franken templates
 
     Exclude potential templates based on
 
@@ -1015,92 +1185,106 @@ def make_templates_for_alphafold_same_pmhc(
 
     returns None for failure
 
-    '''
+    """
     # check arguments
     if mhc_class == 2:
         assert len(peptide) == CLASS2_PEPLEN
-        assert mhc_allele.count(',') == 1
+        assert mhc_allele.count(",") == 1
 
-    tcrdister = get_tcrdister('human_and_mouse')
+    tcrdister = get_tcrdister("human_and_mouse")
 
     core_len = TCR_CORE_LEN
 
-    template_mask = ((ternary_info.organism == organism) &
-                     (ternary_info.mhc_class == mhc_class) &
-                     (ternary_info.mhc_allele == mhc_allele) &
-                     (ternary_info.pep_seq == peptide) &
-                     (~ternary_info.pdbid.isin(BAD_PMHC_PDBIDS))&
-                     (~ternary_info.pdbid.isin(BAD_DGEOM_PDBIDS)))
+    template_mask = (
+        (ternary_info.organism == organism)
+        & (ternary_info.mhc_class == mhc_class)
+        & (ternary_info.mhc_allele == mhc_allele)
+        & (ternary_info.pep_seq == peptide)
+        & (~ternary_info.pdbid.isin(BAD_PMHC_PDBIDS))
+        & (~ternary_info.pdbid.isin(BAD_DGEOM_PDBIDS))
+    )
     if not template_mask.sum():
-        print('ERROR no matching templates:: make_templates_for_alphafold_same_pmhc',
-              organism, mhc_class, mhc_allele, peptide)
+        print(
+            "ERROR no matching templates:: make_templates_for_alphafold_same_pmhc",
+            organism,
+            mhc_class,
+            mhc_allele,
+            peptide,
+        )
     templates = ternary_info[template_mask]
-    print('make_templates_for_alphafold_same_pmhc num_templates=', templates.shape[0])
-
+    print(
+        "make_templates_for_alphafold_same_pmhc num_templates=",
+        templates.shape[0],
+    )
 
     # get MHC align-seqs
     if mhc_class == 1:
         trg_mhc_alseqs = [get_mhc_class_1_alseq(mhc_allele)]
     else:
-        trg_mhc_alseqs = [get_mhc_class_2_alseq('A', mhc_allele.split(',')[0]),
-                          get_mhc_class_2_alseq('B', mhc_allele.split(',')[1])]
-
+        trg_mhc_alseqs = [
+            get_mhc_class_2_alseq("A", mhc_allele.split(",")[0]),
+            get_mhc_class_2_alseq("B", mhc_allele.split(",")[1]),
+        ]
 
     return None
 
+
 def align_vgene_to_structure_msas(organism, v_gene):
     msa_alignments = {}
 
     vseq = get_v_seq_up_to_cys(organism, v_gene)
-    for msa_type in ['both','human']:
-        if msa_type == 'human':
-            if organism != 'human':
+    for msa_type in ["both", "human"]:
+        if msa_type == "human":
+            if organism != "human":
                 continue
             row = human_structure_alignments.loc[v_gene]
         else:
             row = both_structure_alignments.loc[(organism, v_gene)]
-        assert row.alseq.replace(ALL_GENES_GAP_CHAR,'') == vseq
-        vseq_to_alseq = {k-row.alseq[:k].count(ALL_GENES_GAP_CHAR) : k
-                         for k,a in enumerate(row.alseq) if a!=ALL_GENES_GAP_CHAR}
+        assert row.alseq.replace(ALL_GENES_GAP_CHAR, "") == vseq
+        vseq_to_alseq = {
+            k - row.alseq[:k].count(ALL_GENES_GAP_CHAR): k
+            for k, a in enumerate(row.alseq)
+            if a != ALL_GENES_GAP_CHAR
+        }
         msa_alignments[msa_type] = vseq_to_alseq
     return msa_alignments
 
 
 def make_templates_for_alphafold(
-        organism,
-        va,
-        ja,
-        cdr3a,
-        vb,
-        jb,
-        cdr3b,
-        mhc_class,
-        mhc_allele,
-        peptide,
-        outfile_prefix,
-        num_runs=5, # match default in setup_for_alphafold
-        num_templates_per_run=4,
-        exclude_self_peptide_docking_geometries=False,
-        exclude_docking_geometry_peptides=None, # None or list of peptides
-        # below is only applied if exclude_docking_geometry_peptides is nonempty
-        #   or exclude_self_peptide_docking_geometries
-        min_dgeom_peptide_mismatches=3, # not used if exclude_* are False/None
-        min_dgeom_paired_tcrdist=-1,
-        min_dgeom_singlechain_tcrdist=-1,
-        min_single_chain_tcrdist=-1,
-        min_pmhc_peptide_mismatches=-1,
-        next_best_identity_threshold_mhc=0.98,
-        next_best_identity_threshold_tcr=0.98,
-        ternary_bonus=0.05, # for tcr templates, in frac identities
-        alt_self_peptides=None, # or list of peptides
-        verbose=False,
-        pick_dgeoms_using_tcrdist=False, # implies num_runs=3
-        use_same_pmhc_dgeoms=False,
-        exclude_pdbids=None,
-        force_pmhc_pdbids=None,
-        use_opt_dgeoms=False,
+    organism,
+    va,
+    ja,
+    cdr3a,
+    vb,
+    jb,
+    cdr3b,
+    mhc_class,
+    mhc_allele,
+    peptide,
+    outfile_prefix,
+    num_runs=5,  # match default in setup_for_alphafold
+    num_templates_per_run=4,
+    exclude_self_peptide_docking_geometries=False,
+    exclude_docking_geometry_peptides=None,  # None or list of peptides
+    # below is only applied if exclude_docking_geometry_peptides is nonempty
+    #   or exclude_self_peptide_docking_geometries
+    min_dgeom_peptide_mismatches=3,  # not used if exclude_* are False/None
+    min_dgeom_paired_tcrdist=-1,
+    min_dgeom_singlechain_tcrdist=-1,
+    min_single_chain_tcrdist=-1,
+    min_pmhc_peptide_mismatches=-1,
+    next_best_identity_threshold_mhc=0.98,
+    next_best_identity_threshold_tcr=0.98,
+    ternary_bonus=0.05,  # for tcr templates, in frac identities
+    alt_self_peptides=None,  # or list of peptides
+    verbose=False,
+    pick_dgeoms_using_tcrdist=False,  # implies num_runs=3
+    use_same_pmhc_dgeoms=False,
+    exclude_pdbids=None,
+    force_pmhc_pdbids=None,
+    use_opt_dgeoms=False,
 ):
-    ''' Makes num_templates_per_run * num_runs template pdb files
+    """Makes num_templates_per_run * num_runs template pdb files
 
     Make num_runs alignfiles <outfile_prefix>_<run>_alignments.tsv
 
@@ -1109,23 +1293,27 @@ def make_templates_for_alphafold(
 
     returns None for failure
 
-    '''
-    from .pdblite import (apply_transform_Rx_plus_v, delete_chains, append_chains,
-                          dump_pdb)
+    """
+    from .pdblite import (
+        apply_transform_Rx_plus_v,
+        delete_chains,
+        append_chains,
+        dump_pdb,
+    )
 
     if exclude_pdbids is None:
         exclude_pdbids = []
     else:
         exclude_pdbids = frozenset(exclude_pdbids)
-        print(f'will exclude {len(exclude_pdbids)} pdbids')
+        print(f"will exclude {len(exclude_pdbids)} pdbids")
 
     # check arguments
     if mhc_class == 2:
         assert len(peptide) == CLASS2_PEPLEN
-        assert mhc_allele.count(',') == 1
+        assert mhc_allele.count(",") == 1
 
     if pick_dgeoms_using_tcrdist:
-        assert num_runs == 3 # AB, A, B
+        assert num_runs == 3  # AB, A, B
 
     if use_opt_dgeoms:
         assert num_runs == 1
@@ -1138,238 +1326,331 @@ def make_templates_for_alphafold(
     if exclude_self_peptide_docking_geometries:
         # dont modify the passed-in list
         exclude_docking_geometry_peptides = (
-            exclude_docking_geometry_peptides+[peptide]+alt_self_peptides)
-
-    tcrdister = get_tcrdister('human_and_mouse')
+            exclude_docking_geometry_peptides + [peptide] + alt_self_peptides
+        )
 
+    tcrdister = get_tcrdister("human_and_mouse")
 
-    def show_alignment(al,seq1,seq2):
+    def show_alignment(al, seq1, seq2):
         if verbose:
-            for i,j in sorted(al.items()):
-                a,b = seq1[i], seq2[j]
-                star = '*' if a==b else ' '
-                print(f'{i:4d} {j:4d} {a} {star} {b}')
-            idents = sum(seq1[i] == seq2[j] for i,j in al.items())/len(seq1)
-            print(f'idents: {idents:6.3f}')
+            for i, j in sorted(al.items()):
+                a, b = seq1[i], seq2[j]
+                star = "*" if a == b else " "
+                print(f"{i:4d} {j:4d} {a} {star} {b}")
+            idents = sum(seq1[i] == seq2[j] for i, j in al.items()) / len(seq1)
+            print(f"idents: {idents:6.3f}")
 
     core_len = TCR_CORE_LEN
 
     if mhc_class == 1:
-        if organism=='human':
+        if organism == "human":
             # now adding HLA-E 2022-05-03
-            assert mhc_allele[0] in 'ABCE' and mhc_allele[1]=='*' and ':' in mhc_allele
-            mhc_allele = ':'.join(mhc_allele.split(':')[:2]) # just the 4 digits
+            assert (
+                mhc_allele[0] in "ABCE"
+                and mhc_allele[1] == "*"
+                and ":" in mhc_allele
+            )
+            mhc_allele = ":".join(
+                mhc_allele.split(":")[:2]
+            )  # just the 4 digits
         else:
-            assert mhc_allele.startswith('H2') and mhc_allele in mhc_class_1_alfas
+            assert (
+                mhc_allele.startswith("H2") and mhc_allele in mhc_class_1_alfas
+            )
 
         # first: MHC part
         trg_mhc_alseq = get_mhc_class_1_alseq(mhc_allele)
-        trg_mhc_seq = trg_mhc_alseq.replace(ALL_GENES_GAP_CHAR,'')
+        trg_mhc_seq = trg_mhc_alseq.replace(ALL_GENES_GAP_CHAR, "")
 
         sortl = []
 
         # use new pmhc-only data
         for l in pmhc_info.itertuples():
-            if (l.organism!=organism or l.mhc_class!=mhc_class or
-                l.pdbid in BAD_PMHC_PDBIDS or l.pdbid in exclude_pdbids or
-                (force_pmhc_pdbids and l.pdbid not in force_pmhc_pdbids)):
+            if (
+                l.organism != organism
+                or l.mhc_class != mhc_class
+                or l.pdbid in BAD_PMHC_PDBIDS
+                or l.pdbid in exclude_pdbids
+                or (force_pmhc_pdbids and l.pdbid not in force_pmhc_pdbids)
+            ):
                 continue
 
             tmp_mhc_alseq = l.mhc_alignseq
             assert len(trg_mhc_alseq) == len(tmp_mhc_alseq)
 
-            mhc_idents = sum(a==b and a!=ALL_GENES_GAP_CHAR
-                             for a,b in zip(trg_mhc_alseq, tmp_mhc_alseq))
+            mhc_idents = sum(
+                a == b and a != ALL_GENES_GAP_CHAR
+                for a, b in zip(trg_mhc_alseq, tmp_mhc_alseq)
+            )
 
             if len(peptide) == len(l.pep_seq):
-                pep_idents = sum(a==b for a,b in zip(peptide,l.pep_seq))
+                pep_idents = sum(a == b for a, b in zip(peptide, l.pep_seq))
             else:
-                pep_idents = sum(a==b for a,b in zip(peptide[:3]+peptide[-3:],
-                                                     l.pep_seq[:3]+l.pep_seq[-3:]))
+                pep_idents = sum(
+                    a == b
+                    for a, b in zip(
+                        peptide[:3] + peptide[-3:],
+                        l.pep_seq[:3] + l.pep_seq[-3:],
+                    )
+                )
             mismatches_for_excluding = min(
                 count_peptide_mismatches(x, l.pep_seq)
-                for x in [peptide]+alt_self_peptides)
+                for x in [peptide] + alt_self_peptides
+            )
             if mismatches_for_excluding < min_pmhc_peptide_mismatches:
                 if verbose:
-                    print('peptide too close:', peptide, l.pep_seq,
-                          'mismatches_for_excluding:', mismatches_for_excluding,
-                          alt_self_peptides)
+                    print(
+                        "peptide too close:",
+                        peptide,
+                        l.pep_seq,
+                        "mismatches_for_excluding:",
+                        mismatches_for_excluding,
+                        alt_self_peptides,
+                    )
                 continue
-            assert len(peptide)-pep_idents >= min_pmhc_peptide_mismatches #sanity
-            total = len(peptide)+len(trg_mhc_seq)
-            frac = (mhc_idents+pep_idents)/total - 0.01*l.mhc_total_chainbreak
+            assert (
+                len(peptide) - pep_idents >= min_pmhc_peptide_mismatches
+            )  # sanity
+            total = len(peptide) + len(trg_mhc_seq)
+            frac = (
+                mhc_idents + pep_idents
+            ) / total - 0.01 * l.mhc_total_chainbreak
             sortl.append((frac, l.Index))
 
         sortl.sort(reverse=True)
         max_idents = sortl[0][0]
-        print(f'mhc max_idents: {max_idents:.3f}', mhc_allele, peptide,
-              pmhc_info.loc[sortl[0][1], 'mhc_allele'],
-              pmhc_info.loc[sortl[0][1], 'pep_seq'])
+        print(
+            f"mhc max_idents: {max_idents:.3f}",
+            mhc_allele,
+            peptide,
+            pmhc_info.loc[sortl[0][1], "mhc_allele"],
+            pmhc_info.loc[sortl[0][1], "pep_seq"],
+        )
 
         pmhc_alignments = []
-        for (idents, pdbid) in sortl[:num_templates_per_run]:
-            if idents < next_best_identity_threshold_tcr*max_idents:
+        for idents, pdbid in sortl[:num_templates_per_run]:
+            if idents < next_best_identity_threshold_tcr * max_idents:
                 break
             templatel = pmhc_info.loc[pdbid]
             tmp_mhc_alseq = templatel.mhc_alignseq
-            tmp_seql = templatel.chainseq.split('/')
-            assert len(tmp_seql)==2 and tmp_seql[1] == templatel.pep_seq
-            tmp_mhc_seq = tmp_seql[0] # class 1
-            tmp_mhc_alseq_seq = tmp_mhc_alseq.replace(ALL_GENES_GAP_CHAR,'')
+            tmp_seql = templatel.chainseq.split("/")
+            assert len(tmp_seql) == 2 and tmp_seql[1] == templatel.pep_seq
+            tmp_mhc_seq = tmp_seql[0]  # class 1
+            tmp_mhc_alseq_seq = tmp_mhc_alseq.replace(ALL_GENES_GAP_CHAR, "")
             assert tmp_mhc_alseq_seq in tmp_mhc_seq
             npad = tmp_mhc_seq.index(tmp_mhc_alseq_seq)
-            #al1 = blosum_align(tmp_mhc_seq, tmp_mhc_alseq_seq)
-            al1 = {i+npad:i for i in range(len(tmp_mhc_alseq_seq))}
+            # al1 = blosum_align(tmp_mhc_seq, tmp_mhc_alseq_seq)
+            al1 = {i + npad: i for i in range(len(tmp_mhc_alseq_seq))}
 
-            al2 = {i-tmp_mhc_alseq[:i].count(ALL_GENES_GAP_CHAR):
-                   i-trg_mhc_alseq[:i].count(ALL_GENES_GAP_CHAR)
-                   for i,(a,b) in enumerate(zip(tmp_mhc_alseq, trg_mhc_alseq))
-                   if a != ALL_GENES_GAP_CHAR and b != ALL_GENES_GAP_CHAR}
+            al2 = {
+                i
+                - tmp_mhc_alseq[:i].count(ALL_GENES_GAP_CHAR): i
+                - trg_mhc_alseq[:i].count(ALL_GENES_GAP_CHAR)
+                for i, (a, b) in enumerate(zip(tmp_mhc_alseq, trg_mhc_alseq))
+                if a != ALL_GENES_GAP_CHAR and b != ALL_GENES_GAP_CHAR
+            }
 
-            tmp_to_trg = {x:al2[y] for x,y in al1.items() if y in al2}
-            trg_to_tmp = {y:x for x,y in tmp_to_trg.items()}
+            tmp_to_trg = {x: al2[y] for x, y in al1.items() if y in al2}
+            trg_to_tmp = {y: x for x, y in tmp_to_trg.items()}
 
             trg_offset = len(trg_mhc_seq)
             tmp_offset = len(tmp_mhc_seq)
             trg_peplen, tmp_peplen = len(peptide), len(templatel.pep_seq)
             if trg_peplen == tmp_peplen:
                 for i in range(trg_peplen):
-                    trg_to_tmp[trg_offset+i] = tmp_offset+i
+                    trg_to_tmp[trg_offset + i] = tmp_offset + i
             else:
                 for i in range(3):
-                    trg_to_tmp[trg_offset+i] = tmp_offset+i
-                for i in [-3,-2,-1]:
-                    trg_to_tmp[trg_offset+trg_peplen+i] = tmp_offset+tmp_peplen+i
+                    trg_to_tmp[trg_offset + i] = tmp_offset + i
+                for i in [-3, -2, -1]:
+                    trg_to_tmp[trg_offset + trg_peplen + i] = (
+                        tmp_offset + tmp_peplen + i
+                    )
             trg_pmhc_seq = trg_mhc_seq + peptide
             tmp_pmhc_seq = tmp_mhc_seq + templatel.pep_seq
-            identities = sum(trg_pmhc_seq[i] == tmp_pmhc_seq[j]
-                             for i,j in trg_to_tmp.items())/len(trg_pmhc_seq)
-            identities_for_sorting = (identities - 0.01*templatel.mhc_total_chainbreak)
-            assert abs(identities_for_sorting-idents)<1e-3
+            identities = sum(
+                trg_pmhc_seq[i] == tmp_pmhc_seq[j]
+                for i, j in trg_to_tmp.items()
+            ) / len(trg_pmhc_seq)
+            identities_for_sorting = (
+                identities - 0.01 * templatel.mhc_total_chainbreak
+            )
+            assert abs(identities_for_sorting - idents) < 1e-3
 
             if verbose:
-                print(f'oldnew mhc_idents: {idents:6.3f} {identities:6.3f} {pdbid}')
+                print(
+                    f"oldnew mhc_idents: {idents:6.3f} {identities:6.3f} {pdbid}"
+                )
             show_alignment(trg_to_tmp, trg_pmhc_seq, tmp_pmhc_seq)
-            pmhc_alignments.append((identities_for_sorting,
-                                    pdbid,
-                                    trg_to_tmp,
-                                    trg_pmhc_seq,
-                                    tmp_pmhc_seq,
-                                    identities,
-            ))
-    else: # class II
+            pmhc_alignments.append(
+                (
+                    identities_for_sorting,
+                    pdbid,
+                    trg_to_tmp,
+                    trg_pmhc_seq,
+                    tmp_pmhc_seq,
+                    identities,
+                )
+            )
+    else:  # class II
         #
-        trg_mhca_alseq = get_mhc_class_2_alseq('A', mhc_allele.split(',')[0])
-        trg_mhcb_alseq = get_mhc_class_2_alseq('B', mhc_allele.split(',')[1])
-        trg_mhca_seq = trg_mhca_alseq.replace(ALL_GENES_GAP_CHAR,'')
-        trg_mhcb_seq = trg_mhcb_alseq.replace(ALL_GENES_GAP_CHAR,'')
+        trg_mhca_alseq = get_mhc_class_2_alseq("A", mhc_allele.split(",")[0])
+        trg_mhcb_alseq = get_mhc_class_2_alseq("B", mhc_allele.split(",")[1])
+        trg_mhca_seq = trg_mhca_alseq.replace(ALL_GENES_GAP_CHAR, "")
+        trg_mhcb_seq = trg_mhcb_alseq.replace(ALL_GENES_GAP_CHAR, "")
         trg_pmhc_seq = trg_mhca_seq + trg_mhcb_seq + peptide
 
         sortl = []
         for l in ternary_info.itertuples():
-            if (l.organism!=organism or l.mhc_class!=mhc_class or
-                l.pdbid in BAD_PMHC_PDBIDS or l.pdbid in exclude_pdbids):
+            if (
+                l.organism != organism
+                or l.mhc_class != mhc_class
+                or l.pdbid in BAD_PMHC_PDBIDS
+                or l.pdbid in exclude_pdbids
+            ):
                 continue
             mismatches_for_excluding = min(
                 count_peptide_mismatches(x, l.pep_seq)
-                for x in [peptide]+alt_self_peptides)
+                for x in [peptide] + alt_self_peptides
+            )
 
             if mismatches_for_excluding < min_pmhc_peptide_mismatches:
                 if verbose:
-                    print('peptide too close:', peptide, l.pep_seq,
-                          mismatches_for_excluding, alt_self_peptides)
+                    print(
+                        "peptide too close:",
+                        peptide,
+                        l.pep_seq,
+                        mismatches_for_excluding,
+                        alt_self_peptides,
+                    )
                 continue
-            tmp_mhca_alseq, tmp_mhcb_alseq = l.mhc_alignseq.split('/')
+            tmp_mhca_alseq, tmp_mhcb_alseq = l.mhc_alignseq.split("/")
             idents = 0
-            for a,b in zip([trg_mhca_alseq, trg_mhcb_alseq, peptide],
-                           [tmp_mhca_alseq, tmp_mhcb_alseq, l.pep_seq]):
+            for a, b in zip(
+                [trg_mhca_alseq, trg_mhcb_alseq, peptide],
+                [tmp_mhca_alseq, tmp_mhcb_alseq, l.pep_seq],
+            ):
                 assert len(a) == len(b)
-                idents += sum(x==y for x,y in zip(a,b) if x!=ALL_GENES_GAP_CHAR)
-            sortl.append((idents/len(trg_pmhc_seq), l.pdbid))
+                idents += sum(
+                    x == y for x, y in zip(a, b) if x != ALL_GENES_GAP_CHAR
+                )
+            sortl.append((idents / len(trg_pmhc_seq), l.pdbid))
         sortl.sort(reverse=True)
         max_idents = sortl[0][0]
-        print(f'mhc max_idents: {max_idents:.3f}', mhc_allele, peptide,
-              ternary_info.loc[sortl[0][1], 'mhc_allele'],
-              ternary_info.loc[sortl[0][1], 'pep_seq'])
+        print(
+            f"mhc max_idents: {max_idents:.3f}",
+            mhc_allele,
+            peptide,
+            ternary_info.loc[sortl[0][1], "mhc_allele"],
+            ternary_info.loc[sortl[0][1], "pep_seq"],
+        )
 
         pmhc_alignments = []
-        for (idents, pdbid) in sortl[:num_templates_per_run]:
-            if idents < next_best_identity_threshold_tcr*max_idents:
+        for idents, pdbid in sortl[:num_templates_per_run]:
+            if idents < next_best_identity_threshold_tcr * max_idents:
                 break
             templatel = ternary_info.loc[pdbid]
-            tmp_mhca_alseq, tmp_mhcb_alseq = templatel.mhc_alignseq.split('/')
-            mhca_part = tmp_mhca_alseq.replace(ALL_GENES_GAP_CHAR,'')
-            mhcb_part = tmp_mhcb_alseq.replace(ALL_GENES_GAP_CHAR,'')
-            tmp_mhca_seq, tmp_mhcb_seq = templatel.chainseq.split('/')[:2]
+            tmp_mhca_alseq, tmp_mhcb_alseq = templatel.mhc_alignseq.split("/")
+            mhca_part = tmp_mhca_alseq.replace(ALL_GENES_GAP_CHAR, "")
+            mhcb_part = tmp_mhcb_alseq.replace(ALL_GENES_GAP_CHAR, "")
+            tmp_mhca_seq, tmp_mhcb_seq = templatel.chainseq.split("/")[:2]
             assert mhca_part in tmp_mhca_seq and mhcb_part in tmp_mhcb_seq
             mhca_npad = tmp_mhca_seq.find(mhca_part)
             mhcb_npad = tmp_mhcb_seq.find(mhcb_part)
 
             trg_offset, tmp_offset = 0, mhca_npad
-            al1 = {i-trg_mhca_alseq[:i].count(ALL_GENES_GAP_CHAR)+trg_offset:
-                   i-tmp_mhca_alseq[:i].count(ALL_GENES_GAP_CHAR)+tmp_offset
-                   for i,(a,b) in enumerate(zip(trg_mhca_alseq, tmp_mhca_alseq))
-                   if a != ALL_GENES_GAP_CHAR and b != ALL_GENES_GAP_CHAR}
+            al1 = {
+                i
+                - trg_mhca_alseq[:i].count(ALL_GENES_GAP_CHAR)
+                + trg_offset: i
+                - tmp_mhca_alseq[:i].count(ALL_GENES_GAP_CHAR)
+                + tmp_offset
+                for i, (a, b) in enumerate(zip(trg_mhca_alseq, tmp_mhca_alseq))
+                if a != ALL_GENES_GAP_CHAR and b != ALL_GENES_GAP_CHAR
+            }
             trg_offset = len(trg_mhca_seq)
-            tmp_offset = len(tmp_mhca_seq)+mhcb_npad
-            al2 = {i-trg_mhcb_alseq[:i].count(ALL_GENES_GAP_CHAR)+trg_offset:
-                   i-tmp_mhcb_alseq[:i].count(ALL_GENES_GAP_CHAR)+tmp_offset
-                   for i,(a,b) in enumerate(zip(trg_mhcb_alseq, tmp_mhcb_alseq))
-                   if a != ALL_GENES_GAP_CHAR and b != ALL_GENES_GAP_CHAR}
-            trg_offset = len(trg_mhca_seq)+len(trg_mhcb_seq)
-            tmp_offset = len(tmp_mhca_seq)+len(tmp_mhcb_seq)
-            al3 = {i+trg_offset:i+tmp_offset for i in range(CLASS2_PEPLEN)}
+            tmp_offset = len(tmp_mhca_seq) + mhcb_npad
+            al2 = {
+                i
+                - trg_mhcb_alseq[:i].count(ALL_GENES_GAP_CHAR)
+                + trg_offset: i
+                - tmp_mhcb_alseq[:i].count(ALL_GENES_GAP_CHAR)
+                + tmp_offset
+                for i, (a, b) in enumerate(zip(trg_mhcb_alseq, tmp_mhcb_alseq))
+                if a != ALL_GENES_GAP_CHAR and b != ALL_GENES_GAP_CHAR
+            }
+            trg_offset = len(trg_mhca_seq) + len(trg_mhcb_seq)
+            tmp_offset = len(tmp_mhca_seq) + len(tmp_mhcb_seq)
+            al3 = {
+                i + trg_offset: i + tmp_offset for i in range(CLASS2_PEPLEN)
+            }
             trg_to_tmp = {**al1, **al2, **al3}
             tmp_pmhc_seq = tmp_mhca_seq + tmp_mhcb_seq + templatel.pep_seq
-            idents_redo = sum(trg_pmhc_seq[i] == tmp_pmhc_seq[j]
-                              for i,j in trg_to_tmp.items())/len(trg_pmhc_seq)
-            #print(f'oldnew mhc_idents: {idents:6.3f} {idents_redo:6.3f} {pdbid}')
-            assert abs(idents-idents_redo)<1e-4
+            idents_redo = sum(
+                trg_pmhc_seq[i] == tmp_pmhc_seq[j]
+                for i, j in trg_to_tmp.items()
+            ) / len(trg_pmhc_seq)
+            # print(f'oldnew mhc_idents: {idents:6.3f} {idents_redo:6.3f} {pdbid}')
+            assert abs(idents - idents_redo) < 1e-4
             show_alignment(trg_to_tmp, trg_pmhc_seq, tmp_pmhc_seq)
-            pmhc_alignments.append((idents,
-                                    pdbid,
-                                    trg_to_tmp,
-                                    trg_pmhc_seq,
-                                    tmp_pmhc_seq,
-                                    idents,
-            ))
-
-
-
-
+            pmhc_alignments.append(
+                (
+                    idents,
+                    pdbid,
+                    trg_to_tmp,
+                    trg_pmhc_seq,
+                    tmp_pmhc_seq,
+                    idents,
+                )
+            )
 
-    tcr_alignments = {'A':[], 'B':[]}
+    tcr_alignments = {"A": [], "B": []}
 
     # compute single-chain tcrdists to candidate template chains
 
     # drop out of the loop if vdist hits this value and we've already got enough
     # templates
-    big_v_dist=50
+    big_v_dist = 50
 
-    for ab, trg_v, trg_j, trg_cdr3 in  [['A',va,ja,cdr3a],['B',vb,jb,cdr3b]]:
-        trg_tcr = (organism[0]+trg_v, trg_j, trg_cdr3)
+    for ab, trg_v, trg_j, trg_cdr3 in [
+        ["A", va, ja, cdr3a],
+        ["B", vb, jb, cdr3b],
+    ]:
+        trg_tcr = (organism[0] + trg_v, trg_j, trg_cdr3)
 
         trg_msa_alignments = align_vgene_to_structure_msas(organism, trg_v)
 
-        templates = tcr_info[tcr_info.ab==ab]
+        templates = tcr_info[tcr_info.ab == ab]
         templates = templates[~templates.pdbid.isin(exclude_pdbids)]
         template_tcrs = [
-            (x.organism[0]+x.v_gene, None, x.cdr3) for x in templates.itertuples()]
-        #templates['v_gene j_gene cdr3'.split()].itertuples(index=False))
-        closest_tcrs = [(x[0],x[1],trg_cdr3) for x in template_tcrs] # tmp v, trg cdr3
+            (x.organism[0] + x.v_gene, None, x.cdr3)
+            for x in templates.itertuples()
+        ]
+        # templates['v_gene j_gene cdr3'.split()].itertuples(index=False))
+        closest_tcrs = [
+            (x[0], x[1], trg_cdr3) for x in template_tcrs
+        ]  # tmp v, trg cdr3
         sortl = sorted(
-            [(tcrdister.single_chain_distance(trg_tcr,x),
-              tcrdister.single_chain_distance(trg_tcr,y),
-              i)
-             for i,(x,y) in enumerate(zip(closest_tcrs, template_tcrs))])
+            [
+                (
+                    tcrdister.single_chain_distance(trg_tcr, x),
+                    tcrdister.single_chain_distance(trg_tcr, y),
+                    i,
+                )
+                for i, (x, y) in enumerate(zip(closest_tcrs, template_tcrs))
+            ]
+        )
 
         alignments = []
         for closest_dist, dist, ind in sortl:
             if dist < min_single_chain_tcrdist:
-                #print('too close:', dist, trg_v, trg_j, trg_cdr3, template_tcrs[ind])
+                # print('too close:', dist, trg_v, trg_j, trg_cdr3, template_tcrs[ind])
                 continue
-            if closest_dist > big_v_dist and len(alignments)>=num_templates_per_run:
-                #print('too far:', closest_dist)
+            if (
+                closest_dist > big_v_dist
+                and len(alignments) >= num_templates_per_run
+            ):
+                # print('too far:', closest_dist)
                 break
 
             templatel = templates.iloc[ind]
@@ -1380,68 +1661,99 @@ def show_alignment(al,seq1,seq2):
             #     msa_type = 'both'
 
             trg_to_tmp, trg_chainseq = align_vgene_to_template_pdb_chain(
-                ab, organism, trg_v, trg_j, trg_cdr3, trg_msa_alignments,
+                ab,
+                organism,
+                trg_v,
+                trg_j,
+                trg_cdr3,
+                trg_msa_alignments,
                 templatel.pdbid,
             )
 
             # tmp_chainseq_to_msa = align_tcr_info_pdb_chain_to_structure_msa(
             #     templatel.pdbid, templatel.ab, msa_type)
 
-            identities = sum(trg_chainseq[i]==templatel.chainseq[j]
-                             for i,j in trg_to_tmp.items()) / len(trg_chainseq)
+            identities = sum(
+                trg_chainseq[i] == templatel.chainseq[j]
+                for i, j in trg_to_tmp.items()
+            ) / len(trg_chainseq)
 
-            identities_for_sorting = identities + ternary_bonus * templatel.ternary
+            identities_for_sorting = (
+                identities + ternary_bonus * templatel.ternary
+            )
 
             alignments.append(
-                (identities_for_sorting,
-                 templatel.pdbid,
-                 trg_to_tmp,
-                 trg_chainseq,
-                 templatel.chainseq,
-                 closest_dist,
-                 identities,
-                ))
+                (
+                    identities_for_sorting,
+                    templatel.pdbid,
+                    trg_to_tmp,
+                    trg_chainseq,
+                    templatel.chainseq,
+                    closest_dist,
+                    identities,
+                )
+            )
         alignments.sort(reverse=True)
         max_idents = alignments[0][0]
-        tcr_alignments[ab] = [x for x in alignments[:num_templates_per_run]
-                              if x[0] >= next_best_identity_threshold_tcr*max_idents]
+        tcr_alignments[ab] = [
+            x
+            for x in alignments[:num_templates_per_run]
+            if x[0] >= next_best_identity_threshold_tcr * max_idents
+        ]
 
     # docking geometries
     # exclude same-epitope geoms
     if pick_dgeoms_using_tcrdist:
         dgeom_info_by_chain = pick_dgeom_templates(
-            num_templates_per_run, organism, mhc_class, mhc_allele,
-            peptide, va, cdr3a, vb, cdr3b,
-            min_peptide_mismatches = min_dgeom_peptide_mismatches,
-            min_paired_tcrdist = min_dgeom_paired_tcrdist,
-            min_singlechain_tcrdist = min_dgeom_singlechain_tcrdist,
-            peptides_for_filtering = exclude_docking_geometry_peptides,
-            exclude_pdbids = exclude_pdbids,
+            num_templates_per_run,
+            organism,
+            mhc_class,
+            mhc_allele,
+            peptide,
+            va,
+            cdr3a,
+            vb,
+            cdr3b,
+            min_peptide_mismatches=min_dgeom_peptide_mismatches,
+            min_paired_tcrdist=min_dgeom_paired_tcrdist,
+            min_singlechain_tcrdist=min_dgeom_singlechain_tcrdist,
+            peptides_for_filtering=exclude_docking_geometry_peptides,
+            exclude_pdbids=exclude_pdbids,
         )
     elif use_same_pmhc_dgeoms:
         # require matching organism, mhc, peptide with dgeoms
         dgeom_info = ternary_info[ternary_info.mhc_class == mhc_class]
         dgeom_info = dgeom_info[~dgeom_info.pdbid.isin(BAD_DGEOM_PDBIDS)]
         dgeom_info = dgeom_info[~dgeom_info.pdbid.isin(exclude_pdbids)]
-        print('same mhc_class', dgeom_info.shape)
+        print("same mhc_class", dgeom_info.shape)
         dgeom_info = dgeom_info[dgeom_info.organism == organism]
-        print('same organism', dgeom_info.shape)
+        print("same organism", dgeom_info.shape)
         dgeom_info = dgeom_info[dgeom_info.mhc_allele == mhc_allele]
-        print('same mhc_allele', dgeom_info.shape)
+        print("same mhc_allele", dgeom_info.shape)
         dgeom_info = dgeom_info[dgeom_info.pep_seq == peptide]
-        print('same peptide', dgeom_info.shape)
-        if dgeom_info.shape[0]==0:
-            print('ERROR no matching pmhc templates for dgeom:',
-                  organism, mhc_class, mhc_allele, peptide)
+        print("same peptide", dgeom_info.shape)
+        if dgeom_info.shape[0] == 0:
+            print(
+                "ERROR no matching pmhc templates for dgeom:",
+                organism,
+                mhc_class,
+                mhc_allele,
+                peptide,
+            )
             exit()
-        dgeoms = [DockingGeometry().from_dict(x) for _,x in dgeom_info.iterrows()]
-        if num_runs*num_templates_per_run > len(dgeoms):
+        dgeoms = [
+            DockingGeometry().from_dict(x) for _, x in dgeom_info.iterrows()
+        ]
+        if num_runs * num_templates_per_run > len(dgeoms):
             rep_dgeom_indices = np.random.permutation(len(dgeoms))
             rep_dgeoms = [dgeoms[x] for x in rep_dgeom_indices]
         else:
-            dummy_organism = 'human' # just used for avg cdr coords
-            rep_dgeoms, rep_dgeom_indices = docking_geometry.pick_docking_geometry_reps(
-                dummy_organism, dgeoms, num_runs*num_templates_per_run)
+            dummy_organism = "human"  # just used for avg cdr coords
+            rep_dgeoms, rep_dgeom_indices = (
+                docking_geometry.pick_docking_geometry_reps(
+                    dummy_organism, dgeoms, num_runs * num_templates_per_run
+                )
+            )
     elif use_opt_dgeoms:
         rep_dgeoms = docking_geometry.load_opt_dgeoms(mhc_class)
         assert len(rep_dgeoms) == 4
@@ -1449,74 +1761,101 @@ def show_alignment(al,seq1,seq2):
     else:
 
         dgeom_info = ternary_info[ternary_info.mhc_class == mhc_class]
-        if organism=='human': # restrict to human docks
+        if organism == "human":  # restrict to human docks
             dgeom_info = dgeom_info[dgeom_info.organism == organism]
         dgeom_info = dgeom_info[~dgeom_info.pdbid.isin(BAD_DGEOM_PDBIDS)]
         dgeom_info = dgeom_info[~dgeom_info.pdbid.isin(exclude_pdbids)]
         dgeom_info = filter_templates_by_tcrdist(
-            dgeom_info, organism, va, cdr3a, vb, cdr3b,
+            dgeom_info,
+            organism,
+            va,
+            cdr3a,
+            vb,
+            cdr3b,
             min_paired_tcrdist=min_dgeom_paired_tcrdist,
             min_singlechain_tcrdist=min_dgeom_singlechain_tcrdist,
         )
         dgeom_info = filter_templates_by_peptide_mismatches(
-            dgeom_info, organism, mhc_class, exclude_docking_geometry_peptides,
-            min_dgeom_peptide_mismatches)
-        dgeoms = [DockingGeometry().from_dict(x) for _,x in dgeom_info.iterrows()]
+            dgeom_info,
+            organism,
+            mhc_class,
+            exclude_docking_geometry_peptides,
+            min_dgeom_peptide_mismatches,
+        )
+        dgeoms = [
+            DockingGeometry().from_dict(x) for _, x in dgeom_info.iterrows()
+        ]
 
-        if num_runs*num_templates_per_run > len(dgeoms):
+        if num_runs * num_templates_per_run > len(dgeoms):
             rep_dgeom_indices = np.random.permutation(len(dgeoms))
             rep_dgeoms = [dgeoms[x] for x in rep_dgeom_indices]
         else:
-            dummy_organism = 'human' # just used for avg cdr coords
-            rep_dgeoms, rep_dgeom_indices = docking_geometry.pick_docking_geometry_reps(
-                dummy_organism, dgeoms, num_runs*num_templates_per_run)
+            dummy_organism = "human"  # just used for avg cdr coords
+            rep_dgeoms, rep_dgeom_indices = (
+                docking_geometry.pick_docking_geometry_reps(
+                    dummy_organism, dgeoms, num_runs * num_templates_per_run
+                )
+            )
 
-    #print('dgeoms:', len(dgeoms), len(rep_dgeoms))
+    # print('dgeoms:', len(dgeoms), len(rep_dgeoms))
 
-    trg_pmhc_seq = pmhc_alignments[0][3] # check for consistency below
-    trg_tcra_seq = tcr_alignments['A'][0][3] # ditto
-    trg_tcrb_seq = tcr_alignments['B'][0][3] # ditto
+    trg_pmhc_seq = pmhc_alignments[0][3]  # check for consistency below
+    trg_tcra_seq = tcr_alignments["A"][0][3]  # ditto
+    trg_tcrb_seq = tcr_alignments["B"][0][3]  # ditto
 
     # now make the template pdbs
 
     dfl = []
     for run in range(num_runs):
         for itmp in range(num_templates_per_run):
-            pmhc_al = pmhc_alignments[itmp%len(pmhc_alignments)]
-            tcra_al = tcr_alignments['A'][itmp%len(tcr_alignments['A'])]
-            tcrb_al = tcr_alignments['B'][itmp%len(tcr_alignments['B'])]
+            pmhc_al = pmhc_alignments[itmp % len(pmhc_alignments)]
+            tcra_al = tcr_alignments["A"][itmp % len(tcr_alignments["A"])]
+            tcrb_al = tcr_alignments["B"][itmp % len(tcr_alignments["B"])]
             if pick_dgeoms_using_tcrdist:
-                chain = ['AB','A','B'][run]
+                chain = ["AB", "A", "B"][run]
                 dgeom_info = dgeom_info_by_chain[chain]
-                dgeom_row = dgeom_info.iloc[itmp%dgeom_info.shape[0]]
+                dgeom_row = dgeom_info.iloc[itmp % dgeom_info.shape[0]]
                 dgeom = DockingGeometry().from_dict(dgeom_row)
             elif use_opt_dgeoms:
                 dgeom = rep_dgeoms[itmp]
                 dgeom_row = None
             else:
                 # order them this way so each run has diverse dgeoms...
-                dgeom_repno = (itmp*num_runs + run)%len(rep_dgeoms)
+                dgeom_repno = (itmp * num_runs + run) % len(rep_dgeoms)
                 dgeom = rep_dgeoms[dgeom_repno]
                 dgeom_row = dgeom_info.iloc[rep_dgeom_indices[dgeom_repno]]
 
             pmhc_pdbid = pmhc_al[1]
-            pmhc_pose, pmhc_tdinfo = get_template_pose_and_tdinfo(pmhc_pdbid, PMHC)
+            pmhc_pose, pmhc_tdinfo = get_template_pose_and_tdinfo(
+                pmhc_pdbid, PMHC
+            )
 
             tcra_pdbid = tcra_al[1]
-            tcra_pose, tcra_tdinfo = get_template_pose_and_tdinfo(tcra_pdbid, TCR)
+            tcra_pose, tcra_tdinfo = get_template_pose_and_tdinfo(
+                tcra_pdbid, TCR
+            )
 
             tcrb_pdbid = tcrb_al[1]
-            tcrb_pose, tcrb_tdinfo = get_template_pose_and_tdinfo(tcrb_pdbid, TCR)
+            tcrb_pose, tcrb_tdinfo = get_template_pose_and_tdinfo(
+                tcrb_pdbid, TCR
+            )
 
             # assert ((mhc_class==1 and pmhc_pose.num_chains() == 4) or
             #         (mhc_class==2 and pmhc_pose.num_chains() == 5))
-            assert len(tcra_pose['chains']) == 2 and len(tcrb_pose['chains']) == 2
+            assert (
+                len(tcra_pose["chains"]) == 2 and len(tcrb_pose["chains"]) == 2
+            )
 
             # copy tcrb into tcra_pose by superimposing core coords
-            fix_coords = tcra_pose['ca_coords'][tcra_tdinfo.tcr_core[core_len:]]
-            mov_coords = tcrb_pose['ca_coords'][tcrb_tdinfo.tcr_core[core_len:]]
+            fix_coords = tcra_pose["ca_coords"][
+                tcra_tdinfo.tcr_core[core_len:]
+            ]
+            mov_coords = tcrb_pose["ca_coords"][
+                tcrb_tdinfo.tcr_core[core_len:]
+            ]
             R, v = superimpose.superimposition_transform(
-                fix_coords, mov_coords)
+                fix_coords, mov_coords
+            )
             tcrb_pose = apply_transform_Rx_plus_v(tcrb_pose, R, v)
             tcra_pose = delete_chains(tcra_pose, [1])
             tcra_pose = append_chains(tcra_pose, tcrb_pose, [1])
@@ -1524,49 +1863,52 @@ def show_alignment(al,seq1,seq2):
             # update tcra_tdinfo, compute tcr_stub
             # use dgeom to compute desired tcr_stub
             # transform tcra_pose
-            offset = tcra_pose['chainbounds'][1]-tcrb_pose['chainbounds'][1]
-            tcra_tdinfo.tcr_core = (
-                tcra_tdinfo.tcr_core[:core_len] +
-                [x+offset for x in tcrb_tdinfo.tcr_core[core_len:]])
-            tcra_tdinfo.tcr_cdrs = (
-                tcra_tdinfo.tcr_cdrs[:4] +
-                [[x+offset,y+offset] for x,y in tcrb_tdinfo.tcr_cdrs[4:]])
-
-            assert tcra_pose['sequence'][tcra_tdinfo.tcr_cdrs[3][0]]=='C'
-            assert tcra_pose['sequence'][tcra_tdinfo.tcr_cdrs[7][0]]=='C'
+            offset = tcra_pose["chainbounds"][1] - tcrb_pose["chainbounds"][1]
+            tcra_tdinfo.tcr_core = tcra_tdinfo.tcr_core[:core_len] + [
+                x + offset for x in tcrb_tdinfo.tcr_core[core_len:]
+            ]
+            tcra_tdinfo.tcr_cdrs = tcra_tdinfo.tcr_cdrs[:4] + [
+                [x + offset, y + offset] for x, y in tcrb_tdinfo.tcr_cdrs[4:]
+            ]
+
+            assert tcra_pose["sequence"][tcra_tdinfo.tcr_cdrs[3][0]] == "C"
+            assert tcra_pose["sequence"][tcra_tdinfo.tcr_cdrs[7][0]] == "C"
             old_tcr_stub = tcr_util.get_tcr_stub(tcra_pose, tcra_tdinfo)
-            new_tcr_stub = docking_geometry.stub_from_docking_geometry(
-                dgeom)
+            new_tcr_stub = docking_geometry.stub_from_docking_geometry(dgeom)
             # R @ old_tcr_stub['axes'].T = new_tcr_stub['axes'].T
-            R = new_tcr_stub['axes'].T @ old_tcr_stub['axes']
+            R = new_tcr_stub["axes"].T @ old_tcr_stub["axes"]
             # R @ old_tcr_stub['origin'] + v = new_tcr_stub['origin']
-            v = new_tcr_stub['origin'] - R@old_tcr_stub['origin']
+            v = new_tcr_stub["origin"] - R @ old_tcr_stub["origin"]
             tcra_pose = apply_transform_Rx_plus_v(tcra_pose, R, v)
 
             #
             # copy tcr from tcra_pose into pmhc_pose
-            num_pmhc_chains = mhc_class+1
-            if len(pmhc_pose['chains']) > num_pmhc_chains:
-                del_chains = list(range(num_pmhc_chains, len(pmhc_pose['chains'])))
+            num_pmhc_chains = mhc_class + 1
+            if len(pmhc_pose["chains"]) > num_pmhc_chains:
+                del_chains = list(
+                    range(num_pmhc_chains, len(pmhc_pose["chains"]))
+                )
                 pmhc_pose = delete_chains(pmhc_pose, del_chains)
-            pmhc_pose = append_chains(pmhc_pose, tcra_pose, [0,1])
-            assert len(pmhc_pose['chains'])==2+num_pmhc_chains
-            offset = pmhc_pose['chainbounds'][num_pmhc_chains]
-            pmhc_tdinfo.tcr_core = (
-                [x+offset for x in tcra_tdinfo.tcr_core])
-            pmhc_tdinfo.tcr_cdrs = (
-                [[x+offset,y+offset] for x,y in tcra_tdinfo.tcr_cdrs])
-            assert pmhc_pose['sequence'][pmhc_tdinfo.tcr_cdrs[3][0]] == 'C'
-            assert pmhc_pose['sequence'][pmhc_tdinfo.tcr_cdrs[7][0]] == 'C'
+            pmhc_pose = append_chains(pmhc_pose, tcra_pose, [0, 1])
+            assert len(pmhc_pose["chains"]) == 2 + num_pmhc_chains
+            offset = pmhc_pose["chainbounds"][num_pmhc_chains]
+            pmhc_tdinfo.tcr_core = [x + offset for x in tcra_tdinfo.tcr_core]
+            pmhc_tdinfo.tcr_cdrs = [
+                [x + offset, y + offset] for x, y in tcra_tdinfo.tcr_cdrs
+            ]
+            assert pmhc_pose["sequence"][pmhc_tdinfo.tcr_cdrs[3][0]] == "C"
+            assert pmhc_pose["sequence"][pmhc_tdinfo.tcr_cdrs[7][0]] == "C"
 
             # should be the same as new_tcr_stub!
             redo_tcr_stub = tcr_util.get_tcr_stub(pmhc_pose, pmhc_tdinfo)
-            v_dev = norm(redo_tcr_stub['origin']-new_tcr_stub['origin'])
-            M_dev = norm(new_tcr_stub['axes'] @ redo_tcr_stub['axes'].T - np.eye(3))
-            if max(v_dev, M_dev)>5e-2:
-                print('devs:', v_dev, M_dev)
-            assert v_dev<5e-2
-            assert M_dev<5e-2
+            v_dev = norm(redo_tcr_stub["origin"] - new_tcr_stub["origin"])
+            M_dev = norm(
+                new_tcr_stub["axes"] @ redo_tcr_stub["axes"].T - np.eye(3)
+            )
+            if max(v_dev, M_dev) > 5e-2:
+                print("devs:", v_dev, M_dev)
+            assert v_dev < 5e-2
+            assert M_dev < 5e-2
 
             # setup the alignment
             trg_to_tmp = dict(pmhc_al[2])
@@ -1574,9 +1916,9 @@ def show_alignment(al,seq1,seq2):
             tmp_tcra_seq = tcra_al[4]
             tmp_tcrb_seq = tcrb_al[4]
             tmp_fullseq = tmp_pmhc_seq + tmp_tcra_seq + tmp_tcrb_seq
-            assert pmhc_pose['sequence'] == tmp_fullseq
+            assert pmhc_pose["sequence"] == tmp_fullseq
 
-            assert len(pmhc_pose['chains']) == num_pmhc_chains+2
+            assert len(pmhc_pose["chains"]) == num_pmhc_chains + 2
 
             assert trg_pmhc_seq == pmhc_al[3]
             assert trg_tcra_seq == tcra_al[3]
@@ -1585,44 +1927,58 @@ def show_alignment(al,seq1,seq2):
             trg_fullseq = trg_pmhc_seq + trg_tcra_seq + trg_tcrb_seq
             trg_offset = len(trg_pmhc_seq)
             tmp_offset = len(tmp_pmhc_seq)
-            trg_to_tmp.update({i+trg_offset:j+tmp_offset
-                               for i,j in tcra_al[2].items()})
+            trg_to_tmp.update(
+                {i + trg_offset: j + tmp_offset for i, j in tcra_al[2].items()}
+            )
             trg_offset += len(trg_tcra_seq)
             tmp_offset += len(tmp_tcra_seq)
-            trg_to_tmp.update({i+trg_offset:j+tmp_offset
-                               for i,j in tcrb_al[2].items()})
-            assert Counter(trg_to_tmp.values()).most_common(1)[0][1]==1
-
-            identities = sum(trg_fullseq[i]==tmp_fullseq[j]
-                             for i,j in trg_to_tmp.items())
-            overall_idents = identities/len(trg_fullseq)
-            if run==0:
-                print(f'identities: {itmp} {overall_idents:.3f} '
-                      f'pmhc: {pmhc_al[-1]:.3f} '
-                      f'tcra: {tcra_al[-1]:.3f} '
-                      f'tcrb: {tcrb_al[-1]:.3f} ', va, ja, cdr3a, vb, jb, cdr3b,
-                      flush=True)
+            trg_to_tmp.update(
+                {i + trg_offset: j + tmp_offset for i, j in tcrb_al[2].items()}
+            )
+            assert Counter(trg_to_tmp.values()).most_common(1)[0][1] == 1
+
+            identities = sum(
+                trg_fullseq[i] == tmp_fullseq[j] for i, j in trg_to_tmp.items()
+            )
+            overall_idents = identities / len(trg_fullseq)
+            if run == 0:
+                print(
+                    f"identities: {itmp} {overall_idents:.3f} "
+                    f"pmhc: {pmhc_al[-1]:.3f} "
+                    f"tcra: {tcra_al[-1]:.3f} "
+                    f"tcrb: {tcrb_al[-1]:.3f} ",
+                    va,
+                    ja,
+                    cdr3a,
+                    vb,
+                    jb,
+                    cdr3b,
+                    flush=True,
+                )
                 if verbose:
-                    for i,j in sorted(trg_to_tmp.items()):
-                        a,b = trg_fullseq[i], tmp_fullseq[j]
-                        star = '*' if a==b else ' '
-                        print(f'{i:4d} {j:4d} {a} {star} {b}')
+                    for i, j in sorted(trg_to_tmp.items()):
+                        a, b = trg_fullseq[i], tmp_fullseq[j]
+                        star = "*" if a == b else " "
+                        print(f"{i:4d} {j:4d} {a} {star} {b}")
 
-            outpdbfile = f'{outfile_prefix}_{run}_{itmp}.pdb'
-            #pmhc_pose.dump_pdb(outpdbfile)
+            outpdbfile = f"{outfile_prefix}_{run}_{itmp}.pdb"
+            # pmhc_pose.dump_pdb(outpdbfile)
             dump_pdb(pmhc_pose, outpdbfile)
-            #print('made:', outpdbfile)
+            # print('made:', outpdbfile)
 
-            trg_pmhc_seqs = ([trg_mhc_seq, peptide] if mhc_class==1 else
-                             [trg_mhca_seq, trg_mhcb_seq, peptide])
-            trg_cbseq = '/'.join(trg_pmhc_seqs+[trg_tcra_seq, trg_tcrb_seq])
+            trg_pmhc_seqs = (
+                [trg_mhc_seq, peptide]
+                if mhc_class == 1
+                else [trg_mhca_seq, trg_mhcb_seq, peptide]
+            )
+            trg_cbseq = "/".join(trg_pmhc_seqs + [trg_tcra_seq, trg_tcrb_seq])
 
-            alignstring = ';'.join(f'{i}:{j}' for i,j in trg_to_tmp.items())
+            alignstring = ";".join(f"{i}:{j}" for i, j in trg_to_tmp.items())
 
             if pmhc_pdbid in pmhc_info.index:
-                pmhc_allele=pmhc_info.loc[pmhc_pdbid, 'mhc_allele']
+                pmhc_allele = pmhc_info.loc[pmhc_pdbid, "mhc_allele"]
             else:
-                pmhc_allele=ternary_info.loc[pmhc_pdbid, 'mhc_allele']
+                pmhc_allele = ternary_info.loc[pmhc_pdbid, "mhc_allele"]
 
             outl = OrderedDict(
                 run=run,
@@ -1631,18 +1987,20 @@ def show_alignment(al,seq1,seq2):
                 overall_idents=overall_idents,
                 pmhc_pdbid=pmhc_pdbid,
                 pmhc_idents=pmhc_al[-1],
-                pmhc_allele=pmhc_allele,#pmhc_info.loc[pmhc_pdbid, 'mhc_allele'],
+                pmhc_allele=pmhc_allele,  # pmhc_info.loc[pmhc_pdbid, 'mhc_allele'],
                 tcra_pdbid=tcra_pdbid,
                 tcra_idents=tcra_al[-1],
-                tcra_v   =tcr_info.loc[(tcra_pdbid,'A'), 'v_gene'],
-                tcra_j   =tcr_info.loc[(tcra_pdbid,'A'), 'j_gene'],
-                tcra_cdr3=tcr_info.loc[(tcra_pdbid,'A'), 'cdr3'],
+                tcra_v=tcr_info.loc[(tcra_pdbid, "A"), "v_gene"],
+                tcra_j=tcr_info.loc[(tcra_pdbid, "A"), "j_gene"],
+                tcra_cdr3=tcr_info.loc[(tcra_pdbid, "A"), "cdr3"],
                 tcrb_pdbid=tcrb_pdbid,
                 tcrb_idents=tcrb_al[-1],
-                tcrb_v   =tcr_info.loc[(tcrb_pdbid,'B'), 'v_gene'],
-                tcrb_j   =tcr_info.loc[(tcrb_pdbid,'B'), 'j_gene'],
-                tcrb_cdr3=tcr_info.loc[(tcrb_pdbid,'B'), 'cdr3'],
-                dgeom_pdbid=f'opt{itmp}' if dgeom_row is None else dgeom_row.pdbid,
+                tcrb_v=tcr_info.loc[(tcrb_pdbid, "B"), "v_gene"],
+                tcrb_j=tcr_info.loc[(tcrb_pdbid, "B"), "j_gene"],
+                tcrb_cdr3=tcr_info.loc[(tcrb_pdbid, "B"), "cdr3"],
+                dgeom_pdbid=(
+                    f"opt{itmp}" if dgeom_row is None else dgeom_row.pdbid
+                ),
                 template_pdbfile=outpdbfile,
                 target_to_template_alignstring=alignstring,
                 identities=identities,
@@ -1653,30 +2011,34 @@ def show_alignment(al,seq1,seq2):
     assert len(dfl) == num_runs * num_templates_per_run
     return pd.DataFrame(dfl)
 
+
 def genes_ok_for_modeling(organism, va, ja, vb, jb, verbose=True):
 
     if organism not in all_genes:
-        print(f'ERROR unrecognized organism: "{organism}" expected one of',
-              all_genes.keys())
+        print(
+            f'ERROR unrecognized organism: "{organism}" expected one of',
+            all_genes.keys(),
+        )
         return False
 
     for g in [va, ja, vb, jb]:
         if g not in all_genes[organism]:
             from .tcrdist.basic import path_to_db, db_file
+
             print(f'ERROR: unrecognized gene: "{g}" for organism "{organism}"')
             # for error output:
             dbfile = path_to_db / db_file
-            print('check the ids in', dbfile, 'for organism', organism)
+            print("check the ids in", dbfile, "for organism", organism)
             return False
 
-    if (organism,va) not in both_structure_alignments.index:
+    if (organism, va) not in both_structure_alignments.index:
         if verbose:
-            print('ERROR: no va alignment:', organism, va)
+            print("ERROR: no va alignment:", organism, va)
         return False
 
-    if (organism,vb) not in both_structure_alignments.index:
+    if (organism, vb) not in both_structure_alignments.index:
         if verbose:
-            print('ERROR: no vb alignment:', organism, vb)
+            print("ERROR: no vb alignment:", organism, vb)
         return False
 
     va_seq = get_v_seq_up_to_cys(organism, va)
@@ -1684,221 +2046,264 @@ def genes_ok_for_modeling(organism, va, ja, vb, jb, verbose=True):
     vb_seq = get_v_seq_up_to_cys(organism, vb)
     jb_seq = get_j_seq_after_cdr3(organism, jb)
 
-    if ( '*' in va_seq+ja_seq+vb_seq+jb_seq or
-         va_seq[-1] != 'C' or vb_seq[-1] != 'C'):
+    if (
+        "*" in va_seq + ja_seq + vb_seq + jb_seq
+        or va_seq[-1] != "C"
+        or vb_seq[-1] != "C"
+    ):
         if verbose:
-            print('ERROR bad seqs:', va, va_seq, ja, ja_seq,
-                  vb, vb_seq, ja, jb_seq)
+            print(
+                "ERROR bad seqs:",
+                va,
+                va_seq,
+                ja,
+                ja_seq,
+                vb,
+                vb_seq,
+                ja,
+                jb_seq,
+            )
         return False
     return True
 
+
 def check_genes_for_modeling(tcr_db):
     all_ok = True
     for index, targetl in tcr_db.iterrows():
         ok = genes_ok_for_modeling(
-            targetl.organism, targetl.va, targetl.ja, targetl.vb, targetl.jb,
-            verbose=True)
+            targetl.organism,
+            targetl.va,
+            targetl.ja,
+            targetl.vb,
+            targetl.jb,
+            verbose=True,
+        )
         all_ok = all_ok and ok
         if not ok:
-            print('ERROR bad tcr genes at index=', index)
+            print("ERROR bad tcr genes at index=", index)
     return all_ok
 
 
 def setup_for_alphafold(
-        tcr_db,
-        outdir,
-        organism=None, # if None, should be present in tcr_db
-        min_pmhc_count=None,
-        max_pmhc_count=None,
-        random_seed=1, # for subsampling tcrs
-        num_runs=3,
-        clobber=False,
-        exclude_self_peptide_docking_geometries=False,
-        alt_self_peptides_column=None, # this column should be comma-separated
-        exclude_pdbids_column=None, # this column should be comma-separated
-        targetid_prefix_suffix='',
-        use_opt_dgeoms=False,
-        **kwargs,
+    tcr_db,
+    outdir,
+    organism=None,  # if None, should be present in tcr_db
+    min_pmhc_count=None,
+    max_pmhc_count=None,
+    random_seed=1,  # for subsampling tcrs
+    num_runs=3,
+    clobber=False,
+    exclude_self_peptide_docking_geometries=False,
+    alt_self_peptides_column=None,  # this column should be comma-separated
+    exclude_pdbids_column=None,  # this column should be comma-separated
+    targetid_prefix_suffix="",
+    use_opt_dgeoms=False,
+    **kwargs,
 ):
-    '''
-    '''
-    assert outdir.endswith('/')
+    """ """
+    assert outdir.endswith("/")
     if use_opt_dgeoms:
         assert num_runs == 1
-    required_cols = 'va ja cdr3a vb jb cdr3b mhc_class mhc peptide'.split()
+    required_cols = "va ja cdr3a vb jb cdr3b mhc_class mhc peptide".split()
     if organism is None:
-        required_cols.append('organism')
+        required_cols.append("organism")
     for col in required_cols:
         assert col in tcr_db.columns
 
-    #assert not any(tcr_db.mhc.str.startswith('E*'))
+    # assert not any(tcr_db.mhc.str.startswith('E*'))
 
     tcr_db = tcr_db.copy()
-    tcr_db['mhc_peptide'] = (
-        tcr_db.mhc.str.replace('*','',regex=False).str.replace(':','',regex=False)+
-        '_'+tcr_db.peptide)
+    tcr_db["mhc_peptide"] = (
+        tcr_db.mhc.str.replace("*", "", regex=False).str.replace(
+            ":", "", regex=False
+        )
+        + "_"
+        + tcr_db.peptide
+    )
     if organism is not None:
-        if 'organism' in tcr_db.columns:
-            assert all(tcr_db.organism==organism)
+        if "organism" in tcr_db.columns:
+            assert all(tcr_db.organism == organism)
         else:
-            tcr_db['organism'] = organism
+            tcr_db["organism"] = organism
 
     # doesnt do anything if init has already been called (I don't think)
 
     if not exists(outdir):
         os.mkdir(outdir)
     else:
-        assert clobber, 'outdir already exists: '+outdir
+        assert clobber, "outdir already exists: " + outdir
 
-    #optionally filter to peptide mhc combos with min/max counts ##########
+    # optionally filter to peptide mhc combos with min/max counts ##########
     if min_pmhc_count is not None or max_pmhc_count is not None:
-        tcr_db.sort_values('mhc_peptide', inplace=True) ## IMPORTANT B/C MASKING
-        random.seed(random_seed) # shuffling of epitope tcrs
+        tcr_db.sort_values(
+            "mhc_peptide", inplace=True
+        )  ## IMPORTANT B/C MASKING
+        random.seed(random_seed)  # shuffling of epitope tcrs
         assert min_pmhc_count is not None and max_pmhc_count is not None
         mhc_peptides = tcr_db.mhc_peptide.drop_duplicates().to_list()
         mask = []
         for mhc_peptide in mhc_peptides:
-            count = np.sum(tcr_db.mhc_peptide==mhc_peptide)
+            count = np.sum(tcr_db.mhc_peptide == mhc_peptide)
             if count < min_pmhc_count:
-                mhc_peptide_mask = [False]*count
+                mhc_peptide_mask = [False] * count
             elif count <= max_pmhc_count:
-                mhc_peptide_mask = [True]*count
+                mhc_peptide_mask = [True] * count
             else:
-                mhc_peptide_mask = [True]*max_pmhc_count+ [False]*(count-max_pmhc_count)
+                mhc_peptide_mask = [True] * max_pmhc_count + [False] * (
+                    count - max_pmhc_count
+                )
                 random.shuffle(mhc_peptide_mask)
             mask.extend(mhc_peptide_mask)
 
         old_size = tcr_db.shape[0]
         tcr_db = tcr_db[mask].copy()
-        print('subset tcr_db:', old_size, tcr_db.shape[0])
+        print("subset tcr_db:", old_size, tcr_db.shape[0])
         counts = tcr_db.mhc_peptide.value_counts().to_list()
         assert min_pmhc_count <= min(counts) and max(counts) <= max_pmhc_count
 
-    tcr_db_outfile = outdir+'tcr_db.tsv'
-    tcr_db.to_csv(tcr_db_outfile, sep='\t', index=False)
+    tcr_db_outfile = outdir + "tcr_db.tsv"
+    tcr_db.to_csv(tcr_db_outfile, sep="\t", index=False)
 
     tcr_db = tcr_db.reset_index(drop=True)
 
-    print('check genes for modeling', tcr_db.shape[0])
+    print("check genes for modeling", tcr_db.shape[0])
     for index, targetl in tcr_db.iterrows():
         if not genes_ok_for_modeling(
-                targetl.organism, targetl.va, targetl.ja, targetl.vb, targetl.jb):
-            print('ERROR bad genes:', index, targetl.va, targetl.ja,
-                  targetl.vb, targetl.jb)
+            targetl.organism, targetl.va, targetl.ja, targetl.vb, targetl.jb
+        ):
+            print(
+                "ERROR bad genes:",
+                index,
+                targetl.va,
+                targetl.ja,
+                targetl.vb,
+                targetl.jb,
+            )
             assert False
             exit()
 
     targets_dfl = []
     for index, targetl in tcr_db.iterrows():
-        targetid_prefix = f'T{index:05d}_{targetl.mhc_peptide}{targetid_prefix_suffix}'
-        print('START', index, tcr_db.shape[0], targetid_prefix)
-        outfile_prefix = f'{outdir}{targetid_prefix}'
+        targetid_prefix = (
+            f"T{index:05d}_{targetl.mhc_peptide}{targetid_prefix_suffix}"
+        )
+        print("START", index, tcr_db.shape[0], targetid_prefix)
+        outfile_prefix = f"{outdir}{targetid_prefix}"
         if exclude_self_peptide_docking_geometries:
             exclude_docking_geometry_peptides = [targetl.peptide]
         else:
             exclude_docking_geometry_peptides = []
         if alt_self_peptides_column is not None:
             assert exclude_docking_geometry_peptides
-            alt_self_peptides = targetl[alt_self_peptides_column].split(',')
+            alt_self_peptides = targetl[alt_self_peptides_column].split(",")
             if exclude_self_peptide_docking_geometries:
                 exclude_docking_geometry_peptides.extend(alt_self_peptides)
         else:
-            alt_self_peptides=None
+            alt_self_peptides = None
 
         if exclude_pdbids_column is not None:
             exclude_pdbids = targetl[exclude_pdbids_column]
             if pd.isna(exclude_pdbids):
                 exclude_pdbids = None
             else:
-                exclude_pdbids = exclude_pdbids.split(',')
+                exclude_pdbids = exclude_pdbids.split(",")
         else:
             exclude_pdbids = None
 
         all_run_info = make_templates_for_alphafold(
-            targetl.organism, targetl.va, targetl.ja, targetl.cdr3a,
-            targetl.vb, targetl.jb, targetl.cdr3b,
-            targetl.mhc_class, targetl.mhc, targetl.peptide, outfile_prefix,
+            targetl.organism,
+            targetl.va,
+            targetl.ja,
+            targetl.cdr3a,
+            targetl.vb,
+            targetl.jb,
+            targetl.cdr3b,
+            targetl.mhc_class,
+            targetl.mhc,
+            targetl.peptide,
+            outfile_prefix,
             exclude_docking_geometry_peptides=exclude_docking_geometry_peptides,
-            num_runs = num_runs,
+            num_runs=num_runs,
             alt_self_peptides=alt_self_peptides,
-            exclude_pdbids = exclude_pdbids,
-            use_opt_dgeoms = use_opt_dgeoms,
+            exclude_pdbids=exclude_pdbids,
+            use_opt_dgeoms=use_opt_dgeoms,
             **kwargs,
         )
 
         for run in range(num_runs):
-            info = all_run_info[all_run_info.run==run]
-            assert info.shape[0] == 4#num templates
-            targetid = f'{targetid_prefix}_{run}'
+            info = all_run_info[all_run_info.run == run]
+            assert info.shape[0] == 4  # num templates
+            targetid = f"{targetid_prefix}_{run}"
             trg_cbseq = set(info.target_chainseq).pop()
-            alignfile = f'{outdir}{targetid}_alignments.tsv'
-            info.to_csv(alignfile, sep='\t', index=False)
+            alignfile = f"{outdir}{targetid}_alignments.tsv"
+            info.to_csv(alignfile, sep="\t", index=False)
             outl = pd.Series(targetl)
-            outl['targetid'] = targetid
-            outl['target_chainseq'] = trg_cbseq
-            outl['templates_alignfile'] = alignfile
+            outl["targetid"] = targetid
+            outl["target_chainseq"] = trg_cbseq
+            outl["templates_alignfile"] = alignfile
             targets_dfl.append(outl)
         sys.stdout.flush()
 
         # save partial work... since this is so freakin slow
-        outfile = outdir+'targets.tsv'
-        pd.DataFrame(targets_dfl).to_csv(outfile, sep='\t', index=False)
+        outfile = outdir + "targets.tsv"
+        pd.DataFrame(targets_dfl).to_csv(outfile, sep="\t", index=False)
 
-    outfile = outdir+'targets.tsv'
-    pd.DataFrame(targets_dfl).to_csv(outfile, sep='\t', index=False)
-    print('made:', outfile)
+    outfile = outdir + "targets.tsv"
+    pd.DataFrame(targets_dfl).to_csv(outfile, sep="\t", index=False)
+    print("made:", outfile)
 
 
-def get_mhc_chain_trim_positions(chainseq, organism, mhc_class, mhc_allele, chain=None):
-    '''
-    '''
-    assert mhc_class in [1,2]
-    if mhc_class==2:
-        assert chain in ['A','B']
+def get_mhc_chain_trim_positions(
+    chainseq, organism, mhc_class, mhc_allele, chain=None
+):
+    """ """
+    assert mhc_class in [1, 2]
+    if mhc_class == 2:
+        assert chain in ["A", "B"]
         alseq = get_mhc_class_2_alseq(chain, mhc_allele)
         if alseq is None:
-            print('ERROR: None alseq:', organism, mhc_class, mhc_allele)
+            print("ERROR: None alseq:", organism, mhc_class, mhc_allele)
             return [None]
-        if chain == 'A':
-            #H2AKa EPQGGLQNIATGKHNLEI
-            #DRA   EAQGALANIAVDKANLEI
-            ref_alseq = 'HVIIQ.AEFYLNPDQSGEFMFDFDGDEIFHVDMAKKETVWRLEEFGRFASFEAQGALANIAVDKANLEIMTKRSN'
-            ref_helices = ['EAQGALANIAVDKANLEI']
+        if chain == "A":
+            # H2AKa EPQGGLQNIATGKHNLEI
+            # DRA   EAQGALANIAVDKANLEI
+            ref_alseq = "HVIIQ.AEFYLNPDQSGEFMFDFDGDEIFHVDMAKKETVWRLEEFGRFASFEAQGALANIAVDKANLEIMTKRSN"
+            ref_helices = ["EAQGALANIAVDKANLEI"]
         else:
-            #H2AKb   YWNKQ..YLERTRAELDTVCRHN
-            #DRB1*01 YWNSQKDLLEQRRAAVDTYCRHN
-            ref_alseq = 'FVHQFQPFCYFTNGTQRIRLVIRYIYNREEYVRFDSDVGEYRAVTELGRPDAEYWNKQ..YLERTRAELDTVCRHNYEKTETPTS'
-            ref_helices = ['YWNKQ..YLERTRAELDTVCRHN']
+            # H2AKb   YWNKQ..YLERTRAELDTVCRHN
+            # DRB1*01 YWNSQKDLLEQRRAAVDTYCRHN
+            ref_alseq = "FVHQFQPFCYFTNGTQRIRLVIRYIYNREEYVRFDSDVGEYRAVTELGRPDAEYWNKQ..YLERTRAELDTVCRHNYEKTETPTS"
+            ref_helices = ["YWNKQ..YLERTRAELDTVCRHN"]
 
     else:
         alseq = get_mhc_class_1_alseq(mhc_allele)
         if alseq is None:
-            print('ERROR: None alseq:', organism, mhc_class, mhc_allele)
+            print("ERROR: None alseq:", organism, mhc_class, mhc_allele)
             return [None]
-        #A02: GETRKVKAHSQTHRVDLGT and KWEAAHVAEQLRAYLEGTCVEW
-        #D-B: RETQKAKGQEQWFRVSLRN and KWEQSGAAEHYKAYLEGECVEW
-        if organism=='mouse':
-            ref_alseq = 'GPHSMRYFETAVSRPGLEEPRYISVGYVDNKEFVRFDSDAENPRYEPRAPWMEQEGPEYWERETQKAKGQEQWFRVSLRNLLGYYNQSAGGSHTLQQMSGCDLGSDWRLLRGYLQFAYEGRDYIALNEDLKTWTAADMAAQITRRKWEQSGAAEHYKAYLEGECVEWLHRYLKNGNATLLR'
-            ref_helices = ['RETQKAKGQEQWFRVSLRN', 'KWEQSGAAEHYKAYLEGECVEW']
+        # A02: GETRKVKAHSQTHRVDLGT and KWEAAHVAEQLRAYLEGTCVEW
+        # D-B: RETQKAKGQEQWFRVSLRN and KWEQSGAAEHYKAYLEGECVEW
+        if organism == "mouse":
+            ref_alseq = "GPHSMRYFETAVSRPGLEEPRYISVGYVDNKEFVRFDSDAENPRYEPRAPWMEQEGPEYWERETQKAKGQEQWFRVSLRNLLGYYNQSAGGSHTLQQMSGCDLGSDWRLLRGYLQFAYEGRDYIALNEDLKTWTAADMAAQITRRKWEQSGAAEHYKAYLEGECVEWLHRYLKNGNATLLR"
+            ref_helices = ["RETQKAKGQEQWFRVSLRN", "KWEQSGAAEHYKAYLEGECVEW"]
         else:
-            ref_alseq = 'GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRMEPRAPWIEQEGPEYWDGETRKVKAHSQTHRVDLGTLRGYYNQSEAGSHTVQRMYGCDVGSDWRFLRGYHQYAYDGKDYIALKEDLRSWTAADMAAQTTKHKWEAAHVAEQLRAYLEGTCVEWLRRYLENG'
-            ref_helices = ['GETRKVKAHSQTHRVDLGT', 'KWEAAHVAEQLRAYLEGTCVEW']
+            ref_alseq = "GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRMEPRAPWIEQEGPEYWDGETRKVKAHSQTHRVDLGTLRGYYNQSEAGSHTVQRMYGCDVGSDWRFLRGYHQYAYDGKDYIALKEDLRSWTAADMAAQTTKHKWEAAHVAEQLRAYLEGTCVEWLRRYLENG"
+            ref_helices = ["GETRKVKAHSQTHRVDLGT", "KWEAAHVAEQLRAYLEGTCVEW"]
 
     assert len(alseq) == len(ref_alseq)
-    alseqseq = alseq.replace(ALL_GENES_GAP_CHAR, '')
+    alseqseq = alseq.replace(ALL_GENES_GAP_CHAR, "")
     if chainseq != alseqseq:
-        #print('WARNING: mhc seq mismatch: chainseq=', chainseq)
-        #print('WARNING: mhc seq mismatch: alseqseq=', alseqseq)
+        # print('WARNING: mhc seq mismatch: chainseq=', chainseq)
+        # print('WARNING: mhc seq mismatch: alseqseq=', alseqseq)
         as2cs = blosum_align(alseqseq, chainseq)
     else:
-        as2cs = {i:i for i in range(len(chainseq))}
+        as2cs = {i: i for i in range(len(chainseq))}
 
     positions = []
     for helix in ref_helices:
         start = ref_alseq.index(helix)
-        for pos in range(start, start+len(helix)):
+        for pos in range(start, start + len(helix)):
             if alseq[pos] != ALL_GENES_GAP_CHAR:
-                i = pos-alseq[:pos].count(ALL_GENES_GAP_CHAR)
+                i = pos - alseq[:pos].count(ALL_GENES_GAP_CHAR)
                 if i in as2cs:
                     positions.append(as2cs[i])
     return positions

From 752c2cacbf60b7c223d3b25bc0d4c4113740297b Mon Sep 17 00:00:00 2001
From: Lawson <lwoods@tgen.org>
Date: Fri, 13 Jun 2025 12:12:10 -0700
Subject: [PATCH 12/19] fix deprecated read mode

---
 tcrdock/sequtil.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tcrdock/sequtil.py b/tcrdock/sequtil.py
index 122cc63..8a852bb 100644
--- a/tcrdock/sequtil.py
+++ b/tcrdock/sequtil.py
@@ -44,7 +44,7 @@
 
 def read_fasta(filename):  # helper
     """return OrderedDict indexed by the ">" lines (everything after >)"""
-    data = open(filename, "rU")
+    data = open(filename, "r")
     fasta = OrderedDict()
     for line in data:
         if line[0] == ">":

From 89854aeebcec1be96fd3216a6b133c03fdaa9327 Mon Sep 17 00:00:00 2001
From: Lawson <lwoods@tgen.org>
Date: Fri, 13 Jun 2025 12:24:26 -0700
Subject: [PATCH 13/19] switch to pyproject.toml

---
 pyproject.toml | 31 ++++++++++++++++++++++++++++++-
 setup.py       | 44 --------------------------------------------
 2 files changed, 30 insertions(+), 45 deletions(-)
 delete mode 100644 setup.py

diff --git a/pyproject.toml b/pyproject.toml
index 4e715e0..d7941d4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,32 @@
 [build-system]
 requires = ["setuptools >= 42.0.0"]
-build-backend = "setuptools.build_meta"
\ No newline at end of file
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "tcrdock"
+version = "0.1.0"
+
+dependencies = [
+    "biopython",
+    "numpy",
+    "pandas",
+    "scipy",
+    "matplotlib",
+]
+
+[tool.setuptools]
+packages = ["tcrdock"]
+
+[tool.setuptools.package-data]
+tcrdock = [
+    "**/*.alfas",
+    "**/*.fasta",
+    "**/*.txt",
+    "**/*.tsv",
+    "**/tcr/*.pdb",
+    "**/tcr/*.json",
+    "**/ternary/*.pdb",
+    "**/ternary/*.json",
+    "**/pmhc/*.pdb",
+    "**/pmhc/*.json",
+]
\ No newline at end of file
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 75a5d3e..0000000
--- a/setup.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# setup.py
-import subprocess
-import sys
-from setuptools import setup, find_packages
-from setuptools.command.install import install
-
-
-class CustomInstall(install):
-    """Run our database installer, then fall back to the normal install."""
-
-    def run(self):
-        subprocess.check_call([sys.executable, "download_blast.py"])
-        super().run()
-
-
-setup(
-    name="tcrdock",
-    version="0.1.0",
-    packages=["tcrdock"],
-    install_requires=[
-        "biopython",
-        "numpy",
-        "pandas",
-        "scipy",
-        "matplotlib",
-    ],
-    cmdclass={
-        "install": CustomInstall,
-    },
-    package_data={
-        "tcrdock": [
-            "**/*.alfas",
-            "**/*.fasta",
-            "**/*.txt",
-            "**/*.tsv",
-            "**/tcr/*.pdb",
-            "**/tcr/*.json",
-            "**/ternary/*.pdb",
-            "**/ternary/*.json",
-            "**/pmhc/*.pdb",
-            "**/pmhc/*.json",
-        ]
-    },
-)

From 2fdf2d00f7474e46d591b35cef2acae41bc06f0f Mon Sep 17 00:00:00 2001
From: Lawson <lwoods@tgen.org>
Date: Fri, 13 Jun 2025 12:28:39 -0700
Subject: [PATCH 14/19] move blast to conda

---
 README.md                         |  2 +-
 devtools/conda-envs/test_env.yaml |  4 ++
 tcrdock/blast.py                  | 93 ++++++++++++++++---------------
 3 files changed, 52 insertions(+), 47 deletions(-)

diff --git a/README.md b/README.md
index 0c56e54..0bd734b 100644
--- a/README.md
+++ b/README.md
@@ -141,7 +141,7 @@ to be installed, which can be done by running the script
 `download_blast.py`. A potential installation route would be:
 
 ```
-conda create --name tcrdock_test python=3.8
+conda create --name tcrdock_test python blast
 source activate tcrdock_test   # or: conda activate tcrdock_test
 pip install <path/to/cloned/repo>
 ```
diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml
index 2402f01..a3e0072 100644
--- a/devtools/conda-envs/test_env.yaml
+++ b/devtools/conda-envs/test_env.yaml
@@ -7,4 +7,8 @@ dependencies:
   - python>=3.10.0
   - pip
 
+  ## main deps
+  - blast
+
+  ## testing
   - pytest
\ No newline at end of file
diff --git a/tcrdock/blast.py b/tcrdock/blast.py
index ed783ee..93850e0 100644
--- a/tcrdock/blast.py
+++ b/tcrdock/blast.py
@@ -5,63 +5,64 @@
 from os.path import exists, isdir
 from .util import amino_acids
 
-path_to_blast_executables = Path(__file__).parents[1] / 'ncbi-blast-2.11.0+' / 'bin'
-assert isdir( path_to_blast_executables ),\
-    'You need to download blast; please run download_blast.py in TCRdock/ folder'
+blastp_exe = str("blastp")
+makeblastdb_exe = str("makeblastdb")
 
-blastp_exe = str(path_to_blast_executables / 'blastp')
-makeblastdb_exe = str(path_to_blast_executables / 'makeblastdb')
+blast_fields = (
+    "evalue bitscore qaccver saccver pident length mismatch"
+    " gapopen qstart qend qlen qseq sstart send slen sseq"
+)
 
-blast_fields = ('evalue bitscore qaccver saccver pident length mismatch'
-                ' gapopen qstart qend qlen qseq sstart send slen sseq')
 
-def make_blast_dbs(fastafile, dbtype='prot'):
-    assert dbtype in ['prot','nucl']
+def make_blast_dbs(fastafile, dbtype="prot"):
+    assert dbtype in ["prot", "nucl"]
 
-    cmd = f'{makeblastdb_exe} -in {fastafile} -dbtype {dbtype}'
+    cmd = f"{makeblastdb_exe} -in {fastafile} -dbtype {dbtype}"
     print(cmd)
     system(cmd)
 
 
 def check_for_blast_dbs(fastafile):
-    return exists(str(fastafile)+'.phr')
+    return exists(str(fastafile) + ".phr")
+
 
 def blast_file_and_read_hits(
-        fname,
-        dbfile,
-        evalue = 1e-3,
-        num_alignments = 10000,
-        verbose=False,
-        clobber=False,
-        extra_blast_args=''
+    fname,
+    dbfile,
+    evalue=1e-3,
+    num_alignments=10000,
+    verbose=False,
+    clobber=False,
+    extra_blast_args="",
 ):
-    assert exists(dbfile), f'missing file for BLAST-ing against: {dbfile}'
+    assert exists(dbfile), f"missing file for BLAST-ing against: {dbfile}"
 
     # check for blast database files
     if not check_for_blast_dbs(dbfile):
         # maybe we haven't set up the blast files yet...
-        print('WARNING: missing blast db files, trying to create...')
+        print("WARNING: missing blast db files, trying to create...")
         make_blast_dbs(dbfile)
-        assert check_for_blast_dbs(dbfile), 'Failed to create blast db files!'
+        assert check_for_blast_dbs(dbfile), "Failed to create blast db files!"
 
-    outfile = fname+'.blast'
+    outfile = fname + ".blast"
     assert clobber or not exists(outfile)
 
-    cmd = (f'{blastp_exe} -query {fname} -db {dbfile} {extra_blast_args} '
-           f' -outfmt "10 delim=, {blast_fields}" -evalue {evalue}'
-           f' -num_alignments {num_alignments} -out {outfile}')
+    cmd = (
+        f"{blastp_exe} -query {fname} -db {dbfile} {extra_blast_args} "
+        f' -outfmt "10 delim=, {blast_fields}" -evalue {evalue}'
+        f" -num_alignments {num_alignments} -out {outfile}"
+    )
 
     if not verbose:
-        cmd += ' 2> /dev/null'
+        cmd += " 2> /dev/null"
 
     if verbose:
         print(cmd)
     system(cmd)
 
-    blast_hits = pd.read_csv(
-        outfile, header=None, names=blast_fields.split())
-    #blast_hits.rename(columns={'saccver':'pdb_chain', 'qaccver':'allele'}, inplace = True)
-    #blast_hits.sort_values('pident', ascending=False, inplace=True)
+    blast_hits = pd.read_csv(outfile, header=None, names=blast_fields.split())
+    # blast_hits.rename(columns={'saccver':'pdb_chain', 'qaccver':'allele'}, inplace = True)
+    # blast_hits.sort_values('pident', ascending=False, inplace=True)
 
     if exists(outfile):
         remove(outfile)
@@ -70,21 +71,22 @@ def blast_file_and_read_hits(
 
 
 def blast_sequence_and_read_hits(
-        query_sequence,
-        dbfile,
-        tmpfile_prefix = '',
-        evalue = 1e-3,
-        num_alignments = 10000,
-        verbose=False,
+    query_sequence,
+    dbfile,
+    tmpfile_prefix="",
+    evalue=1e-3,
+    num_alignments=10000,
+    verbose=False,
 ):
-    tmpfile = f'{tmpfile_prefix}tmp_fasta_{random.random()}.fasta'
+    tmpfile = f"{tmpfile_prefix}tmp_fasta_{random.random()}.fasta"
 
-    out = open(tmpfile, 'w')
-    out.write(f'>tmp\n{query_sequence}\n')
+    out = open(tmpfile, "w")
+    out.write(f">tmp\n{query_sequence}\n")
     out.close()
 
     blast_hits = blast_file_and_read_hits(
-        tmpfile, dbfile, evalue, num_alignments, verbose, clobber=True)
+        tmpfile, dbfile, evalue, num_alignments, verbose, clobber=True
+    )
 
     if exists(tmpfile):
         remove(tmpfile)
@@ -92,19 +94,18 @@ def blast_sequence_and_read_hits(
     return blast_hits
 
 
-
 def setup_query_to_hit_map(hit):
-    ''' hit is a single row from blast_hits
+    """hit is a single row from blast_hits
 
     query2hit_align is a dictionary mapping from full-length, 0-indexed positions
     in query sequence to full-length, 0-indexed positions in hit sequence
-    '''
+    """
 
     query2hit_align = {}
-    for ii,(qaa,haa) in enumerate(zip(hit.qseq, hit.sseq)):
+    for ii, (qaa, haa) in enumerate(zip(hit.qseq, hit.sseq)):
         if qaa in amino_acids and haa in amino_acids:
-            qpos = hit.qstart + ii - hit.qseq[:ii].count('-') - 1 #0-idx
-            hpos = hit.sstart + ii - hit.sseq[:ii].count('-') - 1 #
+            qpos = hit.qstart + ii - hit.qseq[:ii].count("-") - 1  # 0-idx
+            hpos = hit.sstart + ii - hit.sseq[:ii].count("-") - 1  #
             query2hit_align[qpos] = hpos
 
     return query2hit_align

From 1c6621cd0a4b85fb0d46c11b64f62125c920727e Mon Sep 17 00:00:00 2001
From: Lawson <lwoods@tgen.org>
Date: Fri, 13 Jun 2025 12:31:30 -0700
Subject: [PATCH 15/19] add bioconda

---
 README.md                         | 5 ++---
 devtools/conda-envs/test_env.yaml | 1 +
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 0bd734b..ce60b18 100644
--- a/README.md
+++ b/README.md
@@ -137,11 +137,10 @@ The non-AlphaFold Python package requirements are listed in `requirements.txt`.
 Those specific
 package versions should work, but there should also be plenty of flexibility on the
 versions. The TCR and MHC parsing code also requires the NCBI BLAST+ software
-to be installed, which can be done by running the script
-`download_blast.py`. A potential installation route would be:
+to be installed, which can be done using conda. A potential installation route would be:
 
 ```
-conda create --name tcrdock_test python blast
+conda create -c bioconda --name tcrdock_test python blast
 source activate tcrdock_test   # or: conda activate tcrdock_test
 pip install <path/to/cloned/repo>
 ```
diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml
index a3e0072..7478bf1 100644
--- a/devtools/conda-envs/test_env.yaml
+++ b/devtools/conda-envs/test_env.yaml
@@ -2,6 +2,7 @@ name: tcrdock-test
 channels:
   - defaults
   - conda-forge
+  - bioconda
 dependencies:
   ### Base depends ###
   - python>=3.10.0

From a49d4c3998381f62c4c619dfca8b6832cd974d26 Mon Sep 17 00:00:00 2001
From: Lawson <lwoods@tgen.org>
Date: Fri, 13 Jun 2025 13:36:27 -0700
Subject: [PATCH 16/19] remove references to blast executable

---
 tcrdock/mhc_util.py        | 204 +++++++++++++++++++----------------
 tcrdock/tcrdist/parsing.py | 211 +++++++++++++++++++++----------------
 2 files changed, 231 insertions(+), 184 deletions(-)

diff --git a/tcrdock/mhc_util.py b/tcrdock/mhc_util.py
index 6849f02..d9b86b4 100644
--- a/tcrdock/mhc_util.py
+++ b/tcrdock/mhc_util.py
@@ -1,5 +1,6 @@
 # import numpy as np
 import sys
+
 # from os import system
 import os.path
 from os import system, popen
@@ -14,7 +15,6 @@
 from . import sequtil
 from . import pdblite
 
-from .blast import path_to_blast_executables
 from .util import path_to_db
 from .sequtil import ALL_GENES_GAP_CHAR
 
@@ -29,62 +29,81 @@
 ## 94...100 are upward facing in the C-terminal central strand
 ## 113.115 are upward facing in the neighboring C-terminal
 ##
-class1_template_core_positions_1indexed = [4,  6,  8,  10,  23,  25,
-                                           94, 96, 98, 100, 113, 115]
+class1_template_core_positions_1indexed = [
+    4,
+    6,
+    8,
+    10,
+    23,
+    25,
+    94,
+    96,
+    98,
+    100,
+    113,
+    115,
+]
 
 class1_template_core_positions_0indexed = [
-    x-1 for x in class1_template_core_positions_1indexed]
+    x - 1 for x in class1_template_core_positions_1indexed
+]
 
-class1_template_seq = 'PHSMRYFETAVSRPGLEEPRYISVGYVDNKEFVRFDSDAENPRYEPRAPWMEQEGPEYWERETQKAKGQEQWFRVSLRNLLGYYNQSAGGSHTLQQMSGCDLGSDWRLLRGYLQFAYEGRDYIALNEDLKTWTAADMAAQITRRKWEQSGAAEHYKAYLEGECVEWLHRYLKNGNATLLRTDSPKAHVTHHPRSKGEVTLRCWALGFYPADITLTWQLNGEELTQDMELVETRPAGDGTFQKWASVVVPLGKEQNYTCRVYHEGLPEPLTLRWEP'
+class1_template_seq = "PHSMRYFETAVSRPGLEEPRYISVGYVDNKEFVRFDSDAENPRYEPRAPWMEQEGPEYWERETQKAKGQEQWFRVSLRNLLGYYNQSAGGSHTLQQMSGCDLGSDWRLLRGYLQFAYEGRDYIALNEDLKTWTAADMAAQITRRKWEQSGAAEHYKAYLEGECVEWLHRYLKNGNATLLRTDSPKAHVTHHPRSKGEVTLRCWALGFYPADITLTWQLNGEELTQDMELVETRPAGDGTFQKWASVVVPLGKEQNYTCRVYHEGLPEPLTLRWEP"
 
 class2_alfas_positions_0indexed = {
-    'A': [2, 4, 7, 9, 18, 20],
-    'B': [2, 4, 6, 8, 21, 23], # 8 is disulfide posn
+    "A": [2, 4, 7, 9, 18, 20],
+    "B": [2, 4, 6, 8, 21, 23],  # 8 is disulfide posn
 }
 
+
 def get_mhc_core_positions_class1(seq):
-    ''' These are 0-indexed positions (unlike previous tcrdock)
+    """These are 0-indexed positions (unlike previous tcrdock)
 
     will have -1 if there's a parse fail
-    '''
+    """
     al = sequtil.blosum_align(class1_template_seq, seq)
 
     core_positions = [
         al.get(pos, -1) for pos in class1_template_core_positions_0indexed
     ]
 
-    assert -1 not in core_positions # probably should handle this better
+    assert -1 not in core_positions  # probably should handle this better
     return core_positions
 
 
 def get_mhc_core_positions_class2(
-        aseq, # chain 1 ie alpha
-        bseq, # chain 2 ie beta
+    aseq,  # chain 1 ie alpha
+    bseq,  # chain 2 ie beta
 ):
-    ''' these are 0-indexed positions!!! (unlike previous tcrdock)
+    """these are 0-indexed positions!!! (unlike previous tcrdock)
 
     will have -1 if there's a parse fail
-    '''
+    """
     offset = 0
     core_positions = []
 
-    for ab, seq in zip('AB',[aseq,bseq]):
-        dbfile = str(path_to_db / f'both_class_2_{ab}_chains_v2.fasta')
+    for ab, seq in zip("AB", [aseq, bseq]):
+        dbfile = str(path_to_db / f"both_class_2_{ab}_chains_v2.fasta")
         hits = blast.blast_sequence_and_read_hits(seq, dbfile)
         hit = hits.iloc[0]
         blast_align = blast.setup_query_to_hit_map(hit)
-        if hit.pident<99.99:
-            print('get_mhc_core_positions_class2:', ab, hit.saccver, hit.pident)
+        if hit.pident < 99.99:
+            print(
+                "get_mhc_core_positions_class2:", ab, hit.saccver, hit.pident
+            )
 
         alfas = sequtil.mhc_class_2_alfas[ab][hit.saccver]
-        hitseq = alfas.replace(ALL_GENES_GAP_CHAR, '')
-        blast_hitseq = hit.sseq.replace('-','') # debug
-        assert hitseq.index(blast_hitseq) == hit.sstart-1 # debug
+        hitseq = alfas.replace(ALL_GENES_GAP_CHAR, "")
+        blast_hitseq = hit.sseq.replace("-", "")  # debug
+        assert hitseq.index(blast_hitseq) == hit.sstart - 1  # debug
 
         # make an alignment from alfas to seq
-        alfas2hitseq = {i:i-alfas[:i].count(ALL_GENES_GAP_CHAR)
-                        for i,a in enumerate(alfas) if a != ALL_GENES_GAP_CHAR}
-        hitseq2seq = {j:i for i,j in blast_align.items()}
+        alfas2hitseq = {
+            i: i - alfas[:i].count(ALL_GENES_GAP_CHAR)
+            for i, a in enumerate(alfas)
+            if a != ALL_GENES_GAP_CHAR
+        }
+        hitseq2seq = {j: i for i, j in blast_align.items()}
 
         for pos in class2_alfas_positions_0indexed[ab]:
             hitseq_pos = alfas2hitseq[pos]
@@ -94,40 +113,38 @@ def get_mhc_core_positions_class2(
                 core_positions.append(-1)
 
         offset += len(seq)
-    assert -1 not in core_positions # probably should handle this better
+    assert -1 not in core_positions  # probably should handle this better
     return core_positions
 
 
 def _setup_class_2_alfas_blast_dbs():
-    ''' Just called once during setup
-    '''
+    """Just called once during setup"""
 
-    makeblastdb_exe = str(path_to_blast_executables / 'makeblastdb')
+    makeblastdb_exe = str("makeblastdb")
 
-    for ab in 'AB':
-        alfas_fname = str(path_to_db / f'both_class_2_{ab}_chains_v2.alfas')
-        fasta_fname = alfas_fname[:-5]+'fasta'
+    for ab in "AB":
+        alfas_fname = str(path_to_db / f"both_class_2_{ab}_chains_v2.alfas")
+        fasta_fname = alfas_fname[:-5] + "fasta"
 
         alfas = sequtil.read_fasta(alfas_fname)
-        out = open(fasta_fname, 'w')
-        for name,alseq in alfas.items():
-            seq = alseq.replace(ALL_GENES_GAP_CHAR, '')
-            out.write(f'>{name}\n{seq}\n')
+        out = open(fasta_fname, "w")
+        for name, alseq in alfas.items():
+            seq = alseq.replace(ALL_GENES_GAP_CHAR, "")
+            out.write(f">{name}\n{seq}\n")
         out.close()
 
         # format the db
-        cmd = f'{makeblastdb_exe} -in {fasta_fname} -dbtype prot'
+        cmd = f"{makeblastdb_exe} -in {fasta_fname} -dbtype prot"
         print(cmd)
         system(cmd)
 
 
-
 def get_mhc_stub(
-        pose,
-        tdinfo = None,
-        mhc_core_positions = None,
+    pose,
+    tdinfo=None,
+    mhc_core_positions=None,
 ):
-    ''' chain 1 (or 1 and 2) should be the mhc chain
+    """chain 1 (or 1 and 2) should be the mhc chain
     chain -3 should be the peptide chain if TCR else chain -1 (pMHC only)
 
     the mhc_core_positions (or tdinfo.mhc_core) are half on one side of the
@@ -146,61 +163,66 @@ def get_mhc_stub(
     dict {'axes':numpy array with axes as ROW vectors,
           'origin': vector}
 
-    '''
+    """
 
     if mhc_core_positions is None:
         if tdinfo is not None:
             mhc_core_positions = tdinfo.mhc_core
         else:
-            print('get_mhc_stub needs either tdinfo or mhc_core_positions')
+            print("get_mhc_stub needs either tdinfo or mhc_core_positions")
             sys.exit()
 
-    num_chains = len(pose['chains'])
-    if num_chains in [4,5]:
-        pep_chain = num_chains-3
+    num_chains = len(pose["chains"])
+    if num_chains in [4, 5]:
+        pep_chain = num_chains - 3
     else:
         # huh, maybe mhc only?
-        assert num_chains in [2,3]
-        pep_chain = num_chains-1
+        assert num_chains in [2, 3]
+        pep_chain = num_chains - 1
 
     # we may need to flip
-    #current_x = stub.rotation().col_x()
+    # current_x = stub.rotation().col_x()
     # was hardcoding pep_chain to 2 here...
-    chainbounds = pose['chainbounds']
-    pep_positions = list(range(chainbounds[pep_chain], chainbounds[pep_chain+1]))
-    pep_centroid = np.mean(pose['ca_coords'][pep_positions], axis=0)
+    chainbounds = pose["chainbounds"]
+    pep_positions = list(
+        range(chainbounds[pep_chain], chainbounds[pep_chain + 1])
+    )
+    pep_centroid = np.mean(pose["ca_coords"][pep_positions], axis=0)
 
     stub = superimpose.get_symmetry_stub_from_positions(
-        mhc_core_positions, pose, point_towards=pep_centroid)
+        mhc_core_positions, pose, point_towards=pep_centroid
+    )
 
-    assert (pep_centroid - stub['origin']).dot(stub['axes'][0]) > 0
+    assert (pep_centroid - stub["origin"]).dot(stub["axes"][0]) > 0
     return stub
 
 
 def orient_pmhc_pose(
-        pose,
-        mhc_core_positions = None,
-        mhc_class = None,
-        tdinfo = None, # TCRdockInfo
+    pose,
+    mhc_core_positions=None,
+    mhc_class=None,
+    tdinfo=None,  # TCRdockInfo
 ):
-    ''' chain 1 should be the mhc chain
+    """chain 1 should be the mhc chain
     chain 2 should be the peptide chain
-    '''
+    """
     if mhc_core_positions is None:
         if tdinfo is None:
-            cs = pose['chainseq'].split('/')
+            cs = pose["chainseq"].split("/")
             if mhc_class == 1:
                 mhc_core_positions = get_mhc_core_positions_class1(cs[0])
             else:
                 assert mhc_class == 2
-                mhc_core_positions = get_mhc_core_positions_class2(cs[0], cs[1])
+                mhc_core_positions = get_mhc_core_positions_class2(
+                    cs[0], cs[1]
+                )
         else:
             mhc_core_positions = tdinfo.mhc_core
 
     stub = get_mhc_stub(pose, mhc_core_positions=mhc_core_positions)
 
-    R = stub['axes']
-    v = -1. * R @ stub['origin']
+    R = stub["axes"]
+    v = -1.0 * R @ stub["origin"]
 
     pose = pdblite.apply_transform_Rx_plus_v(pose, R, v)
 
@@ -208,55 +230,51 @@ def orient_pmhc_pose(
 
 
 def make_sorting_tuple(allele):
-    ''' Helper function for choosing among equal blast hits
-    '''
-    if '*' not in allele:
-        assert ':' not in allele
-        return (allele,0)
+    """Helper function for choosing among equal blast hits"""
+    if "*" not in allele:
+        assert ":" not in allele
+        return (allele, 0)
     else:
-        pref, suf = allele.split('*') # just one star
+        pref, suf = allele.split("*")  # just one star
         if suf[-1].isdigit():
-            suf = [int(x) for x in suf.split(':')]
+            suf = [int(x) for x in suf.split(":")]
         else:
-            suf = [int(x) for x in suf[:-1].split(':')]+[suf[-1]]
-
-        return tuple([pref]+suf)
+            suf = [int(x) for x in suf[:-1].split(":")] + [suf[-1]]
 
+        return tuple([pref] + suf)
 
 
 def get_mhc_allele(seq, organism, return_identity=False):
-    if organism == 'human':
-        dbfile = (util.path_to_db /
-                  'hla_prot_plus_trimmed_minus_funny_w_CD1s_nr_v2.fasta')
+    if organism == "human":
+        dbfile = (
+            util.path_to_db
+            / "hla_prot_plus_trimmed_minus_funny_w_CD1s_nr_v2.fasta"
+        )
     else:
-        assert organism == 'mouse'
-        dbfile = (util.path_to_db /
-                  'mhc_pdb_chains_mouse_reps_reformat.fasta')
-
+        assert organism == "mouse"
+        dbfile = util.path_to_db / "mhc_pdb_chains_mouse_reps_reformat.fasta"
 
-    hits = blast.blast_sequence_and_read_hits(
-        seq, dbfile, num_alignments=3)
+    hits = blast.blast_sequence_and_read_hits(seq, dbfile, num_alignments=3)
     top_bitscore = max(hits.bitscore)
     top_hits = hits[hits.bitscore == top_bitscore].reset_index()
-    if top_hits.shape[0]>1:# ties
-        #print('ties:', top_hits.shape[0])
-        sortl = [(make_sorting_tuple(a), a, i) for i,a in enumerate(top_hits.saccver)]
+    if top_hits.shape[0] > 1:  # ties
+        # print('ties:', top_hits.shape[0])
+        sortl = [
+            (make_sorting_tuple(a), a, i)
+            for i, a in enumerate(top_hits.saccver)
+        ]
         sortl.sort()
-        #print('sorted:', sortl)
+        # print('sorted:', sortl)
         ind = sortl[0][-1]
         hit = top_hits.iloc[ind]
         assert hit.saccver == sortl[0][1]
     else:
         hit = top_hits.iloc[0]
     mhc = hit.saccver
-    if organism == 'mouse' and len(mhc) == 3 and mhc[1] == '-':
-        mhc = f'H2{mhc[0]}{mhc[2].lower()}'
+    if organism == "mouse" and len(mhc) == 3 and mhc[1] == "-":
+        mhc = f"H2{mhc[0]}{mhc[2].lower()}"
 
     if return_identity:
         return mhc, hit.pident
     else:
         return mhc
-
-
-
-
diff --git a/tcrdock/tcrdist/parsing.py b/tcrdock/tcrdist/parsing.py
index 45c6671..7fed30a 100644
--- a/tcrdock/tcrdist/parsing.py
+++ b/tcrdock/tcrdist/parsing.py
@@ -11,45 +11,59 @@
 from .all_genes import all_genes, gap_character
 from .genetic_code import genetic_code, reverse_genetic_code
 from . import logo_tools
-from ..blast import (blast_sequence_and_read_hits, setup_query_to_hit_map,
-                     path_to_blast_executables)
+from ..blast import blast_sequence_and_read_hits, setup_query_to_hit_map
+
 
 def get_blast_db_path(organism, ab, vj):
-    ''' This is the protein sequence db
-    '''
-    return path_to_db / f'imgt_prot_blast_db_{organism}_{ab}_{vj}.fasta'
+    """This is the protein sequence db"""
+    return path_to_db / f"imgt_prot_blast_db_{organism}_{ab}_{vj}.fasta"
 
 
 ## borrowed from cdr3s_human.py
 ## 1-indexed:
-extra_alignment_columns_1x = { 'mouse':{'A':[9,86],'B':[] }, ## 1-indexed
-                               'human':{'A':[],'B':[] } }
+extra_alignment_columns_1x = {
+    "mouse": {"A": [9, 86], "B": []},  ## 1-indexed
+    "human": {"A": [], "B": []},
+}
 
 
 core_positions_generic_1x = [
-    21, 23, 25,   ##  23 is C
-    39, 41,       ##  41 is W
-    53, 54, 55,
-    78,           ##            maybe also 80?
-    89,           ##  89 is L
-    102, 103, 104 ## 104 is C
+    21,
+    23,
+    25,  ##  23 is C
+    39,
+    41,  ##  41 is W
+    53,
+    54,
+    55,
+    78,  ##            maybe also 80?
+    89,  ##  89 is L
+    102,
+    103,
+    104,  ## 104 is C
 ]
 
 all_core_alseq_positions_0x = {}
 for organism in extra_alignment_columns_1x:
-    for ab,xcols in extra_alignment_columns_1x[organism].items():
-        positions = [ x-1+sum(y<=x for y in xcols) # this is not quite right but it wrks
-                      for x in core_positions_generic_1x ]
-
-        if False:# helpful for debugging/info
-            for v,g in all_genes[organism].items():
-                allele = int(v.split('*')[-1]) # sanity check allele names end with *int
-                if g.chain == ab and g.region == 'V' and v.endswith('*01'):
-                    coreseq = ''.join([g.alseq[x] for x in positions])
-                    if '*' in coreseq or coreseq[1]+coreseq[-1] != 'CC':
-                        print('funny coreseq:', coreseq, v, organism, ab)
-
-        all_core_alseq_positions_0x[(organism,ab)] = positions
+    for ab, xcols in extra_alignment_columns_1x[organism].items():
+        positions = [
+            x
+            - 1
+            + sum(y <= x for y in xcols)  # this is not quite right but it wrks
+            for x in core_positions_generic_1x
+        ]
+
+        if False:  # helpful for debugging/info
+            for v, g in all_genes[organism].items():
+                allele = int(
+                    v.split("*")[-1]
+                )  # sanity check allele names end with *int
+                if g.chain == ab and g.region == "V" and v.endswith("*01"):
+                    coreseq = "".join([g.alseq[x] for x in positions])
+                    if "*" in coreseq or coreseq[1] + coreseq[-1] != "CC":
+                        print("funny coreseq:", coreseq, v, organism, ab)
+
+        all_core_alseq_positions_0x[(organism, ab)] = positions
 
 
 def get_core_positions_0x(organism, v_gene):
@@ -62,40 +76,39 @@ def get_core_positions_0x(organism, v_gene):
         core_positions_0x.append(pos - alseq[:pos].count(gap_character))
     return core_positions_0x
 
+
 def make_blast_dbs():
-    makeblastdb_exe = str(path_to_blast_executables / 'makeblastdb')
+    makeblastdb_exe = str("makeblastdb")
 
     # protein sequences
     for organism in all_genes:
-        for ab in 'AB':
-            for vj in 'VJ':
+        for ab in "AB":
+            for vj in "VJ":
                 pname = get_blast_db_path(organism, ab, vj)
-                out = open(pname, 'w')
+                out = open(pname, "w")
                 for id, g in all_genes[organism].items():
                     if g.chain == ab and g.region == vj:
-                        out.write(f'>{id}\n{g.protseq}\n')
+                        out.write(f">{id}\n{g.protseq}\n")
                 out.close()
-                print('made:', pname)
+                print("made:", pname)
 
                 # format the db
-                cmd = f'{makeblastdb_exe} -in {pname} -dbtype prot'
+                cmd = f"{makeblastdb_exe} -in {pname} -dbtype prot"
                 print(cmd)
                 system(cmd)
 
 
-
-
-def parse_core_positions( organism, ab, qseq, v_gene, q2v_align ):
-    ''' returns qcore_positions, mismatches
+def parse_core_positions(organism, ab, qseq, v_gene, q2v_align):
+    """returns qcore_positions, mismatches
 
     qcore_positions is a list of 0-indexed core positions in qseq, may contain -1 if
     the q2v_align alignment does not cover all the v_gene core positions
-    '''
+    """
     vseq = all_genes[organism][v_gene].protseq
 
     core_positions = get_core_positions_0x(organism, v_gene)
 
-    qcore_positions = [-1]*len(core_positions)
+    qcore_positions = [-1] * len(core_positions)
     mismatches = 0
 
     for qpos, vpos in q2v_align.items():
@@ -112,78 +125,84 @@ def parse_other_cdrs(organism, ab, qseq, v_gene, q2v_align):
     alseq = all_genes[organism][v_gene].alseq
     cdr_cols_1x = all_genes[organism][v_gene].cdr_columns
 
-    assert alseq.replace(gap_character,'') == v_seq
-    alseq2seq = {i:i-alseq[:i].count(gap_character)
-                 for i,a in enumerate(alseq)
-                 if a != gap_character}
+    assert alseq.replace(gap_character, "") == v_seq
+    alseq2seq = {
+        i: i - alseq[:i].count(gap_character)
+        for i, a in enumerate(alseq)
+        if a != gap_character
+    }
 
-    assert len(cdr_cols_1x) == 4 ## CDR1, CDR2, CDR2.5, and CDR3(start)
+    assert len(cdr_cols_1x) == 4  ## CDR1, CDR2, CDR2.5, and CDR3(start)
 
     qseq_cdr_bounds = []
     cdr_mismatches = 0
     for start_col_1x, stop_col_1x in cdr_cols_1x[:3]:
-        start = alseq2seq[start_col_1x-1]
-        stop  = alseq2seq[stop_col_1x-1]
+        start = alseq2seq[start_col_1x - 1]
+        stop = alseq2seq[stop_col_1x - 1]
 
         ## what aligns to this region in v_hit
-        q_loop_positions = [i for i,j in q2v_align.items() if start <= j <= stop]
+        q_loop_positions = [
+            i for i, j in q2v_align.items() if start <= j <= stop
+        ]
         if q_loop_positions:
-            qstart = min( q_loop_positions )
-            qstop  = max( q_loop_positions )
+            qstart = min(q_loop_positions)
+            qstop = max(q_loop_positions)
             qseq_cdr_bounds.append([qstart, qstop])
-            for i in range(qstart,qstop+1):
+            for i in range(qstart, qstop + 1):
                 if i not in q2v_align or v_seq[q2v_align[i]] != qseq[i]:
                     cdr_mismatches += 1
         else:
-            qseq_cdr_bounds.append((None,None))
-            cdr_mismatches += stop-start+1
+            qseq_cdr_bounds.append((None, None))
+            cdr_mismatches += stop - start + 1
     return qseq_cdr_bounds, cdr_mismatches
 
 
 def parse_cdr3(organism, ab, qseq, v_gene, j_gene, q2v_align, q2j_align):
-    ''' returns (C position, F position) ie (CDR3 start, CDR3 stop)
+    """returns (C position, F position) ie (CDR3 start, CDR3 stop)
     in 0-indexed qseq numbering
     either will be None if parsing failed
-    '''
+    """
     vg = all_genes[organism][v_gene]
     jg = all_genes[organism][j_gene]
 
     ## what is the C position in this v gene?
-    cpos = vg.cdr_columns[3][0]-1 ## now 0-indexed, vg.alseq numbers
-    cpos -= vg.alseq[:cpos].count(gap_character) # now vg.protseq numbers
+    cpos = vg.cdr_columns[3][0] - 1  ## now 0-indexed, vg.alseq numbers
+    cpos -= vg.alseq[:cpos].count(gap_character)  # now vg.protseq numbers
 
-    v2q_align = {j:i for i,j in q2v_align.items()}
+    v2q_align = {j: i for i, j in q2v_align.items()}
     query_cpos = v2q_align.get(cpos, None)
 
     # V mismatches outside cdr3 region
-    v_mismatches = sum(qseq[i] != vg.protseq[j]
-                       for i,j in q2v_align.items()
-                       if j < cpos)
+    v_mismatches = sum(
+        qseq[i] != vg.protseq[j] for i, j in q2v_align.items() if j < cpos
+    )
 
-    fpos = jg.cdr_columns[0][1]-1 # 0-idx jg.alseq numbers
-    fpos -= jg.alseq[:fpos].count(gap_character) # 0-idx, jg.protseq numbers
+    fpos = jg.cdr_columns[0][1] - 1  # 0-idx jg.alseq numbers
+    fpos -= jg.alseq[:fpos].count(gap_character)  # 0-idx, jg.protseq numbers
 
-    j2q_align = {j:i for i,j in q2j_align.items()}
+    j2q_align = {j: i for i, j in q2j_align.items()}
     query_fpos = j2q_align.get(fpos, None)
 
-    j_mismatches = sum(qseq[i] != jg.protseq[j]
-                       for i,j in q2j_align.items()
-                       if j > fpos)
+    j_mismatches = sum(
+        qseq[i] != jg.protseq[j] for i, j in q2j_align.items() if j > fpos
+    )
 
     return [query_cpos, query_fpos], v_mismatches, j_mismatches
 
 
 def get_top_blast_hit_with_allele_sorting(hits):
-    ''' In the case of ties, prefer lower allele numbers
-    '''
+    """In the case of ties, prefer lower allele numbers"""
     top_bitscore = max(hits.bitscore)
-    top_hits = hits[hits.bitscore==top_bitscore].copy()
-    #print('top_bitscore:', top_bitscore, 'ties:', top_hits.shape[0])
-    top_hits['allele'] = top_hits.saccver.str.split('*').str.get(-1).astype(int)
-    return top_hits.sort_values('allele').iloc[0].copy()
+    top_hits = hits[hits.bitscore == top_bitscore].copy()
+    # print('top_bitscore:', top_bitscore, 'ties:', top_hits.shape[0])
+    top_hits["allele"] = (
+        top_hits.saccver.str.split("*").str.get(-1).astype(int)
+    )
+    return top_hits.sort_values("allele").iloc[0].copy()
+
 
 def parse_tcr_sequence(organism, chain, sequence):
-    ''' return dict with keys
+    """return dict with keys
     * cdr_loops
     * core_positions
     * v_gene
@@ -199,16 +218,16 @@ def parse_tcr_sequence(organism, chain, sequence):
 
     returns None upon parse failure
 
-    '''
-    assert chain in 'AB'
+    """
+    assert chain in "AB"
 
-    tmpdbfile = get_blast_db_path(organism, chain, 'V')
+    tmpdbfile = get_blast_db_path(organism, chain, "V")
     if not exists(tmpdbfile):
-        make_blast_dbs() # only need to do this once
+        make_blast_dbs()  # only need to do this once
         assert exists(tmpdbfile)
 
     top_hits = []
-    for vj in 'VJ':
+    for vj in "VJ":
         dbfile = get_blast_db_path(organism, chain, vj)
         hits = blast_sequence_and_read_hits(sequence, dbfile)
 
@@ -225,11 +244,21 @@ def parse_tcr_sequence(organism, chain, sequence):
         j_gene = j_hit.saccver
 
         cdr3_bounds, v_mismatches, j_mismatches = parse_cdr3(
-            organism, chain, sequence, v_gene, j_gene, q2v_align, q2j_align,
+            organism,
+            chain,
+            sequence,
+            v_gene,
+            j_gene,
+            q2v_align,
+            q2j_align,
         )
 
         other_cdr_bounds, other_cdr_mismatches = parse_other_cdrs(
-            organism, chain, sequence, v_gene, q2v_align,
+            organism,
+            chain,
+            sequence,
+            v_gene,
+            q2v_align,
         )
 
         core_positions, core_mismatches = parse_core_positions(
@@ -242,21 +271,21 @@ def parse_tcr_sequence(organism, chain, sequence):
 
         cdr_loops = other_cdr_bounds + [cdr3_bounds]
         if any(None in bounds for bounds in cdr_loops):
-            return {} # signal failure ### EARLY RETURN!!!
+            return {}  # signal failure ### EARLY RETURN!!!
 
         result = {
-            'cdr_loops':cdr_loops,
-            'core_positions':core_positions,
-            'v_gene': v_gene,
-            'j_gene': j_gene,
-            'mismatches':{
-                'v_mismatches':v_mismatches,
-                'j_mismatches':j_mismatches,
-                'other_cdr_mismatches':other_cdr_mismatches,
-                'core_mismatches':core_mismatches,
+            "cdr_loops": cdr_loops,
+            "core_positions": core_positions,
+            "v_gene": v_gene,
+            "j_gene": j_gene,
+            "mismatches": {
+                "v_mismatches": v_mismatches,
+                "j_mismatches": j_mismatches,
+                "other_cdr_mismatches": other_cdr_mismatches,
+                "core_mismatches": core_mismatches,
             },
         }
         return result
     else:
-        print('failed to find v and j matches')
+        print("failed to find v and j matches")
         return None

From b8659176af42351b9a9c02a7cf496fa82c07f5c5 Mon Sep 17 00:00:00 2001
From: Lawson <lwoods@tgen.org>
Date: Fri, 13 Jun 2025 14:00:09 -0700
Subject: [PATCH 17/19] fix subdir paths

---
 pyproject.toml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index d7941d4..9860c78 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,8 +14,9 @@ dependencies = [
     "matplotlib",
 ]
 
-[tool.setuptools]
-packages = ["tcrdock"]
+[tool.setuptools.packages.find]
+where   = ["."]
+include = ["tcrdock*"] 
 
 [tool.setuptools.package-data]
 tcrdock = [

From 5532b115dc3101bd99fff54b19c6e28d1dd01b9e Mon Sep 17 00:00:00 2001
From: Lawson <lwoods@tgen.org>
Date: Fri, 13 Jun 2025 16:01:12 -0700
Subject: [PATCH 18/19] typo

---
 tcrdock/mhc_util.py     |  20 +---
 tcrdock/sequtil.py      | 222 +++++++++++-----------------------------
 tcrdock/tcrdock_info.py |  16 +--
 3 files changed, 67 insertions(+), 191 deletions(-)

diff --git a/tcrdock/mhc_util.py b/tcrdock/mhc_util.py
index d9b86b4..04c19af 100644
--- a/tcrdock/mhc_util.py
+++ b/tcrdock/mhc_util.py
@@ -88,9 +88,7 @@ def get_mhc_core_positions_class2(
         hit = hits.iloc[0]
         blast_align = blast.setup_query_to_hit_map(hit)
         if hit.pident < 99.99:
-            print(
-                "get_mhc_core_positions_class2:", ab, hit.saccver, hit.pident
-            )
+            print("get_mhc_core_positions_class2:", ab, hit.saccver, hit.pident)
 
         alfas = sequtil.mhc_class_2_alfas[ab][hit.saccver]
         hitseq = alfas.replace(ALL_GENES_GAP_CHAR, "")
@@ -184,9 +182,7 @@ def get_mhc_stub(
     # current_x = stub.rotation().col_x()
     # was hardcoding pep_chain to 2 here...
     chainbounds = pose["chainbounds"]
-    pep_positions = list(
-        range(chainbounds[pep_chain], chainbounds[pep_chain + 1])
-    )
+    pep_positions = list(range(chainbounds[pep_chain], chainbounds[pep_chain + 1]))
     pep_centroid = np.mean(pose["ca_coords"][pep_positions], axis=0)
 
     stub = superimpose.get_symmetry_stub_from_positions(
@@ -213,9 +209,7 @@ def orient_pmhc_pose(
                 mhc_core_positions = get_mhc_core_positions_class1(cs[0])
             else:
                 assert mhc_class == 2
-                mhc_core_positions = get_mhc_core_positions_class2(
-                    cs[0], cs[1]
-                )
+                mhc_core_positions = get_mhc_core_positions_class2(cs[0], cs[1])
         else:
             mhc_core_positions = tdinfo.mhc_core
 
@@ -247,8 +241,7 @@ def make_sorting_tuple(allele):
 def get_mhc_allele(seq, organism, return_identity=False):
     if organism == "human":
         dbfile = (
-            util.path_to_db
-            / "hla_prot_plus_trimmed_minus_funny_w_CD1s_nr_v2.fasta"
+            util.path_to_db / "hla_prot_plus_trimmed_minus_funny_w_CD1s_nr_v2.fasta"
         )
     else:
         assert organism == "mouse"
@@ -259,10 +252,7 @@ def get_mhc_allele(seq, organism, return_identity=False):
     top_hits = hits[hits.bitscore == top_bitscore].reset_index()
     if top_hits.shape[0] > 1:  # ties
         # print('ties:', top_hits.shape[0])
-        sortl = [
-            (make_sorting_tuple(a), a, i)
-            for i, a in enumerate(top_hits.saccver)
-        ]
+        sortl = [(make_sorting_tuple(a), a, i) for i, a in enumerate(top_hits.saccver)]
         sortl.sort()
         # print('sorted:', sortl)
         ind = sortl[0][-1]
diff --git a/tcrdock/sequtil.py b/tcrdock/sequtil.py
index 8a852bb..3f8f88b 100644
--- a/tcrdock/sequtil.py
+++ b/tcrdock/sequtil.py
@@ -34,12 +34,8 @@
 human_structure_alignments.set_index("v_gene", drop=True, inplace=True)
 
 # human and mouse
-both_structure_alignments = pd.read_table(
-    path_to_db / "new_both_vg_alignments_v1.tsv"
-)
-both_structure_alignments.set_index(
-    ["organism", "v_gene"], drop=True, inplace=True
-)
+both_structure_alignments = pd.read_table(path_to_db / "new_both_vg_alignments_v1.tsv")
+both_structure_alignments.set_index(["organism", "v_gene"], drop=True, inplace=True)
 
 
 def read_fasta(filename):  # helper
@@ -159,9 +155,7 @@ def read_fasta(filename):  # helper
 all_template_poses = {TCR: {}, PMHC: {}, TERNARY: {}}
 
 BAD_DGEOM_PDBIDS = "5sws 7jwi 4jry 4nhu 3tjh 4y19 4y1a 1ymm 2wbj 6uz1".split()
-BAD_PMHC_PDBIDS = (
-    "3rgv 4ms8 6v1a 6v19 6v18 6v15 6v13 6v0y 2uwe 2jcc 2j8u 1lp9".split()
-)
+BAD_PMHC_PDBIDS = "3rgv 4ms8 6v1a 6v19 6v18 6v15 6v13 6v0y 2uwe 2jcc 2j8u 1lp9".split()
 # (new) 6uz1 is engineered and binds down by B2M
 # 3rgv has cterm of peptide out of groove
 # 4ms8 has chainbreaks (should check for those!)
@@ -191,9 +185,7 @@ def read_fasta(filename):  # helper
 
 def get_tcrdister(organism):
     if organism not in _cached_tcrdisters:
-        _cached_tcrdisters[organism] = tcrdist.tcr_distances.TcrDistCalculator(
-            organism
-        )
+        _cached_tcrdisters[organism] = tcrdist.tcr_distances.TcrDistCalculator(organism)
     return _cached_tcrdisters[organism]
 
 
@@ -207,7 +199,7 @@ def blosum_align(
 ):
     """return 0-indexed dictionary mapping from seq1 to seq2 positions"""
 
-    scorematrix = substitution_matrix.load("BLOSUM62")
+    scorematrix = substitution_matrices.load("BLOSUM62")
 
     if global_align:
         alignments = pairwise2.align.globalds(
@@ -294,9 +286,7 @@ def get_j_seq_after_cdr3(organism, j_gene):
 for organism in extra_alignment_columns_1x:
     for ab, xcols in extra_alignment_columns_1x[organism].items():
         positions = [
-            x
-            - 1
-            + sum(y <= x for y in xcols)  # this is not quite right but it wrks
+            x - 1 + sum(y <= x for y in xcols)  # this is not quite right but it wrks
             for x in core_positions_generic_1x
         ]
 
@@ -336,16 +326,12 @@ def align_chainseq_to_imgt_msa(organism, chainseq, v_gene):
 
     chainseq_to_geneseq = blosum_align(chainseq, geneseq)
 
-    chainseq_to_alseq = {
-        i: geneseq_to_alseq[j] for i, j in chainseq_to_geneseq.items()
-    }
+    chainseq_to_alseq = {i: geneseq_to_alseq[j] for i, j in chainseq_to_geneseq.items()}
 
     return chainseq_to_alseq
 
 
-def align_chainseq_to_structure_msa(
-    organism, chainseq, v_gene, msa_type="both"
-):
+def align_chainseq_to_structure_msa(organism, chainseq, v_gene, msa_type="both"):
     assert msa_type in ["both", "human"]
 
     if msa_type == "human":
@@ -369,9 +355,7 @@ def align_chainseq_to_structure_msa(
 
     chainseq_to_geneseq = blosum_align(chainseq, geneseq)
 
-    chainseq_to_alseq = {
-        i: geneseq_to_alseq[j] for i, j in chainseq_to_geneseq.items()
-    }
+    chainseq_to_alseq = {i: geneseq_to_alseq[j] for i, j in chainseq_to_geneseq.items()}
 
     return chainseq_to_alseq
 
@@ -388,9 +372,7 @@ def align_tcr_info_pdb_chain_to_structure_msa(pdbid, ab, msa_type_in):
             "both": {"A": {}, "B": {}},
             "human": {"A": {}, "B": {}},
         }
-        print(
-            "setting up cache for align_tcr_info_pdb_chain_to_structure_msa function"
-        )
+        print("setting up cache for align_tcr_info_pdb_chain_to_structure_msa function")
         for l in tcr_info.itertuples():
             geneseq = get_v_seq_up_to_cys(l.organism, l.v_gene)
             for msa_type in ["both", "human"]:
@@ -411,12 +393,9 @@ def align_tcr_info_pdb_chain_to_structure_msa(pdbid, ab, msa_type_in):
                 chainseq_to_geneseq = blosum_align(l.chainseq, geneseq)
 
                 chainseq_to_alseq = {
-                    i: geneseq_to_alseq[j]
-                    for i, j in chainseq_to_geneseq.items()
+                    i: geneseq_to_alseq[j] for i, j in chainseq_to_geneseq.items()
                 }
-                _tcr_alignment_cache[msa_type][l.ab][
-                    l.pdbid
-                ] = chainseq_to_alseq
+                _tcr_alignment_cache[msa_type][l.ab][l.pdbid] = chainseq_to_alseq
 
         print(
             "DONE setting up cache for align_tcr_info_pdb_chain_to_structure_msa",
@@ -463,9 +442,7 @@ def get_tcr_chain_trim_positions(organism, chainseq, v, j, cdr3):
             print("N None:", organism, v)
             geneseq_positions.append(None)
         else:
-            geneseq_positions.append(
-                pos - alseq[:pos].count(ALL_GENES_GAP_CHAR)
-            )
+            geneseq_positions.append(pos - alseq[:pos].count(ALL_GENES_GAP_CHAR))
 
     start, stop = core_positions_0x[0], core_positions_0x[9]  # inclusive!
     geneseq_positions.extend(range(start, stop + 1))
@@ -529,9 +506,7 @@ def align_vgene_to_template_pdb_chain(
 
     # align V regions
     msa_type = (
-        "human"
-        if trg_organism == "human" and tmp_organism == "human"
-        else "both"
+        "human" if trg_organism == "human" and tmp_organism == "human" else "both"
     )
 
     trg_vseq_to_alseq = trg_msa_alignments[msa_type]
@@ -678,8 +653,7 @@ def get_mhc_class_1_alseq(allele):
     for k in mhc_class_1_alfas:
         if k.startswith(allele) and k[len(allele)] == ":":
             suffix = [
-                int(x) if x.isdigit() else 100
-                for x in k[len(allele) + 1 :].split(":")
+                int(x) if x.isdigit() else 100 for x in k[len(allele) + 1 :].split(":")
             ]
             sortl.append((suffix, k))
     if sortl:
@@ -702,8 +676,7 @@ def get_mhc_class_2_alseq(chain, allele):
     for k in mhc_class_2_alfas[chain]:
         if k.startswith(allele) and k[len(allele)] == ":":
             suffix = [
-                int(x) if x.isdigit() else 100
-                for x in k[len(allele) + 1 :].split(":")
+                int(x) if x.isdigit() else 100 for x in k[len(allele) + 1 :].split(":")
             ]
             sortl.append((suffix, k))
     if sortl:
@@ -854,8 +827,7 @@ def get_clean_and_nonredundant_ternary_tcrs_df(
         }
 
         tcr_tuples = [
-            ((l.va, l.ja, l.cdr3a), (l.vb, l.jb, l.cdr3b))
-            for l in tcrs.itertuples()
+            ((l.va, l.ja, l.cdr3a), (l.vb, l.jb, l.cdr3b)) for l in tcrs.itertuples()
         ]
 
         is_redundant = [False] * tcrs.shape[0]
@@ -868,19 +840,12 @@ def get_clean_and_nonredundant_ternary_tcrs_df(
                     continue
 
                 pep_mms = count_peptide_mismatches(irow.pep_seq, jrow.pep_seq)
-                if (
-                    irow.organism == jrow.organism
-                    and irow.mhc_class == jrow.mhc_class
-                ):
+                if irow.organism == jrow.organism and irow.mhc_class == jrow.mhc_class:
                     tcrd = tdist[irow.organism](tcr_tuples[i], tcr_tuples[j])
                     pep_red = pep_mms < min_peptide_mismatches
                     tcr_red = tcrd < min_tcrdist
-                    if (
-                        peptide_tcrdist_logical == "or"
-                        and (pep_red or tcr_red)
-                    ) or (
-                        peptide_tcrdist_logical == "and"
-                        and (pep_red and tcr_red)
+                    if (peptide_tcrdist_logical == "or" and (pep_red or tcr_red)) or (
+                        peptide_tcrdist_logical == "and" and (pep_red and tcr_red)
                     ):
 
                         # redundant
@@ -980,10 +945,7 @@ def filter_templates_by_peptide_mismatches(
 
     templates["filt_peptide_mismatches"] = np.array(
         [
-            min(
-                count_peptide_mismatches(p, l.pep_seq)
-                for p in peptides_for_filtering
-            )
+            min(count_peptide_mismatches(p, l.pep_seq) for p in peptides_for_filtering)
             for l in templates.itertuples()
         ]
     )
@@ -1032,9 +994,7 @@ def pick_dgeom_templates(
     if peptides_for_filtering is None:
         peptides_for_filtering = []
     elif peptides_for_filtering:
-        assert (
-            peptide in peptides_for_filtering
-        )  # sanity? seems like should be true
+        assert peptide in peptides_for_filtering  # sanity? seems like should be true
 
     if exclude_pdbids is None:
         exclude_pdbids = []
@@ -1084,10 +1044,7 @@ def pick_dgeom_templates(
 
     # currently unused...
     templates["peptide_mismatches"] = np.array(
-        [
-            count_peptide_mismatches(peptide, l.pep_seq)
-            for l in templates.itertuples()
-        ]
+        [count_peptide_mismatches(peptide, l.pep_seq) for l in templates.itertuples()]
     )
 
     def calc_chain_tcrdist(row1, row2, chain, tcrdister=tcrdister):
@@ -1346,17 +1303,11 @@ def show_alignment(al, seq1, seq2):
         if organism == "human":
             # now adding HLA-E 2022-05-03
             assert (
-                mhc_allele[0] in "ABCE"
-                and mhc_allele[1] == "*"
-                and ":" in mhc_allele
+                mhc_allele[0] in "ABCE" and mhc_allele[1] == "*" and ":" in mhc_allele
             )
-            mhc_allele = ":".join(
-                mhc_allele.split(":")[:2]
-            )  # just the 4 digits
+            mhc_allele = ":".join(mhc_allele.split(":")[:2])  # just the 4 digits
         else:
-            assert (
-                mhc_allele.startswith("H2") and mhc_allele in mhc_class_1_alfas
-            )
+            assert mhc_allele.startswith("H2") and mhc_allele in mhc_class_1_alfas
 
         # first: MHC part
         trg_mhc_alseq = get_mhc_class_1_alseq(mhc_allele)
@@ -1408,13 +1359,9 @@ def show_alignment(al, seq1, seq2):
                         alt_self_peptides,
                     )
                 continue
-            assert (
-                len(peptide) - pep_idents >= min_pmhc_peptide_mismatches
-            )  # sanity
+            assert len(peptide) - pep_idents >= min_pmhc_peptide_mismatches  # sanity
             total = len(peptide) + len(trg_mhc_seq)
-            frac = (
-                mhc_idents + pep_idents
-            ) / total - 0.01 * l.mhc_total_chainbreak
+            frac = (mhc_idents + pep_idents) / total - 0.01 * l.mhc_total_chainbreak
             sortl.append((frac, l.Index))
 
         sortl.sort(reverse=True)
@@ -1469,18 +1416,13 @@ def show_alignment(al, seq1, seq2):
             trg_pmhc_seq = trg_mhc_seq + peptide
             tmp_pmhc_seq = tmp_mhc_seq + templatel.pep_seq
             identities = sum(
-                trg_pmhc_seq[i] == tmp_pmhc_seq[j]
-                for i, j in trg_to_tmp.items()
+                trg_pmhc_seq[i] == tmp_pmhc_seq[j] for i, j in trg_to_tmp.items()
             ) / len(trg_pmhc_seq)
-            identities_for_sorting = (
-                identities - 0.01 * templatel.mhc_total_chainbreak
-            )
+            identities_for_sorting = identities - 0.01 * templatel.mhc_total_chainbreak
             assert abs(identities_for_sorting - idents) < 1e-3
 
             if verbose:
-                print(
-                    f"oldnew mhc_idents: {idents:6.3f} {identities:6.3f} {pdbid}"
-                )
+                print(f"oldnew mhc_idents: {idents:6.3f} {identities:6.3f} {pdbid}")
             show_alignment(trg_to_tmp, trg_pmhc_seq, tmp_pmhc_seq)
             pmhc_alignments.append(
                 (
@@ -1531,9 +1473,7 @@ def show_alignment(al, seq1, seq2):
                 [tmp_mhca_alseq, tmp_mhcb_alseq, l.pep_seq],
             ):
                 assert len(a) == len(b)
-                idents += sum(
-                    x == y for x, y in zip(a, b) if x != ALL_GENES_GAP_CHAR
-                )
+                idents += sum(x == y for x, y in zip(a, b) if x != ALL_GENES_GAP_CHAR)
             sortl.append((idents / len(trg_pmhc_seq), l.pdbid))
         sortl.sort(reverse=True)
         max_idents = sortl[0][0]
@@ -1581,14 +1521,11 @@ def show_alignment(al, seq1, seq2):
             }
             trg_offset = len(trg_mhca_seq) + len(trg_mhcb_seq)
             tmp_offset = len(tmp_mhca_seq) + len(tmp_mhcb_seq)
-            al3 = {
-                i + trg_offset: i + tmp_offset for i in range(CLASS2_PEPLEN)
-            }
+            al3 = {i + trg_offset: i + tmp_offset for i in range(CLASS2_PEPLEN)}
             trg_to_tmp = {**al1, **al2, **al3}
             tmp_pmhc_seq = tmp_mhca_seq + tmp_mhcb_seq + templatel.pep_seq
             idents_redo = sum(
-                trg_pmhc_seq[i] == tmp_pmhc_seq[j]
-                for i, j in trg_to_tmp.items()
+                trg_pmhc_seq[i] == tmp_pmhc_seq[j] for i, j in trg_to_tmp.items()
             ) / len(trg_pmhc_seq)
             # print(f'oldnew mhc_idents: {idents:6.3f} {idents_redo:6.3f} {pdbid}')
             assert abs(idents - idents_redo) < 1e-4
@@ -1623,8 +1560,7 @@ def show_alignment(al, seq1, seq2):
         templates = tcr_info[tcr_info.ab == ab]
         templates = templates[~templates.pdbid.isin(exclude_pdbids)]
         template_tcrs = [
-            (x.organism[0] + x.v_gene, None, x.cdr3)
-            for x in templates.itertuples()
+            (x.organism[0] + x.v_gene, None, x.cdr3) for x in templates.itertuples()
         ]
         # templates['v_gene j_gene cdr3'.split()].itertuples(index=False))
         closest_tcrs = [
@@ -1646,10 +1582,7 @@ def show_alignment(al, seq1, seq2):
             if dist < min_single_chain_tcrdist:
                 # print('too close:', dist, trg_v, trg_j, trg_cdr3, template_tcrs[ind])
                 continue
-            if (
-                closest_dist > big_v_dist
-                and len(alignments) >= num_templates_per_run
-            ):
+            if closest_dist > big_v_dist and len(alignments) >= num_templates_per_run:
                 # print('too far:', closest_dist)
                 break
 
@@ -1674,13 +1607,10 @@ def show_alignment(al, seq1, seq2):
             #     templatel.pdbid, templatel.ab, msa_type)
 
             identities = sum(
-                trg_chainseq[i] == templatel.chainseq[j]
-                for i, j in trg_to_tmp.items()
+                trg_chainseq[i] == templatel.chainseq[j] for i, j in trg_to_tmp.items()
             ) / len(trg_chainseq)
 
-            identities_for_sorting = (
-                identities + ternary_bonus * templatel.ternary
-            )
+            identities_for_sorting = identities + ternary_bonus * templatel.ternary
 
             alignments.append(
                 (
@@ -1741,18 +1671,14 @@ def show_alignment(al, seq1, seq2):
                 peptide,
             )
             exit()
-        dgeoms = [
-            DockingGeometry().from_dict(x) for _, x in dgeom_info.iterrows()
-        ]
+        dgeoms = [DockingGeometry().from_dict(x) for _, x in dgeom_info.iterrows()]
         if num_runs * num_templates_per_run > len(dgeoms):
             rep_dgeom_indices = np.random.permutation(len(dgeoms))
             rep_dgeoms = [dgeoms[x] for x in rep_dgeom_indices]
         else:
             dummy_organism = "human"  # just used for avg cdr coords
-            rep_dgeoms, rep_dgeom_indices = (
-                docking_geometry.pick_docking_geometry_reps(
-                    dummy_organism, dgeoms, num_runs * num_templates_per_run
-                )
+            rep_dgeoms, rep_dgeom_indices = docking_geometry.pick_docking_geometry_reps(
+                dummy_organism, dgeoms, num_runs * num_templates_per_run
             )
     elif use_opt_dgeoms:
         rep_dgeoms = docking_geometry.load_opt_dgeoms(mhc_class)
@@ -1782,19 +1708,15 @@ def show_alignment(al, seq1, seq2):
             exclude_docking_geometry_peptides,
             min_dgeom_peptide_mismatches,
         )
-        dgeoms = [
-            DockingGeometry().from_dict(x) for _, x in dgeom_info.iterrows()
-        ]
+        dgeoms = [DockingGeometry().from_dict(x) for _, x in dgeom_info.iterrows()]
 
         if num_runs * num_templates_per_run > len(dgeoms):
             rep_dgeom_indices = np.random.permutation(len(dgeoms))
             rep_dgeoms = [dgeoms[x] for x in rep_dgeom_indices]
         else:
             dummy_organism = "human"  # just used for avg cdr coords
-            rep_dgeoms, rep_dgeom_indices = (
-                docking_geometry.pick_docking_geometry_reps(
-                    dummy_organism, dgeoms, num_runs * num_templates_per_run
-                )
+            rep_dgeoms, rep_dgeom_indices = docking_geometry.pick_docking_geometry_reps(
+                dummy_organism, dgeoms, num_runs * num_templates_per_run
             )
 
     # print('dgeoms:', len(dgeoms), len(rep_dgeoms))
@@ -1826,36 +1748,22 @@ def show_alignment(al, seq1, seq2):
                 dgeom_row = dgeom_info.iloc[rep_dgeom_indices[dgeom_repno]]
 
             pmhc_pdbid = pmhc_al[1]
-            pmhc_pose, pmhc_tdinfo = get_template_pose_and_tdinfo(
-                pmhc_pdbid, PMHC
-            )
+            pmhc_pose, pmhc_tdinfo = get_template_pose_and_tdinfo(pmhc_pdbid, PMHC)
 
             tcra_pdbid = tcra_al[1]
-            tcra_pose, tcra_tdinfo = get_template_pose_and_tdinfo(
-                tcra_pdbid, TCR
-            )
+            tcra_pose, tcra_tdinfo = get_template_pose_and_tdinfo(tcra_pdbid, TCR)
 
             tcrb_pdbid = tcrb_al[1]
-            tcrb_pose, tcrb_tdinfo = get_template_pose_and_tdinfo(
-                tcrb_pdbid, TCR
-            )
+            tcrb_pose, tcrb_tdinfo = get_template_pose_and_tdinfo(tcrb_pdbid, TCR)
 
             # assert ((mhc_class==1 and pmhc_pose.num_chains() == 4) or
             #         (mhc_class==2 and pmhc_pose.num_chains() == 5))
-            assert (
-                len(tcra_pose["chains"]) == 2 and len(tcrb_pose["chains"]) == 2
-            )
+            assert len(tcra_pose["chains"]) == 2 and len(tcrb_pose["chains"]) == 2
 
             # copy tcrb into tcra_pose by superimposing core coords
-            fix_coords = tcra_pose["ca_coords"][
-                tcra_tdinfo.tcr_core[core_len:]
-            ]
-            mov_coords = tcrb_pose["ca_coords"][
-                tcrb_tdinfo.tcr_core[core_len:]
-            ]
-            R, v = superimpose.superimposition_transform(
-                fix_coords, mov_coords
-            )
+            fix_coords = tcra_pose["ca_coords"][tcra_tdinfo.tcr_core[core_len:]]
+            mov_coords = tcrb_pose["ca_coords"][tcrb_tdinfo.tcr_core[core_len:]]
+            R, v = superimpose.superimposition_transform(fix_coords, mov_coords)
             tcrb_pose = apply_transform_Rx_plus_v(tcrb_pose, R, v)
             tcra_pose = delete_chains(tcra_pose, [1])
             tcra_pose = append_chains(tcra_pose, tcrb_pose, [1])
@@ -1885,9 +1793,7 @@ def show_alignment(al, seq1, seq2):
             # copy tcr from tcra_pose into pmhc_pose
             num_pmhc_chains = mhc_class + 1
             if len(pmhc_pose["chains"]) > num_pmhc_chains:
-                del_chains = list(
-                    range(num_pmhc_chains, len(pmhc_pose["chains"]))
-                )
+                del_chains = list(range(num_pmhc_chains, len(pmhc_pose["chains"])))
                 pmhc_pose = delete_chains(pmhc_pose, del_chains)
             pmhc_pose = append_chains(pmhc_pose, tcra_pose, [0, 1])
             assert len(pmhc_pose["chains"]) == 2 + num_pmhc_chains
@@ -1902,9 +1808,7 @@ def show_alignment(al, seq1, seq2):
             # should be the same as new_tcr_stub!
             redo_tcr_stub = tcr_util.get_tcr_stub(pmhc_pose, pmhc_tdinfo)
             v_dev = norm(redo_tcr_stub["origin"] - new_tcr_stub["origin"])
-            M_dev = norm(
-                new_tcr_stub["axes"] @ redo_tcr_stub["axes"].T - np.eye(3)
-            )
+            M_dev = norm(new_tcr_stub["axes"] @ redo_tcr_stub["axes"].T - np.eye(3))
             if max(v_dev, M_dev) > 5e-2:
                 print("devs:", v_dev, M_dev)
             assert v_dev < 5e-2
@@ -1998,9 +1902,7 @@ def show_alignment(al, seq1, seq2):
                 tcrb_v=tcr_info.loc[(tcrb_pdbid, "B"), "v_gene"],
                 tcrb_j=tcr_info.loc[(tcrb_pdbid, "B"), "j_gene"],
                 tcrb_cdr3=tcr_info.loc[(tcrb_pdbid, "B"), "cdr3"],
-                dgeom_pdbid=(
-                    f"opt{itmp}" if dgeom_row is None else dgeom_row.pdbid
-                ),
+                dgeom_pdbid=(f"opt{itmp}" if dgeom_row is None else dgeom_row.pdbid),
                 template_pdbfile=outpdbfile,
                 target_to_template_alignstring=alignstring,
                 identities=identities,
@@ -2114,9 +2016,7 @@ def setup_for_alphafold(
 
     tcr_db = tcr_db.copy()
     tcr_db["mhc_peptide"] = (
-        tcr_db.mhc.str.replace("*", "", regex=False).str.replace(
-            ":", "", regex=False
-        )
+        tcr_db.mhc.str.replace("*", "", regex=False).str.replace(":", "", regex=False)
         + "_"
         + tcr_db.peptide
     )
@@ -2135,9 +2035,7 @@ def setup_for_alphafold(
 
     # optionally filter to peptide mhc combos with min/max counts ##########
     if min_pmhc_count is not None or max_pmhc_count is not None:
-        tcr_db.sort_values(
-            "mhc_peptide", inplace=True
-        )  ## IMPORTANT B/C MASKING
+        tcr_db.sort_values("mhc_peptide", inplace=True)  ## IMPORTANT B/C MASKING
         random.seed(random_seed)  # shuffling of epitope tcrs
         assert min_pmhc_count is not None and max_pmhc_count is not None
         mhc_peptides = tcr_db.mhc_peptide.drop_duplicates().to_list()
@@ -2184,9 +2082,7 @@ def setup_for_alphafold(
 
     targets_dfl = []
     for index, targetl in tcr_db.iterrows():
-        targetid_prefix = (
-            f"T{index:05d}_{targetl.mhc_peptide}{targetid_prefix_suffix}"
-        )
+        targetid_prefix = f"T{index:05d}_{targetl.mhc_peptide}{targetid_prefix_suffix}"
         print("START", index, tcr_db.shape[0], targetid_prefix)
         outfile_prefix = f"{outdir}{targetid_prefix}"
         if exclude_self_peptide_docking_geometries:
@@ -2253,9 +2149,7 @@ def setup_for_alphafold(
     print("made:", outfile)
 
 
-def get_mhc_chain_trim_positions(
-    chainseq, organism, mhc_class, mhc_allele, chain=None
-):
+def get_mhc_chain_trim_positions(chainseq, organism, mhc_class, mhc_allele, chain=None):
     """ """
     assert mhc_class in [1, 2]
     if mhc_class == 2:
diff --git a/tcrdock/tcrdock_info.py b/tcrdock/tcrdock_info.py
index ee4c1c3..de9b9ba 100644
--- a/tcrdock/tcrdock_info.py
+++ b/tcrdock/tcrdock_info.py
@@ -73,9 +73,7 @@ def from_sequences(
 
         else:
             assert mhc_class == 2
-            self.mhc_core = mhc_util.get_mhc_core_positions_class2(
-                mhc_aseq, mhc_bseq
-            )
+            self.mhc_core = mhc_util.get_mhc_core_positions_class2(mhc_aseq, mhc_bseq)
             self.mhc_allele = (
                 mhc_util.get_mhc_allele(mhc_aseq, organism)
                 + ","
@@ -115,9 +113,7 @@ def from_sequences(
             self.tcr_cdrs.extend(
                 [(int(x[0] + offset), int(x[1] + offset)) for x in cdr_loops]
             )
-            self.tcr_core.extend(
-                [int(x + offset) for x in res["core_positions"]]
-            )
+            self.tcr_core.extend([int(x + offset) for x in res["core_positions"]])
 
             offset += len(chainseq)
         self.valid = True  # signal success
@@ -129,17 +125,13 @@ def renumber(self, old2new):
 
         will be bad if old2new doesn't cover all our positions
         """
-        old2new = {
-            int(x): int(y) for x, y in old2new.items()
-        }  # no int64 in our data!
+        old2new = {int(x): int(y) for x, y in old2new.items()}  # no int64 in our data!
         if self.mhc_core is not None:
             self.mhc_core = [old2new[x] for x in self.mhc_core]
         if self.tcr_core is not None:
             self.tcr_core = [old2new[x] for x in self.tcr_core]
         if self.tcr_cdrs is not None:
-            self.tcr_cdrs = [
-                (old2new[x], old2new[y]) for x, y in self.tcr_cdrs
-            ]
+            self.tcr_cdrs = [(old2new[x], old2new[y]) for x, y in self.tcr_cdrs]
         return self
 
     def delete_residue_range(self, start, stop):

From c5c68cd6c0c48bfa1c2378d5087774e8f3ce996b Mon Sep 17 00:00:00 2001
From: Lawson <lwoods@tgen.org>
Date: Mon, 18 Aug 2025 17:11:04 -0700
Subject: [PATCH 19/19] add support for anarci-identified CDRs

---
 .gitignore                 |   3 +-
 tcrdock/tcrdist/parsing.py | 142 ++++++++++++++++++++++++++-----------
 tcrdock/tcrdock_info.py    |   9 ++-
 3 files changed, 110 insertions(+), 44 deletions(-)

diff --git a/.gitignore b/.gitignore
index a99af23..b9398eb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,4 +18,5 @@ ncbi-blast-*
 **/*.ipynb
 .vscode
 .nextflow
-build
\ No newline at end of file
+build
+tmp
\ No newline at end of file
diff --git a/tcrdock/tcrdist/parsing.py b/tcrdock/tcrdist/parsing.py
index 7fed30a..7699173 100644
--- a/tcrdock/tcrdist/parsing.py
+++ b/tcrdock/tcrdist/parsing.py
@@ -12,6 +12,7 @@
 from .genetic_code import genetic_code, reverse_genetic_code
 from . import logo_tools
 from ..blast import blast_sequence_and_read_hits, setup_query_to_hit_map
+import numpy as np
 
 
 def get_blast_db_path(organism, ab, vj):
@@ -47,9 +48,7 @@ def get_blast_db_path(organism, ab, vj):
 for organism in extra_alignment_columns_1x:
     for ab, xcols in extra_alignment_columns_1x[organism].items():
         positions = [
-            x
-            - 1
-            + sum(y <= x for y in xcols)  # this is not quite right but it wrks
+            x - 1 + sum(y <= x for y in xcols)  # this is not quite right but it wrks
             for x in core_positions_generic_1x
         ]
 
@@ -141,9 +140,7 @@ def parse_other_cdrs(organism, ab, qseq, v_gene, q2v_align):
         stop = alseq2seq[stop_col_1x - 1]
 
         ## what aligns to this region in v_hit
-        q_loop_positions = [
-            i for i, j in q2v_align.items() if start <= j <= stop
-        ]
+        q_loop_positions = [i for i, j in q2v_align.items() if start <= j <= stop]
         if q_loop_positions:
             qstart = min(q_loop_positions)
             qstop = max(q_loop_positions)
@@ -190,18 +187,73 @@ def parse_cdr3(organism, ab, qseq, v_gene, j_gene, q2v_align, q2j_align):
     return [query_cpos, query_fpos], v_mismatches, j_mismatches
 
 
+def anarci_parse_cdrs(organism, ab, qseq):
+    try:
+        import anarci
+    except ImportError:
+        raise ModuleNotFoundError(
+            "anarci not installed, skipping CDR parsing. Try `conda install -c bioconda anarci`"
+        )
+
+    if ab == "A":
+        allow = set(["A", "D"])
+    elif ab == "B":
+        allow = set(["B"])
+
+    results = anarci.run_anarci(
+        [("AAA", qseq)],
+        scheme="imgt",
+        allowed_species=["human"],
+        allow=allow,
+    )
+    if results[1][0] is None:
+        raise ValueError(f"No domain found for sequence '{qseq}'")
+
+    numbering = results[1][0][0]
+    sub_start = numbering[1]
+    sub_stop = numbering[2] + 1
+
+    # this entire substring will be numbered
+    # however, there may be gaps in the sequence
+    # which are given an IMGT number
+    numbered_substring = qseq[sub_start:sub_stop]
+    imgt_num = np.zeros((len(range(sub_start, sub_stop)),), dtype=np.int32)
+    imgt_tuples = numbering[0]
+    j = 0
+    for i in range(len(numbered_substring)):
+        aa = numbered_substring[i]
+        while j < len(imgt_tuples) and imgt_tuples[j][1] != aa:
+            j += 1
+        if j < len(imgt_tuples):
+            imgt_num[i] = imgt_tuples[j][0][0]
+            j += 1
+
+    # return list of (start, stop) tuples for CDRs
+    cdr_tuples = []
+    cdr_1 = np.nonzero((imgt_num >= 27) & (imgt_num <= 38))[0] + sub_start
+    cdr_tuples.append([cdr_1[0], cdr_1[-1]])
+
+    cdr_2 = np.nonzero((imgt_num >= 50) & (imgt_num <= 65))[0] + sub_start
+    cdr_tuples.append([cdr_2[0], cdr_2[-1]])
+
+    cdr_2_5 = np.nonzero((imgt_num >= 81) & (imgt_num <= 86))[0] + sub_start
+    cdr_tuples.append([cdr_2_5[0], cdr_2_5[-1]])
+
+    cdr_3 = np.nonzero((imgt_num >= 104) & (imgt_num <= 118))[0] + sub_start
+    cdr_tuples.append([cdr_3[0], cdr_3[-1]])
+    return cdr_tuples
+
+
 def get_top_blast_hit_with_allele_sorting(hits):
     """In the case of ties, prefer lower allele numbers"""
     top_bitscore = max(hits.bitscore)
     top_hits = hits[hits.bitscore == top_bitscore].copy()
     # print('top_bitscore:', top_bitscore, 'ties:', top_hits.shape[0])
-    top_hits["allele"] = (
-        top_hits.saccver.str.split("*").str.get(-1).astype(int)
-    )
+    top_hits["allele"] = top_hits.saccver.str.split("*").str.get(-1).astype(int)
     return top_hits.sort_values("allele").iloc[0].copy()
 
 
-def parse_tcr_sequence(organism, chain, sequence):
+def parse_tcr_sequence(organism, chain, sequence, anarci_cdrs=False):
     """return dict with keys
     * cdr_loops
     * core_positions
@@ -221,6 +273,8 @@ def parse_tcr_sequence(organism, chain, sequence):
     """
     assert chain in "AB"
 
+    result = {"mismatches": {}}
+
     tmpdbfile = get_blast_db_path(organism, chain, "V")
     if not exists(tmpdbfile):
         make_blast_dbs()  # only need to do this once
@@ -243,23 +297,38 @@ def parse_tcr_sequence(organism, chain, sequence):
         v_gene = v_hit.saccver
         j_gene = j_hit.saccver
 
-        cdr3_bounds, v_mismatches, j_mismatches = parse_cdr3(
-            organism,
-            chain,
-            sequence,
-            v_gene,
-            j_gene,
-            q2v_align,
-            q2j_align,
-        )
+        if anarci_cdrs:
+            cdr_loops = anarci_parse_cdrs(organism, chain, sequence)
+            result["cdr_loops"] = cdr_loops
 
-        other_cdr_bounds, other_cdr_mismatches = parse_other_cdrs(
-            organism,
-            chain,
-            sequence,
-            v_gene,
-            q2v_align,
-        )
+        else:
+            cdr3_bounds, v_mismatches, j_mismatches = parse_cdr3(
+                organism,
+                chain,
+                sequence,
+                v_gene,
+                j_gene,
+                q2v_align,
+                q2j_align,
+            )
+
+            other_cdr_bounds, other_cdr_mismatches = parse_other_cdrs(
+                organism,
+                chain,
+                sequence,
+                v_gene,
+                q2v_align,
+            )
+
+            cdr_loops = other_cdr_bounds + [cdr3_bounds]
+
+            if any(None in bounds for bounds in cdr_loops):
+                return {}  # signal failure ### EARLY RETURN!!!
+
+            result["cdr_loops"] = cdr_loops
+            result["mismatches"]["other_cdr_mismatches"] = other_cdr_mismatches
+            result["mismatches"]["v_mismatches"] = v_mismatches
+            result["mismatches"]["j_mismatches"] = j_mismatches
 
         core_positions, core_mismatches = parse_core_positions(
             organism, chain, sequence, v_gene, q2v_align
@@ -269,22 +338,11 @@ def parse_tcr_sequence(organism, chain, sequence):
             # fail
             return {}
 
-        cdr_loops = other_cdr_bounds + [cdr3_bounds]
-        if any(None in bounds for bounds in cdr_loops):
-            return {}  # signal failure ### EARLY RETURN!!!
-
-        result = {
-            "cdr_loops": cdr_loops,
-            "core_positions": core_positions,
-            "v_gene": v_gene,
-            "j_gene": j_gene,
-            "mismatches": {
-                "v_mismatches": v_mismatches,
-                "j_mismatches": j_mismatches,
-                "other_cdr_mismatches": other_cdr_mismatches,
-                "core_mismatches": core_mismatches,
-            },
-        }
+        result["core_positions"] = core_positions
+        result["v_gene"] = v_gene
+        result["j_gene"] = j_gene
+        result["mismatches"]["core_mismatches"] = core_mismatches
+
         return result
     else:
         print("failed to find v and j matches")
diff --git a/tcrdock/tcrdock_info.py b/tcrdock/tcrdock_info.py
index de9b9ba..b890a11 100644
--- a/tcrdock/tcrdock_info.py
+++ b/tcrdock/tcrdock_info.py
@@ -51,6 +51,7 @@ def from_sequences(
         pep_seq,  # '' if tcr_only
         tcr_aseq,
         tcr_bseq,
+        anarci_cdrs=False,
     ):
         self.valid = False
         self.organism = organism
@@ -93,7 +94,9 @@ def from_sequences(
         self.tcr_core = []
         self.tcr = []
         for chain, chainseq in zip("AB", [tcr_aseq, tcr_bseq]):
-            res = tcrdist.parsing.parse_tcr_sequence(organism, chain, chainseq)
+            res = tcrdist.parsing.parse_tcr_sequence(
+                organism, chain, chainseq, anarci_cdrs=anarci_cdrs
+            )
             if not res:
                 other_organism = "human" if organism == "mouse" else "mouse"
                 res = tcrdist.parsing.parse_tcr_sequence(
@@ -103,6 +106,7 @@ def from_sequences(
                     # parse failure
                     print("TCRdockInfo:: TCR parse fail!")
                     return self  ########## early return
+
             cdr_loops = res["cdr_loops"]
             cdr3_start, cdr3_end = cdr_loops[-1]
             cdr3 = chainseq[cdr3_start : cdr3_end + 1]
@@ -113,6 +117,9 @@ def from_sequences(
             self.tcr_cdrs.extend(
                 [(int(x[0] + offset), int(x[1] + offset)) for x in cdr_loops]
             )
+
+            self.tcr.append((res["v_gene"], res["j_gene"], None))
+
             self.tcr_core.extend([int(x + offset) for x in res["core_positions"]])
 
             offset += len(chainseq)