From f23462ac9a98d6cfff7afdc378d018af8e9011fd Mon Sep 17 00:00:00 2001 From: Lawson Date: Thu, 12 Jun 2025 17:07:18 -0700 Subject: [PATCH 01/19] rety mechanism --- tcrdock/tcrdock_info.py | 160 ++++++++++++++++++++++------------------ 1 file changed, 87 insertions(+), 73 deletions(-) diff --git a/tcrdock/tcrdock_info.py b/tcrdock/tcrdock_info.py index 5f6caeb..ee4c1c3 100644 --- a/tcrdock/tcrdock_info.py +++ b/tcrdock/tcrdock_info.py @@ -4,8 +4,9 @@ from . import tcrdist import json -class TCRdockInfo(): - ''' A class to store information about a TCR:peptide:MHC pose + +class TCRdockInfo: + """A class to store information about a TCR:peptide:MHC pose CONTENTS: @@ -22,7 +23,7 @@ class TCRdockInfo(): note that the CDR3 sequence and loop position is "extended" ie starts at C and ends at "F" - ''' + """ def __init__(self): self.organism = None @@ -42,29 +43,29 @@ def __repr__(self): return self.to_string() def from_sequences( - self, - organism, - mhc_class, # pass None if tcr_only - mhc_aseq, # '' if tcr_only - mhc_bseq, # ignored if mhc_class==1 - pep_seq, # '' if tcr_only - tcr_aseq, - tcr_bseq, + self, + organism, + mhc_class, # pass None if tcr_only + mhc_aseq, # '' if tcr_only + mhc_bseq, # ignored if mhc_class==1 + pep_seq, # '' if tcr_only + tcr_aseq, + tcr_bseq, ): self.valid = False self.organism = organism self.mhc_class = int(mhc_class) self.pep_seq = pep_seq - #self.mhc_aseq = mhc_aseq - #self.mhc_bseq = mhc_bseq - #self.tcr_aseq = tcr_aseq - #self.tcr_bseq = tcr_bseq + # self.mhc_aseq = mhc_aseq + # self.mhc_bseq = mhc_bseq + # self.tcr_aseq = tcr_aseq + # self.tcr_bseq = tcr_bseq # parse the mhc sequence - if mhc_class is None: # tcr only - assert mhc_aseq == '' and mhc_bseq == '' and pep_seq == '' + if mhc_class is None: # tcr only + assert mhc_aseq == "" and mhc_bseq == "" and pep_seq == "" self.mhc_core = [] - self.mhc_allele = '' + self.mhc_allele = "" elif mhc_class == 1: self.mhc_core = mhc_util.get_mhc_core_positions_class1(mhc_aseq) @@ -73,106 +74,119 @@ def from_sequences( else: assert mhc_class == 2 self.mhc_core = mhc_util.get_mhc_core_positions_class2( - mhc_aseq, mhc_bseq) - self.mhc_allele = (mhc_util.get_mhc_allele(mhc_aseq, organism)+','+ - mhc_util.get_mhc_allele(mhc_bseq, organism)) + mhc_aseq, mhc_bseq + ) + self.mhc_allele = ( + mhc_util.get_mhc_allele(mhc_aseq, organism) + + "," + + mhc_util.get_mhc_allele(mhc_bseq, organism) + ) if -1 in self.mhc_core: - print('TCRdockInfo:: MHC parse fail!') - return self ########## early return + print("TCRdockInfo:: MHC parse fail!") + return self ########## early return # this shifts everything to full pose numbering (0-indexed) offset = len(mhc_aseq) + len(pep_seq) - if mhc_class==2: + if mhc_class == 2: offset += len(mhc_bseq) self.tcr_cdrs = [] self.tcr_core = [] self.tcr = [] - for chain, chainseq in zip('AB', [tcr_aseq, tcr_bseq]): - res = tcrdist.parsing.parse_tcr_sequence( - organism, chain, chainseq) + for chain, chainseq in zip("AB", [tcr_aseq, tcr_bseq]): + res = tcrdist.parsing.parse_tcr_sequence(organism, chain, chainseq) if not res: - # parse failure - print('TCRdockInfo:: TCR parse fail!') - return self ########## early return - cdr_loops = res['cdr_loops'] + other_organism = "human" if organism == "mouse" else "mouse" + res = tcrdist.parsing.parse_tcr_sequence( + other_organism, chain, chainseq + ) + if not res: + # parse failure + print("TCRdockInfo:: TCR parse fail!") + return self ########## early return + cdr_loops = res["cdr_loops"] cdr3_start, cdr3_end = cdr_loops[-1] - cdr3 = chainseq[cdr3_start:cdr3_end+1] - self.tcr.append((res['v_gene'], res['j_gene'], cdr3)) + cdr3 = chainseq[cdr3_start : cdr3_end + 1] + self.tcr.append((res["v_gene"], res["j_gene"], cdr3)) # the ints here are converting from np.int64 to int # ran into trouble serializing: # "Object of type 'int64' is not JSON serializable" - self.tcr_cdrs.extend([(int(x[0]+offset), int(x[1]+offset)) - for x in cdr_loops]) - self.tcr_core.extend([int(x+offset) for x in res['core_positions']]) + self.tcr_cdrs.extend( + [(int(x[0] + offset), int(x[1] + offset)) for x in cdr_loops] + ) + self.tcr_core.extend( + [int(x + offset) for x in res["core_positions"]] + ) offset += len(chainseq) - self.valid = True # signal success + self.valid = True # signal success return self def renumber(self, old2new): - ''' old2new is a mapping of 0-indexed positions from old to new + """old2new is a mapping of 0-indexed positions from old to new numbering systems will be bad if old2new doesn't cover all our positions - ''' - old2new = {int(x):int(y) for x,y in old2new.items()} # no int64 in our data! + """ + old2new = { + int(x): int(y) for x, y in old2new.items() + } # no int64 in our data! if self.mhc_core is not None: self.mhc_core = [old2new[x] for x in self.mhc_core] if self.tcr_core is not None: self.tcr_core = [old2new[x] for x in self.tcr_core] if self.tcr_cdrs is not None: - self.tcr_cdrs = [(old2new[x], old2new[y]) for x,y in self.tcr_cdrs] + self.tcr_cdrs = [ + (old2new[x], old2new[y]) for x, y in self.tcr_cdrs + ] return self def delete_residue_range(self, start, stop): - ''' delete from start to stop , EXCLUSIVE of stop !!!!!!!!!!!!!!!!!!!!!! + """delete from start to stop , EXCLUSIVE of stop !!!!!!!!!!!!!!!!!!!!!! ie delete [start,stop) or range(start,stop) puts -1 for residues in [start,stop) range, if there are any - ''' - - maxpos = self.tcr_cdrs[-1][-1] # end of CDR3beta is biggest - numdel = stop-start - old2new = {x:x if x Date: Thu, 12 Jun 2025 17:40:38 -0700 Subject: [PATCH 02/19] make package installable --- .gitignore | 11 ++++++++++- setup.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 setup.py diff --git a/.gitignore b/.gitignore index 184cf46..a99af23 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,9 @@ *pyc __pycache__ *.fasta.pdb +*.fasta.blast +*.fasta +*.tsv *.phr *.pin *.pot @@ -9,4 +12,10 @@ __pycache__ *.ptf *.pto imgt_prot_blast_db_*fasta -ncbi-blast-* \ No newline at end of file +ncbi-blast-* +*.egg-info +**/*.log +**/*.ipynb +.vscode +.nextflow +build \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..76a9084 --- /dev/null +++ b/setup.py @@ -0,0 +1,30 @@ +# setup.py +import subprocess +import sys +from setuptools import setup, find_packages +from setuptools.command.install import install + + +class CustomInstall(install): + """Run our database installer, then fall back to the normal install.""" + + def run(self): + subprocess.check_call([sys.executable, "download_blast.py"]) + super().run() + + +setup( + name="tcrdock", + version="0.1.0", + packages=find_packages(), + install_requires=[ + "biopython==1.79", + "numpy==1.19.5", + "pandas==1.3.4", + "scipy==1.7.0", + "matplotlib==3.3.4", + ], + cmdclass={ + "install": CustomInstall, + }, +) From 8aa337ce827699fa72e85e563e20640dee392a20 Mon Sep 17 00:00:00 2001 From: Lawson Date: Thu, 12 Jun 2025 17:41:37 -0700 Subject: [PATCH 03/19] update README --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index b56defb..0c56e54 100644 --- a/README.md +++ b/README.md @@ -143,8 +143,7 @@ to be installed, which can be done by running the script ``` conda create --name tcrdock_test python=3.8 source activate tcrdock_test # or: conda activate tcrdock_test -pip3 install -r requirements.txt -python download_blast.py +pip install ``` To run the AlphaFold simulations, you will need a Python environment that satisfies From 586e327a35febddd79bf22aee98ec06ee843c6f9 Mon Sep 17 00:00:00 2001 From: Lawson Date: Fri, 13 Jun 2025 10:58:13 -0700 Subject: [PATCH 04/19] dependency relaxation --- .github/workflows/gh-ci.yml | 72 +++++++++++++++++++++++++++++++++++++ setup.py | 10 +++--- 2 files changed, 77 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/gh-ci.yml diff --git a/.github/workflows/gh-ci.yml b/.github/workflows/gh-ci.yml new file mode 100644 index 0000000..8adfdbd --- /dev/null +++ b/.github/workflows/gh-ci.yml @@ -0,0 +1,72 @@ +name: GH Actions CI +'on': + push: + branches: + - main + pull_request: + branches: + - main + - develop + schedule: + # Weekly tests at midnight on Sundays run on main by default: + # Scheduled workflows run on the latest commit on the default or base branch. + # (from https://help.github.com/en/actions/reference/events-that-trigger-workflows#scheduled-events-schedule) + - cron: 0 0 * * 0 + +concurrency: + # Specific group naming so CI is only cancelled + # within same PR or on merge to main + group: ${{ github.ref }}-${{ github.head_ref }}-${{ github.workflow }} + cancel-in-progress: true + +defaults: + run: + shell: bash -l {0} + + + main-tests: + if: github.repository == 'ljwoods2/TCRdock' + needs: environment-config + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: + - macOS-latest + - ubuntu-latest + python-version: ["3.10", "3.11", "3.12", "3.13"] + + steps: + - uses: actions/checkout@v4 + + - name: Build information + run: | + uname -a + df -h + ulimit -a + + # More info on options: https://github.com/conda-incubator/setup-miniconda + - name: Create & activate conda env + uses: conda-incubator/setup-miniconda@v2 + with: + python-version: ${{ matrix.python-version }} + add-pip-as-python-dependency: true + environment-name: tcrdock-test + auto-activate-base: false + + - name: Install package + run: | + python --version + python -m pip install . + + - name: Python information + run: | + which python + which pip + pip list + conda info + conda list + + - name: Run tests + run: | + pytest -v -x --color=yes tcrdock/tests \ No newline at end of file diff --git a/setup.py b/setup.py index 76a9084..47ba6a6 100644 --- a/setup.py +++ b/setup.py @@ -18,11 +18,11 @@ def run(self): version="0.1.0", packages=find_packages(), install_requires=[ - "biopython==1.79", - "numpy==1.19.5", - "pandas==1.3.4", - "scipy==1.7.0", - "matplotlib==3.3.4", + "biopython", + "numpy", + "pandas", + "scipy", + "matplotlib", ], cmdclass={ "install": CustomInstall, From 3e4fa5df2367a83ffa3cbb721e7aa1bf7bfa052d Mon Sep 17 00:00:00 2001 From: Lawson Date: Fri, 13 Jun 2025 11:08:53 -0700 Subject: [PATCH 05/19] add __init__.py to dat dirs --- =3.10 | 30 +++++++++++++++++++++++++++++ tcrdock/db/__init__.py | 0 tcrdock/db/new_imgt_hla/__init__.py | 0 tcrdock/db/pdb/__init__.py | 0 tcrdock/db/pdb/pmhc/__init__.py | 0 tcrdock/db/pdb/tcr/__init__.py | 0 tcrdock/db/pdb/ternary/__init__.py | 0 7 files changed, 30 insertions(+) create mode 100644 =3.10 create mode 100644 tcrdock/db/__init__.py create mode 100644 tcrdock/db/new_imgt_hla/__init__.py create mode 100644 tcrdock/db/pdb/__init__.py create mode 100644 tcrdock/db/pdb/pmhc/__init__.py create mode 100644 tcrdock/db/pdb/tcr/__init__.py create mode 100644 tcrdock/db/pdb/ternary/__init__.py diff --git a/=3.10 b/=3.10 new file mode 100644 index 0000000..f4f691d --- /dev/null +++ b/=3.10 @@ -0,0 +1,30 @@ +Channels: + - conda-forge + - defaults +Platform: linux-64 +Collecting package metadata (repodata.json): ...working... done +Solving environment: ...working... done + +## Package Plan ## + + environment location: /home/lwoods/miniconda3/envs/tmp + + added / updated specs: + - python + + +The following packages will be downloaded: + + package | build + ---------------------------|----------------- + python-3.12.11 |h9e4cc4f_0_cpython 30.0 MB conda-forge + ------------------------------------------------------------ + Total: 30.0 MB + +The following packages will be UPDATED: + + python 3.8.20-h4a871b0_2_cpython --> 3.12.11-h9e4cc4f_0_cpython + python_abi 3.8-7_cp38 --> 3.12-7_cp312 + + +Proceed ([y]/n)? \ No newline at end of file diff --git a/tcrdock/db/__init__.py b/tcrdock/db/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tcrdock/db/new_imgt_hla/__init__.py b/tcrdock/db/new_imgt_hla/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tcrdock/db/pdb/__init__.py b/tcrdock/db/pdb/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tcrdock/db/pdb/pmhc/__init__.py b/tcrdock/db/pdb/pmhc/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tcrdock/db/pdb/tcr/__init__.py b/tcrdock/db/pdb/tcr/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tcrdock/db/pdb/ternary/__init__.py b/tcrdock/db/pdb/ternary/__init__.py new file mode 100644 index 0000000..e69de29 From 1847e01cbbc68262e9f643a8c77e23a90450f4a0 Mon Sep 17 00:00:00 2001 From: Lawson Date: Fri, 13 Jun 2025 11:40:10 -0700 Subject: [PATCH 06/19] remove __init__, add package_data --- =3.10 | 30 ----------------------------- setup.py | 14 ++++++++++++++ tcrdock/db/__init__.py | 0 tcrdock/db/new_imgt_hla/__init__.py | 0 tcrdock/db/pdb/__init__.py | 0 tcrdock/db/pdb/pmhc/__init__.py | 0 tcrdock/db/pdb/tcr/__init__.py | 0 7 files changed, 14 insertions(+), 30 deletions(-) delete mode 100644 =3.10 delete mode 100644 tcrdock/db/__init__.py delete mode 100644 tcrdock/db/new_imgt_hla/__init__.py delete mode 100644 tcrdock/db/pdb/__init__.py delete mode 100644 tcrdock/db/pdb/pmhc/__init__.py delete mode 100644 tcrdock/db/pdb/tcr/__init__.py diff --git a/=3.10 b/=3.10 deleted file mode 100644 index f4f691d..0000000 --- a/=3.10 +++ /dev/null @@ -1,30 +0,0 @@ -Channels: - - conda-forge - - defaults -Platform: linux-64 -Collecting package metadata (repodata.json): ...working... done -Solving environment: ...working... done - -## Package Plan ## - - environment location: /home/lwoods/miniconda3/envs/tmp - - added / updated specs: - - python - - -The following packages will be downloaded: - - package | build - ---------------------------|----------------- - python-3.12.11 |h9e4cc4f_0_cpython 30.0 MB conda-forge - ------------------------------------------------------------ - Total: 30.0 MB - -The following packages will be UPDATED: - - python 3.8.20-h4a871b0_2_cpython --> 3.12.11-h9e4cc4f_0_cpython - python_abi 3.8-7_cp38 --> 3.12-7_cp312 - - -Proceed ([y]/n)? \ No newline at end of file diff --git a/setup.py b/setup.py index 47ba6a6..102c334 100644 --- a/setup.py +++ b/setup.py @@ -27,4 +27,18 @@ def run(self): cmdclass={ "install": CustomInstall, }, + package_data={ + "": [ + "**/*.alfas", + "**/*.fasta", + "**/*.txt", + "**/*.tsv", + "**/tcr/*.pdb", + "**/tcr/*.json", + "**/ternary/*.pdb", + "**/ternary/*.json", + "**/pmhc/*.pdb", + "**/pmhc/*.json", + ] + }, ) diff --git a/tcrdock/db/__init__.py b/tcrdock/db/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tcrdock/db/new_imgt_hla/__init__.py b/tcrdock/db/new_imgt_hla/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tcrdock/db/pdb/__init__.py b/tcrdock/db/pdb/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tcrdock/db/pdb/pmhc/__init__.py b/tcrdock/db/pdb/pmhc/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tcrdock/db/pdb/tcr/__init__.py b/tcrdock/db/pdb/tcr/__init__.py deleted file mode 100644 index e69de29..0000000 From 683aa6e58f4d1d01ab717a86c7756c54d96e948c Mon Sep 17 00:00:00 2001 From: Lawson Date: Fri, 13 Jun 2025 11:48:03 -0700 Subject: [PATCH 07/19] add back __init__.py --- .github/workflows/gh-ci.yml | 2 +- setup.py | 4 ++-- tcrdock/db/__init__.py | 0 tcrdock/db/new_imgt_hla/__init__.py | 0 tcrdock/db/pdb/__init__.py | 0 tcrdock/db/pdb/pmhc/__init__.py | 0 tcrdock/db/pdb/tcr/__init__.py | 0 7 files changed, 3 insertions(+), 3 deletions(-) create mode 100644 tcrdock/db/__init__.py create mode 100644 tcrdock/db/new_imgt_hla/__init__.py create mode 100644 tcrdock/db/pdb/__init__.py create mode 100644 tcrdock/db/pdb/pmhc/__init__.py create mode 100644 tcrdock/db/pdb/tcr/__init__.py diff --git a/.github/workflows/gh-ci.yml b/.github/workflows/gh-ci.yml index 8adfdbd..859afc6 100644 --- a/.github/workflows/gh-ci.yml +++ b/.github/workflows/gh-ci.yml @@ -23,7 +23,7 @@ defaults: run: shell: bash -l {0} - +jobs: main-tests: if: github.repository == 'ljwoods2/TCRdock' needs: environment-config diff --git a/setup.py b/setup.py index 102c334..75a5d3e 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ def run(self): setup( name="tcrdock", version="0.1.0", - packages=find_packages(), + packages=["tcrdock"], install_requires=[ "biopython", "numpy", @@ -28,7 +28,7 @@ def run(self): "install": CustomInstall, }, package_data={ - "": [ + "tcrdock": [ "**/*.alfas", "**/*.fasta", "**/*.txt", diff --git a/tcrdock/db/__init__.py b/tcrdock/db/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tcrdock/db/new_imgt_hla/__init__.py b/tcrdock/db/new_imgt_hla/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tcrdock/db/pdb/__init__.py b/tcrdock/db/pdb/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tcrdock/db/pdb/pmhc/__init__.py b/tcrdock/db/pdb/pmhc/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tcrdock/db/pdb/tcr/__init__.py b/tcrdock/db/pdb/tcr/__init__.py new file mode 100644 index 0000000..e69de29 From da4e6d87f208049f3d61e3902b4a30f2d95b5e93 Mon Sep 17 00:00:00 2001 From: Lawson Date: Fri, 13 Jun 2025 11:49:05 -0700 Subject: [PATCH 08/19] remove dep in workflow --- .github/workflows/gh-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gh-ci.yml b/.github/workflows/gh-ci.yml index 859afc6..689c530 100644 --- a/.github/workflows/gh-ci.yml +++ b/.github/workflows/gh-ci.yml @@ -26,7 +26,7 @@ defaults: jobs: main-tests: if: github.repository == 'ljwoods2/TCRdock' - needs: environment-config + # needs: environment-config runs-on: ${{ matrix.os }} strategy: fail-fast: false From 84137615501fa5208c3adc9bd9fc5341b57a3bcd Mon Sep 17 00:00:00 2001 From: Lawson Date: Fri, 13 Jun 2025 11:52:23 -0700 Subject: [PATCH 09/19] add pyproject.toml --- .github/workflows/gh-ci.yml | 1 + pyproject.toml | 3 +++ 2 files changed, 4 insertions(+) create mode 100644 pyproject.toml diff --git a/.github/workflows/gh-ci.yml b/.github/workflows/gh-ci.yml index 689c530..f684388 100644 --- a/.github/workflows/gh-ci.yml +++ b/.github/workflows/gh-ci.yml @@ -53,6 +53,7 @@ jobs: add-pip-as-python-dependency: true environment-name: tcrdock-test auto-activate-base: false + miniconda-version: "latest" - name: Install package run: | diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..4e715e0 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools >= 42.0.0"] +build-backend = "setuptools.build_meta" \ No newline at end of file From bdee8439e6ead4097a3d1bf5f95b2f57b81880da Mon Sep 17 00:00:00 2001 From: Lawson Date: Fri, 13 Jun 2025 12:02:46 -0700 Subject: [PATCH 10/19] basic test setup --- .github/workflows/gh-ci.yml | 5 ++++- devtools/conda-envs/test_env.yaml | 10 ++++++++++ tcrdock/tests/__init__.py | 0 tcrdock/tests/test_tcrdock.py | 5 +++++ 4 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 devtools/conda-envs/test_env.yaml create mode 100644 tcrdock/tests/__init__.py create mode 100644 tcrdock/tests/test_tcrdock.py diff --git a/.github/workflows/gh-ci.yml b/.github/workflows/gh-ci.yml index f684388..ce9c18a 100644 --- a/.github/workflows/gh-ci.yml +++ b/.github/workflows/gh-ci.yml @@ -51,9 +51,12 @@ jobs: with: python-version: ${{ matrix.python-version }} add-pip-as-python-dependency: true - environment-name: tcrdock-test auto-activate-base: false miniconda-version: "latest" + environment-file: devtools/conda-envs/test_env.yaml + activate-environment: tcrdock-test + auto-update-conda: true + show-channel-urls: true - name: Install package run: | diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml new file mode 100644 index 0000000..2402f01 --- /dev/null +++ b/devtools/conda-envs/test_env.yaml @@ -0,0 +1,10 @@ +name: tcrdock-test +channels: + - defaults + - conda-forge +dependencies: + ### Base depends ### + - python>=3.10.0 + - pip + + - pytest \ No newline at end of file diff --git a/tcrdock/tests/__init__.py b/tcrdock/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tcrdock/tests/test_tcrdock.py b/tcrdock/tests/test_tcrdock.py new file mode 100644 index 0000000..a490498 --- /dev/null +++ b/tcrdock/tests/test_tcrdock.py @@ -0,0 +1,5 @@ +import pytest + + +def test_import(): + import tcrdock From f4ff42eb677e365b49c0cc37708a9e806740d737 Mon Sep 17 00:00:00 2001 From: Lawson Date: Fri, 13 Jun 2025 12:07:04 -0700 Subject: [PATCH 11/19] remove old substitution matrix code --- tcrdock/sequtil.py | 2003 ++++++++++++++++++++++++++------------------ 1 file changed, 1204 insertions(+), 799 deletions(-) diff --git a/tcrdock/sequtil.py b/tcrdock/sequtil.py index 04169b4..122cc63 100644 --- a/tcrdock/sequtil.py +++ b/tcrdock/sequtil.py @@ -6,7 +6,7 @@ from .tcrdist.all_genes import all_genes from .tcrdist.amino_acids import amino_acids from Bio import pairwise2 -from Bio.SubsMat import MatrixInfo as matlist +from Bio.Align import substitution_matrices from Bio.pairwise2 import format_alignment from .util import path_to_db from . import docking_geometry @@ -22,49 +22,57 @@ import copy from numpy.linalg import norm -CLASS2_PEPLEN = 1+9+1 +CLASS2_PEPLEN = 1 + 9 + 1 TCR_CORE_LEN = 13 -ALL_GENES_GAP_CHAR = '.' +ALL_GENES_GAP_CHAR = "." # human only -human_structure_alignments = pd.read_table(path_to_db/'new_human_vg_alignments_v1.tsv') -human_structure_alignments.set_index('v_gene', drop=True, inplace=True) +human_structure_alignments = pd.read_table( + path_to_db / "new_human_vg_alignments_v1.tsv" +) +human_structure_alignments.set_index("v_gene", drop=True, inplace=True) # human and mouse -both_structure_alignments = pd.read_table(path_to_db/'new_both_vg_alignments_v1.tsv') -both_structure_alignments.set_index(['organism','v_gene'], drop=True, inplace=True) +both_structure_alignments = pd.read_table( + path_to_db / "new_both_vg_alignments_v1.tsv" +) +both_structure_alignments.set_index( + ["organism", "v_gene"], drop=True, inplace=True +) -def read_fasta(filename): # helper - ''' return OrderedDict indexed by the ">" lines (everything after >) - ''' - data = open(filename, 'rU') + +def read_fasta(filename): # helper + """return OrderedDict indexed by the ">" lines (everything after >)""" + data = open(filename, "rU") fasta = OrderedDict() for line in data: - if line[0] == '>': + if line[0] == ">": seqid = line[1:-1] - assert seqid not in fasta # no repeats - fasta[seqid] = '' + assert seqid not in fasta # no repeats + fasta[seqid] = "" else: - l= line.split() + l = line.split() if l: fasta[seqid] += l[0] data.close() return fasta - -mhc_class_1_alfas = read_fasta(path_to_db / 'ClassI_prot.alfas') +mhc_class_1_alfas = read_fasta(path_to_db / "ClassI_prot.alfas") # add HLA-G 2022-04-30 -mhc_class_1_alfas.update(read_fasta(path_to_db / 'new_imgt_hla/G_prot.alfas')) +mhc_class_1_alfas.update(read_fasta(path_to_db / "new_imgt_hla/G_prot.alfas")) # add HLA-E 2022-05-03 -mhc_class_1_alfas.update(read_fasta(path_to_db / 'new_imgt_hla/E_prot.alfas')) +mhc_class_1_alfas.update(read_fasta(path_to_db / "new_imgt_hla/E_prot.alfas")) ks = list(mhc_class_1_alfas.keys()) lencheck = None for k in ks: - newseq = mhc_class_1_alfas[k].replace('-', ALL_GENES_GAP_CHAR)\ - .replace('X',ALL_GENES_GAP_CHAR) + newseq = ( + mhc_class_1_alfas[k] + .replace("-", ALL_GENES_GAP_CHAR) + .replace("X", ALL_GENES_GAP_CHAR) + ) mhc_class_1_alfas[k] = newseq if lencheck is None: lencheck = len(newseq) @@ -73,19 +81,25 @@ def read_fasta(filename): # helper # read mouse sequences (no gaps in them) # these mouse seqs are not the same length -mhc_class_1_alfas.update(read_fasta(path_to_db / 'mouse_class1_align.fasta')) +mhc_class_1_alfas.update(read_fasta(path_to_db / "mouse_class1_align.fasta")) # v2 means new imgt hla class 2 human alignments/sequences: mhc_class_2_alfas = { - 'A': read_fasta(path_to_db / 'both_class_2_A_chains_v2.alfas'), - 'B': read_fasta(path_to_db / 'both_class_2_B_chains_v2.alfas'), + "A": read_fasta(path_to_db / "both_class_2_A_chains_v2.alfas"), + "B": read_fasta(path_to_db / "both_class_2_B_chains_v2.alfas"), } -assert all(all(all(x in amino_acids or x==ALL_GENES_GAP_CHAR - for x in seq) - for seq in alfas.values()) - for alfas in [mhc_class_1_alfas, mhc_class_2_alfas['A'], - mhc_class_2_alfas['B']]) +assert all( + all( + all(x in amino_acids or x == ALL_GENES_GAP_CHAR for x in seq) + for seq in alfas.values() + ) + for alfas in [ + mhc_class_1_alfas, + mhc_class_2_alfas["A"], + mhc_class_2_alfas["B"], + ] +) # the code below is now redundant since we moved the info files and had applied these # changes already @@ -128,24 +142,26 @@ def read_fasta(filename): # helper # TCR, PMHC, TERNARY = 'tcr', 'pmhc', 'ternary' # all_template_info = {TCR:tcr_info, TERNARY:ternary_info, PMHC:pmhc_info} -TCR, PMHC, TERNARY = 'tcr', 'pmhc', 'ternary' +TCR, PMHC, TERNARY = "tcr", "pmhc", "ternary" all_template_info = {} for tag in [TCR, PMHC, TERNARY]: - info = pd.read_table(path_to_db / f'{tag}_templates_v2.tsv') + info = pd.read_table(path_to_db / f"{tag}_templates_v2.tsv") if tag == TCR: - info.set_index(['pdbid','ab'], drop=False, inplace=True) + info.set_index(["pdbid", "ab"], drop=False, inplace=True) else: - info.set_index('pdbid', drop=False, inplace=True) + info.set_index("pdbid", drop=False, inplace=True) all_template_info[tag] = info tcr_info = all_template_info[TCR] pmhc_info = all_template_info[PMHC] ternary_info = all_template_info[TERNARY] -all_template_poses = {TCR:{}, PMHC:{}, TERNARY:{}} +all_template_poses = {TCR: {}, PMHC: {}, TERNARY: {}} -BAD_DGEOM_PDBIDS = '5sws 7jwi 4jry 4nhu 3tjh 4y19 4y1a 1ymm 2wbj 6uz1'.split() -BAD_PMHC_PDBIDS = '3rgv 4ms8 6v1a 6v19 6v18 6v15 6v13 6v0y 2uwe 2jcc 2j8u 1lp9'.split() +BAD_DGEOM_PDBIDS = "5sws 7jwi 4jry 4nhu 3tjh 4y19 4y1a 1ymm 2wbj 6uz1".split() +BAD_PMHC_PDBIDS = ( + "3rgv 4ms8 6v1a 6v19 6v18 6v15 6v13 6v0y 2uwe 2jcc 2j8u 1lp9".split() +) # (new) 6uz1 is engineered and binds down by B2M # 3rgv has cterm of peptide out of groove # 4ms8 has chainbreaks (should check for those!) @@ -170,108 +186,127 @@ def read_fasta(filename): # helper ######################################################################################88 +_cached_tcrdisters = {} -_cached_tcrdisters = {} def get_tcrdister(organism): if organism not in _cached_tcrdisters: - _cached_tcrdisters[organism] = tcrdist.tcr_distances.TcrDistCalculator(organism) + _cached_tcrdisters[organism] = tcrdist.tcr_distances.TcrDistCalculator( + organism + ) return _cached_tcrdisters[organism] def blosum_align( - seq1, - seq2, - gap_open=-11, # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3848038/ - gap_extend=-1, - global_align=False, - verbose=False, + seq1, + seq2, + gap_open=-11, # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3848038/ + gap_extend=-1, + global_align=False, + verbose=False, ): - ''' return 0-indexed dictionary mapping from seq1 to seq2 positions - ''' + """return 0-indexed dictionary mapping from seq1 to seq2 positions""" - scorematrix = matlist.blosum62 + scorematrix = substitution_matrix.load("BLOSUM62") if global_align: alignments = pairwise2.align.globalds( - seq1, seq2, scorematrix, gap_open, gap_extend) + seq1, seq2, scorematrix, gap_open, gap_extend + ) else: alignments = pairwise2.align.localds( - seq1, seq2, scorematrix, gap_open, gap_extend) + seq1, seq2, scorematrix, gap_open, gap_extend + ) # this is annoying, BioPython 1.76 these are vanilla tuples and 1.79 they are named try: - alignment = max(alignments, key=lambda x:x.score) + alignment = max(alignments, key=lambda x: x.score) except AttributeError: - assert len(alignments[0]) == 5 # seqA, seqB, score, start, end - alignment = max(alignments, key=lambda x:x[2]) + assert len(alignments[0]) == 5 # seqA, seqB, score, start, end + alignment = max(alignments, key=lambda x: x[2]) alseq1, alseq2, score, begin, end = alignment if verbose: print(format_alignment(*alignment)) - assert alseq1.replace('-','') == seq1 - assert alseq2.replace('-','') == seq2 + assert alseq1.replace("-", "") == seq1 + assert alseq2.replace("-", "") == seq2 align = {} - for i,(a,b) in enumerate(zip(alseq1, alseq2)): - if a!= '-' and b!='-': - #assert begin <= i <= end - pos1 = i-alseq1[:i].count('-') - pos2 = i-alseq2[:i].count('-') + for i, (a, b) in enumerate(zip(alseq1, alseq2)): + if a != "-" and b != "-": + # assert begin <= i <= end + pos1 = i - alseq1[:i].count("-") + pos2 = i - alseq2[:i].count("-") align[pos1] = pos2 return align # for building pmhc:TCR models from allele/v/j/cdr3 info + def get_v_seq_up_to_cys(organism, v_gene): vg = all_genes[organism][v_gene] alseq = vg.alseq - cys_alpos = vg.cdr_columns[-1][0]-1 # they were 1-indexed, now 0-indexed - if alseq[cys_alpos] != 'C': - print('notC', alseq[cys_alpos]) - geneseq = alseq[:cys_alpos+1].replace(ALL_GENES_GAP_CHAR, '') + cys_alpos = vg.cdr_columns[-1][0] - 1 # they were 1-indexed, now 0-indexed + if alseq[cys_alpos] != "C": + print("notC", alseq[cys_alpos]) + geneseq = alseq[: cys_alpos + 1].replace(ALL_GENES_GAP_CHAR, "") return geneseq + def get_j_seq_after_cdr3(organism, j_gene): jg = all_genes[organism][j_gene] alseq = jg.alseq cdr3seq = jg.cdrs[0] assert alseq.startswith(cdr3seq) - return alseq[len(cdr3seq):].replace(ALL_GENES_GAP_CHAR,'') + return alseq[len(cdr3seq) :].replace(ALL_GENES_GAP_CHAR, "") + -#A02_seq = 'GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRMEPRAPWIEQEGPEYWDGETRKVKAHSQTHRVDLGTLRGYYNQSEAGSHTVQRMYGCDVGSDWRFLRGYHQYAYDGKDYIALKEDLRSWTAADMAAQTTKHKWEAAHVAEQLRAYLEGTCVEWLRRYLENGKETLQR' +# A02_seq = 'GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRMEPRAPWIEQEGPEYWDGETRKVKAHSQTHRVDLGTLRGYYNQSEAGSHTVQRMYGCDVGSDWRFLRGYHQYAYDGKDYIALKEDLRSWTAADMAAQTTKHKWEAAHVAEQLRAYLEGTCVEWLRRYLENGKETLQR' ## borrowed from cdr3s_human.py ## 1-indexed: -extra_alignment_columns_1x = { 'mouse':{'A':[9,86],'B':[] }, ## 1-indexed - 'human':{'A':[],'B':[] } } +extra_alignment_columns_1x = { + "mouse": {"A": [9, 86], "B": []}, ## 1-indexed + "human": {"A": [], "B": []}, +} core_positions_generic_1x = [ - 21, 23, 25, ## 23 is C - 39, 41, ## 41 is W - 53, 54, 55, - 78, ## maybe also 80? - 89, ## 89 is L - 102, 103, 104 ## 104 is C + 21, + 23, + 25, ## 23 is C + 39, + 41, ## 41 is W + 53, + 54, + 55, + 78, ## maybe also 80? + 89, ## 89 is L + 102, + 103, + 104, ## 104 is C ] all_core_alseq_positions_0x = {} for organism in extra_alignment_columns_1x: - for ab,xcols in extra_alignment_columns_1x[organism].items(): - positions = [ x-1+sum(y<=x for y in xcols) # this is not quite right but it wrks - for x in core_positions_generic_1x ] + for ab, xcols in extra_alignment_columns_1x[organism].items(): + positions = [ + x + - 1 + + sum(y <= x for y in xcols) # this is not quite right but it wrks + for x in core_positions_generic_1x + ] if False: - for v,g in all_genes[organism].items(): - if g.chain == ab and g.region == 'V' and v.endswith('*01'): - coreseq = ''.join([g.alseq[x] for x in positions]) + for v, g in all_genes[organism].items(): + if g.chain == ab and g.region == "V" and v.endswith("*01"): + coreseq = "".join([g.alseq[x] for x in positions]) print(coreseq, v, organism, ab) - all_core_alseq_positions_0x[(organism,ab)] = positions + all_core_alseq_positions_0x[(organism, ab)] = positions def get_core_positions_0x(organism, v_gene): @@ -293,99 +328,118 @@ def align_chainseq_to_imgt_msa(organism, chainseq, v_gene): geneseq = get_v_seq_up_to_cys(organism, v_gene) # extends past end of geneseq - geneseq_to_alseq = {k-alseq[:k].count(ALL_GENES_GAP_CHAR) : k - for k,a in enumerate(alseq) if a!=ALL_GENES_GAP_CHAR} + geneseq_to_alseq = { + k - alseq[:k].count(ALL_GENES_GAP_CHAR): k + for k, a in enumerate(alseq) + if a != ALL_GENES_GAP_CHAR + } chainseq_to_geneseq = blosum_align(chainseq, geneseq) - chainseq_to_alseq = {i:geneseq_to_alseq[j] - for i,j in chainseq_to_geneseq.items()} + chainseq_to_alseq = { + i: geneseq_to_alseq[j] for i, j in chainseq_to_geneseq.items() + } return chainseq_to_alseq -def align_chainseq_to_structure_msa(organism, chainseq, v_gene, msa_type='both'): - assert msa_type in ['both','human'] +def align_chainseq_to_structure_msa( + organism, chainseq, v_gene, msa_type="both" +): + assert msa_type in ["both", "human"] - if msa_type == 'human': - assert organism == 'human' - #assert v_gene in human_structure_alignments.index + if msa_type == "human": + assert organism == "human" + # assert v_gene in human_structure_alignments.index row = human_structure_alignments.loc[v_gene] else: - row = both_structure_alignments.loc[(organism,v_gene)] + row = both_structure_alignments.loc[(organism, v_gene)] alseq = row.alseq geneseq = get_v_seq_up_to_cys(organism, v_gene) - assert alseq.replace(ALL_GENES_GAP_CHAR,'') == geneseq + assert alseq.replace(ALL_GENES_GAP_CHAR, "") == geneseq # - geneseq_to_alseq = {k-alseq[:k].count(ALL_GENES_GAP_CHAR) : k - for k,a in enumerate(alseq) if a!=ALL_GENES_GAP_CHAR} + geneseq_to_alseq = { + k - alseq[:k].count(ALL_GENES_GAP_CHAR): k + for k, a in enumerate(alseq) + if a != ALL_GENES_GAP_CHAR + } chainseq_to_geneseq = blosum_align(chainseq, geneseq) - chainseq_to_alseq = {i:geneseq_to_alseq[j] - for i,j in chainseq_to_geneseq.items()} + chainseq_to_alseq = { + i: geneseq_to_alseq[j] for i, j in chainseq_to_geneseq.items() + } return chainseq_to_alseq + _tcr_alignment_cache = None + + def align_tcr_info_pdb_chain_to_structure_msa(pdbid, ab, msa_type_in): - ''' pdbid,ab has to be in the tcr_info index - ''' + """pdbid,ab has to be in the tcr_info index""" global _tcr_alignment_cache - assert msa_type_in in ['both','human'] + assert msa_type_in in ["both", "human"] if _tcr_alignment_cache is None: _tcr_alignment_cache = { - 'both' :{'A':{}, 'B':{}}, - 'human':{'A':{}, 'B':{}} + "both": {"A": {}, "B": {}}, + "human": {"A": {}, "B": {}}, } - print('setting up cache for align_tcr_info_pdb_chain_to_structure_msa function') + print( + "setting up cache for align_tcr_info_pdb_chain_to_structure_msa function" + ) for l in tcr_info.itertuples(): geneseq = get_v_seq_up_to_cys(l.organism, l.v_gene) - for msa_type in ['both', 'human']: - if msa_type == 'human': - if l.organism != 'human': + for msa_type in ["both", "human"]: + if msa_type == "human": + if l.organism != "human": continue row = human_structure_alignments.loc[l.v_gene] else: row = both_structure_alignments.loc[(l.organism, l.v_gene)] alseq = row.alseq - assert alseq.replace(ALL_GENES_GAP_CHAR,'') == geneseq - geneseq_to_alseq = {k-alseq[:k].count(ALL_GENES_GAP_CHAR) : k - for k,a in enumerate(alseq) - if a!=ALL_GENES_GAP_CHAR} + assert alseq.replace(ALL_GENES_GAP_CHAR, "") == geneseq + geneseq_to_alseq = { + k - alseq[:k].count(ALL_GENES_GAP_CHAR): k + for k, a in enumerate(alseq) + if a != ALL_GENES_GAP_CHAR + } chainseq_to_geneseq = blosum_align(l.chainseq, geneseq) - chainseq_to_alseq = {i:geneseq_to_alseq[j] - for i,j in chainseq_to_geneseq.items()} - _tcr_alignment_cache[msa_type][l.ab][l.pdbid] = chainseq_to_alseq - - print('DONE setting up cache for align_tcr_info_pdb_chain_to_structure_msa', - 'function') + chainseq_to_alseq = { + i: geneseq_to_alseq[j] + for i, j in chainseq_to_geneseq.items() + } + _tcr_alignment_cache[msa_type][l.ab][ + l.pdbid + ] = chainseq_to_alseq + + print( + "DONE setting up cache for align_tcr_info_pdb_chain_to_structure_msa", + "function", + ) return _tcr_alignment_cache[msa_type_in][ab][pdbid] - def align_cdr3s(a, b): - shortseq, longseq = (a,b) if len(a)= 5 - gappos = min( 6, 3 + (lenshort-5)//2 ) - num_gaps = lenlong-lenshort - align = {i:i for i in range(gappos)} - align.update({i:i+num_gaps for i in range(gappos,lenshort)}) - if len(a)>=len(b): # reverse - align = {j:i for i,j in align.items()} + gappos = min(6, 3 + (lenshort - 5) // 2) + num_gaps = lenlong - lenshort + align = {i: i for i in range(gappos)} + align.update({i: i + num_gaps for i in range(gappos, lenshort)}) + if len(a) >= len(b): # reverse + align = {j: i for i, j in align.items()} return align - def get_tcr_chain_trim_positions(organism, chainseq, v, j, cdr3): - ''' could include None if the position is missing (deleted) for this v gene... - ''' + """could include None if the position is missing (deleted) for this v gene...""" assert cdr3 in chainseq nterm_segment_alseq_positions_0x = [3, 4, 5] # 1. imgt alignment 1-index positions 4-6 ([VIL].Q) @@ -396,7 +450,7 @@ def get_tcr_chain_trim_positions(organism, chainseq, v, j, cdr3): alseq = vg.alseq geneseq = get_v_seq_up_to_cys(organism, v) - assert alseq.replace(ALL_GENES_GAP_CHAR,'').startswith(geneseq) + assert alseq.replace(ALL_GENES_GAP_CHAR, "").startswith(geneseq) geneseq_to_chainseq = blosum_align(geneseq, chainseq) @@ -406,52 +460,56 @@ def get_tcr_chain_trim_positions(organism, chainseq, v, j, cdr3): geneseq_positions = [] for pos in nterm_segment_alseq_positions_0x: if alseq[pos] == ALL_GENES_GAP_CHAR: - print('N None:', organism, v) + print("N None:", organism, v) geneseq_positions.append(None) else: - geneseq_positions.append(pos-alseq[:pos].count(ALL_GENES_GAP_CHAR)) + geneseq_positions.append( + pos - alseq[:pos].count(ALL_GENES_GAP_CHAR) + ) - start, stop = core_positions_0x[0], core_positions_0x[9] # inclusive! - geneseq_positions.extend(range(start, stop+1)) + start, stop = core_positions_0x[0], core_positions_0x[9] # inclusive! + geneseq_positions.extend(range(start, stop + 1)) - chainseq_positions = [geneseq_to_chainseq.get(pos,None) - for pos in geneseq_positions] + chainseq_positions = [ + geneseq_to_chainseq.get(pos, None) for pos in geneseq_positions + ] - start = geneseq_to_chainseq[core_positions_0x[10]-1] - stop = chainseq.index(cdr3)+len(cdr3)+3 # index of position after GXG - chainseq_positions.extend(range(start, stop+1)) + start = geneseq_to_chainseq[core_positions_0x[10] - 1] + stop = chainseq.index(cdr3) + len(cdr3) + 3 # index of position after GXG + chainseq_positions.extend(range(start, stop + 1)) if None in chainseq_positions: - print(organism, v, [i for i,x in enumerate(chainseq_positions) if x is None]) + print( + organism, + v, + [i for i, x in enumerate(chainseq_positions) if x is None], + ) blosum_align(geneseq, chainseq, verbose=True) - return chainseq_positions - - def align_vgene_to_template_pdb_chain( - ab, - trg_organism, - trg_v, - trg_j, - trg_cdr3, - trg_msa_alignments, - tmp_pdbid, - # tmp_organism, - # tmp_v, - # tmp_j, - # tmp_cdr3, - # tmp_chainseq, - verbose=False, + ab, + trg_organism, + trg_v, + trg_j, + trg_cdr3, + trg_msa_alignments, + tmp_pdbid, + # tmp_organism, + # tmp_v, + # tmp_j, + # tmp_cdr3, + # tmp_chainseq, + verbose=False, ): - ''' Returns trg_to_tmp, trg_chainseq + """Returns trg_to_tmp, trg_chainseq trg_to_tmp is 0-indexed dict aligning trg_chainseq to tmp_chainseq - ''' + """ # align trg_vg to nearest rep (trg_rep) in structure MSA # align tmp_vg to nearest rep (tmp_rep) in structure MSA # use structure MSA to create alignment between trg_rep and tmp_rep @@ -459,7 +517,7 @@ def align_vgene_to_template_pdb_chain( # align cdr3 with gaps in the middle # align jgenes - tmp_row = tcr_info.loc[(tmp_pdbid,ab)] + tmp_row = tcr_info.loc[(tmp_pdbid, ab)] tmp_organism = tmp_row.organism tmp_chainseq = tmp_row.chainseq tmp_cdr3 = tmp_row.cdr3 @@ -470,8 +528,11 @@ def align_vgene_to_template_pdb_chain( trg_chainseq = trg_vseq[:-1] + trg_cdr3 + trg_jseq # align V regions - msa_type = ('human' if trg_organism=='human' and tmp_organism=='human' else - 'both') + msa_type = ( + "human" + if trg_organism == "human" and tmp_organism == "human" + else "both" + ) trg_vseq_to_alseq = trg_msa_alignments[msa_type] @@ -480,9 +541,10 @@ def align_vgene_to_template_pdb_chain( # assert (sorted(trg_vseq_to_alseq.items()) == # sorted(trg_vseq_to_alseq_old.items())) - #print(tmp_pdbid, tmp_organism, ab, msa_type) + # print(tmp_pdbid, tmp_organism, ab, msa_type) tmp_vseq_to_alseq = align_tcr_info_pdb_chain_to_structure_msa( - tmp_pdbid, ab, msa_type) + tmp_pdbid, ab, msa_type + ) # tmp_vseq_to_alseq_old = align_chainseq_to_structure_msa( # tmp_organism, tmp_chainseq, tmp_row.v_gene, msa_type=msa_type) @@ -494,38 +556,42 @@ def align_vgene_to_template_pdb_chain( # assert (sorted(tmp_vseq_to_alseq.items()) == # sorted(tmp_vseq_to_alseq_old.items())) - alseq_to_tmp_vseq = {j:i for i,j in tmp_vseq_to_alseq.items()} + alseq_to_tmp_vseq = {j: i for i, j in tmp_vseq_to_alseq.items()} - trg_to_tmp = {} # alignment over this TCR chain, 0-indexed - for i,j in trg_vseq_to_alseq.items(): + trg_to_tmp = {} # alignment over this TCR chain, 0-indexed + for i, j in trg_vseq_to_alseq.items(): if j in alseq_to_tmp_vseq: - trg_to_tmp[i]=alseq_to_tmp_vseq[j] + trg_to_tmp[i] = alseq_to_tmp_vseq[j] # align cdr3 regions - assert trg_vseq[-1] == 'C' - trg_offset = len(trg_vseq)-1 # skip 'C' + assert trg_vseq[-1] == "C" + trg_offset = len(trg_vseq) - 1 # skip 'C' tmp_offset = tmp_chainseq.index(tmp_cdr3) al = align_cdr3s(trg_cdr3, tmp_cdr3) - trg_to_tmp.update({i+trg_offset:j+tmp_offset - for i,j in al.items()}) + trg_to_tmp.update({i + trg_offset: j + tmp_offset for i, j in al.items()}) trg_offset += len(trg_cdr3) tmp_offset += len(tmp_cdr3) - # align j region - tmp_jseq= tmp_chainseq[tmp_chainseq.index(tmp_cdr3)+ - len(tmp_cdr3):] - if (trg_jseq[:3]+tmp_jseq[:3]).count('G') < 4: - print('GXGs?', trg_jseq[:3], tmp_jseq[:3]) + tmp_jseq = tmp_chainseq[tmp_chainseq.index(tmp_cdr3) + len(tmp_cdr3) :] + if (trg_jseq[:3] + tmp_jseq[:3]).count("G") < 4: + print("GXGs?", trg_jseq[:3], tmp_jseq[:3]) for i in range(min(len(trg_jseq), len(tmp_jseq))): - trg_to_tmp[trg_offset+i] = tmp_offset+i + trg_to_tmp[trg_offset + i] = tmp_offset + i - identities = sum(trg_chainseq[i]==tmp_chainseq[j] - for i,j in trg_to_tmp.items())/len(trg_chainseq) + identities = sum( + trg_chainseq[i] == tmp_chainseq[j] for i, j in trg_to_tmp.items() + ) / len(trg_chainseq) if verbose: - print(f'align_vgene_to_template_pdb_chain: {identities:6.3f}', - trg_v, tmp_v, trg_cdr3, tmp_cdr3, verbose=verbose) + print( + f"align_vgene_to_template_pdb_chain: {identities:6.3f}", + trg_v, + tmp_v, + trg_cdr3, + tmp_cdr3, + verbose=verbose, + ) return trg_to_tmp, trg_chainseq @@ -610,110 +676,123 @@ def get_mhc_class_1_alseq(allele): return mhc_class_1_alfas[allele] sortl = [] for k in mhc_class_1_alfas: - if k.startswith(allele) and k[len(allele)] == ':': - suffix = [int(x) if x.isdigit() else 100 - for x in k[len(allele)+1:].split(':')] + if k.startswith(allele) and k[len(allele)] == ":": + suffix = [ + int(x) if x.isdigit() else 100 + for x in k[len(allele) + 1 :].split(":") + ] sortl.append((suffix, k)) if sortl: sortl.sort() best_allele = sortl[0][1] - #print('close allele:', allele, best_allele) + # print('close allele:', allele, best_allele) return mhc_class_1_alfas[best_allele] - elif allele.count(':')>1: - trim_allele = ':'.join(allele.split(':')[:2]) + elif allele.count(":") > 1: + trim_allele = ":".join(allele.split(":")[:2]) return get_mhc_class_1_alseq(trim_allele) else: return None + def get_mhc_class_2_alseq(chain, allele): if allele in mhc_class_2_alfas[chain]: return mhc_class_2_alfas[chain][allele] - assert '*' in allele # human + assert "*" in allele # human sortl = [] for k in mhc_class_2_alfas[chain]: - if k.startswith(allele) and k[len(allele)] == ':': - suffix = [int(x) if x.isdigit() else 100 - for x in k[len(allele)+1:].split(':')] + if k.startswith(allele) and k[len(allele)] == ":": + suffix = [ + int(x) if x.isdigit() else 100 + for x in k[len(allele) + 1 :].split(":") + ] sortl.append((suffix, k)) if sortl: sortl.sort() best_allele = sortl[0][1] - #print('close allele:', allele, best_allele) + # print('close allele:', allele, best_allele) return mhc_class_2_alfas[chain][best_allele] - elif allele.count(':')>1: - trim_allele = ':'.join(allele.split(':')[:2]) + elif allele.count(":") > 1: + trim_allele = ":".join(allele.split(":")[:2]) return get_mhc_class_2_alseq(chain, trim_allele) else: return None def get_template_pose_and_tdinfo(pdbid, complex_type): - ''' returns pose, tdinfo + """returns pose, tdinfo complex_type should be in {TCR, TERNARY, PMHC} - ''' - info, poses = all_template_info[complex_type], all_template_poses[complex_type] + """ + info, poses = ( + all_template_info[complex_type], + all_template_poses[complex_type], + ) if pdbid not in poses: - pdbfile = set(info[info.pdbid==pdbid].pdbfile) + pdbfile = set(info[info.pdbid == pdbid].pdbfile) if not pdbfile: # right now we only have class 1 in the special pmhc info... assert complex_type == PMHC - info = all_template_info[TERNARY] # NOTE NOTE NOTE - pdbfile = set(info[info.pdbid==pdbid].pdbfile) + info = all_template_info[TERNARY] # NOTE NOTE NOTE + pdbfile = set(info[info.pdbid == pdbid].pdbfile) assert len(pdbfile) == 1 - pdbfile = str(path_to_db) + '/' + pdbfile.pop() - #print('read:', pdbfile) + pdbfile = str(path_to_db) + "/" + pdbfile.pop() + # print('read:', pdbfile) pose = pdblite.pose_from_pdb(pdbfile) - tdifile = pdbfile+'.tcrdock_info.json' - tdinfo = TCRdockInfo().from_string(open(tdifile,'r').read()) - #print('make tdinfo 0-indexed:', tdifile) - #tdinfo.renumber({i+1:i for i in range(len(pose['sequence']))}) + tdifile = pdbfile + ".tcrdock_info.json" + tdinfo = TCRdockInfo().from_string(open(tdifile, "r").read()) + # print('make tdinfo 0-indexed:', tdifile) + # tdinfo.renumber({i+1:i for i in range(len(pose['sequence']))}) poses[pdbid] = (pose, tdinfo) pose, tdinfo = poses[pdbid] - return copy.deepcopy(pose), TCRdockInfo().from_dict(tdinfo.to_dict())# return copies + return copy.deepcopy(pose), TCRdockInfo().from_dict( + tdinfo.to_dict() + ) # return copies + -def count_peptide_mismatches(a,b): - if len(a)>len(b): - return count_peptide_mismatches(b,a) - lendiff = len(b)-len(a) - assert lendiff>=0 +def count_peptide_mismatches(a, b): + if len(a) > len(b): + return count_peptide_mismatches(b, a) + lendiff = len(b) - len(a) + assert lendiff >= 0 min_mismatches = len(a) - for shift in range(lendiff+1): - mismatches = sum(x!=y for x,y in zip(a,b[shift:])) + for shift in range(lendiff + 1): + mismatches = sum(x != y for x, y in zip(a, b[shift:])) if mismatches < min_mismatches: min_mismatches = mismatches # also allow bulging out in the middle - nt = len(a)//2 + nt = len(a) // 2 ct = len(a) - nt - btrim = b[:nt]+b[-ct:] + btrim = b[:nt] + b[-ct:] assert len(btrim) == len(a) - mismatches = sum(x!=y for x,y in zip(a,btrim)) + mismatches = sum(x != y for x, y in zip(a, btrim)) if mismatches < min_mismatches: min_mismatches = mismatches return min_mismatches -pep1 = 'DSIODJSJD' # sanity checking... -pep2 = 'DSIXDJSJD' -assert count_peptide_mismatches(pep1,pep1)==0 -assert count_peptide_mismatches(pep1,pep1[:-1])==0 -assert count_peptide_mismatches(pep1,pep2)==1 + +pep1 = "DSIODJSJD" # sanity checking... +pep2 = "DSIXDJSJD" +assert count_peptide_mismatches(pep1, pep1) == 0 +assert count_peptide_mismatches(pep1, pep1[:-1]) == 0 +assert count_peptide_mismatches(pep1, pep2) == 1 + def get_clean_and_nonredundant_ternary_tcrs_df( - min_peptide_mismatches = 3, - min_tcrdist = 120.5, - peptide_tcrdist_logical = 'or', # 'or' or 'and' -- 'or' is nr by either feature - drop_HLA_E = True, - verbose=False, - skip_redundancy_check=False, # just add resol,mismatches and sort - filter_BAD_DGEOM_PDBIDS=True, - filter_BAD_PMHC_PDBIDS=True, - tcrs=None, + min_peptide_mismatches=3, + min_tcrdist=120.5, + peptide_tcrdist_logical="or", # 'or' or 'and' -- 'or' is nr by either feature + drop_HLA_E=True, + verbose=False, + skip_redundancy_check=False, # just add resol,mismatches and sort + filter_BAD_DGEOM_PDBIDS=True, + filter_BAD_PMHC_PDBIDS=True, + tcrs=None, ): - ''' peptide_tcrdist_logical = 'or' is more stringent redundancy filtering + """peptide_tcrdist_logical = 'or' is more stringent redundancy filtering ie, smaller df returned. A TCR:pMHC is considered redundant by peptide OR by TCRdist. 'and' means redundant only if both peptide and TCRdist redundant - ''' - assert peptide_tcrdist_logical in ['or','and'] + """ + assert peptide_tcrdist_logical in ["or", "and"] if tcrs is None: tcrs = ternary_info.copy() @@ -725,52 +804,61 @@ def get_clean_and_nonredundant_ternary_tcrs_df( bad_pdbids.update(set(BAD_DGEOM_PDBIDS)) if filter_BAD_PMHC_PDBIDS: bad_pdbids.update(set(BAD_PMHC_PDBIDS)) - print('get_clean_and_nonredundant_ternary_tcrs_df: bad_pdbids', bad_pdbids) + print("get_clean_and_nonredundant_ternary_tcrs_df: bad_pdbids", bad_pdbids) tcrs = tcrs[~tcrs.pdbid.isin(bad_pdbids)] - if drop_HLA_E: # drop HLA-E - tcrs = tcrs[~tcrs.mhc_allele.str.startswith('E*')] + if drop_HLA_E: # drop HLA-E + tcrs = tcrs[~tcrs.mhc_allele.str.startswith("E*")] # need to add resolution pdbid2resolution = {} pdbid2mismatches = {} - for organism in 'human mouse'.split(): + for organism in "human mouse".split(): # temporary hack... - logfile = path_to_db / f'tmp.pdb_tcr_{organism}.2023-06-02.log' + logfile = path_to_db / f"tmp.pdb_tcr_{organism}.2023-06-02.log" # logfile = path_to_db / f'tmp.pdb_tcr_{organism}.2021-08-05.log' assert exists(logfile) - for line in os.popen(f'grep ^both {logfile}'): + for line in os.popen(f"grep ^both {logfile}"): l = line.split() pdbid = l[1] resolution = float(l[-2]) v_mismatches = int(l[3]) + int(l[7]) pdbid2resolution[pdbid] = resolution - pdbid2mismatches[pdbid] = min(v_mismatches, pdbid2mismatches.get(pdbid,100)) + pdbid2mismatches[pdbid] = min( + v_mismatches, pdbid2mismatches.get(pdbid, 100) + ) - tcrs['resolution'] = tcrs.pdbid.map(pdbid2resolution) - tcrs['mismatches'] = tcrs.pdbid.map(pdbid2mismatches) - assert tcrs.resolution.isna().sum()==0 + tcrs["resolution"] = tcrs.pdbid.map(pdbid2resolution) + tcrs["mismatches"] = tcrs.pdbid.map(pdbid2mismatches) + assert tcrs.resolution.isna().sum() == 0 ## remove redundancy # sort by count for peptide, then resolution - tcrs['org_pep'] = (tcrs.mhc_class.astype(str) + "_" + - tcrs.organism + "_" + tcrs.pep_seq) - tcrs['org_pep_count'] = tcrs.org_pep.map(tcrs.org_pep.value_counts()) + tcrs["org_pep"] = ( + tcrs.mhc_class.astype(str) + "_" + tcrs.organism + "_" + tcrs.pep_seq + ) + tcrs["org_pep_count"] = tcrs.org_pep.map(tcrs.org_pep.value_counts()) - tcrs['neg_quality'] = -10*tcrs.org_pep_count + tcrs.resolution + 0.5*tcrs.mismatches + tcrs["neg_quality"] = ( + -10 * tcrs.org_pep_count + tcrs.resolution + 0.5 * tcrs.mismatches + ) - tcrs.sort_values('neg_quality', inplace=True) + tcrs.sort_values("neg_quality", inplace=True) if not skip_redundancy_check: - tdist = {'human': tcrdist.tcr_distances.TcrDistCalculator('human'), - 'mouse': tcrdist.tcr_distances.TcrDistCalculator('mouse')} + tdist = { + "human": tcrdist.tcr_distances.TcrDistCalculator("human"), + "mouse": tcrdist.tcr_distances.TcrDistCalculator("mouse"), + } - tcr_tuples = [((l.va, l.ja, l.cdr3a), (l.vb, l.jb, l.cdr3b)) - for l in tcrs.itertuples()] + tcr_tuples = [ + ((l.va, l.ja, l.cdr3a), (l.vb, l.jb, l.cdr3b)) + for l in tcrs.itertuples() + ] - is_redundant = [False]*tcrs.shape[0] + is_redundant = [False] * tcrs.shape[0] for i, irow in enumerate(tcrs.itertuples()): # check to see if i is too close to any previous tcr @@ -780,223 +868,305 @@ def get_clean_and_nonredundant_ternary_tcrs_df( continue pep_mms = count_peptide_mismatches(irow.pep_seq, jrow.pep_seq) - if (irow.organism == jrow.organism and - irow.mhc_class == jrow.mhc_class): + if ( + irow.organism == jrow.organism + and irow.mhc_class == jrow.mhc_class + ): tcrd = tdist[irow.organism](tcr_tuples[i], tcr_tuples[j]) - pep_red = (pep_mms < min_peptide_mismatches) - tcr_red = (tcrd < min_tcrdist) - if ((peptide_tcrdist_logical=='or' and - (pep_red or tcr_red)) or - (peptide_tcrdist_logical=='and' and - (pep_red and tcr_red))): + pep_red = pep_mms < min_peptide_mismatches + tcr_red = tcrd < min_tcrdist + if ( + peptide_tcrdist_logical == "or" + and (pep_red or tcr_red) + ) or ( + peptide_tcrdist_logical == "and" + and (pep_red and tcr_red) + ): # redundant is_redundant[i] = True if verbose: - print('skip:', i, irow.org_pep, irow.neg_quality, - tcr_tuples[i]) + print( + "skip:", + i, + irow.org_pep, + irow.neg_quality, + tcr_tuples[i], + ) break - tcrs['is_redundant'] = is_redundant + tcrs["is_redundant"] = is_redundant tcrs = tcrs[~tcrs.is_redundant].copy() return tcrs + def filter_templates_by_tcrdist( - templates, # dataframe - organism, va, cdr3a, vb, cdr3b, - min_paired_tcrdist=-1, - min_singlechain_tcrdist=-1, - verbose=False, + templates, # dataframe + organism, + va, + cdr3a, + vb, + cdr3b, + min_paired_tcrdist=-1, + min_singlechain_tcrdist=-1, + verbose=False, ): - ''' returns new templates + """returns new templates will have paired_tcrdist and singlechain_tcrdist cols - ''' - tcrdister = get_tcrdister('human_and_mouse') + """ + tcrdister = get_tcrdister("human_and_mouse") template_tcrs = [ - ((l.organism[0]+l.va, None, l.cdr3a), (l.organism[0]+l.vb, None, l.cdr3b)) - for l in templates.itertuples()] + ( + (l.organism[0] + l.va, None, l.cdr3a), + (l.organism[0] + l.vb, None, l.cdr3b), + ) + for l in templates.itertuples() + ] - target_tcr = ((organism[0]+va, None, cdr3a), (organism[0]+vb, None, cdr3b)) + target_tcr = ( + (organism[0] + va, None, cdr3a), + (organism[0] + vb, None, cdr3b), + ) - templates['paired_tcrdist'] = np.array( - [tcrdister(target_tcr,x) for x in template_tcrs]) + templates["paired_tcrdist"] = np.array( + [tcrdister(target_tcr, x) for x in template_tcrs] + ) - for iab, ab in enumerate('AB'): - templates[ab+'_tcrdist'] = np.array( - [tcrdister.single_chain_distance(target_tcr[iab], x[iab]) - for x in template_tcrs]) + for iab, ab in enumerate("AB"): + templates[ab + "_tcrdist"] = np.array( + [ + tcrdister.single_chain_distance(target_tcr[iab], x[iab]) + for x in template_tcrs + ] + ) - templates['AB_tcrdist'] = templates.paired_tcrdist + templates["AB_tcrdist"] = templates.paired_tcrdist - templates['singlechain_tcrdist'] = np.minimum( - templates.A_tcrdist, templates.B_tcrdist) + templates["singlechain_tcrdist"] = np.minimum( + templates.A_tcrdist, templates.B_tcrdist + ) - too_close_mask = ((templates.organism==organism)& - ((templates.paired_tcrdist < min_paired_tcrdist)| - (templates.singlechain_tcrdist < min_singlechain_tcrdist))) + too_close_mask = (templates.organism == organism) & ( + (templates.paired_tcrdist < min_paired_tcrdist) + | (templates.singlechain_tcrdist < min_singlechain_tcrdist) + ) if verbose: - print('too close by tcrdist:', np.sum(too_close_mask), organism, - va, cdr3a, vb, cdr3b) + print( + "too close by tcrdist:", + np.sum(too_close_mask), + organism, + va, + cdr3a, + vb, + cdr3b, + ) return templates[~too_close_mask].copy() + def filter_templates_by_peptide_mismatches( - templates, - organism, - mhc_class, - peptides_for_filtering, - min_peptide_mismatches, - verbose = False, + templates, + organism, + mhc_class, + peptides_for_filtering, + min_peptide_mismatches, + verbose=False, ): if not peptides_for_filtering: return templates.copy() - templates['filt_peptide_mismatches'] = np.array( - [min(count_peptide_mismatches(p, l.pep_seq) for p in peptides_for_filtering) - for l in templates.itertuples()]) - too_close_mask = ((templates.organism==organism) & - (templates.mhc_class==mhc_class) & - (templates.filt_peptide_mismatches=num_templates: + print( + "new_dgeom_template:", + chain, + len(dfl), + row1[chain + "_tcrdist"], + row1.organism, + row1.mhc_class, + row1.mhc_allele, + row1.pep_seq, + row1.va, + row1.cdr3a, + row1.vb, + row1.cdr3b, + ) + if len(dfl) >= num_templates: break all_templates[chain] = pd.DataFrame(dfl) return all_templates - - - - def make_templates_for_alphafold_same_pmhc( - organism, - va, - ja, - cdr3a, - vb, - jb, - cdr3b, - mhc_class, - mhc_allele, - peptide, - outfile_prefix, - num_runs=3, - num_templates_per_run=4, - min_single_chain_tcrdist=-1, - next_best_identity_threshold_mhc=0.98, - next_best_identity_threshold_tcr=0.98, - verbose=False, + organism, + va, + ja, + cdr3a, + vb, + jb, + cdr3b, + mhc_class, + mhc_allele, + peptide, + outfile_prefix, + num_runs=3, + num_templates_per_run=4, + min_single_chain_tcrdist=-1, + next_best_identity_threshold_mhc=0.98, + next_best_identity_threshold_tcr=0.98, + verbose=False, ): - ''' Only use ternary templates: no franken templates + """Only use ternary templates: no franken templates Exclude potential templates based on @@ -1015,92 +1185,106 @@ def make_templates_for_alphafold_same_pmhc( returns None for failure - ''' + """ # check arguments if mhc_class == 2: assert len(peptide) == CLASS2_PEPLEN - assert mhc_allele.count(',') == 1 + assert mhc_allele.count(",") == 1 - tcrdister = get_tcrdister('human_and_mouse') + tcrdister = get_tcrdister("human_and_mouse") core_len = TCR_CORE_LEN - template_mask = ((ternary_info.organism == organism) & - (ternary_info.mhc_class == mhc_class) & - (ternary_info.mhc_allele == mhc_allele) & - (ternary_info.pep_seq == peptide) & - (~ternary_info.pdbid.isin(BAD_PMHC_PDBIDS))& - (~ternary_info.pdbid.isin(BAD_DGEOM_PDBIDS))) + template_mask = ( + (ternary_info.organism == organism) + & (ternary_info.mhc_class == mhc_class) + & (ternary_info.mhc_allele == mhc_allele) + & (ternary_info.pep_seq == peptide) + & (~ternary_info.pdbid.isin(BAD_PMHC_PDBIDS)) + & (~ternary_info.pdbid.isin(BAD_DGEOM_PDBIDS)) + ) if not template_mask.sum(): - print('ERROR no matching templates:: make_templates_for_alphafold_same_pmhc', - organism, mhc_class, mhc_allele, peptide) + print( + "ERROR no matching templates:: make_templates_for_alphafold_same_pmhc", + organism, + mhc_class, + mhc_allele, + peptide, + ) templates = ternary_info[template_mask] - print('make_templates_for_alphafold_same_pmhc num_templates=', templates.shape[0]) - + print( + "make_templates_for_alphafold_same_pmhc num_templates=", + templates.shape[0], + ) # get MHC align-seqs if mhc_class == 1: trg_mhc_alseqs = [get_mhc_class_1_alseq(mhc_allele)] else: - trg_mhc_alseqs = [get_mhc_class_2_alseq('A', mhc_allele.split(',')[0]), - get_mhc_class_2_alseq('B', mhc_allele.split(',')[1])] - + trg_mhc_alseqs = [ + get_mhc_class_2_alseq("A", mhc_allele.split(",")[0]), + get_mhc_class_2_alseq("B", mhc_allele.split(",")[1]), + ] return None + def align_vgene_to_structure_msas(organism, v_gene): msa_alignments = {} vseq = get_v_seq_up_to_cys(organism, v_gene) - for msa_type in ['both','human']: - if msa_type == 'human': - if organism != 'human': + for msa_type in ["both", "human"]: + if msa_type == "human": + if organism != "human": continue row = human_structure_alignments.loc[v_gene] else: row = both_structure_alignments.loc[(organism, v_gene)] - assert row.alseq.replace(ALL_GENES_GAP_CHAR,'') == vseq - vseq_to_alseq = {k-row.alseq[:k].count(ALL_GENES_GAP_CHAR) : k - for k,a in enumerate(row.alseq) if a!=ALL_GENES_GAP_CHAR} + assert row.alseq.replace(ALL_GENES_GAP_CHAR, "") == vseq + vseq_to_alseq = { + k - row.alseq[:k].count(ALL_GENES_GAP_CHAR): k + for k, a in enumerate(row.alseq) + if a != ALL_GENES_GAP_CHAR + } msa_alignments[msa_type] = vseq_to_alseq return msa_alignments def make_templates_for_alphafold( - organism, - va, - ja, - cdr3a, - vb, - jb, - cdr3b, - mhc_class, - mhc_allele, - peptide, - outfile_prefix, - num_runs=5, # match default in setup_for_alphafold - num_templates_per_run=4, - exclude_self_peptide_docking_geometries=False, - exclude_docking_geometry_peptides=None, # None or list of peptides - # below is only applied if exclude_docking_geometry_peptides is nonempty - # or exclude_self_peptide_docking_geometries - min_dgeom_peptide_mismatches=3, # not used if exclude_* are False/None - min_dgeom_paired_tcrdist=-1, - min_dgeom_singlechain_tcrdist=-1, - min_single_chain_tcrdist=-1, - min_pmhc_peptide_mismatches=-1, - next_best_identity_threshold_mhc=0.98, - next_best_identity_threshold_tcr=0.98, - ternary_bonus=0.05, # for tcr templates, in frac identities - alt_self_peptides=None, # or list of peptides - verbose=False, - pick_dgeoms_using_tcrdist=False, # implies num_runs=3 - use_same_pmhc_dgeoms=False, - exclude_pdbids=None, - force_pmhc_pdbids=None, - use_opt_dgeoms=False, + organism, + va, + ja, + cdr3a, + vb, + jb, + cdr3b, + mhc_class, + mhc_allele, + peptide, + outfile_prefix, + num_runs=5, # match default in setup_for_alphafold + num_templates_per_run=4, + exclude_self_peptide_docking_geometries=False, + exclude_docking_geometry_peptides=None, # None or list of peptides + # below is only applied if exclude_docking_geometry_peptides is nonempty + # or exclude_self_peptide_docking_geometries + min_dgeom_peptide_mismatches=3, # not used if exclude_* are False/None + min_dgeom_paired_tcrdist=-1, + min_dgeom_singlechain_tcrdist=-1, + min_single_chain_tcrdist=-1, + min_pmhc_peptide_mismatches=-1, + next_best_identity_threshold_mhc=0.98, + next_best_identity_threshold_tcr=0.98, + ternary_bonus=0.05, # for tcr templates, in frac identities + alt_self_peptides=None, # or list of peptides + verbose=False, + pick_dgeoms_using_tcrdist=False, # implies num_runs=3 + use_same_pmhc_dgeoms=False, + exclude_pdbids=None, + force_pmhc_pdbids=None, + use_opt_dgeoms=False, ): - ''' Makes num_templates_per_run * num_runs template pdb files + """Makes num_templates_per_run * num_runs template pdb files Make num_runs alignfiles __alignments.tsv @@ -1109,23 +1293,27 @@ def make_templates_for_alphafold( returns None for failure - ''' - from .pdblite import (apply_transform_Rx_plus_v, delete_chains, append_chains, - dump_pdb) + """ + from .pdblite import ( + apply_transform_Rx_plus_v, + delete_chains, + append_chains, + dump_pdb, + ) if exclude_pdbids is None: exclude_pdbids = [] else: exclude_pdbids = frozenset(exclude_pdbids) - print(f'will exclude {len(exclude_pdbids)} pdbids') + print(f"will exclude {len(exclude_pdbids)} pdbids") # check arguments if mhc_class == 2: assert len(peptide) == CLASS2_PEPLEN - assert mhc_allele.count(',') == 1 + assert mhc_allele.count(",") == 1 if pick_dgeoms_using_tcrdist: - assert num_runs == 3 # AB, A, B + assert num_runs == 3 # AB, A, B if use_opt_dgeoms: assert num_runs == 1 @@ -1138,238 +1326,331 @@ def make_templates_for_alphafold( if exclude_self_peptide_docking_geometries: # dont modify the passed-in list exclude_docking_geometry_peptides = ( - exclude_docking_geometry_peptides+[peptide]+alt_self_peptides) - - tcrdister = get_tcrdister('human_and_mouse') + exclude_docking_geometry_peptides + [peptide] + alt_self_peptides + ) + tcrdister = get_tcrdister("human_and_mouse") - def show_alignment(al,seq1,seq2): + def show_alignment(al, seq1, seq2): if verbose: - for i,j in sorted(al.items()): - a,b = seq1[i], seq2[j] - star = '*' if a==b else ' ' - print(f'{i:4d} {j:4d} {a} {star} {b}') - idents = sum(seq1[i] == seq2[j] for i,j in al.items())/len(seq1) - print(f'idents: {idents:6.3f}') + for i, j in sorted(al.items()): + a, b = seq1[i], seq2[j] + star = "*" if a == b else " " + print(f"{i:4d} {j:4d} {a} {star} {b}") + idents = sum(seq1[i] == seq2[j] for i, j in al.items()) / len(seq1) + print(f"idents: {idents:6.3f}") core_len = TCR_CORE_LEN if mhc_class == 1: - if organism=='human': + if organism == "human": # now adding HLA-E 2022-05-03 - assert mhc_allele[0] in 'ABCE' and mhc_allele[1]=='*' and ':' in mhc_allele - mhc_allele = ':'.join(mhc_allele.split(':')[:2]) # just the 4 digits + assert ( + mhc_allele[0] in "ABCE" + and mhc_allele[1] == "*" + and ":" in mhc_allele + ) + mhc_allele = ":".join( + mhc_allele.split(":")[:2] + ) # just the 4 digits else: - assert mhc_allele.startswith('H2') and mhc_allele in mhc_class_1_alfas + assert ( + mhc_allele.startswith("H2") and mhc_allele in mhc_class_1_alfas + ) # first: MHC part trg_mhc_alseq = get_mhc_class_1_alseq(mhc_allele) - trg_mhc_seq = trg_mhc_alseq.replace(ALL_GENES_GAP_CHAR,'') + trg_mhc_seq = trg_mhc_alseq.replace(ALL_GENES_GAP_CHAR, "") sortl = [] # use new pmhc-only data for l in pmhc_info.itertuples(): - if (l.organism!=organism or l.mhc_class!=mhc_class or - l.pdbid in BAD_PMHC_PDBIDS or l.pdbid in exclude_pdbids or - (force_pmhc_pdbids and l.pdbid not in force_pmhc_pdbids)): + if ( + l.organism != organism + or l.mhc_class != mhc_class + or l.pdbid in BAD_PMHC_PDBIDS + or l.pdbid in exclude_pdbids + or (force_pmhc_pdbids and l.pdbid not in force_pmhc_pdbids) + ): continue tmp_mhc_alseq = l.mhc_alignseq assert len(trg_mhc_alseq) == len(tmp_mhc_alseq) - mhc_idents = sum(a==b and a!=ALL_GENES_GAP_CHAR - for a,b in zip(trg_mhc_alseq, tmp_mhc_alseq)) + mhc_idents = sum( + a == b and a != ALL_GENES_GAP_CHAR + for a, b in zip(trg_mhc_alseq, tmp_mhc_alseq) + ) if len(peptide) == len(l.pep_seq): - pep_idents = sum(a==b for a,b in zip(peptide,l.pep_seq)) + pep_idents = sum(a == b for a, b in zip(peptide, l.pep_seq)) else: - pep_idents = sum(a==b for a,b in zip(peptide[:3]+peptide[-3:], - l.pep_seq[:3]+l.pep_seq[-3:])) + pep_idents = sum( + a == b + for a, b in zip( + peptide[:3] + peptide[-3:], + l.pep_seq[:3] + l.pep_seq[-3:], + ) + ) mismatches_for_excluding = min( count_peptide_mismatches(x, l.pep_seq) - for x in [peptide]+alt_self_peptides) + for x in [peptide] + alt_self_peptides + ) if mismatches_for_excluding < min_pmhc_peptide_mismatches: if verbose: - print('peptide too close:', peptide, l.pep_seq, - 'mismatches_for_excluding:', mismatches_for_excluding, - alt_self_peptides) + print( + "peptide too close:", + peptide, + l.pep_seq, + "mismatches_for_excluding:", + mismatches_for_excluding, + alt_self_peptides, + ) continue - assert len(peptide)-pep_idents >= min_pmhc_peptide_mismatches #sanity - total = len(peptide)+len(trg_mhc_seq) - frac = (mhc_idents+pep_idents)/total - 0.01*l.mhc_total_chainbreak + assert ( + len(peptide) - pep_idents >= min_pmhc_peptide_mismatches + ) # sanity + total = len(peptide) + len(trg_mhc_seq) + frac = ( + mhc_idents + pep_idents + ) / total - 0.01 * l.mhc_total_chainbreak sortl.append((frac, l.Index)) sortl.sort(reverse=True) max_idents = sortl[0][0] - print(f'mhc max_idents: {max_idents:.3f}', mhc_allele, peptide, - pmhc_info.loc[sortl[0][1], 'mhc_allele'], - pmhc_info.loc[sortl[0][1], 'pep_seq']) + print( + f"mhc max_idents: {max_idents:.3f}", + mhc_allele, + peptide, + pmhc_info.loc[sortl[0][1], "mhc_allele"], + pmhc_info.loc[sortl[0][1], "pep_seq"], + ) pmhc_alignments = [] - for (idents, pdbid) in sortl[:num_templates_per_run]: - if idents < next_best_identity_threshold_tcr*max_idents: + for idents, pdbid in sortl[:num_templates_per_run]: + if idents < next_best_identity_threshold_tcr * max_idents: break templatel = pmhc_info.loc[pdbid] tmp_mhc_alseq = templatel.mhc_alignseq - tmp_seql = templatel.chainseq.split('/') - assert len(tmp_seql)==2 and tmp_seql[1] == templatel.pep_seq - tmp_mhc_seq = tmp_seql[0] # class 1 - tmp_mhc_alseq_seq = tmp_mhc_alseq.replace(ALL_GENES_GAP_CHAR,'') + tmp_seql = templatel.chainseq.split("/") + assert len(tmp_seql) == 2 and tmp_seql[1] == templatel.pep_seq + tmp_mhc_seq = tmp_seql[0] # class 1 + tmp_mhc_alseq_seq = tmp_mhc_alseq.replace(ALL_GENES_GAP_CHAR, "") assert tmp_mhc_alseq_seq in tmp_mhc_seq npad = tmp_mhc_seq.index(tmp_mhc_alseq_seq) - #al1 = blosum_align(tmp_mhc_seq, tmp_mhc_alseq_seq) - al1 = {i+npad:i for i in range(len(tmp_mhc_alseq_seq))} + # al1 = blosum_align(tmp_mhc_seq, tmp_mhc_alseq_seq) + al1 = {i + npad: i for i in range(len(tmp_mhc_alseq_seq))} - al2 = {i-tmp_mhc_alseq[:i].count(ALL_GENES_GAP_CHAR): - i-trg_mhc_alseq[:i].count(ALL_GENES_GAP_CHAR) - for i,(a,b) in enumerate(zip(tmp_mhc_alseq, trg_mhc_alseq)) - if a != ALL_GENES_GAP_CHAR and b != ALL_GENES_GAP_CHAR} + al2 = { + i + - tmp_mhc_alseq[:i].count(ALL_GENES_GAP_CHAR): i + - trg_mhc_alseq[:i].count(ALL_GENES_GAP_CHAR) + for i, (a, b) in enumerate(zip(tmp_mhc_alseq, trg_mhc_alseq)) + if a != ALL_GENES_GAP_CHAR and b != ALL_GENES_GAP_CHAR + } - tmp_to_trg = {x:al2[y] for x,y in al1.items() if y in al2} - trg_to_tmp = {y:x for x,y in tmp_to_trg.items()} + tmp_to_trg = {x: al2[y] for x, y in al1.items() if y in al2} + trg_to_tmp = {y: x for x, y in tmp_to_trg.items()} trg_offset = len(trg_mhc_seq) tmp_offset = len(tmp_mhc_seq) trg_peplen, tmp_peplen = len(peptide), len(templatel.pep_seq) if trg_peplen == tmp_peplen: for i in range(trg_peplen): - trg_to_tmp[trg_offset+i] = tmp_offset+i + trg_to_tmp[trg_offset + i] = tmp_offset + i else: for i in range(3): - trg_to_tmp[trg_offset+i] = tmp_offset+i - for i in [-3,-2,-1]: - trg_to_tmp[trg_offset+trg_peplen+i] = tmp_offset+tmp_peplen+i + trg_to_tmp[trg_offset + i] = tmp_offset + i + for i in [-3, -2, -1]: + trg_to_tmp[trg_offset + trg_peplen + i] = ( + tmp_offset + tmp_peplen + i + ) trg_pmhc_seq = trg_mhc_seq + peptide tmp_pmhc_seq = tmp_mhc_seq + templatel.pep_seq - identities = sum(trg_pmhc_seq[i] == tmp_pmhc_seq[j] - for i,j in trg_to_tmp.items())/len(trg_pmhc_seq) - identities_for_sorting = (identities - 0.01*templatel.mhc_total_chainbreak) - assert abs(identities_for_sorting-idents)<1e-3 + identities = sum( + trg_pmhc_seq[i] == tmp_pmhc_seq[j] + for i, j in trg_to_tmp.items() + ) / len(trg_pmhc_seq) + identities_for_sorting = ( + identities - 0.01 * templatel.mhc_total_chainbreak + ) + assert abs(identities_for_sorting - idents) < 1e-3 if verbose: - print(f'oldnew mhc_idents: {idents:6.3f} {identities:6.3f} {pdbid}') + print( + f"oldnew mhc_idents: {idents:6.3f} {identities:6.3f} {pdbid}" + ) show_alignment(trg_to_tmp, trg_pmhc_seq, tmp_pmhc_seq) - pmhc_alignments.append((identities_for_sorting, - pdbid, - trg_to_tmp, - trg_pmhc_seq, - tmp_pmhc_seq, - identities, - )) - else: # class II + pmhc_alignments.append( + ( + identities_for_sorting, + pdbid, + trg_to_tmp, + trg_pmhc_seq, + tmp_pmhc_seq, + identities, + ) + ) + else: # class II # - trg_mhca_alseq = get_mhc_class_2_alseq('A', mhc_allele.split(',')[0]) - trg_mhcb_alseq = get_mhc_class_2_alseq('B', mhc_allele.split(',')[1]) - trg_mhca_seq = trg_mhca_alseq.replace(ALL_GENES_GAP_CHAR,'') - trg_mhcb_seq = trg_mhcb_alseq.replace(ALL_GENES_GAP_CHAR,'') + trg_mhca_alseq = get_mhc_class_2_alseq("A", mhc_allele.split(",")[0]) + trg_mhcb_alseq = get_mhc_class_2_alseq("B", mhc_allele.split(",")[1]) + trg_mhca_seq = trg_mhca_alseq.replace(ALL_GENES_GAP_CHAR, "") + trg_mhcb_seq = trg_mhcb_alseq.replace(ALL_GENES_GAP_CHAR, "") trg_pmhc_seq = trg_mhca_seq + trg_mhcb_seq + peptide sortl = [] for l in ternary_info.itertuples(): - if (l.organism!=organism or l.mhc_class!=mhc_class or - l.pdbid in BAD_PMHC_PDBIDS or l.pdbid in exclude_pdbids): + if ( + l.organism != organism + or l.mhc_class != mhc_class + or l.pdbid in BAD_PMHC_PDBIDS + or l.pdbid in exclude_pdbids + ): continue mismatches_for_excluding = min( count_peptide_mismatches(x, l.pep_seq) - for x in [peptide]+alt_self_peptides) + for x in [peptide] + alt_self_peptides + ) if mismatches_for_excluding < min_pmhc_peptide_mismatches: if verbose: - print('peptide too close:', peptide, l.pep_seq, - mismatches_for_excluding, alt_self_peptides) + print( + "peptide too close:", + peptide, + l.pep_seq, + mismatches_for_excluding, + alt_self_peptides, + ) continue - tmp_mhca_alseq, tmp_mhcb_alseq = l.mhc_alignseq.split('/') + tmp_mhca_alseq, tmp_mhcb_alseq = l.mhc_alignseq.split("/") idents = 0 - for a,b in zip([trg_mhca_alseq, trg_mhcb_alseq, peptide], - [tmp_mhca_alseq, tmp_mhcb_alseq, l.pep_seq]): + for a, b in zip( + [trg_mhca_alseq, trg_mhcb_alseq, peptide], + [tmp_mhca_alseq, tmp_mhcb_alseq, l.pep_seq], + ): assert len(a) == len(b) - idents += sum(x==y for x,y in zip(a,b) if x!=ALL_GENES_GAP_CHAR) - sortl.append((idents/len(trg_pmhc_seq), l.pdbid)) + idents += sum( + x == y for x, y in zip(a, b) if x != ALL_GENES_GAP_CHAR + ) + sortl.append((idents / len(trg_pmhc_seq), l.pdbid)) sortl.sort(reverse=True) max_idents = sortl[0][0] - print(f'mhc max_idents: {max_idents:.3f}', mhc_allele, peptide, - ternary_info.loc[sortl[0][1], 'mhc_allele'], - ternary_info.loc[sortl[0][1], 'pep_seq']) + print( + f"mhc max_idents: {max_idents:.3f}", + mhc_allele, + peptide, + ternary_info.loc[sortl[0][1], "mhc_allele"], + ternary_info.loc[sortl[0][1], "pep_seq"], + ) pmhc_alignments = [] - for (idents, pdbid) in sortl[:num_templates_per_run]: - if idents < next_best_identity_threshold_tcr*max_idents: + for idents, pdbid in sortl[:num_templates_per_run]: + if idents < next_best_identity_threshold_tcr * max_idents: break templatel = ternary_info.loc[pdbid] - tmp_mhca_alseq, tmp_mhcb_alseq = templatel.mhc_alignseq.split('/') - mhca_part = tmp_mhca_alseq.replace(ALL_GENES_GAP_CHAR,'') - mhcb_part = tmp_mhcb_alseq.replace(ALL_GENES_GAP_CHAR,'') - tmp_mhca_seq, tmp_mhcb_seq = templatel.chainseq.split('/')[:2] + tmp_mhca_alseq, tmp_mhcb_alseq = templatel.mhc_alignseq.split("/") + mhca_part = tmp_mhca_alseq.replace(ALL_GENES_GAP_CHAR, "") + mhcb_part = tmp_mhcb_alseq.replace(ALL_GENES_GAP_CHAR, "") + tmp_mhca_seq, tmp_mhcb_seq = templatel.chainseq.split("/")[:2] assert mhca_part in tmp_mhca_seq and mhcb_part in tmp_mhcb_seq mhca_npad = tmp_mhca_seq.find(mhca_part) mhcb_npad = tmp_mhcb_seq.find(mhcb_part) trg_offset, tmp_offset = 0, mhca_npad - al1 = {i-trg_mhca_alseq[:i].count(ALL_GENES_GAP_CHAR)+trg_offset: - i-tmp_mhca_alseq[:i].count(ALL_GENES_GAP_CHAR)+tmp_offset - for i,(a,b) in enumerate(zip(trg_mhca_alseq, tmp_mhca_alseq)) - if a != ALL_GENES_GAP_CHAR and b != ALL_GENES_GAP_CHAR} + al1 = { + i + - trg_mhca_alseq[:i].count(ALL_GENES_GAP_CHAR) + + trg_offset: i + - tmp_mhca_alseq[:i].count(ALL_GENES_GAP_CHAR) + + tmp_offset + for i, (a, b) in enumerate(zip(trg_mhca_alseq, tmp_mhca_alseq)) + if a != ALL_GENES_GAP_CHAR and b != ALL_GENES_GAP_CHAR + } trg_offset = len(trg_mhca_seq) - tmp_offset = len(tmp_mhca_seq)+mhcb_npad - al2 = {i-trg_mhcb_alseq[:i].count(ALL_GENES_GAP_CHAR)+trg_offset: - i-tmp_mhcb_alseq[:i].count(ALL_GENES_GAP_CHAR)+tmp_offset - for i,(a,b) in enumerate(zip(trg_mhcb_alseq, tmp_mhcb_alseq)) - if a != ALL_GENES_GAP_CHAR and b != ALL_GENES_GAP_CHAR} - trg_offset = len(trg_mhca_seq)+len(trg_mhcb_seq) - tmp_offset = len(tmp_mhca_seq)+len(tmp_mhcb_seq) - al3 = {i+trg_offset:i+tmp_offset for i in range(CLASS2_PEPLEN)} + tmp_offset = len(tmp_mhca_seq) + mhcb_npad + al2 = { + i + - trg_mhcb_alseq[:i].count(ALL_GENES_GAP_CHAR) + + trg_offset: i + - tmp_mhcb_alseq[:i].count(ALL_GENES_GAP_CHAR) + + tmp_offset + for i, (a, b) in enumerate(zip(trg_mhcb_alseq, tmp_mhcb_alseq)) + if a != ALL_GENES_GAP_CHAR and b != ALL_GENES_GAP_CHAR + } + trg_offset = len(trg_mhca_seq) + len(trg_mhcb_seq) + tmp_offset = len(tmp_mhca_seq) + len(tmp_mhcb_seq) + al3 = { + i + trg_offset: i + tmp_offset for i in range(CLASS2_PEPLEN) + } trg_to_tmp = {**al1, **al2, **al3} tmp_pmhc_seq = tmp_mhca_seq + tmp_mhcb_seq + templatel.pep_seq - idents_redo = sum(trg_pmhc_seq[i] == tmp_pmhc_seq[j] - for i,j in trg_to_tmp.items())/len(trg_pmhc_seq) - #print(f'oldnew mhc_idents: {idents:6.3f} {idents_redo:6.3f} {pdbid}') - assert abs(idents-idents_redo)<1e-4 + idents_redo = sum( + trg_pmhc_seq[i] == tmp_pmhc_seq[j] + for i, j in trg_to_tmp.items() + ) / len(trg_pmhc_seq) + # print(f'oldnew mhc_idents: {idents:6.3f} {idents_redo:6.3f} {pdbid}') + assert abs(idents - idents_redo) < 1e-4 show_alignment(trg_to_tmp, trg_pmhc_seq, tmp_pmhc_seq) - pmhc_alignments.append((idents, - pdbid, - trg_to_tmp, - trg_pmhc_seq, - tmp_pmhc_seq, - idents, - )) - - - - + pmhc_alignments.append( + ( + idents, + pdbid, + trg_to_tmp, + trg_pmhc_seq, + tmp_pmhc_seq, + idents, + ) + ) - tcr_alignments = {'A':[], 'B':[]} + tcr_alignments = {"A": [], "B": []} # compute single-chain tcrdists to candidate template chains # drop out of the loop if vdist hits this value and we've already got enough # templates - big_v_dist=50 + big_v_dist = 50 - for ab, trg_v, trg_j, trg_cdr3 in [['A',va,ja,cdr3a],['B',vb,jb,cdr3b]]: - trg_tcr = (organism[0]+trg_v, trg_j, trg_cdr3) + for ab, trg_v, trg_j, trg_cdr3 in [ + ["A", va, ja, cdr3a], + ["B", vb, jb, cdr3b], + ]: + trg_tcr = (organism[0] + trg_v, trg_j, trg_cdr3) trg_msa_alignments = align_vgene_to_structure_msas(organism, trg_v) - templates = tcr_info[tcr_info.ab==ab] + templates = tcr_info[tcr_info.ab == ab] templates = templates[~templates.pdbid.isin(exclude_pdbids)] template_tcrs = [ - (x.organism[0]+x.v_gene, None, x.cdr3) for x in templates.itertuples()] - #templates['v_gene j_gene cdr3'.split()].itertuples(index=False)) - closest_tcrs = [(x[0],x[1],trg_cdr3) for x in template_tcrs] # tmp v, trg cdr3 + (x.organism[0] + x.v_gene, None, x.cdr3) + for x in templates.itertuples() + ] + # templates['v_gene j_gene cdr3'.split()].itertuples(index=False)) + closest_tcrs = [ + (x[0], x[1], trg_cdr3) for x in template_tcrs + ] # tmp v, trg cdr3 sortl = sorted( - [(tcrdister.single_chain_distance(trg_tcr,x), - tcrdister.single_chain_distance(trg_tcr,y), - i) - for i,(x,y) in enumerate(zip(closest_tcrs, template_tcrs))]) + [ + ( + tcrdister.single_chain_distance(trg_tcr, x), + tcrdister.single_chain_distance(trg_tcr, y), + i, + ) + for i, (x, y) in enumerate(zip(closest_tcrs, template_tcrs)) + ] + ) alignments = [] for closest_dist, dist, ind in sortl: if dist < min_single_chain_tcrdist: - #print('too close:', dist, trg_v, trg_j, trg_cdr3, template_tcrs[ind]) + # print('too close:', dist, trg_v, trg_j, trg_cdr3, template_tcrs[ind]) continue - if closest_dist > big_v_dist and len(alignments)>=num_templates_per_run: - #print('too far:', closest_dist) + if ( + closest_dist > big_v_dist + and len(alignments) >= num_templates_per_run + ): + # print('too far:', closest_dist) break templatel = templates.iloc[ind] @@ -1380,68 +1661,99 @@ def show_alignment(al,seq1,seq2): # msa_type = 'both' trg_to_tmp, trg_chainseq = align_vgene_to_template_pdb_chain( - ab, organism, trg_v, trg_j, trg_cdr3, trg_msa_alignments, + ab, + organism, + trg_v, + trg_j, + trg_cdr3, + trg_msa_alignments, templatel.pdbid, ) # tmp_chainseq_to_msa = align_tcr_info_pdb_chain_to_structure_msa( # templatel.pdbid, templatel.ab, msa_type) - identities = sum(trg_chainseq[i]==templatel.chainseq[j] - for i,j in trg_to_tmp.items()) / len(trg_chainseq) + identities = sum( + trg_chainseq[i] == templatel.chainseq[j] + for i, j in trg_to_tmp.items() + ) / len(trg_chainseq) - identities_for_sorting = identities + ternary_bonus * templatel.ternary + identities_for_sorting = ( + identities + ternary_bonus * templatel.ternary + ) alignments.append( - (identities_for_sorting, - templatel.pdbid, - trg_to_tmp, - trg_chainseq, - templatel.chainseq, - closest_dist, - identities, - )) + ( + identities_for_sorting, + templatel.pdbid, + trg_to_tmp, + trg_chainseq, + templatel.chainseq, + closest_dist, + identities, + ) + ) alignments.sort(reverse=True) max_idents = alignments[0][0] - tcr_alignments[ab] = [x for x in alignments[:num_templates_per_run] - if x[0] >= next_best_identity_threshold_tcr*max_idents] + tcr_alignments[ab] = [ + x + for x in alignments[:num_templates_per_run] + if x[0] >= next_best_identity_threshold_tcr * max_idents + ] # docking geometries # exclude same-epitope geoms if pick_dgeoms_using_tcrdist: dgeom_info_by_chain = pick_dgeom_templates( - num_templates_per_run, organism, mhc_class, mhc_allele, - peptide, va, cdr3a, vb, cdr3b, - min_peptide_mismatches = min_dgeom_peptide_mismatches, - min_paired_tcrdist = min_dgeom_paired_tcrdist, - min_singlechain_tcrdist = min_dgeom_singlechain_tcrdist, - peptides_for_filtering = exclude_docking_geometry_peptides, - exclude_pdbids = exclude_pdbids, + num_templates_per_run, + organism, + mhc_class, + mhc_allele, + peptide, + va, + cdr3a, + vb, + cdr3b, + min_peptide_mismatches=min_dgeom_peptide_mismatches, + min_paired_tcrdist=min_dgeom_paired_tcrdist, + min_singlechain_tcrdist=min_dgeom_singlechain_tcrdist, + peptides_for_filtering=exclude_docking_geometry_peptides, + exclude_pdbids=exclude_pdbids, ) elif use_same_pmhc_dgeoms: # require matching organism, mhc, peptide with dgeoms dgeom_info = ternary_info[ternary_info.mhc_class == mhc_class] dgeom_info = dgeom_info[~dgeom_info.pdbid.isin(BAD_DGEOM_PDBIDS)] dgeom_info = dgeom_info[~dgeom_info.pdbid.isin(exclude_pdbids)] - print('same mhc_class', dgeom_info.shape) + print("same mhc_class", dgeom_info.shape) dgeom_info = dgeom_info[dgeom_info.organism == organism] - print('same organism', dgeom_info.shape) + print("same organism", dgeom_info.shape) dgeom_info = dgeom_info[dgeom_info.mhc_allele == mhc_allele] - print('same mhc_allele', dgeom_info.shape) + print("same mhc_allele", dgeom_info.shape) dgeom_info = dgeom_info[dgeom_info.pep_seq == peptide] - print('same peptide', dgeom_info.shape) - if dgeom_info.shape[0]==0: - print('ERROR no matching pmhc templates for dgeom:', - organism, mhc_class, mhc_allele, peptide) + print("same peptide", dgeom_info.shape) + if dgeom_info.shape[0] == 0: + print( + "ERROR no matching pmhc templates for dgeom:", + organism, + mhc_class, + mhc_allele, + peptide, + ) exit() - dgeoms = [DockingGeometry().from_dict(x) for _,x in dgeom_info.iterrows()] - if num_runs*num_templates_per_run > len(dgeoms): + dgeoms = [ + DockingGeometry().from_dict(x) for _, x in dgeom_info.iterrows() + ] + if num_runs * num_templates_per_run > len(dgeoms): rep_dgeom_indices = np.random.permutation(len(dgeoms)) rep_dgeoms = [dgeoms[x] for x in rep_dgeom_indices] else: - dummy_organism = 'human' # just used for avg cdr coords - rep_dgeoms, rep_dgeom_indices = docking_geometry.pick_docking_geometry_reps( - dummy_organism, dgeoms, num_runs*num_templates_per_run) + dummy_organism = "human" # just used for avg cdr coords + rep_dgeoms, rep_dgeom_indices = ( + docking_geometry.pick_docking_geometry_reps( + dummy_organism, dgeoms, num_runs * num_templates_per_run + ) + ) elif use_opt_dgeoms: rep_dgeoms = docking_geometry.load_opt_dgeoms(mhc_class) assert len(rep_dgeoms) == 4 @@ -1449,74 +1761,101 @@ def show_alignment(al,seq1,seq2): else: dgeom_info = ternary_info[ternary_info.mhc_class == mhc_class] - if organism=='human': # restrict to human docks + if organism == "human": # restrict to human docks dgeom_info = dgeom_info[dgeom_info.organism == organism] dgeom_info = dgeom_info[~dgeom_info.pdbid.isin(BAD_DGEOM_PDBIDS)] dgeom_info = dgeom_info[~dgeom_info.pdbid.isin(exclude_pdbids)] dgeom_info = filter_templates_by_tcrdist( - dgeom_info, organism, va, cdr3a, vb, cdr3b, + dgeom_info, + organism, + va, + cdr3a, + vb, + cdr3b, min_paired_tcrdist=min_dgeom_paired_tcrdist, min_singlechain_tcrdist=min_dgeom_singlechain_tcrdist, ) dgeom_info = filter_templates_by_peptide_mismatches( - dgeom_info, organism, mhc_class, exclude_docking_geometry_peptides, - min_dgeom_peptide_mismatches) - dgeoms = [DockingGeometry().from_dict(x) for _,x in dgeom_info.iterrows()] + dgeom_info, + organism, + mhc_class, + exclude_docking_geometry_peptides, + min_dgeom_peptide_mismatches, + ) + dgeoms = [ + DockingGeometry().from_dict(x) for _, x in dgeom_info.iterrows() + ] - if num_runs*num_templates_per_run > len(dgeoms): + if num_runs * num_templates_per_run > len(dgeoms): rep_dgeom_indices = np.random.permutation(len(dgeoms)) rep_dgeoms = [dgeoms[x] for x in rep_dgeom_indices] else: - dummy_organism = 'human' # just used for avg cdr coords - rep_dgeoms, rep_dgeom_indices = docking_geometry.pick_docking_geometry_reps( - dummy_organism, dgeoms, num_runs*num_templates_per_run) + dummy_organism = "human" # just used for avg cdr coords + rep_dgeoms, rep_dgeom_indices = ( + docking_geometry.pick_docking_geometry_reps( + dummy_organism, dgeoms, num_runs * num_templates_per_run + ) + ) - #print('dgeoms:', len(dgeoms), len(rep_dgeoms)) + # print('dgeoms:', len(dgeoms), len(rep_dgeoms)) - trg_pmhc_seq = pmhc_alignments[0][3] # check for consistency below - trg_tcra_seq = tcr_alignments['A'][0][3] # ditto - trg_tcrb_seq = tcr_alignments['B'][0][3] # ditto + trg_pmhc_seq = pmhc_alignments[0][3] # check for consistency below + trg_tcra_seq = tcr_alignments["A"][0][3] # ditto + trg_tcrb_seq = tcr_alignments["B"][0][3] # ditto # now make the template pdbs dfl = [] for run in range(num_runs): for itmp in range(num_templates_per_run): - pmhc_al = pmhc_alignments[itmp%len(pmhc_alignments)] - tcra_al = tcr_alignments['A'][itmp%len(tcr_alignments['A'])] - tcrb_al = tcr_alignments['B'][itmp%len(tcr_alignments['B'])] + pmhc_al = pmhc_alignments[itmp % len(pmhc_alignments)] + tcra_al = tcr_alignments["A"][itmp % len(tcr_alignments["A"])] + tcrb_al = tcr_alignments["B"][itmp % len(tcr_alignments["B"])] if pick_dgeoms_using_tcrdist: - chain = ['AB','A','B'][run] + chain = ["AB", "A", "B"][run] dgeom_info = dgeom_info_by_chain[chain] - dgeom_row = dgeom_info.iloc[itmp%dgeom_info.shape[0]] + dgeom_row = dgeom_info.iloc[itmp % dgeom_info.shape[0]] dgeom = DockingGeometry().from_dict(dgeom_row) elif use_opt_dgeoms: dgeom = rep_dgeoms[itmp] dgeom_row = None else: # order them this way so each run has diverse dgeoms... - dgeom_repno = (itmp*num_runs + run)%len(rep_dgeoms) + dgeom_repno = (itmp * num_runs + run) % len(rep_dgeoms) dgeom = rep_dgeoms[dgeom_repno] dgeom_row = dgeom_info.iloc[rep_dgeom_indices[dgeom_repno]] pmhc_pdbid = pmhc_al[1] - pmhc_pose, pmhc_tdinfo = get_template_pose_and_tdinfo(pmhc_pdbid, PMHC) + pmhc_pose, pmhc_tdinfo = get_template_pose_and_tdinfo( + pmhc_pdbid, PMHC + ) tcra_pdbid = tcra_al[1] - tcra_pose, tcra_tdinfo = get_template_pose_and_tdinfo(tcra_pdbid, TCR) + tcra_pose, tcra_tdinfo = get_template_pose_and_tdinfo( + tcra_pdbid, TCR + ) tcrb_pdbid = tcrb_al[1] - tcrb_pose, tcrb_tdinfo = get_template_pose_and_tdinfo(tcrb_pdbid, TCR) + tcrb_pose, tcrb_tdinfo = get_template_pose_and_tdinfo( + tcrb_pdbid, TCR + ) # assert ((mhc_class==1 and pmhc_pose.num_chains() == 4) or # (mhc_class==2 and pmhc_pose.num_chains() == 5)) - assert len(tcra_pose['chains']) == 2 and len(tcrb_pose['chains']) == 2 + assert ( + len(tcra_pose["chains"]) == 2 and len(tcrb_pose["chains"]) == 2 + ) # copy tcrb into tcra_pose by superimposing core coords - fix_coords = tcra_pose['ca_coords'][tcra_tdinfo.tcr_core[core_len:]] - mov_coords = tcrb_pose['ca_coords'][tcrb_tdinfo.tcr_core[core_len:]] + fix_coords = tcra_pose["ca_coords"][ + tcra_tdinfo.tcr_core[core_len:] + ] + mov_coords = tcrb_pose["ca_coords"][ + tcrb_tdinfo.tcr_core[core_len:] + ] R, v = superimpose.superimposition_transform( - fix_coords, mov_coords) + fix_coords, mov_coords + ) tcrb_pose = apply_transform_Rx_plus_v(tcrb_pose, R, v) tcra_pose = delete_chains(tcra_pose, [1]) tcra_pose = append_chains(tcra_pose, tcrb_pose, [1]) @@ -1524,49 +1863,52 @@ def show_alignment(al,seq1,seq2): # update tcra_tdinfo, compute tcr_stub # use dgeom to compute desired tcr_stub # transform tcra_pose - offset = tcra_pose['chainbounds'][1]-tcrb_pose['chainbounds'][1] - tcra_tdinfo.tcr_core = ( - tcra_tdinfo.tcr_core[:core_len] + - [x+offset for x in tcrb_tdinfo.tcr_core[core_len:]]) - tcra_tdinfo.tcr_cdrs = ( - tcra_tdinfo.tcr_cdrs[:4] + - [[x+offset,y+offset] for x,y in tcrb_tdinfo.tcr_cdrs[4:]]) - - assert tcra_pose['sequence'][tcra_tdinfo.tcr_cdrs[3][0]]=='C' - assert tcra_pose['sequence'][tcra_tdinfo.tcr_cdrs[7][0]]=='C' + offset = tcra_pose["chainbounds"][1] - tcrb_pose["chainbounds"][1] + tcra_tdinfo.tcr_core = tcra_tdinfo.tcr_core[:core_len] + [ + x + offset for x in tcrb_tdinfo.tcr_core[core_len:] + ] + tcra_tdinfo.tcr_cdrs = tcra_tdinfo.tcr_cdrs[:4] + [ + [x + offset, y + offset] for x, y in tcrb_tdinfo.tcr_cdrs[4:] + ] + + assert tcra_pose["sequence"][tcra_tdinfo.tcr_cdrs[3][0]] == "C" + assert tcra_pose["sequence"][tcra_tdinfo.tcr_cdrs[7][0]] == "C" old_tcr_stub = tcr_util.get_tcr_stub(tcra_pose, tcra_tdinfo) - new_tcr_stub = docking_geometry.stub_from_docking_geometry( - dgeom) + new_tcr_stub = docking_geometry.stub_from_docking_geometry(dgeom) # R @ old_tcr_stub['axes'].T = new_tcr_stub['axes'].T - R = new_tcr_stub['axes'].T @ old_tcr_stub['axes'] + R = new_tcr_stub["axes"].T @ old_tcr_stub["axes"] # R @ old_tcr_stub['origin'] + v = new_tcr_stub['origin'] - v = new_tcr_stub['origin'] - R@old_tcr_stub['origin'] + v = new_tcr_stub["origin"] - R @ old_tcr_stub["origin"] tcra_pose = apply_transform_Rx_plus_v(tcra_pose, R, v) # # copy tcr from tcra_pose into pmhc_pose - num_pmhc_chains = mhc_class+1 - if len(pmhc_pose['chains']) > num_pmhc_chains: - del_chains = list(range(num_pmhc_chains, len(pmhc_pose['chains']))) + num_pmhc_chains = mhc_class + 1 + if len(pmhc_pose["chains"]) > num_pmhc_chains: + del_chains = list( + range(num_pmhc_chains, len(pmhc_pose["chains"])) + ) pmhc_pose = delete_chains(pmhc_pose, del_chains) - pmhc_pose = append_chains(pmhc_pose, tcra_pose, [0,1]) - assert len(pmhc_pose['chains'])==2+num_pmhc_chains - offset = pmhc_pose['chainbounds'][num_pmhc_chains] - pmhc_tdinfo.tcr_core = ( - [x+offset for x in tcra_tdinfo.tcr_core]) - pmhc_tdinfo.tcr_cdrs = ( - [[x+offset,y+offset] for x,y in tcra_tdinfo.tcr_cdrs]) - assert pmhc_pose['sequence'][pmhc_tdinfo.tcr_cdrs[3][0]] == 'C' - assert pmhc_pose['sequence'][pmhc_tdinfo.tcr_cdrs[7][0]] == 'C' + pmhc_pose = append_chains(pmhc_pose, tcra_pose, [0, 1]) + assert len(pmhc_pose["chains"]) == 2 + num_pmhc_chains + offset = pmhc_pose["chainbounds"][num_pmhc_chains] + pmhc_tdinfo.tcr_core = [x + offset for x in tcra_tdinfo.tcr_core] + pmhc_tdinfo.tcr_cdrs = [ + [x + offset, y + offset] for x, y in tcra_tdinfo.tcr_cdrs + ] + assert pmhc_pose["sequence"][pmhc_tdinfo.tcr_cdrs[3][0]] == "C" + assert pmhc_pose["sequence"][pmhc_tdinfo.tcr_cdrs[7][0]] == "C" # should be the same as new_tcr_stub! redo_tcr_stub = tcr_util.get_tcr_stub(pmhc_pose, pmhc_tdinfo) - v_dev = norm(redo_tcr_stub['origin']-new_tcr_stub['origin']) - M_dev = norm(new_tcr_stub['axes'] @ redo_tcr_stub['axes'].T - np.eye(3)) - if max(v_dev, M_dev)>5e-2: - print('devs:', v_dev, M_dev) - assert v_dev<5e-2 - assert M_dev<5e-2 + v_dev = norm(redo_tcr_stub["origin"] - new_tcr_stub["origin"]) + M_dev = norm( + new_tcr_stub["axes"] @ redo_tcr_stub["axes"].T - np.eye(3) + ) + if max(v_dev, M_dev) > 5e-2: + print("devs:", v_dev, M_dev) + assert v_dev < 5e-2 + assert M_dev < 5e-2 # setup the alignment trg_to_tmp = dict(pmhc_al[2]) @@ -1574,9 +1916,9 @@ def show_alignment(al,seq1,seq2): tmp_tcra_seq = tcra_al[4] tmp_tcrb_seq = tcrb_al[4] tmp_fullseq = tmp_pmhc_seq + tmp_tcra_seq + tmp_tcrb_seq - assert pmhc_pose['sequence'] == tmp_fullseq + assert pmhc_pose["sequence"] == tmp_fullseq - assert len(pmhc_pose['chains']) == num_pmhc_chains+2 + assert len(pmhc_pose["chains"]) == num_pmhc_chains + 2 assert trg_pmhc_seq == pmhc_al[3] assert trg_tcra_seq == tcra_al[3] @@ -1585,44 +1927,58 @@ def show_alignment(al,seq1,seq2): trg_fullseq = trg_pmhc_seq + trg_tcra_seq + trg_tcrb_seq trg_offset = len(trg_pmhc_seq) tmp_offset = len(tmp_pmhc_seq) - trg_to_tmp.update({i+trg_offset:j+tmp_offset - for i,j in tcra_al[2].items()}) + trg_to_tmp.update( + {i + trg_offset: j + tmp_offset for i, j in tcra_al[2].items()} + ) trg_offset += len(trg_tcra_seq) tmp_offset += len(tmp_tcra_seq) - trg_to_tmp.update({i+trg_offset:j+tmp_offset - for i,j in tcrb_al[2].items()}) - assert Counter(trg_to_tmp.values()).most_common(1)[0][1]==1 - - identities = sum(trg_fullseq[i]==tmp_fullseq[j] - for i,j in trg_to_tmp.items()) - overall_idents = identities/len(trg_fullseq) - if run==0: - print(f'identities: {itmp} {overall_idents:.3f} ' - f'pmhc: {pmhc_al[-1]:.3f} ' - f'tcra: {tcra_al[-1]:.3f} ' - f'tcrb: {tcrb_al[-1]:.3f} ', va, ja, cdr3a, vb, jb, cdr3b, - flush=True) + trg_to_tmp.update( + {i + trg_offset: j + tmp_offset for i, j in tcrb_al[2].items()} + ) + assert Counter(trg_to_tmp.values()).most_common(1)[0][1] == 1 + + identities = sum( + trg_fullseq[i] == tmp_fullseq[j] for i, j in trg_to_tmp.items() + ) + overall_idents = identities / len(trg_fullseq) + if run == 0: + print( + f"identities: {itmp} {overall_idents:.3f} " + f"pmhc: {pmhc_al[-1]:.3f} " + f"tcra: {tcra_al[-1]:.3f} " + f"tcrb: {tcrb_al[-1]:.3f} ", + va, + ja, + cdr3a, + vb, + jb, + cdr3b, + flush=True, + ) if verbose: - for i,j in sorted(trg_to_tmp.items()): - a,b = trg_fullseq[i], tmp_fullseq[j] - star = '*' if a==b else ' ' - print(f'{i:4d} {j:4d} {a} {star} {b}') + for i, j in sorted(trg_to_tmp.items()): + a, b = trg_fullseq[i], tmp_fullseq[j] + star = "*" if a == b else " " + print(f"{i:4d} {j:4d} {a} {star} {b}") - outpdbfile = f'{outfile_prefix}_{run}_{itmp}.pdb' - #pmhc_pose.dump_pdb(outpdbfile) + outpdbfile = f"{outfile_prefix}_{run}_{itmp}.pdb" + # pmhc_pose.dump_pdb(outpdbfile) dump_pdb(pmhc_pose, outpdbfile) - #print('made:', outpdbfile) + # print('made:', outpdbfile) - trg_pmhc_seqs = ([trg_mhc_seq, peptide] if mhc_class==1 else - [trg_mhca_seq, trg_mhcb_seq, peptide]) - trg_cbseq = '/'.join(trg_pmhc_seqs+[trg_tcra_seq, trg_tcrb_seq]) + trg_pmhc_seqs = ( + [trg_mhc_seq, peptide] + if mhc_class == 1 + else [trg_mhca_seq, trg_mhcb_seq, peptide] + ) + trg_cbseq = "/".join(trg_pmhc_seqs + [trg_tcra_seq, trg_tcrb_seq]) - alignstring = ';'.join(f'{i}:{j}' for i,j in trg_to_tmp.items()) + alignstring = ";".join(f"{i}:{j}" for i, j in trg_to_tmp.items()) if pmhc_pdbid in pmhc_info.index: - pmhc_allele=pmhc_info.loc[pmhc_pdbid, 'mhc_allele'] + pmhc_allele = pmhc_info.loc[pmhc_pdbid, "mhc_allele"] else: - pmhc_allele=ternary_info.loc[pmhc_pdbid, 'mhc_allele'] + pmhc_allele = ternary_info.loc[pmhc_pdbid, "mhc_allele"] outl = OrderedDict( run=run, @@ -1631,18 +1987,20 @@ def show_alignment(al,seq1,seq2): overall_idents=overall_idents, pmhc_pdbid=pmhc_pdbid, pmhc_idents=pmhc_al[-1], - pmhc_allele=pmhc_allele,#pmhc_info.loc[pmhc_pdbid, 'mhc_allele'], + pmhc_allele=pmhc_allele, # pmhc_info.loc[pmhc_pdbid, 'mhc_allele'], tcra_pdbid=tcra_pdbid, tcra_idents=tcra_al[-1], - tcra_v =tcr_info.loc[(tcra_pdbid,'A'), 'v_gene'], - tcra_j =tcr_info.loc[(tcra_pdbid,'A'), 'j_gene'], - tcra_cdr3=tcr_info.loc[(tcra_pdbid,'A'), 'cdr3'], + tcra_v=tcr_info.loc[(tcra_pdbid, "A"), "v_gene"], + tcra_j=tcr_info.loc[(tcra_pdbid, "A"), "j_gene"], + tcra_cdr3=tcr_info.loc[(tcra_pdbid, "A"), "cdr3"], tcrb_pdbid=tcrb_pdbid, tcrb_idents=tcrb_al[-1], - tcrb_v =tcr_info.loc[(tcrb_pdbid,'B'), 'v_gene'], - tcrb_j =tcr_info.loc[(tcrb_pdbid,'B'), 'j_gene'], - tcrb_cdr3=tcr_info.loc[(tcrb_pdbid,'B'), 'cdr3'], - dgeom_pdbid=f'opt{itmp}' if dgeom_row is None else dgeom_row.pdbid, + tcrb_v=tcr_info.loc[(tcrb_pdbid, "B"), "v_gene"], + tcrb_j=tcr_info.loc[(tcrb_pdbid, "B"), "j_gene"], + tcrb_cdr3=tcr_info.loc[(tcrb_pdbid, "B"), "cdr3"], + dgeom_pdbid=( + f"opt{itmp}" if dgeom_row is None else dgeom_row.pdbid + ), template_pdbfile=outpdbfile, target_to_template_alignstring=alignstring, identities=identities, @@ -1653,30 +2011,34 @@ def show_alignment(al,seq1,seq2): assert len(dfl) == num_runs * num_templates_per_run return pd.DataFrame(dfl) + def genes_ok_for_modeling(organism, va, ja, vb, jb, verbose=True): if organism not in all_genes: - print(f'ERROR unrecognized organism: "{organism}" expected one of', - all_genes.keys()) + print( + f'ERROR unrecognized organism: "{organism}" expected one of', + all_genes.keys(), + ) return False for g in [va, ja, vb, jb]: if g not in all_genes[organism]: from .tcrdist.basic import path_to_db, db_file + print(f'ERROR: unrecognized gene: "{g}" for organism "{organism}"') # for error output: dbfile = path_to_db / db_file - print('check the ids in', dbfile, 'for organism', organism) + print("check the ids in", dbfile, "for organism", organism) return False - if (organism,va) not in both_structure_alignments.index: + if (organism, va) not in both_structure_alignments.index: if verbose: - print('ERROR: no va alignment:', organism, va) + print("ERROR: no va alignment:", organism, va) return False - if (organism,vb) not in both_structure_alignments.index: + if (organism, vb) not in both_structure_alignments.index: if verbose: - print('ERROR: no vb alignment:', organism, vb) + print("ERROR: no vb alignment:", organism, vb) return False va_seq = get_v_seq_up_to_cys(organism, va) @@ -1684,221 +2046,264 @@ def genes_ok_for_modeling(organism, va, ja, vb, jb, verbose=True): vb_seq = get_v_seq_up_to_cys(organism, vb) jb_seq = get_j_seq_after_cdr3(organism, jb) - if ( '*' in va_seq+ja_seq+vb_seq+jb_seq or - va_seq[-1] != 'C' or vb_seq[-1] != 'C'): + if ( + "*" in va_seq + ja_seq + vb_seq + jb_seq + or va_seq[-1] != "C" + or vb_seq[-1] != "C" + ): if verbose: - print('ERROR bad seqs:', va, va_seq, ja, ja_seq, - vb, vb_seq, ja, jb_seq) + print( + "ERROR bad seqs:", + va, + va_seq, + ja, + ja_seq, + vb, + vb_seq, + ja, + jb_seq, + ) return False return True + def check_genes_for_modeling(tcr_db): all_ok = True for index, targetl in tcr_db.iterrows(): ok = genes_ok_for_modeling( - targetl.organism, targetl.va, targetl.ja, targetl.vb, targetl.jb, - verbose=True) + targetl.organism, + targetl.va, + targetl.ja, + targetl.vb, + targetl.jb, + verbose=True, + ) all_ok = all_ok and ok if not ok: - print('ERROR bad tcr genes at index=', index) + print("ERROR bad tcr genes at index=", index) return all_ok def setup_for_alphafold( - tcr_db, - outdir, - organism=None, # if None, should be present in tcr_db - min_pmhc_count=None, - max_pmhc_count=None, - random_seed=1, # for subsampling tcrs - num_runs=3, - clobber=False, - exclude_self_peptide_docking_geometries=False, - alt_self_peptides_column=None, # this column should be comma-separated - exclude_pdbids_column=None, # this column should be comma-separated - targetid_prefix_suffix='', - use_opt_dgeoms=False, - **kwargs, + tcr_db, + outdir, + organism=None, # if None, should be present in tcr_db + min_pmhc_count=None, + max_pmhc_count=None, + random_seed=1, # for subsampling tcrs + num_runs=3, + clobber=False, + exclude_self_peptide_docking_geometries=False, + alt_self_peptides_column=None, # this column should be comma-separated + exclude_pdbids_column=None, # this column should be comma-separated + targetid_prefix_suffix="", + use_opt_dgeoms=False, + **kwargs, ): - ''' - ''' - assert outdir.endswith('/') + """ """ + assert outdir.endswith("/") if use_opt_dgeoms: assert num_runs == 1 - required_cols = 'va ja cdr3a vb jb cdr3b mhc_class mhc peptide'.split() + required_cols = "va ja cdr3a vb jb cdr3b mhc_class mhc peptide".split() if organism is None: - required_cols.append('organism') + required_cols.append("organism") for col in required_cols: assert col in tcr_db.columns - #assert not any(tcr_db.mhc.str.startswith('E*')) + # assert not any(tcr_db.mhc.str.startswith('E*')) tcr_db = tcr_db.copy() - tcr_db['mhc_peptide'] = ( - tcr_db.mhc.str.replace('*','',regex=False).str.replace(':','',regex=False)+ - '_'+tcr_db.peptide) + tcr_db["mhc_peptide"] = ( + tcr_db.mhc.str.replace("*", "", regex=False).str.replace( + ":", "", regex=False + ) + + "_" + + tcr_db.peptide + ) if organism is not None: - if 'organism' in tcr_db.columns: - assert all(tcr_db.organism==organism) + if "organism" in tcr_db.columns: + assert all(tcr_db.organism == organism) else: - tcr_db['organism'] = organism + tcr_db["organism"] = organism # doesnt do anything if init has already been called (I don't think) if not exists(outdir): os.mkdir(outdir) else: - assert clobber, 'outdir already exists: '+outdir + assert clobber, "outdir already exists: " + outdir - #optionally filter to peptide mhc combos with min/max counts ########## + # optionally filter to peptide mhc combos with min/max counts ########## if min_pmhc_count is not None or max_pmhc_count is not None: - tcr_db.sort_values('mhc_peptide', inplace=True) ## IMPORTANT B/C MASKING - random.seed(random_seed) # shuffling of epitope tcrs + tcr_db.sort_values( + "mhc_peptide", inplace=True + ) ## IMPORTANT B/C MASKING + random.seed(random_seed) # shuffling of epitope tcrs assert min_pmhc_count is not None and max_pmhc_count is not None mhc_peptides = tcr_db.mhc_peptide.drop_duplicates().to_list() mask = [] for mhc_peptide in mhc_peptides: - count = np.sum(tcr_db.mhc_peptide==mhc_peptide) + count = np.sum(tcr_db.mhc_peptide == mhc_peptide) if count < min_pmhc_count: - mhc_peptide_mask = [False]*count + mhc_peptide_mask = [False] * count elif count <= max_pmhc_count: - mhc_peptide_mask = [True]*count + mhc_peptide_mask = [True] * count else: - mhc_peptide_mask = [True]*max_pmhc_count+ [False]*(count-max_pmhc_count) + mhc_peptide_mask = [True] * max_pmhc_count + [False] * ( + count - max_pmhc_count + ) random.shuffle(mhc_peptide_mask) mask.extend(mhc_peptide_mask) old_size = tcr_db.shape[0] tcr_db = tcr_db[mask].copy() - print('subset tcr_db:', old_size, tcr_db.shape[0]) + print("subset tcr_db:", old_size, tcr_db.shape[0]) counts = tcr_db.mhc_peptide.value_counts().to_list() assert min_pmhc_count <= min(counts) and max(counts) <= max_pmhc_count - tcr_db_outfile = outdir+'tcr_db.tsv' - tcr_db.to_csv(tcr_db_outfile, sep='\t', index=False) + tcr_db_outfile = outdir + "tcr_db.tsv" + tcr_db.to_csv(tcr_db_outfile, sep="\t", index=False) tcr_db = tcr_db.reset_index(drop=True) - print('check genes for modeling', tcr_db.shape[0]) + print("check genes for modeling", tcr_db.shape[0]) for index, targetl in tcr_db.iterrows(): if not genes_ok_for_modeling( - targetl.organism, targetl.va, targetl.ja, targetl.vb, targetl.jb): - print('ERROR bad genes:', index, targetl.va, targetl.ja, - targetl.vb, targetl.jb) + targetl.organism, targetl.va, targetl.ja, targetl.vb, targetl.jb + ): + print( + "ERROR bad genes:", + index, + targetl.va, + targetl.ja, + targetl.vb, + targetl.jb, + ) assert False exit() targets_dfl = [] for index, targetl in tcr_db.iterrows(): - targetid_prefix = f'T{index:05d}_{targetl.mhc_peptide}{targetid_prefix_suffix}' - print('START', index, tcr_db.shape[0], targetid_prefix) - outfile_prefix = f'{outdir}{targetid_prefix}' + targetid_prefix = ( + f"T{index:05d}_{targetl.mhc_peptide}{targetid_prefix_suffix}" + ) + print("START", index, tcr_db.shape[0], targetid_prefix) + outfile_prefix = f"{outdir}{targetid_prefix}" if exclude_self_peptide_docking_geometries: exclude_docking_geometry_peptides = [targetl.peptide] else: exclude_docking_geometry_peptides = [] if alt_self_peptides_column is not None: assert exclude_docking_geometry_peptides - alt_self_peptides = targetl[alt_self_peptides_column].split(',') + alt_self_peptides = targetl[alt_self_peptides_column].split(",") if exclude_self_peptide_docking_geometries: exclude_docking_geometry_peptides.extend(alt_self_peptides) else: - alt_self_peptides=None + alt_self_peptides = None if exclude_pdbids_column is not None: exclude_pdbids = targetl[exclude_pdbids_column] if pd.isna(exclude_pdbids): exclude_pdbids = None else: - exclude_pdbids = exclude_pdbids.split(',') + exclude_pdbids = exclude_pdbids.split(",") else: exclude_pdbids = None all_run_info = make_templates_for_alphafold( - targetl.organism, targetl.va, targetl.ja, targetl.cdr3a, - targetl.vb, targetl.jb, targetl.cdr3b, - targetl.mhc_class, targetl.mhc, targetl.peptide, outfile_prefix, + targetl.organism, + targetl.va, + targetl.ja, + targetl.cdr3a, + targetl.vb, + targetl.jb, + targetl.cdr3b, + targetl.mhc_class, + targetl.mhc, + targetl.peptide, + outfile_prefix, exclude_docking_geometry_peptides=exclude_docking_geometry_peptides, - num_runs = num_runs, + num_runs=num_runs, alt_self_peptides=alt_self_peptides, - exclude_pdbids = exclude_pdbids, - use_opt_dgeoms = use_opt_dgeoms, + exclude_pdbids=exclude_pdbids, + use_opt_dgeoms=use_opt_dgeoms, **kwargs, ) for run in range(num_runs): - info = all_run_info[all_run_info.run==run] - assert info.shape[0] == 4#num templates - targetid = f'{targetid_prefix}_{run}' + info = all_run_info[all_run_info.run == run] + assert info.shape[0] == 4 # num templates + targetid = f"{targetid_prefix}_{run}" trg_cbseq = set(info.target_chainseq).pop() - alignfile = f'{outdir}{targetid}_alignments.tsv' - info.to_csv(alignfile, sep='\t', index=False) + alignfile = f"{outdir}{targetid}_alignments.tsv" + info.to_csv(alignfile, sep="\t", index=False) outl = pd.Series(targetl) - outl['targetid'] = targetid - outl['target_chainseq'] = trg_cbseq - outl['templates_alignfile'] = alignfile + outl["targetid"] = targetid + outl["target_chainseq"] = trg_cbseq + outl["templates_alignfile"] = alignfile targets_dfl.append(outl) sys.stdout.flush() # save partial work... since this is so freakin slow - outfile = outdir+'targets.tsv' - pd.DataFrame(targets_dfl).to_csv(outfile, sep='\t', index=False) + outfile = outdir + "targets.tsv" + pd.DataFrame(targets_dfl).to_csv(outfile, sep="\t", index=False) - outfile = outdir+'targets.tsv' - pd.DataFrame(targets_dfl).to_csv(outfile, sep='\t', index=False) - print('made:', outfile) + outfile = outdir + "targets.tsv" + pd.DataFrame(targets_dfl).to_csv(outfile, sep="\t", index=False) + print("made:", outfile) -def get_mhc_chain_trim_positions(chainseq, organism, mhc_class, mhc_allele, chain=None): - ''' - ''' - assert mhc_class in [1,2] - if mhc_class==2: - assert chain in ['A','B'] +def get_mhc_chain_trim_positions( + chainseq, organism, mhc_class, mhc_allele, chain=None +): + """ """ + assert mhc_class in [1, 2] + if mhc_class == 2: + assert chain in ["A", "B"] alseq = get_mhc_class_2_alseq(chain, mhc_allele) if alseq is None: - print('ERROR: None alseq:', organism, mhc_class, mhc_allele) + print("ERROR: None alseq:", organism, mhc_class, mhc_allele) return [None] - if chain == 'A': - #H2AKa EPQGGLQNIATGKHNLEI - #DRA EAQGALANIAVDKANLEI - ref_alseq = 'HVIIQ.AEFYLNPDQSGEFMFDFDGDEIFHVDMAKKETVWRLEEFGRFASFEAQGALANIAVDKANLEIMTKRSN' - ref_helices = ['EAQGALANIAVDKANLEI'] + if chain == "A": + # H2AKa EPQGGLQNIATGKHNLEI + # DRA EAQGALANIAVDKANLEI + ref_alseq = "HVIIQ.AEFYLNPDQSGEFMFDFDGDEIFHVDMAKKETVWRLEEFGRFASFEAQGALANIAVDKANLEIMTKRSN" + ref_helices = ["EAQGALANIAVDKANLEI"] else: - #H2AKb YWNKQ..YLERTRAELDTVCRHN - #DRB1*01 YWNSQKDLLEQRRAAVDTYCRHN - ref_alseq = 'FVHQFQPFCYFTNGTQRIRLVIRYIYNREEYVRFDSDVGEYRAVTELGRPDAEYWNKQ..YLERTRAELDTVCRHNYEKTETPTS' - ref_helices = ['YWNKQ..YLERTRAELDTVCRHN'] + # H2AKb YWNKQ..YLERTRAELDTVCRHN + # DRB1*01 YWNSQKDLLEQRRAAVDTYCRHN + ref_alseq = "FVHQFQPFCYFTNGTQRIRLVIRYIYNREEYVRFDSDVGEYRAVTELGRPDAEYWNKQ..YLERTRAELDTVCRHNYEKTETPTS" + ref_helices = ["YWNKQ..YLERTRAELDTVCRHN"] else: alseq = get_mhc_class_1_alseq(mhc_allele) if alseq is None: - print('ERROR: None alseq:', organism, mhc_class, mhc_allele) + print("ERROR: None alseq:", organism, mhc_class, mhc_allele) return [None] - #A02: GETRKVKAHSQTHRVDLGT and KWEAAHVAEQLRAYLEGTCVEW - #D-B: RETQKAKGQEQWFRVSLRN and KWEQSGAAEHYKAYLEGECVEW - if organism=='mouse': - ref_alseq = 'GPHSMRYFETAVSRPGLEEPRYISVGYVDNKEFVRFDSDAENPRYEPRAPWMEQEGPEYWERETQKAKGQEQWFRVSLRNLLGYYNQSAGGSHTLQQMSGCDLGSDWRLLRGYLQFAYEGRDYIALNEDLKTWTAADMAAQITRRKWEQSGAAEHYKAYLEGECVEWLHRYLKNGNATLLR' - ref_helices = ['RETQKAKGQEQWFRVSLRN', 'KWEQSGAAEHYKAYLEGECVEW'] + # A02: GETRKVKAHSQTHRVDLGT and KWEAAHVAEQLRAYLEGTCVEW + # D-B: RETQKAKGQEQWFRVSLRN and KWEQSGAAEHYKAYLEGECVEW + if organism == "mouse": + ref_alseq = "GPHSMRYFETAVSRPGLEEPRYISVGYVDNKEFVRFDSDAENPRYEPRAPWMEQEGPEYWERETQKAKGQEQWFRVSLRNLLGYYNQSAGGSHTLQQMSGCDLGSDWRLLRGYLQFAYEGRDYIALNEDLKTWTAADMAAQITRRKWEQSGAAEHYKAYLEGECVEWLHRYLKNGNATLLR" + ref_helices = ["RETQKAKGQEQWFRVSLRN", "KWEQSGAAEHYKAYLEGECVEW"] else: - ref_alseq = 'GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRMEPRAPWIEQEGPEYWDGETRKVKAHSQTHRVDLGTLRGYYNQSEAGSHTVQRMYGCDVGSDWRFLRGYHQYAYDGKDYIALKEDLRSWTAADMAAQTTKHKWEAAHVAEQLRAYLEGTCVEWLRRYLENG' - ref_helices = ['GETRKVKAHSQTHRVDLGT', 'KWEAAHVAEQLRAYLEGTCVEW'] + ref_alseq = "GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRMEPRAPWIEQEGPEYWDGETRKVKAHSQTHRVDLGTLRGYYNQSEAGSHTVQRMYGCDVGSDWRFLRGYHQYAYDGKDYIALKEDLRSWTAADMAAQTTKHKWEAAHVAEQLRAYLEGTCVEWLRRYLENG" + ref_helices = ["GETRKVKAHSQTHRVDLGT", "KWEAAHVAEQLRAYLEGTCVEW"] assert len(alseq) == len(ref_alseq) - alseqseq = alseq.replace(ALL_GENES_GAP_CHAR, '') + alseqseq = alseq.replace(ALL_GENES_GAP_CHAR, "") if chainseq != alseqseq: - #print('WARNING: mhc seq mismatch: chainseq=', chainseq) - #print('WARNING: mhc seq mismatch: alseqseq=', alseqseq) + # print('WARNING: mhc seq mismatch: chainseq=', chainseq) + # print('WARNING: mhc seq mismatch: alseqseq=', alseqseq) as2cs = blosum_align(alseqseq, chainseq) else: - as2cs = {i:i for i in range(len(chainseq))} + as2cs = {i: i for i in range(len(chainseq))} positions = [] for helix in ref_helices: start = ref_alseq.index(helix) - for pos in range(start, start+len(helix)): + for pos in range(start, start + len(helix)): if alseq[pos] != ALL_GENES_GAP_CHAR: - i = pos-alseq[:pos].count(ALL_GENES_GAP_CHAR) + i = pos - alseq[:pos].count(ALL_GENES_GAP_CHAR) if i in as2cs: positions.append(as2cs[i]) return positions From 752c2cacbf60b7c223d3b25bc0d4c4113740297b Mon Sep 17 00:00:00 2001 From: Lawson Date: Fri, 13 Jun 2025 12:12:10 -0700 Subject: [PATCH 12/19] fix deprecated read mode --- tcrdock/sequtil.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tcrdock/sequtil.py b/tcrdock/sequtil.py index 122cc63..8a852bb 100644 --- a/tcrdock/sequtil.py +++ b/tcrdock/sequtil.py @@ -44,7 +44,7 @@ def read_fasta(filename): # helper """return OrderedDict indexed by the ">" lines (everything after >)""" - data = open(filename, "rU") + data = open(filename, "r") fasta = OrderedDict() for line in data: if line[0] == ">": From 89854aeebcec1be96fd3216a6b133c03fdaa9327 Mon Sep 17 00:00:00 2001 From: Lawson Date: Fri, 13 Jun 2025 12:24:26 -0700 Subject: [PATCH 13/19] switch to pyproject.toml --- pyproject.toml | 31 ++++++++++++++++++++++++++++++- setup.py | 44 -------------------------------------------- 2 files changed, 30 insertions(+), 45 deletions(-) delete mode 100644 setup.py diff --git a/pyproject.toml b/pyproject.toml index 4e715e0..d7941d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,32 @@ [build-system] requires = ["setuptools >= 42.0.0"] -build-backend = "setuptools.build_meta" \ No newline at end of file +build-backend = "setuptools.build_meta" + +[project] +name = "tcrdock" +version = "0.1.0" + +dependencies = [ + "biopython", + "numpy", + "pandas", + "scipy", + "matplotlib", +] + +[tool.setuptools] +packages = ["tcrdock"] + +[tool.setuptools.package-data] +tcrdock = [ + "**/*.alfas", + "**/*.fasta", + "**/*.txt", + "**/*.tsv", + "**/tcr/*.pdb", + "**/tcr/*.json", + "**/ternary/*.pdb", + "**/ternary/*.json", + "**/pmhc/*.pdb", + "**/pmhc/*.json", +] \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index 75a5d3e..0000000 --- a/setup.py +++ /dev/null @@ -1,44 +0,0 @@ -# setup.py -import subprocess -import sys -from setuptools import setup, find_packages -from setuptools.command.install import install - - -class CustomInstall(install): - """Run our database installer, then fall back to the normal install.""" - - def run(self): - subprocess.check_call([sys.executable, "download_blast.py"]) - super().run() - - -setup( - name="tcrdock", - version="0.1.0", - packages=["tcrdock"], - install_requires=[ - "biopython", - "numpy", - "pandas", - "scipy", - "matplotlib", - ], - cmdclass={ - "install": CustomInstall, - }, - package_data={ - "tcrdock": [ - "**/*.alfas", - "**/*.fasta", - "**/*.txt", - "**/*.tsv", - "**/tcr/*.pdb", - "**/tcr/*.json", - "**/ternary/*.pdb", - "**/ternary/*.json", - "**/pmhc/*.pdb", - "**/pmhc/*.json", - ] - }, -) From 2fdf2d00f7474e46d591b35cef2acae41bc06f0f Mon Sep 17 00:00:00 2001 From: Lawson Date: Fri, 13 Jun 2025 12:28:39 -0700 Subject: [PATCH 14/19] move blast to conda --- README.md | 2 +- devtools/conda-envs/test_env.yaml | 4 ++ tcrdock/blast.py | 93 ++++++++++++++++--------------- 3 files changed, 52 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index 0c56e54..0bd734b 100644 --- a/README.md +++ b/README.md @@ -141,7 +141,7 @@ to be installed, which can be done by running the script `download_blast.py`. A potential installation route would be: ``` -conda create --name tcrdock_test python=3.8 +conda create --name tcrdock_test python blast source activate tcrdock_test # or: conda activate tcrdock_test pip install ``` diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml index 2402f01..a3e0072 100644 --- a/devtools/conda-envs/test_env.yaml +++ b/devtools/conda-envs/test_env.yaml @@ -7,4 +7,8 @@ dependencies: - python>=3.10.0 - pip + ## main deps + - blast + + ## testing - pytest \ No newline at end of file diff --git a/tcrdock/blast.py b/tcrdock/blast.py index ed783ee..93850e0 100644 --- a/tcrdock/blast.py +++ b/tcrdock/blast.py @@ -5,63 +5,64 @@ from os.path import exists, isdir from .util import amino_acids -path_to_blast_executables = Path(__file__).parents[1] / 'ncbi-blast-2.11.0+' / 'bin' -assert isdir( path_to_blast_executables ),\ - 'You need to download blast; please run download_blast.py in TCRdock/ folder' +blastp_exe = str("blastp") +makeblastdb_exe = str("makeblastdb") -blastp_exe = str(path_to_blast_executables / 'blastp') -makeblastdb_exe = str(path_to_blast_executables / 'makeblastdb') +blast_fields = ( + "evalue bitscore qaccver saccver pident length mismatch" + " gapopen qstart qend qlen qseq sstart send slen sseq" +) -blast_fields = ('evalue bitscore qaccver saccver pident length mismatch' - ' gapopen qstart qend qlen qseq sstart send slen sseq') -def make_blast_dbs(fastafile, dbtype='prot'): - assert dbtype in ['prot','nucl'] +def make_blast_dbs(fastafile, dbtype="prot"): + assert dbtype in ["prot", "nucl"] - cmd = f'{makeblastdb_exe} -in {fastafile} -dbtype {dbtype}' + cmd = f"{makeblastdb_exe} -in {fastafile} -dbtype {dbtype}" print(cmd) system(cmd) def check_for_blast_dbs(fastafile): - return exists(str(fastafile)+'.phr') + return exists(str(fastafile) + ".phr") + def blast_file_and_read_hits( - fname, - dbfile, - evalue = 1e-3, - num_alignments = 10000, - verbose=False, - clobber=False, - extra_blast_args='' + fname, + dbfile, + evalue=1e-3, + num_alignments=10000, + verbose=False, + clobber=False, + extra_blast_args="", ): - assert exists(dbfile), f'missing file for BLAST-ing against: {dbfile}' + assert exists(dbfile), f"missing file for BLAST-ing against: {dbfile}" # check for blast database files if not check_for_blast_dbs(dbfile): # maybe we haven't set up the blast files yet... - print('WARNING: missing blast db files, trying to create...') + print("WARNING: missing blast db files, trying to create...") make_blast_dbs(dbfile) - assert check_for_blast_dbs(dbfile), 'Failed to create blast db files!' + assert check_for_blast_dbs(dbfile), "Failed to create blast db files!" - outfile = fname+'.blast' + outfile = fname + ".blast" assert clobber or not exists(outfile) - cmd = (f'{blastp_exe} -query {fname} -db {dbfile} {extra_blast_args} ' - f' -outfmt "10 delim=, {blast_fields}" -evalue {evalue}' - f' -num_alignments {num_alignments} -out {outfile}') + cmd = ( + f"{blastp_exe} -query {fname} -db {dbfile} {extra_blast_args} " + f' -outfmt "10 delim=, {blast_fields}" -evalue {evalue}' + f" -num_alignments {num_alignments} -out {outfile}" + ) if not verbose: - cmd += ' 2> /dev/null' + cmd += " 2> /dev/null" if verbose: print(cmd) system(cmd) - blast_hits = pd.read_csv( - outfile, header=None, names=blast_fields.split()) - #blast_hits.rename(columns={'saccver':'pdb_chain', 'qaccver':'allele'}, inplace = True) - #blast_hits.sort_values('pident', ascending=False, inplace=True) + blast_hits = pd.read_csv(outfile, header=None, names=blast_fields.split()) + # blast_hits.rename(columns={'saccver':'pdb_chain', 'qaccver':'allele'}, inplace = True) + # blast_hits.sort_values('pident', ascending=False, inplace=True) if exists(outfile): remove(outfile) @@ -70,21 +71,22 @@ def blast_file_and_read_hits( def blast_sequence_and_read_hits( - query_sequence, - dbfile, - tmpfile_prefix = '', - evalue = 1e-3, - num_alignments = 10000, - verbose=False, + query_sequence, + dbfile, + tmpfile_prefix="", + evalue=1e-3, + num_alignments=10000, + verbose=False, ): - tmpfile = f'{tmpfile_prefix}tmp_fasta_{random.random()}.fasta' + tmpfile = f"{tmpfile_prefix}tmp_fasta_{random.random()}.fasta" - out = open(tmpfile, 'w') - out.write(f'>tmp\n{query_sequence}\n') + out = open(tmpfile, "w") + out.write(f">tmp\n{query_sequence}\n") out.close() blast_hits = blast_file_and_read_hits( - tmpfile, dbfile, evalue, num_alignments, verbose, clobber=True) + tmpfile, dbfile, evalue, num_alignments, verbose, clobber=True + ) if exists(tmpfile): remove(tmpfile) @@ -92,19 +94,18 @@ def blast_sequence_and_read_hits( return blast_hits - def setup_query_to_hit_map(hit): - ''' hit is a single row from blast_hits + """hit is a single row from blast_hits query2hit_align is a dictionary mapping from full-length, 0-indexed positions in query sequence to full-length, 0-indexed positions in hit sequence - ''' + """ query2hit_align = {} - for ii,(qaa,haa) in enumerate(zip(hit.qseq, hit.sseq)): + for ii, (qaa, haa) in enumerate(zip(hit.qseq, hit.sseq)): if qaa in amino_acids and haa in amino_acids: - qpos = hit.qstart + ii - hit.qseq[:ii].count('-') - 1 #0-idx - hpos = hit.sstart + ii - hit.sseq[:ii].count('-') - 1 # + qpos = hit.qstart + ii - hit.qseq[:ii].count("-") - 1 # 0-idx + hpos = hit.sstart + ii - hit.sseq[:ii].count("-") - 1 # query2hit_align[qpos] = hpos return query2hit_align From 1c6621cd0a4b85fb0d46c11b64f62125c920727e Mon Sep 17 00:00:00 2001 From: Lawson Date: Fri, 13 Jun 2025 12:31:30 -0700 Subject: [PATCH 15/19] add bioconda --- README.md | 5 ++--- devtools/conda-envs/test_env.yaml | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0bd734b..ce60b18 100644 --- a/README.md +++ b/README.md @@ -137,11 +137,10 @@ The non-AlphaFold Python package requirements are listed in `requirements.txt`. Those specific package versions should work, but there should also be plenty of flexibility on the versions. The TCR and MHC parsing code also requires the NCBI BLAST+ software -to be installed, which can be done by running the script -`download_blast.py`. A potential installation route would be: +to be installed, which can be done using conda. A potential installation route would be: ``` -conda create --name tcrdock_test python blast +conda create -c bioconda --name tcrdock_test python blast source activate tcrdock_test # or: conda activate tcrdock_test pip install ``` diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml index a3e0072..7478bf1 100644 --- a/devtools/conda-envs/test_env.yaml +++ b/devtools/conda-envs/test_env.yaml @@ -2,6 +2,7 @@ name: tcrdock-test channels: - defaults - conda-forge + - bioconda dependencies: ### Base depends ### - python>=3.10.0 From a49d4c3998381f62c4c619dfca8b6832cd974d26 Mon Sep 17 00:00:00 2001 From: Lawson Date: Fri, 13 Jun 2025 13:36:27 -0700 Subject: [PATCH 16/19] remove references to blast executable --- tcrdock/mhc_util.py | 204 +++++++++++++++++++---------------- tcrdock/tcrdist/parsing.py | 211 +++++++++++++++++++++---------------- 2 files changed, 231 insertions(+), 184 deletions(-) diff --git a/tcrdock/mhc_util.py b/tcrdock/mhc_util.py index 6849f02..d9b86b4 100644 --- a/tcrdock/mhc_util.py +++ b/tcrdock/mhc_util.py @@ -1,5 +1,6 @@ # import numpy as np import sys + # from os import system import os.path from os import system, popen @@ -14,7 +15,6 @@ from . import sequtil from . import pdblite -from .blast import path_to_blast_executables from .util import path_to_db from .sequtil import ALL_GENES_GAP_CHAR @@ -29,62 +29,81 @@ ## 94...100 are upward facing in the C-terminal central strand ## 113.115 are upward facing in the neighboring C-terminal ## -class1_template_core_positions_1indexed = [4, 6, 8, 10, 23, 25, - 94, 96, 98, 100, 113, 115] +class1_template_core_positions_1indexed = [ + 4, + 6, + 8, + 10, + 23, + 25, + 94, + 96, + 98, + 100, + 113, + 115, +] class1_template_core_positions_0indexed = [ - x-1 for x in class1_template_core_positions_1indexed] + x - 1 for x in class1_template_core_positions_1indexed +] -class1_template_seq = 'PHSMRYFETAVSRPGLEEPRYISVGYVDNKEFVRFDSDAENPRYEPRAPWMEQEGPEYWERETQKAKGQEQWFRVSLRNLLGYYNQSAGGSHTLQQMSGCDLGSDWRLLRGYLQFAYEGRDYIALNEDLKTWTAADMAAQITRRKWEQSGAAEHYKAYLEGECVEWLHRYLKNGNATLLRTDSPKAHVTHHPRSKGEVTLRCWALGFYPADITLTWQLNGEELTQDMELVETRPAGDGTFQKWASVVVPLGKEQNYTCRVYHEGLPEPLTLRWEP' +class1_template_seq = "PHSMRYFETAVSRPGLEEPRYISVGYVDNKEFVRFDSDAENPRYEPRAPWMEQEGPEYWERETQKAKGQEQWFRVSLRNLLGYYNQSAGGSHTLQQMSGCDLGSDWRLLRGYLQFAYEGRDYIALNEDLKTWTAADMAAQITRRKWEQSGAAEHYKAYLEGECVEWLHRYLKNGNATLLRTDSPKAHVTHHPRSKGEVTLRCWALGFYPADITLTWQLNGEELTQDMELVETRPAGDGTFQKWASVVVPLGKEQNYTCRVYHEGLPEPLTLRWEP" class2_alfas_positions_0indexed = { - 'A': [2, 4, 7, 9, 18, 20], - 'B': [2, 4, 6, 8, 21, 23], # 8 is disulfide posn + "A": [2, 4, 7, 9, 18, 20], + "B": [2, 4, 6, 8, 21, 23], # 8 is disulfide posn } + def get_mhc_core_positions_class1(seq): - ''' These are 0-indexed positions (unlike previous tcrdock) + """These are 0-indexed positions (unlike previous tcrdock) will have -1 if there's a parse fail - ''' + """ al = sequtil.blosum_align(class1_template_seq, seq) core_positions = [ al.get(pos, -1) for pos in class1_template_core_positions_0indexed ] - assert -1 not in core_positions # probably should handle this better + assert -1 not in core_positions # probably should handle this better return core_positions def get_mhc_core_positions_class2( - aseq, # chain 1 ie alpha - bseq, # chain 2 ie beta + aseq, # chain 1 ie alpha + bseq, # chain 2 ie beta ): - ''' these are 0-indexed positions!!! (unlike previous tcrdock) + """these are 0-indexed positions!!! (unlike previous tcrdock) will have -1 if there's a parse fail - ''' + """ offset = 0 core_positions = [] - for ab, seq in zip('AB',[aseq,bseq]): - dbfile = str(path_to_db / f'both_class_2_{ab}_chains_v2.fasta') + for ab, seq in zip("AB", [aseq, bseq]): + dbfile = str(path_to_db / f"both_class_2_{ab}_chains_v2.fasta") hits = blast.blast_sequence_and_read_hits(seq, dbfile) hit = hits.iloc[0] blast_align = blast.setup_query_to_hit_map(hit) - if hit.pident<99.99: - print('get_mhc_core_positions_class2:', ab, hit.saccver, hit.pident) + if hit.pident < 99.99: + print( + "get_mhc_core_positions_class2:", ab, hit.saccver, hit.pident + ) alfas = sequtil.mhc_class_2_alfas[ab][hit.saccver] - hitseq = alfas.replace(ALL_GENES_GAP_CHAR, '') - blast_hitseq = hit.sseq.replace('-','') # debug - assert hitseq.index(blast_hitseq) == hit.sstart-1 # debug + hitseq = alfas.replace(ALL_GENES_GAP_CHAR, "") + blast_hitseq = hit.sseq.replace("-", "") # debug + assert hitseq.index(blast_hitseq) == hit.sstart - 1 # debug # make an alignment from alfas to seq - alfas2hitseq = {i:i-alfas[:i].count(ALL_GENES_GAP_CHAR) - for i,a in enumerate(alfas) if a != ALL_GENES_GAP_CHAR} - hitseq2seq = {j:i for i,j in blast_align.items()} + alfas2hitseq = { + i: i - alfas[:i].count(ALL_GENES_GAP_CHAR) + for i, a in enumerate(alfas) + if a != ALL_GENES_GAP_CHAR + } + hitseq2seq = {j: i for i, j in blast_align.items()} for pos in class2_alfas_positions_0indexed[ab]: hitseq_pos = alfas2hitseq[pos] @@ -94,40 +113,38 @@ def get_mhc_core_positions_class2( core_positions.append(-1) offset += len(seq) - assert -1 not in core_positions # probably should handle this better + assert -1 not in core_positions # probably should handle this better return core_positions def _setup_class_2_alfas_blast_dbs(): - ''' Just called once during setup - ''' + """Just called once during setup""" - makeblastdb_exe = str(path_to_blast_executables / 'makeblastdb') + makeblastdb_exe = str("makeblastdb") - for ab in 'AB': - alfas_fname = str(path_to_db / f'both_class_2_{ab}_chains_v2.alfas') - fasta_fname = alfas_fname[:-5]+'fasta' + for ab in "AB": + alfas_fname = str(path_to_db / f"both_class_2_{ab}_chains_v2.alfas") + fasta_fname = alfas_fname[:-5] + "fasta" alfas = sequtil.read_fasta(alfas_fname) - out = open(fasta_fname, 'w') - for name,alseq in alfas.items(): - seq = alseq.replace(ALL_GENES_GAP_CHAR, '') - out.write(f'>{name}\n{seq}\n') + out = open(fasta_fname, "w") + for name, alseq in alfas.items(): + seq = alseq.replace(ALL_GENES_GAP_CHAR, "") + out.write(f">{name}\n{seq}\n") out.close() # format the db - cmd = f'{makeblastdb_exe} -in {fasta_fname} -dbtype prot' + cmd = f"{makeblastdb_exe} -in {fasta_fname} -dbtype prot" print(cmd) system(cmd) - def get_mhc_stub( - pose, - tdinfo = None, - mhc_core_positions = None, + pose, + tdinfo=None, + mhc_core_positions=None, ): - ''' chain 1 (or 1 and 2) should be the mhc chain + """chain 1 (or 1 and 2) should be the mhc chain chain -3 should be the peptide chain if TCR else chain -1 (pMHC only) the mhc_core_positions (or tdinfo.mhc_core) are half on one side of the @@ -146,61 +163,66 @@ def get_mhc_stub( dict {'axes':numpy array with axes as ROW vectors, 'origin': vector} - ''' + """ if mhc_core_positions is None: if tdinfo is not None: mhc_core_positions = tdinfo.mhc_core else: - print('get_mhc_stub needs either tdinfo or mhc_core_positions') + print("get_mhc_stub needs either tdinfo or mhc_core_positions") sys.exit() - num_chains = len(pose['chains']) - if num_chains in [4,5]: - pep_chain = num_chains-3 + num_chains = len(pose["chains"]) + if num_chains in [4, 5]: + pep_chain = num_chains - 3 else: # huh, maybe mhc only? - assert num_chains in [2,3] - pep_chain = num_chains-1 + assert num_chains in [2, 3] + pep_chain = num_chains - 1 # we may need to flip - #current_x = stub.rotation().col_x() + # current_x = stub.rotation().col_x() # was hardcoding pep_chain to 2 here... - chainbounds = pose['chainbounds'] - pep_positions = list(range(chainbounds[pep_chain], chainbounds[pep_chain+1])) - pep_centroid = np.mean(pose['ca_coords'][pep_positions], axis=0) + chainbounds = pose["chainbounds"] + pep_positions = list( + range(chainbounds[pep_chain], chainbounds[pep_chain + 1]) + ) + pep_centroid = np.mean(pose["ca_coords"][pep_positions], axis=0) stub = superimpose.get_symmetry_stub_from_positions( - mhc_core_positions, pose, point_towards=pep_centroid) + mhc_core_positions, pose, point_towards=pep_centroid + ) - assert (pep_centroid - stub['origin']).dot(stub['axes'][0]) > 0 + assert (pep_centroid - stub["origin"]).dot(stub["axes"][0]) > 0 return stub def orient_pmhc_pose( - pose, - mhc_core_positions = None, - mhc_class = None, - tdinfo = None, # TCRdockInfo + pose, + mhc_core_positions=None, + mhc_class=None, + tdinfo=None, # TCRdockInfo ): - ''' chain 1 should be the mhc chain + """chain 1 should be the mhc chain chain 2 should be the peptide chain - ''' + """ if mhc_core_positions is None: if tdinfo is None: - cs = pose['chainseq'].split('/') + cs = pose["chainseq"].split("/") if mhc_class == 1: mhc_core_positions = get_mhc_core_positions_class1(cs[0]) else: assert mhc_class == 2 - mhc_core_positions = get_mhc_core_positions_class2(cs[0], cs[1]) + mhc_core_positions = get_mhc_core_positions_class2( + cs[0], cs[1] + ) else: mhc_core_positions = tdinfo.mhc_core stub = get_mhc_stub(pose, mhc_core_positions=mhc_core_positions) - R = stub['axes'] - v = -1. * R @ stub['origin'] + R = stub["axes"] + v = -1.0 * R @ stub["origin"] pose = pdblite.apply_transform_Rx_plus_v(pose, R, v) @@ -208,55 +230,51 @@ def orient_pmhc_pose( def make_sorting_tuple(allele): - ''' Helper function for choosing among equal blast hits - ''' - if '*' not in allele: - assert ':' not in allele - return (allele,0) + """Helper function for choosing among equal blast hits""" + if "*" not in allele: + assert ":" not in allele + return (allele, 0) else: - pref, suf = allele.split('*') # just one star + pref, suf = allele.split("*") # just one star if suf[-1].isdigit(): - suf = [int(x) for x in suf.split(':')] + suf = [int(x) for x in suf.split(":")] else: - suf = [int(x) for x in suf[:-1].split(':')]+[suf[-1]] - - return tuple([pref]+suf) + suf = [int(x) for x in suf[:-1].split(":")] + [suf[-1]] + return tuple([pref] + suf) def get_mhc_allele(seq, organism, return_identity=False): - if organism == 'human': - dbfile = (util.path_to_db / - 'hla_prot_plus_trimmed_minus_funny_w_CD1s_nr_v2.fasta') + if organism == "human": + dbfile = ( + util.path_to_db + / "hla_prot_plus_trimmed_minus_funny_w_CD1s_nr_v2.fasta" + ) else: - assert organism == 'mouse' - dbfile = (util.path_to_db / - 'mhc_pdb_chains_mouse_reps_reformat.fasta') - + assert organism == "mouse" + dbfile = util.path_to_db / "mhc_pdb_chains_mouse_reps_reformat.fasta" - hits = blast.blast_sequence_and_read_hits( - seq, dbfile, num_alignments=3) + hits = blast.blast_sequence_and_read_hits(seq, dbfile, num_alignments=3) top_bitscore = max(hits.bitscore) top_hits = hits[hits.bitscore == top_bitscore].reset_index() - if top_hits.shape[0]>1:# ties - #print('ties:', top_hits.shape[0]) - sortl = [(make_sorting_tuple(a), a, i) for i,a in enumerate(top_hits.saccver)] + if top_hits.shape[0] > 1: # ties + # print('ties:', top_hits.shape[0]) + sortl = [ + (make_sorting_tuple(a), a, i) + for i, a in enumerate(top_hits.saccver) + ] sortl.sort() - #print('sorted:', sortl) + # print('sorted:', sortl) ind = sortl[0][-1] hit = top_hits.iloc[ind] assert hit.saccver == sortl[0][1] else: hit = top_hits.iloc[0] mhc = hit.saccver - if organism == 'mouse' and len(mhc) == 3 and mhc[1] == '-': - mhc = f'H2{mhc[0]}{mhc[2].lower()}' + if organism == "mouse" and len(mhc) == 3 and mhc[1] == "-": + mhc = f"H2{mhc[0]}{mhc[2].lower()}" if return_identity: return mhc, hit.pident else: return mhc - - - - diff --git a/tcrdock/tcrdist/parsing.py b/tcrdock/tcrdist/parsing.py index 45c6671..7fed30a 100644 --- a/tcrdock/tcrdist/parsing.py +++ b/tcrdock/tcrdist/parsing.py @@ -11,45 +11,59 @@ from .all_genes import all_genes, gap_character from .genetic_code import genetic_code, reverse_genetic_code from . import logo_tools -from ..blast import (blast_sequence_and_read_hits, setup_query_to_hit_map, - path_to_blast_executables) +from ..blast import blast_sequence_and_read_hits, setup_query_to_hit_map + def get_blast_db_path(organism, ab, vj): - ''' This is the protein sequence db - ''' - return path_to_db / f'imgt_prot_blast_db_{organism}_{ab}_{vj}.fasta' + """This is the protein sequence db""" + return path_to_db / f"imgt_prot_blast_db_{organism}_{ab}_{vj}.fasta" ## borrowed from cdr3s_human.py ## 1-indexed: -extra_alignment_columns_1x = { 'mouse':{'A':[9,86],'B':[] }, ## 1-indexed - 'human':{'A':[],'B':[] } } +extra_alignment_columns_1x = { + "mouse": {"A": [9, 86], "B": []}, ## 1-indexed + "human": {"A": [], "B": []}, +} core_positions_generic_1x = [ - 21, 23, 25, ## 23 is C - 39, 41, ## 41 is W - 53, 54, 55, - 78, ## maybe also 80? - 89, ## 89 is L - 102, 103, 104 ## 104 is C + 21, + 23, + 25, ## 23 is C + 39, + 41, ## 41 is W + 53, + 54, + 55, + 78, ## maybe also 80? + 89, ## 89 is L + 102, + 103, + 104, ## 104 is C ] all_core_alseq_positions_0x = {} for organism in extra_alignment_columns_1x: - for ab,xcols in extra_alignment_columns_1x[organism].items(): - positions = [ x-1+sum(y<=x for y in xcols) # this is not quite right but it wrks - for x in core_positions_generic_1x ] - - if False:# helpful for debugging/info - for v,g in all_genes[organism].items(): - allele = int(v.split('*')[-1]) # sanity check allele names end with *int - if g.chain == ab and g.region == 'V' and v.endswith('*01'): - coreseq = ''.join([g.alseq[x] for x in positions]) - if '*' in coreseq or coreseq[1]+coreseq[-1] != 'CC': - print('funny coreseq:', coreseq, v, organism, ab) - - all_core_alseq_positions_0x[(organism,ab)] = positions + for ab, xcols in extra_alignment_columns_1x[organism].items(): + positions = [ + x + - 1 + + sum(y <= x for y in xcols) # this is not quite right but it wrks + for x in core_positions_generic_1x + ] + + if False: # helpful for debugging/info + for v, g in all_genes[organism].items(): + allele = int( + v.split("*")[-1] + ) # sanity check allele names end with *int + if g.chain == ab and g.region == "V" and v.endswith("*01"): + coreseq = "".join([g.alseq[x] for x in positions]) + if "*" in coreseq or coreseq[1] + coreseq[-1] != "CC": + print("funny coreseq:", coreseq, v, organism, ab) + + all_core_alseq_positions_0x[(organism, ab)] = positions def get_core_positions_0x(organism, v_gene): @@ -62,40 +76,39 @@ def get_core_positions_0x(organism, v_gene): core_positions_0x.append(pos - alseq[:pos].count(gap_character)) return core_positions_0x + def make_blast_dbs(): - makeblastdb_exe = str(path_to_blast_executables / 'makeblastdb') + makeblastdb_exe = str("makeblastdb") # protein sequences for organism in all_genes: - for ab in 'AB': - for vj in 'VJ': + for ab in "AB": + for vj in "VJ": pname = get_blast_db_path(organism, ab, vj) - out = open(pname, 'w') + out = open(pname, "w") for id, g in all_genes[organism].items(): if g.chain == ab and g.region == vj: - out.write(f'>{id}\n{g.protseq}\n') + out.write(f">{id}\n{g.protseq}\n") out.close() - print('made:', pname) + print("made:", pname) # format the db - cmd = f'{makeblastdb_exe} -in {pname} -dbtype prot' + cmd = f"{makeblastdb_exe} -in {pname} -dbtype prot" print(cmd) system(cmd) - - -def parse_core_positions( organism, ab, qseq, v_gene, q2v_align ): - ''' returns qcore_positions, mismatches +def parse_core_positions(organism, ab, qseq, v_gene, q2v_align): + """returns qcore_positions, mismatches qcore_positions is a list of 0-indexed core positions in qseq, may contain -1 if the q2v_align alignment does not cover all the v_gene core positions - ''' + """ vseq = all_genes[organism][v_gene].protseq core_positions = get_core_positions_0x(organism, v_gene) - qcore_positions = [-1]*len(core_positions) + qcore_positions = [-1] * len(core_positions) mismatches = 0 for qpos, vpos in q2v_align.items(): @@ -112,78 +125,84 @@ def parse_other_cdrs(organism, ab, qseq, v_gene, q2v_align): alseq = all_genes[organism][v_gene].alseq cdr_cols_1x = all_genes[organism][v_gene].cdr_columns - assert alseq.replace(gap_character,'') == v_seq - alseq2seq = {i:i-alseq[:i].count(gap_character) - for i,a in enumerate(alseq) - if a != gap_character} + assert alseq.replace(gap_character, "") == v_seq + alseq2seq = { + i: i - alseq[:i].count(gap_character) + for i, a in enumerate(alseq) + if a != gap_character + } - assert len(cdr_cols_1x) == 4 ## CDR1, CDR2, CDR2.5, and CDR3(start) + assert len(cdr_cols_1x) == 4 ## CDR1, CDR2, CDR2.5, and CDR3(start) qseq_cdr_bounds = [] cdr_mismatches = 0 for start_col_1x, stop_col_1x in cdr_cols_1x[:3]: - start = alseq2seq[start_col_1x-1] - stop = alseq2seq[stop_col_1x-1] + start = alseq2seq[start_col_1x - 1] + stop = alseq2seq[stop_col_1x - 1] ## what aligns to this region in v_hit - q_loop_positions = [i for i,j in q2v_align.items() if start <= j <= stop] + q_loop_positions = [ + i for i, j in q2v_align.items() if start <= j <= stop + ] if q_loop_positions: - qstart = min( q_loop_positions ) - qstop = max( q_loop_positions ) + qstart = min(q_loop_positions) + qstop = max(q_loop_positions) qseq_cdr_bounds.append([qstart, qstop]) - for i in range(qstart,qstop+1): + for i in range(qstart, qstop + 1): if i not in q2v_align or v_seq[q2v_align[i]] != qseq[i]: cdr_mismatches += 1 else: - qseq_cdr_bounds.append((None,None)) - cdr_mismatches += stop-start+1 + qseq_cdr_bounds.append((None, None)) + cdr_mismatches += stop - start + 1 return qseq_cdr_bounds, cdr_mismatches def parse_cdr3(organism, ab, qseq, v_gene, j_gene, q2v_align, q2j_align): - ''' returns (C position, F position) ie (CDR3 start, CDR3 stop) + """returns (C position, F position) ie (CDR3 start, CDR3 stop) in 0-indexed qseq numbering either will be None if parsing failed - ''' + """ vg = all_genes[organism][v_gene] jg = all_genes[organism][j_gene] ## what is the C position in this v gene? - cpos = vg.cdr_columns[3][0]-1 ## now 0-indexed, vg.alseq numbers - cpos -= vg.alseq[:cpos].count(gap_character) # now vg.protseq numbers + cpos = vg.cdr_columns[3][0] - 1 ## now 0-indexed, vg.alseq numbers + cpos -= vg.alseq[:cpos].count(gap_character) # now vg.protseq numbers - v2q_align = {j:i for i,j in q2v_align.items()} + v2q_align = {j: i for i, j in q2v_align.items()} query_cpos = v2q_align.get(cpos, None) # V mismatches outside cdr3 region - v_mismatches = sum(qseq[i] != vg.protseq[j] - for i,j in q2v_align.items() - if j < cpos) + v_mismatches = sum( + qseq[i] != vg.protseq[j] for i, j in q2v_align.items() if j < cpos + ) - fpos = jg.cdr_columns[0][1]-1 # 0-idx jg.alseq numbers - fpos -= jg.alseq[:fpos].count(gap_character) # 0-idx, jg.protseq numbers + fpos = jg.cdr_columns[0][1] - 1 # 0-idx jg.alseq numbers + fpos -= jg.alseq[:fpos].count(gap_character) # 0-idx, jg.protseq numbers - j2q_align = {j:i for i,j in q2j_align.items()} + j2q_align = {j: i for i, j in q2j_align.items()} query_fpos = j2q_align.get(fpos, None) - j_mismatches = sum(qseq[i] != jg.protseq[j] - for i,j in q2j_align.items() - if j > fpos) + j_mismatches = sum( + qseq[i] != jg.protseq[j] for i, j in q2j_align.items() if j > fpos + ) return [query_cpos, query_fpos], v_mismatches, j_mismatches def get_top_blast_hit_with_allele_sorting(hits): - ''' In the case of ties, prefer lower allele numbers - ''' + """In the case of ties, prefer lower allele numbers""" top_bitscore = max(hits.bitscore) - top_hits = hits[hits.bitscore==top_bitscore].copy() - #print('top_bitscore:', top_bitscore, 'ties:', top_hits.shape[0]) - top_hits['allele'] = top_hits.saccver.str.split('*').str.get(-1).astype(int) - return top_hits.sort_values('allele').iloc[0].copy() + top_hits = hits[hits.bitscore == top_bitscore].copy() + # print('top_bitscore:', top_bitscore, 'ties:', top_hits.shape[0]) + top_hits["allele"] = ( + top_hits.saccver.str.split("*").str.get(-1).astype(int) + ) + return top_hits.sort_values("allele").iloc[0].copy() + def parse_tcr_sequence(organism, chain, sequence): - ''' return dict with keys + """return dict with keys * cdr_loops * core_positions * v_gene @@ -199,16 +218,16 @@ def parse_tcr_sequence(organism, chain, sequence): returns None upon parse failure - ''' - assert chain in 'AB' + """ + assert chain in "AB" - tmpdbfile = get_blast_db_path(organism, chain, 'V') + tmpdbfile = get_blast_db_path(organism, chain, "V") if not exists(tmpdbfile): - make_blast_dbs() # only need to do this once + make_blast_dbs() # only need to do this once assert exists(tmpdbfile) top_hits = [] - for vj in 'VJ': + for vj in "VJ": dbfile = get_blast_db_path(organism, chain, vj) hits = blast_sequence_and_read_hits(sequence, dbfile) @@ -225,11 +244,21 @@ def parse_tcr_sequence(organism, chain, sequence): j_gene = j_hit.saccver cdr3_bounds, v_mismatches, j_mismatches = parse_cdr3( - organism, chain, sequence, v_gene, j_gene, q2v_align, q2j_align, + organism, + chain, + sequence, + v_gene, + j_gene, + q2v_align, + q2j_align, ) other_cdr_bounds, other_cdr_mismatches = parse_other_cdrs( - organism, chain, sequence, v_gene, q2v_align, + organism, + chain, + sequence, + v_gene, + q2v_align, ) core_positions, core_mismatches = parse_core_positions( @@ -242,21 +271,21 @@ def parse_tcr_sequence(organism, chain, sequence): cdr_loops = other_cdr_bounds + [cdr3_bounds] if any(None in bounds for bounds in cdr_loops): - return {} # signal failure ### EARLY RETURN!!! + return {} # signal failure ### EARLY RETURN!!! result = { - 'cdr_loops':cdr_loops, - 'core_positions':core_positions, - 'v_gene': v_gene, - 'j_gene': j_gene, - 'mismatches':{ - 'v_mismatches':v_mismatches, - 'j_mismatches':j_mismatches, - 'other_cdr_mismatches':other_cdr_mismatches, - 'core_mismatches':core_mismatches, + "cdr_loops": cdr_loops, + "core_positions": core_positions, + "v_gene": v_gene, + "j_gene": j_gene, + "mismatches": { + "v_mismatches": v_mismatches, + "j_mismatches": j_mismatches, + "other_cdr_mismatches": other_cdr_mismatches, + "core_mismatches": core_mismatches, }, } return result else: - print('failed to find v and j matches') + print("failed to find v and j matches") return None From b8659176af42351b9a9c02a7cf496fa82c07f5c5 Mon Sep 17 00:00:00 2001 From: Lawson Date: Fri, 13 Jun 2025 14:00:09 -0700 Subject: [PATCH 17/19] fix subdir paths --- pyproject.toml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d7941d4..9860c78 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,8 +14,9 @@ dependencies = [ "matplotlib", ] -[tool.setuptools] -packages = ["tcrdock"] +[tool.setuptools.packages.find] +where = ["."] +include = ["tcrdock*"] [tool.setuptools.package-data] tcrdock = [ From 5532b115dc3101bd99fff54b19c6e28d1dd01b9e Mon Sep 17 00:00:00 2001 From: Lawson Date: Fri, 13 Jun 2025 16:01:12 -0700 Subject: [PATCH 18/19] typo --- tcrdock/mhc_util.py | 20 +--- tcrdock/sequtil.py | 222 +++++++++++----------------------------- tcrdock/tcrdock_info.py | 16 +-- 3 files changed, 67 insertions(+), 191 deletions(-) diff --git a/tcrdock/mhc_util.py b/tcrdock/mhc_util.py index d9b86b4..04c19af 100644 --- a/tcrdock/mhc_util.py +++ b/tcrdock/mhc_util.py @@ -88,9 +88,7 @@ def get_mhc_core_positions_class2( hit = hits.iloc[0] blast_align = blast.setup_query_to_hit_map(hit) if hit.pident < 99.99: - print( - "get_mhc_core_positions_class2:", ab, hit.saccver, hit.pident - ) + print("get_mhc_core_positions_class2:", ab, hit.saccver, hit.pident) alfas = sequtil.mhc_class_2_alfas[ab][hit.saccver] hitseq = alfas.replace(ALL_GENES_GAP_CHAR, "") @@ -184,9 +182,7 @@ def get_mhc_stub( # current_x = stub.rotation().col_x() # was hardcoding pep_chain to 2 here... chainbounds = pose["chainbounds"] - pep_positions = list( - range(chainbounds[pep_chain], chainbounds[pep_chain + 1]) - ) + pep_positions = list(range(chainbounds[pep_chain], chainbounds[pep_chain + 1])) pep_centroid = np.mean(pose["ca_coords"][pep_positions], axis=0) stub = superimpose.get_symmetry_stub_from_positions( @@ -213,9 +209,7 @@ def orient_pmhc_pose( mhc_core_positions = get_mhc_core_positions_class1(cs[0]) else: assert mhc_class == 2 - mhc_core_positions = get_mhc_core_positions_class2( - cs[0], cs[1] - ) + mhc_core_positions = get_mhc_core_positions_class2(cs[0], cs[1]) else: mhc_core_positions = tdinfo.mhc_core @@ -247,8 +241,7 @@ def make_sorting_tuple(allele): def get_mhc_allele(seq, organism, return_identity=False): if organism == "human": dbfile = ( - util.path_to_db - / "hla_prot_plus_trimmed_minus_funny_w_CD1s_nr_v2.fasta" + util.path_to_db / "hla_prot_plus_trimmed_minus_funny_w_CD1s_nr_v2.fasta" ) else: assert organism == "mouse" @@ -259,10 +252,7 @@ def get_mhc_allele(seq, organism, return_identity=False): top_hits = hits[hits.bitscore == top_bitscore].reset_index() if top_hits.shape[0] > 1: # ties # print('ties:', top_hits.shape[0]) - sortl = [ - (make_sorting_tuple(a), a, i) - for i, a in enumerate(top_hits.saccver) - ] + sortl = [(make_sorting_tuple(a), a, i) for i, a in enumerate(top_hits.saccver)] sortl.sort() # print('sorted:', sortl) ind = sortl[0][-1] diff --git a/tcrdock/sequtil.py b/tcrdock/sequtil.py index 8a852bb..3f8f88b 100644 --- a/tcrdock/sequtil.py +++ b/tcrdock/sequtil.py @@ -34,12 +34,8 @@ human_structure_alignments.set_index("v_gene", drop=True, inplace=True) # human and mouse -both_structure_alignments = pd.read_table( - path_to_db / "new_both_vg_alignments_v1.tsv" -) -both_structure_alignments.set_index( - ["organism", "v_gene"], drop=True, inplace=True -) +both_structure_alignments = pd.read_table(path_to_db / "new_both_vg_alignments_v1.tsv") +both_structure_alignments.set_index(["organism", "v_gene"], drop=True, inplace=True) def read_fasta(filename): # helper @@ -159,9 +155,7 @@ def read_fasta(filename): # helper all_template_poses = {TCR: {}, PMHC: {}, TERNARY: {}} BAD_DGEOM_PDBIDS = "5sws 7jwi 4jry 4nhu 3tjh 4y19 4y1a 1ymm 2wbj 6uz1".split() -BAD_PMHC_PDBIDS = ( - "3rgv 4ms8 6v1a 6v19 6v18 6v15 6v13 6v0y 2uwe 2jcc 2j8u 1lp9".split() -) +BAD_PMHC_PDBIDS = "3rgv 4ms8 6v1a 6v19 6v18 6v15 6v13 6v0y 2uwe 2jcc 2j8u 1lp9".split() # (new) 6uz1 is engineered and binds down by B2M # 3rgv has cterm of peptide out of groove # 4ms8 has chainbreaks (should check for those!) @@ -191,9 +185,7 @@ def read_fasta(filename): # helper def get_tcrdister(organism): if organism not in _cached_tcrdisters: - _cached_tcrdisters[organism] = tcrdist.tcr_distances.TcrDistCalculator( - organism - ) + _cached_tcrdisters[organism] = tcrdist.tcr_distances.TcrDistCalculator(organism) return _cached_tcrdisters[organism] @@ -207,7 +199,7 @@ def blosum_align( ): """return 0-indexed dictionary mapping from seq1 to seq2 positions""" - scorematrix = substitution_matrix.load("BLOSUM62") + scorematrix = substitution_matrices.load("BLOSUM62") if global_align: alignments = pairwise2.align.globalds( @@ -294,9 +286,7 @@ def get_j_seq_after_cdr3(organism, j_gene): for organism in extra_alignment_columns_1x: for ab, xcols in extra_alignment_columns_1x[organism].items(): positions = [ - x - - 1 - + sum(y <= x for y in xcols) # this is not quite right but it wrks + x - 1 + sum(y <= x for y in xcols) # this is not quite right but it wrks for x in core_positions_generic_1x ] @@ -336,16 +326,12 @@ def align_chainseq_to_imgt_msa(organism, chainseq, v_gene): chainseq_to_geneseq = blosum_align(chainseq, geneseq) - chainseq_to_alseq = { - i: geneseq_to_alseq[j] for i, j in chainseq_to_geneseq.items() - } + chainseq_to_alseq = {i: geneseq_to_alseq[j] for i, j in chainseq_to_geneseq.items()} return chainseq_to_alseq -def align_chainseq_to_structure_msa( - organism, chainseq, v_gene, msa_type="both" -): +def align_chainseq_to_structure_msa(organism, chainseq, v_gene, msa_type="both"): assert msa_type in ["both", "human"] if msa_type == "human": @@ -369,9 +355,7 @@ def align_chainseq_to_structure_msa( chainseq_to_geneseq = blosum_align(chainseq, geneseq) - chainseq_to_alseq = { - i: geneseq_to_alseq[j] for i, j in chainseq_to_geneseq.items() - } + chainseq_to_alseq = {i: geneseq_to_alseq[j] for i, j in chainseq_to_geneseq.items()} return chainseq_to_alseq @@ -388,9 +372,7 @@ def align_tcr_info_pdb_chain_to_structure_msa(pdbid, ab, msa_type_in): "both": {"A": {}, "B": {}}, "human": {"A": {}, "B": {}}, } - print( - "setting up cache for align_tcr_info_pdb_chain_to_structure_msa function" - ) + print("setting up cache for align_tcr_info_pdb_chain_to_structure_msa function") for l in tcr_info.itertuples(): geneseq = get_v_seq_up_to_cys(l.organism, l.v_gene) for msa_type in ["both", "human"]: @@ -411,12 +393,9 @@ def align_tcr_info_pdb_chain_to_structure_msa(pdbid, ab, msa_type_in): chainseq_to_geneseq = blosum_align(l.chainseq, geneseq) chainseq_to_alseq = { - i: geneseq_to_alseq[j] - for i, j in chainseq_to_geneseq.items() + i: geneseq_to_alseq[j] for i, j in chainseq_to_geneseq.items() } - _tcr_alignment_cache[msa_type][l.ab][ - l.pdbid - ] = chainseq_to_alseq + _tcr_alignment_cache[msa_type][l.ab][l.pdbid] = chainseq_to_alseq print( "DONE setting up cache for align_tcr_info_pdb_chain_to_structure_msa", @@ -463,9 +442,7 @@ def get_tcr_chain_trim_positions(organism, chainseq, v, j, cdr3): print("N None:", organism, v) geneseq_positions.append(None) else: - geneseq_positions.append( - pos - alseq[:pos].count(ALL_GENES_GAP_CHAR) - ) + geneseq_positions.append(pos - alseq[:pos].count(ALL_GENES_GAP_CHAR)) start, stop = core_positions_0x[0], core_positions_0x[9] # inclusive! geneseq_positions.extend(range(start, stop + 1)) @@ -529,9 +506,7 @@ def align_vgene_to_template_pdb_chain( # align V regions msa_type = ( - "human" - if trg_organism == "human" and tmp_organism == "human" - else "both" + "human" if trg_organism == "human" and tmp_organism == "human" else "both" ) trg_vseq_to_alseq = trg_msa_alignments[msa_type] @@ -678,8 +653,7 @@ def get_mhc_class_1_alseq(allele): for k in mhc_class_1_alfas: if k.startswith(allele) and k[len(allele)] == ":": suffix = [ - int(x) if x.isdigit() else 100 - for x in k[len(allele) + 1 :].split(":") + int(x) if x.isdigit() else 100 for x in k[len(allele) + 1 :].split(":") ] sortl.append((suffix, k)) if sortl: @@ -702,8 +676,7 @@ def get_mhc_class_2_alseq(chain, allele): for k in mhc_class_2_alfas[chain]: if k.startswith(allele) and k[len(allele)] == ":": suffix = [ - int(x) if x.isdigit() else 100 - for x in k[len(allele) + 1 :].split(":") + int(x) if x.isdigit() else 100 for x in k[len(allele) + 1 :].split(":") ] sortl.append((suffix, k)) if sortl: @@ -854,8 +827,7 @@ def get_clean_and_nonredundant_ternary_tcrs_df( } tcr_tuples = [ - ((l.va, l.ja, l.cdr3a), (l.vb, l.jb, l.cdr3b)) - for l in tcrs.itertuples() + ((l.va, l.ja, l.cdr3a), (l.vb, l.jb, l.cdr3b)) for l in tcrs.itertuples() ] is_redundant = [False] * tcrs.shape[0] @@ -868,19 +840,12 @@ def get_clean_and_nonredundant_ternary_tcrs_df( continue pep_mms = count_peptide_mismatches(irow.pep_seq, jrow.pep_seq) - if ( - irow.organism == jrow.organism - and irow.mhc_class == jrow.mhc_class - ): + if irow.organism == jrow.organism and irow.mhc_class == jrow.mhc_class: tcrd = tdist[irow.organism](tcr_tuples[i], tcr_tuples[j]) pep_red = pep_mms < min_peptide_mismatches tcr_red = tcrd < min_tcrdist - if ( - peptide_tcrdist_logical == "or" - and (pep_red or tcr_red) - ) or ( - peptide_tcrdist_logical == "and" - and (pep_red and tcr_red) + if (peptide_tcrdist_logical == "or" and (pep_red or tcr_red)) or ( + peptide_tcrdist_logical == "and" and (pep_red and tcr_red) ): # redundant @@ -980,10 +945,7 @@ def filter_templates_by_peptide_mismatches( templates["filt_peptide_mismatches"] = np.array( [ - min( - count_peptide_mismatches(p, l.pep_seq) - for p in peptides_for_filtering - ) + min(count_peptide_mismatches(p, l.pep_seq) for p in peptides_for_filtering) for l in templates.itertuples() ] ) @@ -1032,9 +994,7 @@ def pick_dgeom_templates( if peptides_for_filtering is None: peptides_for_filtering = [] elif peptides_for_filtering: - assert ( - peptide in peptides_for_filtering - ) # sanity? seems like should be true + assert peptide in peptides_for_filtering # sanity? seems like should be true if exclude_pdbids is None: exclude_pdbids = [] @@ -1084,10 +1044,7 @@ def pick_dgeom_templates( # currently unused... templates["peptide_mismatches"] = np.array( - [ - count_peptide_mismatches(peptide, l.pep_seq) - for l in templates.itertuples() - ] + [count_peptide_mismatches(peptide, l.pep_seq) for l in templates.itertuples()] ) def calc_chain_tcrdist(row1, row2, chain, tcrdister=tcrdister): @@ -1346,17 +1303,11 @@ def show_alignment(al, seq1, seq2): if organism == "human": # now adding HLA-E 2022-05-03 assert ( - mhc_allele[0] in "ABCE" - and mhc_allele[1] == "*" - and ":" in mhc_allele + mhc_allele[0] in "ABCE" and mhc_allele[1] == "*" and ":" in mhc_allele ) - mhc_allele = ":".join( - mhc_allele.split(":")[:2] - ) # just the 4 digits + mhc_allele = ":".join(mhc_allele.split(":")[:2]) # just the 4 digits else: - assert ( - mhc_allele.startswith("H2") and mhc_allele in mhc_class_1_alfas - ) + assert mhc_allele.startswith("H2") and mhc_allele in mhc_class_1_alfas # first: MHC part trg_mhc_alseq = get_mhc_class_1_alseq(mhc_allele) @@ -1408,13 +1359,9 @@ def show_alignment(al, seq1, seq2): alt_self_peptides, ) continue - assert ( - len(peptide) - pep_idents >= min_pmhc_peptide_mismatches - ) # sanity + assert len(peptide) - pep_idents >= min_pmhc_peptide_mismatches # sanity total = len(peptide) + len(trg_mhc_seq) - frac = ( - mhc_idents + pep_idents - ) / total - 0.01 * l.mhc_total_chainbreak + frac = (mhc_idents + pep_idents) / total - 0.01 * l.mhc_total_chainbreak sortl.append((frac, l.Index)) sortl.sort(reverse=True) @@ -1469,18 +1416,13 @@ def show_alignment(al, seq1, seq2): trg_pmhc_seq = trg_mhc_seq + peptide tmp_pmhc_seq = tmp_mhc_seq + templatel.pep_seq identities = sum( - trg_pmhc_seq[i] == tmp_pmhc_seq[j] - for i, j in trg_to_tmp.items() + trg_pmhc_seq[i] == tmp_pmhc_seq[j] for i, j in trg_to_tmp.items() ) / len(trg_pmhc_seq) - identities_for_sorting = ( - identities - 0.01 * templatel.mhc_total_chainbreak - ) + identities_for_sorting = identities - 0.01 * templatel.mhc_total_chainbreak assert abs(identities_for_sorting - idents) < 1e-3 if verbose: - print( - f"oldnew mhc_idents: {idents:6.3f} {identities:6.3f} {pdbid}" - ) + print(f"oldnew mhc_idents: {idents:6.3f} {identities:6.3f} {pdbid}") show_alignment(trg_to_tmp, trg_pmhc_seq, tmp_pmhc_seq) pmhc_alignments.append( ( @@ -1531,9 +1473,7 @@ def show_alignment(al, seq1, seq2): [tmp_mhca_alseq, tmp_mhcb_alseq, l.pep_seq], ): assert len(a) == len(b) - idents += sum( - x == y for x, y in zip(a, b) if x != ALL_GENES_GAP_CHAR - ) + idents += sum(x == y for x, y in zip(a, b) if x != ALL_GENES_GAP_CHAR) sortl.append((idents / len(trg_pmhc_seq), l.pdbid)) sortl.sort(reverse=True) max_idents = sortl[0][0] @@ -1581,14 +1521,11 @@ def show_alignment(al, seq1, seq2): } trg_offset = len(trg_mhca_seq) + len(trg_mhcb_seq) tmp_offset = len(tmp_mhca_seq) + len(tmp_mhcb_seq) - al3 = { - i + trg_offset: i + tmp_offset for i in range(CLASS2_PEPLEN) - } + al3 = {i + trg_offset: i + tmp_offset for i in range(CLASS2_PEPLEN)} trg_to_tmp = {**al1, **al2, **al3} tmp_pmhc_seq = tmp_mhca_seq + tmp_mhcb_seq + templatel.pep_seq idents_redo = sum( - trg_pmhc_seq[i] == tmp_pmhc_seq[j] - for i, j in trg_to_tmp.items() + trg_pmhc_seq[i] == tmp_pmhc_seq[j] for i, j in trg_to_tmp.items() ) / len(trg_pmhc_seq) # print(f'oldnew mhc_idents: {idents:6.3f} {idents_redo:6.3f} {pdbid}') assert abs(idents - idents_redo) < 1e-4 @@ -1623,8 +1560,7 @@ def show_alignment(al, seq1, seq2): templates = tcr_info[tcr_info.ab == ab] templates = templates[~templates.pdbid.isin(exclude_pdbids)] template_tcrs = [ - (x.organism[0] + x.v_gene, None, x.cdr3) - for x in templates.itertuples() + (x.organism[0] + x.v_gene, None, x.cdr3) for x in templates.itertuples() ] # templates['v_gene j_gene cdr3'.split()].itertuples(index=False)) closest_tcrs = [ @@ -1646,10 +1582,7 @@ def show_alignment(al, seq1, seq2): if dist < min_single_chain_tcrdist: # print('too close:', dist, trg_v, trg_j, trg_cdr3, template_tcrs[ind]) continue - if ( - closest_dist > big_v_dist - and len(alignments) >= num_templates_per_run - ): + if closest_dist > big_v_dist and len(alignments) >= num_templates_per_run: # print('too far:', closest_dist) break @@ -1674,13 +1607,10 @@ def show_alignment(al, seq1, seq2): # templatel.pdbid, templatel.ab, msa_type) identities = sum( - trg_chainseq[i] == templatel.chainseq[j] - for i, j in trg_to_tmp.items() + trg_chainseq[i] == templatel.chainseq[j] for i, j in trg_to_tmp.items() ) / len(trg_chainseq) - identities_for_sorting = ( - identities + ternary_bonus * templatel.ternary - ) + identities_for_sorting = identities + ternary_bonus * templatel.ternary alignments.append( ( @@ -1741,18 +1671,14 @@ def show_alignment(al, seq1, seq2): peptide, ) exit() - dgeoms = [ - DockingGeometry().from_dict(x) for _, x in dgeom_info.iterrows() - ] + dgeoms = [DockingGeometry().from_dict(x) for _, x in dgeom_info.iterrows()] if num_runs * num_templates_per_run > len(dgeoms): rep_dgeom_indices = np.random.permutation(len(dgeoms)) rep_dgeoms = [dgeoms[x] for x in rep_dgeom_indices] else: dummy_organism = "human" # just used for avg cdr coords - rep_dgeoms, rep_dgeom_indices = ( - docking_geometry.pick_docking_geometry_reps( - dummy_organism, dgeoms, num_runs * num_templates_per_run - ) + rep_dgeoms, rep_dgeom_indices = docking_geometry.pick_docking_geometry_reps( + dummy_organism, dgeoms, num_runs * num_templates_per_run ) elif use_opt_dgeoms: rep_dgeoms = docking_geometry.load_opt_dgeoms(mhc_class) @@ -1782,19 +1708,15 @@ def show_alignment(al, seq1, seq2): exclude_docking_geometry_peptides, min_dgeom_peptide_mismatches, ) - dgeoms = [ - DockingGeometry().from_dict(x) for _, x in dgeom_info.iterrows() - ] + dgeoms = [DockingGeometry().from_dict(x) for _, x in dgeom_info.iterrows()] if num_runs * num_templates_per_run > len(dgeoms): rep_dgeom_indices = np.random.permutation(len(dgeoms)) rep_dgeoms = [dgeoms[x] for x in rep_dgeom_indices] else: dummy_organism = "human" # just used for avg cdr coords - rep_dgeoms, rep_dgeom_indices = ( - docking_geometry.pick_docking_geometry_reps( - dummy_organism, dgeoms, num_runs * num_templates_per_run - ) + rep_dgeoms, rep_dgeom_indices = docking_geometry.pick_docking_geometry_reps( + dummy_organism, dgeoms, num_runs * num_templates_per_run ) # print('dgeoms:', len(dgeoms), len(rep_dgeoms)) @@ -1826,36 +1748,22 @@ def show_alignment(al, seq1, seq2): dgeom_row = dgeom_info.iloc[rep_dgeom_indices[dgeom_repno]] pmhc_pdbid = pmhc_al[1] - pmhc_pose, pmhc_tdinfo = get_template_pose_and_tdinfo( - pmhc_pdbid, PMHC - ) + pmhc_pose, pmhc_tdinfo = get_template_pose_and_tdinfo(pmhc_pdbid, PMHC) tcra_pdbid = tcra_al[1] - tcra_pose, tcra_tdinfo = get_template_pose_and_tdinfo( - tcra_pdbid, TCR - ) + tcra_pose, tcra_tdinfo = get_template_pose_and_tdinfo(tcra_pdbid, TCR) tcrb_pdbid = tcrb_al[1] - tcrb_pose, tcrb_tdinfo = get_template_pose_and_tdinfo( - tcrb_pdbid, TCR - ) + tcrb_pose, tcrb_tdinfo = get_template_pose_and_tdinfo(tcrb_pdbid, TCR) # assert ((mhc_class==1 and pmhc_pose.num_chains() == 4) or # (mhc_class==2 and pmhc_pose.num_chains() == 5)) - assert ( - len(tcra_pose["chains"]) == 2 and len(tcrb_pose["chains"]) == 2 - ) + assert len(tcra_pose["chains"]) == 2 and len(tcrb_pose["chains"]) == 2 # copy tcrb into tcra_pose by superimposing core coords - fix_coords = tcra_pose["ca_coords"][ - tcra_tdinfo.tcr_core[core_len:] - ] - mov_coords = tcrb_pose["ca_coords"][ - tcrb_tdinfo.tcr_core[core_len:] - ] - R, v = superimpose.superimposition_transform( - fix_coords, mov_coords - ) + fix_coords = tcra_pose["ca_coords"][tcra_tdinfo.tcr_core[core_len:]] + mov_coords = tcrb_pose["ca_coords"][tcrb_tdinfo.tcr_core[core_len:]] + R, v = superimpose.superimposition_transform(fix_coords, mov_coords) tcrb_pose = apply_transform_Rx_plus_v(tcrb_pose, R, v) tcra_pose = delete_chains(tcra_pose, [1]) tcra_pose = append_chains(tcra_pose, tcrb_pose, [1]) @@ -1885,9 +1793,7 @@ def show_alignment(al, seq1, seq2): # copy tcr from tcra_pose into pmhc_pose num_pmhc_chains = mhc_class + 1 if len(pmhc_pose["chains"]) > num_pmhc_chains: - del_chains = list( - range(num_pmhc_chains, len(pmhc_pose["chains"])) - ) + del_chains = list(range(num_pmhc_chains, len(pmhc_pose["chains"]))) pmhc_pose = delete_chains(pmhc_pose, del_chains) pmhc_pose = append_chains(pmhc_pose, tcra_pose, [0, 1]) assert len(pmhc_pose["chains"]) == 2 + num_pmhc_chains @@ -1902,9 +1808,7 @@ def show_alignment(al, seq1, seq2): # should be the same as new_tcr_stub! redo_tcr_stub = tcr_util.get_tcr_stub(pmhc_pose, pmhc_tdinfo) v_dev = norm(redo_tcr_stub["origin"] - new_tcr_stub["origin"]) - M_dev = norm( - new_tcr_stub["axes"] @ redo_tcr_stub["axes"].T - np.eye(3) - ) + M_dev = norm(new_tcr_stub["axes"] @ redo_tcr_stub["axes"].T - np.eye(3)) if max(v_dev, M_dev) > 5e-2: print("devs:", v_dev, M_dev) assert v_dev < 5e-2 @@ -1998,9 +1902,7 @@ def show_alignment(al, seq1, seq2): tcrb_v=tcr_info.loc[(tcrb_pdbid, "B"), "v_gene"], tcrb_j=tcr_info.loc[(tcrb_pdbid, "B"), "j_gene"], tcrb_cdr3=tcr_info.loc[(tcrb_pdbid, "B"), "cdr3"], - dgeom_pdbid=( - f"opt{itmp}" if dgeom_row is None else dgeom_row.pdbid - ), + dgeom_pdbid=(f"opt{itmp}" if dgeom_row is None else dgeom_row.pdbid), template_pdbfile=outpdbfile, target_to_template_alignstring=alignstring, identities=identities, @@ -2114,9 +2016,7 @@ def setup_for_alphafold( tcr_db = tcr_db.copy() tcr_db["mhc_peptide"] = ( - tcr_db.mhc.str.replace("*", "", regex=False).str.replace( - ":", "", regex=False - ) + tcr_db.mhc.str.replace("*", "", regex=False).str.replace(":", "", regex=False) + "_" + tcr_db.peptide ) @@ -2135,9 +2035,7 @@ def setup_for_alphafold( # optionally filter to peptide mhc combos with min/max counts ########## if min_pmhc_count is not None or max_pmhc_count is not None: - tcr_db.sort_values( - "mhc_peptide", inplace=True - ) ## IMPORTANT B/C MASKING + tcr_db.sort_values("mhc_peptide", inplace=True) ## IMPORTANT B/C MASKING random.seed(random_seed) # shuffling of epitope tcrs assert min_pmhc_count is not None and max_pmhc_count is not None mhc_peptides = tcr_db.mhc_peptide.drop_duplicates().to_list() @@ -2184,9 +2082,7 @@ def setup_for_alphafold( targets_dfl = [] for index, targetl in tcr_db.iterrows(): - targetid_prefix = ( - f"T{index:05d}_{targetl.mhc_peptide}{targetid_prefix_suffix}" - ) + targetid_prefix = f"T{index:05d}_{targetl.mhc_peptide}{targetid_prefix_suffix}" print("START", index, tcr_db.shape[0], targetid_prefix) outfile_prefix = f"{outdir}{targetid_prefix}" if exclude_self_peptide_docking_geometries: @@ -2253,9 +2149,7 @@ def setup_for_alphafold( print("made:", outfile) -def get_mhc_chain_trim_positions( - chainseq, organism, mhc_class, mhc_allele, chain=None -): +def get_mhc_chain_trim_positions(chainseq, organism, mhc_class, mhc_allele, chain=None): """ """ assert mhc_class in [1, 2] if mhc_class == 2: diff --git a/tcrdock/tcrdock_info.py b/tcrdock/tcrdock_info.py index ee4c1c3..de9b9ba 100644 --- a/tcrdock/tcrdock_info.py +++ b/tcrdock/tcrdock_info.py @@ -73,9 +73,7 @@ def from_sequences( else: assert mhc_class == 2 - self.mhc_core = mhc_util.get_mhc_core_positions_class2( - mhc_aseq, mhc_bseq - ) + self.mhc_core = mhc_util.get_mhc_core_positions_class2(mhc_aseq, mhc_bseq) self.mhc_allele = ( mhc_util.get_mhc_allele(mhc_aseq, organism) + "," @@ -115,9 +113,7 @@ def from_sequences( self.tcr_cdrs.extend( [(int(x[0] + offset), int(x[1] + offset)) for x in cdr_loops] ) - self.tcr_core.extend( - [int(x + offset) for x in res["core_positions"]] - ) + self.tcr_core.extend([int(x + offset) for x in res["core_positions"]]) offset += len(chainseq) self.valid = True # signal success @@ -129,17 +125,13 @@ def renumber(self, old2new): will be bad if old2new doesn't cover all our positions """ - old2new = { - int(x): int(y) for x, y in old2new.items() - } # no int64 in our data! + old2new = {int(x): int(y) for x, y in old2new.items()} # no int64 in our data! if self.mhc_core is not None: self.mhc_core = [old2new[x] for x in self.mhc_core] if self.tcr_core is not None: self.tcr_core = [old2new[x] for x in self.tcr_core] if self.tcr_cdrs is not None: - self.tcr_cdrs = [ - (old2new[x], old2new[y]) for x, y in self.tcr_cdrs - ] + self.tcr_cdrs = [(old2new[x], old2new[y]) for x, y in self.tcr_cdrs] return self def delete_residue_range(self, start, stop): From c5c68cd6c0c48bfa1c2378d5087774e8f3ce996b Mon Sep 17 00:00:00 2001 From: Lawson Date: Mon, 18 Aug 2025 17:11:04 -0700 Subject: [PATCH 19/19] add support for anarci-identified CDRs --- .gitignore | 3 +- tcrdock/tcrdist/parsing.py | 142 ++++++++++++++++++++++++++----------- tcrdock/tcrdock_info.py | 9 ++- 3 files changed, 110 insertions(+), 44 deletions(-) diff --git a/.gitignore b/.gitignore index a99af23..b9398eb 100644 --- a/.gitignore +++ b/.gitignore @@ -18,4 +18,5 @@ ncbi-blast-* **/*.ipynb .vscode .nextflow -build \ No newline at end of file +build +tmp \ No newline at end of file diff --git a/tcrdock/tcrdist/parsing.py b/tcrdock/tcrdist/parsing.py index 7fed30a..7699173 100644 --- a/tcrdock/tcrdist/parsing.py +++ b/tcrdock/tcrdist/parsing.py @@ -12,6 +12,7 @@ from .genetic_code import genetic_code, reverse_genetic_code from . import logo_tools from ..blast import blast_sequence_and_read_hits, setup_query_to_hit_map +import numpy as np def get_blast_db_path(organism, ab, vj): @@ -47,9 +48,7 @@ def get_blast_db_path(organism, ab, vj): for organism in extra_alignment_columns_1x: for ab, xcols in extra_alignment_columns_1x[organism].items(): positions = [ - x - - 1 - + sum(y <= x for y in xcols) # this is not quite right but it wrks + x - 1 + sum(y <= x for y in xcols) # this is not quite right but it wrks for x in core_positions_generic_1x ] @@ -141,9 +140,7 @@ def parse_other_cdrs(organism, ab, qseq, v_gene, q2v_align): stop = alseq2seq[stop_col_1x - 1] ## what aligns to this region in v_hit - q_loop_positions = [ - i for i, j in q2v_align.items() if start <= j <= stop - ] + q_loop_positions = [i for i, j in q2v_align.items() if start <= j <= stop] if q_loop_positions: qstart = min(q_loop_positions) qstop = max(q_loop_positions) @@ -190,18 +187,73 @@ def parse_cdr3(organism, ab, qseq, v_gene, j_gene, q2v_align, q2j_align): return [query_cpos, query_fpos], v_mismatches, j_mismatches +def anarci_parse_cdrs(organism, ab, qseq): + try: + import anarci + except ImportError: + raise ModuleNotFoundError( + "anarci not installed, skipping CDR parsing. Try `conda install -c bioconda anarci`" + ) + + if ab == "A": + allow = set(["A", "D"]) + elif ab == "B": + allow = set(["B"]) + + results = anarci.run_anarci( + [("AAA", qseq)], + scheme="imgt", + allowed_species=["human"], + allow=allow, + ) + if results[1][0] is None: + raise ValueError(f"No domain found for sequence '{qseq}'") + + numbering = results[1][0][0] + sub_start = numbering[1] + sub_stop = numbering[2] + 1 + + # this entire substring will be numbered + # however, there may be gaps in the sequence + # which are given an IMGT number + numbered_substring = qseq[sub_start:sub_stop] + imgt_num = np.zeros((len(range(sub_start, sub_stop)),), dtype=np.int32) + imgt_tuples = numbering[0] + j = 0 + for i in range(len(numbered_substring)): + aa = numbered_substring[i] + while j < len(imgt_tuples) and imgt_tuples[j][1] != aa: + j += 1 + if j < len(imgt_tuples): + imgt_num[i] = imgt_tuples[j][0][0] + j += 1 + + # return list of (start, stop) tuples for CDRs + cdr_tuples = [] + cdr_1 = np.nonzero((imgt_num >= 27) & (imgt_num <= 38))[0] + sub_start + cdr_tuples.append([cdr_1[0], cdr_1[-1]]) + + cdr_2 = np.nonzero((imgt_num >= 50) & (imgt_num <= 65))[0] + sub_start + cdr_tuples.append([cdr_2[0], cdr_2[-1]]) + + cdr_2_5 = np.nonzero((imgt_num >= 81) & (imgt_num <= 86))[0] + sub_start + cdr_tuples.append([cdr_2_5[0], cdr_2_5[-1]]) + + cdr_3 = np.nonzero((imgt_num >= 104) & (imgt_num <= 118))[0] + sub_start + cdr_tuples.append([cdr_3[0], cdr_3[-1]]) + return cdr_tuples + + def get_top_blast_hit_with_allele_sorting(hits): """In the case of ties, prefer lower allele numbers""" top_bitscore = max(hits.bitscore) top_hits = hits[hits.bitscore == top_bitscore].copy() # print('top_bitscore:', top_bitscore, 'ties:', top_hits.shape[0]) - top_hits["allele"] = ( - top_hits.saccver.str.split("*").str.get(-1).astype(int) - ) + top_hits["allele"] = top_hits.saccver.str.split("*").str.get(-1).astype(int) return top_hits.sort_values("allele").iloc[0].copy() -def parse_tcr_sequence(organism, chain, sequence): +def parse_tcr_sequence(organism, chain, sequence, anarci_cdrs=False): """return dict with keys * cdr_loops * core_positions @@ -221,6 +273,8 @@ def parse_tcr_sequence(organism, chain, sequence): """ assert chain in "AB" + result = {"mismatches": {}} + tmpdbfile = get_blast_db_path(organism, chain, "V") if not exists(tmpdbfile): make_blast_dbs() # only need to do this once @@ -243,23 +297,38 @@ def parse_tcr_sequence(organism, chain, sequence): v_gene = v_hit.saccver j_gene = j_hit.saccver - cdr3_bounds, v_mismatches, j_mismatches = parse_cdr3( - organism, - chain, - sequence, - v_gene, - j_gene, - q2v_align, - q2j_align, - ) + if anarci_cdrs: + cdr_loops = anarci_parse_cdrs(organism, chain, sequence) + result["cdr_loops"] = cdr_loops - other_cdr_bounds, other_cdr_mismatches = parse_other_cdrs( - organism, - chain, - sequence, - v_gene, - q2v_align, - ) + else: + cdr3_bounds, v_mismatches, j_mismatches = parse_cdr3( + organism, + chain, + sequence, + v_gene, + j_gene, + q2v_align, + q2j_align, + ) + + other_cdr_bounds, other_cdr_mismatches = parse_other_cdrs( + organism, + chain, + sequence, + v_gene, + q2v_align, + ) + + cdr_loops = other_cdr_bounds + [cdr3_bounds] + + if any(None in bounds for bounds in cdr_loops): + return {} # signal failure ### EARLY RETURN!!! + + result["cdr_loops"] = cdr_loops + result["mismatches"]["other_cdr_mismatches"] = other_cdr_mismatches + result["mismatches"]["v_mismatches"] = v_mismatches + result["mismatches"]["j_mismatches"] = j_mismatches core_positions, core_mismatches = parse_core_positions( organism, chain, sequence, v_gene, q2v_align @@ -269,22 +338,11 @@ def parse_tcr_sequence(organism, chain, sequence): # fail return {} - cdr_loops = other_cdr_bounds + [cdr3_bounds] - if any(None in bounds for bounds in cdr_loops): - return {} # signal failure ### EARLY RETURN!!! - - result = { - "cdr_loops": cdr_loops, - "core_positions": core_positions, - "v_gene": v_gene, - "j_gene": j_gene, - "mismatches": { - "v_mismatches": v_mismatches, - "j_mismatches": j_mismatches, - "other_cdr_mismatches": other_cdr_mismatches, - "core_mismatches": core_mismatches, - }, - } + result["core_positions"] = core_positions + result["v_gene"] = v_gene + result["j_gene"] = j_gene + result["mismatches"]["core_mismatches"] = core_mismatches + return result else: print("failed to find v and j matches") diff --git a/tcrdock/tcrdock_info.py b/tcrdock/tcrdock_info.py index de9b9ba..b890a11 100644 --- a/tcrdock/tcrdock_info.py +++ b/tcrdock/tcrdock_info.py @@ -51,6 +51,7 @@ def from_sequences( pep_seq, # '' if tcr_only tcr_aseq, tcr_bseq, + anarci_cdrs=False, ): self.valid = False self.organism = organism @@ -93,7 +94,9 @@ def from_sequences( self.tcr_core = [] self.tcr = [] for chain, chainseq in zip("AB", [tcr_aseq, tcr_bseq]): - res = tcrdist.parsing.parse_tcr_sequence(organism, chain, chainseq) + res = tcrdist.parsing.parse_tcr_sequence( + organism, chain, chainseq, anarci_cdrs=anarci_cdrs + ) if not res: other_organism = "human" if organism == "mouse" else "mouse" res = tcrdist.parsing.parse_tcr_sequence( @@ -103,6 +106,7 @@ def from_sequences( # parse failure print("TCRdockInfo:: TCR parse fail!") return self ########## early return + cdr_loops = res["cdr_loops"] cdr3_start, cdr3_end = cdr_loops[-1] cdr3 = chainseq[cdr3_start : cdr3_end + 1] @@ -113,6 +117,9 @@ def from_sequences( self.tcr_cdrs.extend( [(int(x[0] + offset), int(x[1] + offset)) for x in cdr_loops] ) + + self.tcr.append((res["v_gene"], res["j_gene"], None)) + self.tcr_core.extend([int(x + offset) for x in res["core_positions"]]) offset += len(chainseq)