invitae · bsgiles73 · Jan 2, 2025 · Jan 2, 2025
diff --git a/sbin/uta-diff b/sbin/uta-diff
@@ -17,6 +17,7 @@ cmp_cols.update({
     "gene": "gene_id".split(),
     "seq_anno": "seq_anno_id seq_id origin_id ac added".split(),
     "transcript": "ac".split(),
+    "translation_exception": "tx_ac start_position end_position amino_acid".split(),
     })
 
 
@@ -47,7 +48,8 @@ if __name__ == "__main__":
 
     url = "postgresql://uta_admin@localhost/uta"
     tables = ["associated_accessions", "exon", "exon_aln", "exon_set",
-              "gene", "meta", "origin", "seq", "seq_anno", "transcript",]
+              "gene", "meta", "origin", "seq", "seq_anno", "transcript",
+              "translation_exception",]
 
     s1, s2 = sys.argv[1:3]
     con = psycopg2.connect(url)

diff --git a/src/alembic/versions/8f20be3a9031_add_source_to_translation_exception.py b/src/alembic/versions/8f20be3a9031_add_source_to_translation_exception.py
@@ -0,0 +1,30 @@
+"""add source to translation exception
+
+Revision ID: 8f20be3a9031
+Revises: 77076df4224c
+Create Date: 2024-12-31 18:19:11.827603
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = '8f20be3a9031'
+down_revision: Union[str, None] = '77076df4224c'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column('translation_exception', sa.Column('source', sa.Text(), server_default='NCBI', nullable=False), schema='uta')
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column('translation_exception', 'source', schema='uta')
+    # ### end Alembic commands ###
diff --git a/src/uta/loading.py b/src/uta/loading.py
@@ -13,7 +13,7 @@
 from biocommons.seqrepo import SeqRepo
 from bioutils.coordinates import strand_pm_to_int, MINUS_STRAND
 from bioutils.digests import seq_md5
-from bioutils.sequences import reverse_complement, translate_cds
+from bioutils.sequences import reverse_complement, translate_cds, aa_to_aa3
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.orm import Session
 from sqlalchemy.orm.exc import NoResultFound
@@ -209,7 +209,7 @@ def check_transcripts(session: Session, opts: Dict, cf: ConfigParser):
 def check_transl_except(session, opts, cf):
     """
     Find transcripts in the given transcript file which are in the given UTA database version
-    and do not have transl_except entries when they should.
+    and do not have transl_except entries when they should, and add them.
     """
     # required opts
     transcript_file = opts['TRANSCRIPT_FILE']
@@ -247,12 +247,12 @@ def transcript_iterator() -> Generator[Tuple[str, int, int, Optional[str]], None
 
                 yield transcript.ac, transcript.cds_start_i, transcript.cds_end_i, protein_ac
 
-    result_transcripts = set()
+    result_transl_excepts = set()
     warning_transcripts = set()
     sf = _get_seqfetcher(cf)
     for i, (transcript_ac, transcript_cds_start_i, transcript_cds_end_i, protein_ac) in enumerate(transcript_iterator()):
         if i % 1000 == 0:
-            print(f'Progress: {i}')
+            logger.info(f'Progress: {i}')
 
         if protein_ac is None:
             warning_transcripts.add(transcript_ac)
@@ -281,10 +281,54 @@ def transcript_iterator() -> Generator[Tuple[str, int, int, Optional[str]], None
             continue
 
         if protein_seq != translated_protein_seq:
-            result_transcripts.add(transcript_ac)
+            if len(protein_seq) != len(translated_protein_seq):
+                logger.warning(f'Protein sequence length mismatch: {protein_ac} {len(protein_seq)} != '
+                               f'translated {transcript_ac} {len(translated_protein_seq)}')
+                continue
+
+            for i in range(len(protein_seq)):
+                if protein_seq[i] != translated_protein_seq[i]:
+                    codon_start_i = (i * 3)
+                    codon_end_i = codon_start_i + 3
+                    tx_start_i = codon_start_i + transcript_cds_start_i
+                    tx_end_i = codon_end_i + transcript_cds_start_i
+                    transl_except_aa = aa_to_aa3(protein_seq[i])
+                    if transl_except_aa == "Xaa":
+                        transl_except_aa = "OTHER"
+
+                    # add transl_except entry if it does not already exist.
+                    te, created = _get_or_insert(
+                        session=session,
+                        table=usam.TranslationException,
+                        row={
+                            'tx_ac': transcript_ac,
+                            'start_position': tx_start_i,
+                            'end_position': tx_end_i,
+                            'amino_acid': transl_except_aa,
+                            'source': 'Internal',
+                        },
+                        row_identifier=('tx_ac', 'start_position', 'end_position', 'amino_acid'),
+                    )
+                    logger.info(
+                        f'Translation exception {"created:" if created else "already exists"}: {transcript_ac} '
+                        f'{tx_start_i}..{tx_end_i} {transcript_seq[codon_start_i:codon_end_i]} -> {transl_except_aa}'
+                    )
+
+                    result_transl_excepts.add((
+                        transcript_ac,
+                        f'pos:{codon_start_i}..{codon_end_i}',
+                        f'{transcript_seq[codon_start_i:codon_end_i]}',
+                        f'aa:{transl_except_aa}',
+                        created
+                    ))
+
+    session.commit()
+
+    for tx_ac in warning_transcripts:
+        logger.warning(f'Warning: {tx_ac} not processed')
 
     with open(output_file, 'wt') as output_fp:
-        output_fp.writelines(f'{t}\n' for t in sorted(result_transcripts))
+        output_fp.writelines("\t".join(row) for row in sorted(result_transl_excepts))
 
 
 def create_schema(session, opts, cf):

diff --git a/src/uta/models.py b/src/uta/models.py
@@ -18,7 +18,7 @@
 # schema name support
 # also see etc/uta.conf
 
-schema_version = "1.2"
+schema_version = "1.3"
 use_schema = strtobool(os.environ.get('UTA_USE_SCHEMA', 'true'))
 if use_schema:
     schema_name = "uta"
@@ -162,6 +162,7 @@ class TranslationException(Base):
     start_position = sa.Column(sa.Integer, nullable=False)
     end_position = sa.Column(sa.Integer, nullable=False)
     amino_acid = sa.Column(sa.Text, nullable=False)
+    source = sa.Column(sa.Text, nullable=False, server_default="NCBI")
 
     # relationships:
     transcript = sao.relationship("Transcript", backref="translation_exceptions")