From 4d809b8b3ceb4003d763ad65999bdaf93227fe5c Mon Sep 17 00:00:00 2001 From: Shane Giles Date: Thu, 2 Jan 2025 09:50:26 -0700 Subject: [PATCH 1/2] feat(IPVC-3129): add source column to translation exception --- ...031_add_source_to_translation_exception.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 src/alembic/versions/8f20be3a9031_add_source_to_translation_exception.py diff --git a/src/alembic/versions/8f20be3a9031_add_source_to_translation_exception.py b/src/alembic/versions/8f20be3a9031_add_source_to_translation_exception.py new file mode 100644 index 0000000..691c5eb --- /dev/null +++ b/src/alembic/versions/8f20be3a9031_add_source_to_translation_exception.py @@ -0,0 +1,30 @@ +"""add source to translation exception + +Revision ID: 8f20be3a9031 +Revises: 77076df4224c +Create Date: 2024-12-31 18:19:11.827603 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '8f20be3a9031' +down_revision: Union[str, None] = '77076df4224c' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('translation_exception', sa.Column('source', sa.Text(), server_default='NCBI', nullable=False), schema='uta') + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column('translation_exception', 'source', schema='uta') + # ### end Alembic commands ### From aab86ec634c5809d8f84f7a486eff3fd1991bee9 Mon Sep 17 00:00:00 2001 From: Shane Giles Date: Thu, 2 Jan 2025 10:15:18 -0700 Subject: [PATCH 2/2] feat(IPVC-3129): model changes and updates to loading check-transl-except method --- sbin/uta-diff | 4 +++- src/uta/loading.py | 56 +++++++++++++++++++++++++++++++++++++++++----- src/uta/models.py | 3 ++- 3 files changed, 55 insertions(+), 8 deletions(-) diff --git a/sbin/uta-diff b/sbin/uta-diff index d89798d..b346240 100755 --- a/sbin/uta-diff +++ b/sbin/uta-diff @@ -17,6 +17,7 @@ cmp_cols.update({ "gene": "gene_id".split(), "seq_anno": "seq_anno_id seq_id origin_id ac added".split(), "transcript": "ac".split(), + "translation_exception": "tx_ac start_position end_position amino_acid".split(), }) @@ -47,7 +48,8 @@ if __name__ == "__main__": url = "postgresql://uta_admin@localhost/uta" tables = ["associated_accessions", "exon", "exon_aln", "exon_set", - "gene", "meta", "origin", "seq", "seq_anno", "transcript",] + "gene", "meta", "origin", "seq", "seq_anno", "transcript", + "translation_exception",] s1, s2 = sys.argv[1:3] con = psycopg2.connect(url) diff --git a/src/uta/loading.py b/src/uta/loading.py index d91fe23..b6fc8f5 100644 --- a/src/uta/loading.py +++ b/src/uta/loading.py @@ -13,7 +13,7 @@ from biocommons.seqrepo import SeqRepo from bioutils.coordinates import strand_pm_to_int, MINUS_STRAND from bioutils.digests import seq_md5 -from bioutils.sequences import reverse_complement, translate_cds +from bioutils.sequences import reverse_complement, translate_cds, aa_to_aa3 from sqlalchemy.exc import IntegrityError from sqlalchemy.orm import Session from sqlalchemy.orm.exc import NoResultFound @@ -209,7 +209,7 @@ def check_transcripts(session: Session, opts: Dict, cf: ConfigParser): def check_transl_except(session, opts, cf): """ Find transcripts in the given transcript file which are in the given UTA database version 
- and do not have transl_except entries when they should. + and do not have transl_except entries when they should, and add them. """ # required opts transcript_file = opts['TRANSCRIPT_FILE'] @@ -247,12 +247,12 @@ def transcript_iterator() -> Generator[Tuple[str, int, int, Optional[str]], None yield transcript.ac, transcript.cds_start_i, transcript.cds_end_i, protein_ac - result_transcripts = set() + result_transl_excepts = set() warning_transcripts = set() sf = _get_seqfetcher(cf) for i, (transcript_ac, transcript_cds_start_i, transcript_cds_end_i, protein_ac) in enumerate(transcript_iterator()): if i % 1000 == 0: - print(f'Progress: {i}') + logger.info(f'Progress: {i}') if protein_ac is None: warning_transcripts.add(transcript_ac) @@ -281,10 +281,54 @@ def transcript_iterator() -> Generator[Tuple[str, int, int, Optional[str]], None continue if protein_seq != translated_protein_seq: - result_transcripts.add(transcript_ac) + if len(protein_seq) != len(translated_protein_seq): + logger.warning(f'Protein sequence length mismatch: {protein_ac} {len(protein_seq)} != ' + f'translated {transcript_ac} {len(translated_protein_seq)}') + continue + + for i in range(len(protein_seq)): + if protein_seq[i] != translated_protein_seq[i]: + codon_start_i = (i * 3) + codon_end_i = codon_start_i + 3 + tx_start_i = codon_start_i + transcript_cds_start_i + tx_end_i = codon_end_i + transcript_cds_start_i + transl_except_aa = aa_to_aa3(protein_seq[i]) + if transl_except_aa == "Xaa": + transl_except_aa = "OTHER" + + # add transl_except entry if it does not already exist. 
+ te, created = _get_or_insert( + session=session, + table=usam.TranslationException, + row={ + 'tx_ac': transcript_ac, + 'start_position': tx_start_i, + 'end_position': tx_end_i, + 'amino_acid': transl_except_aa, + 'source': 'Internal', + }, + row_identifier=('tx_ac', 'start_position', 'end_position', 'amino_acid'), + ) + logger.info( + f'Translation exception {"created:" if created else "already exists"}: {transcript_ac} ' + f'{tx_start_i}..{tx_end_i} {transcript_seq[codon_start_i:codon_end_i]} -> {transl_except_aa}' + ) + + result_transl_excepts.add(( + transcript_ac, + f'pos:{codon_start_i}..{codon_end_i}', + f'{transcript_seq[codon_start_i:codon_end_i]}', + f'aa:{transl_except_aa}', + created + )) + + session.commit() + + for tx_ac in warning_transcripts: + logger.warning(f'Warning: {tx_ac} not processed') with open(output_file, 'wt') as output_fp: - output_fp.writelines(f'{t}\n' for t in sorted(result_transcripts)) + output_fp.writelines("\t".join(row) for row in sorted(result_transl_excepts)) def create_schema(session, opts, cf): diff --git a/src/uta/models.py b/src/uta/models.py index 745cdc1..43be6d4 100644 --- a/src/uta/models.py +++ b/src/uta/models.py @@ -18,7 +18,7 @@ # schema name support # also see etc/uta.conf -schema_version = "1.2" +schema_version = "1.3" use_schema = strtobool(os.environ.get('UTA_USE_SCHEMA', 'true')) if use_schema: schema_name = "uta" @@ -162,6 +162,7 @@ class TranslationException(Base): start_position = sa.Column(sa.Integer, nullable=False) end_position = sa.Column(sa.Integer, nullable=False) amino_acid = sa.Column(sa.Text, nullable=False) + source = sa.Column(sa.Text, nullable=False, server_default="NCBI") # relationships: transcript = sao.relationship("Transcript", backref="translation_exceptions")