Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion sbin/uta-diff
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ cmp_cols.update({
"gene": "gene_id".split(),
"seq_anno": "seq_anno_id seq_id origin_id ac added".split(),
"transcript": "ac".split(),
"translation_exception": "tx_ac start_position end_position amino_acid".split(),
})


Expand Down Expand Up @@ -47,7 +48,8 @@ if __name__ == "__main__":

url = "postgresql://uta_admin@localhost/uta"
tables = ["associated_accessions", "exon", "exon_aln", "exon_set",
"gene", "meta", "origin", "seq", "seq_anno", "transcript",]
"gene", "meta", "origin", "seq", "seq_anno", "transcript",
"translation_exception",]

s1, s2 = sys.argv[1:3]
con = psycopg2.connect(url)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""add source to translation exception

Revision ID: 8f20be3a9031
Revises: 77076df4224c
Create Date: 2024-12-31 18:19:11.827603

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = '8f20be3a9031'
down_revision: Union[str, None] = '77076df4224c'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.add_column('translation_exception', sa.Column('source', sa.Text(), server_default='NCBI', nullable=False), schema='uta')
# ### end Alembic commands ###


def downgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.drop_column('translation_exception', 'source', schema='uta')
# ### end Alembic commands ###
56 changes: 50 additions & 6 deletions src/uta/loading.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from biocommons.seqrepo import SeqRepo
from bioutils.coordinates import strand_pm_to_int, MINUS_STRAND
from bioutils.digests import seq_md5
from bioutils.sequences import reverse_complement, translate_cds
from bioutils.sequences import reverse_complement, translate_cds, aa_to_aa3
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session
from sqlalchemy.orm.exc import NoResultFound
Expand Down Expand Up @@ -209,7 +209,7 @@ def check_transcripts(session: Session, opts: Dict, cf: ConfigParser):
def check_transl_except(session, opts, cf):
"""
Find transcripts in the given transcript file which are in the given UTA database version
and do not have transl_except entries when they should.
and do not have transl_except entries when they should, and add them.
"""
# required opts
transcript_file = opts['TRANSCRIPT_FILE']
Expand Down Expand Up @@ -247,12 +247,12 @@ def transcript_iterator() -> Generator[Tuple[str, int, int, Optional[str]], None

yield transcript.ac, transcript.cds_start_i, transcript.cds_end_i, protein_ac

result_transcripts = set()
result_transl_excepts = set()
warning_transcripts = set()
sf = _get_seqfetcher(cf)
for i, (transcript_ac, transcript_cds_start_i, transcript_cds_end_i, protein_ac) in enumerate(transcript_iterator()):
if i % 1000 == 0:
print(f'Progress: {i}')
logger.info(f'Progress: {i}')

if protein_ac is None:
warning_transcripts.add(transcript_ac)
Expand Down Expand Up @@ -281,10 +281,54 @@ def transcript_iterator() -> Generator[Tuple[str, int, int, Optional[str]], None
continue

if protein_seq != translated_protein_seq:
result_transcripts.add(transcript_ac)
if len(protein_seq) != len(translated_protein_seq):
logger.warning(f'Protein sequence length mismatch: {protein_ac} {len(protein_seq)} != '
f'translated {transcript_ac} {len(translated_protein_seq)}')
continue

for i in range(len(protein_seq)):
if protein_seq[i] != translated_protein_seq[i]:
codon_start_i = (i * 3)
codon_end_i = codon_start_i + 3
tx_start_i = codon_start_i + transcript_cds_start_i
tx_end_i = codon_end_i + transcript_cds_start_i
transl_except_aa = aa_to_aa3(protein_seq[i])
if transl_except_aa == "Xaa":
transl_except_aa = "OTHER"

# add transl_except entry if it does not already exist.
te, created = _get_or_insert(
session=session,
table=usam.TranslationException,
row={
'tx_ac': transcript_ac,
'start_position': tx_start_i,
'end_position': tx_end_i,
'amino_acid': transl_except_aa,
'source': 'Internal',
},
row_identifier=('tx_ac', 'start_position', 'end_position', 'amino_acid'),
)
logger.info(
f'Translation exception {"created:" if created else "already exists"}: {transcript_ac} '
f'{tx_start_i}..{tx_end_i} {transcript_seq[codon_start_i:codon_end_i]} -> {transl_except_aa}'
)

result_transl_excepts.add((
transcript_ac,
f'pos:{codon_start_i}..{codon_end_i}',
f'{transcript_seq[codon_start_i:codon_end_i]}',
f'aa:{transl_except_aa}',
created
))

session.commit()

for tx_ac in warning_transcripts:
logger.warning(f'Warning: {tx_ac} not processed')

with open(output_file, 'wt') as output_fp:
output_fp.writelines(f'{t}\n' for t in sorted(result_transcripts))
output_fp.writelines("\t".join(row) for row in sorted(result_transl_excepts))


def create_schema(session, opts, cf):
Expand Down
3 changes: 2 additions & 1 deletion src/uta/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
# schema name support
# also see etc/uta.conf

schema_version = "1.2"
schema_version = "1.3"
use_schema = strtobool(os.environ.get('UTA_USE_SCHEMA', 'true'))
if use_schema:
schema_name = "uta"
Expand Down Expand Up @@ -162,6 +162,7 @@ class TranslationException(Base):
start_position = sa.Column(sa.Integer, nullable=False)
end_position = sa.Column(sa.Integer, nullable=False)
amino_acid = sa.Column(sa.Text, nullable=False)
source = sa.Column(sa.Text, nullable=False, server_default="NCBI")

# relationships:
transcript = sao.relationship("Transcript", backref="translation_exceptions")
Expand Down
Loading