Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,6 @@ env/*
.cache
.pytest_cache
.idea
appenv
appenv
*.out
*.err
6 changes: 3 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@ isort==4.3.18
Jinja2==2.10.1
lazy-object-proxy==1.3.1
lockfile==0.12.2
luigi==2.8.3
luigi==3.0.0
MarkupSafe==1.1.1
mccabe==0.6.1
more-itertools==7.0.0
mysql-connector-python==8.0.16
numpy==1.16.3
numpy==1.24.4
packaging==19.0
pluggy==0.9.0
protobuf==3.7.1
Expand All @@ -44,7 +44,7 @@ sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==1.0.2
sphinxcontrib-serializinghtml==1.1.3
tornado==4.5.3
typed-ast==1.3.5
typed-ast==1.5.4
urllib3==1.24.3
wget==3.2
wrapt==1.11.1
22 changes: 11 additions & 11 deletions tark-refseq-loader/conf/refseq_source.ini
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
# update config file with latest refseq annotation info
[DEFAULT]
source=2
shortname=109_20190905
release_date=xxxx-xx-xx
shortname=GCF_000001405_20250806
release_date=2025-08-06
assembly_name=GRCh38
assembly_id=1001
description=Refseq Homo sapiens Annotation Release 109.20190905
ftp_root=https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/annotation_releases/109.20190905/GCF_000001405.39_GRCh38.p13/
gff_file=GCF_000001405.39_GRCh38.p13_genomic.gff.gz
fasta_file=GCF_000001405.39_GRCh38.p13_rna.fna.gz
protein_file=GCF_000001405.39_GRCh38.p13_protein.faa.gz
assembly_id=1
description=Refseq Homo sapiens Annotation Release GCF_000001405.20250806
ftp_root=https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/annotation_releases/GCF_000001405.40-RS_2025_08/
gff_file=GCF_000001405.40_GRCh38.p14_genomic.gff.gz
fasta_file=GCF_000001405.40_GRCh38.p14_rna.fna.gz
protein_file=GCF_000001405.40_GRCh38.p14_protein.faa.gz


[DATABASE]
host = 0.0.0.0
port = 3306
host = xxx
port = xxx
user = xxx
pass = xxx
database = tark_luigi
database = ensembl_tark_e75_to_e115
76 changes: 69 additions & 7 deletions tark-refseq-loader/handlers/refseq/databasehandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@
See the License for the specific language governing permissions and
limitations under the License.
"""

from __future__ import print_function
import re
from typing import Any, Dict
from datetime import datetime
from handlers.refseq.confighandler import ConfigHandler
from handlers.refseq.checksumhandler import ChecksumHandler
Expand All @@ -28,6 +29,52 @@
# Get an instance of a logger
logger = logging.getLogger(__name__)

def _to_int_pos(v: Any) -> int:
    """
    Convert a coordinate-like value to a plain built-in ``int``.

    Accepts:
    - int, including int subclasses (these are down-cast to the built-in type)
    - objects with `.position` (ExactPosition, BeforePosition, AfterPosition, BetweenPosition)
    - strings like "107", "-1", "ExactPosition(107)"; the first (optionally
      negative) run of digits is used
    - None, which is returned unchanged so the caller can decide what to do

    Raises:
        ValueError: if the value is of any other type, or is a string that
            contains no digits.
    """
    # Let None pass through; caller can decide
    if v is None:
        return v
    if isinstance(v, int):
        # int(v) strips int subclasses down to the built-in type; returning v
        # itself would hand the subclass instance back unchanged.
        # NOTE(review): presumably the DB driver converts parameters by exact
        # type, so a subclass instance would be rejected - confirm.
        return int(v)
    if hasattr(v, "position"):
        return int(v.position)
    if isinstance(v, str):
        # Keep a leading minus sign so signed values such as a "-1" strand
        # are not silently flipped to positive.
        m = re.search(r"-?\d+", v)
        if m:
            return int(m.group(0))
    raise ValueError(f"Unrecognised coordinate type for integer field: {type(v)} -> {v!r}")

# Column names whose values are coerced to int before an insert is executed.
# Strand/phase/frame/rank and the various *_id columns are deliberately left
# out of this set (only "loc_strand" is listed explicitly).
_INTEGERISH_KEYS = {
    # location columns
    "loc_start",
    "loc_end",
    "loc_strand",
    # sequence-region columns
    "seq_region_start",
    "seq_region_end",
    # exon columns
    "exon_start",
    "exon_end",
    "exon_order",
    # CDS columns
    "cds_start",
    "cds_end",
    "cds_order",
    # generic coordinate columns
    "start",
    "end",
}

def _normalise_insert_payload(insert_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Return a shallow copy of ``insert_data`` with integer-like fields coerced to int.

    A value is passed through ``_to_int_pos`` when it is not None and its key
    is either listed in ``_INTEGERISH_KEYS`` or ends with "_start" / "_end".
    Every other entry is copied as-is, so text columns are not touched.
    The input mapping itself is never modified.
    """
    def _is_integer_field(key: str) -> bool:
        # Key-name heuristic: explicit allow-list first, then coordinate suffixes.
        return key in _INTEGERISH_KEYS or key.endswith(("_start", "_end"))

    return {
        key: _to_int_pos(value) if value is not None and _is_integer_field(key) else value
        for key, value in dict(insert_data).items()
    }

class DatabaseHandler(object):

Expand Down Expand Up @@ -415,9 +462,12 @@ def insert_data(self, insert_sql, insert_data, FOREIGN_KEY_CHECKS=1):
insert_sql = insert_sql.replace(sql_str, sql_str_to_replace, 1)

row_id = None
try:
connection_pool = self.dbc
cursor = connection_pool.cursor()
connection_pool = self.dbc
cursor = connection_pool.cursor()
try:
# 🔒 Normalise payload before execution
insert_data = _normalise_insert_payload(insert_data)

if FOREIGN_KEY_CHECKS == 0:
cursor.execute("SET FOREIGN_KEY_CHECKS=0")

Expand All @@ -427,12 +477,24 @@ def insert_data(self, insert_sql, insert_data, FOREIGN_KEY_CHECKS=1):
row_id = cursor.lastrowid

cursor = connection_pool.cursor()
if FOREIGN_KEY_CHECKS == 0:
cursor.execute("SET FOREIGN_KEY_CHECKS=1")
# if FOREIGN_KEY_CHECKS == 0:
# cursor.execute("SET FOREIGN_KEY_CHECKS=1")

except Exception as e:
# Proper error handling: rollback & re-raise (do NOT exit)
try:
connection_pool.rollback()
except Exception:
pass
print('Failed to insert: ' + str(e))
print(insert_sql)
exit(0)
print("DATA:", insert_data)
raise
finally:
try:
if FOREIGN_KEY_CHECKS == 0:
cursor.execute("SET FOREIGN_KEY_CHECKS=1")
finally:
cursor.close()

return row_id
67 changes: 63 additions & 4 deletions tark-refseq-loader/handlers/refseq/utils/exon_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,74 @@
"""


import re
from typing import Any, Dict, List


class ExonUtils(object):

@staticmethod
def _to_int_pos(v: Any) -> int:
    """
    Normalize a coordinate value to a plain built-in int.

    Accepts:
    - int, including int subclasses (these are down-cast to the built-in type)
    - Biopython Position objects (ExactPosition, BeforePosition, AfterPosition, BetweenPosition),
      which expose `.position`
    - objects with `.start` and `.end` that each expose `.position`
      (e.g. a FeatureLocation passed by mistake); the start edge is used
    - strings like "10742191" or "ExactPosition(10742191)"; the first run of
      digits is used

    Raises:
        ValueError: for any other value, including None and strings that
            contain no digits.
    """
    # Already an int: int(v) guarantees the built-in type even for subclasses,
    # so the "plain int" contract holds for every accepted input.
    if isinstance(v, int):
        return int(v)

    # Position-like object exposing `.position`
    if hasattr(v, "position"):
        try:
            return int(v.position)
        except (TypeError, ValueError):
            # `.position` is not convertible; fall through to the other forms
            pass

    # A full location slipped in: callers should pass a single edge, but be
    # forgiving and take the start edge. `.position` on v itself is either
    # absent or already failed above, so it is not retried here.
    if hasattr(v, "start") and hasattr(v.start, "position") and hasattr(v, "end") and hasattr(v.end, "position"):
        return int(v.start.position)

    # Plain string or stringified Biopython repr
    if isinstance(v, str):
        m = re.search(r"\d+", v)
        if m:
            return int(m.group(0))

    raise ValueError(f"Unrecognised coordinate type for exon position: {type(v)} -> {v!r}")

@classmethod
def compute_exon_coordinates(cls, exons):
updated_exon_list = []
def compute_exon_coordinates(cls, exons: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Compute relative exon coordinates, preserving the original logic:
- normalise exon_start/exon_end to ints first
- exon 1 starts at 1
- each subsequent exon starts at previous_exon_end + 1
- exon length = (end - start)
"""
updated_exon_list: List[Dict[str, Any]] = []
exon_end = 0

for original_exon in exons:
exon = original_exon.copy()
start = int(exon['exon_start'])
end = int(exon['exon_end'])

start = cls._to_int_pos(exon.get('exon_start'))
end = cls._to_int_pos(exon.get('exon_end'))

# Defensive checks
if end < start:
raise ValueError(f"Exon end < start after normalisation: start={start}, end={end}, exon={original_exon}")

exon_start = int(exon_end) + 1
exon_end = int(exon_start) + (end - start)
Expand All @@ -34,3 +92,4 @@ def compute_exon_coordinates(cls, exons):
updated_exon_list.append(exon)

return updated_exon_list