diff --git a/.gitignore b/.gitignore index 54b63ae..ff2c3b7 100644 --- a/.gitignore +++ b/.gitignore @@ -16,4 +16,6 @@ env/* .cache .pytest_cache .idea -appenv \ No newline at end of file +appenv +*.out +*.err \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 84286c0..8c423f1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,12 +15,12 @@ isort==4.3.18 Jinja2==2.10.1 lazy-object-proxy==1.3.1 lockfile==0.12.2 -luigi==2.8.3 +luigi==3.0.0 MarkupSafe==1.1.1 mccabe==0.6.1 more-itertools==7.0.0 mysql-connector-python==8.0.16 -numpy==1.16.3 +numpy==1.24.4 packaging==19.0 pluggy==0.9.0 protobuf==3.7.1 @@ -44,7 +44,7 @@ sphinxcontrib-jsmath==1.0.1 sphinxcontrib-qthelp==1.0.2 sphinxcontrib-serializinghtml==1.1.3 tornado==4.5.3 -typed-ast==1.3.5 +typed-ast==1.5.4 urllib3==1.24.3 wget==3.2 wrapt==1.11.1 diff --git a/tark-refseq-loader/conf/refseq_source.ini b/tark-refseq-loader/conf/refseq_source.ini index df23185..2a6d6b5 100644 --- a/tark-refseq-loader/conf/refseq_source.ini +++ b/tark-refseq-loader/conf/refseq_source.ini @@ -1,20 +1,20 @@ # update config file with latest refseq annotation info [DEFAULT] source=2 -shortname=109_20190905 -release_date=xxxx-xx-xx +shortname=GCF_000001405_20250806 +release_date=2025-08-06 assembly_name=GRCh38 -assembly_id=1001 -description=Refseq Homo sapiens Annotation Release 109.20190905 -ftp_root=https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/annotation_releases/109.20190905/GCF_000001405.39_GRCh38.p13/ -gff_file=GCF_000001405.39_GRCh38.p13_genomic.gff.gz -fasta_file=GCF_000001405.39_GRCh38.p13_rna.fna.gz -protein_file=GCF_000001405.39_GRCh38.p13_protein.faa.gz +assembly_id=1 +description=Refseq Homo sapiens Annotation Release GCF_000001405.20250806 +ftp_root=https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/annotation_releases/GCF_000001405.40-RS_2025_08/ +gff_file=GCF_000001405.40_GRCh38.p14_genomic.gff.gz 
# Whole-string decimal integer with optional sign and surrounding whitespace,
# e.g. "-1" or " 107 ".
_SIGNED_INT_RE = re.compile(r"\s*([+-]?\d+)\s*")
# First run of digits anywhere in the text, e.g. the 107 in "ExactPosition(107)".
_FIRST_DIGITS_RE = re.compile(r"\d+")


def _to_int_pos(v: Any) -> "int | None":
    """
    Convert a coordinate-like value to a plain built-in ``int``.

    Accepted inputs:
      - ``None``: returned unchanged so the caller can decide what to do.
      - ``int`` and ``int`` subclasses: narrowed to an exact ``int``.
      - objects exposing ``.position``: ``int(v.position)`` is returned.
      - strings: a purely numeric string (optionally signed, e.g. ``"-1"``)
        is parsed as written; otherwise the first run of digits is used,
        which covers reprs such as ``"ExactPosition(107)"``.
      - any other object implementing ``__index__``.

    Raises:
        ValueError: if ``v`` is none of the above, or a string without digits.
    """
    if v is None:
        return None
    if isinstance(v, int):
        # int() strips int subclasses down to an exact int, so the database
        # driver only ever sees the built-in type.
        # NOTE(review): presumably this is how Biopython position objects
        # arrive (current releases subclass int) - confirm against the
        # installed Biopython version.
        return int(v)
    if hasattr(v, "position"):
        return int(v.position)
    if isinstance(v, str):
        # Try the whole string first so a sign is preserved ("-1" -> -1);
        # a bare digit search would silently turn a reverse strand into 1.
        signed = _SIGNED_INT_RE.fullmatch(v)
        if signed:
            return int(signed.group(1))
        digits = _FIRST_DIGITS_RE.search(v)
        if digits:
            return int(digits.group(0))
    elif hasattr(v, "__index__"):
        # Integer-like objects that are not int subclasses.
        return int(v.__index__())
    raise ValueError(f"Unrecognised coordinate type for integer field: {type(v)} -> {v!r}")


# Column names whose values are coerced to int before an insert.
# strand/phase/frame/rank and the *_id columns are deliberately not listed
# (they were disabled in the original list).
_INTEGERISH_KEYS = {
    "loc_start", "loc_end", "loc_strand",
    "seq_region_start", "seq_region_end",
    "exon_start", "exon_end", "exon_order",
    "cds_start", "cds_end", "cds_order",
    "start", "end",
}

# Any other key ending in one of these suffixes is treated as a coordinate too.
_INTEGERISH_SUFFIXES = ("_start", "_end")


def _normalise_insert_payload(insert_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Return a shallow copy of ``insert_data`` with coordinate fields coerced to int.

    A key is coerced when it is listed in ``_INTEGERISH_KEYS`` or ends with
    ``_start`` / ``_end``; every other value is copied through untouched so
    text columns are never mangled. ``None`` values stay ``None``. The input
    mapping is not modified.

    A payload that is not a dict (for example a tuple of positional
    parameters) is returned unchanged instead of raising.

    Raises:
        ValueError: if a coordinate field holds a value ``_to_int_pos``
            cannot interpret.
    """
    if not isinstance(insert_data, dict):
        return insert_data

    def _is_integerish(key: Any) -> bool:
        # Guard non-string keys: str.endswith is only defined for strings.
        return isinstance(key, str) and (
            key in _INTEGERISH_KEYS or key.endswith(_INTEGERISH_SUFFIXES)
        )

    return {
        key: _to_int_pos(value) if _is_integerish(key) else value
        for key, value in insert_data.items()
    }
@staticmethod
def _to_int_pos(v: Any) -> int:
    """
    Normalise a single exon coordinate to a plain built-in ``int``.

    Accepted inputs:
      - ``int`` and ``int`` subclasses: narrowed to an exact ``int``.
      - objects exposing ``.position``: ``int(v.position)`` is returned.
      - location-like objects exposing both ``.start`` and ``.end``: the
        start edge is used. Callers should pass a single edge; this is only
        a forgiving fallback.
      - strings: a purely numeric string (optionally signed) is parsed as
        written; otherwise the first run of digits is used, which covers
        reprs such as ``"ExactPosition(10742191)"``.

    Raises:
        ValueError: if ``v`` is none of the above (including ``None``), or
            a string without digits.
    """
    def _as_int(candidate):
        """Return candidate as an exact int, or None if it is not position-like."""
        if isinstance(candidate, int):
            # int() strips int subclasses so later arithmetic yields plain ints.
            return int(candidate)
        if hasattr(candidate, "position"):
            try:
                return int(candidate.position)
            except (TypeError, ValueError):
                # Unusable .position: let the remaining strategies try.
                return None
        return None

    coord = _as_int(v)
    if coord is not None:
        return coord

    # A whole location slipped in: fall back to its start edge.
    if hasattr(v, "start") and hasattr(v, "end"):
        coord = _as_int(v.start)
        if coord is not None:
            return coord

    if isinstance(v, str):
        # Whole-string match first so a leading sign is not dropped.
        signed = re.fullmatch(r"\s*([+-]?\d+)\s*", v)
        if signed:
            return int(signed.group(1))
        digits = re.search(r"\d+", v)
        if digits:
            return int(digits.group(0))

    raise ValueError(f"Unrecognised coordinate type for exon position: {type(v)} -> {v!r}")
int(exon['exon_end']) + + start = cls._to_int_pos(exon.get('exon_start')) + end = cls._to_int_pos(exon.get('exon_end')) + + # Defensive checks + if end < start: + raise ValueError(f"Exon end < start after normalisation: start={start}, end={end}, exon={original_exon}") exon_start = int(exon_end) + 1 exon_end = int(exon_start) + (end - start) @@ -34,3 +92,4 @@ def compute_exon_coordinates(cls, exons): updated_exon_list.append(exon) return updated_exon_list +