Ensembl · darefalola · Oct 6, 2025
diff --git a/.gitignore b/.gitignore
@@ -16,4 +16,6 @@ env/*
 .cache
 .pytest_cache
 .idea
-appenv
+appenv
+*.out
+*.err
diff --git a/requirements.txt b/requirements.txt
@@ -15,12 +15,12 @@ isort==4.3.18
 Jinja2==2.10.1
 lazy-object-proxy==1.3.1
 lockfile==0.12.2
-luigi==2.8.3
+luigi==3.0.0
 MarkupSafe==1.1.1
 mccabe==0.6.1
 more-itertools==7.0.0
 mysql-connector-python==8.0.16
-numpy==1.16.3
+numpy==1.24.4
 packaging==19.0
 pluggy==0.9.0
 protobuf==3.7.1
@@ -44,7 +44,7 @@ sphinxcontrib-jsmath==1.0.1
 sphinxcontrib-qthelp==1.0.2
 sphinxcontrib-serializinghtml==1.1.3
 tornado==4.5.3
-typed-ast==1.3.5
+typed-ast==1.5.4
 urllib3==1.24.3
 wget==3.2
 wrapt==1.11.1
diff --git a/tark-refseq-loader/conf/refseq_source.ini b/tark-refseq-loader/conf/refseq_source.ini
@@ -1,20 +1,20 @@
 # update config file with latest refseq annotation info
 [DEFAULT]
 source=2
-shortname=109_20190905
-release_date=xxxx-xx-xx
+shortname=GCF_000001405_20250806
+release_date=2025-08-06
 assembly_name=GRCh38
-assembly_id=1001
-description=Refseq Homo sapiens Annotation Release 109.20190905
-ftp_root=https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/annotation_releases/109.20190905/GCF_000001405.39_GRCh38.p13/
-gff_file=GCF_000001405.39_GRCh38.p13_genomic.gff.gz
-fasta_file=GCF_000001405.39_GRCh38.p13_rna.fna.gz
-protein_file=GCF_000001405.39_GRCh38.p13_protein.faa.gz
+assembly_id=1
+description=Refseq Homo sapiens Annotation Release GCF_000001405.20250806
+ftp_root=https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/annotation_releases/GCF_000001405.40-RS_2025_08/
+gff_file=GCF_000001405.40_GRCh38.p14_genomic.gff.gz
+fasta_file=GCF_000001405.40_GRCh38.p14_rna.fna.gz
+protein_file=GCF_000001405.40_GRCh38.p14_protein.faa.gz
 
 
 [DATABASE]
-host = 0.0.0.0
-port = 3306
+host = xxx
+port = xxx
 user = xxx
 pass = xxx
-database = tark_luigi
+database = ensembl_tark_e75_to_e115
diff --git a/tark-refseq-loader/handlers/refseq/databasehandler.py b/tark-refseq-loader/handlers/refseq/databasehandler.py
@@ -14,8 +14,9 @@
    See the License for the specific language governing permissions and
    limitations under the License.
 """
-
 from __future__ import print_function
+import re
+from typing import Any, Dict
 from datetime import datetime
 from handlers.refseq.confighandler import ConfigHandler
 from handlers.refseq.checksumhandler import ChecksumHandler
@@ -28,6 +29,52 @@
 # Get an instance of a logger
 logger = logging.getLogger(__name__)
 
+def _to_int_pos(v: Any) -> int:
+    """
+    Coerce a coordinate-like value to a plain built-in ``int``.
+
+    Accepts ints (int subclasses are down-cast so downstream code receives an
+    exact ``int``), objects exposing ``.position``, and strings holding a
+    number such as "107", "-1" or "ExactPosition(107)".
+    ``None`` is returned unchanged so the caller can decide how to handle it.
+    Raises ValueError for any other value, including digit-free strings.
+    """
+    if v is None:
+        return v
+    if isinstance(v, int):
+        return int(v)  # `return v` would leak int subclasses through unchanged
+    if hasattr(v, "position"):
+        return int(v.position)
+    m = re.search(r"-?\d+", v) if isinstance(v, str) else None  # keep the sign
+    if m:
+        return int(m.group(0))
+    raise ValueError(f"Unrecognised coordinate type for integer field: {type(v)} -> {v!r}")
+
+_INTEGERISH_KEYS = {
+    # Exact key names treated as integer coordinate / ordering / strand fields.
+    "loc_start", "loc_end", "loc_strand",
+    "seq_region_start", "seq_region_end",
+    "exon_start", "exon_end", "exon_order",
+    "cds_start", "cds_end", "cds_order",
+    # Not enabled (reason not recorded here): "strand", "phase", "frame", "rank",
+    "start", "end",
+    # Not enabled (reason not recorded here): "gene_id", "transcript_id", "translation_id", "assembly_id", "session_id",
+}
+
+def _normalise_insert_payload(insert_data: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Return a shallow copy of ``insert_data`` with coordinate-like values as ints.
+
+    A non-None value goes through ``_to_int_pos`` when its key is listed in
+    ``_INTEGERISH_KEYS`` or ends with "_start" / "_end". Everything else is
+    copied as-is, and the caller's dict is left unmodified.
+    """
+    def _is_coord(key: str) -> bool:
+        return (key in _INTEGERISH_KEYS) or key.endswith("_start") or key.endswith("_end")
+
+    payload = dict(insert_data)
+    coerced = {k: _to_int_pos(v) for k, v in payload.items() if v is not None and _is_coord(k)}
+    return {**payload, **coerced}
 
 class DatabaseHandler(object):
 
@@ -415,9 +462,12 @@ def insert_data(self, insert_sql, insert_data, FOREIGN_KEY_CHECKS=1):
                 insert_sql = insert_sql.replace(sql_str, sql_str_to_replace, 1)
 
         row_id = None
-        try:
-            connection_pool = self.dbc
-            cursor = connection_pool.cursor()
+        connection_pool = self.dbc
+        cursor = connection_pool.cursor()
+        try:            
+            # 🔒 Normalise payload before execution
+            insert_data = _normalise_insert_payload(insert_data)
+
             if FOREIGN_KEY_CHECKS == 0:
                 cursor.execute("SET FOREIGN_KEY_CHECKS=0")
 
@@ -427,12 +477,24 @@ def insert_data(self, insert_sql, insert_data, FOREIGN_KEY_CHECKS=1):
             row_id = cursor.lastrowid
 
             cursor = connection_pool.cursor()
-            if FOREIGN_KEY_CHECKS == 0:
-                cursor.execute("SET FOREIGN_KEY_CHECKS=1")
+            # if FOREIGN_KEY_CHECKS == 0:
+            #     cursor.execute("SET FOREIGN_KEY_CHECKS=1")
 
         except Exception as e:
+            # Proper error handling: rollback & re-raise (do NOT exit)
+            try:
+                connection_pool.rollback()
+            except Exception:
+                pass
             print('Failed to insert: ' + str(e))
             print(insert_sql)
-            exit(0)
+            print("DATA:", insert_data)
+            raise
+        finally:
+            try:
+                if FOREIGN_KEY_CHECKS == 0:
+                    cursor.execute("SET FOREIGN_KEY_CHECKS=1")
+            finally:
+                cursor.close()
 
         return row_id
diff --git a/tark-refseq-loader/handlers/refseq/utils/exon_utils.py b/tark-refseq-loader/handlers/refseq/utils/exon_utils.py
@@ -16,16 +16,74 @@
 """
 
 
+import re
+from typing import Any, Dict, List
+
+
 class ExonUtils(object):
 
+    @staticmethod
+    def _to_int_pos(v: Any) -> int:
+        """
+        Normalise a coordinate value to a plain built-in ``int``.
+
+        Accepts:
+          - int, including int subclasses (down-cast so callers get an exact int)
+          - Position-like objects exposing a numeric `.position`
+          - location-like objects whose `.start` and `.end` both expose
+            `.position`; the start edge is used (callers should pass one edge)
+          - strings like "10742191", "-5" or "ExactPosition(10742191)"
+
+        Raises:
+          ValueError: for None, digit-free strings and any other value.
+        """
+        # int() rather than `return v`: an int subclass would otherwise leak
+        # through unchanged and defeat the purpose of normalising.
+        if isinstance(v, int):
+            return int(v)
+
+        # Position-like object. Only conversion failures are tolerated here so
+        # that the remaining strategies still get a chance.
+        if hasattr(v, "position"):
+            try:
+                return int(v.position)
+            except (TypeError, ValueError, OverflowError):
+                pass
+
+        # A whole location slipped in: be forgiving and use its start edge.
+        start, end = getattr(v, "start", None), getattr(v, "end", None)
+        if hasattr(start, "position") and hasattr(end, "position"):
+            return int(start.position)
+
+        # Plain number or stringified repr: take the first (signed) digit run.
+        if isinstance(v, str):
+            m = re.search(r"-?\d+", v)
+            if m:
+                return int(m.group(0))
+
+        raise ValueError(f"Unrecognised coordinate type for exon position: {type(v)} -> {v!r}")
+
     @classmethod
-    def compute_exon_coordinates(cls, exons):
-        updated_exon_list = []
+    def compute_exon_coordinates(cls, exons: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """
+        Compute relative exon coordinates, preserving the original logic:
+        - normalise exon_start/exon_end to ints first
+        - exon 1 starts at 1
+        - each subsequent exon starts at previous_exon_end + 1
+        - exon length = (end - start)
+        """
+        updated_exon_list: List[Dict[str, Any]] = []
         exon_end = 0
+
         for original_exon in exons:
             exon = original_exon.copy()
-            start = int(exon['exon_start'])
-            end = int(exon['exon_end'])
+
+            start = cls._to_int_pos(exon.get('exon_start'))
+            end = cls._to_int_pos(exon.get('exon_end'))
+
+            # Defensive checks
+            if end < start:
+                raise ValueError(f"Exon end < start after normalisation: start={start}, end={end}, exon={original_exon}")
 
             exon_start = int(exon_end) + 1
             exon_end = int(exon_start) + (end - start)
@@ -34,3 +92,4 @@ def compute_exon_coordinates(cls, exons):
             updated_exon_list.append(exon)
 
         return updated_exon_list
+
-Original file line number
+Diff line change
@@ Expand Up / @@ -16,4 +16,6 @@ env/* @@
     .cache
     .pytest_cache
     .idea
-    appenv
+    appenv
+    *.out
+    *.err