Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions Makefile.gk
Original file line number Diff line number Diff line change
Expand Up @@ -143,8 +143,13 @@ bootstrap: ner cand bootstrap_seed
# Build a simple JSONL manifest from corpus texts for the events extractor.
# NOTE: python -c takes a single logical line; compound statements (def/with)
# cannot follow ';', so the script below is expression-statements only.
# pathlib read_text/write_text open, read/write, and close each file, so no
# handles are left to the garbage collector.
out/manifest.jsonl: $(CORPUS_DIR)/*.txt
	@mkdir -p $(OUT)
	@python -c "import glob,json,os,pathlib; \
paths=sorted(glob.glob('data/corpus/*.txt')); \
recs=[json.dumps({'doc_id':os.path.splitext(os.path.basename(p))[0],'text':pathlib.Path(p).read_text(encoding='utf-8')}, ensure_ascii=False) for p in paths]; \
pathlib.Path('out/manifest.jsonl').write_text(''.join(r+'\\n' for r in recs), encoding='utf-8'); \
print('Wrote out/manifest.jsonl with %d docs' % len(paths))"

.PHONY: events event_edges ttl_events merge_edges ttl_merged

Expand Down
3 changes: 2 additions & 1 deletion groundkg/ner_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ def main():
nlp.enable_pipe("ner")
except Exception:
pass
text = open(in_path, "r", encoding="utf-8").read()
with open(in_path, "r", encoding="utf-8") as f:
text = f.read()
doc = nlp(text) # main call to the pipeline

for sent in doc.sents:
Expand Down
6 changes: 4 additions & 2 deletions groundkg/re_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,10 @@ def main():
if not os.path.exists(onnx_path):
sys.exit(0)

thresholds = json.load(open(thresh_path, "r", encoding="utf-8"))
classes = json.load(open("models/classes.json", "r", encoding="utf-8"))
with open(thresh_path, "r", encoding="utf-8") as f:
thresholds = json.load(f)
with open("models/classes.json", "r", encoding="utf-8") as f:
classes = json.load(f)
sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
inp_name = sess.get_inputs()[0].name

Expand Down
12 changes: 9 additions & 3 deletions groundkg/re_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import sys
import json
import os
import threading
import onnxruntime as ort
import numpy as np
from sentence_transformers import SentenceTransformer
Expand All @@ -12,13 +13,17 @@

# Global cache for the sentence transformer model, guarded by a lock so that
# concurrent first calls cannot both construct the (expensive) model.
_embedder_cache = None
_embedder_lock = threading.Lock()


def get_embedder():
    """Get or create the sentence transformer model (cached, thread-safe).

    Returns:
        The process-wide SentenceTransformer instance for MODEL_NAME.
    """
    global _embedder_cache
    if _embedder_cache is None:  # fast path: skip the lock once initialized
        with _embedder_lock:
            # Double-check inside the lock: two threads may both have seen
            # None before either acquired the lock; only one may construct.
            if _embedder_cache is None:
                _embedder_cache = SentenceTransformer(MODEL_NAME)
    return _embedder_cache


Expand Down Expand Up @@ -50,7 +55,8 @@ def main():
embedder = get_embedder()

# Load ONNX model and classes
classes = json.load(open(classes_path, "r", encoding="utf-8"))
with open(classes_path, "r", encoding="utf-8") as f:
classes = json.load(f)
sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
inp_name = sess.get_inputs()[0].name

Expand Down
3 changes: 2 additions & 1 deletion tools/adjust_thresholds.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ def adjust_thresholds(thresholds_path, scored_path, min_edges=10, min_threshold=
"""
# Load current thresholds
if os.path.exists(thresholds_path):
thresholds = json.load(open(thresholds_path, 'r', encoding='utf-8'))
with open(thresholds_path, 'r', encoding='utf-8') as f:
thresholds = json.load(f)
else:
thresholds = {}

Expand Down
6 changes: 4 additions & 2 deletions tools/crawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,12 +79,14 @@ def fetch_and_snapshot(doc_id: str, url: str):
def html_to_text(data: bytes, url: str):
    """Extract a (title, text) pair from raw HTML bytes.

    Uses trafilatura for the main text extraction; falls back to the
    BeautifulSoup plain-text rendering when trafilatura yields nothing.
    Returns the URL as the title when no <title> could be parsed.
    """
    html = data.decode("utf-8", errors="ignore")
    title = ""
    soup = None  # initialized so the fallback below never hits an unbound name
    try:
        soup = BeautifulSoup(html, "html.parser")
        if soup.title and soup.title.string:
            title = soup.title.string.strip()
    except Exception:
        # Best-effort: malformed HTML should not abort text extraction.
        pass
    extracted = trafilatura.extract(html, include_tables=True, url=url) or ""
    text = extracted.strip() if extracted else (soup.get_text(" ", strip=True) if soup is not None else "")
    return title or url, text

def pdf_to_text(data: bytes)->str:
Expand Down
3 changes: 2 additions & 1 deletion tools/promote_from_scored.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

def main():
scored_path, thr_path = sys.argv[1:3]
thresholds = json.load(open(thr_path, 'r', encoding='utf-8'))
with open(thr_path, 'r', encoding='utf-8') as f:
thresholds = json.load(f)
with open(scored_path, 'r', encoding='utf-8') as f:
for line in f:
r = json.loads(line)
Expand Down
3 changes: 2 additions & 1 deletion tools/quality_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ def main():
thresholds = {}
if os.path.exists(thr_path):
try:
thresholds = json.load(open(thr_path,'r',encoding='utf-8'))
with open(thr_path, 'r', encoding='utf-8') as f:
thresholds = json.load(f)
except Exception:
thresholds = {}

Expand Down
4 changes: 3 additions & 1 deletion training/train_re_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import StringTensorType

def load_jsonl(p):
    """Read a JSONL file, returning one parsed object per non-blank line.

    The file is opened via a context manager so the handle is closed
    deterministically rather than left to the garbage collector.
    """
    with open(p, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

def load_data(train_p, dev_p):
tr = load_jsonl(train_p); dv = load_jsonl(dev_p)
Expand Down
53 changes: 35 additions & 18 deletions web/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,15 @@ def get_file_info(path):
if not full_path.exists():
return None
stat = full_path.stat()
lines = 0
if full_path.is_file():
with open(full_path, 'rb') as f:
lines = sum(1 for _ in f)
return {
'exists': True,
'size': stat.st_size,
'modified': stat.st_mtime,
'lines': sum(1 for _ in open(full_path, 'rb')) if full_path.is_file() else 0
'lines': lines
}


Expand Down Expand Up @@ -117,22 +121,33 @@ def run_command(target):
@app.route('/api/status/<target>')
def command_status(target):
    """Get status of a running command.

    Returns JSON: 'not_found' for unknown targets, 'running' (with pid)
    while the process is alive, otherwise 'completed'/'failed' with the
    captured stdout/stderr, or 'error' (HTTP 500) if output collection fails.
    """
    if target not in running_commands:
        return jsonify({'status': 'not_found'})

    # Hold a local reference so a concurrent request removing the entry
    # cannot pull the process out from under us mid-handler.
    process = running_commands[target]
    if process.poll() is None:
        # Still running
        return jsonify({'status': 'running', 'pid': process.pid})

    # Finished - clean up and return results
    try:
        # Remove from running_commands first to prevent duplicate processing
        # by a second poller racing this one.
        running_commands.pop(target, None)
        # Process already exited, so a short timeout only guards pipe drains.
        stdout, stderr = process.communicate(timeout=1)
        return jsonify({
            'status': 'completed' if process.returncode == 0 else 'failed',
            'returncode': process.returncode,
            'stdout': stdout,
            'stderr': stderr
        })
    except Exception as e:
        return jsonify({
            'status': 'error',
            'error': f'Failed to get process output: {str(e)}'
        }), 500


@app.route('/api/files')
Expand Down Expand Up @@ -188,10 +203,12 @@ def get_file(file_key):
try:
with open(path, 'r', encoding='utf-8') as f:
content = ''.join(f.readlines()[:lines])
with open(path, 'rb') as f:
total_lines = sum(1 for _ in f)
return jsonify({
'path': files[file_key],
'content': content,
'total_lines': sum(1 for _ in open(path, 'rb'))
'total_lines': total_lines
})
except Exception as e:
return jsonify({'error': str(e)}), 500
Expand Down