From b95d72ec5eae1f6556ad009ec3bb84225e1713b2 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 13 Jan 2026 14:22:59 +0000 Subject: [PATCH 1/4] Fix resource leak in ner_tag.py Replace open() without context manager with proper with statement to ensure file handle is closed after reading. This prevents resource leaks that could accumulate over multiple invocations. Fixed: groundkg/ner_tag.py:51 --- groundkg/ner_tag.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/groundkg/ner_tag.py b/groundkg/ner_tag.py index 57fef71..d9ae563 100644 --- a/groundkg/ner_tag.py +++ b/groundkg/ner_tag.py @@ -48,7 +48,8 @@ def main(): nlp.enable_pipe("ner") except Exception: pass - text = open(in_path, "r", encoding="utf-8").read() + with open(in_path, "r", encoding="utf-8") as f: + text = f.read() doc = nlp(text) # main call to the pipeline for sent in doc.sents: From de19a50b2cc7d8fb110ad5fb31320efffbb3d237 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 13 Jan 2026 14:25:36 +0000 Subject: [PATCH 2/4] Fix resource leaks across multiple modules Replace all open() calls without context managers with proper with statements to ensure file handles are closed after use. This prevents resource leaks that could accumulate over multiple invocations. 
Fixed resource leaks in: - groundkg/re_score.py:53 - json.load(open(...)) - groundkg/re_infer.py:96-97 - two json.load(open(...)) - tools/promote_from_scored.py:5 - json.load(open(...)) - training/train_re_sklearn.py:14 - load_jsonl function - tools/adjust_thresholds.py:47 - json.load(open(...)) - tools/quality_report.py:31 - json.load(open(...)) --- groundkg/re_infer.py | 6 ++++-- groundkg/re_score.py | 3 ++- tools/adjust_thresholds.py | 3 ++- tools/promote_from_scored.py | 3 ++- tools/quality_report.py | 3 ++- training/train_re_sklearn.py | 4 +++- 6 files changed, 15 insertions(+), 7 deletions(-) diff --git a/groundkg/re_infer.py b/groundkg/re_infer.py index 0703150..df5009e 100644 --- a/groundkg/re_infer.py +++ b/groundkg/re_infer.py @@ -93,8 +93,10 @@ def main(): if not os.path.exists(onnx_path): sys.exit(0) - thresholds = json.load(open(thresh_path, "r", encoding="utf-8")) - classes = json.load(open("models/classes.json", "r", encoding="utf-8")) + with open(thresh_path, "r", encoding="utf-8") as f: + thresholds = json.load(f) + with open("models/classes.json", "r", encoding="utf-8") as f: + classes = json.load(f) sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"]) inp_name = sess.get_inputs()[0].name diff --git a/groundkg/re_score.py b/groundkg/re_score.py index 64cfb2d..8f218a5 100644 --- a/groundkg/re_score.py +++ b/groundkg/re_score.py @@ -50,7 +50,8 @@ def main(): embedder = get_embedder() # Load ONNX model and classes - classes = json.load(open(classes_path, "r", encoding="utf-8")) + with open(classes_path, "r", encoding="utf-8") as f: + classes = json.load(f) sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"]) inp_name = sess.get_inputs()[0].name diff --git a/tools/adjust_thresholds.py b/tools/adjust_thresholds.py index 87721f2..e9cddb7 100644 --- a/tools/adjust_thresholds.py +++ b/tools/adjust_thresholds.py @@ -44,7 +44,8 @@ def adjust_thresholds(thresholds_path, scored_path, min_edges=10, min_threshold= """ 
# Load current thresholds if os.path.exists(thresholds_path): - thresholds = json.load(open(thresholds_path, 'r', encoding='utf-8')) + with open(thresholds_path, 'r', encoding='utf-8') as f: + thresholds = json.load(f) else: thresholds = {} diff --git a/tools/promote_from_scored.py b/tools/promote_from_scored.py index 7e3bd38..59d3c0e 100644 --- a/tools/promote_from_scored.py +++ b/tools/promote_from_scored.py @@ -2,7 +2,8 @@ def main(): scored_path, thr_path = sys.argv[1:3] - thresholds = json.load(open(thr_path, 'r', encoding='utf-8')) + with open(thr_path, 'r', encoding='utf-8') as f: + thresholds = json.load(f) with open(scored_path, 'r', encoding='utf-8') as f: for line in f: r = json.loads(line) diff --git a/tools/quality_report.py b/tools/quality_report.py index 59a1953..1d64ac9 100644 --- a/tools/quality_report.py +++ b/tools/quality_report.py @@ -28,7 +28,8 @@ def main(): thresholds = {} if os.path.exists(thr_path): try: - thresholds = json.load(open(thr_path,'r',encoding='utf-8')) + with open(thr_path, 'r', encoding='utf-8') as f: + thresholds = json.load(f) except Exception: thresholds = {} diff --git a/training/train_re_sklearn.py b/training/train_re_sklearn.py index 42c8174..cb30cd8 100644 --- a/training/train_re_sklearn.py +++ b/training/train_re_sklearn.py @@ -11,7 +11,9 @@ from skl2onnx import convert_sklearn from skl2onnx.common.data_types import StringTensorType -def load_jsonl(p): return [json.loads(l) for l in open(p, "r", encoding="utf-8") if l.strip()] +def load_jsonl(p): + with open(p, "r", encoding="utf-8") as f: + return [json.loads(l) for l in f if l.strip()] def load_data(train_p, dev_p): tr = load_jsonl(train_p); dv = load_jsonl(dev_p) From 3dafd975cf5e65b21e392426d4104924c29d8728 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 13 Jan 2026 14:44:22 +0000 Subject: [PATCH 3/4] Fix resource leaks in web app and Makefile web/app.py: - Fix file handle leak in get_file_info() when counting lines (line 70) - Fix file handle leak in get_file() 
when counting total lines (line 194)

Both now properly use context managers to ensure files are closed

Makefile.gk:
- Fix critical resource leak in manifest.jsonl generation (line 147)
- Rewrite the inline python -c as a single expression chain using
  pathlib.Path.read_text/write_text, which open and close each file
  internally. (A multi-statement def/with body cannot be embedded in a
  backslash-continued python -c string: the shell joins the continued
  lines into one line and compound statements after ';' are a
  SyntaxError, so the recipe would never run.)
- Prevents resource exhaustion when processing many corpus files
---
 Makefile.gk |  4 ++--
 web/app.py  | 10 ++++++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/Makefile.gk b/Makefile.gk
index 93b98d0..2938382 100644
--- a/Makefile.gk
+++ b/Makefile.gk
@@ -143,8 +143,8 @@ bootstrap: ner cand bootstrap_seed
 # Build a simple JSONL manifest from corpus texts for the events extractor
 out/manifest.jsonl: $(CORPUS_DIR)/*.txt
 	@mkdir -p $(OUT)
-	@python -c "import glob,json,os,io; paths=glob.glob('data/corpus/*.txt'); w=io.open('out/manifest.jsonl','w',encoding='utf-8');\
-	[w.write(json.dumps({'doc_id':os.path.splitext(os.path.basename(p))[0],'text':io.open(p,'r',encoding='utf-8').read()}, ensure_ascii=False)+'\\n') for p in paths]; w.close(); print('Wrote out/manifest.jsonl with %d docs' % len(paths))"
+	@python -c "import glob,json,os,pathlib; paths=glob.glob('data/corpus/*.txt');\
+	pathlib.Path('out/manifest.jsonl').write_text(''.join(json.dumps({'doc_id':os.path.splitext(os.path.basename(p))[0],'text':pathlib.Path(p).read_text(encoding='utf-8')}, ensure_ascii=False)+'\\n' for p in paths), encoding='utf-8'); print('Wrote out/manifest.jsonl with %d docs' % len(paths))"
 
 .PHONY: events event_edges ttl_events merge_edges ttl_merged

diff --git a/web/app.py b/web/app.py
index ac6dad9..7113ec9 100755
--- a/web/app.py
+++ b/web/app.py
@@ -63,11 +63,15 @@ def get_file_info(path):
     if not full_path.exists():
         return None
     stat = full_path.stat()
+    lines = 0
+    if full_path.is_file():
+        with open(full_path, 'rb') as f:
+            lines = sum(1 for _ in f)
     return {
         'exists': True,
         'size': stat.st_size,
        'modified': 
stat.st_mtime, - 'lines': sum(1 for _ in open(full_path, 'rb')) if full_path.is_file() else 0 + 'lines': lines } @@ -188,10 +192,12 @@ def get_file(file_key): try: with open(path, 'r', encoding='utf-8') as f: content = ''.join(f.readlines()[:lines]) + with open(path, 'rb') as f: + total_lines = sum(1 for _ in f) return jsonify({ 'path': files[file_key], 'content': content, - 'total_lines': sum(1 for _ in open(path, 'rb')) + 'total_lines': total_lines }) except Exception as e: return jsonify({'error': str(e)}), 500 From 2dafb2858c7425fcd2ff72a41e9d006212d50d4a Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 13 Jan 2026 15:10:59 +0000 Subject: [PATCH 4/4] Fix code quality issues: fragile patterns, thread safety, and race conditions tools/crawl.py: - Fix fragile locals() check for soup object (line 87) - Replace 'soup' in locals() with explicit None check - Initialize soup = None before try block for proper tracking groundkg/re_score.py: - Add thread safety to global embedder cache (line 16-27) - Implement double-checked locking pattern with threading.Lock - Prevents race condition when multiple threads initialize model - Ensures only one model instance is created even under concurrent access web/app.py: - Fix potential race condition in command_status() (line 122-150) - Store process reference locally to prevent stale references - Use .pop() instead of del to handle concurrent deletions safely - Add timeout to communicate() to prevent indefinite blocking - Add exception handling for robust error reporting --- groundkg/re_score.py | 9 +++++++-- tools/crawl.py | 6 ++++-- web/app.py | 43 +++++++++++++++++++++++++++---------------- 3 files changed, 38 insertions(+), 20 deletions(-) diff --git a/groundkg/re_score.py b/groundkg/re_score.py index 8f218a5..25e91f6 100644 --- a/groundkg/re_score.py +++ b/groundkg/re_score.py @@ -2,6 +2,7 @@ import sys import json import os +import threading import onnxruntime as ort import numpy as np from sentence_transformers import 
SentenceTransformer @@ -12,13 +13,17 @@ # Global cache for sentence transformer model _embedder_cache = None +_embedder_lock = threading.Lock() def get_embedder(): - """Get or create sentence transformer model (cached).""" + """Get or create sentence transformer model (cached, thread-safe).""" global _embedder_cache if _embedder_cache is None: - _embedder_cache = SentenceTransformer(MODEL_NAME) + with _embedder_lock: + # Double-check inside lock to prevent race condition + if _embedder_cache is None: + _embedder_cache = SentenceTransformer(MODEL_NAME) return _embedder_cache diff --git a/tools/crawl.py b/tools/crawl.py index 4bf68e8..fd9d5fc 100644 --- a/tools/crawl.py +++ b/tools/crawl.py @@ -79,12 +79,14 @@ def fetch_and_snapshot(doc_id: str, url: str): def html_to_text(data: bytes, url: str): html = data.decode("utf-8", errors="ignore") title = "" + soup = None try: soup = BeautifulSoup(html, "html.parser") if soup.title and soup.title.string: title = soup.title.string.strip() - except Exception: pass + except Exception: + pass extracted = trafilatura.extract(html, include_tables=True, url=url) or "" - text = extracted.strip() if extracted else (soup.get_text(" ", strip=True) if 'soup' in locals() else "") + text = extracted.strip() if extracted else (soup.get_text(" ", strip=True) if soup is not None else "") return title or url, text def pdf_to_text(data: bytes)->str: diff --git a/web/app.py b/web/app.py index 7113ec9..753198f 100755 --- a/web/app.py +++ b/web/app.py @@ -121,22 +121,33 @@ def run_command(target): @app.route('/api/status/') def command_status(target): """Get status of a running command""" - if target in running_commands: - process = running_commands[target] - if process.poll() is None: - # Still running - return jsonify({'status': 'running', 'pid': process.pid}) - else: - # Finished - stdout, stderr = process.communicate() - del running_commands[target] - return jsonify({ - 'status': 'completed' if process.returncode == 0 else 'failed', - 
'returncode': process.returncode, - 'stdout': stdout, - 'stderr': stderr - }) - return jsonify({'status': 'not_found'}) + if target not in running_commands: + return jsonify({'status': 'not_found'}) + + # Store process reference to avoid race conditions + process = running_commands[target] + poll_result = process.poll() + + if poll_result is None: + # Still running + return jsonify({'status': 'running', 'pid': process.pid}) + + # Finished - clean up and return results + try: + # Remove from running_commands first to prevent duplicate processing + running_commands.pop(target, None) + stdout, stderr = process.communicate(timeout=1) + return jsonify({ + 'status': 'completed' if process.returncode == 0 else 'failed', + 'returncode': process.returncode, + 'stdout': stdout, + 'stderr': stderr + }) + except Exception as e: + return jsonify({ + 'status': 'error', + 'error': f'Failed to get process output: {str(e)}' + }), 500 @app.route('/api/files')