diff --git a/Makefile.gk b/Makefile.gk
index 93b98d0..2938382 100644
--- a/Makefile.gk
+++ b/Makefile.gk
@@ -143,8 +143,7 @@ bootstrap: ner cand bootstrap_seed
 
 # Build a simple JSONL manifest from corpus texts for the events extractor
 out/manifest.jsonl: $(CORPUS_DIR)/*.txt
 	@mkdir -p $(OUT)
-	@python -c "import glob,json,os,io; paths=glob.glob('data/corpus/*.txt'); w=io.open('out/manifest.jsonl','w',encoding='utf-8');\
-[w.write(json.dumps({'doc_id':os.path.splitext(os.path.basename(p))[0],'text':io.open(p,'r',encoding='utf-8').read()}, ensure_ascii=False)+'\\n') for p in paths]; w.close(); print('Wrote out/manifest.jsonl with %d docs' % len(paths))"
+	@python -c "import glob,json,os,pathlib; paths=glob.glob('data/corpus/*.txt'); recs=[json.dumps({'doc_id':os.path.splitext(os.path.basename(p))[0],'text':pathlib.Path(p).read_text(encoding='utf-8')}, ensure_ascii=False) for p in paths]; pathlib.Path('out/manifest.jsonl').write_text(''.join(r+'\\n' for r in recs), encoding='utf-8'); print('Wrote out/manifest.jsonl with %d docs' % len(paths))"
 
 .PHONY: events event_edges ttl_events merge_edges ttl_merged
diff --git a/groundkg/ner_tag.py b/groundkg/ner_tag.py
index 57fef71..d9ae563 100644
--- a/groundkg/ner_tag.py
+++ b/groundkg/ner_tag.py
@@ -48,7 +48,8 @@ def main():
         nlp.enable_pipe("ner")
     except Exception:
         pass
-    text = open(in_path, "r", encoding="utf-8").read()
+    with open(in_path, "r", encoding="utf-8") as f:
+        text = f.read()
     doc = nlp(text) # main call to the pipeline
 
     for sent in doc.sents:
diff --git a/groundkg/re_infer.py b/groundkg/re_infer.py
index 0703150..df5009e 100644
--- a/groundkg/re_infer.py
+++ b/groundkg/re_infer.py
@@ -93,8 +93,10 @@ def main():
     if not os.path.exists(onnx_path):
         sys.exit(0)
 
-    thresholds = json.load(open(thresh_path, "r", encoding="utf-8"))
-    classes = json.load(open("models/classes.json", "r", encoding="utf-8"))
+    with open(thresh_path, "r", encoding="utf-8") as f:
+        thresholds = json.load(f)
+    with open("models/classes.json", "r", encoding="utf-8") as f:
+        classes = json.load(f)
 
     sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
     inp_name = sess.get_inputs()[0].name
diff --git a/groundkg/re_score.py b/groundkg/re_score.py
index 64cfb2d..25e91f6 100644
--- a/groundkg/re_score.py
+++ b/groundkg/re_score.py
@@ -2,6 +2,7 @@
 import sys
 import json
 import os
+import threading
 import onnxruntime as ort
 import numpy as np
 from sentence_transformers import SentenceTransformer
@@ -12,13 +13,17 @@
 
 # Global cache for sentence transformer model
 _embedder_cache = None
+_embedder_lock = threading.Lock()
 
 
 def get_embedder():
-    """Get or create sentence transformer model (cached)."""
+    """Get or create sentence transformer model (cached, thread-safe)."""
     global _embedder_cache
     if _embedder_cache is None:
-        _embedder_cache = SentenceTransformer(MODEL_NAME)
+        with _embedder_lock:
+            # Double-check inside lock to prevent race condition
+            if _embedder_cache is None:
+                _embedder_cache = SentenceTransformer(MODEL_NAME)
     return _embedder_cache
 
 
@@ -50,7 +55,8 @@ def main():
     embedder = get_embedder()
 
     # Load ONNX model and classes
-    classes = json.load(open(classes_path, "r", encoding="utf-8"))
+    with open(classes_path, "r", encoding="utf-8") as f:
+        classes = json.load(f)
     sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
     inp_name = sess.get_inputs()[0].name
 
diff --git a/tools/adjust_thresholds.py b/tools/adjust_thresholds.py
index 87721f2..e9cddb7 100644
--- a/tools/adjust_thresholds.py
+++ b/tools/adjust_thresholds.py
@@ -44,7 +44,8 @@ def adjust_thresholds(thresholds_path, scored_path, min_edges=10, min_threshold=
     """
     # Load current thresholds
     if os.path.exists(thresholds_path):
-        thresholds = json.load(open(thresholds_path, 'r', encoding='utf-8'))
+        with open(thresholds_path, 'r', encoding='utf-8') as f:
+            thresholds = json.load(f)
     else:
         thresholds = {}
 
diff --git a/tools/crawl.py b/tools/crawl.py
index 4bf68e8..fd9d5fc 100644
--- a/tools/crawl.py
+++ b/tools/crawl.py
@@ -79,12 +79,14 @@ def fetch_and_snapshot(doc_id: str, url: str):
 def html_to_text(data: bytes, url: str):
     html = data.decode("utf-8", errors="ignore")
     title = ""
+    soup = None
     try:
         soup = BeautifulSoup(html, "html.parser")
         if soup.title and soup.title.string:
             title = soup.title.string.strip()
-    except Exception: pass
+    except Exception:
+        pass
     extracted = trafilatura.extract(html, include_tables=True, url=url) or ""
-    text = extracted.strip() if extracted else (soup.get_text(" ", strip=True) if 'soup' in locals() else "")
+    text = extracted.strip() if extracted else (soup.get_text(" ", strip=True) if soup is not None else "")
     return title or url, text
 def pdf_to_text(data: bytes)->str:
diff --git a/tools/promote_from_scored.py b/tools/promote_from_scored.py
index 7e3bd38..59d3c0e 100644
--- a/tools/promote_from_scored.py
+++ b/tools/promote_from_scored.py
@@ -2,7 +2,8 @@
 
 def main():
     scored_path, thr_path = sys.argv[1:3]
-    thresholds = json.load(open(thr_path, 'r', encoding='utf-8'))
+    with open(thr_path, 'r', encoding='utf-8') as f:
+        thresholds = json.load(f)
     with open(scored_path, 'r', encoding='utf-8') as f:
         for line in f:
             r = json.loads(line)
diff --git a/tools/quality_report.py b/tools/quality_report.py
index 59a1953..1d64ac9 100644
--- a/tools/quality_report.py
+++ b/tools/quality_report.py
@@ -28,7 +28,8 @@ def main():
     thresholds = {}
     if os.path.exists(thr_path):
         try:
-            thresholds = json.load(open(thr_path,'r',encoding='utf-8'))
+            with open(thr_path, 'r', encoding='utf-8') as f:
+                thresholds = json.load(f)
         except Exception:
             thresholds = {}
 
diff --git a/training/train_re_sklearn.py b/training/train_re_sklearn.py
index 42c8174..cb30cd8 100644
--- a/training/train_re_sklearn.py
+++ b/training/train_re_sklearn.py
@@ -11,7 +11,9 @@
 from skl2onnx import convert_sklearn
 from skl2onnx.common.data_types import StringTensorType
 
-def load_jsonl(p): return [json.loads(l) for l in open(p, "r", encoding="utf-8") if l.strip()]
+def load_jsonl(p):
+    with open(p, "r", encoding="utf-8") as f:
+        return [json.loads(l) for l in f if l.strip()]
 
 def load_data(train_p, dev_p):
     tr = load_jsonl(train_p); dv = load_jsonl(dev_p)
diff --git a/web/app.py b/web/app.py
index ac6dad9..753198f 100755
--- a/web/app.py
+++ b/web/app.py
@@ -63,11 +63,15 @@ def get_file_info(path):
     if not full_path.exists():
         return None
     stat = full_path.stat()
+    lines = 0
+    if full_path.is_file():
+        with open(full_path, 'rb') as f:
+            lines = sum(1 for _ in f)
     return {
         'exists': True,
         'size': stat.st_size,
         'modified': stat.st_mtime,
-        'lines': sum(1 for _ in open(full_path, 'rb')) if full_path.is_file() else 0
+        'lines': lines
     }
 
 
@@ -117,22 +121,33 @@ def run_command(target):
 
 @app.route('/api/status/<target>')
 def command_status(target):
     """Get status of a running command"""
-    if target in running_commands:
-        process = running_commands[target]
-        if process.poll() is None:
-            # Still running
-            return jsonify({'status': 'running', 'pid': process.pid})
-        else:
-            # Finished
-            stdout, stderr = process.communicate()
-            del running_commands[target]
-            return jsonify({
-                'status': 'completed' if process.returncode == 0 else 'failed',
-                'returncode': process.returncode,
-                'stdout': stdout,
-                'stderr': stderr
-            })
-    return jsonify({'status': 'not_found'})
+    if target not in running_commands:
+        return jsonify({'status': 'not_found'})
+
+    # Store process reference to avoid race conditions
+    process = running_commands[target]
+    poll_result = process.poll()
+
+    if poll_result is None:
+        # Still running
+        return jsonify({'status': 'running', 'pid': process.pid})
+
+    # Finished - clean up and return results
+    try:
+        # Remove from running_commands first to prevent duplicate processing
+        running_commands.pop(target, None)
+        stdout, stderr = process.communicate(timeout=1)
+        return jsonify({
+            'status': 'completed' if process.returncode == 0 else 'failed',
+            'returncode': process.returncode,
+            'stdout': stdout,
+            'stderr': stderr
+        })
+    except Exception as e:
+        return jsonify({
+            'status': 'error',
+            'error': f'Failed to get process output: {str(e)}'
+        }), 500
 
 @app.route('/api/files')
@@ -188,10 +203,12 @@ def get_file(file_key):
     try:
         with open(path, 'r', encoding='utf-8') as f:
             content = ''.join(f.readlines()[:lines])
+        with open(path, 'rb') as f:
+            total_lines = sum(1 for _ in f)
         return jsonify({
             'path': files[file_key],
             'content': content,
-            'total_lines': sum(1 for _ in open(path, 'rb'))
+            'total_lines': total_lines
         })
     except Exception as e:
         return jsonify({'error': str(e)}), 500