Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions Makefile.gk
Original file line number Diff line number Diff line change
Expand Up @@ -143,8 +143,13 @@ bootstrap: ner cand bootstrap_seed
# Build a simple JSONL manifest from corpus texts for the events extractor.
# NOTE: python -c takes a single logical line; compound statements (def/with)
# cannot follow ';', so the script below is expression-statements only.
# pathlib read_text/write_text open, read/write, and close each file, so no
# handles are left to the garbage collector.
out/manifest.jsonl: $(CORPUS_DIR)/*.txt
	@mkdir -p $(OUT)
	@python -c "import glob,json,os,pathlib; \
paths=sorted(glob.glob('data/corpus/*.txt')); \
recs=[json.dumps({'doc_id':os.path.splitext(os.path.basename(p))[0],'text':pathlib.Path(p).read_text(encoding='utf-8')}, ensure_ascii=False) for p in paths]; \
pathlib.Path('out/manifest.jsonl').write_text(''.join(r+'\\n' for r in recs), encoding='utf-8'); \
print('Wrote out/manifest.jsonl with %d docs' % len(paths))"

.PHONY: events event_edges ttl_events merge_edges ttl_merged

Expand Down
3 changes: 2 additions & 1 deletion groundkg/ner_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ def main():
nlp.enable_pipe("ner")
except Exception:
pass
text = open(in_path, "r", encoding="utf-8").read()
with open(in_path, "r", encoding="utf-8") as f:
text = f.read()
doc = nlp(text) # main call to the pipeline

for sent in doc.sents:
Expand Down
6 changes: 4 additions & 2 deletions groundkg/re_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,10 @@ def main():
if not os.path.exists(onnx_path):
sys.exit(0)

thresholds = json.load(open(thresh_path, "r", encoding="utf-8"))
classes = json.load(open("models/classes.json", "r", encoding="utf-8"))
with open(thresh_path, "r", encoding="utf-8") as f:
thresholds = json.load(f)
with open("models/classes.json", "r", encoding="utf-8") as f:
classes = json.load(f)
sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
inp_name = sess.get_inputs()[0].name

Expand Down
12 changes: 9 additions & 3 deletions groundkg/re_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import sys
import json
import os
import threading
import onnxruntime as ort
import numpy as np
from sentence_transformers import SentenceTransformer
Expand All @@ -12,13 +13,17 @@

# Global cache for the sentence transformer model, guarded by a lock so that
# concurrent first calls cannot both construct the (expensive) model.
_embedder_cache = None
_embedder_lock = threading.Lock()


def get_embedder():
    """Get or create the sentence transformer model (cached, thread-safe).

    Returns:
        The process-wide SentenceTransformer instance for MODEL_NAME.
    """
    global _embedder_cache
    if _embedder_cache is None:  # fast path: skip the lock once initialized
        with _embedder_lock:
            # Double-check inside the lock: two threads may both have seen
            # None before either acquired the lock; only one may construct.
            if _embedder_cache is None:
                _embedder_cache = SentenceTransformer(MODEL_NAME)
    return _embedder_cache


Expand Down Expand Up @@ -50,7 +55,8 @@ def main():
embedder = get_embedder()

# Load ONNX model and classes
classes = json.load(open(classes_path, "r", encoding="utf-8"))
with open(classes_path, "r", encoding="utf-8") as f:
classes = json.load(f)
sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
inp_name = sess.get_inputs()[0].name

Expand Down
3 changes: 2 additions & 1 deletion tools/adjust_thresholds.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ def adjust_thresholds(thresholds_path, scored_path, min_edges=10, min_threshold=
"""
# Load current thresholds
if os.path.exists(thresholds_path):
thresholds = json.load(open(thresholds_path, 'r', encoding='utf-8'))
with open(thresholds_path, 'r', encoding='utf-8') as f:
thresholds = json.load(f)
else:
thresholds = {}

Expand Down
6 changes: 4 additions & 2 deletions tools/crawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,12 +79,14 @@ def fetch_and_snapshot(doc_id: str, url: str):
def html_to_text(data: bytes, url: str):
    """Extract a (title, text) pair from raw HTML bytes.

    Uses trafilatura for the main text extraction; falls back to the
    BeautifulSoup plain-text rendering when trafilatura yields nothing.
    Returns the URL as the title when no <title> could be parsed.
    """
    html = data.decode("utf-8", errors="ignore")
    title = ""
    soup = None  # initialized so the fallback below never hits an unbound name
    try:
        soup = BeautifulSoup(html, "html.parser")
        if soup.title and soup.title.string:
            title = soup.title.string.strip()
    except Exception:
        # Best-effort: malformed HTML should not abort text extraction.
        pass
    extracted = trafilatura.extract(html, include_tables=True, url=url) or ""
    text = extracted.strip() if extracted else (soup.get_text(" ", strip=True) if soup is not None else "")
    return title or url, text

def pdf_to_text(data: bytes)->str:
Expand Down
3 changes: 2 additions & 1 deletion tools/promote_from_scored.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

def main():
scored_path, thr_path = sys.argv[1:3]
thresholds = json.load(open(thr_path, 'r', encoding='utf-8'))
with open(thr_path, 'r', encoding='utf-8') as f:
thresholds = json.load(f)
with open(scored_path, 'r', encoding='utf-8') as f:
for line in f:
r = json.loads(line)
Expand Down
3 changes: 2 additions & 1 deletion tools/quality_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ def main():
thresholds = {}
if os.path.exists(thr_path):
try:
thresholds = json.load(open(thr_path,'r',encoding='utf-8'))
with open(thr_path, 'r', encoding='utf-8') as f:
thresholds = json.load(f)
except Exception:
thresholds = {}

Expand Down
4 changes: 3 additions & 1 deletion training/train_re_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import StringTensorType

def load_jsonl(p):
    """Read a JSONL file, returning one parsed object per non-blank line.

    The file is opened via a context manager so the handle is closed
    deterministically rather than left to the garbage collector.
    """
    with open(p, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

def load_data(train_p, dev_p):
tr = load_jsonl(train_p); dv = load_jsonl(dev_p)
Expand Down
53 changes: 35 additions & 18 deletions web/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,15 @@ def get_file_info(path):
if not full_path.exists():
return None
stat = full_path.stat()
lines = 0
if full_path.is_file():
with open(full_path, 'rb') as f:
lines = sum(1 for _ in f)
return {
'exists': True,
'size': stat.st_size,
'modified': stat.st_mtime,
'lines': sum(1 for _ in open(full_path, 'rb')) if full_path.is_file() else 0
'lines': lines
}


Expand Down Expand Up @@ -117,22 +121,33 @@ def run_command(target):
@app.route('/api/status/<target>')
def command_status(target):
    """Get status of a running command.

    Returns JSON: 'not_found' for unknown targets, 'running' (with pid)
    while the process is alive, otherwise 'completed'/'failed' with the
    captured stdout/stderr, or 'error' (HTTP 500) if output collection fails.
    """
    if target not in running_commands:
        return jsonify({'status': 'not_found'})

    # Hold a local reference so a concurrent request removing the entry
    # cannot pull the process out from under us mid-handler.
    process = running_commands[target]
    if process.poll() is None:
        # Still running
        return jsonify({'status': 'running', 'pid': process.pid})

    # Finished - clean up and return results
    try:
        # Remove from running_commands first to prevent duplicate processing
        # by a second poller racing this one.
        running_commands.pop(target, None)
        # Process already exited, so a short timeout only guards pipe drains.
        stdout, stderr = process.communicate(timeout=1)
        return jsonify({
            'status': 'completed' if process.returncode == 0 else 'failed',
            'returncode': process.returncode,
            'stdout': stdout,
            'stderr': stderr
        })
    except Exception as e:
        return jsonify({
            'status': 'error',
            'error': f'Failed to get process output: {str(e)}'
        }), 500


@app.route('/api/files')
Expand Down Expand Up @@ -188,10 +203,12 @@ def get_file(file_key):
try:
with open(path, 'r', encoding='utf-8') as f:
content = ''.join(f.readlines()[:lines])
with open(path, 'rb') as f:
total_lines = sum(1 for _ in f)
return jsonify({
'path': files[file_key],
'content': content,
'total_lines': sum(1 for _ in open(path, 'rb'))
'total_lines': total_lines
})
except Exception as e:
return jsonify({'error': str(e)}), 500
Expand Down