# Makefile.gk — GroundKG extraction pipeline (crawl → NER → candidates → score → edges → TTL)
# --- Tools ---
# Simple (:=) assignment throughout: values are fixed strings expanded once,
# which is predictable and avoids re-expansion on every reference.
PY := python

# --- Directory layout ---
OUT := out
MODELS := models
TRAIN := training
DATA := data

# --- Inputs ---
SEED_CSV := $(DATA)/seed.csv
META := $(DATA)/meta.jsonl
CORPUS_DIR := $(DATA)/corpus

# --- Pipeline artifacts (in build order) ---
NER := $(OUT)/pack.ner.jsonl
CAND := $(OUT)/pack.candidates.jsonl
SCORED := $(OUT)/pack.scored.jsonl
EDGES := $(OUT)/edges.jsonl
DEDUPED := $(OUT)/edges.dedup.jsonl
TTL := $(OUT)/graph.ttl
PATTERNS := $(OUT)/patterns.jsonl

# --- Model files ---
ONNX := $(MODELS)/promoter_v1.onnx
CLASSES := $(MODELS)/classes.json
THR := $(MODELS)/thresholds.json

# --- Training data ---
TRAIN_TR := $(TRAIN)/re_train.jsonl
TRAIN_DV := $(TRAIN)/re_dev.jsonl
SEED_JSON := $(TRAIN)/seed.jsonl
# Several recipes build their targets via shell redirection (`>`): delete a
# half-written output when its recipe fails so it can't look up to date.
.DELETE_ON_ERROR:

# `train_tfidf` (not `train`) is the actual deprecated training target below.
.PHONY: all pipeline coldstart crawl manifest ner cand score patterns autoselect train_tfidf rescore infer edges ttl report clean

# Default goal: one full pass with whatever model is already present.
all: crawl manifest ner cand score infer edges ttl report
# End-to-end driver.  Stages run through recursive $(MAKE) invocations rather
# than plain prerequisites so they execute strictly in sequence (prerequisite
# order is not guaranteed under `make -j`): bootstrap+train if no model yet,
# then score → autoselect (retrain) → forced rescore → promote/export.
pipeline: ## Recommended workflow: bootstrap → train → score → autoselect → rescore → edges → report
	@# Check if model exists, if not bootstrap and train
	@if [ ! -f $(ONNX) ]; then \
		echo "No model found, bootstrapping seeds and training initial model..."; \
		$(MAKE) -f Makefile.gk bootstrap coldstart; \
	fi
	@# Score with current model
	$(MAKE) -f Makefile.gk score
	@# Autoselect (retrain with better data)
	$(MAKE) -f Makefile.gk autoselect
	@# Rescore with new model (force by removing old scored file)
	@rm -f $(SCORED)
	$(MAKE) -f Makefile.gk score
	@# Infer, edges, ttl, report
	$(MAKE) -f Makefile.gk infer edges ttl report
	@echo ""
	@echo "Pipeline complete! Check out/edges.dedup.jsonl and out/graph.ttl"
# One-time bootstrap: split the hand-made seed JSONL 80/20 into train/dev
# (random.seed(0) → the split is deterministic; at least 1 example always
# lands in train via max(1, ...)), then train the first transformer model.
# The split is an inline `python -c` one-liner; note the `\\n` escapes are
# doubled so the shell hands a literal `\n` to Python.
coldstart: ## one-time: use seed to create train/dev and train the first model
	@[ -f $(SEED_JSON) ] || (echo "Missing $(SEED_JSON) – add a few seed examples (see plan)." && exit 2)
	@mkdir -p $(OUT) $(MODELS)
# bootstrap train/dev from seed: 80/20 split
	$(PY) -c "import json,random,os,sys; random.seed(0); seed='$(SEED_JSON)'; out_tr='$(TRAIN_TR)'; out_dv='$(TRAIN_DV)'; os.makedirs('$(TRAIN)', exist_ok=True); rows=[json.loads(l) for l in open(seed,'r',encoding='utf-8') if l.strip()]; random.shuffle(rows); n=max(1,int(0.8*len(rows))); open(out_tr,'w',encoding='utf-8').write('\\n'.join(json.dumps(r,ensure_ascii=False) for r in rows[:n])+'\\n'); open(out_dv,'w',encoding='utf-8').write('\\n'.join(json.dumps(r,ensure_ascii=False) for r in rows[n:])+'\\n'); print(f'Bootstrapped {n} train / {len(rows)-n} dev from seed')"
	$(PY) training/train_re_transformers.py
# Deprecated TF-IDF training path; kept only for comparison/back-compat.
# Prefer `coldstart` (sentence-transformer training).
train_tfidf: ## deprecated: train using TF-IDF (use coldstart for sentence transformers)
	@echo "WARNING: train_tfidf is deprecated. Use 'coldstart' for sentence transformer training."
	$(PY) training/train_re_sklearn.py
# Fetch the corpus described by the seed CSV.  Fails fast (exit 2) with a
# usage hint when the seed file is missing.
crawl:
	@if [ ! -f $(SEED_CSV) ]; then \
		echo "Provide $(SEED_CSV) with columns: doc_id,url,license,lang"; \
		exit 2; \
	fi
	$(PY) tools/crawl.py
# Build the corpus manifest (delegates entirely to tools/build_manifest.py).
manifest:
	$(PY) tools/build_manifest.py
# Run NER over every corpus file, concatenating per-file JSONL into $(NER).
# The output is rebuilt from scratch each run (rm -f) because the loop appends.
# `set -e` makes the whole recipe fail on the first bad file — previously a
# mid-loop failure was swallowed because only the last iteration's status
# counted.  The existence check also catches an unmatched glob, which would
# otherwise pass the literal pattern to the tagger.
ner:
	@mkdir -p $(OUT)
# concatenate NER over all corpus files
	@rm -f $(NER)
	@set -e; \
	for f in $(CORPUS_DIR)/*.txt; do \
		[ -e "$$f" ] || { echo "No corpus files found in $(CORPUS_DIR)"; exit 2; }; \
		echo "Processing: $$f"; \
		$(PY) -m groundkg.ner_tag "$$f" --doc-id "$$(basename $$f .txt)" >> $(NER); \
	done
	@echo "Wrote $(NER)"
# Generate relation candidates from the NER output.
# Written atomically (tmp file + mv) so a failing run never leaves a
# truncated-but-newer $(CAND) that downstream targets would trust.
cand:
	@echo "Reading: $(NER)"
	$(PY) groundkg/candidates.py $(NER) > $(CAND).tmp && mv -f $(CAND).tmp $(CAND)
	@echo "Wrote: $(CAND)"
# Score candidate pairs with the current ONNX model.
# NOTE: the hint in the error message uses single quotes deliberately —
# backticks inside a double-quoted shell string are command substitution,
# so the previous message actually *ran* `make -f Makefile.gk coldstart`
# while printing the error.
score: ## requires $(ONNX)
	@[ -f $(ONNX) ] || (echo "Missing $(ONNX). Run 'make -f Makefile.gk coldstart' first."; exit 2)
	$(PY) groundkg/re_score.py $(CAND) $(ONNX) $(CLASSES) > $(SCORED)
# Mine high-confidence surface patterns from the scored pairs.
# Best-effort by design: `|| true` keeps the pipeline going when mining
# fails (e.g. not enough scored data yet to meet min-count/min-prob).
patterns:
	$(PY) tools/mine_patterns.py --scored $(SCORED) --min-count 3 --min-prob 0.9 --json > $(PATTERNS) || true
# Rebuild train/dev from the current scored output (plus patterns and
# candidates, per the selection tool) and retrain the model on it.
autoselect: ## build richer train/dev from scored + patterns + candidates
	$(PY) tools/select_training_from_scored.py
	$(PY) training/train_re_transformers.py
# Re-score candidates after retraining.  Guards against a missing model the
# same way `score` does (previously this failed deep inside re_score.py),
# and removes the stale scored file first to force regeneration.
rescore: ## after retraining model, rescore with the new ONNX
	@[ -f $(ONNX) ] || (echo "Missing $(ONNX). Run 'make -f Makefile.gk coldstart' first."; exit 2)
	@rm -f $(SCORED) # Force rescore after retraining
	$(PY) groundkg/re_score.py $(CAND) $(ONNX) $(CLASSES) > $(SCORED)
# Promote scored pairs to edges using per-class thresholds.
# GK_MIN_THRESHOLD (env, default 0.60) is the floor passed to the adjuster.
# Two passes: a best-effort threshold adjustment up front (`|| true`), then —
# only if the first promotion emitted an empty $(EDGES) — a mandatory
# re-adjustment and a retry of the promotion.
infer: ## promote to edges using thresholds
	@# Check if we need to adjust thresholds (if no edges would be emitted)
	@MIN_THR=$${GK_MIN_THRESHOLD:-0.60}; \
	$(PY) tools/adjust_thresholds.py $(SCORED) $(THR) 10 $$MIN_THR || true
	$(PY) tools/promote_from_scored.py $(SCORED) $(THR) > $(EDGES)
	@# Verify edges were emitted, if not, adjust thresholds and retry
	@if [ ! -s $(EDGES) ]; then \
		echo "No edges emitted, adjusting thresholds (min=$${GK_MIN_THRESHOLD:-0.60})..."; \
		MIN_THR=$${GK_MIN_THRESHOLD:-0.60}; \
		$(PY) tools/adjust_thresholds.py $(SCORED) $(THR) 10 $$MIN_THR; \
		$(PY) tools/promote_from_scored.py $(SCORED) $(THR) > $(EDGES); \
	fi
# Deduplicate promoted edges.
edges:
	$(PY) groundkg/dedupe_edges.py $(EDGES) > $(DEDUPED)

# Export the deduplicated edges as Turtle/RDF.
ttl:
	$(PY) groundkg/export_ttl.py $(DEDUPED) > $(TTL)

# Print a quality summary over scored output, deduped edges, training
# data and the active thresholds.
report:
	$(PY) tools/quality_report.py $(SCORED) $(DEDUPED) $(TRAIN_TR) $(THR)
# Remove all generated artifacts (outputs, models, derived train/dev files).
# Source data under $(DATA) and hand-written seeds are left untouched.
clean:
	rm -rf $(OUT) $(MODELS) $(TRAIN)/re_*.jsonl
# --- Automatic seed bootstrap (deterministic, high-precision) ---
.PHONY: bootstrap_seed bootstrap

# Derive seed examples from the candidate file.  Uses the $(CAND) and
# $(SEED_JSON) variables (which expand to the same paths the rule used to
# hard-code) so a layout change in one place propagates here.
bootstrap_seed:
	@$(PY) tools/bootstrap_seed_from_candidates.py $(CAND) $(SEED_JSON)

# One-shot helper to build seeds from scratch (no model needed)
bootstrap: ner cand bootstrap_seed
	@echo "Bootstrapped seeds to $(SEED_JSON)"
# Optional future steps (disabled by default):
# @$(PY) tools/bootstrap_seed_dep.py --enable || true
# @$(PY) tools/bootstrap_seed_kb.py --kb config/kb.json || true
# @$(PY) tools/bootstrap_seed_llm.py --enable || true
# --- Events pipeline (independent of the core graph) ---
.PHONY: manifest_jsonl
# Phony alias so the declared `manifest_jsonl` name actually builds something
# (previously .PHONY named a target that did not exist).
manifest_jsonl: $(OUT)/manifest.jsonl

# Build a simple JSONL manifest from corpus texts for the events extractor.
# The inline Python is a single expression chain — the previous version put a
# `def` after a `;`, which is a guaranteed SyntaxError in `python -c`.
# chr(10) is used for the record separator to sidestep make/shell backslash
# escaping entirely; sorted() makes the output order deterministic.
$(OUT)/manifest.jsonl: $(CORPUS_DIR)/*.txt
	@mkdir -p $(OUT)
	@$(PY) -c "import glob,json,os,io; paths=sorted(glob.glob('$(CORPUS_DIR)/*.txt')); w=io.open('$@','w',encoding='utf-8'); [w.write(json.dumps({'doc_id':os.path.splitext(os.path.basename(p))[0],'text':io.open(p,'r',encoding='utf-8').read()},ensure_ascii=False)+chr(10)) for p in paths]; w.close(); print('Wrote $@ with %d docs' % len(paths))"
.PHONY: events event_edges ttl_events merge_edges ttl_merged

# Each phony name is an alias for a real file target; the file rules use
# automatic variables ($< first prereq, $@ target, $^ all prereqs) instead of
# repeating path names, and $(PY) instead of a bare `python` for consistency
# with the rest of this Makefile ($(PY) expands to `python`, so behavior is
# unchanged but the interpreter is now overridable: `make PY=python3 events`).

# Extract events from the manifest.
events: out/events.jsonl
out/events.jsonl: out/manifest.jsonl
	$(PY) -m groundkg.event_extract --manifest $< --out $@

# Convert events to edge records.
event_edges: out/edges.events.jsonl
out/edges.events.jsonl: out/events.jsonl
	$(PY) -m groundkg.events_to_edges --events $< --out $@

# Export event edges as Turtle/RDF.
ttl_events: graph.events.ttl
graph.events.ttl: out/edges.events.jsonl
	$(PY) -m groundkg.export_ttl $< > $@

# Optional: merge core + events into one edge file then export once
merge_edges: out/edges.merged.jsonl
out/edges.merged.jsonl: out/edges.jsonl out/edges.events.jsonl
	cat $^ > $@

ttl_merged: graph.merged.ttl
graph.merged.ttl: out/edges.merged.jsonl
	$(PY) -m groundkg.export_ttl $< > $@