# knowledge-graph-01/Makefile at main · giosilvi/knowledge-graph-01 · GitHub

# (line-number gutter from the web page scrape removed)
# Interpreter used to launch every Python step in the recipes.
PY := python
# Self-training knobs (overridable):
#   POS_THR / NEG_THR  thresholds exported to tools/select_training_from_scored.py
#   MAX_PER_CLASS      exported to the same script as GK_MAX_PER_CLASS
POS_THR ?= 0.95
NEG_THR ?= 0.95
MAX_PER_CLASS ?= 500
# Model name handed to groundkg.ner_tag through --model.
NER_MODEL ?= en_core_web_trf
# Directory for generated artifacts; `make clean` deletes it.
OUT := out

# Every target in this file is a command, not a file it produces. Declaring
# them all phony stops a stray file or directory with the same name (e.g.
# `hash`, `verify`, `demo`) from making Make think the target is up to date.
.PHONY: setup demo demo_legacy model_promote collect clean hash verify \
        crawl manifest clean_pack pack_corpus pack_stats auto_train \
        pipeline quality edges_from_pack lint mine_patterns

# Download the models the recipes rely on: en_core_web_sm (as before) plus
# whatever NER_MODEL is set to, since pack_corpus passes NER_MODEL to
# groundkg.ner_tag. $(sort ...) de-duplicates when both names are the same.
setup:
	@set -e; \
	for m in $(sort en_core_web_sm $(NER_MODEL)); do \
	  echo "→ spacy download $$m"; \
	  $(PY) -m spacy download $$m; \
	done

# Retired entry point: prints a pointer to `make pipeline` and fails.
demo:
	@echo "Deprecated: use 'make pipeline'" && exit 1

# Retired entry point: prints a deprecation notice and fails with status 1.
demo_legacy:
	@echo "Deprecated: model-only pipeline in use"
	@false

# Retired entry point. The notice used to recommend `make demo`, which is
# itself deprecated and always fails; point at `make pipeline` instead.
model_promote:
	@echo "Deprecated: models-only mode is enforced. Use 'make pipeline'." ; exit 1

# Run the training-data builder script.
collect: script := training/mk_training_data.py
collect:
	$(PY) $(script)

# Delete the generated-output directory and everything in it.
# $(RM) is Make's built-in `rm -f`.
clean:
	$(RM) -r $(OUT)

# Print checksums of the exported artifacts. Paths follow $(OUT) so this
# agrees with the targets that write there. Best-effort on purpose
# (`|| true`): a missing file must not fail the build, and no rule in this
# file produces attributes.jsonl.
hash:
	@shasum $(addprefix $(OUT)/,edges.jsonl attributes.jsonl graph.ttl) || true

# Sanity check: every line of $(OUT)/edges.jsonl must parse as JSON.
# Fails with a Python traceback on the first malformed line. Reads from
# $(OUT) so it checks the same directory the other targets write to.
verify:
	@$(PY) -c "import json; [json.loads(l) for l in open('$(OUT)/edges.jsonl')]; print('edges.jsonl OK')"

# Run the crawler script; $@ expands to the target name, i.e. tools/crawl.py.
crawl:
	$(PY) tools/$@.py

# Run the manifest builder; $@ expands to the target name, giving
# tools/build_manifest.py.
manifest:
	$(PY) tools/build_$@.py

# Remove the three pack.* intermediates so pack_corpus, which appends to
# pack.ner.jsonl, starts from empty files.
clean_pack:
	$(RM) $(addprefix $(OUT)/pack.,ner.jsonl candidates.jsonl scored.jsonl)

# Build the corpus pack in three stages:
#   1. NER-tag each data/corpus/*.txt, appending to $(OUT)/pack.ner.jsonl
#   2. generate candidates          -> $(OUT)/pack.candidates.jsonl
#   3. score them with the model    -> $(OUT)/pack.scored.jsonl
# Exits 2 before stage 3 if models/promoter_v1.onnx is missing.
# The loop iterates the shell glob directly (already sorted) rather than
# parsing `ls`, and quotes each path, so file names with whitespace survive.
pack_corpus: clean_pack
	@set -e; \
	mkdir -p $(OUT); \
	for f in data/corpus/*.txt; do \
	  [ -e "$$f" ] || continue; \
	  bn=$$(basename "$$f" .txt); \
	  echo "→ NER $$bn"; \
	  $(PY) -m groundkg.ner_tag "$$f" --doc-id "$$bn" --model $(NER_MODEL) >> $(OUT)/pack.ner.jsonl; \
	done; \
	echo "→ Candidates"; \
	$(PY) -m groundkg.candidates $(OUT)/pack.ner.jsonl > $(OUT)/pack.candidates.jsonl; \
	if [ ! -f models/promoter_v1.onnx ]; then echo "ERROR: train model first"; exit 2; fi; \
	echo "→ RE scoring"; \
	$(PY) -m groundkg.re_score $(OUT)/pack.candidates.jsonl models/promoter_v1.onnx models/classes.json > $(OUT)/pack.scored.jsonl; \
	echo "Done pack_corpus."

# Histogram of the `.pred` field in the scored pack, most frequent first,
# indented by two spaces.
pack_stats:
	@echo "Pred counts in pack:"
	@jq -r '.pred' $(OUT)/pack.scored.jsonl | sort | uniq -c | sort -nr | sed 's/^/  /'

# Self-training step:
#   1. select training examples from the scored pack; thresholds reach the
#      script as environment variables under both plain and GK_-prefixed names
#   2. retrain only if training/re_train.jsonl is non-empty
# Reads the scored pack from $(OUT), where pack_corpus writes it. The retrain
# is chained with `&&` so a failed training run fails the target instead of
# being hidden by the echo after it.
auto_train:
	@echo "Selecting training data from $(OUT)/pack.scored.jsonl (POS_THR=$(POS_THR), NEG_THR=$(NEG_THR), MAX_PER_CLASS=$(MAX_PER_CLASS))"
	@POS_THR=$(POS_THR) NEG_THR=$(NEG_THR) GK_POS_THR=$(POS_THR) GK_NEG_THR=$(NEG_THR) GK_MAX_PER_CLASS=$(MAX_PER_CLASS) \
		$(PY) tools/select_training_from_scored.py $(OUT)/pack.scored.jsonl
	@if [ -s training/re_train.jsonl ]; then \
	  echo "Training examples found → retraining model..."; \
	  $(PY) training/train_re_sklearn.py && \
	  echo "Retrained model from corpus pack."; \
	else \
	  echo "No selections (training/re_train.jsonl empty). Skipping retrain."; \
	fi

# End-to-end: crawl → manifest → pack → auto-train → repack → stats → edges
#
# The steps run as sequential sub-makes rather than as prerequisites because:
#   * Make builds a target at most once per invocation, so naming pack_corpus
#     twice in a prerequisite list would skip the "repack" after auto_train;
#   * prerequisite order is not guaranteed under `make -j`.
# `set -e` stops at the first failing step. Command-line overrides such as
# POS_THR=... reach the sub-makes through MAKEFLAGS.
pipeline:
	@set -e; \
	for step in clean crawl manifest pack_corpus auto_train pack_corpus \
	            pack_stats edges_from_pack verify hash quality; do \
	  $(MAKE) --no-print-directory $$step; \
	done
	@echo "Pipeline complete."

# Print the quality report computed from the scored pack, the promoted edges,
# the selected training set and the thresholds file. Pack and edges paths
# follow $(OUT) so they match where the producing targets write.
quality:
	@echo "Quality indicators:"
	@$(PY) tools/quality_report.py $(OUT)/pack.scored.jsonl $(OUT)/edges.jsonl training/re_train.jsonl models/thresholds.json

# Promote scored candidates to edges, de-duplicate them, then export Turtle.
#   $(OUT)/pack.scored.jsonl -> $(OUT)/edges.jsonl -> $(OUT)/graph.ttl
# The promote output goes through a temp file instead of a pipe: without
# `pipefail` a pipe reports only the last command's status, so a crash in
# promote_from_scored.py would go unnoticed. `set -e` stops at the first
# failing stage so a bad edges.jsonl is never exported.
# NOTE(review): assumes groundkg.dedupe_edges accepts any readable path (it
# was previously given /dev/stdin) — confirm.
edges_from_pack:
	@echo "Promoting pack.scored.jsonl to edges.jsonl using thresholds..."
	@set -e; \
	$(PY) tools/promote_from_scored.py $(OUT)/pack.scored.jsonl models/thresholds.json > $(OUT)/edges.promoted.tmp; \
	$(PY) -m groundkg.dedupe_edges $(OUT)/edges.promoted.tmp > $(OUT)/edges.jsonl; \
	rm -f $(OUT)/edges.promoted.tmp; \
	$(PY) -m groundkg.export_ttl $(OUT)/edges.jsonl > $(OUT)/graph.ttl

# Advisory dead-code checks: Ruff auto-fixes unused imports/variables
# (F401, F841) and Vulture reports unused definitions. Both are followed by
# `|| true` on purpose, so findings never fail the build.
# printf replaces `echo "\n..."`, whose backslash handling differs between
# shells (bash prints a literal \n).
lint:
	@echo "Running Ruff (unused imports/vars)..."
	@ruff check groundkg tools training --select F401,F841 --fix || true
	@printf '\n%s\n' "Running Vulture (unused defs)..."
	@vulture groundkg tools training vulture_whitelist.py --min-confidence 80 --exclude out,data,models,.venv,__pycache__ || true

# Show the first 50 lines of the surface-pattern report for the scored pack
# (--min-count 3, --min-prob 0.9) and for the raw candidates (--min-count 10).
# printf replaces `echo "\n..."`, whose backslash handling differs between
# shells (bash prints a literal \n).
mine_patterns:
	@echo "Top surface patterns from scored (high-conf):"
	@$(PY) tools/mine_patterns.py --scored $(OUT)/pack.scored.jsonl --min-count 3 --min-prob 0.9 | head -50
	@printf '\n%s\n' "Top surface patterns from raw candidates:"
	@$(PY) tools/mine_patterns.py --candidates $(OUT)/pack.candidates.jsonl --min-count 10 | head -50