Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# CI workflow: run the pytest suite on every push to main and on every pull
# request, then enforce a minimum trace-coverage threshold.
name: CI

on:
  push:
    branches: [ main ]
  pull_request:

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          # NOTE(review): file is named "requirement.txt" (singular) — confirm
          # this matches the repository; the conventional name is requirements.txt.
          pip install -r requirement.txt
          pip install pytest
      - name: Run pytest
        run: pytest
      - name: Ensure coverage threshold
        run: python tools/run_trace_coverage.py --min 80
26 changes: 26 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Shared pytest configuration.

Makes the repository root importable and installs lightweight stand-ins for
heavy optional dependencies (onnxruntime, spacy) so modules that import them
can still be loaded during tests. The stubs raise immediately when used, so
individual tests must monkeypatch the real entry points they exercise.
"""
import sys
import types
from pathlib import Path

# Ensure the repository root is importable as a package during tests
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

# Provide a lightweight onnxruntime stub so modules can be imported in tests
if "onnxruntime" not in sys.modules:
    class _StubSession:
        # Using the stub without monkeypatching is a test bug; fail loudly.
        def __init__(self, *args, **kwargs):
            raise RuntimeError(
                "onnxruntime is stubbed in tests. Monkeypatch InferenceSession in individual tests."
            )

    sys.modules["onnxruntime"] = types.SimpleNamespace(InferenceSession=_StubSession)

# Same treatment for spacy: importable, but unusable unless monkeypatched.
if "spacy" not in sys.modules:
    def _stub_load(*_args, **_kwargs):
        raise RuntimeError(
            "spacy is stubbed in tests. Monkeypatch spacy.load within individual tests."
        )

    sys.modules["spacy"] = types.SimpleNamespace(load=_stub_load)
134 changes: 134 additions & 0 deletions tests/test_candidates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import io
import json

from groundkg import candidates


def test_non_overlapping_chunks_filters_overlaps():
    """Noun chunks overlapping a named-entity span are excluded from output."""
    sent = "Alice and Bob met Charlie"
    ents = [
        {"text": "Alice", "start": 0, "end": 5, "label": "PERSON"},
        {"text": "Charlie", "start": 18, "end": 25, "label": "PERSON"},
    ]

    chunks = candidates.non_overlapping_chunks(sent, ents)

    # "Alice" and "Charlie" should be excluded because they overlap entities
    chunk_texts = {c["text"] for c in chunks}
    assert chunk_texts == {"Bob"}
    # ensure chunks carry the expected metadata
    for chunk in chunks:
        assert chunk["label"] == "NOUNPHRASE"
        assert chunk["end"] - chunk["start"] >= 3


def test_main_emits_subject_object_pairs(tmp_path, monkeypatch):
    """main() reads a NER jsonl file and emits subject/object candidate pairs."""
    record = {
        "doc_id": "d1",
        "sent_idx": 0,
        "sent_start": 0,
        "text": "Alice visited Paris with Charlie",
        "entities": [
            {"text": "Alice", "start": 0, "end": 5, "label": "PERSON"},
            {"text": "Paris", "start": 13, "end": 18, "label": "GPE"},
        ],
    }
    ner_path = tmp_path / "ner.jsonl"
    ner_path.write_text(json.dumps(record) + "\n", encoding="utf-8")

    # NOTE(review): setting PYTHONHASHSEED after interpreter start does not
    # affect hashing in the current process; kept for parity with the
    # original intent — confirm whether it can be removed.
    monkeypatch.setenv("PYTHONHASHSEED", "0")  # ensure deterministic iteration if needed
    monkeypatch.setattr("sys.argv", ["candidates.py", str(ner_path)])

    # Capture the jsonl records main() prints to stdout.
    buf = io.StringIO()
    monkeypatch.setattr("sys.stdout", buf)

    candidates.main()

    lines = [json.loads(line) for line in buf.getvalue().splitlines() if line]
    assert any(
        rec["subject"]["text"] == "Alice" and rec["object"]["text"] == "Paris"
        for rec in lines
    )
    for rec in lines:
        assert rec["doc_id"] == "d1"


def test_main_respects_char_distance_limit(tmp_path, monkeypatch):
    """Entity pairs separated by too many characters produce no candidates."""
    # 151 spaces puts "Paris" beyond the pairing distance cap.
    long_text = "Alice" + " " * 151 + "Paris"
    record = {
        "doc_id": "d1",
        "sent_idx": 0,
        "sent_start": 0,
        "text": long_text,
        "entities": [
            {"text": "Alice", "start": 0, "end": 5, "label": "PERSON"},
            {"text": "Paris", "start": 156, "end": 161, "label": "GPE"},
        ],
    }
    ner_path = tmp_path / "ner.jsonl"
    ner_path.write_text(json.dumps(record) + "\n", encoding="utf-8")

    buf = io.StringIO()
    monkeypatch.setattr("sys.stdout", buf)
    monkeypatch.setattr("sys.argv", ["candidates.py", str(ner_path)])

    candidates.main()

    # No output at all: the lone pair exceeded the distance limit.
    assert buf.getvalue().strip() == ""


def test_main_caps_pairs_at_limit(tmp_path, monkeypatch):
    """At most MAX_PAIRS_PER_SENT candidate pairs are emitted per sentence."""
    # One subject plus 12 objects yields more raw pairs than the cap allows.
    tokens = ["Alice"] + [f"Obj{i}" for i in range(12)]
    text = " ".join(tokens)

    # Build entity spans whose offsets match the space-joined text exactly.
    ents = []
    cursor = 0
    for token in tokens:
        start = cursor
        end = start + len(token)
        label = "PERSON" if token == "Alice" else "PRODUCT"
        ents.append({"text": token, "start": start, "end": end, "label": label})
        cursor = end + 1  # account for spaces

    record = {
        "doc_id": "d1",
        "sent_idx": 0,
        "sent_start": 0,
        "text": text,
        "entities": ents,
    }
    ner_path = tmp_path / "ner.jsonl"
    ner_path.write_text(json.dumps(record) + "\n", encoding="utf-8")

    buf = io.StringIO()
    monkeypatch.setattr("sys.stdout", buf)
    monkeypatch.setattr("sys.argv", ["candidates.py", str(ner_path)])

    candidates.main()

    lines = [line for line in buf.getvalue().splitlines() if line]
    assert len(lines) == candidates.MAX_PAIRS_PER_SENT


def test_main_falls_back_to_chunks_without_entities(tmp_path, monkeypatch):
    """With no entities, main() derives candidates from noun chunks instead."""
    record = {
        "doc_id": "d2",
        "sent_idx": 0,
        "sent_start": 0,
        "text": "Solar Panel helps Bright Homes",
        "entities": [],
    }
    ner_path = tmp_path / "ner.jsonl"
    ner_path.write_text(json.dumps(record) + "\n", encoding="utf-8")

    buf = io.StringIO()
    monkeypatch.setattr("sys.stdout", buf)
    monkeypatch.setattr("sys.argv", ["candidates.py", str(ner_path)])

    candidates.main()

    lines = [json.loads(line) for line in buf.getvalue().splitlines() if line]
    assert lines, "expected chunk-derived candidates"
    # Every fallback candidate should be labelled as a noun phrase.
    for rec in lines:
        assert rec["subject"]["label"] == "NOUNPHRASE"
        assert rec["object"]["label"] == "NOUNPHRASE"
9 changes: 9 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import io
import runpy


def test_cli_entrypoint_prints_message(monkeypatch):
    """Running groundkg.cli as __main__ prints the extract_open usage hint."""
    buf = io.StringIO()
    monkeypatch.setattr("sys.stdout", buf)
    runpy.run_module("groundkg.cli", run_name="__main__")
    assert "Use `python -m groundkg.extract_open" in buf.getvalue()
69 changes: 69 additions & 0 deletions tests/test_dedupe_edges.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import io
import json

from groundkg import dedupe_edges


def test_key_normalizes_fields():
    """key() normalizes subject/object (case and surrounding whitespace)."""
    edge = {
        "subject": " Alice ",
        "predicate": "uses",
        "object": " Gadget ",
        "evidence": {"quote": "Alice uses the gadget."},
    }
    assert dedupe_edges.key(edge) == ("alice", "uses", "gadget", "Alice uses the gadget.")


def test_main_filters_duplicates(tmp_path, monkeypatch):
    """Exact duplicate edges collapse to a single output line."""
    edge = {
        "subject": "Alice",
        "predicate": "uses",
        "object": "Gadget",
        "evidence": {"quote": "Alice uses the gadget."},
    }
    # Write the same edge twice so main() has something to dedupe.
    dup_path = tmp_path / "edges.jsonl"
    dup_path.write_text("\n".join(json.dumps(e) for e in (edge, edge)) + "\n", encoding="utf-8")

    buf = io.StringIO()
    monkeypatch.setattr("sys.argv", ["dedupe_edges.py", str(dup_path)])
    monkeypatch.setattr("sys.stdout", buf)

    dedupe_edges.main()

    lines = buf.getvalue().splitlines()
    assert len(lines) == 1
    # The surviving line is the edge unchanged, not a normalized copy.
    assert json.loads(lines[0]) == edge


def test_key_handles_missing_evidence_quote():
    """An edge without evidence falls back to an empty quote in its key."""
    edge = {"subject": "Alice", "predicate": "uses", "object": "Gadget"}
    assert dedupe_edges.key(edge) == ("alice", "uses", "gadget", "")


def test_main_dedupes_whitespace_only_quotes(tmp_path, monkeypatch):
    """Edges differing only in case/whitespace dedupe to one record."""
    edges = [
        {
            "subject": "Alice",
            "predicate": "uses",
            "object": "Gadget",
            "evidence": {"quote": " Alice uses the gadget. "},
        },
        {
            "subject": "alice ",
            "predicate": "uses",
            "object": "gadget",
            "evidence": {"quote": "Alice uses the gadget."},
        },
    ]
    dup_path = tmp_path / "edges.jsonl"
    dup_path.write_text("\n".join(json.dumps(e) for e in edges) + "\n", encoding="utf-8")

    buf = io.StringIO()
    monkeypatch.setattr("sys.argv", ["dedupe_edges.py", str(dup_path)])
    monkeypatch.setattr("sys.stdout", buf)

    dedupe_edges.main()

    lines = [json.loads(line) for line in buf.getvalue().splitlines() if line]
    assert len(lines) == 1
    assert lines[0]["subject"].strip().lower() == "alice"
70 changes: 70 additions & 0 deletions tests/test_export_ttl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import io
import json

from groundkg import export_ttl


def test_iri_sanitizes_text():
    """Commas, spaces and slashes are replaced when building an IRI local name."""
    assert export_ttl.iri("node", "Acme, Inc./R&D") == "ex:node/Acme_Inc._R&D"


def test_emit_edge_triple_builds_expected_turtle():
    """emit_edge_triple returns the Turtle statement plus the subject IRI."""
    triple, subj = export_ttl.emit_edge_triple({"subject": "Alice", "predicate": "uses", "object": "Gadget"})
    assert triple == "ex:node/Alice ex:uses ex:node/Gadget .\n"
    assert subj == "ex:node/Alice"


def test_emit_attr_triples_formats_values():
    """Attribute dicts render typed Turtle literals under ex:hasAttribute."""
    attr = {
        "name": "Battery Life",
        "valueNumber": 12,
        "unit": "hours",
        "valueBoolean": True,
        "valueString": "High capacity",
        "time": "2023-05-01",
        "evidence": {"char_start": 42},
    }
    rendered = export_ttl.emit_attr_triples(attr, "ex:node/Alice")
    assert "ex:hasAttribute" in rendered
    assert "ex:valueNumber 12" in rendered          # numbers stay unquoted
    assert "ex:unit \"hours\"" in rendered          # strings are quoted
    assert "ex:valueBoolean true" in rendered       # booleans become xsd-style lowercase
    assert "ex:valueString \"High capacity\"" in rendered
    assert rendered.endswith(" .\n")


def test_main_reads_edges_and_attributes(tmp_path, monkeypatch):
    """main() exports edge triples and picks up sibling attributes.jsonl."""
    edges_path = tmp_path / "edges.jsonl"
    attrs_path = tmp_path / "attributes.jsonl"
    edge = {"subject": "Alice", "predicate": "uses", "object": "Gadget"}
    edges_path.write_text(json.dumps(edge) + "\n", encoding="utf-8")
    attrs_path.write_text(json.dumps({"name": "Battery", "valueNumber": 3}) + "\n", encoding="utf-8")

    # Only the edges path is passed on argv; presumably main() derives the
    # attributes file from the edges path — TODO confirm against export_ttl.
    buf = io.StringIO()
    monkeypatch.setattr("sys.argv", ["export_ttl.py", str(edges_path)])
    monkeypatch.setattr("sys.stdout", buf)

    export_ttl.main()

    output = buf.getvalue()
    assert output.startswith(export_ttl.PREFIX)
    assert "ex:node/Alice ex:uses ex:node/Gadget" in output
    assert "ex:hasAttribute" in output


def test_main_ignores_malformed_attribute_lines(tmp_path, monkeypatch):
    """Malformed attribute JSON lines are skipped without aborting the export."""
    edges_path = tmp_path / "edges.jsonl"
    attrs_path = tmp_path / "attributes.jsonl"
    edge = {"subject": "Alice", "predicate": "uses", "object": "Gadget"}
    edges_path.write_text(json.dumps(edge) + "\n", encoding="utf-8")
    attrs_path.write_text("{" + "\n", encoding="utf-8")  # malformed JSON

    buf = io.StringIO()
    monkeypatch.setattr("sys.argv", ["export_ttl.py", str(edges_path)])
    monkeypatch.setattr("sys.stdout", buf)

    export_ttl.main()

    output = buf.getvalue()
    # Edge triples still render; the broken attribute line contributes nothing.
    assert "ex:node/Alice ex:uses ex:node/Gadget" in output
    assert "ex:hasAttribute" not in output
Loading