Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# CI workflow: run the pytest suite on every push to main and on every pull
# request, then enforce a minimum trace-coverage threshold.
name: CI

on:
  push:
    branches: [ main ]
  pull_request:

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          # NOTE(review): file is named "requirement.txt" (singular) — confirm
          # this matches the repository; the conventional name is requirements.txt.
          pip install -r requirement.txt
          pip install pytest
      - name: Run pytest
        run: pytest
      - name: Ensure coverage threshold
        run: python tools/run_trace_coverage.py --min 80
26 changes: 26 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Shared pytest configuration.

Makes the repository root importable and installs lightweight stand-ins for
heavy optional dependencies (onnxruntime, spacy) so modules that import them
can still be loaded during tests. The stubs raise immediately when used, so
individual tests must monkeypatch the real entry points they exercise.
"""
import sys
import types
from pathlib import Path

# Ensure the repository root is importable as a package during tests
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

# Provide a lightweight onnxruntime stub so modules can be imported in tests
if "onnxruntime" not in sys.modules:
    class _StubSession:
        # Using the stub without monkeypatching is a test bug; fail loudly.
        def __init__(self, *args, **kwargs):
            raise RuntimeError(
                "onnxruntime is stubbed in tests. Monkeypatch InferenceSession in individual tests."
            )

    sys.modules["onnxruntime"] = types.SimpleNamespace(InferenceSession=_StubSession)

# Same treatment for spacy: importable, but unusable unless monkeypatched.
if "spacy" not in sys.modules:
    def _stub_load(*_args, **_kwargs):
        raise RuntimeError(
            "spacy is stubbed in tests. Monkeypatch spacy.load within individual tests."
        )

    sys.modules["spacy"] = types.SimpleNamespace(load=_stub_load)
134 changes: 134 additions & 0 deletions tests/test_candidates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import io
import json

from groundkg import candidates


def test_non_overlapping_chunks_filters_overlaps():
    """Noun chunks overlapping a named-entity span are excluded from output."""
    sent = "Alice and Bob met Charlie"
    ents = [
        {"text": "Alice", "start": 0, "end": 5, "label": "PERSON"},
        {"text": "Charlie", "start": 18, "end": 25, "label": "PERSON"},
    ]

    chunks = candidates.non_overlapping_chunks(sent, ents)

    # "Alice" and "Charlie" should be excluded because they overlap entities
    chunk_texts = {c["text"] for c in chunks}
    assert chunk_texts == {"Bob"}
    # ensure chunks carry the expected metadata
    for chunk in chunks:
        assert chunk["label"] == "NOUNPHRASE"
        assert chunk["end"] - chunk["start"] >= 3


def test_main_emits_subject_object_pairs(tmp_path, monkeypatch):
    """main() reads a NER jsonl file and emits subject/object candidate pairs."""
    record = {
        "doc_id": "d1",
        "sent_idx": 0,
        "sent_start": 0,
        "text": "Alice visited Paris with Charlie",
        "entities": [
            {"text": "Alice", "start": 0, "end": 5, "label": "PERSON"},
            {"text": "Paris", "start": 13, "end": 18, "label": "GPE"},
        ],
    }
    ner_path = tmp_path / "ner.jsonl"
    ner_path.write_text(json.dumps(record) + "\n", encoding="utf-8")

    # NOTE(review): setting PYTHONHASHSEED after interpreter start does not
    # affect hashing in the current process; kept for parity with the
    # original intent — confirm whether it can be removed.
    monkeypatch.setenv("PYTHONHASHSEED", "0")  # ensure deterministic iteration if needed
    monkeypatch.setattr("sys.argv", ["candidates.py", str(ner_path)])

    # Capture the jsonl records main() prints to stdout.
    buf = io.StringIO()
    monkeypatch.setattr("sys.stdout", buf)

    candidates.main()

    lines = [json.loads(line) for line in buf.getvalue().splitlines() if line]
    assert any(
        rec["subject"]["text"] == "Alice" and rec["object"]["text"] == "Paris"
        for rec in lines
    )
    for rec in lines:
        assert rec["doc_id"] == "d1"


def test_main_respects_char_distance_limit(tmp_path, monkeypatch):
    """Entity pairs separated by too many characters produce no candidates."""
    # 151 spaces puts "Paris" beyond the pairing distance cap.
    long_text = "Alice" + " " * 151 + "Paris"
    record = {
        "doc_id": "d1",
        "sent_idx": 0,
        "sent_start": 0,
        "text": long_text,
        "entities": [
            {"text": "Alice", "start": 0, "end": 5, "label": "PERSON"},
            {"text": "Paris", "start": 156, "end": 161, "label": "GPE"},
        ],
    }
    ner_path = tmp_path / "ner.jsonl"
    ner_path.write_text(json.dumps(record) + "\n", encoding="utf-8")

    buf = io.StringIO()
    monkeypatch.setattr("sys.stdout", buf)
    monkeypatch.setattr("sys.argv", ["candidates.py", str(ner_path)])

    candidates.main()

    # No output at all: the lone pair exceeded the distance limit.
    assert buf.getvalue().strip() == ""


def test_main_caps_pairs_at_limit(tmp_path, monkeypatch):
    """At most MAX_PAIRS_PER_SENT candidate pairs are emitted per sentence."""
    # One subject plus 12 objects yields more raw pairs than the cap allows.
    tokens = ["Alice"] + [f"Obj{i}" for i in range(12)]
    text = " ".join(tokens)

    # Build entity spans whose offsets match the space-joined text exactly.
    ents = []
    cursor = 0
    for token in tokens:
        start = cursor
        end = start + len(token)
        label = "PERSON" if token == "Alice" else "PRODUCT"
        ents.append({"text": token, "start": start, "end": end, "label": label})
        cursor = end + 1  # account for spaces

    record = {
        "doc_id": "d1",
        "sent_idx": 0,
        "sent_start": 0,
        "text": text,
        "entities": ents,
    }
    ner_path = tmp_path / "ner.jsonl"
    ner_path.write_text(json.dumps(record) + "\n", encoding="utf-8")

    buf = io.StringIO()
    monkeypatch.setattr("sys.stdout", buf)
    monkeypatch.setattr("sys.argv", ["candidates.py", str(ner_path)])

    candidates.main()

    lines = [line for line in buf.getvalue().splitlines() if line]
    assert len(lines) == candidates.MAX_PAIRS_PER_SENT


def test_main_falls_back_to_chunks_without_entities(tmp_path, monkeypatch):
    """With no entities, main() derives candidates from noun chunks instead."""
    record = {
        "doc_id": "d2",
        "sent_idx": 0,
        "sent_start": 0,
        "text": "Solar Panel helps Bright Homes",
        "entities": [],
    }
    ner_path = tmp_path / "ner.jsonl"
    ner_path.write_text(json.dumps(record) + "\n", encoding="utf-8")

    buf = io.StringIO()
    monkeypatch.setattr("sys.stdout", buf)
    monkeypatch.setattr("sys.argv", ["candidates.py", str(ner_path)])

    candidates.main()

    lines = [json.loads(line) for line in buf.getvalue().splitlines() if line]
    assert lines, "expected chunk-derived candidates"
    # Every fallback candidate should be labelled as a noun phrase.
    for rec in lines:
        assert rec["subject"]["label"] == "NOUNPHRASE"
        assert rec["object"]["label"] == "NOUNPHRASE"
9 changes: 9 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import io
import runpy


def test_cli_entrypoint_prints_message(monkeypatch):
    """Running groundkg.cli as __main__ prints the extract_open usage hint."""
    buf = io.StringIO()
    monkeypatch.setattr("sys.stdout", buf)
    runpy.run_module("groundkg.cli", run_name="__main__")
    assert "Use `python -m groundkg.extract_open" in buf.getvalue()
69 changes: 69 additions & 0 deletions tests/test_dedupe_edges.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import io
import json

from groundkg import dedupe_edges


def test_key_normalizes_fields():
    """key() normalizes subject/object (case and surrounding whitespace)."""
    edge = {
        "subject": " Alice ",
        "predicate": "uses",
        "object": " Gadget ",
        "evidence": {"quote": "Alice uses the gadget."},
    }
    assert dedupe_edges.key(edge) == ("alice", "uses", "gadget", "Alice uses the gadget.")


def test_main_filters_duplicates(tmp_path, monkeypatch):
    """Exact duplicate edges collapse to a single output line."""
    edge = {
        "subject": "Alice",
        "predicate": "uses",
        "object": "Gadget",
        "evidence": {"quote": "Alice uses the gadget."},
    }
    # Write the same edge twice so main() has something to dedupe.
    dup_path = tmp_path / "edges.jsonl"
    dup_path.write_text("\n".join(json.dumps(e) for e in (edge, edge)) + "\n", encoding="utf-8")

    buf = io.StringIO()
    monkeypatch.setattr("sys.argv", ["dedupe_edges.py", str(dup_path)])
    monkeypatch.setattr("sys.stdout", buf)

    dedupe_edges.main()

    lines = buf.getvalue().splitlines()
    assert len(lines) == 1
    # The surviving line is the edge unchanged, not a normalized copy.
    assert json.loads(lines[0]) == edge


def test_key_handles_missing_evidence_quote():
    """An edge without evidence falls back to an empty quote in its key."""
    edge = {"subject": "Alice", "predicate": "uses", "object": "Gadget"}
    assert dedupe_edges.key(edge) == ("alice", "uses", "gadget", "")


def test_main_dedupes_whitespace_only_quotes(tmp_path, monkeypatch):
    """Edges differing only in case/whitespace dedupe to one record."""
    edges = [
        {
            "subject": "Alice",
            "predicate": "uses",
            "object": "Gadget",
            "evidence": {"quote": " Alice uses the gadget. "},
        },
        {
            "subject": "alice ",
            "predicate": "uses",
            "object": "gadget",
            "evidence": {"quote": "Alice uses the gadget."},
        },
    ]
    dup_path = tmp_path / "edges.jsonl"
    dup_path.write_text("\n".join(json.dumps(e) for e in edges) + "\n", encoding="utf-8")

    buf = io.StringIO()
    monkeypatch.setattr("sys.argv", ["dedupe_edges.py", str(dup_path)])
    monkeypatch.setattr("sys.stdout", buf)

    dedupe_edges.main()

    lines = [json.loads(line) for line in buf.getvalue().splitlines() if line]
    assert len(lines) == 1
    assert lines[0]["subject"].strip().lower() == "alice"
70 changes: 70 additions & 0 deletions tests/test_export_ttl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import io
import json

from groundkg import export_ttl


def test_iri_sanitizes_text():
    """Commas, spaces and slashes are replaced when building an IRI local name."""
    assert export_ttl.iri("node", "Acme, Inc./R&D") == "ex:node/Acme_Inc._R&D"


def test_emit_edge_triple_builds_expected_turtle():
    """emit_edge_triple returns the Turtle statement plus the subject IRI."""
    triple, subj = export_ttl.emit_edge_triple({"subject": "Alice", "predicate": "uses", "object": "Gadget"})
    assert triple == "ex:node/Alice ex:uses ex:node/Gadget .\n"
    assert subj == "ex:node/Alice"


def test_emit_attr_triples_formats_values():
    """Attribute dicts render typed Turtle literals under ex:hasAttribute."""
    attr = {
        "name": "Battery Life",
        "valueNumber": 12,
        "unit": "hours",
        "valueBoolean": True,
        "valueString": "High capacity",
        "time": "2023-05-01",
        "evidence": {"char_start": 42},
    }
    rendered = export_ttl.emit_attr_triples(attr, "ex:node/Alice")
    assert "ex:hasAttribute" in rendered
    assert "ex:valueNumber 12" in rendered          # numbers stay unquoted
    assert "ex:unit \"hours\"" in rendered          # strings are quoted
    assert "ex:valueBoolean true" in rendered       # booleans become xsd-style lowercase
    assert "ex:valueString \"High capacity\"" in rendered
    assert rendered.endswith(" .\n")


def test_main_reads_edges_and_attributes(tmp_path, monkeypatch):
    """main() exports edge triples and picks up sibling attributes.jsonl."""
    edges_path = tmp_path / "edges.jsonl"
    attrs_path = tmp_path / "attributes.jsonl"
    edge = {"subject": "Alice", "predicate": "uses", "object": "Gadget"}
    edges_path.write_text(json.dumps(edge) + "\n", encoding="utf-8")
    attrs_path.write_text(json.dumps({"name": "Battery", "valueNumber": 3}) + "\n", encoding="utf-8")

    # Only the edges path is passed on argv; presumably main() derives the
    # attributes file from the edges path — TODO confirm against export_ttl.
    buf = io.StringIO()
    monkeypatch.setattr("sys.argv", ["export_ttl.py", str(edges_path)])
    monkeypatch.setattr("sys.stdout", buf)

    export_ttl.main()

    output = buf.getvalue()
    assert output.startswith(export_ttl.PREFIX)
    assert "ex:node/Alice ex:uses ex:node/Gadget" in output
    assert "ex:hasAttribute" in output


def test_main_ignores_malformed_attribute_lines(tmp_path, monkeypatch):
    """Malformed attribute JSON lines are skipped without aborting the export."""
    edges_path = tmp_path / "edges.jsonl"
    attrs_path = tmp_path / "attributes.jsonl"
    edge = {"subject": "Alice", "predicate": "uses", "object": "Gadget"}
    edges_path.write_text(json.dumps(edge) + "\n", encoding="utf-8")
    attrs_path.write_text("{" + "\n", encoding="utf-8")  # malformed JSON

    buf = io.StringIO()
    monkeypatch.setattr("sys.argv", ["export_ttl.py", str(edges_path)])
    monkeypatch.setattr("sys.stdout", buf)

    export_ttl.main()

    output = buf.getvalue()
    # Edge triples still render; the broken attribute line contributes nothing.
    assert "ex:node/Alice ex:uses ex:node/Gadget" in output
    assert "ex:hasAttribute" not in output
Loading