From 69df807ec80c5c1f71466fdc8e1818225a76c603 Mon Sep 17 00:00:00 2001
From: Giorgio Silvi <41969868+giosilvi@users.noreply.github.com>
Date: Fri, 14 Nov 2025 14:26:59 +0100
Subject: [PATCH 1/2] Add tests for event pipeline coverage

---
 tests/test_event_pipeline.py | 314 +++++++++++++++++++++++++++++++++++
 1 file changed, 314 insertions(+)
 create mode 100644 tests/test_event_pipeline.py

diff --git a/tests/test_event_pipeline.py b/tests/test_event_pipeline.py
new file mode 100644
index 0000000..67f58cd
--- /dev/null
+++ b/tests/test_event_pipeline.py
@@ -0,0 +1,314 @@
+import json
+import sys
+import types
+
+try:  # pragma: no cover - exercised when numpy is available
+    import numpy as np
+except ModuleNotFoundError:  # pragma: no cover - provides lightweight stub for CI
+    fake_np = types.ModuleType("numpy")
+    import builtins
+
+    class FakeArray:
+        def __init__(self, data, dtype=None, shape=None):
+            self.data = data
+            self.dtype = dtype
+            if shape is not None:
+                self.shape = shape
+            elif isinstance(data, list):
+                if data and isinstance(data[0], list):
+                    self.shape = (len(data), len(data[0]))
+                else:
+                    self.shape = (len(data),)
+            else:
+                self.shape = ()
+
+        def reshape(self, *shape):
+            if len(shape) == 2:
+                rows, cols = shape
+                flat = self.flatten().data
+                if rows == 1 and cols == -1:
+                    return FakeArray([flat], self.dtype, (1, len(flat)))
+                if cols == -1 and rows > 0:
+                    cols = len(flat) // rows if rows else len(flat)
+                new_data = [flat[i * cols : (i + 1) * cols] for i in range(rows)]
+                return FakeArray(new_data, self.dtype, (rows, cols))
+            if len(shape) == 1 and shape[0] == -1:
+                flat = self.flatten().data
+                return FakeArray(flat, self.dtype, (len(flat),))
+            return FakeArray(self.data, self.dtype, self.shape)
+
+        def astype(self, dtype):
+            return FakeArray(self.data, dtype, self.shape)
+
+        def flatten(self):
+            if self.shape and len(self.shape) == 2:
+                flat = [item for row in self.data for item in row]
+            elif isinstance(self.data, list):
+                flat = list(self.data)
+            else:
+                flat = [self.data]
+            return FakeArray(flat, self.dtype, (len(flat),))
+
+        def __getitem__(self, idx):
+            val = self.data[idx]
+            if isinstance(val, list):
+                return FakeArray(val, self.dtype)
+            return val
+
+        def __len__(self):
+            return len(self.data)
+
+        def __setitem__(self, idx, value):
+            if isinstance(idx, tuple) and len(idx) == 2:
+                row, col = idx
+                self.data[row][col] = value
+            else:
+                self.data[idx] = value
+
+    def _array(data, dtype=None):
+        return FakeArray(data, dtype)
+
+    def _zeros(shape, dtype=float):
+        if isinstance(shape, tuple):
+            if len(shape) == 2:
+                rows, cols = shape
+                data = [[dtype(0) for _ in range(cols)] for _ in range(rows)]
+                return FakeArray(data, dtype, shape)
+            if len(shape) == 1:
+                data = [dtype(0)] * shape[0]
+                return FakeArray(data, dtype, (shape[0],))
+        data = [dtype(0)] * shape
+        return FakeArray(data, dtype, (shape,))
+
+    def _arange(n, dtype=float):
+        data = [dtype(i) for i in range(n)]
+        return FakeArray(data, dtype, (n,))
+
+    def _asarray(data, dtype=None):
+        if isinstance(data, FakeArray):
+            return data
+        if isinstance(data, list) and data and isinstance(data[0], list):
+            return FakeArray(data, dtype, (len(data), len(data[0])))
+        if isinstance(data, list):
+            return FakeArray(data, dtype, (len(data),))
+        return FakeArray([data], dtype, (1,))
+
+    def _argmax(array_like):
+        if isinstance(array_like, FakeArray):
+            data = array_like.data
+            if array_like.shape and len(array_like.shape) > 1:
+                data = data[0]
+        else:
+            data = array_like
+        max_idx = 0
+        max_val = float("-inf")
+        for idx, val in enumerate(data):
+            try:
+                numeric = float(val)
+            except (TypeError, ValueError):
+                numeric = 0.0
+            if numeric > max_val:
+                max_val = numeric
+                max_idx = idx
+        return max_idx
+
+    fake_np.array = _array
+    fake_np.zeros = _zeros
+    fake_np.arange = _arange
+    fake_np.asarray = _asarray
+    fake_np.argmax = _argmax
+    fake_np.float32 = float
+    fake_np.float64 = float
+
+    builtins.fake_np = fake_np
+
+    sys.modules["numpy"] = fake_np
+    import numpy as np  # type: ignore
+
+from groundkg import event_extract, events_to_edges, re_score
+
+
+def test_event_extract_main_generates_events(tmp_path, monkeypatch):  # end-to-end run of event_extract.main() on a temporary manifest
+    manifest = tmp_path / "manifest.jsonl"
+    events_out = tmp_path / "events.jsonl"
+
+    records = [  # five one-sentence documents; the assertions below expect five event types among the output
+        {
+            "doc_id": "doc1",
+            "text": "MegaCorp acquired StartUp for $5 million on Jan 2, 2022.",
+        },
+        {
+            "doc_id": "doc2",
+            "text": "Bright Future secured $3M from Big VC on Feb 5, 2021.",
+        },
+        {
+            "doc_id": "doc3",
+            "text": "Tech Corp launched HyperWidget on 2023.",
+        },
+        {
+            "doc_id": "doc4",
+            "source_org": "ACME",  # only this record supplies source_org
+            "text": "ACME appointed Jane Doe as CTO on Mar 3, 2020.",
+        },
+        {
+            "doc_id": "doc5",
+            "text": "John Smith founded Future Labs in 2019.",
+        },
+    ]
+    manifest.write_text("\n".join(json.dumps(r) for r in records) + "\n", encoding="utf-8")  # JSONL: one record per line
+
+    monkeypatch.setattr(  # main() is called without arguments, so the CLI flags are supplied through sys.argv
+        sys,
+        "argv",
+        [
+            "event_extract",
+            "--manifest",
+            str(manifest),
+            "--out",
+            str(events_out),
+        ],
+    )
+
+    event_extract.main()
+
+    lines = [json.loads(line) for line in events_out.read_text(encoding="utf-8").splitlines()]
+
+    types = {line["type"] for line in lines}
+    expected_types = {"Acquisition", "Funding", "Launch", "Appointment", "Founding"}
+    assert expected_types.issubset(types)
+
+    acq = next(line for line in lines if line["type"] == "Acquisition")
+    assert acq["roles"].get("acquirer") == "MegaCorp"
+    assert acq["roles"].get("target", "").startswith("StartUp")  # prefix match tolerates trailing text in the extracted target
+    assert acq["amount_text"] == "$5 million"
+    assert acq["date_text"] == "Jan 2, 2022"
+
+    funding = next(line for line in lines if line["type"] == "Funding")  # first Funding event in output order
+    assert funding["roles"]["recipient"] == "Bright Future"
+
+    assert any(ev["roles"].get("actor") == "ACME" for ev in lines if ev["type"] == "Appointment")  # at least one Appointment names ACME as actor
+
+    founding = next(line for line in lines if line["type"] == "Founding")
+    assert "founder_or_actor" in founding["roles"]  # only the presence of the role key is checked, not its value
+
+
+def test_events_to_edges_main(tmp_path, monkeypatch):  # runs events_to_edges.main() on a single-event JSONL file
+    events_file = tmp_path / "events.jsonl"
+    edges_out = tmp_path / "edges.jsonl"
+
+    event_record = {
+        "event_id": "E1",
+        "type": "Acquisition",
+        "trigger": "acquired",
+        "date_text": "Jan 2, 2022",
+        "amount_text": "$5 million",
+        "roles": {"acquirer": "MegaCorp", "target": "StartUp", "empty": ""},  # "empty" has a blank value; no edge is expected for it
+        "confidence": 0.75,
+        "source": "doc1#s",
+    }
+    events_file.write_text(json.dumps(event_record) + "\n", encoding="utf-8")
+
+    monkeypatch.setattr(  # main() is called without arguments, so the CLI flags are supplied through sys.argv
+        sys,
+        "argv",
+        ["events_to_edges", "--events", str(events_file), "--out", str(edges_out)],
+    )
+
+    events_to_edges.main()
+
+    edges = [json.loads(line) for line in edges_out.read_text(encoding="utf-8").splitlines()]  # output is JSONL, one edge per line
+    subjects = {edge["subject"] for edge in edges}
+    assert subjects == {"event:E1"}  # every edge shares the subject "event:E1"
+    predicates = {edge["predicate"] for edge in edges}
+    assert predicates == {"type", "trigger", "date", "amount", "acquirer", "target"}  # exact set: "empty", "confidence" and "source" must not appear
+
+
+def test_re_score_mark_orders_entities():  # the subject span comes after the object span in this text
+    text = "Object met Subject"
+    subject = {"start": 11, "end": 18}  # character span of "Subject"
+    obj = {"start": 0, "end": 6}  # character span of "Object"
+    marked = re_score.mark(text, subject, obj)
+    assert marked.startswith("[E1]Object[/E1] met [E2]Subject[/E2]")  # expected tags follow text order: the earlier span gets E1
+
+
+def test_re_score_main_batches(tmp_path, monkeypatch, capsys):  # runs re_score.main() with the embedder and ONNX session replaced by dummies
+    cand_path = tmp_path / "candidates.jsonl"
+    onnx_path = tmp_path / "model.onnx"
+    classes_path = tmp_path / "classes.json"
+
+    num_candidates = 33  # NOTE(review): presumably chosen to span more than one batch - confirm against re_score's batch size
+    candidates = []
+    for i in range(num_candidates):
+        candidates.append(
+            {
+                "doc_id": f"doc{i}",
+                "sent_start": i,
+                "text": f"Sentence {i}",
+                "subject": {"start": 0, "end": 7},
+                "object": {"start": 9, "end": 12},
+            }
+        )
+    cand_path.write_text("\n".join(json.dumps(c) for c in candidates) + "\n", encoding="utf-8")  # JSONL: one candidate per line
+    onnx_path.write_text("placeholder", encoding="utf-8")  # placeholder content; InferenceSession is swapped for DummySession below
+    classes_path.write_text(json.dumps(["NEG", "POS"]), encoding="utf-8")  # class labels the final assertion checks predictions against
+
+    class DummyEmbedder:  # stand-in for the object returned by re_score.get_embedder()
+        def __init__(self):
+            self.calls = []
+
+        def encode(self, texts, show_progress_bar=False, convert_to_numpy=True):
+            self.calls.append(list(texts))  # record each batch of texts that was passed in
+            batch = np.arange(len(texts) * re_score.EMBEDDING_DIM, dtype=np.float32)
+            return batch.reshape(len(texts), re_score.EMBEDDING_DIM)  # one row of EMBEDDING_DIM values per text
+
+    class DummyInput:  # element type of DummySession.get_inputs()
+        def __init__(self):
+            self.name = "input"
+            self.shape = [None, re_score.EMBEDDING_DIM]
+
+    class DummyOutputInfo:  # element type of DummySession.get_outputs()
+        def __init__(self, name):
+            self.name = name
+            self.shape = [None, 2]
+            self.type = "tensor(float)"
+
+    class DummySession:  # patched in place of re_score.ort.InferenceSession below
+        def __init__(self, path, providers):
+            self.path = path
+            self.providers = providers
+            self.calls = 0
+
+        def get_inputs(self):
+            return [DummyInput()]
+
+        def get_outputs(self):
+            return [DummyOutputInfo("label"), DummyOutputInfo("prob")]
+
+        def run(self, _, feeds):
+            self.calls += 1
+            probs = np.zeros((1, 2), dtype=np.float32)
+            probs[0, self.calls % 2] = 0.8
+            return [np.array(["label"], dtype=object), probs]
+
+    dummy_embedder = DummyEmbedder()
+    monkeypatch.setattr(re_score, "get_embedder", lambda: dummy_embedder)
+    monkeypatch.setattr(re_score.ort, "InferenceSession", DummySession)
+
+    monkeypatch.setattr(  # positional CLI arguments: candidates file, model file, classes file
+        sys,
+        "argv",
+        [
+            "re_score",
+            str(cand_path),
+            str(onnx_path),
+            str(classes_path),
+        ],
+    )
+
+    re_score.main()
+
+    captured = capsys.readouterr()  # the assertions below read main()'s stdout as one JSON record per line
+    lines = [json.loads(line) for line in captured.out.splitlines()]
+    assert len(lines) == num_candidates  # one output record per input candidate
+    assert {rec["pred"] for rec in lines} <= {"NEG", "POS"}  # every prediction must be one of the class labels
+    assert dummy_embedder.calls  # ensure embeddings were requested

From 135dff6c29f40cbf5cf8c413498e3cc49d5ef354 Mon Sep 17 00:00:00 2001
From: Giorgio Silvi <41969868+giosilvi@users.noreply.github.com>
Date: Fri, 14 Nov 2025 15:07:32 +0100
Subject: [PATCH 2/2] Remove numpy stubs from relation pipeline tests

---
 tests/test_event_pipeline.py | 134 ++---------------------------------
 tests/test_re_modules.py     | 118 ++----------------------------
 2 files changed, 13 insertions(+), 239 deletions(-)

diff --git a/tests/test_event_pipeline.py b/tests/test_event_pipeline.py
index 67f58cd..7fe25cd 100644
--- a/tests/test_event_pipeline.py
+++ b/tests/test_event_pipeline.py
@@ -1,129 +1,9 @@
 import json
 import sys
-import types
-
-try:  # pragma: no cover - exercised when numpy is available
-    import numpy as np
-except ModuleNotFoundError:  # pragma: no cover - provides lightweight stub for CI
-    fake_np = types.ModuleType("numpy")
-    import builtins
-
-    class FakeArray:
-        def __init__(self, data, dtype=None, shape=None):
-            self.data = data
-            self.dtype = dtype
-            if shape is not None:
-                self.shape = shape
-            elif isinstance(data, list):
-                if data and isinstance(data[0], list):
-                    self.shape = (len(data), len(data[0]))
-                else:
-                    self.shape = (len(data),)
-            else:
-                self.shape = ()
-
-        def reshape(self, *shape):
-            if len(shape) == 2:
-                rows, cols = shape
-                flat = self.flatten().data
-                if rows == 1 and cols == -1:
-                    return FakeArray([flat], self.dtype, (1, len(flat)))
-                if cols == -1 and rows > 0:
-                    cols = len(flat) // rows if rows else len(flat)
-                new_data = [flat[i * cols : (i + 1) * cols] for i in range(rows)]
-                return FakeArray(new_data, self.dtype, (rows, cols))
-            if len(shape) == 1 and shape[0] == -1:
-                flat = self.flatten().data
-                return FakeArray(flat, self.dtype, (len(flat),))
-            return FakeArray(self.data, self.dtype, self.shape)
-
-        def astype(self, dtype):
-            return FakeArray(self.data, dtype, self.shape)
-
-        def flatten(self):
-            if self.shape and len(self.shape) == 2:
-                flat = [item for row in self.data for item in row]
-            elif isinstance(self.data, list):
-                flat = list(self.data)
-            else:
-                flat = [self.data]
-            return FakeArray(flat, self.dtype, (len(flat),))
-
-        def __getitem__(self, idx):
-            val = self.data[idx]
-            if isinstance(val, list):
-                return FakeArray(val, self.dtype)
-            return val
-
-        def __len__(self):
-            return len(self.data)
-
-        def __setitem__(self, idx, value):
-            if isinstance(idx, tuple) and len(idx) == 2:
-                row, col = idx
-                self.data[row][col] = value
-            else:
-                self.data[idx] = value
-
-    def _array(data, dtype=None):
-        return FakeArray(data, dtype)
-
-    def _zeros(shape, dtype=float):
-        if isinstance(shape, tuple):
-            if len(shape) == 2:
-                rows, cols = shape
-                data = [[dtype(0) for _ in range(cols)] for _ in range(rows)]
-                return FakeArray(data, dtype, shape)
-            if len(shape) == 1:
-                data = [dtype(0)] * shape[0]
-                return FakeArray(data, dtype, (shape[0],))
-        data = [dtype(0)] * shape
-        return FakeArray(data, dtype, (shape,))
-
-    def _arange(n, dtype=float):
-        data = [dtype(i) for i in range(n)]
-        return FakeArray(data, dtype, (n,))
-
-    def _asarray(data, dtype=None):
-        if isinstance(data, FakeArray):
-            return data
-        if isinstance(data, list) and data and isinstance(data[0], list):
-            return FakeArray(data, dtype, (len(data), len(data[0])))
-        if isinstance(data, list):
-            return FakeArray(data, dtype, (len(data),))
-        return FakeArray([data], dtype, (1,))
-
-    def _argmax(array_like):
-        if isinstance(array_like, FakeArray):
-            data = array_like.data
-            if array_like.shape and len(array_like.shape) > 1:
-                data = data[0]
-        else:
-            data = array_like
-        max_idx = 0
-        max_val = float("-inf")
-        for idx, val in enumerate(data):
-            try:
-                numeric = float(val)
-            except (TypeError, ValueError):
-                numeric = 0.0
-            if numeric > max_val:
-                max_val = numeric
-                max_idx = idx
-        return max_idx
-
-    fake_np.array = _array
-    fake_np.zeros = _zeros
-    fake_np.arange = _arange
-    fake_np.asarray = _asarray
-    fake_np.argmax = _argmax
-    fake_np.float32 = float
-    fake_np.float64 = float
-
-    builtins.fake_np = fake_np
-
-    sys.modules["numpy"] = fake_np
-    import numpy as np  # type: ignore
+
+import pytest
+
+np = pytest.importorskip("numpy")
 
 from groundkg import event_extract, events_to_edges, re_score
 
@@ -173,9 +53,9 @@ def test_event_extract_main_generates_events(tmp_path, monkeypatch):
 
     lines = [json.loads(line) for line in events_out.read_text(encoding="utf-8").splitlines()]
 
-    types = {line["type"] for line in lines}
+    event_types = {line["type"] for line in lines}
     expected_types = {"Acquisition", "Funding", "Launch", "Appointment", "Founding"}
-    assert expected_types.issubset(types)
+    assert expected_types.issubset(event_types)
 
     acq = next(line for line in lines if line["type"] == "Acquisition")
     assert acq["roles"].get("acquirer") == "MegaCorp"
@@ -288,7 +168,7 @@ def run(self, _, feeds):
             self.calls += 1
             probs = np.zeros((1, 2), dtype=np.float32)
             probs[0, self.calls % 2] = 0.8
-            return [np.array(["label"], dtype=object), probs]
+            return [np.array(["label"]), probs]
 
     dummy_embedder = DummyEmbedder()
     monkeypatch.setattr(re_score, "get_embedder", lambda: dummy_embedder)
diff --git a/tests/test_re_modules.py b/tests/test_re_modules.py
index 7a42503..befaac4 100644
--- a/tests/test_re_modules.py
+++ b/tests/test_re_modules.py
@@ -5,113 +5,7 @@
 
 import pytest
 
-if "numpy" not in sys.modules:
-    fake_np = types.ModuleType("numpy")
-
-    class FakeArray:
-        def __init__(self, data, dtype=None):
-            self.data = data
-            self.dtype = dtype
-            if isinstance(data, list) and len(data) > 0 and isinstance(data[0], list):
-                self.shape = (len(data), len(data[0]))
-            elif isinstance(data, list):
-                self.shape = (len(data),)
-            else:
-                self.shape = ()
-
-        def reshape(self, *shape):
-            # Simple reshape - just return a new FakeArray with new shape
-            flat = self._flatten()
-            if len(shape) == 1:
-                if isinstance(shape[0], tuple):
-                    new_shape = shape[0]
-                else:
-                    # Handle reshape(1, -1) case
-                    if shape[0] == 1:
-                        return FakeArray([flat], self.dtype)
-                    new_shape = shape[0]
-            elif len(shape) == 2:
-                # Handle reshape(1, -1) case
-                if shape[0] == 1:
-                    return FakeArray([flat], self.dtype)
-                new_shape = shape
-            else:
-                new_shape = shape
-            return FakeArray(flat, self.dtype)
-
-        def _flatten(self):
-            result = []
-            for item in self.data:
-                if isinstance(item, list):
-                    result.extend(item)
-                else:
-                    result.append(item)
-            return result
-
-        def astype(self, dtype):
-            return FakeArray(self.data, dtype)
-
-        def flatten(self):
-            return FakeArray(self._flatten(), self.dtype)
-
-        def __getitem__(self, idx):
-            item = self.data[idx]
-            # If item is a list, wrap it in FakeArray for proper method access
-            if isinstance(item, list):
-                return FakeArray(item, self.dtype)
-            return item
-
-        def __setitem__(self, idx, value):
-            if isinstance(idx, tuple):
-                # Handle 2D indexing like probs[0][1] = 0.9
-                self.data[idx[0]][idx[1]] = value
-            else:
-                self.data[idx] = value
-
-        def __len__(self):
-            return len(self.data)
-
-        def __iter__(self):
-            for item in self.data:
-                if isinstance(item, list):
-                    yield FakeArray(item, self.dtype)
-                else:
-                    yield item
-
-    def array(data, dtype=None):
-        return FakeArray(data, dtype)
-
-    def argmax(seq):
-        if hasattr(seq, '__len__') and len(seq) > 0:
-            if hasattr(seq[0], '__len__'):
-                # 2D array, get argmax of first row
-                return max(range(len(seq[0])), key=lambda i: seq[0][i])
-            return max(range(len(seq)), key=lambda i: seq[i])
-        return 0
-
-    def zeros(shape, dtype=float):
-        if isinstance(shape, tuple) and len(shape) == 2:
-            rows, cols = shape
-            return FakeArray([[dtype(0) for _ in range(cols)] for _ in range(rows)], dtype)
-        elif isinstance(shape, tuple) and len(shape) == 1:
-            return FakeArray([dtype(0)] * shape[0], dtype)
-        else:
-            rows, cols = shape
-            return FakeArray([[dtype(0) for _ in range(cols)] for _ in range(rows)], dtype)
-
-    def asarray(data, dtype=None):
-        if isinstance(data, FakeArray):
-            return data
-        return FakeArray(data, dtype)
-
-    fake_np.array = array
-    fake_np.argmax = argmax
-    fake_np.zeros = zeros
-    fake_np.asarray = asarray
-    fake_np.isscalar = lambda value: isinstance(value, (int, float))
-    fake_np.float32 = float
-    fake_np.float64 = float
-    sys.modules["numpy"] = fake_np
+np = pytest.importorskip("numpy")
 
 from groundkg import re_infer, re_score
 
@@ -160,10 +54,10 @@ def get_outputs(self):
 
         def run(self, _outputs, feeds):
             assert isinstance(feeds, dict)
-            probs = fake_np.zeros((1, len(classes)), dtype=float)
+            probs = np.zeros((1, len(classes)), dtype=float)
             probs[0][1] = 0.9
-            # Return FakeArray objects to match ONNX output format
-            return [fake_np.array(["uses"]), probs]
+            # Return numpy arrays to match ONNX output format
+            return [np.array(["uses"]), probs]
 
     # Mock embedder to return fake embeddings (384 dims for all-MiniLM-L6-v2)
     class FakeEmbedder:
@@ -171,7 +65,7 @@ def encode(self, texts, show_progress_bar=False, convert_to_numpy=True):
             # Return fake embeddings: one per text, each 384 dimensions
             if isinstance(texts, str):
                 texts = [texts]
-            return fake_np.array([[0.1] * 384 for _ in texts])
+            return np.array([[0.1] * 384 for _ in texts], dtype=float)
 
     monkeypatch.setattr(re_score, "get_embedder", lambda: FakeEmbedder())
     monkeypatch.setattr(re_score.ort, "InferenceSession", lambda *a, **k: FakeSession())
@@ -273,7 +167,7 @@ def get_inputs(self):
             return [FakeInput()]
 
         def run(self, *_args, **_kwargs):
-            probs = fake_np.zeros((1, len(classes)), dtype=float)
+            probs = np.zeros((1, len(classes)), dtype=float)
             probs[0][uses_idx] = 0.92
             labels = ["uses"]
             return [labels, probs]