From 5ce08527523f7b411846e49147826034fe5af708 Mon Sep 17 00:00:00 2001 From: KamiCreed <9517086+KamiCreed@users.noreply.github.com> Date: Mon, 23 Feb 2026 18:13:51 -0800 Subject: [PATCH 01/35] Add dvc and uv projects --- training/object-detection/.dvc/.gitignore | 3 +++ training/object-detection/.dvc/config | 0 training/object-detection/.dvcignore | 3 +++ training/object-detection/.python-version | 1 + training/object-detection/README.md | 0 training/object-detection/main.py | 6 ++++++ training/object-detection/pyproject.toml | 7 +++++++ training/object-detection/uv.lock | 8 ++++++++ 8 files changed, 28 insertions(+) create mode 100644 training/object-detection/.dvc/.gitignore create mode 100644 training/object-detection/.dvc/config create mode 100644 training/object-detection/.dvcignore create mode 100644 training/object-detection/.python-version create mode 100644 training/object-detection/README.md create mode 100644 training/object-detection/main.py create mode 100644 training/object-detection/pyproject.toml create mode 100644 training/object-detection/uv.lock diff --git a/training/object-detection/.dvc/.gitignore b/training/object-detection/.dvc/.gitignore new file mode 100644 index 0000000..528f30c --- /dev/null +++ b/training/object-detection/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/training/object-detection/.dvc/config b/training/object-detection/.dvc/config new file mode 100644 index 0000000..e69de29 diff --git a/training/object-detection/.dvcignore b/training/object-detection/.dvcignore new file mode 100644 index 0000000..5197305 --- /dev/null +++ b/training/object-detection/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. 
Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/training/object-detection/.python-version b/training/object-detection/.python-version new file mode 100644 index 0000000..cc1923a --- /dev/null +++ b/training/object-detection/.python-version @@ -0,0 +1 @@ +3.8 diff --git a/training/object-detection/README.md b/training/object-detection/README.md new file mode 100644 index 0000000..e69de29 diff --git a/training/object-detection/main.py b/training/object-detection/main.py new file mode 100644 index 0000000..f376d2d --- /dev/null +++ b/training/object-detection/main.py @@ -0,0 +1,6 @@ +def main(): + print("Hello from object-detection!") + + +if __name__ == "__main__": + main() diff --git a/training/object-detection/pyproject.toml b/training/object-detection/pyproject.toml new file mode 100644 index 0000000..94a0e56 --- /dev/null +++ b/training/object-detection/pyproject.toml @@ -0,0 +1,7 @@ +[project] +name = "object-detection" +version = "0.1.0" +description = "Add your description here" +readme = "README.md" +requires-python = ">=3.8" +dependencies = [] diff --git a/training/object-detection/uv.lock b/training/object-detection/uv.lock new file mode 100644 index 0000000..7ff0862 --- /dev/null +++ b/training/object-detection/uv.lock @@ -0,0 +1,8 @@ +version = 1 +revision = 3 +requires-python = ">=3.8" + +[[package]] +name = "object-detection" +version = "0.1.0" +source = { virtual = "." 
} From 6e885483523cf4b92e8c6b3269954ae0b225a691 Mon Sep 17 00:00:00 2001 From: KamiCreed <9517086+KamiCreed@users.noreply.github.com> Date: Wed, 25 Feb 2026 22:59:59 -0800 Subject: [PATCH 02/35] Add raw JSON data from API --- training/object-detection/.gitignore | 1 + training/object-detection/01_raw.dvc | 6 ++++++ 2 files changed, 7 insertions(+) create mode 100644 training/object-detection/.gitignore create mode 100644 training/object-detection/01_raw.dvc diff --git a/training/object-detection/.gitignore b/training/object-detection/.gitignore new file mode 100644 index 0000000..391d5f2 --- /dev/null +++ b/training/object-detection/.gitignore @@ -0,0 +1 @@ +/01_raw diff --git a/training/object-detection/01_raw.dvc b/training/object-detection/01_raw.dvc new file mode 100644 index 0000000..bfe7af9 --- /dev/null +++ b/training/object-detection/01_raw.dvc @@ -0,0 +1,6 @@ +outs: +- md5: 843268e6b8108d3bf563defd9acacbfd.dir + size: 382874010 + nfiles: 51 + hash: md5 + path: 01_raw From 2ece96f4e5050cf8adc2e392fcb8c4fd1750dffa Mon Sep 17 00:00:00 2001 From: KamiCreed <9517086+KamiCreed@users.noreply.github.com> Date: Wed, 25 Feb 2026 23:09:23 -0800 Subject: [PATCH 03/35] Fix filepaths of raw data --- training/object-detection/01_raw.dvc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/object-detection/01_raw.dvc b/training/object-detection/01_raw.dvc index bfe7af9..a08b222 100644 --- a/training/object-detection/01_raw.dvc +++ b/training/object-detection/01_raw.dvc @@ -1,5 +1,5 @@ outs: -- md5: 843268e6b8108d3bf563defd9acacbfd.dir +- md5: 3c7812ff2347c3cf7140ceb4e459bec2.dir size: 382874010 nfiles: 51 hash: md5 From c40ab5c28065533e0747507d16fe1883cd34ad70 Mon Sep 17 00:00:00 2001 From: KamiCreed <9517086+KamiCreed@users.noreply.github.com> Date: Wed, 25 Feb 2026 23:11:45 -0800 Subject: [PATCH 04/35] Update raw data from S3 --- training/object-detection/01_raw.dvc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/training/object-detection/01_raw.dvc b/training/object-detection/01_raw.dvc index a08b222..231a350 100644 --- a/training/object-detection/01_raw.dvc +++ b/training/object-detection/01_raw.dvc @@ -1,6 +1,6 @@ outs: -- md5: 3c7812ff2347c3cf7140ceb4e459bec2.dir - size: 382874010 - nfiles: 51 +- md5: 3c9068cdaacbf2d8934c76a2e98c945d.dir + size: 372865559 + nfiles: 50 hash: md5 path: 01_raw From b3ef3cee9c5d2912b55dbba35cd3b4bc32b6c87f Mon Sep 17 00:00:00 2001 From: KamiCreed <9517086+KamiCreed@users.noreply.github.com> Date: Wed, 25 Feb 2026 23:34:46 -0800 Subject: [PATCH 05/35] Add update JSON pipeline step --- training/object-detection/01_raw.dvc | 6 ------ training/object-detection/dvc.lock | 11 +++++++++++ training/object-detection/dvc.yaml | 12 ++++++++++++ 3 files changed, 23 insertions(+), 6 deletions(-) delete mode 100644 training/object-detection/01_raw.dvc create mode 100644 training/object-detection/dvc.lock create mode 100644 training/object-detection/dvc.yaml diff --git a/training/object-detection/01_raw.dvc b/training/object-detection/01_raw.dvc deleted file mode 100644 index 231a350..0000000 --- a/training/object-detection/01_raw.dvc +++ /dev/null @@ -1,6 +0,0 @@ -outs: -- md5: 3c9068cdaacbf2d8934c76a2e98c945d.dir - size: 372865559 - nfiles: 50 - hash: md5 - path: 01_raw diff --git a/training/object-detection/dvc.lock b/training/object-detection/dvc.lock new file mode 100644 index 0000000..38b1ae5 --- /dev/null +++ b/training/object-detection/dvc.lock @@ -0,0 +1,11 @@ +schema: '2.0' +stages: + update_raw: + cmd: rclone sync -P --filter "- /salm_dataset*/**" --filter "+ *.json" + --filter "- *.zip" aws:salmonvision-ml-datasets/rgb/model_input/ 01_raw + outs: + - path: 01_raw + hash: md5 + md5: 3c9068cdaacbf2d8934c76a2e98c945d.dir + size: 372865559 + nfiles: 50 diff --git a/training/object-detection/dvc.yaml b/training/object-detection/dvc.yaml new file mode 100644 index 0000000..32edd1d --- /dev/null +++ b/training/object-detection/dvc.yaml @@ -0,0 
+1,12 @@ +stages: + update_raw: + cmd: >- + rclone sync + -P + --filter "- /salm_dataset*/**" + --filter "+ *.json" + --filter "- *.zip" + aws:salmonvision-ml-datasets/rgb/model_input/ + 01_raw + outs: + - 01_raw From c98908e072e5fb609cfff48c4fbc7af985c2bd4d Mon Sep 17 00:00:00 2001 From: KamiCreed <9517086+KamiCreed@users.noreply.github.com> Date: Thu, 26 Feb 2026 08:29:56 -0800 Subject: [PATCH 06/35] Organize data more --- training/object-detection/.dvc/config | 4 ++++ training/object-detection/data/01_raw/.gitignore | 1 + training/object-detection/dvc.lock | 5 +++-- training/object-detection/dvc.yaml | 4 ++-- 4 files changed, 10 insertions(+), 4 deletions(-) create mode 100644 training/object-detection/data/01_raw/.gitignore diff --git a/training/object-detection/.dvc/config b/training/object-detection/.dvc/config index e69de29..8b006d6 100644 --- a/training/object-detection/.dvc/config +++ b/training/object-detection/.dvc/config @@ -0,0 +1,4 @@ +[core] + remote = storage +['remote "storage"'] + url = s3://salmonvision-dvc/rgb_object_detection diff --git a/training/object-detection/data/01_raw/.gitignore b/training/object-detection/data/01_raw/.gitignore new file mode 100644 index 0000000..9cd5243 --- /dev/null +++ b/training/object-detection/data/01_raw/.gitignore @@ -0,0 +1 @@ +/labelstudio_annos diff --git a/training/object-detection/dvc.lock b/training/object-detection/dvc.lock index 38b1ae5..dae0118 100644 --- a/training/object-detection/dvc.lock +++ b/training/object-detection/dvc.lock @@ -2,9 +2,10 @@ schema: '2.0' stages: update_raw: cmd: rclone sync -P --filter "- /salm_dataset*/**" --filter "+ *.json" - --filter "- *.zip" aws:salmonvision-ml-datasets/rgb/model_input/ 01_raw + --filter "- *.zip" aws:salmonvision-ml-datasets/rgb/model_input/ + data/01_raw/labelstudio_annos outs: - - path: 01_raw + - path: data/01_raw/labelstudio_annos hash: md5 md5: 3c9068cdaacbf2d8934c76a2e98c945d.dir size: 372865559 diff --git a/training/object-detection/dvc.yaml 
b/training/object-detection/dvc.yaml index 32edd1d..11ca0ba 100644 --- a/training/object-detection/dvc.yaml +++ b/training/object-detection/dvc.yaml @@ -7,6 +7,6 @@ stages: --filter "+ *.json" --filter "- *.zip" aws:salmonvision-ml-datasets/rgb/model_input/ - 01_raw + data/01_raw/labelstudio_annos outs: - - 01_raw + - data/01_raw/labelstudio_annos From 078bda19219fa220200eee7d1792b8f25e2bc74e Mon Sep 17 00:00:00 2001 From: KamiCreed <9517086+KamiCreed@users.noreply.github.com> Date: Thu, 26 Feb 2026 08:58:32 -0800 Subject: [PATCH 07/35] Add conditions and counts data --- training/object-detection/data/01_raw/.gitignore | 3 +++ .../data/01_raw/salmon_vid_counts.csv.dvc | 5 +++++ .../01_raw/salmon_vid_counts_summary.csv.dvc | 5 +++++ .../data/01_raw/sv_water_conditions.dvc | 6 ++++++ training/object-detection/dvc.lock | 16 ++++++++++++++++ training/object-detection/dvc.yaml | 7 +++++++ 6 files changed, 42 insertions(+) create mode 100644 training/object-detection/data/01_raw/salmon_vid_counts.csv.dvc create mode 100644 training/object-detection/data/01_raw/salmon_vid_counts_summary.csv.dvc create mode 100644 training/object-detection/data/01_raw/sv_water_conditions.dvc diff --git a/training/object-detection/data/01_raw/.gitignore b/training/object-detection/data/01_raw/.gitignore index 9cd5243..767a4b6 100644 --- a/training/object-detection/data/01_raw/.gitignore +++ b/training/object-detection/data/01_raw/.gitignore @@ -1 +1,4 @@ /labelstudio_annos +/salmon_vid_counts.csv +/salmon_vid_counts_summary.csv +/sv_water_conditions diff --git a/training/object-detection/data/01_raw/salmon_vid_counts.csv.dvc b/training/object-detection/data/01_raw/salmon_vid_counts.csv.dvc new file mode 100644 index 0000000..1023678 --- /dev/null +++ b/training/object-detection/data/01_raw/salmon_vid_counts.csv.dvc @@ -0,0 +1,5 @@ +outs: +- md5: b6136eee30a358d7473a160197bc91be + size: 6465051 + hash: md5 + path: salmon_vid_counts.csv diff --git 
a/training/object-detection/data/01_raw/salmon_vid_counts_summary.csv.dvc b/training/object-detection/data/01_raw/salmon_vid_counts_summary.csv.dvc new file mode 100644 index 0000000..4f4cbb3 --- /dev/null +++ b/training/object-detection/data/01_raw/salmon_vid_counts_summary.csv.dvc @@ -0,0 +1,5 @@ +outs: +- md5: c65c2ba8214d26905ccb9608e9071b93 + size: 1031 + hash: md5 + path: salmon_vid_counts_summary.csv diff --git a/training/object-detection/data/01_raw/sv_water_conditions.dvc b/training/object-detection/data/01_raw/sv_water_conditions.dvc new file mode 100644 index 0000000..fc1ba77 --- /dev/null +++ b/training/object-detection/data/01_raw/sv_water_conditions.dvc @@ -0,0 +1,6 @@ +outs: +- md5: 44403010d10d741a8e00e52c93be6c39.dir + size: 195622 + nfiles: 10 + hash: md5 + path: sv_water_conditions diff --git a/training/object-detection/dvc.lock b/training/object-detection/dvc.lock index dae0118..f401bfa 100644 --- a/training/object-detection/dvc.lock +++ b/training/object-detection/dvc.lock @@ -10,3 +10,19 @@ stages: md5: 3c9068cdaacbf2d8934c76a2e98c945d.dir size: 372865559 nfiles: 50 + split_data: + cmd: sleep 1 + deps: + - path: data/01_raw/salmon_vid_counts.csv + hash: md5 + md5: b6136eee30a358d7473a160197bc91be + size: 6465051 + - path: data/01_raw/salmon_vid_counts_summary.csv + hash: md5 + md5: c65c2ba8214d26905ccb9608e9071b93 + size: 1031 + - path: data/01_raw/sv_water_conditions + hash: md5 + md5: 44403010d10d741a8e00e52c93be6c39.dir + size: 195622 + nfiles: 10 diff --git a/training/object-detection/dvc.yaml b/training/object-detection/dvc.yaml index 11ca0ba..a56bbaf 100644 --- a/training/object-detection/dvc.yaml +++ b/training/object-detection/dvc.yaml @@ -10,3 +10,10 @@ stages: data/01_raw/labelstudio_annos outs: - data/01_raw/labelstudio_annos + split_data: + cmd: >- + sleep 1 + deps: + - data/01_raw/salmon_vid_counts.csv + - data/01_raw/salmon_vid_counts_summary.csv + - data/01_raw/sv_water_conditions From 8db310bc469e023dc6ed97077cba5b31baf9084e 
Mon Sep 17 00:00:00 2001 From: KamiCreed <9517086+KamiCreed@users.noreply.github.com> Date: Thu, 26 Feb 2026 12:42:28 -0800 Subject: [PATCH 08/35] Start adding yolo converter --- training/object-detection/dvc.yaml | 6 + .../scripts/yolo_converter_ls_video.py | 459 ++++++++++++++++++ 2 files changed, 465 insertions(+) create mode 100644 training/object-detection/scripts/yolo_converter_ls_video.py diff --git a/training/object-detection/dvc.yaml b/training/object-detection/dvc.yaml index a56bbaf..069744a 100644 --- a/training/object-detection/dvc.yaml +++ b/training/object-detection/dvc.yaml @@ -10,6 +10,12 @@ stages: data/01_raw/labelstudio_annos outs: - data/01_raw/labelstudio_annos + build_model_input: + cmd: >- + uv run python scripts/yolo_converter_ls_video.py + deps: + - scripts/yolo_converter_ls_video.py + - data/01_raw/labelstudio_annos split_data: cmd: >- sleep 1 diff --git a/training/object-detection/scripts/yolo_converter_ls_video.py b/training/object-detection/scripts/yolo_converter_ls_video.py new file mode 100644 index 0000000..641a0e6 --- /dev/null +++ b/training/object-detection/scripts/yolo_converter_ls_video.py @@ -0,0 +1,459 @@ +#!/usr/bin/env python3 + +import yaml # requires PyYAML: pip install pyyaml +import json +import traceback +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, Iterable, List, Optional, Tuple, Any +from collections import defaultdict +from datetime import datetime + +def load_class_map_from_yolo_yaml(yaml_path: Path) -> Dict[str, int]: + """ + Load a YOLO-style data.yaml and return a mapping: class_name -> class_id + + Expects something like: + + names: + 0: Coho + 1: Bull + 2: Rainbow + ... + + or: + + names: [Coho, Bull, Rainbow, ...] 
+ """ + data: Any = yaml.safe_load(Path(yaml_path).read_text()) + names = data.get("names") + if names is None: + raise ValueError(f"'names' not found in {yaml_path}") + + class_map: Dict[str, int] = {} + + if isinstance(names, dict): + # {0: 'Coho', 1: 'Bull', ...} (keys can be int or str) + for k, v in names.items(): + try: + idx = int(k) + except Exception: + raise ValueError(f"Invalid class index {k!r} in names of {yaml_path}") + label = str(v) + class_map[label] = idx + elif isinstance(names, (list, tuple)): + # ['Coho', 'Bull', 'Rainbow', ...] + for idx, label in enumerate(names): + class_map[str(label)] = idx + else: + raise ValueError(f"Unsupported 'names' structure in {yaml_path}: {type(names)}") + + return class_map + +def _interpolate_sequence(seq: Iterable[dict]) -> Dict[int, List[Tuple[float, float, float, float]]]: + """ + Given a Label Studio 'sequence' (list of keyframes) like: + + { + "frame": 47, "x": 0, "y": 62.8, "width": 15.9, "height": 15.1, "enabled": true + }, + ... + + Produce: frame_index -> list of (x, y, w, h) in the SAME units as input. + + Semantics: + - Every keyframe (enabled or not) produces a box at its own frame. + - If a keyframe has enabled=True, we linearly interpolate boxes for the frames + *between it and the next keyframe* (f0+1 .. f1-1). + - If a keyframe has enabled=False, we do NOT interpolate forward from it, + but we still keep its own box at that frame. + - A disabled keyframe can still be the *end* of an interpolation that started + from a previous enabled keyframe (since that interpolation uses the previous + keyframe's enabled flag). 
+ """ + # Sort keyframes by frame + kfs = sorted(seq, key=lambda k: int(_safe_float(k.get("frame"), 0))) + frames_boxes: Dict[int, List[Tuple[float, float, float, float]]] = {} + + if not kfs: + return frames_boxes + + # 1) Add all keyframes as boxes at their exact frames + for k in kfs: + f = int(_safe_float(k.get("frame"), -1)) + if f < 0: + continue + + x = _safe_float(k.get("x")) + y = _safe_float(k.get("y")) + w = _safe_float(k.get("width")) + h = _safe_float(k.get("height")) + frames_boxes.setdefault(f, []).append((x, y, w, h)) + + # 2) Interpolate between consecutive keyframes when the *start* keyframe is enabled + for i in range(len(kfs) - 1): + k0 = kfs[i] + k1 = kfs[i + 1] + + f0 = int(_safe_float(k0.get("frame"), -1)) + f1 = int(_safe_float(k1.get("frame"), -1)) + if f0 < 0 or f1 <= f0: + continue + + enabled0 = bool(k0.get("enabled", True)) + if not enabled0: + # Do not interpolate forward from a disabled keyframe + continue + + x0 = _safe_float(k0.get("x")) + y0 = _safe_float(k0.get("y")) + w0 = _safe_float(k0.get("width")) + h0 = _safe_float(k0.get("height")) + + x1 = _safe_float(k1.get("x")) + y1 = _safe_float(k1.get("y")) + w1 = _safe_float(k1.get("width")) + h1 = _safe_float(k1.get("height")) + + # Fill in strictly between endpoints; endpoints themselves are already added + for f in range(f0 + 1, f1): + t = (f - f0) / float(f1 - f0) + x = x0 + (x1 - x0) * t + y = y0 + (y1 - y0) * t + w = w0 + (w1 - w0) * t + h = h0 + (h1 - h0) * t + frames_boxes.setdefault(f, []).append((x, y, w, h)) + + return frames_boxes + +@dataclass +class ConvertStats: + videos_with_boxes: int = 0 + videos_without_boxes: int = 0 + label_files_written: int = 0 + errors: int = 0 + +def _safe_float(v: Any, default: float = 0.0) -> float: + try: + return float(v) + except Exception: + return default + +def _coord_mode(x: float, y: float, w: float, h: float) -> str: + """ + Infer coordinate mode for Label Studio: + - 'percent' : typical LS UI export (0..100) + - 'normalized': 
already 0..1 + - 'pixel' : values > 100 (needs video width/height) + """ + mx = max(x, y, w, h) + if mx <= 1.0000001: # already normalized + return "normalized" + if mx <= 100.0000001: # percent + return "percent" + return "pixel" + + +def _to_yolo( + x: float, + y: float, + w: float, + h: float, + vid_w: int, + vid_h: int, + forced_mode: Optional[str] = None, +) -> Tuple[float, float, float, float]: + """ + Convert LS-style box to YOLO (xc, yc, w, h) in [0,1]. + + :param forced_mode: One of {"percent", "normalized", "pixel", None/"auto"}. + If None or "auto", infer from values. + """ + mode = forced_mode or "auto" + if mode == "auto": + mode = _coord_mode(x, y, w, h) + + if mode == "normalized": + xc = x + w / 2.0 + yc = y + h / 2.0 + wn = w + hn = h + elif mode == "percent": + xc = (x + w / 2.0) / 100.0 + yc = (y + h / 2.0) / 100.0 + wn = w / 100.0 + hn = h / 100.0 + elif mode == "pixel": + xc = (x + w / 2.0) / float(vid_w) if vid_w else 0.0 + yc = (y + h / 2.0) / float(vid_h) if vid_h else 0.0 + wn = w / float(vid_w) if vid_w else 0.0 + hn = h / float(vid_h) if vid_h else 0.0 + else: + raise ValueError(f"Unknown coord_mode: {forced_mode!r}") + + # clamp to [0,1] + xc = min(max(xc, 0.0), 1.0) + yc = min(max(yc, 0.0), 1.0) + wn = min(max(wn, 0.0), 1.0) + hn = min(max(hn, 0.0), 1.0) + return xc, yc, wn, hn + + +class YoloConverterLSVideo: + """ + Convert the provided Label Studio 'video' export (annotations + data) to YOLO frame txts. + + Input structure: + [ + { + "data": { + "metadata_video_width": 1280, + "metadata_video_height": 720, + "video": "s3://.../GOLD-kitkiata-jetson-1_20240720_002007_M.mp4", + "metadata_file_filename": "GOLD-kitkiata-jetson-1_20240720_002007_M.mp4", + ... + }, + "annotations": [ + { + "result": [ + { + "type": "videorectangle", + "from_name": "box", + "to_name": "video", + "value": { + "labels": ["Rainbow"], + "sequence": [ + {"frame": 47, "x": 0, "y": 62.83, "width": 15.96, "height": 15.16, "enabled": true, ...}, + ... 
+ ] + } + }, + ... + ] + } + ] + }, + { + ... + }, + ... + ] + + Writes: + //frame_000047.txt # one line per box in that frame + """ + + def __init__( + self, + class_map: Dict[str, int], + output_dir: Path, + empty_list_path: Optional[Path] = None, + overwrite_video_dir: bool = False, + result_type: str = "videorectangle", + from_name: Optional[str] = None, # e.g., "box"; if None accept any + to_name: Optional[str] = None, # e.g., "video"; if None accept any + coord_mode: str = "auto", # "auto", "percent", "normalized", "pixel" + error_log_path: Optional[Path] = None, + ): + """ + :param coord_mode: + "auto" -> infer (default) + "percent" -> x/y/width/height are 0..100 + "normalized" -> x/y/width/height are 0..1 + "pixel" -> x/y/width/height are in pixels + :param error_log_path: where to append error tracebacks. + If None, defaults to /ls_to_yolo_errors.log + """ + self.class_map = class_map + self.output_dir = Path(output_dir) + self.output_dir.mkdir(parents=True, exist_ok=True) + self.empty_list_path = Path(empty_list_path) if empty_list_path else None + self.overwrite_video_dir = overwrite_video_dir + self.result_type = result_type + self.from_name = from_name + self.to_name = to_name + self.coord_mode = coord_mode + self.error_log_path = ( + Path(error_log_path) if error_log_path else (self.output_dir / "ls_to_yolo_errors.log") + ) + + # ---- public API ---- + + def convert_folder(self, json_dir: Path, pattern: str = "*.json") -> ConvertStats: + stats = ConvertStats() + for p in sorted(Path(json_dir).glob(pattern)): + try: + s = self.convert_file(p) + stats.videos_with_boxes += s.videos_with_boxes + stats.videos_without_boxes += s.videos_without_boxes + stats.label_files_written += s.label_files_written + except Exception as e: + stats.errors += 1 + self._log_error(f"convert_file({p})", e) + return stats + + def convert_file(self, json_path: Path) -> ConvertStats: + stats = ConvertStats() + json_path = Path(json_path) + + try: + items = 
json.loads(json_path.read_text()) + except Exception as e: + stats.errors += 1 + self._log_error(f"read_json({json_path})", e) + return stats + + if not isinstance(items, list): + err = ValueError(f"{json_path} must contain a top-level list") + stats.errors += 1 + self._log_error(f"validate_json({json_path})", err) + return stats + + for item in items: + try: + s = self._convert_item(item) + stats.videos_with_boxes += s.videos_with_boxes + stats.videos_without_boxes += s.videos_without_boxes + stats.label_files_written += s.label_files_written + except Exception as e: + stats.errors += 1 + item_id = item.get("id", "unknown") + self._log_error(f"_convert_item(id={item_id}, src={json_path})", e) + return stats + + # ---- internals ---- + + def _log_error(self, context: str, exc: Exception): + try: + self.error_log_path.parent.mkdir(parents=True, exist_ok=True) + with self.error_log_path.open("a") as f: + f.write(f"\n=== ERROR in {context} ===\n") + traceback.print_exception(type(exc), exc, exc.__traceback__, file=f) + except Exception: + # last-resort: don't crash because logging failed + pass + + @staticmethod + def _parse_ts(s): + return datetime.fromisoformat(s.replace("Z", "+00:00")) + + def _convert_item(self, item: dict) -> ConvertStats: + stats = ConvertStats() + + data = item.get("data") or {} + video_uri = data.get("metadata_file_filename") or data.get("video") or "unknown.mp4" + video_stem = Path(video_uri).stem + vid_w = int(_safe_float(data.get("metadata_video_width"), 0)) + vid_h = int(_safe_float(data.get("metadata_video_height"), 0)) + + annos = item.get("annotations") or [] + results = [] + if len(annos) > 0: + latest_ann = max( + annos, + key=lambda a: YoloConverterLSVideo._parse_ts(a["updated_at"]) + ) + + for r in (latest_ann.get("result") or []): + if r.get("type") != self.result_type: + continue + if self.from_name is not None and r.get("from_name") != self.from_name: + continue + if self.to_name is not None and r.get("to_name") != self.to_name: 
+ continue + results.append(r) + + vid_dir = self.output_dir / video_stem + if not results: + stats.videos_without_boxes += 1 + if self.empty_list_path: + self.empty_list_path.parent.mkdir(parents=True, exist_ok=True) + with self.empty_list_path.open("a") as f: + f.write(f"{video_uri}\n") + return stats + + if vid_dir.exists() and not self.overwrite_video_dir: + # skip existing video dir to avoid mixing runs + return stats + vid_dir.mkdir(parents=True, exist_ok=True) + + wrote_any = False + + for r in results: + value = r.get("value") or {} + labels: List[str] = value.get("labels") or [] + if not labels: + continue + cls_name = labels[0] + if cls_name not in self.class_map: + # unknown class; skip this track + continue + cls_id = self.class_map[cls_name] + + seq: Iterable[dict] = value.get("sequence") or [] + frame_boxes = _interpolate_sequence(seq) # frame -> [(x,y,w,h), ...] + + for frame_idx, boxes in frame_boxes.items(): + label_path = vid_dir / f"frame_{frame_idx:06d}.txt" + with label_path.open("a") as f: + for (x, y, w, h) in boxes: + xc, yc, wn, hn = _to_yolo( + x, y, w, h, + vid_w=vid_w, + vid_h=vid_h, + forced_mode=self.coord_mode, + ) + line = f"{cls_id} {xc:.6f} {yc:.6f} {wn:.6f} {hn:.6f}" + f.write(line + "\n") + stats.label_files_written += 1 + wrote_any = True + + if wrote_any: + stats.videos_with_boxes += 1 + else: + stats.videos_without_boxes += 1 + if self.empty_list_path: + with self.empty_list_path.open("a") as f: + f.write(f"{video_uri}\n") + + return stats + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Convert Label Studio video JSON to YOLO frame labels") + parser.add_argument("input", help="JSON file or directory containing Label Studio JSON") + parser.add_argument("--data-yaml", required=True, help="Path to YOLO data.yaml (with 'names:' mapping)") + parser.add_argument("--out", required=True, help="Output directory") + parser.add_argument("--empty-list", default=None, help="Path to write 
videos with no boxes") + parser.add_argument("--pattern", default="*.json", help="Glob when input is a directory") + parser.add_argument("--overwrite", action="store_true", help="Overwrite existing per-video folders") + parser.add_argument("--from-name", default=None, help="Filter by result.from_name (e.g., 'box')") + parser.add_argument("--to-name", default=None, help="Filter by result.to_name (e.g., 'video')") + parser.add_argument("--coord-mode", default="percent", help='Set the coordinates mode: "auto", "percent", "normalized", "pixel"') + args = parser.parse_args() + + data_yaml_path = Path(args.data_yaml) + class_map = load_class_map_from_yolo_yaml(data_yaml_path) + + conv = YoloConverterLSVideo( + class_map=class_map, + output_dir=Path(args.out), + empty_list_path=Path(args.empty_list) if args.empty_list else None, + overwrite_video_dir=args.overwrite, + from_name=args.from_name, + to_name=args.to_name, + coord_mode=args.coord_mode, + ) + + inp = Path(args.input) + if inp.is_dir(): + s = conv.convert_folder(inp, pattern=args.pattern) + else: + s = conv.convert_file(inp) + + print( + f"Done. 
with_boxes={s.videos_with_boxes} without_boxes={s.videos_without_boxes} " + f"labels_written={s.label_files_written} errors={s.errors}" + ) From 4dd1d37544ec8fc894bc70b0ba1b23295801083b Mon Sep 17 00:00:00 2001 From: KamiCreed <9517086+KamiCreed@users.noreply.github.com> Date: Thu, 26 Feb 2026 13:43:20 -0800 Subject: [PATCH 09/35] Track raw data changes before downloading --- training/object-detection/dvc.lock | 12 +++++++++--- training/object-detection/dvc.yaml | 11 ++++++++++- .../scripts/yolo_converter_ls_video.py | 10 ++++++++++ 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/training/object-detection/dvc.lock b/training/object-detection/dvc.lock index f401bfa..9288feb 100644 --- a/training/object-detection/dvc.lock +++ b/training/object-detection/dvc.lock @@ -2,13 +2,19 @@ schema: '2.0' stages: update_raw: cmd: rclone sync -P --filter "- /salm_dataset*/**" --filter "+ *.json" - --filter "- *.zip" aws:salmonvision-ml-datasets/rgb/model_input/ + --filter "- *.zip" aws:salmonvision-ml-datasets/rgb/raw/ data/01_raw/labelstudio_annos + deps: + - path: s3://salmonvision-ml-datasets/rgb/raw + hash: md5 + md5: 9aab20758f57f39971a38c9a676dad27.dir + size: 720238505 + nfiles: 65 outs: - path: data/01_raw/labelstudio_annos hash: md5 - md5: 3c9068cdaacbf2d8934c76a2e98c945d.dir - size: 372865559 + md5: 188b71c31be326fe36d209cc52c23337.dir + size: 379352848 nfiles: 50 split_data: cmd: sleep 1 diff --git a/training/object-detection/dvc.yaml b/training/object-detection/dvc.yaml index 069744a..f44f02b 100644 --- a/training/object-detection/dvc.yaml +++ b/training/object-detection/dvc.yaml @@ -6,16 +6,25 @@ stages: --filter "- /salm_dataset*/**" --filter "+ *.json" --filter "- *.zip" - aws:salmonvision-ml-datasets/rgb/model_input/ + aws:salmonvision-ml-datasets/rgb/raw/ data/01_raw/labelstudio_annos + deps: + - s3://salmonvision-ml-datasets/rgb/raw outs: - data/01_raw/labelstudio_annos build_model_input: cmd: >- uv run python scripts/yolo_converter_ls_video.py + 
../salmon-computer-vision/training/annotation_webapp_alt/ + --data-yaml batch_upload/salmon_yolo.yaml + --out data/02_interim/yolo_annos + --pattern '**/*.json' + --include-sites tankeeah kitwanga bear deps: - scripts/yolo_converter_ls_video.py - data/01_raw/labelstudio_annos + outs: + - data/02_interim/yolo_annos split_data: cmd: >- sleep 1 diff --git a/training/object-detection/scripts/yolo_converter_ls_video.py b/training/object-detection/scripts/yolo_converter_ls_video.py index 641a0e6..2a4f321 100644 --- a/training/object-detection/scripts/yolo_converter_ls_video.py +++ b/training/object-detection/scripts/yolo_converter_ls_video.py @@ -254,6 +254,7 @@ def __init__( to_name: Optional[str] = None, # e.g., "video"; if None accept any coord_mode: str = "auto", # "auto", "percent", "normalized", "pixel" error_log_path: Optional[Path] = None, + include_sites: Optional[list[str]] = None, ): """ :param coord_mode: @@ -276,6 +277,7 @@ def __init__( self.error_log_path = ( Path(error_log_path) if error_log_path else (self.output_dir / "ls_to_yolo_errors.log") ) + self.include_sites = include_sites # ---- public API ---- @@ -341,6 +343,12 @@ def _convert_item(self, item: dict) -> ConvertStats: stats = ConvertStats() data = item.get("data") or {} + site = data.get("metadata_file_site_reference_string") or "" + if self.include_sites is not None: + if site not in self.include_sites: + # Not in included sites + return stats + video_uri = data.get("metadata_file_filename") or data.get("video") or "unknown.mp4" video_stem = Path(video_uri).stem vid_w = int(_safe_float(data.get("metadata_video_width"), 0)) @@ -432,6 +440,7 @@ def _convert_item(self, item: dict) -> ConvertStats: parser.add_argument("--from-name", default=None, help="Filter by result.from_name (e.g., 'box')") parser.add_argument("--to-name", default=None, help="Filter by result.to_name (e.g., 'video')") parser.add_argument("--coord-mode", default="percent", help='Set the coordinates mode: "auto", "percent", 
"normalized", "pixel"') + parser.add_argument("--include-sites", nargs="*", default=None, help='Only include videos of these sites') args = parser.parse_args() data_yaml_path = Path(args.data_yaml) @@ -445,6 +454,7 @@ def _convert_item(self, item: dict) -> ConvertStats: from_name=args.from_name, to_name=args.to_name, coord_mode=args.coord_mode, + include_sites=args.include_sites, ) inp = Path(args.input) From eef10ff717ecfc70f9399feae5e1975b2ecca068 Mon Sep 17 00:00:00 2001 From: KamiCreed <9517086+KamiCreed@users.noreply.github.com> Date: Mon, 2 Mar 2026 10:43:50 -0800 Subject: [PATCH 10/35] Add yolo conversion to pipeline --- training/object-detection/README.md | 6 ++++ .../object-detection/config/salmon_yolo.yaml | 28 +++++++++++++++++++ .../data/02_interim/.gitignore | 1 + training/object-detection/dvc.lock | 24 ++++++++++++++++ training/object-detection/dvc.yaml | 7 +++-- training/object-detection/pyproject.toml | 2 +- .../scripts/yolo_converter_ls_video.py | 22 +++++++++++---- 7 files changed, 80 insertions(+), 10 deletions(-) create mode 100644 training/object-detection/config/salmon_yolo.yaml create mode 100644 training/object-detection/data/02_interim/.gitignore mode change 100644 => 100755 training/object-detection/scripts/yolo_converter_ls_video.py diff --git a/training/object-detection/README.md b/training/object-detection/README.md index e69de29..993a2af 100644 --- a/training/object-detection/README.md +++ b/training/object-detection/README.md @@ -0,0 +1,6 @@ +# Object Detection + +Install uv +``` +curl -LsSf https://astral.sh/uv/install.sh | sh +``` diff --git a/training/object-detection/config/salmon_yolo.yaml b/training/object-detection/config/salmon_yolo.yaml new file mode 100644 index 0000000..d1601dc --- /dev/null +++ b/training/object-detection/config/salmon_yolo.yaml @@ -0,0 +1,28 @@ +# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..] 
+# Classes updated on 2026-02-12 +path: /training/export_combined_bear_kitwanga_yolo # dataset root dir +train: train.txt +val: val.txt +test: test.txt + +# Classes +names: + 0: Coho + 1: Bull + 2: Rainbow + 3: Sockeye + 4: Pink + 5: Whitefish + 6: Chinook + 7: Shiner + 8: Pikeminnow + 9: Chum + 10: Steelhead + 11: Lamprey + 12: Cutthroat + 13: Stickleback + 14: Sculpin + 15: Jack_Coho + 16: Jack_Chinook + 17: Otter + 18: Sucker diff --git a/training/object-detection/data/02_interim/.gitignore b/training/object-detection/data/02_interim/.gitignore new file mode 100644 index 0000000..2019ca8 --- /dev/null +++ b/training/object-detection/data/02_interim/.gitignore @@ -0,0 +1 @@ +/yolo_annos diff --git a/training/object-detection/dvc.lock b/training/object-detection/dvc.lock index 9288feb..306d76f 100644 --- a/training/object-detection/dvc.lock +++ b/training/object-detection/dvc.lock @@ -32,3 +32,27 @@ stages: md5: 44403010d10d741a8e00e52c93be6c39.dir size: 195622 nfiles: 10 + build_model_input: + cmd: scripts/yolo_converter_ls_video.py data/01_raw/labelstudio_annos + --data-yaml config/salmon_yolo.yaml --out data/02_interim/yolo_annos + --pattern '**/*.json' --include-sites tankeeah kitwanga bear + deps: + - path: config/salmon_yolo.yaml + hash: md5 + md5: f453f5dc54f1743eaedcc3ab117d269e + size: 547 + - path: data/01_raw/labelstudio_annos + hash: md5 + md5: 188b71c31be326fe36d209cc52c23337.dir + size: 379352848 + nfiles: 50 + - path: scripts/yolo_converter_ls_video.py + hash: md5 + md5: cca7398b0bf1053184009452ae7cdaaf + size: 16599 + outs: + - path: data/02_interim/yolo_annos + hash: md5 + md5: 58944a7c76dff71dbb220f78e59ed9d9.dir + size: 40115388 + nfiles: 782981 diff --git a/training/object-detection/dvc.yaml b/training/object-detection/dvc.yaml index f44f02b..68aecc9 100644 --- a/training/object-detection/dvc.yaml +++ b/training/object-detection/dvc.yaml @@ -14,15 +14,16 @@ stages: - data/01_raw/labelstudio_annos build_model_input: cmd: >- - uv run python 
scripts/yolo_converter_ls_video.py - ../salmon-computer-vision/training/annotation_webapp_alt/ - --data-yaml batch_upload/salmon_yolo.yaml + scripts/yolo_converter_ls_video.py + data/01_raw/labelstudio_annos + --data-yaml config/salmon_yolo.yaml --out data/02_interim/yolo_annos --pattern '**/*.json' --include-sites tankeeah kitwanga bear deps: - scripts/yolo_converter_ls_video.py - data/01_raw/labelstudio_annos + - config/salmon_yolo.yaml outs: - data/02_interim/yolo_annos split_data: diff --git a/training/object-detection/pyproject.toml b/training/object-detection/pyproject.toml index 94a0e56..2e3f06f 100644 --- a/training/object-detection/pyproject.toml +++ b/training/object-detection/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "object-detection" version = "0.1.0" -description = "Add your description here" +description = "SalmonVision object detector training" readme = "README.md" requires-python = ">=3.8" dependencies = [] diff --git a/training/object-detection/scripts/yolo_converter_ls_video.py b/training/object-detection/scripts/yolo_converter_ls_video.py old mode 100644 new mode 100755 index 2a4f321..7e22435 --- a/training/object-detection/scripts/yolo_converter_ls_video.py +++ b/training/object-detection/scripts/yolo_converter_ls_video.py @@ -1,6 +1,14 @@ -#!/usr/bin/env python3 - -import yaml # requires PyYAML: pip install pyyaml +#!/usr/bin/env -S uv run --script +# /// script +# requires-python = ">=3.8" +# dependencies = [ +# "pyyaml>=6.0.3", +# ] +# [tool.uv] +# exclude-newer = "2026-03-02T18:41:13Z" +# /// + +import yaml import json import traceback from dataclasses import dataclass @@ -254,7 +262,7 @@ def __init__( to_name: Optional[str] = None, # e.g., "video"; if None accept any coord_mode: str = "auto", # "auto", "percent", "normalized", "pixel" error_log_path: Optional[Path] = None, - include_sites: Optional[list[str]] = None, + include_sites: List[str] = [], ): """ :param coord_mode: @@ -344,7 +352,7 @@ def _convert_item(self, item: dict) -> 
ConvertStats: data = item.get("data") or {} site = data.get("metadata_file_site_reference_string") or "" - if self.include_sites is not None: + if len(self.include_sites) > 0: if site not in self.include_sites: # Not in included sites return stats @@ -440,7 +448,7 @@ def _convert_item(self, item: dict) -> ConvertStats: parser.add_argument("--from-name", default=None, help="Filter by result.from_name (e.g., 'box')") parser.add_argument("--to-name", default=None, help="Filter by result.to_name (e.g., 'video')") parser.add_argument("--coord-mode", default="percent", help='Set the coordinates mode: "auto", "percent", "normalized", "pixel"') - parser.add_argument("--include-sites", nargs="*", default=None, help='Only include videos of these sites') + parser.add_argument("--include-sites", nargs="*", default=[], help='Only include videos of these sites') args = parser.parse_args() data_yaml_path = Path(args.data_yaml) @@ -459,6 +467,8 @@ def _convert_item(self, item: dict) -> ConvertStats: inp = Path(args.input) if inp.is_dir(): + print(f"Converting labels in folder {inp}") + s = conv.convert_folder(inp, pattern=args.pattern) else: s = conv.convert_file(inp) From 22e1504cb0971521f3eba59b2e38198b0b4fb7ae Mon Sep 17 00:00:00 2001 From: KamiCreed <9517086+KamiCreed@users.noreply.github.com> Date: Mon, 2 Mar 2026 12:36:00 -0800 Subject: [PATCH 11/35] Shard dataset for better uploading --- training/object-detection/README.md | 14 ++- training/object-detection/dvc.lock | 13 +- training/object-detection/dvc.yaml | 2 + .../scripts/yolo_converter_ls_video.py | 113 +++++++++++++----- 4 files changed, 107 insertions(+), 35 deletions(-) diff --git a/training/object-detection/README.md b/training/object-detection/README.md index 993a2af..df2bacf 100644 --- a/training/object-detection/README.md +++ b/training/object-detection/README.md @@ -1,6 +1,18 @@ # Object Detection Install uv -``` +```bash curl -LsSf https://astral.sh/uv/install.sh | sh ``` + +Check dvc.yaml for the full 
pipeline. + +Run the following to run specific stages of the pipeline: +```bash +dvc repro stage_name +``` + +For example, building the model input annotations: +```bash +dvc repro build_model_input +``` diff --git a/training/object-detection/dvc.lock b/training/object-detection/dvc.lock index 306d76f..80c86f7 100644 --- a/training/object-detection/dvc.lock +++ b/training/object-detection/dvc.lock @@ -35,7 +35,8 @@ stages: build_model_input: cmd: scripts/yolo_converter_ls_video.py data/01_raw/labelstudio_annos --data-yaml config/salmon_yolo.yaml --out data/02_interim/yolo_annos - --pattern '**/*.json' --include-sites tankeeah kitwanga bear + --out-shards data/02_interim/yolo_annos --shard-size 100000 --pattern + '**/*.json' --include-sites tankeeah kitwanga bear deps: - path: config/salmon_yolo.yaml hash: md5 @@ -48,11 +49,11 @@ stages: nfiles: 50 - path: scripts/yolo_converter_ls_video.py hash: md5 - md5: cca7398b0bf1053184009452ae7cdaaf - size: 16599 + md5: 36a7224d1eea0b6e300d421a7907c411 + size: 18877 outs: - path: data/02_interim/yolo_annos hash: md5 - md5: 58944a7c76dff71dbb220f78e59ed9d9.dir - size: 40115388 - nfiles: 782981 + md5: 978efd3dca7ac11abcb153db010c39ae.dir + size: 880998400 + nfiles: 9 diff --git a/training/object-detection/dvc.yaml b/training/object-detection/dvc.yaml index 68aecc9..abf5983 100644 --- a/training/object-detection/dvc.yaml +++ b/training/object-detection/dvc.yaml @@ -18,6 +18,8 @@ stages: data/01_raw/labelstudio_annos --data-yaml config/salmon_yolo.yaml --out data/02_interim/yolo_annos + --out-shards data/02_interim/yolo_annos + --shard-size 100000 --pattern '**/*.json' --include-sites tankeeah kitwanga bear deps: diff --git a/training/object-detection/scripts/yolo_converter_ls_video.py b/training/object-detection/scripts/yolo_converter_ls_video.py index 7e22435..4f021ca 100755 --- a/training/object-detection/scripts/yolo_converter_ls_video.py +++ b/training/object-detection/scripts/yolo_converter_ls_video.py @@ -16,6 +16,47 @@ 
class TarShardWriter:
    """Write many small text files into a sequence of uncompressed TAR shards.

    Instead of materializing hundreds of thousands of tiny label files on
    disk (slow for DVC to hash and upload), entries are appended to
    ``<prefix>-NNNNNN.tar`` archives under *out_dir*, rotating to a new
    shard once *shard_size* entries have been written.
    """

    def __init__(self, out_dir: Path, shard_size: int = 10000, prefix: str = "yolo_annos"):
        """
        :param out_dir: Directory that receives the ``.tar`` shards
            (created if missing).
        :param shard_size: Maximum number of entries per shard. Clamped to
            at least 1; a value of 0 or less would otherwise trigger a
            rotation before every single write, producing one shard per
            entry plus a leaked empty leading shard.
        :param prefix: File-name prefix of each shard.
        """
        self.out_dir = Path(out_dir)
        self.out_dir.mkdir(parents=True, exist_ok=True)
        self.shard_size = max(1, int(shard_size))
        self.prefix = prefix

        self._shard_idx = 0
        self._n_in_shard = 0
        self._tar: Optional[tarfile.TarFile] = None  # currently open shard

        self._open_new()

    def _open_new(self):
        """Close the current shard (if any) and open the next one."""
        if self._tar is not None:
            self._tar.close()
        shard_name = f"{self.prefix}-{self._shard_idx:06d}.tar"
        self._tar_path = self.out_dir / shard_name
        self._tar = tarfile.open(self._tar_path, mode="w")  # uncompressed tar
        self._n_in_shard = 0
        self._shard_idx += 1

    def write_text(self, rel_path: str, text: str):
        """Add *text* (UTF-8 encoded) as archive member *rel_path*.

        Rotates to a fresh shard first when the current one is full.

        :raises ValueError: if the writer has already been closed.
        """
        if self._tar is None:
            # Fail loudly instead of AttributeError on a closed writer.
            raise ValueError("TarShardWriter is closed")

        # rotate shard if needed
        if self._n_in_shard >= self.shard_size:
            self._open_new()

        data = text.encode("utf-8")
        ti = tarfile.TarInfo(name=rel_path)
        ti.size = len(data)
        self._tar.addfile(ti, io.BytesIO(data))

        self._n_in_shard += 1

    def close(self):
        """Flush and close the current shard; idempotent."""
        if self._tar is not None:
            self._tar.close()
            self._tar = None

    # Context-manager support so callers cannot forget to close().
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False
results.append(r) - vid_dir = self.output_dir / video_stem - if not results: - stats.videos_without_boxes += 1 - if self.empty_list_path: - self.empty_list_path.parent.mkdir(parents=True, exist_ok=True) - with self.empty_list_path.open("a") as f: - f.write(f"{video_uri}\n") - return stats - - if vid_dir.exists() and not self.overwrite_video_dir: - # skip existing video dir to avoid mixing runs - return stats - vid_dir.mkdir(parents=True, exist_ok=True) - wrote_any = False + # Collect lines per frame + frame_lines: Dict[int, List[str]] = defaultdict(list) for r in results: value = r.get("value") or {} labels: List[str] = value.get("labels") or [] @@ -410,19 +444,16 @@ def _convert_item(self, item: dict) -> ConvertStats: frame_boxes = _interpolate_sequence(seq) # frame -> [(x,y,w,h), ...] for frame_idx, boxes in frame_boxes.items(): - label_path = vid_dir / f"frame_{frame_idx:06d}.txt" - with label_path.open("a") as f: - for (x, y, w, h) in boxes: - xc, yc, wn, hn = _to_yolo( - x, y, w, h, - vid_w=vid_w, - vid_h=vid_h, - forced_mode=self.coord_mode, - ) - line = f"{cls_id} {xc:.6f} {yc:.6f} {wn:.6f} {hn:.6f}" - f.write(line + "\n") - stats.label_files_written += 1 - wrote_any = True + for (x, y, w, h) in boxes: + xc, yc, wn, hn = _to_yolo( + x, y, w, h, + vid_w=vid_w, + vid_h=vid_h, + forced_mode=self.coord_mode, + ) + frame_lines[frame_idx].append(f"{cls_id} {xc:.6f} {yc:.6f} {wn:.6f} {hn:.6f}") + stats.label_files_written += 1 + wrote_any = True if wrote_any: stats.videos_with_boxes += 1 @@ -432,6 +463,25 @@ def _convert_item(self, item: dict) -> ConvertStats: with self.empty_list_path.open("a") as f: f.write(f"{video_uri}\n") + if self._sharder: + # write into shards: /frame_000123.txt + for frame_idx, lines in frame_lines.items(): + rel_path = f"{video_stem}/frame_{frame_idx:06d}.txt" + self._sharder.write_text(rel_path, "\n".join(lines) + "\n") + else: + # current behavior: write to filesystem + vid_dir = self.output_dir / video_stem + + if vid_dir.exists() and 
not self.overwrite_video_dir: + # skip existing video dir to avoid mixing runs + return stats + vid_dir.mkdir(parents=True, exist_ok=True) + + for frame_idx, lines in frame_lines.items(): + label_path = vid_dir / f"frame_{frame_idx:06d}.txt" + label_path.parent.mkdir(parents=True, exist_ok=True) + label_path.write_text("\n".join(lines) + "\n") + return stats @@ -449,6 +499,8 @@ def _convert_item(self, item: dict) -> ConvertStats: parser.add_argument("--to-name", default=None, help="Filter by result.to_name (e.g., 'video')") parser.add_argument("--coord-mode", default="percent", help='Set the coordinates mode: "auto", "percent", "normalized", "pixel"') parser.add_argument("--include-sites", nargs="*", default=[], help='Only include videos of these sites') + parser.add_argument("--out-shards", default=None, help="Directory to write TAR shards (instead of many files)") + parser.add_argument("--shard-size", type=int, default=10000, help="Number of frame label files per shard") args = parser.parse_args() data_yaml_path = Path(args.data_yaml) @@ -463,16 +515,21 @@ def _convert_item(self, item: dict) -> ConvertStats: to_name=args.to_name, coord_mode=args.coord_mode, include_sites=args.include_sites, + shard_dir=Path(args.out_shards) if args.out_shards else None, + shard_size=args.shard_size, ) inp = Path(args.input) if inp.is_dir(): - print(f"Converting labels in folder {inp}") + print(f"Converting labels from {inp}") s = conv.convert_folder(inp, pattern=args.pattern) else: s = conv.convert_file(inp) + if getattr(conv, "_sharder", None): + conv._sharder.close() + print( f"Done. 
with_boxes={s.videos_with_boxes} without_boxes={s.videos_without_boxes} " f"labels_written={s.label_files_written} errors={s.errors}" From 57caa377a55ef651b49c871cb0f3c99e838c07d9 Mon Sep 17 00:00:00 2001 From: KamiCreed <9517086+KamiCreed@users.noreply.github.com> Date: Mon, 2 Mar 2026 13:20:46 -0800 Subject: [PATCH 12/35] Skip every 3rd frame for each video --- training/object-detection/README.md | 9 +- training/object-detection/dvc.lock | 7 +- training/object-detection/dvc.yaml | 2 + .../scripts/yolo_converter_ls_video.py | 182 ++++++++++-------- 4 files changed, 119 insertions(+), 81 deletions(-) diff --git a/training/object-detection/README.md b/training/object-detection/README.md index df2bacf..f5d6d7a 100644 --- a/training/object-detection/README.md +++ b/training/object-detection/README.md @@ -1,10 +1,17 @@ # Object Detection -Install uv +Training pipeline to train the SalmonVision object detection model. + +Install uv: ```bash curl -LsSf https://astral.sh/uv/install.sh | sh ``` +Install DVC: +```bash +uv tool install dvc +``` + Check dvc.yaml for the full pipeline. 
Run the following to run specific stages of the pipeline: diff --git a/training/object-detection/dvc.lock b/training/object-detection/dvc.lock index 80c86f7..eb8328e 100644 --- a/training/object-detection/dvc.lock +++ b/training/object-detection/dvc.lock @@ -36,7 +36,8 @@ stages: cmd: scripts/yolo_converter_ls_video.py data/01_raw/labelstudio_annos --data-yaml config/salmon_yolo.yaml --out data/02_interim/yolo_annos --out-shards data/02_interim/yolo_annos --shard-size 100000 --pattern - '**/*.json' --include-sites tankeeah kitwanga bear + '**/*.json' --include-sites tankeeah kitwanga bear --frame-stride 3 + --frame-offset-mode video_hash deps: - path: config/salmon_yolo.yaml hash: md5 @@ -49,8 +50,8 @@ stages: nfiles: 50 - path: scripts/yolo_converter_ls_video.py hash: md5 - md5: 36a7224d1eea0b6e300d421a7907c411 - size: 18877 + md5: c076aef67ddd95232ad60c90a0acb3d4 + size: 20426 outs: - path: data/02_interim/yolo_annos hash: md5 diff --git a/training/object-detection/dvc.yaml b/training/object-detection/dvc.yaml index abf5983..a6981e1 100644 --- a/training/object-detection/dvc.yaml +++ b/training/object-detection/dvc.yaml @@ -22,6 +22,8 @@ stages: --shard-size 100000 --pattern '**/*.json' --include-sites tankeeah kitwanga bear + --frame-stride 3 + --frame-offset-mode video_hash deps: - scripts/yolo_converter_ls_video.py - data/01_raw/labelstudio_annos diff --git a/training/object-detection/scripts/yolo_converter_ls_video.py b/training/object-detection/scripts/yolo_converter_ls_video.py index 4f021ca..bacff7a 100755 --- a/training/object-detection/scripts/yolo_converter_ls_video.py +++ b/training/object-detection/scripts/yolo_converter_ls_video.py @@ -18,6 +18,7 @@ from datetime import datetime import io import tarfile +import zlib class TarShardWriter: def __init__(self, out_dir: Path, shard_size: int = 10000, prefix: str = "yolo_annos"): @@ -99,82 +100,6 @@ def load_class_map_from_yolo_yaml(yaml_path: Path) -> Dict[str, int]: return class_map -def 
_interpolate_sequence(seq: Iterable[dict]) -> Dict[int, List[Tuple[float, float, float, float]]]: - """ - Given a Label Studio 'sequence' (list of keyframes) like: - - { - "frame": 47, "x": 0, "y": 62.8, "width": 15.9, "height": 15.1, "enabled": true - }, - ... - - Produce: frame_index -> list of (x, y, w, h) in the SAME units as input. - - Semantics: - - Every keyframe (enabled or not) produces a box at its own frame. - - If a keyframe has enabled=True, we linearly interpolate boxes for the frames - *between it and the next keyframe* (f0+1 .. f1-1). - - If a keyframe has enabled=False, we do NOT interpolate forward from it, - but we still keep its own box at that frame. - - A disabled keyframe can still be the *end* of an interpolation that started - from a previous enabled keyframe (since that interpolation uses the previous - keyframe's enabled flag). - """ - # Sort keyframes by frame - kfs = sorted(seq, key=lambda k: int(_safe_float(k.get("frame"), 0))) - frames_boxes: Dict[int, List[Tuple[float, float, float, float]]] = {} - - if not kfs: - return frames_boxes - - # 1) Add all keyframes as boxes at their exact frames - for k in kfs: - f = int(_safe_float(k.get("frame"), -1)) - if f < 0: - continue - - x = _safe_float(k.get("x")) - y = _safe_float(k.get("y")) - w = _safe_float(k.get("width")) - h = _safe_float(k.get("height")) - frames_boxes.setdefault(f, []).append((x, y, w, h)) - - # 2) Interpolate between consecutive keyframes when the *start* keyframe is enabled - for i in range(len(kfs) - 1): - k0 = kfs[i] - k1 = kfs[i + 1] - - f0 = int(_safe_float(k0.get("frame"), -1)) - f1 = int(_safe_float(k1.get("frame"), -1)) - if f0 < 0 or f1 <= f0: - continue - - enabled0 = bool(k0.get("enabled", True)) - if not enabled0: - # Do not interpolate forward from a disabled keyframe - continue - - x0 = _safe_float(k0.get("x")) - y0 = _safe_float(k0.get("y")) - w0 = _safe_float(k0.get("width")) - h0 = _safe_float(k0.get("height")) - - x1 = _safe_float(k1.get("x")) - y1 = 
_safe_float(k1.get("y")) - w1 = _safe_float(k1.get("width")) - h1 = _safe_float(k1.get("height")) - - # Fill in strictly between endpoints; endpoints themselves are already added - for f in range(f0 + 1, f1): - t = (f - f0) / float(f1 - f0) - x = x0 + (x1 - x0) * t - y = y0 + (y1 - y0) * t - w = w0 + (w1 - w0) * t - h = h0 + (h1 - h0) * t - frames_boxes.setdefault(f, []).append((x, y, w, h)) - - return frames_boxes - @dataclass class ConvertStats: videos_with_boxes: int = 0 @@ -306,6 +231,9 @@ def __init__( include_sites: List[str] = [], shard_dir: Optional[Path] = None, shard_size: int = 10000, + frame_stride: int = 1, + frame_offset_mode: str = "fixed", + frame_offset: int = 0, ): """ :param coord_mode: @@ -332,6 +260,9 @@ def __init__( self.shard_dir = Path(shard_dir) if shard_dir else None self.shard_size = int(shard_size) self._sharder = TarShardWriter(self.shard_dir, shard_size=self.shard_size) if self.shard_dir else None + self.frame_stride = max(1, int(frame_stride)) + self.frame_offset_mode = frame_offset_mode + self.frame_offset = int(frame_offset) # ---- public API ---- @@ -379,6 +310,94 @@ def convert_file(self, json_path: Path) -> ConvertStats: # ---- internals ---- + def _stride_offset(self, video_stem: str) -> int: + if self.frame_stride <= 1: + return 0 + if self.frame_offset_mode == "fixed": + return int(self.frame_offset) % self.frame_stride + if self.frame_offset_mode == "video_hash": + # deterministic across runs + platforms + return zlib.crc32(video_stem.encode("utf-8")) % self.frame_stride + + raise ValueError("Invalid frame offset mode") + + @staticmethod + def _interpolate_sequence(seq: Iterable[dict]) -> Dict[int, List[Tuple[float, float, float, float]]]: + """ + Given a Label Studio 'sequence' (list of keyframes) like: + + { + "frame": 47, "x": 0, "y": 62.8, "width": 15.9, "height": 15.1, "enabled": true + }, + ... + + Produce: frame_index -> list of (x, y, w, h) in the SAME units as input. 
+ + Semantics: + - Every keyframe (enabled or not) produces a box at its own frame. + - If a keyframe has enabled=True, we linearly interpolate boxes for the frames + *between it and the next keyframe* (f0+1 .. f1-1). + - If a keyframe has enabled=False, we do NOT interpolate forward from it, + but we still keep its own box at that frame. + - A disabled keyframe can still be the *end* of an interpolation that started + from a previous enabled keyframe (since that interpolation uses the previous + keyframe's enabled flag). + """ + # Sort keyframes by frame + kfs = sorted(seq, key=lambda k: int(_safe_float(k.get("frame"), 0))) + frames_boxes: Dict[int, List[Tuple[float, float, float, float]]] = {} + + if not kfs: + return frames_boxes + + # 1) Add all keyframes as boxes at their exact frames + for k in kfs: + f = int(_safe_float(k.get("frame"), -1)) + if f < 0: + continue + + x = _safe_float(k.get("x")) + y = _safe_float(k.get("y")) + w = _safe_float(k.get("width")) + h = _safe_float(k.get("height")) + frames_boxes.setdefault(f, []).append((x, y, w, h)) + + # 2) Interpolate between consecutive keyframes when the *start* keyframe is enabled + for i in range(len(kfs) - 1): + k0 = kfs[i] + k1 = kfs[i + 1] + + f0 = int(_safe_float(k0.get("frame"), -1)) + f1 = int(_safe_float(k1.get("frame"), -1)) + if f0 < 0 or f1 <= f0: + continue + + enabled0 = bool(k0.get("enabled", True)) + if not enabled0: + # Do not interpolate forward from a disabled keyframe + continue + + x0 = _safe_float(k0.get("x")) + y0 = _safe_float(k0.get("y")) + w0 = _safe_float(k0.get("width")) + h0 = _safe_float(k0.get("height")) + + x1 = _safe_float(k1.get("x")) + y1 = _safe_float(k1.get("y")) + w1 = _safe_float(k1.get("width")) + h1 = _safe_float(k1.get("height")) + + # Fill in strictly between endpoints; endpoints themselves are already added + for f in range(f0 + 1, f1): + t = (f - f0) / float(f1 - f0) + x = x0 + (x1 - x0) * t + y = y0 + (y1 - y0) * t + w = w0 + (w1 - w0) * t + h = h0 + (h1 - h0) * t 
+ frames_boxes.setdefault(f, []).append((x, y, w, h)) + + return frames_boxes + def _log_error(self, context: str, exc: Exception): try: self.error_log_path.parent.mkdir(parents=True, exist_ok=True) @@ -441,7 +460,7 @@ def _convert_item(self, item: dict) -> ConvertStats: cls_id = self.class_map[cls_name] seq: Iterable[dict] = value.get("sequence") or [] - frame_boxes = _interpolate_sequence(seq) # frame -> [(x,y,w,h), ...] + frame_boxes = self._interpolate_sequence(seq) # frame -> [(x,y,w,h), ...] for frame_idx, boxes in frame_boxes.items(): for (x, y, w, h) in boxes: @@ -501,6 +520,12 @@ def _convert_item(self, item: dict) -> ConvertStats: parser.add_argument("--include-sites", nargs="*", default=[], help='Only include videos of these sites') parser.add_argument("--out-shards", default=None, help="Directory to write TAR shards (instead of many files)") parser.add_argument("--shard-size", type=int, default=10000, help="Number of frame label files per shard") + parser.add_argument("--frame-stride", type=int, default=1, + help="Keep every Nth frame (1 keeps all)") + parser.add_argument("--frame-offset-mode", choices=["fixed", "video_hash"], default="video_hash", + help="How to choose offset within stride") + parser.add_argument("--frame-offset", type=int, default=0, + help="Offset for fixed mode (0..stride-1)") args = parser.parse_args() data_yaml_path = Path(args.data_yaml) @@ -517,6 +542,9 @@ def _convert_item(self, item: dict) -> ConvertStats: include_sites=args.include_sites, shard_dir=Path(args.out_shards) if args.out_shards else None, shard_size=args.shard_size, + frame_stride=args.frame_stride, + frame_offset_mode=args.frame_offset_mode, + frame_offset=args.frame_offset, ) inp = Path(args.input) From 4cc372adcb4f3441b2272b79081ac9c9ec62cf6c Mon Sep 17 00:00:00 2001 From: KamiCreed <9517086+KamiCreed@users.noreply.github.com> Date: Mon, 2 Mar 2026 15:30:04 -0800 Subject: [PATCH 13/35] Add unpacking step --- training/object-detection/.gitignore | 1 - 
.../data/02_interim/.gitignore | 1 + training/object-detection/dvc.lock | 9 +++++ training/object-detection/dvc.yaml | 9 +++++ .../object-detection/scripts/unpack_annos.sh | 33 +++++++++++++++++++ 5 files changed, 52 insertions(+), 1 deletion(-) create mode 100755 training/object-detection/scripts/unpack_annos.sh diff --git a/training/object-detection/.gitignore b/training/object-detection/.gitignore index 391d5f2..e69de29 100644 --- a/training/object-detection/.gitignore +++ b/training/object-detection/.gitignore @@ -1 +0,0 @@ -/01_raw diff --git a/training/object-detection/data/02_interim/.gitignore b/training/object-detection/data/02_interim/.gitignore index 2019ca8..8d21cd8 100644 --- a/training/object-detection/data/02_interim/.gitignore +++ b/training/object-detection/data/02_interim/.gitignore @@ -1 +1,2 @@ /yolo_annos +/yolo_annos_unpacked diff --git a/training/object-detection/dvc.lock b/training/object-detection/dvc.lock index eb8328e..517e0bf 100644 --- a/training/object-detection/dvc.lock +++ b/training/object-detection/dvc.lock @@ -58,3 +58,12 @@ stages: md5: 978efd3dca7ac11abcb153db010c39ae.dir size: 880998400 nfiles: 9 + unpack_annos: + cmd: scripts/unpack_annos.sh data/02_interim/yolo_annos + data/02_interim/yolo_annos_unpacked + deps: + - path: data/02_interim/yolo_annos + hash: md5 + md5: 978efd3dca7ac11abcb153db010c39ae.dir + size: 880998400 + nfiles: 9 diff --git a/training/object-detection/dvc.yaml b/training/object-detection/dvc.yaml index a6981e1..8dfd154 100644 --- a/training/object-detection/dvc.yaml +++ b/training/object-detection/dvc.yaml @@ -12,6 +12,7 @@ stages: - s3://salmonvision-ml-datasets/rgb/raw outs: - data/01_raw/labelstudio_annos + frozen: true build_model_input: cmd: >- scripts/yolo_converter_ls_video.py @@ -30,6 +31,13 @@ stages: - config/salmon_yolo.yaml outs: - data/02_interim/yolo_annos + unpack_annos: + cmd: >- + scripts/unpack_annos.sh + data/02_interim/yolo_annos + data/02_interim/yolo_annos_unpacked + deps: + - 
#!/usr/bin/env bash
# unpack_annos.sh -- extract every TAR shard from in_folder into out_folder.
#
# Usage: unpack_annos.sh [-h] in_folder out_folder
set -euo pipefail

help_msg() {
    echo "$0 [-h] in_folder out_folder"
}

# Get the options
while getopts ":h" option; do
    case $option in
        h) # display Help
            help_msg
            exit;;
        \?) # Invalid option
            echo "Error: Invalid option"
            help_msg
            exit 1;;
    esac
done
# Drop the parsed options so positional args line up with $1/$2.
shift $((OPTIND - 1))

# Check if exactly two arguments are given
if [ $# -ne 2 ]; then
    help_msg
    exit 1
fi

in_path="$1"
out_path="$2"

mkdir -p "$out_path"

# nullglob so a shard-less directory yields an empty array instead of the
# literal pattern '*.tar' being handed to tar.
shopt -s nullglob
shards=("$in_path"/*.tar)
if [ ${#shards[@]} -eq 0 ]; then
    echo "Error: no .tar shards found in $in_path" >&2
    exit 1
fi

for f in "${shards[@]}"; do
    tar -xf "$f" -C "$out_path"
done
'**/*.json' --include-sites tankeeah kitwanga bear --frame-stride 3 - --frame-offset-mode video_hash + cmd: rm -r data/02_interim/yolo_annos || scripts/yolo_converter_ls_video.py + data/01_raw/labelstudio_annos --data-yaml config/salmon_yolo.yaml --out + data/02_interim/yolo_annos --out-shards data/02_interim/yolo_annos + --shard-size 100000 --pattern '**/*.json' --include-sites tankeeah + kitwanga bear --frame-stride 3 --frame-offset-mode video_hash deps: - path: config/salmon_yolo.yaml hash: md5 @@ -50,20 +50,20 @@ stages: nfiles: 50 - path: scripts/yolo_converter_ls_video.py hash: md5 - md5: c076aef67ddd95232ad60c90a0acb3d4 - size: 20426 + md5: ae7fb35f80b387294b652e733c3ba500 + size: 20699 outs: - path: data/02_interim/yolo_annos hash: md5 - md5: 978efd3dca7ac11abcb153db010c39ae.dir - size: 880998400 - nfiles: 9 + md5: 473a8eedf5fd4cc5d5b8b6d1f80681e7.dir + size: 293713920 + nfiles: 3 unpack_annos: - cmd: scripts/unpack_annos.sh data/02_interim/yolo_annos - data/02_interim/yolo_annos_unpacked + cmd: rm -r data/02_interim/yolo_annos_unpacked || scripts/unpack_annos.sh + data/02_interim/yolo_annos data/02_interim/yolo_annos_unpacked deps: - path: data/02_interim/yolo_annos hash: md5 - md5: 978efd3dca7ac11abcb153db010c39ae.dir - size: 880998400 - nfiles: 9 + md5: 473a8eedf5fd4cc5d5b8b6d1f80681e7.dir + size: 293713920 + nfiles: 3 diff --git a/training/object-detection/dvc.yaml b/training/object-detection/dvc.yaml index 8dfd154..02b6a10 100644 --- a/training/object-detection/dvc.yaml +++ b/training/object-detection/dvc.yaml @@ -33,6 +33,7 @@ stages: - data/02_interim/yolo_annos unpack_annos: cmd: >- + rm -r data/02_interim/yolo_annos_unpacked || scripts/unpack_annos.sh data/02_interim/yolo_annos data/02_interim/yolo_annos_unpacked diff --git a/training/object-detection/scripts/make_splits.py b/training/object-detection/scripts/make_splits.py new file mode 100644 index 0000000..4293706 --- /dev/null +++ b/training/object-detection/scripts/make_splits.py @@ -0,0 
+1,656 @@ +#!/usr/bin/env python3 +""" +make_splits.py + +Group-wise stratified-ish split for unpacked YOLO label files. + +- Input: unpacked labels directory that looks like: + //frame_000123.txt + + where video_stem looks like: + ORG-site-device-id_YYYYMMDD_HHMMSS_M + +- Output: + out_dir/train.txt + out_dir/val.txt + out_dir/test.txt + out_dir/group_assignments.csv + out_dir/split_report.json + +Split unit (to prevent leakage): group_id = site + device + date(YYYYMMDD) +Balancing objectives (soft): class counts, time-of-day, density bins, box area bins. +""" + +from __future__ import annotations + +import argparse +import csv +import json +import math +import os +import random +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List, Tuple, Optional, Iterable, Any +from collections import defaultdict, Counter + +# ----------------------------- +# Parsing helpers +# ----------------------------- + +_STEM_RE = re.compile( + r""" + ^ + (?P.+?) # ORG-site-device-id (up to first underscore) + _ + (?P\d{8}) # YYYYMMDD + _ + (?P