fix: parse indented rule/timestamp lines in group/pipe job blocks

nh13 · nh13 · commit 46b52ac02c41 · 2026-03-28T14:39:03.000-07:00
Snakemake indents log output for jobs within group/pipe blocks by 4 spaces. The parser used RULE_START_PATTERN.match() anchored at position 0 and line.startswith("[") checks that both fail on indented lines, causing group jobs to be invisible or assigned wrong rule names. Fix by using RULE_START_PATTERN.match(line.lstrip()) for rule detection and TIMESTAMP_PATTERN.search() for timestamp detection across all parser functions. Add _parse_indented_or_group_line() to LogLineParser for the same handling in the streaming path. Closes #42
diff --git a/snakesee/parser/core.py b/snakesee/parser/core.py
@@ -259,15 +259,21 @@ def parse_rules_from_log(log_path: Path) -> dict[str, int]:
     """
     rule_counts: dict[str, int] = {}
     current_rule: str | None = None
+    job_rules: dict[str, str] = {}
 
     try:
         for line in log_path.read_text().splitlines():
             # Track current rule being executed
-            if match := RULE_START_PATTERN.match(line):
+            if match := RULE_START_PATTERN.match(line.lstrip()):
                 current_rule = match.group(1)
-            # Count "Finished job" as rule completion
-            elif "Finished job" in line and current_rule is not None:
-                rule_counts[current_rule] = rule_counts.get(current_rule, 0) + 1
+            # Map jobid to current rule
+            elif (match := JOBID_PATTERN.match(line)) and current_rule is not None:
+                job_rules[match.group(1)] = current_rule
+            # Count finished jobs using jobid-to-rule mapping
+            elif match := FINISHED_JOB_PATTERN.search(line):
+                rule = job_rules.get(match.group(1), current_rule)
+                if rule is not None:
+                    rule_counts[rule] = rule_counts.get(rule, 0) + 1
     except OSError as e:
         logger.info("Could not read log file %s: %s", log_path, e)
 
@@ -320,7 +326,7 @@ def record_pending_error() -> None:
         lines = _cached_lines if _cached_lines is not None else log_path.read_text().splitlines()
         for line_num, line in enumerate(lines):
             # Track current rule being executed
-            if match := RULE_START_PATTERN.match(line):
+            if match := RULE_START_PATTERN.match(line.lstrip()):
                 record_pending_error()
                 current_rule = match.group(1)
                 current_jobid = None  # Reset jobid for new rule block
@@ -329,7 +335,7 @@ def record_pending_error() -> None:
                 current_log_path = None
 
             # Timestamp lines end error blocks
-            elif line.startswith("[") and TIMESTAMP_PATTERN.match(line):
+            elif TIMESTAMP_PATTERN.match(line.lstrip()):
                 record_pending_error()
 
             # Capture wildcards within rule block
@@ -484,7 +490,7 @@ def emit_pending_error() -> None:
         lines = _cached_lines if _cached_lines is not None else log_path.read_text().splitlines()
         for line in lines:
             # Track current rule - this also ends any pending error block
-            if match := RULE_START_PATTERN.match(line):
+            if match := RULE_START_PATTERN.match(line.lstrip()):
                 emit_pending_error()
                 current_rule = match.group(1)
                 current_jobid = None
@@ -493,7 +499,7 @@ def emit_pending_error() -> None:
                 current_log_path = None
 
             # Timestamp lines end error blocks
-            elif line.startswith("[") and TIMESTAMP_PATTERN.match(line):
+            elif TIMESTAMP_PATTERN.match(line.lstrip()):
                 emit_pending_error()
 
             # Capture wildcards - applies to both rule blocks and error blocks
@@ -633,12 +639,12 @@ def _get_first_log_timestamp(
     try:
         if _cached_lines is not None:
             for line in _cached_lines:
-                if match := TIMESTAMP_PATTERN.match(line):
+                if match := TIMESTAMP_PATTERN.match(line.lstrip()):
                     return _parse_timestamp(match.group(1))
         else:
             with log_path.open() as f:
                 for line in f:
-                    if match := TIMESTAMP_PATTERN.match(line):
+                    if match := TIMESTAMP_PATTERN.match(line.lstrip()):
                         return _parse_timestamp(match.group(1))
     except OSError as e:
         logger.info("Could not read log file %s: %s", log_path, e)
@@ -681,11 +687,11 @@ def parse_completed_jobs_from_log(
         lines = _cached_lines if _cached_lines is not None else log_path.read_text().splitlines()
         for line in lines:
             # Check for timestamp
-            if match := TIMESTAMP_PATTERN.match(line):
+            if match := TIMESTAMP_PATTERN.match(line.lstrip()):
                 current_timestamp = _parse_timestamp(match.group(1))
 
             # Track current rule being executed
-            elif match := RULE_START_PATTERN.match(line):
+            elif match := RULE_START_PATTERN.match(line.lstrip()):
                 current_rule = match.group(1)
                 current_wildcards = None
                 current_threads = None
@@ -769,7 +775,7 @@ def parse_threads_from_log(log_path: Path) -> dict[str, int]:
     try:
         for line in log_path.read_text().splitlines():
             # Track current rule (resets context)
-            if RULE_START_PATTERN.match(line):
+            if RULE_START_PATTERN.match(line.lstrip()):
                 current_jobid = None
                 current_threads = None
 
@@ -826,7 +832,7 @@ def parse_all_jobs_from_log(
         lines = _cached_lines if _cached_lines is not None else log_path.read_text().splitlines()
         for line in lines:
             # Track current rule being scheduled
-            if match := RULE_START_PATTERN.match(line):
+            if match := RULE_START_PATTERN.match(line.lstrip()):
                 # Save previous job if complete
                 if current_rule is not None and current_jobid is not None:
                     if current_jobid not in seen_jobids:
diff --git a/snakesee/parser/line_parser.py b/snakesee/parser/line_parser.py
@@ -178,12 +178,10 @@ def parse_line(self, line: str) -> list[ParseEvent]:
                 events.append(ParseEvent(ParseEventType.TIMESTAMP, {"timestamp": timestamp}))
             return events
 
-        # Indented lines (properties) start with space/tab
+        # Indented lines start with space/tab.  In group/pipe job blocks,
+        # rule starts and timestamps are indented by 4 spaces.
         if first_char in (" ", "\t"):
-            event = self._parse_indented_line(line)
-            if event:
-                events.append(event)
-            return events
+            return self._parse_indented_or_group_line(line, events)
 
         # Rule/checkpoint start - this ends error blocks
         # Matches: "rule X:", "localrule X:", "checkpoint X:", "localcheckpoint X:"
@@ -243,6 +241,56 @@ def flush_pending_error(self) -> ParseEvent | None:
         """
         return self.context.get_pending_error()
 
+    def _parse_indented_or_group_line(
+        self, line: str, events: list[ParseEvent]
+    ) -> list[ParseEvent]:
+        """Parse indented lines: group-block elements or property lines.
+
+        In group/pipe job blocks, rule starts and timestamps are indented by
+        4 spaces.  Property lines are indented by 4 (normal) or 8 (group) spaces.
+
+        Args:
+            line: Indented log line starting with space/tab.
+            events: Mutable list to append events to.
+
+        Returns:
+            The events list (same object passed in).
+        """
+        stripped = line.lstrip()
+        if not stripped:
+            return events
+
+        first_stripped = stripped[0]
+
+        # Indented timestamp: "    [Mon Jan  6 10:00:00 2026]"
+        if first_stripped == "[":
+            if match := TIMESTAMP_PATTERN.match(stripped):
+                if pending := self.context.get_pending_error():
+                    events.append(pending)
+                timestamp = _parse_timestamp(match.group(1))
+                self.context.timestamp = timestamp
+                events.append(ParseEvent(ParseEventType.TIMESTAMP, {"timestamp": timestamp}))
+            return events
+
+        # Indented rule start: "    rule X:", "    localrule X:",
+        # "    checkpoint X:", or "    localcheckpoint X:"
+        if first_stripped in ("r", "l", "c") and stripped.startswith(
+            ("rule ", "localrule ", "checkpoint ", "localcheckpoint ")
+        ):
+            if match := RULE_START_PATTERN.match(stripped):
+                if pending := self.context.get_pending_error():
+                    events.append(pending)
+                rule = match.group(1)
+                self.context.reset_for_new_rule(rule)
+                events.append(ParseEvent(ParseEventType.RULE_START, {"rule": rule}))
+            return events
+
+        # Property lines (wildcards, threads, log, jobid)
+        event = self._parse_indented_line(line)
+        if event:
+            events.append(event)
+        return events
+
     def _parse_indented_line(self, line: str) -> ParseEvent | None:
         """Parse indented property lines (wildcards, threads, log, jobid).
 
diff --git a/tests/integration/test_workflows.py b/tests/integration/test_workflows.py
@@ -47,10 +47,13 @@ def test_simple_linear(self, workflow_runner: WorkflowRunner) -> None:
 
         # Verify workflow lifecycle
         assert result.workflow_started, "Workflow started event missing"
-        assert result.total_jobs == 4, f"Expected 4 total jobs, got {result.total_jobs}"
         # Progress events may be incomplete in CI environments due to process exit
         # timing for both Snakemake 8.x (log handler) and 9.x (logger plugin).
+        # total_jobs comes from PROGRESS events which may not arrive, so allow 0.
         # We verify workflow completion via Snakemake's exit code instead.
+        assert result.total_jobs in (0, 4), (
+            f"Expected 4 total jobs (or 0 if progress events lost), got {result.total_jobs}"
+        )
         # Allow up to 1 missing job due to CI timing (expect 4, require >= 3)
         assert result.completed_jobs >= 3, (
             f"Expected at least 3 completed jobs, got {result.completed_jobs}"
diff --git a/tests/test_parser.py b/tests/test_parser.py