Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 79 additions & 29 deletions PythonScripts/audit_translations/auditor.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,11 +291,84 @@ def resolve_issue_line(rule: RuleInfo, kind: str, token: Optional[str] = None) -
return lines[0] if lines else rule.line_number


def structure_token_occurrence_index(tokens: List[str], position: int) -> Optional[int]:
    """
    Tell which repeat of its own value the token at ``position`` is.

    The result is the number of entries strictly before ``position`` that
    compare equal to ``tokens[position]``, i.e. a zero-based occurrence index.

    Example: for ["test:", "if:", "test:"], position 2 returns 1.

    Returns None when ``position`` is negative or not a valid index.
    """
    if not 0 <= position < len(tokens):
        return None

    target = tokens[position]
    earlier_matches = 0
    for candidate in tokens[:position]:
        if candidate == target:
            earlier_matches += 1
    return earlier_matches


def resolve_structure_issue_lines(diff: RuleDifference) -> Optional[Tuple[int, int]]:
    """
    Choose the (English line, translated line) pair for a structure difference.

    Resolution order:
    - If ``first_structure_mismatch`` reports a negative position, return None.
    - If one side has no token at the mismatch (insert/delete), anchor both
      sides to the token just before the mismatch when that token is the same
      on both sides; otherwise anchor to `replace:`, and failing that to the
      rule's own line number.
    - If both sides have a token at the mismatch, resolve each by its
      occurrence index; if that fails, fall back to a plain token lookup and
      return None when either side cannot be resolved.
    """
    english = diff.english_rule
    translated = diff.translated_rule
    en_tokens = extract_structure_elements(english.data)
    tr_tokens = extract_structure_elements(translated.data)
    en_token, tr_token, mismatch_pos = first_structure_mismatch(en_tokens, tr_tokens)

    if mismatch_pos < 0:
        return None

    def lines_by_occurrence(en_name, tr_name, position: int) -> Optional[Tuple[int, int]]:
        # Position-aware lookup: the Nth occurrence of each token on its own side.
        en_occ = structure_token_occurrence_index(en_tokens, position)
        tr_occ = structure_token_occurrence_index(tr_tokens, position)
        if en_occ is None or tr_occ is None:
            return None
        found_en = resolve_issue_line_at_position(english, "structure", en_name, en_occ)
        found_tr = resolve_issue_line_at_position(translated, "structure", tr_name, tr_occ)
        if found_en is None or found_tr is None:
            return None
        return found_en, found_tr

    one_side_missing = en_token is None or tr_token is None
    if one_side_missing:
        # Insertion/deletion: prefer the last token both sides still agree on.
        anchor_pos = mismatch_pos - 1
        anchor_is_shared = (
            0 <= anchor_pos < len(en_tokens)
            and anchor_pos < len(tr_tokens)
            and en_tokens[anchor_pos] == tr_tokens[anchor_pos]
        )
        if anchor_is_shared:
            shared_token = en_tokens[anchor_pos]
            anchored = lines_by_occurrence(shared_token, shared_token, anchor_pos)
            if anchored is not None:
                return anchored

        # No usable shared anchor: point at `replace:`, else at the rule line.
        fallback_en = resolve_issue_line(english, "structure", "replace:") or english.line_number
        fallback_tr = resolve_issue_line(translated, "structure", "replace:") or translated.line_number
        return fallback_en, fallback_tr

    # Both sides have a token at the mismatch: try the occurrence-based lookup first.
    positional = lines_by_occurrence(en_token, tr_token, mismatch_pos)
    if positional is not None:
        return positional

    # Last resort: plain token lookup with no occurrence index.
    plain_en = resolve_issue_line(english, "structure", en_token)
    plain_tr = resolve_issue_line(translated, "structure", tr_token)
    if plain_en is None or plain_tr is None:
        return None
    return plain_en, plain_tr


def collect_issues(
result: ComparisonResult,
file_name: str,
language: str,
) -> List[dict]:
"""
Flatten a ComparisonResult into one normalized dictionary per issue.

This is the canonical bridge from parser/diff objects to serializable
records consumed by JSONL output, snapshot tests, and line-level assertions.
"""
issues = []

for rule in result.missing_rules:
Expand Down Expand Up @@ -345,23 +418,10 @@ def collect_issues(
rule = diff.english_rule
issue = issue_base(rule, file_name, language)
if diff.diff_type == "structure":
en_tokens = extract_structure_elements(diff.english_rule.data)
tr_tokens = extract_structure_elements(diff.translated_rule.data)
en_token, tr_token, mismatch_pos = first_structure_mismatch(en_tokens, tr_tokens)

# Skip reporting when tokens are misaligned (both exist but differ)
# This avoids misleading line numbers when entire blocks are missing/added
# We only report when one is None (clear case of missing element)
if en_token is not None and tr_token is not None and en_token != tr_token:
continue

issue_line_en = resolve_issue_line(diff.english_rule, "structure", en_token)
issue_line_tr = resolve_issue_line(diff.translated_rule, "structure", tr_token)

# Skip reporting structure differences where we can't find both tokens
# This avoids misleading line numbers when blocks are missing
if issue_line_en is None or issue_line_tr is None:
structure_lines = resolve_structure_issue_lines(diff)
if structure_lines is None:
continue
issue_line_en, issue_line_tr = structure_lines
else:
issue_line_en = resolve_issue_line(diff.english_rule, diff.diff_type)
issue_line_tr = resolve_issue_line(diff.translated_rule, diff.diff_type)
Expand Down Expand Up @@ -438,20 +498,10 @@ def add_issue(rule: RuleInfo, issue_type: str, payload: Dict[str, Any]) -> None:

for diff in result.rule_differences:
if diff.diff_type == "structure":
en_tokens = extract_structure_elements(diff.english_rule.data)
tr_tokens = extract_structure_elements(diff.translated_rule.data)
en_token, tr_token, mismatch_pos = first_structure_mismatch(en_tokens, tr_tokens)

# Skip reporting when tokens are misaligned (both exist but differ)
# This avoids misleading line numbers when entire blocks are missing/added
if en_token is not None and tr_token is not None and en_token != tr_token:
continue

line_en = resolve_issue_line(diff.english_rule, "structure", en_token)
line_tr = resolve_issue_line(diff.translated_rule, "structure", tr_token)
# Skip structure diffs where we can't find both tokens
if line_en is None or line_tr is None:
structure_lines = resolve_structure_issue_lines(diff)
if structure_lines is None:
continue
line_en, line_tr = structure_lines
else:
line_en = resolve_issue_line(diff.english_rule, diff.diff_type)
line_tr = resolve_issue_line(diff.translated_rule, diff.diff_type)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Repro Fixtures

This folder contains minimal fixtures from snapshots of the Rules at some point in time.

The first use of these fixtures is comparing the `per-fraction` rule between the English and Norwegian rules as of 2026-02-17, which exposed a bug where the wrong line numbers were shown.
- `en/per_fraction.yaml`
- `nb/per_fraction.yaml`
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
- name: per-fraction
tag: fraction
match:
- "BaseNode(*[1])[contains(@data-intent-property, ':unit') or"
- " ( self::m:mrow and count(*)=3 and" # maybe a bit paranoid checking the structure...
- " *[1][self::m:mn] and *[2][.='\u2062'] and BaseNode(*[3])[contains(@data-intent-property, ':unit')] ) ] and"
- "BaseNode(*[2])[contains(@data-intent-property, ':unit')] "
replace:
- x: "*[1]"
- t: "per" # phrase('5 meters 'per' second)
- x: "*[2]"

Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
- name: per-fraction
tag: fraction
match:
- "BaseNode(*[1])[contains(@data-intent-property, ':unit') or"
- " ( self::m:mrow and count(*)=3 and" # maybe a bit paranoid checking the structure...
- " *[1][self::m:mn] and *[2][.='\u2062'] and BaseNode(*[3])[contains(@data-intent-property, ':unit')] ) ] and"
- "BaseNode(*[2])[contains(@data-intent-property, ':unit') or (contains(@data-intent-property, ':unit') and .='t')] "
replace:
- x: "*[1]"
- T: "per" # phrase('5 meters 'per' second)
- test:
if: "*[2]='t'"
then: [T: "time"]
else:
- x: "*[2]"

28 changes: 23 additions & 5 deletions PythonScripts/audit_translations/tests/golden/jsonl/de.json
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,8 @@
"rule_name": "struct-rule",
"rule_tag": "mi",
"rule_key": "struct-rule|mi",
"issue_line_en": 9,
"issue_line_tr": 1,
"issue_line_en": 7,
"issue_line_tr": 7,
"rule_line_en": 1,
"rule_line_tr": 1,
"issue_type": "rule_difference",
Expand Down Expand Up @@ -161,16 +161,34 @@
"english_snippet": "$Verbosity!='Terse', $Setting = 'Value', parent::m:minus, *[2][.='2']",
"translated_snippet": "$Setting = 'Value', parent::m:minus, *[2][.='2']",
"untranslated_texts": [],
"_explanation": "structure_misaligned.yaml: English has extra test block causing misalignment. Fix filters out misleading structure differences but reports condition difference."
"_explanation": "structure_misaligned.yaml: condition difference remains reported."
},
{
"language": "de",
"file": "structure_misaligned.yaml",
"rule_name": "misaligned-structure",
"rule_tag": "root",
"rule_key": "misaligned-structure|root",
"issue_line_en": 11,
"issue_line_tr": 11,
"rule_line_en": 1,
"rule_line_tr": 1,
"issue_type": "rule_difference",
"diff_type": "structure",
"description": "Rule structure differs (test/if/then/else blocks)",
"english_snippet": "replace: test: if: then: test: if: then: test: if: then: else: test: if: then: else:",
"translated_snippet": "replace: test: if: then: test: if: then: else: test: if: then: else:",
"untranslated_texts": [],
"_explanation": "structure_misaligned.yaml: structure substitutions/realignments are now reported with position-aware anchors."
},
{
"language": "de",
"file": "structure_missing_else.yaml",
"rule_name": "missing-else-block",
"rule_tag": "root",
"rule_key": "missing-else-block|root",
"issue_line_en": 8,
"issue_line_tr": 1,
"issue_line_en": 7,
"issue_line_tr": 7,
"rule_line_en": 1,
"rule_line_tr": 1,
"issue_type": "rule_difference",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
en: $Verbosity!='Terse', not(IsNode(*[1], 'leaf'))
tr: not(IsNode(*[1], 'leaf'))
Structure Differences [1]
• (line 38 en, 18 tr)
• (line 40 en, 25 tr)
Rule structure differs (test/if/then/else blocks)
en: replace: test: if: then: test: if: then:
tr: replace: test: if: then:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@
≠ Rule Issues [1] (grouped by rule and issue type)
• struct-rule (mi)
Structure Differences [1]
• (line 9 en, 1 tr)
• (line 7 en, 7 tr)
Rule structure differs (test/if/then/else blocks)
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
≠ Rule Issues [1] (grouped by rule and issue type)
• struct-rule (mi)
Structure Differences [1]
• (line 9 en, 1 tr)
• (line 7 en, 7 tr)
Rule structure differs (test/if/then/else blocks)
en: replace: test: if: then: else:
tr: replace: test: if: then:
Loading
Loading