openpotato · fstueber · Jan 18, 2026 · Jan 13, 2026 · Jan 13, 2026 · Jan 15, 2026
diff --git a/.github/workflows/check-data.yaml b/.github/workflows/check-data.yaml
@@ -1,6 +1,6 @@
 name: Check Data
 
-on: [push]
+on: [push, pull_request]
 
 jobs:
   build:
@@ -9,9 +9,9 @@ jobs:
       matrix:
         python-version: ["3.10"]
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v6
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v3
+      uses: actions/setup-python@v6
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies

diff --git a/bin/test_all_tables.py b/bin/test_all_tables.py
@@ -1,38 +1,215 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
+
+from __future__ import annotations
 
 from argparse import ArgumentParser
 from pathlib import Path
+import sys
+from uuid import UUID
 
 import pandas as pd
 
 
-def _check_duration(df: pd.DataFrame, filename: Path) -> None:
-    """Parses StartDate and EndDate as YYYY-MM-DD and checks that EndDate if after the
-    StartDate"""
+REQUIRED_COLUMNS = {
+    "Id",
+    "Country",
+    "StartDate",
+    "EndDate",
+    "Type",
+    "RegionalScope",
+    "Name",
+}
+
+# Columns that must exist and must not be empty per row.
+REQUIRED_NON_EMPTY = {
+    "Id",
+    "Country",
+    "StartDate",
+    "Type",
+    "RegionalScope",
+    "Name",
+}
+
+
+def _csv_line(row_index: int) -> int:
+    """Map a 0-based data-row index to its 1-based CSV line number (the header is line 1)."""
+    return row_index + 2
+
+
+def _read_csv(path: Path) -> pd.DataFrame:
+    """Read a ';'-separated, UTF-8 (BOM tolerated) CSV with every column kept as str."""
+    # keep_default_na=False + na_values=[""]: only empty fields become NaN, "NA" stays text.
+    return pd.read_csv(
+        path,
+        sep=";",
+        encoding="utf-8-sig",
+        dtype=str,
+        keep_default_na=False,
+        na_values=[""],
+    )
+
+
+def _check_required_columns(df: pd.DataFrame, filename: Path) -> list[str]:
+    """Return a one-item error list naming the REQUIRED_COLUMNS absent from *df*, else []."""
+    absent = sorted(REQUIRED_COLUMNS.difference(df.columns))
+    message = f"{filename}: missing required columns: {absent}"
+    return [message] if absent else []
+
+
+def _check_required_values(df: pd.DataFrame, filename: Path) -> list[str]:
+    """Report empty cells in the columns listed in REQUIRED_NON_EMPTY.
+
+    At most ten rows are listed per column; any remainder is summarised as a count.
+    """
+    errors: list[str] = []
+    for col in sorted(REQUIRED_NON_EMPTY.intersection(df.columns)):
+        empty_rows = df.index[df[col].isna().to_numpy()]
+        for idx in empty_rows[:10]:
+            errors.append(f"{filename} (line {_csv_line(int(idx))}): missing value in column '{col}'")
+        hidden = len(empty_rows) - 10
+        if hidden > 0:
+            errors.append(f"{filename}: {hidden} more missing '{col}' values not shown")
+    return errors
+
 
-    df.StartDate = pd.to_datetime(df.StartDate, format="%Y-%m-%d")
-    df.EndDate = pd.to_datetime(df.EndDate, format="%Y-%m-%d")
-    positive_duration_mask = df.EndDate.isna() | ((df.EndDate >= df.StartDate))
-    if not positive_duration_mask.all():
-        raise ValueError(
-            f"Holidays with negative duration in '{filename}':\n"
-            f"{df[~positive_duration_mask]}"
+def _check_country_column(df: pd.DataFrame, filename: Path, expected_country: str) -> list[str]:
+    """Check that each non-empty Country cell equals *expected_country*; lists ten lines, counts the rest."""
+    if "Country" not in df.columns:
+        return []
+    bad = df.index[(df["Country"].notna() & (df["Country"] != expected_country)).to_numpy()]
+    if bad.empty:
+        return []
+    lines = ", ".join(str(_csv_line(int(i))) for i in bad[:10])
+    more = f" (+{len(bad) - 10} more not shown)" if len(bad) > 10 else ""
+    return [f"{filename}: Country column mismatch (expected '{expected_country}') at lines {lines}{more}"]
+
+
+def _parse_dates(df: pd.DataFrame, filename: Path) -> tuple[pd.Series, pd.Series, list[str]]:
+    """Parse the StartDate and EndDate columns using the format %Y-%m-%d.
+
+    Returns (start, end, errors). The two Series contain NaT wherever a cell was
+    missing or could not be parsed; errors holds one message per non-missing cell
+    that failed to parse, limited to the first ten per column.
+    """
+
+    errors: list[str] = []
+    parsed: dict[str, pd.Series] = {}
+
+    for column in ("StartDate", "EndDate"):
+        raw = df[column]
+        converted = pd.to_datetime(raw, format="%Y-%m-%d", errors="coerce")
+        parsed[column] = converted
+
+        # errors="coerce" turns failures into NaT, so a cell that was present
+        # before parsing but is NaT afterwards held an invalid date.
+        rejected = raw[raw.notna() & converted.isna()]
+        for idx, val in rejected.head(10).items():
+            errors.append(f"{filename} (line {_csv_line(int(idx))}): invalid {column} '{val}'")
+
+    return parsed["StartDate"], parsed["EndDate"], errors
+
+
+def _check_duration(
+    start_dates: pd.Series, end_dates: pd.Series, filename: Path
+) -> list[str]:
+    """Report rows whose EndDate is before StartDate (first ten listed); a missing EndDate is allowed."""
+    mask = end_dates.notna() & start_dates.notna() & (end_dates < start_dates)
+    if not mask.any():
+        return []
+
+    errors: list[str] = []
+    for idx in mask[mask].index[:10]:
+        errors.append(
+            f"{filename} (line {_csv_line(int(idx))}): EndDate < StartDate ({end_dates.loc[idx].date()} < {start_dates.loc[idx].date()})"
         )
+    more = int(mask.sum()) - len(errors)
+    if more > 0:
+        errors.append(f"{filename}: {more} more negative durations not shown")
+    return errors
+
+
+def _check_sorting(start_dates: pd.Series, filename: Path) -> list[str]:
+    """Report up to five places where a valid StartDate is later than the next valid one.
+
+    Missing or unparseable dates (NaT) are dropped before comparing, so a violation that straddles such a row is still found.
+    """
+    valid = list(start_dates.dropna().items())
+    errors: list[str] = []
+    for (i, a), (j, b) in zip(valid, valid[1:]):
+        if a > b:
+            errors.append(
+                f"{filename}: not sorted by StartDate: line {_csv_line(int(i))} ({a.date()}) > line {_csv_line(int(j))} ({b.date()})"
+            )
+            if len(errors) >= 5:
+                break
+    return errors
+
+
+def _split_csv_list(value: str) -> list[str]:
+    return [token for token in map(str.strip, value.split(",")) if token]
+
+
+def _check_subdivisions(df: pd.DataFrame, subdivisions: set[str], filename: Path) -> list[str]:
+    """Report comma-separated Subdivisions codes used in *df* that are not in *subdivisions*.
+
+    The check is skipped (returns []) when *subdivisions* is empty or *df* has no
+    Subdivisions column.
+    """
+    if not subdivisions or "Subdivisions" not in df.columns:
+        return []
 
+    used = {code for cell in df["Subdivisions"].dropna() for code in _split_csv_list(cell)}
+    unknown = sorted(used - subdivisions)
+    if not unknown:
+        return []
+    return [f"{filename}: unknown Subdivisions values: {unknown}"]
+
+
+def _check_uuids_and_global_uniqueness(
+    df: pd.DataFrame, filename: Path, seen: dict[str, tuple[Path, int]]
+) -> list[str]:
+    """Validate each Id as a UUID and report Ids already recorded in *seen*.
+
+    *seen* (normalized UUID -> file and line of first use) is updated in place.
+    """
+    if "Id" not in df.columns:
+        return [f"{filename}: missing Id column"]
+    errors: list[str] = []
+    for idx, raw in df["Id"].items():
+        line = _csv_line(int(idx))
+        if pd.isna(raw) or not str(raw).strip():
+            errors.append(f"{filename} (line {line}): missing UUID")
+            continue
+        try:
+            normalized = str(UUID(str(raw))).lower()
+        except (ValueError, AttributeError, TypeError):
+            errors.append(f"{filename} (line {line}): invalid UUID '{raw}'")
+            continue
 
-def _check_subdivisions(
-    df: pd.DataFrame, subdivisions: set[str], filename: Path
-) -> None:
-    """Checks that the subdivisions in df are also present in subdivisions.csv"""
-    if "Subdivisions" in df:
-        unknown_subdivisions = set(
-            df.Subdivisions.dropna().map(lambda x: x.split(",")).explode()
-        ) - set(subdivisions)
-        if unknown_subdivisions:
-            raise ValueError(
-                f"Unknown subdivisions in {filename}: {unknown_subdivisions}. "
-                f"Known are {subdivisions}"
+        first_seen = seen.get(normalized)
+        if first_seen is not None:
+            errors.append(
+                f"Duplicate UUID {normalized}:\n"
+                f"  - {first_seen[0]} (line {first_seen[1]})\n"
+                f"  - {filename} (line {line})"
             )
+        else:
+            seen[normalized] = (filename, line)
+
+    return errors
+
+
+def _load_subdivisions(country_dir: Path) -> set[str]:
+    """Return the ShortName values from <country_dir>/subdivisions.csv.
+
+    Returns an empty set when the file is absent, empty, unparseable, or has no ShortName column.
+    """
+    try:
+        # EAFP: read directly; a zero-byte file raises EmptyDataError, not ParserError.
+        df = _read_csv(country_dir / "subdivisions.csv")
+    except (FileNotFoundError, pd.errors.ParserError, pd.errors.EmptyDataError):
+        return set()
+    return set(df["ShortName"].dropna().astype(str)) if "ShortName" in df.columns else set()
 
 
 def main() -> None:
@@ -42,23 +219,48 @@ def main() -> None:
     )
     args = parser.parse_args()
 
-    for country_dir in sorted(args.data_folder.iterdir()):
-        if not country_dir.is_dir():
-            continue
+    errors: list[str] = []
+    seen_uuids: dict[str, tuple[Path, int]] = {}
 
-        try:
-            df_subdivisions = pd.read_csv(country_dir / "subdivisions.csv", sep=";", keep_default_na=False)
-            subdivisions = set(df_subdivisions.ShortName)
-        except FileNotFoundError:
-            subdivisions = {}
+    for country_dir in sorted([p for p in args.data_folder.iterdir() if p.is_dir()]):
+        expected_country = country_dir.name.upper()
+        subdivisions = _load_subdivisions(country_dir)
+
+        holidays_dir = country_dir / "holidays"
+        if not holidays_dir.exists():
+            continue
 
-        for holidays_file in (country_dir / "holidays").iterdir():
+        for holidays_file in sorted(holidays_dir.glob("*.csv")):
             try:
-                df = pd.read_csv(holidays_file, sep=";")
+                df = _read_csv(holidays_file)
             except pd.errors.ParserError as error:
-                raise ValueError(f"Could not parse '{holidays_file}'") from error
-            _check_subdivisions(df, subdivisions, holidays_file)
-            _check_duration(df, holidays_file)
+                errors.append(f"{holidays_file}: could not parse CSV - {error}")
+                continue
+
+            errors.extend(_check_required_columns(df, holidays_file))
+            if REQUIRED_COLUMNS - set(df.columns):
+                # Don’t cascade on missing columns.
+                continue
+
+            errors.extend(_check_required_values(df, holidays_file))
+
+            errors.extend(_check_country_column(df, holidays_file, expected_country))
+            errors.extend(_check_uuids_and_global_uniqueness(df, holidays_file, seen_uuids))
+
+            start_dates, end_dates, date_errors = _parse_dates(df, holidays_file)
+            errors.extend(date_errors)
+            errors.extend(_check_duration(start_dates, end_dates, holidays_file))
+            errors.extend(_check_sorting(start_dates, holidays_file))
+            errors.extend(_check_subdivisions(df, subdivisions, holidays_file))
+
+    if errors:
+        print(f"Validation failed with {len(errors)} error(s):\n", file=sys.stderr)
+        for message in errors:
+            print(f"- {message}", file=sys.stderr)
+        sys.exit(1)
+
+    print("✓ All validations passed")
 
 
-main()
+if __name__ == "__main__":
+    main()