Skip to content

Commit dfa8ef6

Browse files
feat(validation): add Name field validation and improve error output
- Validate that Name field parts start with a language code (2 uppercase letters + space)
- Group errors by file, with at most 10 errors shown per file
- Add a summary with file and error counts
1 parent 1f80598 commit dfa8ef6

1 file changed

Lines changed: 84 additions & 14 deletions

File tree

bin/test_all_tables.py

File mode changed: 100755 → 100644
Lines changed: 84 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,42 @@ def _check_subdivisions(df: pd.DataFrame, subdivisions: set[str], filename: Path
165165
return [f"{filename}: unknown Subdivisions values: {sorted(unknown)}"]
166166

167167

168+
def _check_name_format(df: pd.DataFrame, filename: Path) -> list[str]:
    """Check that every part of the Name field is "<language code> <text>".

    Each non-missing ``Name`` value is split with ``_split_csv_list`` and every
    resulting part must start with a two-letter language code (ASCII ``A``-``Z``)
    followed by a single space.

    Args:
        df: Table to validate. If it has no ``Name`` column, nothing is checked.
        filename: Path used as the prefix of every error message.

    Returns:
        One error message per offending part, in row order; an empty list when
        the column is absent or every part is well-formed.
    """
    if "Name" not in df.columns:
        return []

    errors: list[str] = []
    for idx, name in df["Name"].items():
        # Missing names are skipped here and produce no error from this check.
        if pd.isna(name):
            continue

        for part in _split_csv_list(str(name)):
            code = part[:2]
            if len(part) < 3:
                problem = (
                    "is too short (must be language code + space + text). "
                    "Hint: Use %2C instead of comma within text."
                )
            elif not (code.isascii() and code.isalpha() and code.isupper()):
                # isascii() is required: isupper()/isalpha() alone also accept
                # non-ASCII uppercase letters (e.g. "ÄÖ"), which are not valid
                # language codes.
                problem = (
                    "must start with a language code (two uppercase letters). "
                    "Hint: If this text contains a comma, use %2C instead."
                )
            elif part[2] != " ":
                problem = "must have a space after the language code"
            else:
                continue

            # Only one problem is reported per part: the first check that fails.
            errors.append(
                f"{filename} (line {_csv_line(int(idx))}): Name part '{part}' {problem}"
            )

    return errors
202+
203+
168204
def _check_uuids_and_global_uniqueness(
169205
df: pd.DataFrame, filename: Path, seen: dict[str, tuple[Path, int]]
170206
) -> list[str]:
@@ -221,6 +257,9 @@ def main() -> None:
221257

222258
errors: list[str] = []
223259
seen_uuids: dict[str, tuple[Path, int]] = {}
260+
total_files = 0
261+
files_with_errors = 0
262+
errors_by_file: dict[Path, list[str]] = {}
224263

225264
for country_dir in sorted([p for p in args.data_folder.iterdir() if p.is_dir()]):
226265
expected_country = country_dir.name.upper()
@@ -231,35 +270,66 @@ def main() -> None:
231270
continue
232271

233272
for holidays_file in sorted(holidays_dir.glob("*.csv")):
273+
total_files += 1
274+
file_errors: list[str] = []
275+
234276
try:
235277
df = _read_csv(holidays_file)
236278
except pd.errors.ParserError as error:
237-
errors.append(f"{holidays_file}: could not parse CSV - {error}")
279+
file_errors.append(f"{holidays_file}: could not parse CSV - {error}")
280+
errors.extend(file_errors)
281+
files_with_errors += 1
238282
continue
239283

240-
errors.extend(_check_required_columns(df, holidays_file))
284+
file_errors.extend(_check_required_columns(df, holidays_file))
241285
if REQUIRED_COLUMNS - set(df.columns):
242-
# Don’t cascade on missing columns.
286+
# Don't cascade on missing columns.
287+
errors.extend(file_errors)
288+
files_with_errors += 1
243289
continue
244290

245-
errors.extend(_check_required_values(df, holidays_file))
291+
file_errors.extend(_check_required_values(df, holidays_file))
246292

247-
errors.extend(_check_country_column(df, holidays_file, expected_country))
248-
errors.extend(_check_uuids_and_global_uniqueness(df, holidays_file, seen_uuids))
293+
file_errors.extend(_check_country_column(df, holidays_file, expected_country))
294+
file_errors.extend(_check_uuids_and_global_uniqueness(df, holidays_file, seen_uuids))
249295

250296
start_dates, end_dates, date_errors = _parse_dates(df, holidays_file)
251-
errors.extend(date_errors)
252-
errors.extend(_check_duration(start_dates, end_dates, holidays_file))
253-
errors.extend(_check_sorting(start_dates, holidays_file))
254-
errors.extend(_check_subdivisions(df, subdivisions, holidays_file))
297+
file_errors.extend(date_errors)
298+
file_errors.extend(_check_duration(start_dates, end_dates, holidays_file))
299+
file_errors.extend(_check_sorting(start_dates, holidays_file))
300+
file_errors.extend(_check_subdivisions(df, subdivisions, holidays_file))
301+
file_errors.extend(_check_name_format(df, holidays_file))
302+
303+
if file_errors:
304+
files_with_errors += 1
305+
errors.extend(file_errors)
306+
errors_by_file[holidays_file] = file_errors
255307

256308
if errors:
257-
print(f"Validation failed with {len(errors)} error(s):\n", file=sys.stderr)
258-
for message in errors:
259-
print(f"- {message}", file=sys.stderr)
309+
print(f"\n{'=' * 70}", file=sys.stderr)
310+
print(f"VALIDATION FAILED", file=sys.stderr)
311+
print(f"{'=' * 70}\n", file=sys.stderr)
312+
313+
# Group errors by file for better readability
314+
for file_path, file_errors in errors_by_file.items():
315+
print(f"\n{file_path}: {len(file_errors)} error(s)", file=sys.stderr)
316+
for message in file_errors[:10]: # Show first 10 errors per file
317+
# Remove redundant file path from message
318+
clean_message = message.replace(f"{file_path} ", "")
319+
print(f" • {clean_message}", file=sys.stderr)
320+
if len(file_errors) > 10:
321+
print(f" ... and {len(file_errors) - 10} more errors", file=sys.stderr)
322+
323+
print(f"\n{'=' * 70}", file=sys.stderr)
324+
print(f"Summary: {files_with_errors}/{total_files} files with errors ({len(errors)} total errors)", file=sys.stderr)
325+
print(f"{'=' * 70}", file=sys.stderr)
260326
sys.exit(1)
261327

262-
print("✓ All validations passed")
328+
print(f"{'=' * 70}")
329+
print(f"✓ All validations passed!")
330+
print(f"{'=' * 70}")
331+
print(f"Checked {total_files} files successfully.")
332+
print(f"{'=' * 70}")
263333

264334

265335
if __name__ == "__main__":

0 commit comments

Comments
 (0)