From ff62155d13d3d874650030b946f2776c20181e95 Mon Sep 17 00:00:00 2001 From: Adam Israel Date: Mon, 8 Dec 2025 10:59:52 -0500 Subject: [PATCH 1/2] Implement comprehensive error message improvements with validation Add improved error handling system to complete Version 0.2.0 goals. This commit implements enhanced error types, better error messages, automatic validation, and non-fatal warning collection to provide users with clear, actionable feedback about GEDCOM file issues. Changes: - Enhanced GedcomError enum with 3 new variants: * ValidationError - Data quality issues in records * EncodingError - Character encoding conversion problems * MissingRequiredField - GEDCOM 5.5.1 required field violations - Improved ParseError with context fields (record_type, field, context) - Enhanced InvalidStructure with record_xref identification - Better error display messages with: * Multi-line formatting with hints * Clear location indicators (line, record, field) * Context snippets for parse errors * User-friendly descriptions - Validation system: * Added warnings Vec to Gedcom struct * Automatic validation after parsing * Non-fatal warnings allow parsing to continue * Validates: individuals, families, submitters, repositories, multimedia - Encoding error tracking: * Captures character encoding conversion errors * Adds warnings when encoding issues occur * Maintains compatibility with verbose mode - Helper methods: * Gedcom::has_warnings() - Check for validation issues - Comprehensive tests: * 13 new error-specific unit tests * 2 new integration tests for validation warnings * All 291 tests passing Non-breaking changes - warnings are collected but don't prevent parsing. Completes all Version 0.2.0 goals from ROADMAP.md. --- docs/ROADMAP.md | 2 +- src/error.rs | 227 ++++++++++++++++++++++++++++++++++--- src/parse.rs | 107 +++++++++++++++-- src/types/mod.rs | 27 +++++ tests/integration_tests.rs | 80 +++++++++++++ 5 files changed, 418 insertions(+), 25 deletions(-) diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md index d72e32b..77208c7 100644 --- a/docs/ROADMAP.md +++ b/docs/ROADMAP.md @@ -290,7 +290,7 @@ Dates are a freeform field, with some common conventions. We need to implement p - [x] Complete NOTE record parsing (all GEDCOM 5.5.1 fields) - [x] Complete OBJE record parsing (all GEDCOM 5.5.1 fields) - [x] Complete REPO record parsing (all GEDCOM 5.5.1 fields) -- [ ] Improved error messages +- [x] Improved error messages (enhanced error types, validation warnings, better formatting) - [x] Performance optimizations for large files (minimal overhead for new record types) ### Version 0.3.0 Goals diff --git a/src/error.rs b/src/error.rs index c0b09c5..6af5452 100644 --- a/src/error.rs +++ b/src/error.rs @@ -13,29 +13,136 @@ pub enum GedcomError { /// Error parsing a specific line or record ParseError { line_number: Option, + record_type: Option, + field: Option, message: String, + context: Option, }, /// Invalid GEDCOM structure (e.g., missing required fields) - InvalidStructure(String), + InvalidStructure { + record_xref: Option, + message: String, + }, + + /// Validation error for a specific field + ValidationError { + record_type: String, + record_xref: Option, + field: String, + message: String, + }, + + /// Character encoding issues during file reading + EncodingError { + declared_encoding: String, + detected_encoding: Option, + message: String, + had_errors: bool, + }, + + /// Required field is missing from a record + MissingRequiredField { + record_type: String, + record_xref: Option, + field: String, + }, } impl fmt::Display for GedcomError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { GedcomError::Io(err) => write!(f, "I/O error: {}", err), - GedcomError::FileNotFound(path) => write!(f, "File not found: {}", path), + GedcomError::FileNotFound(path) => { + writeln!(f, "File not found: '{}'", path)?; + write!( + f, + " Hint: Check that the file path is correct and the file exists" + ) + } GedcomError::ParseError { line_number, + record_type, + field, message, + context, } => { + write!(f, "Parse error")?; if let Some(line) = line_number { - write!(f, "Parse error at line {}: {}", line, message) - } else { - write!(f, "Parse error: {}", message) + write!(f, " at line {}", line)?; + } + if let Some(rec_type) = record_type { + write!(f, " in {} record", rec_type)?; } + if let Some(fld) = field { + write!(f, ", field '{}'", fld)?; + } + write!(f, ": {}", message)?; + if let Some(ctx) = context { + write!(f, "\n Context: {}", ctx)?; + } + Ok(()) + } + GedcomError::InvalidStructure { + record_xref, + message, + } => { + write!(f, "Invalid GEDCOM structure")?; + if let Some(xref) = record_xref { + write!(f, " in record {}", xref)?; + } + write!(f, ": {}", message) + } + GedcomError::ValidationError { + record_type, + record_xref, + field, + message, + } => { + write!(f, "Validation error in {} record", record_type)?; + if let Some(xref) = record_xref { + write!(f, " ({})", xref)?; + } + write!(f, ", field '{}': {}", field, message) + } + GedcomError::EncodingError { + declared_encoding, + detected_encoding, + message, + had_errors, + } => { + write!( + f, + "Character encoding error: declared as '{}', ", + declared_encoding + )?; + if let Some(detected) = detected_encoding { + write!(f, "detected as '{}', ", detected)?; + } + write!(f, "{}", message)?; + if *had_errors { + write!( + f, + "\n Warning: Some characters could not be converted correctly" + )?; + } + Ok(()) + } + GedcomError::MissingRequiredField { + record_type, + record_xref, + field, + } => { + write!( + f, + "Missing required field '{}' in {} record", + field, record_type + )?; + if let Some(xref) = record_xref { + write!(f, " ({})", xref)?; + } + Ok(()) } - GedcomError::InvalidStructure(msg) => write!(f, "Invalid GEDCOM structure: {}", msg), } } } @@ -73,34 +180,118 @@ mod tests { #[test] fn test_file_not_found_display() { let err = GedcomError::FileNotFound("test.ged".to_string()); - assert_eq!(format!("{}", err), "File not found: test.ged"); + let msg = format!("{}", err); + assert!(msg.contains("File not found")); + assert!(msg.contains("test.ged")); } #[test] fn test_parse_error_with_line_number() { let err = GedcomError::ParseError { line_number: Some(42), + record_type: None, + field: None, message: "Invalid syntax".to_string(), + context: None, }; - assert_eq!(format!("{}", err), "Parse error at line 42: Invalid syntax"); + assert!(format!("{}", err).contains("line 42")); + assert!(format!("{}", err).contains("Invalid syntax")); } #[test] fn test_parse_error_without_line_number() { let err = GedcomError::ParseError { line_number: None, + record_type: None, + field: None, message: "Invalid syntax".to_string(), + context: None, }; - assert_eq!(format!("{}", err), "Parse error: Invalid syntax"); + assert!(format!("{}", err).contains("Parse error")); + assert!(format!("{}", err).contains("Invalid syntax")); + } + + #[test] + fn test_parse_error_with_full_context() { + let err = GedcomError::ParseError { + line_number: Some(42), + record_type: Some("INDI".to_string()), + field: Some("NAME".to_string()), + message: "Invalid name format".to_string(), + context: Some("1 NAME /Invalid/Name/".to_string()), + }; + let msg = format!("{}", err); + assert!(msg.contains("line 42")); + assert!(msg.contains("INDI")); + assert!(msg.contains("NAME")); + assert!(msg.contains("Invalid name format")); + assert!(msg.contains("Context")); } #[test] fn test_invalid_structure_display() { - let err = GedcomError::InvalidStructure("Missing required field".to_string()); - assert_eq!( - format!("{}", err), - "Invalid GEDCOM structure: Missing required field" - ); + let err = GedcomError::InvalidStructure { + record_xref: Some("@I1@".to_string()), + message: "Missing required field".to_string(), + }; + let msg = format!("{}", err); + assert!(msg.contains("Invalid GEDCOM structure")); + assert!(msg.contains("@I1@")); + assert!(msg.contains("Missing required field")); + } + + #[test] + fn test_validation_error_display() { + let err = GedcomError::ValidationError { + record_type: "INDI".to_string(), + record_xref: Some("@I1@".to_string()), + field: "NAME".to_string(), + message: "Name cannot be empty".to_string(), + }; + let msg = format!("{}", err); + assert!(msg.contains("Validation error")); + assert!(msg.contains("INDI")); + assert!(msg.contains("@I1@")); + assert!(msg.contains("NAME")); + assert!(msg.contains("Name cannot be empty")); + } + + #[test] + fn test_encoding_error_display() { + let err = GedcomError::EncodingError { + declared_encoding: "ANSEL".to_string(), + detected_encoding: Some("UTF-8".to_string()), + message: "Using UTF-8 fallback".to_string(), + had_errors: true, + }; + let msg = format!("{}", err); + assert!(msg.contains("encoding error")); + assert!(msg.contains("ANSEL")); + assert!(msg.contains("UTF-8")); + assert!(msg.contains("Warning")); + } + + #[test] + fn test_missing_required_field_display() { + let err = GedcomError::MissingRequiredField { + record_type: "INDI".to_string(), + record_xref: Some("@I1@".to_string()), + field: "NAME".to_string(), + }; + let msg = format!("{}", err); + assert!(msg.contains("Missing required field")); + assert!(msg.contains("NAME")); + assert!(msg.contains("INDI")); + assert!(msg.contains("@I1@")); + } + + #[test] + fn test_file_not_found_includes_hint() { + let err = GedcomError::FileNotFound("missing.ged".to_string()); + let msg = format!("{}", err); + assert!(msg.contains("File not found")); + assert!(msg.contains("missing.ged")); + assert!(msg.contains("Hint")); } #[test] @@ -117,11 +308,17 @@ mod tests { let err = GedcomError::ParseError { line_number: None, + record_type: None, + field: None, message: "test".to_string(), + context: None, }; assert!(err.source().is_none()); - let err = GedcomError::InvalidStructure("test".to_string()); + let err = GedcomError::InvalidStructure { + record_xref: None, + message: "test".to_string(), + }; assert!(err.source().is_none()); } diff --git a/src/parse.rs b/src/parse.rs index 51a9405..6a7fe87 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -126,7 +126,10 @@ fn detect_gedcom_encoding(bytes: &[u8]) -> (&'static Encoding, String) { } /// Read file as bytes and convert to UTF-8 String based on declared encoding -fn read_file_with_encoding(filename: &str, config: &GedcomConfig) -> Result { +fn read_file_with_encoding( + filename: &str, + config: &GedcomConfig, +) -> Result<(String, Option)> { let mut file = File::open(filename)?; let mut bytes = Vec::new(); @@ -159,14 +162,27 @@ fn read_file_with_encoding(filename: &str, config: &GedcomConfig) -> Result Result { } // Read the entire file with proper encoding handling - let content = read_file_with_encoding(filename, config)?; + let (content, encoding_warning) = read_file_with_encoding(filename, config)?; // Initialize an empty gedcom with pre-allocated capacity let mut gedcom = Gedcom { @@ -241,8 +257,14 @@ pub fn parse_gedcom(filename: &str, config: &GedcomConfig) -> Result { notes: Vec::new(), multimedia: Vec::new(), submitters: Vec::new(), + warnings: Vec::new(), }; + // Add encoding warning if present + if let Some(warning) = encoding_warning { + gedcom.warnings.push(warning); + } + // Capacity management constants const INITIAL_RECORD_CAPACITY: usize = 2048; // Typical record ~1-2KB @@ -358,9 +380,76 @@ pub fn parse_gedcom(filename: &str, config: &GedcomConfig) -> Result { // TODO: sources // TODO: multimedia + // Validate records and collect warnings + validate_gedcom(&mut gedcom); + Ok(gedcom) } +/// Validate GEDCOM records and add warnings for missing required fields +fn validate_gedcom(gedcom: &mut Gedcom) { + // Validate individuals - NAME is recommended but not strictly required in GEDCOM 5.5.1 + // We'll warn about individuals without names as it's a common data quality issue + for individual in &gedcom.individuals { + if individual.names.is_empty() { + gedcom.warnings.push(GedcomError::ValidationError { + record_type: "INDI".to_string(), + record_xref: individual.xref.as_ref().map(|x| x.to_string()), + field: "NAME".to_string(), + message: "Individual has no name - this may indicate incomplete data".to_string(), + }); + } + } + + // Validate families - at least one spouse (HUSB or WIFE) is recommended + for family in &gedcom.families { + if family.husband.is_none() && family.wife.is_none() && family.children.is_empty() { + gedcom.warnings.push(GedcomError::ValidationError { + record_type: "FAM".to_string(), + record_xref: Some(family.xref.to_string()), + field: "HUSB/WIFE/CHIL".to_string(), + message: + "Family has no husband, wife, or children - this may indicate incomplete data" + .to_string(), + }); + } + } + + // Validate submitters - NAME is required in GEDCOM 5.5.1 + for submitter in &gedcom.submitters { + if submitter.name.is_none() { + gedcom.warnings.push(GedcomError::MissingRequiredField { + record_type: "SUBM".to_string(), + record_xref: Some(submitter.xref.to_string()), + field: "NAME".to_string(), + }); + } + } + + // Validate repositories - NAME is recommended + for repository in &gedcom.repositories { + if repository.name.is_none() { + gedcom.warnings.push(GedcomError::ValidationError { + record_type: "REPO".to_string(), + record_xref: repository.xref.as_ref().map(|x| x.to_string()), + field: "NAME".to_string(), + message: "Repository has no name - this may indicate incomplete data".to_string(), + }); + } + } + + // Validate multimedia records - at least one FILE is required + for multimedia in &gedcom.multimedia { + if multimedia.files.is_empty() { + gedcom.warnings.push(GedcomError::MissingRequiredField { + record_type: "OBJE".to_string(), + record_xref: multimedia.xref.as_ref().map(|x| x.to_string()), + field: "FILE".to_string(), + }); + } + } +} + #[allow(clippy::unwrap_used)] #[cfg(test)] mod tests { diff --git a/src/types/mod.rs b/src/types/mod.rs index 1262fd0..c141df6 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -158,9 +158,36 @@ pub struct Gedcom { pub notes: Vec, pub multimedia: Vec, pub submitters: Vec, + + /// Validation warnings encountered during parsing + /// These are non-fatal issues that don't prevent parsing but indicate + /// potential problems with the GEDCOM file + pub warnings: Vec, } impl Gedcom { + // ===== Validation Functions ===== + + /// Returns whether the GEDCOM file has any validation warnings + /// + /// # Examples + /// + /// ```no_run + /// use gedcom_rs::parse::{parse_gedcom, GedcomConfig}; + /// + /// let gedcom = parse_gedcom("file.ged", &GedcomConfig::new())?; + /// if gedcom.has_warnings() { + /// println!("File has {} validation warnings", gedcom.warnings.len()); + /// for warning in &gedcom.warnings { + /// eprintln!("Warning: {}", warning); + /// } + /// } + /// # Ok::<(), Box>(()) + /// ``` + pub fn has_warnings(&self) -> bool { + !self.warnings.is_empty() + } + // ===== Search Functions ===== /// Find an individual by their cross-reference ID (xref) diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index 647f13b..cedf7b8 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -411,3 +411,83 @@ fn test_parse_gedcom_with_all_record_types() { assert_eq!(gedcom.notes.len(), 1); assert_eq!(gedcom.multimedia.len(), 1); } + +#[test] +fn test_parse_gedcom_with_validation_warnings() { + use std::fs; + let temp_file = "test_validation_warnings.ged"; + let content = "0 HEAD\n1 CHAR UTF-8\n0 @I1@ INDI\n0 @F1@ FAM\n0 @U1@ SUBM\n0 TRLR\n"; + + fs::write(temp_file, content).unwrap(); + + let config = GedcomConfig::new(); + let result = parse_gedcom(temp_file, &config); + + // Cleanup + let _ = fs::remove_file(temp_file); + + assert!(result.is_ok()); + let gedcom = result.unwrap(); + + // Should have validation warnings + assert!(!gedcom.warnings.is_empty(), "Expected validation warnings"); + + // Check for individual without name warning + let has_indi_warning = gedcom.warnings.iter().any(|w| { + matches!(w, gedcom_rs::error::GedcomError::ValidationError { + record_type, field, .. + } if record_type == "INDI" && field == "NAME") + }); + assert!( + has_indi_warning, + "Expected warning for individual without name" + ); + + // Check for family without members warning + let has_fam_warning = gedcom.warnings.iter().any(|w| { + matches!(w, gedcom_rs::error::GedcomError::ValidationError { + record_type, field, .. + } if record_type == "FAM" && field == "HUSB/WIFE/CHIL") + }); + assert!(has_fam_warning, "Expected warning for empty family"); + + // Check for submitter without name error + let has_subm_error = gedcom.warnings.iter().any(|w| { + matches!(w, gedcom_rs::error::GedcomError::MissingRequiredField { + record_type, field, .. + } if record_type == "SUBM" && field == "NAME") + }); + assert!( + has_subm_error, + "Expected error for submitter without required name" + ); +} + +#[test] +fn test_parse_gedcom_warnings_do_not_prevent_parsing() { + use std::fs; + let temp_file = "test_warnings_nonfatal.ged"; + // File with warnings but still parseable + let content = + "0 HEAD\n1 CHAR UTF-8\n0 @I1@ INDI\n1 SEX M\n0 @I2@ INDI\n1 NAME Test /Person/\n0 TRLR\n"; + + fs::write(temp_file, content).unwrap(); + + let config = GedcomConfig::new(); + let result = parse_gedcom(temp_file, &config); + + // Cleanup + let _ = fs::remove_file(temp_file); + + assert!(result.is_ok(), "Parsing should succeed despite warnings"); + let gedcom = result.unwrap(); + + // Should have 2 individuals + assert_eq!(gedcom.individuals.len(), 2); + + // But should have warning for one without name + let name_warnings = gedcom.warnings.iter().filter(|w| { + matches!(w, gedcom_rs::error::GedcomError::ValidationError { field, .. } if field == "NAME") + }).count(); + assert_eq!(name_warnings, 1, "Expected exactly one NAME warning"); +} From b3784061478650791111b9c537c9291b4f10e61f Mon Sep 17 00:00:00 2001 From: Adam Israel Date: Mon, 8 Dec 2025 14:22:21 -0500 Subject: [PATCH 2/2] Add PR #16 review comments to ROADMAP as known issues Document the 5 validation test coverage issues identified by Copilot in PR #16: - Individual validation lacking unit test coverage - Family validation lacking unit test coverage - Submitter validation lacking unit test coverage - Repository validation lacking unit test coverage - Multimedia validation lacking unit test coverage These validation features work correctly and are covered by integration tests, but lack isolated unit tests for better maintainability. Adding these to the Known Issues section for future improvement. --- docs/ROADMAP.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md index 77208c7..497938c 100644 --- a/docs/ROADMAP.md +++ b/docs/ROADMAP.md @@ -279,6 +279,18 @@ Dates are a freeform field, with some common conventions. We need to implement p - Basic support exists - Complete implementation needed +## Known Issues / Technical Debt + +### Error Handling and Validation (PR #16) +The following validation logic branches need dedicated unit test coverage: +- [ ] Individual validation (empty names check) - Currently only covered by integration tests +- [ ] Family validation (empty family check) - Missing unit test for all three fields empty +- [ ] Submitter validation (missing NAME field) - No dedicated unit test for validation branch +- [ ] Repository validation (missing NAME field) - No test verifying ValidationError generation +- [ ] Multimedia validation (missing FILE field) - No test verifying MissingRequiredField error + +Note: These validation features work correctly and are tested via integration tests, but lack isolated unit tests for better maintainability. + ## Future Enhancements ### Version 0.2.0 Goals