diff --git a/scripts/us_fed/treasury_constant_maturity_rates/manifest.json b/scripts/us_fed/treasury_constant_maturity_rates/manifest.json index 185e77fec9..009009b94c 100644 --- a/scripts/us_fed/treasury_constant_maturity_rates/manifest.json +++ b/scripts/us_fed/treasury_constant_maturity_rates/manifest.json @@ -20,10 +20,7 @@ "source_files": [ "treasury_constant_maturity_rates.csv" ], - "cron_schedule": "15 3 * * *", - "config_override": { - "ignore_validation_status": false - } + "cron_schedule": "15 3 * * *" }, { "import_name": "USFed_ConstantMaturityRates", @@ -45,10 +42,7 @@ "source_files": [ "treasury_constant_maturity_rates.csv" ], - "cron_schedule": "15 3 * * *", - "config_override": { - "ignore_validation_status": false - } + "cron_schedule": "15 3 * * *" } ] } \ No newline at end of file diff --git a/scripts/us_fed/treasury_constant_maturity_rates/validation_config.json b/scripts/us_fed/treasury_constant_maturity_rates/validation_config.json deleted file mode 100644 index b92d23c5bd..0000000000 --- a/scripts/us_fed/treasury_constant_maturity_rates/validation_config.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "schema_version": "1.0", - "rules": [ - { - "rule_id": "check_deleted_count", - "description": "Checks that the number of deleted points is within the threshold.", - "validator": "DELETED_COUNT", - "scope": { - "data_source": "differ" - }, - "params": { - "threshold": 0 - } - }, - { - "rule_id": "check_missing_refs_count", - "validator": "MISSING_REFS_COUNT", - "scope": { - "data_source": "lint" - }, - "params": { - "threshold": 0 - } - }, - { - "rule_id": "check_lint_error_count", - "validator": "LINT_ERROR_COUNT", - "scope": { - "data_source": "lint" - }, - "params": { - "threshold": 0 - } - } - ] -} \ No newline at end of file diff --git a/statvar_imports/bis/bis_central_bank_policy_rate/validation_config.json b/statvar_imports/bis/bis_central_bank_policy_rate/validation_config.json index fffd9e0149..0854c7a7ab 100644 --- a/statvar_imports/bis/bis_central_bank_policy_rate/validation_config.json +++ b/statvar_imports/bis/bis_central_bank_policy_rate/validation_config.json @@ -18,35 +18,11 @@ } }, { - "rule_id": "check_deleted_count", - "description": "Checks that the number of deleted points is within the threshold.", - "validator": "DELETED_COUNT", - "scope": { - "data_source": "differ" - }, + "rule_id": "check_deleted_records_count", + "validator": "DELETED_RECORDS_COUNT", "params": { "threshold": 100 } - }, - { - "rule_id": "check_missing_refs_count", - "validator": "MISSING_REFS_COUNT", - "scope": { - "data_source": "lint" - }, - "params": { - "threshold": 0 - } - }, - { - "rule_id": "check_lint_error_count", - "validator": "LINT_ERROR_COUNT", - "scope": { - "data_source": "lint" - }, - "params": { - "threshold": 0 - } } ] } diff --git a/statvar_imports/school_algebra1/validation_config.json b/statvar_imports/school_algebra1/validation_config.json index e2ce97abb1..4ad1218d71 100644 --- a/statvar_imports/school_algebra1/validation_config.json +++ b/statvar_imports/school_algebra1/validation_config.json @@ -1,36 +1,12 @@ { "schema_version": "1.0", "rules": [ - { - "rule_id": "check_deleted_count", - "description": "Checks that the number of deleted points is within the threshold.", - "validator": "DELETED_COUNT", - "scope": { - "data_source": "differ" - }, - "params": { - "threshold": 0 - } - }, { "rule_id": "check_missing_refs_count", "validator": "MISSING_REFS_COUNT", - "scope": { - "data_source": "lint" - }, "params": { "threshold": 457000 } - }, - { - "rule_id": "check_lint_error_count", - "validator": "LINT_ERROR_COUNT", - "scope": { - "data_source": "lint" - }, - "params": { - "threshold": 0 - } } ] -} \ No newline at end of file +} diff --git a/statvar_imports/us_bls/cpi_category/validation_config.json b/statvar_imports/us_bls/cpi_category/validation_config.json index 17d2f2f5aa..83d9d65b6c 100644 --- a/statvar_imports/us_bls/cpi_category/validation_config.json +++ b/statvar_imports/us_bls/cpi_category/validation_config.json @@ -2,35 +2,11 @@ "schema_version": "1.0", "rules": [ { - "rule_id": "check_deleted_count", - "description": "Checks that the number of deleted points is within the threshold.", - "validator": "DELETED_COUNT", - "scope": { - "data_source": "differ" - }, + "rule_id": "check_deleted_records_count", + "validator": "DELETED_RECORDS_COUNT", "params": { "threshold": 1230 } - }, - { - "rule_id": "check_missing_refs_count", - "validator": "MISSING_REFS_COUNT", - "scope": { - "data_source": "lint" - }, - "params": { - "threshold": 0 - } - }, - { - "rule_id": "check_lint_error_count", - "validator": "LINT_ERROR_COUNT", - "scope": { - "data_source": "lint" - }, - "params": { - "threshold": 0 - } } ] -} \ No newline at end of file +} diff --git a/tools/import_validation/README.md b/tools/import_validation/README.md index 54406457df..5496c565f7 100644 --- a/tools/import_validation/README.md +++ b/tools/import_validation/README.md @@ -60,7 +60,6 @@ Each object in the `rules` list defines a single validation check with the follo The `scope` object specifies which data to run the validation on. It has two main parts: -- `data_source`: The key for the data source to use (`"stats"` or `"differ"`). - `variables`: An optional filter to select a subset of StatVars from the data source. The `variables` object can contain any of the following keys: @@ -82,17 +81,11 @@ Here is an example of a complete configuration file: { "rule_id": "check_latest_date_for_all", "validator": "MAX_DATE_LATEST", - "scope": { - "data_source": "stats" - }, "params": {} }, { "rule_id": "check_deleted_points_threshold", - "validator": "DELETED_COUNT", - "scope": { - "data_source": "differ" - }, + "validator": "DELETED_RECORDS_COUNT", "params": { "threshold": 10 } @@ -101,7 +94,6 @@ Here is an example of a complete configuration file: "rule_id": "check_percent_max_value", "validator": "MAX_VALUE_CHECK", "scope": { - "data_source": "stats", "variables": { "contains_all": ["Percent"] } @@ -150,9 +142,9 @@ The following validations are currently supported: | `MAX_DATE_CONSISTENT` | Checks that the latest date is the same for all StatVars. | `stats` | None | | `MISSING_REFS_COUNT` | Checks that the total number of missing references is within a threshold. | `lint` | `threshold` (integer, defaults to 0) | | `LINT_ERROR_COUNT` | Checks that the total number of lint errors is within a threshold. | `lint` | `threshold` (integer, defaults to 0) | -| `DELETED_COUNT` | Checks that the total number of deleted points is within a threshold. | `differ` | `threshold` (integer, defaults to 0) | -| `MODIFIED_COUNT` | Checks that the number of modified points is the same for all StatVars. | `differ` | None | -| `ADDED_COUNT` | Checks that the number of added points is the same for all StatVars. | `differ` | None | +| `DELETED_RECORDS_COUNT` | Checks that the total number of deleted points is within a threshold. | `differ` | `threshold` (integer, defaults to 0) | +| `MODIFIED_RECORDS_COUNT` | Checks that the number of modified points is the same for all StatVars. | `differ` | None | +| `ADDED_RECORDS_COUNT` | Checks that the number of added points is the same for all StatVars. | `differ` | None | | `NUM_PLACES_CONSISTENT` | Checks that the number of places is the same for all StatVars. | `stats` | None | | `NUM_PLACES_COUNT` | Checks that the number of places is within a defined range. | `stats` | `minimum`, `maximum`, or `value` (integer) | | `NUM_OBSERVATIONS_CHECK` | Checks that the number of observations is within a defined range. | `stats` | `minimum`, `maximum`, or `value` (integer) | diff --git a/tools/import_validation/import_validation_test.py b/tools/import_validation/import_validation_test.py index 5cc028848c..7d45827648 100644 --- a/tools/import_validation/import_validation_test.py +++ b/tools/import_validation/import_validation_test.py @@ -58,9 +58,7 @@ def test_successful_run(self): "rules": [{ "rule_id": "num_places_consistent", "validator": "NUM_PLACES_CONSISTENT", - "scope": { - "data_source": "stats" - }, + "scope": {}, "params": {} }] }, f) @@ -97,9 +95,7 @@ def test_failed_run(self): "rules": [{ "rule_id": "num_places_consistent", "validator": "NUM_PLACES_CONSISTENT", - "scope": { - "data_source": "stats" - }, + "scope": {}, "params": {} }] }, f) @@ -136,9 +132,7 @@ def test_missing_required_file_fails(self): "rules": [{ "rule_id": "num_places_consistent", "validator": "NUM_PLACES_CONSISTENT", - "scope": { - "data_source": "stats" - } + "scope": {} }] }, f) @@ -169,7 +163,6 @@ def test_variables_filtering(self): "rule_id": "num_places_consistent_filtered", "validator": "NUM_PLACES_CONSISTENT", "scope": { - "data_source": "stats", "variables": { "dcids": ["sv1", "sv2"] } @@ -254,11 +247,9 @@ def test_empty_differ_file_runs_validation(self): json.dump( { "rules": [{ - "rule_id": "check_deleted_count", - "validator": "DELETED_COUNT", - "scope": { - "data_source": "differ" - }, + "rule_id": "check_deleted_records_count", + "validator": "DELETED_RECORDS_COUNT", + "scope": {}, "params": { "threshold": 0 } # Fail if deleted count is > 0 @@ -299,11 +290,9 @@ def test_missing_differ_file_does_not_throw_exception(self): json.dump( { "rules": [{ - "rule_id": "check_deleted_count", - "validator": "DELETED_COUNT", - "scope": { - "data_source": "differ" - }, + "rule_id": "check_deleted_records_count", + "validator": "DELETED_RECORDS_COUNT", + "scope": {}, "params": { "threshold": 10 } diff --git a/tools/import_validation/runner.py b/tools/import_validation/runner.py index a3bbbde132..9857085064 100644 --- a/tools/import_validation/runner.py +++ b/tools/import_validation/runner.py @@ -57,14 +57,16 @@ def __init__(self, validation_config_path: str, differ_output: str, (self.validator.validate_max_date_latest, 'stats'), 'MAX_DATE_CONSISTENT': (self.validator.validate_max_date_consistent, 'stats'), - 'DELETED_COUNT': (self.validator.validate_deleted_count, 'differ'), + 'DELETED_RECORDS_COUNT': + (self.validator.validate_deleted_records_count, 'differ'), 'MISSING_REFS_COUNT': (self.validator.validate_missing_refs_count, 'lint'), 'LINT_ERROR_COUNT': (self.validator.validate_lint_error_count, 'lint'), - 'MODIFIED_COUNT': - (self.validator.validate_modified_count, 'differ'), - 'ADDED_COUNT': (self.validator.validate_added_count, 'differ'), + 'MODIFIED_RECORDS_COUNT': + (self.validator.validate_modified_records_count, 'differ'), + 'ADDED_RECORDS_COUNT': + (self.validator.validate_added_records_count, 'differ'), 'NUM_PLACES_CONSISTENT': (self.validator.validate_num_places_consistent, 'stats'), 'NUM_PLACES_COUNT': @@ -141,8 +143,10 @@ def _determine_required_sources(self) -> set[str]: for rule in self.config.rules: if not rule.get('enabled', True): continue - if 'scope' in rule and 'data_source' in rule['scope']: - req_sources.add(rule['scope']['data_source']) + + validator_name = rule.get('validator') + if validator_name in self.validation_dispatch: + req_sources.add(self.validation_dispatch[validator_name][1]) return req_sources def run_validations(self) -> tuple[bool, list[ValidationResult]]: diff --git a/tools/import_validation/runner_test.py b/tools/import_validation/runner_test.py index 207c3d5081..28f805498d 100644 --- a/tools/import_validation/runner_test.py +++ b/tools/import_validation/runner_test.py @@ -53,9 +53,7 @@ def test_runner_calls_correct_validator_function(self, MockValidator): 'rules': [{ 'rule_id': 'test_max_date', 'validator': 'MAX_DATE_LATEST', - 'scope': { - 'data_source': 'stats' - }, + 'scope': {}, 'params': {} }] }, f) @@ -76,7 +74,8 @@ def test_runner_calls_correct_validator_function(self, MockValidator): # 4. Assert that the correct method was called on the mock mock_validator_instance.validate_max_date_latest.assert_called_once() # Ensure other methods were NOT called - mock_validator_instance.validate_deleted_count.assert_not_called() + mock_validator_instance.validate_deleted_records_count.assert_not_called( + ) @patch('tools.import_validation.runner.filter_dataframe') @patch('tools.import_validation.runner.Validator') @@ -96,7 +95,6 @@ def test_runner_applies_filters_correctly(self, MockValidator, 'rule_id': 'test_places_consistent', 'validator': 'NUM_PLACES_CONSISTENT', 'scope': { - 'data_source': 'stats', 'variables': { 'dcids': ['Count_Person_Male'] } @@ -141,9 +139,7 @@ def test_runner_handles_failed_validation(self, MockValidator): 'rules': [{ 'rule_id': 'test_max_date', 'validator': 'MAX_DATE_LATEST', - 'scope': { - 'data_source': 'stats' - }, + 'scope': {}, 'params': {} }] }, f) @@ -169,26 +165,24 @@ def test_runner_writes_correct_output(self, MockValidator): mock_validator_instance = MockValidator.return_value expected_result = ValidationResult( ValidationStatus.FAILED, - 'DELETED_COUNT', + 'DELETED_RECORDS_COUNT', message='Too many deletions, found 100', details={ - 'deleted_count': 100, + 'deleted_records_count': 100, 'rows_processed': 1, 'rows_succeeded': 0, 'rows_failed': 1 }) - mock_validator_instance.validate_deleted_count.return_value = expected_result + mock_validator_instance.validate_deleted_records_count.return_value = expected_result # 2. Create test files with open(self.config_path, 'w') as f: json.dump( { 'rules': [{ - 'rule_id': 'check_deleted_count', - 'validator': 'DELETED_COUNT', - 'scope': { - 'data_source': 'differ' - }, + 'rule_id': 'check_deleted_records_count', + 'validator': 'DELETED_RECORDS_COUNT', + 'scope': {}, 'params': { 'threshold': 10 } @@ -208,12 +202,12 @@ def test_runner_writes_correct_output(self, MockValidator): output_df = pd.read_csv(self.output_path) self.assertEqual(len(output_df), 1) self.assertEqual(output_df.iloc[0]['ValidationName'], - 'check_deleted_count') + 'check_deleted_records_count') self.assertEqual(output_df.iloc[0]['Status'], 'FAILED') self.assertEqual(output_df.iloc[0]['Message'], 'Too many deletions, found 100') details = json.loads(output_df.iloc[0]['Details']) - self.assertEqual(details['deleted_count'], 100) + self.assertEqual(details['deleted_records_count'], 100) self.assertEqual(details['rows_processed'], 1) self.assertEqual(details['rows_succeeded'], 0) self.assertEqual(details['rows_failed'], 1) @@ -232,9 +226,7 @@ def test_runner_uses_custom_name(self, MockValidator): 'rules': [{ 'rule_id': 'My_Custom_Test_Name', 'validator': 'MAX_DATE_LATEST', - 'scope': { - 'data_source': 'stats' - }, + 'scope': {}, 'params': {} }] }, f) @@ -265,9 +257,7 @@ def test_runner_handles_unknown_validation(self, MockValidator, 'rules': [{ 'rule_id': 'test_fake', 'validator': 'FAKE_VALIDATION', - 'scope': { - 'data_source': 'stats' - }, + 'scope': {}, 'params': {} }] }, f) @@ -292,9 +282,7 @@ def test_init_raises_error_if_required_file_is_missing(self): 'rules': [{ 'rule_id': 'test_rule', 'validator': 'MAX_DATE_LATEST', - 'scope': { - 'data_source': 'stats' - } + 'scope': {} }] }, f) diff --git a/tools/import_validation/sample_data/sample_config.json b/tools/import_validation/sample_data/sample_config.json index 06b85e36a8..45dec57b6f 100644 --- a/tools/import_validation/sample_data/sample_config.json +++ b/tools/import_validation/sample_data/sample_config.json @@ -5,27 +5,18 @@ "rule_id": "check_max_date_latest", "description": "Checks that the latest date in the stats summary is from the current year.", "validator": "MAX_DATE_LATEST", - "scope": { - "data_source": "stats" - }, "params": {} }, { "rule_id": "check_max_date_consistent", "description": "Checks if the MaxDate is the same for all StatVars.", "validator": "MAX_DATE_CONSISTENT", - "scope": { - "data_source": "stats" - }, "params": {} }, { - "rule_id": "check_deleted_count", + "rule_id": "check_deleted_records_count", "description": "Checks that the number of deleted points is within the threshold.", - "validator": "DELETED_COUNT", - "scope": { - "data_source": "differ" - }, + "validator": "DELETED_RECORDS_COUNT", "params": { "threshold": 5 } @@ -35,7 +26,6 @@ "description": "Checks if the number of places is the same for all StatVars.", "validator": "NUM_PLACES_CONSISTENT", "scope": { - "data_source": "stats", "variables": { "regex": [ "Count_Person_.*" @@ -48,9 +38,6 @@ "rule_id": "check_min_value", "description": "Checks that the minimum value for each StatVar is not below 0.", "validator": "MIN_VALUE_CHECK", - "scope": { - "data_source": "stats" - }, "params": { "minimum": 0 } @@ -60,7 +47,6 @@ "description": "Checks that the maximum value for percentage StatVars is not above 100.", "validator": "MAX_VALUE_CHECK", "scope": { - "data_source": "stats", "variables": { "regex": [ ".*Percent.*" @@ -84,9 +70,6 @@ "rule_id": "check_num_observations", "description": "Checks that there is at least one observation for each StatVar.", "validator": "NUM_OBSERVATIONS_CHECK", - "scope": { - "data_source": "stats" - }, "params": { "minimum": 1 } @@ -95,9 +78,6 @@ "rule_id": "check_unit_consistency", "description": "Check that all units are consistent", "validator": "UNIT_CONSISTENCY_CHECK", - "scope": { - "data_source": "stats" - }, "params": {} } ] diff --git a/tools/import_validation/sample_data/validation_output.csv b/tools/import_validation/sample_data/validation_output.csv index 7580f156bc..5d079467c8 100644 --- a/tools/import_validation/sample_data/validation_output.csv +++ b/tools/import_validation/sample_data/validation_output.csv @@ -1,7 +1,7 @@ ValidationName,Status,Message,Details,ValidationParams check_max_date_latest,FAILED,"Latest date found was 2023, expected 2025.","{""latest_date_found"": 2023, ""expected_latest_date"": 2025, ""rows_processed"": 4, ""rows_succeeded"": 0, ""rows_failed"": 4}", check_max_date_consistent,PASSED,,"{""rows_processed"": 4, ""rows_succeeded"": 4, ""rows_failed"": 0}", -check_deleted_count,PASSED,,"{""rows_processed"": 3, ""rows_succeeded"": 3, ""rows_failed"": 0}","{""threshold"": 5}" +check_deleted_records_count,PASSED,,"{""rows_processed"": 3, ""rows_succeeded"": 3, ""rows_failed"": 0}","{""threshold"": 5}" check_num_places_consistent,PASSED,,"{""rows_processed"": 2, ""rows_succeeded"": 2, ""rows_failed"": 0}", check_min_value,PASSED,,"{""rows_processed"": 4, ""rows_succeeded"": 4, ""rows_failed"": 0}","{""minimum"": 0}" check_max_value_for_percent,FAILED,1 out of 1 StatVars failed the maximum value check.,"{""failed_rows"": [{""stat_var"": ""UnemploymentRate_Person_Percent"", ""actual_max_value"": 105, ""maximum"": 100}], ""rows_processed"": 1, ""rows_succeeded"": 0, ""rows_failed"": 1}","{""maximum"": 100}" diff --git a/tools/import_validation/sample_data/validation_output.json b/tools/import_validation/sample_data/validation_output.json index 0b248b13f4..cd070dfce0 100644 --- a/tools/import_validation/sample_data/validation_output.json +++ b/tools/import_validation/sample_data/validation_output.json @@ -24,7 +24,7 @@ "validation_params": {} }, { - "validation_name": "check_deleted_count", + "validation_name": "check_deleted_records_count", "status": "PASSED", "message": "", "details": { diff --git a/tools/import_validation/validation_config.json b/tools/import_validation/validation_config.json index b92d23c5bd..97ad6ac736 100644 --- a/tools/import_validation/validation_config.json +++ b/tools/import_validation/validation_config.json @@ -2,12 +2,9 @@ "schema_version": "1.0", "rules": [ { - "rule_id": "check_deleted_count", + "rule_id": "check_deleted_records_count", "description": "Checks that the number of deleted points is within the threshold.", - "validator": "DELETED_COUNT", - "scope": { - "data_source": "differ" - }, + "validator": "DELETED_RECORDS_COUNT", "params": { "threshold": 0 } @@ -15,9 +12,6 @@ { "rule_id": "check_missing_refs_count", "validator": "MISSING_REFS_COUNT", - "scope": { - "data_source": "lint" - }, "params": { "threshold": 0 } @@ -25,12 +19,9 @@ { "rule_id": "check_lint_error_count", "validator": "LINT_ERROR_COUNT", - "scope": { - "data_source": "lint" - }, "params": { "threshold": 0 } } ] -} \ No newline at end of file +} diff --git a/tools/import_validation/validation_config_test.py b/tools/import_validation/validation_config_test.py index eb38ba035e..9beef55f96 100644 --- a/tools/import_validation/validation_config_test.py +++ b/tools/import_validation/validation_config_test.py @@ -36,8 +36,8 @@ def test_rules_are_deep_merged_by_rule_id(self): json.dump( { "rules": [{ - "rule_id": "check_deleted_count", - "validator": "DELETED_COUNT", + "rule_id": "check_deleted_records_count", + "validator": "DELETED_RECORDS_COUNT", "params": { "threshold": 0, "warn_only": True, @@ -55,8 +55,8 @@ def test_rules_are_deep_merged_by_rule_id(self): json.dump( { "rules": [{ - "rule_id": "check_deleted_count", - "validator": "DELETED_COUNT", + "rule_id": "check_deleted_records_count", + "validator": "DELETED_RECORDS_COUNT", "params": { "threshold": 5, "new_param": "keep-me", @@ -77,8 +77,8 @@ def test_rules_are_deep_merged_by_rule_id(self): { "rules": [ { - "rule_id": "check_deleted_count", - "validator": "DELETED_COUNT", + "rule_id": "check_deleted_records_count", + "validator": "DELETED_RECORDS_COUNT", "params": { "threshold": 5, "warn_only": true, @@ -119,7 +119,6 @@ def test_definitions_are_deep_merged(self): "definitions": { "scopes": { "foo": { - "data_source": "stats", "filters": { "dcids": ["a"] } @@ -137,9 +136,7 @@ def test_definitions_are_deep_merged(self): "contains_all": ["b"] } }, - "bar": { - "data_source": "differ" - } + "bar": {} } } }, f) @@ -150,14 +147,12 @@ def test_definitions_are_deep_merged(self): expected_scopes_json = """ { "foo": { - "data_source": "stats", "filters": { "dcids": ["a"], "contains_all": ["b"] } }, "bar": { - "data_source": "differ" } } """ diff --git a/tools/import_validation/validator.py b/tools/import_validation/validator.py index b8eec807ce..7abf48c730 100644 --- a/tools/import_validation/validator.py +++ b/tools/import_validation/validator.py @@ -134,8 +134,8 @@ def validate_max_date_latest(self, stats_df: pd.DataFrame, 'rows_failed': 0 }) - def validate_deleted_count(self, differ_df: pd.DataFrame, - params: dict) -> ValidationResult: + def validate_deleted_records_count(self, differ_df: pd.DataFrame, + params: dict) -> ValidationResult: """Checks if the total number of deleted points is within a threshold. Args: @@ -148,23 +148,23 @@ def validate_deleted_count(self, differ_df: pd.DataFrame, A ValidationResult object. """ if differ_df.empty: - deleted_count = 0 + deleted_records_count = 0 threshold = params.get('threshold', 0) - if deleted_count > threshold: + if deleted_records_count > threshold: return ValidationResult( ValidationStatus.FAILED, - 'DELETED_COUNT', + 'DELETED_RECORDS_COUNT', message= - f"Found {deleted_count} deleted points, which is over the threshold of {threshold}.", + f"Found {deleted_records_count} deleted points, which is over the threshold of {threshold}.", details={ - 'deleted_count': int(deleted_count), + 'deleted_records_count': int(deleted_records_count), 'threshold': threshold, 'rows_processed': 0, 'rows_succeeded': 0, 'rows_failed': 0 }) return ValidationResult(ValidationStatus.PASSED, - 'DELETED_COUNT', + 'DELETED_RECORDS_COUNT', details={ 'rows_processed': 0, 'rows_succeeded': 0, @@ -174,28 +174,28 @@ def validate_deleted_count(self, differ_df: pd.DataFrame, if 'DELETED' not in differ_df.columns: return ValidationResult( ValidationStatus.DATA_ERROR, - 'DELETED_COUNT', + 'DELETED_RECORDS_COUNT', message="Input data is missing required column: 'DELETED'.") rows_processed = len(differ_df) threshold = params.get('threshold', 0) - deleted_count = differ_df['DELETED'].sum() + deleted_records_count = differ_df['DELETED'].sum() - if deleted_count > threshold: + if deleted_records_count > threshold: return ValidationResult( ValidationStatus.FAILED, - 'DELETED_COUNT', + 'DELETED_RECORDS_COUNT', message= - f"Found {deleted_count} deleted points, which is over the threshold of {threshold}.", + f"Found {deleted_records_count} deleted points, which is over the threshold of {threshold}.", details={ - 'deleted_count': int(deleted_count), + 'deleted_records_count': int(deleted_records_count), 'threshold': threshold, 'rows_processed': rows_processed, 'rows_succeeded': 0, 'rows_failed': rows_processed }) return ValidationResult(ValidationStatus.PASSED, - 'DELETED_COUNT', + 'DELETED_RECORDS_COUNT', details={ 'rows_processed': rows_processed, 'rows_succeeded': rows_processed, @@ -219,8 +219,10 @@ def validate_missing_refs_count(self, report: dict, counters = report.get('levelSummary', {}).get('LEVEL_WARNING', {}).get('counters', {}) - missing_refs_count = int( - counters.get('Existence_MissingReference', '0')) + missing_refs_count = sum( + int(value) + for key, value in counters.items() + if key.startswith('Existence_MissingReference')) threshold = params.get('threshold', 0) if missing_refs_count > threshold: return ValidationResult( @@ -264,8 +266,8 @@ def validate_lint_error_count(self, report: dict, 'LINT_ERROR_COUNT', details={'lint_error_count': lint_error_count}) - def validate_modified_count(self, differ_df: pd.DataFrame, - params: dict) -> ValidationResult: + def validate_modified_records_count(self, differ_df: pd.DataFrame, + params: dict) -> ValidationResult: """Checks if the number of modified points is the same for all StatVars. Args: @@ -278,7 +280,7 @@ def validate_modified_count(self, differ_df: pd.DataFrame, """ if differ_df.empty: return ValidationResult(ValidationStatus.PASSED, - 'MODIFIED_COUNT', + 'MODIFIED_RECORDS_COUNT', details={ 'rows_processed': 0, 'rows_succeeded': 0, @@ -288,7 +290,7 @@ def validate_modified_count(self, differ_df: pd.DataFrame, if 'MODIFIED' not in differ_df.columns: return ValidationResult( ValidationStatus.DATA_ERROR, - 'MODIFIED_COUNT', + 'MODIFIED_RECORDS_COUNT', message="Input data is missing required column: 'MODIFIED'.") rows_processed = len(differ_df) @@ -297,26 +299,26 @@ def validate_modified_count(self, differ_df: pd.DataFrame, if unique_counts > 1: return ValidationResult( ValidationStatus.FAILED, - 'MODIFIED_COUNT', + 'MODIFIED_RECORDS_COUNT', message= "The number of modified data points is not consistent across all StatVars", details={ 'distinct_statvar_count': differ_df['StatVar'].nunique(), - 'distinct_modified_counts': unique_counts, + 'distinct_modified_records_count': unique_counts, 'rows_processed': rows_processed, 'rows_succeeded': 0, 'rows_failed': rows_processed }) return ValidationResult(ValidationStatus.PASSED, - 'MODIFIED_COUNT', + 'MODIFIED_RECORDS_COUNT', details={ 'rows_processed': rows_processed, 'rows_succeeded': rows_processed, 'rows_failed': 0 }) - def validate_added_count(self, differ_df: pd.DataFrame, - params: dict) -> ValidationResult: + def validate_added_records_count(self, differ_df: pd.DataFrame, + params: dict) -> ValidationResult: """Checks if the number of added points is the same for all StatVars. Args: @@ -329,7 +331,7 @@ def validate_added_count(self, differ_df: pd.DataFrame, """ if differ_df.empty: return ValidationResult(ValidationStatus.PASSED, - 'ADDED_COUNT', + 'ADDED_RECORDS_COUNT', details={ 'rows_processed': 0, 'rows_succeeded': 0, @@ -339,7 +341,7 @@ def validate_added_count(self, differ_df: pd.DataFrame, if 'ADDED' not in differ_df.columns: return ValidationResult( ValidationStatus.DATA_ERROR, - 'ADDED_COUNT', + 'ADDED_RECORDS_COUNT', message="Input data is missing required column: 'ADDED'.") rows_processed = len(differ_df) @@ -348,18 +350,18 @@ def validate_added_count(self, differ_df: pd.DataFrame, if unique_counts > 1: return ValidationResult( ValidationStatus.FAILED, - 'ADDED_COUNT', + 'ADDED_RECORDS_COUNT', message= "The number of added data points is not consistent across all StatVars.", details={ 'distinct_statvar_count': differ_df['StatVar'].nunique(), - 'distinct_added_counts': unique_counts, + 'distinct_added_records_count': unique_counts, 'rows_processed': rows_processed, 'rows_succeeded': 0, 'rows_failed': rows_processed }) return ValidationResult(ValidationStatus.PASSED, - 'ADDED_COUNT', + 'ADDED_RECORDS_COUNT', details={ 'rows_processed': rows_processed, 'rows_succeeded': rows_processed, @@ -404,7 +406,7 @@ def validate_num_places_consistent(self, stats_df: pd.DataFrame, "The number of places is not consistent across all StatVars.", details={ 'distinct_statvar_count': stats_df['StatVar'].nunique(), - 'distinct_place_counts': unique_counts, + 'distinct_place_count': unique_counts, 'rows_processed': rows_processed, 'rows_succeeded': 0, 'rows_failed': rows_processed @@ -619,7 +621,7 @@ def validate_max_date_consistent(self, stats_df: pd.DataFrame, message="The MaxDate is not consistent across all StatVars.", details={ 'distinct_statvar_count': stats_df['StatVar'].nunique(), - 'distinct_max_date_counts': unique_dates, + 'distinct_max_date_count': unique_dates, 'rows_processed': rows_processed, 'rows_succeeded': 0, 'rows_failed': rows_processed @@ -763,7 +765,7 @@ def validate_unit_consistency(self, stats_df: pd.DataFrame, message="The unit is not consistent across all StatVars.", details={ 'distinct_statvar_count': stats_df['StatVar'].nunique(), - 'distinct_unit_counts': unique_units, + 'distinct_unit_count': unique_units, 'rows_processed': rows_processed, 'rows_succeeded': 0, 'rows_failed': rows_processed diff --git a/tools/import_validation/validator_test.py b/tools/import_validation/validator_test.py index d0e30c3fd1..5d7a77863b 100644 --- a/tools/import_validation/validator_test.py +++ b/tools/import_validation/validator_test.py @@ -67,45 +67,45 @@ def test_max_date_latest_fails_on_missing_column(self): self.assertIn('missing required column', result.message) -class TestDeletedCountValidation(unittest.TestCase): - '''Test Class for the DELETED_COUNT validation rule.''' +class TestDeletedRecordsCountValidation(unittest.TestCase): + '''Test Class for the DELETED_RECORDS_COUNT validation rule.''' def setUp(self): self.validator = Validator() - def test_deleted_count_fails_when_over_threshold(self): + def test_deleted_records_count_fails_when_over_threshold(self): test_df = pd.DataFrame({'DELETED': [1, 1]}) # Total deleted = 2 params = {'threshold': 1} - result = self.validator.validate_deleted_count(test_df, params) + result = self.validator.validate_deleted_records_count(test_df, params) self.assertEqual(result.status, ValidationStatus.FAILED) - self.assertEqual(result.details['deleted_count'], 2) + self.assertEqual(result.details['deleted_records_count'], 2) self.assertEqual(result.details['threshold'], 1) self.assertEqual(result.details['rows_processed'], 2) self.assertEqual(result.details['rows_succeeded'], 0) self.assertEqual(result.details['rows_failed'], 2) - def test_deleted_count_passes_when_at_threshold(self): + def test_deleted_records_count_passes_when_at_threshold(self): test_df = pd.DataFrame({'DELETED': [1, 1]}) # Total deleted = 2 params = {'threshold': 2} - result = self.validator.validate_deleted_count(test_df, params) + result = self.validator.validate_deleted_records_count(test_df, params) self.assertEqual(result.status, ValidationStatus.PASSED) self.assertEqual(result.details['rows_processed'], 2) self.assertEqual(result.details['rows_succeeded'], 2) self.assertEqual(result.details['rows_failed'], 0) - def test_deleted_count_passes_on_empty_dataframe(self): + def test_deleted_records_count_passes_on_empty_dataframe(self): test_df = pd.DataFrame({'DELETED': []}) params = {'threshold': 0} - result = self.validator.validate_deleted_count(test_df, params) + result = self.validator.validate_deleted_records_count(test_df, params) self.assertEqual(result.status, ValidationStatus.PASSED) self.assertEqual(result.details['rows_processed'], 0) self.assertEqual(result.details['rows_succeeded'], 0) self.assertEqual(result.details['rows_failed'], 0) - def test_deleted_count_fails_on_missing_column(self): + def test_deleted_records_count_fails_on_missing_column(self): test_df = pd.DataFrame({'StatVar': ['sv1']}) # Missing 'DELETED' params = {'threshold': 1} - result = self.validator.validate_deleted_count(test_df, params) + result = self.validator.validate_deleted_records_count(test_df, params) self.assertEqual(result.status, ValidationStatus.DATA_ERROR) self.assertIn('missing required column', result.message) @@ -155,86 +155,86 @@ def test_lint_error_count_fails_when_over_threshold(self): self.assertEqual(result.details['lint_error_count'], 5) -class TestModifiedCountValidation(unittest.TestCase): - '''Test Class for the MODIFIED_COUNT validation rule.''' +class TestModifiedRecordsCountValidation(unittest.TestCase): + '''Test Class for the MODIFIED_RECORDS_COUNT validation rule.''' def setUp(self): self.validator = Validator() - def test_modified_count_fails_on_inconsistent_counts(self): + def test_modified_records_count_fails_on_inconsistent_counts(self): test_df = pd.DataFrame({ 'StatVar': ['sv1', 'sv2'], 'MODIFIED': [1, 2] }) # Inconsistent - result = self.validator.validate_modified_count(test_df, {}) + result = self.validator.validate_modified_records_count(test_df, {}) self.assertEqual(result.status, ValidationStatus.FAILED) self.assertEqual(result.details['distinct_statvar_count'], 2) - self.assertEqual(result.details['distinct_modified_counts'], 2) + self.assertEqual(result.details['distinct_modified_records_count'], 2) self.assertEqual(result.details['rows_processed'], 2) self.assertEqual(result.details['rows_succeeded'], 0) self.assertEqual(result.details['rows_failed'], 2) - def test_modified_count_passes_on_consistent_counts(self): + def test_modified_records_count_passes_on_consistent_counts(self): test_df = pd.DataFrame({'MODIFIED': [2, 2]}) # Consistent - result = self.validator.validate_modified_count(test_df, {}) + result = self.validator.validate_modified_records_count(test_df, {}) self.assertEqual(result.status, ValidationStatus.PASSED) self.assertEqual(result.details['rows_processed'], 2) self.assertEqual(result.details['rows_succeeded'], 2) self.assertEqual(result.details['rows_failed'], 0) - def test_modified_count_passes_on_empty_dataframe(self): + def test_modified_records_count_passes_on_empty_dataframe(self): test_df = pd.DataFrame({'MODIFIED': []}) - result = self.validator.validate_modified_count(test_df, {}) + result = self.validator.validate_modified_records_count(test_df, {}) self.assertEqual(result.status, ValidationStatus.PASSED) self.assertEqual(result.details['rows_processed'], 0) self.assertEqual(result.details['rows_succeeded'], 0) self.assertEqual(result.details['rows_failed'], 0) - def test_modified_count_fails_on_missing_column(self): + def test_modified_records_count_fails_on_missing_column(self): test_df = pd.DataFrame({'StatVar': ['sv1']}) # Missing 'MODIFIED' - result = self.validator.validate_modified_count(test_df, {}) + result = self.validator.validate_modified_records_count(test_df, {}) self.assertEqual(result.status, ValidationStatus.DATA_ERROR) self.assertIn('missing required column', result.message) -class TestAddedCountValidation(unittest.TestCase): - '''Test Class for the ADDED_COUNT validation rule.''' +class TestAddedRecordsCountValidation(unittest.TestCase): + '''Test Class for the ADDED_RECORDS_COUNT validation rule.''' def setUp(self): self.validator = Validator() - def test_added_count_fails_on_inconsistent_counts(self): + def test_added_records_count_fails_on_inconsistent_counts(self): test_df = pd.DataFrame({ 'StatVar': ['sv1', 'sv2'], 'ADDED': [1, 2] }) # Inconsistent - result = self.validator.validate_added_count(test_df, {}) + result = self.validator.validate_added_records_count(test_df, {}) self.assertEqual(result.status, ValidationStatus.FAILED) self.assertEqual(result.details['distinct_statvar_count'], 2) - self.assertEqual(result.details['distinct_added_counts'], 2) + self.assertEqual(result.details['distinct_added_records_count'], 2) self.assertEqual(result.details['rows_processed'], 2) self.assertEqual(result.details['rows_succeeded'], 0) self.assertEqual(result.details['rows_failed'], 2) - def test_added_count_passes_on_consistent_counts(self): + def test_added_records_count_passes_on_consistent_counts(self): test_df = pd.DataFrame({'ADDED': [1, 1]}) # Consistent - result = self.validator.validate_added_count(test_df, {}) + result = self.validator.validate_added_records_count(test_df, {}) self.assertEqual(result.status, ValidationStatus.PASSED) self.assertEqual(result.details['rows_processed'], 2) self.assertEqual(result.details['rows_succeeded'], 2) self.assertEqual(result.details['rows_failed'], 0) - def test_added_count_passes_on_empty_dataframe(self): + def test_added_records_count_passes_on_empty_dataframe(self): test_df = pd.DataFrame({'ADDED': []}) - result = self.validator.validate_added_count(test_df, {}) + result = self.validator.validate_added_records_count(test_df, {}) self.assertEqual(result.status, ValidationStatus.PASSED) self.assertEqual(result.details['rows_processed'], 0) self.assertEqual(result.details['rows_succeeded'], 0) self.assertEqual(result.details['rows_failed'], 0) - def test_added_count_fails_on_missing_column(self): + def test_added_records_count_fails_on_missing_column(self): test_df = pd.DataFrame({'StatVar': ['sv1']}) # Missing 'ADDED' - result = self.validator.validate_added_count(test_df, {}) + result = self.validator.validate_added_records_count(test_df, {}) self.assertEqual(result.status, ValidationStatus.DATA_ERROR) self.assertIn('missing required column', result.message) @@ -256,7 +256,7 @@ def test_num_places_consistent_fails_on_inconsistent_counts(self): result.message, "The number of places is not consistent across all StatVars.") self.assertEqual(result.details['distinct_statvar_count'], 2) - self.assertEqual(result.details['distinct_place_counts'], 2) + self.assertEqual(result.details['distinct_place_count'], 2) self.assertEqual(result.details['rows_processed'], 2) self.assertEqual(result.details['rows_succeeded'], 0) self.assertEqual(result.details['rows_failed'], 2) @@ -421,7 +421,7 @@ def test_max_date_consistent_fails_on_inconsistent_dates(self): self.assertEqual(result.message, "The MaxDate is not consistent across all StatVars.") self.assertEqual(result.details['distinct_statvar_count'], 2) - self.assertEqual(result.details['distinct_max_date_counts'], 2) + self.assertEqual(result.details['distinct_max_date_count'], 2) self.assertEqual(result.details['rows_processed'], 2) self.assertEqual(result.details['rows_succeeded'], 0) self.assertEqual(result.details['rows_failed'], 2) @@ -538,7 +538,7 @@ def test_unit_consistency_fails_on_inconsistent_units(self): self.assertEqual(result.message, "The unit is not consistent across all StatVars.") self.assertEqual(result.details['distinct_statvar_count'], 2) - self.assertEqual(result.details['distinct_unit_counts'], 2) + self.assertEqual(result.details['distinct_unit_count'], 2) self.assertEqual(result.details['rows_processed'], 2) self.assertEqual(result.details['rows_succeeded'], 0) self.assertEqual(result.details['rows_failed'], 2)