From 4630a5781cf01fc7b2156fa7f91f2782171d600d Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Wed, 8 Oct 2025 12:47:45 +0200 Subject: [PATCH 01/32] add correctness metric --- metis/metric/correctness/correctness.py | 82 +++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 metis/metric/correctness/correctness.py diff --git a/metis/metric/correctness/correctness.py b/metis/metric/correctness/correctness.py new file mode 100644 index 0000000..842faf6 --- /dev/null +++ b/metis/metric/correctness/correctness.py @@ -0,0 +1,82 @@ +from typing import List, Union + +import pandas as pd + +from metis.metric.metric import Metric +from metis.utils.result import DQResult + + +class Correctness(Metric): + def assess( + self, + data: pd.DataFrame, + reference: Union[pd.DataFrame, None] = None, + metric_config: Union[str, None] = None, + ) -> List[DQResult]: + """ + Assess the correctness of the data by calculating the deviation from the reference. + + :param data: DataFrame to assess. + :param metric_config: Optional configuration for the metric. + :return: List of DQResult objects containing correctness results. + """ + if reference is None: + raise ValueError( + "Reference DataFrame is required for correctness assessment." + ) + + results = [] + total_rows = len(data) + + for col_name in data.columns: + for row_index in range(total_rows): + measurement = self.measure_correctness( + data.at[row_index, col_name], + reference_value=reference.at[row_index, col_name], + dtype=data[col_name].dtype, + ) + + result = DQResult( + mesTime=pd.Timestamp.now(), + DQvalue=measurement, + DQdimension="Correctness", + DQmetric="Correctness", + columnNames=[col_name], + rowIndex=row_index, + ) + results.append(result) + + return results + + def measure_correctness(self, value, *, reference_value, dtype): + if value == reference_value: + return 1 + if pd.isna(value) or pd.isna(reference_value): + return 0 + if dtype == "int64" or dtype == "float64": + return abs(value - reference_value) / max(abs(reference_value), abs(value)) + if dtype == "object": + return self.levenshteinDistance(str(value), str(reference_value)) / max( + len(str(value)), len(str(reference_value)) + ) + raise ValueError( + f"Unsupported dtype for correctness measurement: {dtype} (value: {value}, reference_value: {reference_value})" + ) + + # https://stackoverflow.com/a/32558749 + def levenshteinDistance(self, s1, s2): + if len(s1) > len(s2): + s1, s2 = s2, s1 + + distances = range(len(s1) + 1) + for i2, c2 in enumerate(s2): + distances_ = [i2 + 1] + for i1, c1 in enumerate(s1): + if c1 == c2: + distances_.append(distances[i1]) + else: + distances_.append( + 1 + min((distances[i1], distances[i1 + 1], distances_[-1])) + ) + distances = distances_ + return distances[-1] From 7c131bf1a11eb95f959456523949be20ed4cc5b0 Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Wed, 8 Oct 2025 12:47:54 +0200 Subject: [PATCH 02/32] add rule-based consistency metric --- metis/metric/__init__.py | 8 +-- metis/metric/config.py | 20 ++++++++ metis/metric/consistency/config.py | 15 ++++++ metis/metric/consistency/consistency.py | 10 ++-- metis/metric/consistency/rule_consistency.py | 52 ++++++++++++++++++++ metis/metric/metric.py | 28 ++++++++--- metis/utils/result.py | 2 +- 7 files changed, 120 insertions(+), 15 deletions(-) create mode 100644 metis/metric/config.py create mode 100644 metis/metric/consistency/config.py create mode 100644 metis/metric/consistency/rule_consistency.py diff --git a/metis/metric/__init__.py b/metis/metric/__init__.py index 
899af06..775e613 100644 --- a/metis/metric/__init__.py +++ b/metis/metric/__init__.py @@ -1,5 +1,7 @@ -from .metric import Metric from .completeness.completeness import Completeness from .consistency.consistency import Consistency -from .minimality.column_minimality_duplicateCount import column_minimality_duplicateCount -from .validity.out_of_vocabulary import OutOfVocabulary \ No newline at end of file +from .metric import Metric +from .minimality.column_minimality_duplicateCount import ( + column_minimality_duplicateCount, +) +from .validity.out_of_vocabulary import OutOfVocabulary diff --git a/metis/metric/config.py b/metis/metric/config.py new file mode 100644 index 0000000..3fec237 --- /dev/null +++ b/metis/metric/config.py @@ -0,0 +1,20 @@ +from dataclasses import dataclass + + +@dataclass +class MetricConfig: + """ + Base class for metric configuration. + All metric configuration classes should inherit from this class. + """ + + @classmethod + def from_dict(cls, config_dict: dict): + """ + Create an instance of the configuration class from a dictionary. + + :param config_dict: A dictionary containing the configuration parameters. + + :return: An instance of the configuration class. + """ + return cls(**config_dict) diff --git a/metis/metric/consistency/config.py b/metis/metric/consistency/config.py new file mode 100644 index 0000000..4ddbec8 --- /dev/null +++ b/metis/metric/consistency/config.py @@ -0,0 +1,15 @@ +from dataclasses import dataclass +from typing import Any, Callable, Dict, List + +from metis.metric.config import MetricConfig + + +@dataclass +class RuleConsistencyConfig(MetricConfig): + """ + Configuration class for the RuleConsistency metric. + """ + + rules: Dict[ + str, List[Callable[[Any], float]] + ] # Dictionary of functions that define consistency rules for each column given by the key diff --git a/metis/metric/consistency/consistency.py b/metis/metric/consistency/consistency.py index 50c026d..35c7234 100644 --- a/metis/metric/consistency/consistency.py +++ b/metis/metric/consistency/consistency.py @@ -9,7 +9,7 @@ class Consistency(Metric): def assess(self, data: pd.DataFrame, reference: Union[pd.DataFrame, None] = None, metric_config: Union[str, None] = None) -> List[DQResult]: """ Assess the consistency of a dataset by checking the compliance of a functional dependency specified in the metric_config. - + :param data: DataFrame to assess. :param metric_config: JSON that specifies FDs to check. :return: List of DQResult objects containing accuracy results. 
@@ -25,7 +25,7 @@ def assess(self, data: pd.DataFrame, reference: Union[pd.DataFrame, None] = None for determinant, dependents in metric_conf.items(): if determinant not in data.columns: continue - + for dependent in dependents: if dependent not in data.columns: continue @@ -36,7 +36,7 @@ def assess(self, data: pd.DataFrame, reference: Union[pd.DataFrame, None] = None # find groups where there's more than one dependent value # for the same determinant (FD violation) violations = grouped[grouped > 1].index.tolist() - + consistency = 1 - (len(violations) / len(data[determinant])) result = DQResult( @@ -48,5 +48,5 @@ def assess(self, data: pd.DataFrame, reference: Union[pd.DataFrame, None] = None DQannotations={f"{determinant}:{dependent}":violations} # FD ) results.append(result) - - return results \ No newline at end of file + + return results diff --git a/metis/metric/consistency/rule_consistency.py b/metis/metric/consistency/rule_consistency.py new file mode 100644 index 0000000..d51ac5f --- /dev/null +++ b/metis/metric/consistency/rule_consistency.py @@ -0,0 +1,52 @@ +from typing import List, Union + +import pandas as pd + +from metis.metric.consistency.config import RuleConsistencyConfig +from metis.metric.metric import Metric +from metis.utils.result import DQResult + + +class RuleConsistency(Metric): + def assess( + self, + data: pd.DataFrame, + reference: Union[pd.DataFrame, None] = None, + metric_config: str | None | RuleConsistencyConfig = None, + ) -> List[DQResult]: + """ + Assess the consistency of the data by checking each value for the given rules. + + :param data: DataFrame to assess. + :param metric_config: Optional configuration for the metric. + :return: List of DQResult objects containing consistency results. + """ + if metric_config is None: + raise ValueError("Metric configuration is required for rule-based consistency assessment.") + if isinstance(metric_config, str): + raise ValueError("Metric configuration must be a RuleConsistencyConfig instance. JSON loading is not supported.") + + rules = metric_config.rules + + results = [] + total_rows = len(data) + + for col_name in data.columns: + column_rules = rules.get(col_name, []) + for row_index in range(total_rows): + degree_of_violation = sum( + rule(data.at[row_index, col_name]) for rule in column_rules + ) + measurement = 1 / (1 + degree_of_violation) + + result = DQResult( + mesTime=pd.Timestamp.now(), + DQvalue=measurement, + DQdimension="Consistency", + DQmetric="RuleConsistency", + columnNames=[col_name], + rowIndex=row_index, + ) + results.append(result) + + return results diff --git a/metis/metric/metric.py b/metis/metric/metric.py index e23dc62..4bdadd5 100644 --- a/metis/metric/metric.py +++ b/metis/metric/metric.py @@ -1,9 +1,12 @@ +import json from abc import ABC, abstractmethod -import pandas as pd from typing import List, Union +import pandas as pd + from metis.utils.result import DQResult + class Metric(ABC): """ Abstract base class for metrics. @@ -16,9 +19,9 @@ def __init_subclass__(cls): Metric.registry[cls.__name__] = cls @abstractmethod - def assess(self, - data: pd.DataFrame, - reference: Union[pd.DataFrame, None] = None, + def assess(self, + data: pd.DataFrame, + reference: Union[pd.DataFrame, None] = None, metric_config: Union[str, None] = None) -> List[DQResult]: """Assess data using this metric and return the results. @@ -37,7 +40,7 @@ def assess(self, - metric_config: Optional[str] Optional path or JSON string containing metric-specific configuration. 
Use this to keep the method signature compact; - all metric-specific parameters (thresholds, aggregation options, + all metric-specific parameters (thresholds, aggregation options, etc.) can be stored here. Returns @@ -64,4 +67,17 @@ def assess(self, `reference` and returns one `DQResult` per cell in the input table containing the agreement score. """ - raise NotImplementedError() \ No newline at end of file + raise NotImplementedError() + + def load_config(self, config: str) -> dict: + """ + Load metric-specific configuration from a JSON file. + + :param config: Path to the JSON configuration file or a JSON string. + :return: Dictionary containing the configuration parameters. + """ + if config.endswith(".json"): + with open(config, 'r') as f: + return json.load(f) + + return json.loads(config) diff --git a/metis/utils/result.py b/metis/utils/result.py index cc4457c..2df5f41 100644 --- a/metis/utils/result.py +++ b/metis/utils/result.py @@ -137,4 +137,4 @@ def as_json(self): "columnNames": self._columnNames, "rowIndex": self._rowIndex, "DQannotations": self._DQannotations - } \ No newline at end of file + } From c540fdc8c1bc88e2338decdfd9fd2286e392152e Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Sat, 11 Oct 2025 21:28:02 +0200 Subject: [PATCH 03/32] add currency metric --- metis/metric/__init__.py | 3 ++ metis/metric/currency/config.py | 15 +++++++ metis/metric/currency/currency.py | 67 +++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+) create mode 100644 metis/metric/currency/config.py create mode 100644 metis/metric/currency/currency.py diff --git a/metis/metric/__init__.py b/metis/metric/__init__.py index 775e613..7c9ebde 100644 --- a/metis/metric/__init__.py +++ b/metis/metric/__init__.py @@ -5,3 +5,6 @@ column_minimality_duplicateCount, ) from .validity.out_of_vocabulary import OutOfVocabulary +from .consistency.rule_consistency import RuleConsistency +from .correctness.correctness import Correctness +from .currency.currency import Currency diff --git a/metis/metric/currency/config.py b/metis/metric/currency/config.py new file mode 100644 index 0000000..62eb0ce --- /dev/null +++ b/metis/metric/currency/config.py @@ -0,0 +1,15 @@ +from dataclasses import dataclass +from typing import Dict + +from metis.metric.config import MetricConfig + + +@dataclass +class CurrencyConfig(MetricConfig): + """ + Configuration class for the Currency metric. + """ + + decline_rate_per_column: Dict[str, float] + ingestion_date_column: str + simulated_assessment_date: str | None = None diff --git a/metis/metric/currency/currency.py b/metis/metric/currency/currency.py new file mode 100644 index 0000000..d94708d --- /dev/null +++ b/metis/metric/currency/currency.py @@ -0,0 +1,67 @@ +from math import exp +from typing import List + +import pandas as pd + +from metis.metric.config import MetricConfig +from metis.metric.currency.config import CurrencyConfig +from metis.metric.metric import Metric +from metis.utils.result import DQResult + + +class Currency(Metric): + def assess( + self, + data: pd.DataFrame, + reference: pd.DataFrame | None = None, + metric_config: str | MetricConfig | None = None, + ) -> List[DQResult]: + """ + Assess the currency of the data by calculating the deviation from the reference. + + :param data: DataFrame to assess. + :param metric_config: Optional configuration for the metric. + :return: List of DQResult objects containing currency results. + """ + if not metric_config: + raise ValueError( + "Metric configuration is required for currency assessment." 
+ ) + + config = self.load_config(metric_config, CurrencyConfig) + + ingestion_date_column = config.ingestion_date_column + assessment_date = pd.to_datetime( + config.simulated_assessment_date or pd.Timestamp.now() + ) + + results = [] + total_rows = len(data) + + for col_name in data.columns: + decline_rate = config.decline_rate_per_column.get(col_name) + if decline_rate is None: + print( + f"Decline rate for column '{col_name}' is not specified in the configuration. Skipping." + ) + continue + + for row_index in range(total_rows): + ingestion_date = pd.to_datetime( + data.at[row_index, ingestion_date_column], dayfirst=True + ) + delta: pd.Timedelta = (assessment_date - ingestion_date) + age = delta.days // 365 + measurement = exp(-decline_rate * age) if pd.notna(age) else 0 + + result = DQResult( + mesTime=pd.Timestamp.now(), + DQvalue=measurement, + DQdimension="Currency", + DQmetric="Currency", + columnNames=[col_name], + rowIndex=row_index, + ) + results.append(result) + + return results From 89da95b0803708a26ef0f0a2d0124665aef31bc3 Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Sun, 12 Oct 2025 13:48:05 +0200 Subject: [PATCH 04/32] add MetricConfig support in addition to json configuration --- metis/dq_orchestrator.py | 3 +- metis/metric/completeness/completeness.py | 27 ++-- metis/metric/consistency/config.py | 10 ++ metis/metric/consistency/consistency.py | 31 ++++- metis/metric/consistency/rule_consistency.py | 24 +++- metis/metric/correctness/correctness.py | 12 +- metis/metric/currency/currency.py | 9 +- metis/metric/metric.py | 126 ++++++++++--------- metis/utils/dq_dimension.py | 8 ++ metis/utils/result.py | 4 +- metis/writer/console_writer.py | 4 +- 11 files changed, 169 insertions(+), 89 deletions(-) create mode 100644 metis/utils/dq_dimension.py diff --git a/metis/dq_orchestrator.py b/metis/dq_orchestrator.py index 0d699a3..8a652e6 100644 --- a/metis/dq_orchestrator.py +++ b/metis/dq_orchestrator.py @@ -5,6 +5,7 @@ from metis.loader.csv_loader import CSVLoader from metis.metric import Metric +from metis.metric.config import MetricConfig from metis.utils.data_config import DataConfig from metis.utils.result import DQResult from metis.writer.console_writer import ConsoleWriter @@ -54,7 +55,7 @@ def load(self, data_loader_configs: List[str]) -> None: f"Unsupported loader type: {config_data.get('loader', None)}" ) - def assess(self, metrics: List[str], metric_configs: List[str | None]) -> None: + def assess(self, metrics: List[str], metric_configs: List[str | MetricConfig | None]) -> None: results = [] for metric, metric_config in zip(metrics, metric_configs): diff --git a/metis/metric/completeness/completeness.py b/metis/metric/completeness/completeness.py index 885d48f..f08a3cd 100644 --- a/metis/metric/completeness/completeness.py +++ b/metis/metric/completeness/completeness.py @@ -1,32 +1,41 @@ +from typing import List + import pandas as pd -from typing import List, Union -from metis.utils.result import DQResult +from metis.metric.config import MetricConfig from metis.metric.metric import Metric +from metis.utils.dq_dimension import DQDimension +from metis.utils.result import DQResult + class Completeness(Metric): - def assess(self, data: pd.DataFrame, reference: Union[pd.DataFrame, None] = None, metric_config: Union[str, None] = None) -> List[DQResult]: + def assess( + self, + data: pd.DataFrame, + reference: pd.DataFrame | None = None, + metric_config: str | MetricConfig | None = None, + ) -> List[DQResult]: """ Assess the completeness of the data by checking for missing 
values. - + :param data: DataFrame to assess. :param metric_config: Optional configuration for the metric. :return: List of DQResult objects containing completeness results. """ results = [] total_rows = len(data) - + for column in data.columns: missing_count = data[column].isnull().sum() completeness = (total_rows - int(missing_count)) / total_rows - + result = DQResult( mesTime=pd.Timestamp.now(), DQvalue=completeness, - DQdimension="Completeness", + DQdimension=DQDimension.COMPLETENESS, DQmetric="Completeness", columnNames=[column], ) results.append(result) - - return results \ No newline at end of file + + return results diff --git a/metis/metric/consistency/config.py b/metis/metric/consistency/config.py index 4ddbec8..2492df1 100644 --- a/metis/metric/consistency/config.py +++ b/metis/metric/consistency/config.py @@ -4,6 +4,16 @@ from metis.metric.config import MetricConfig +@dataclass +class ConsistencyConfig(MetricConfig): + """ + Configuration class for the Consistency metric. + """ + + rules: Dict[ + str, List[Callable[[Any], float]] + ] # Dictionary of functions that define consistency rules for each column given by the key + @dataclass class RuleConsistencyConfig(MetricConfig): """ diff --git a/metis/metric/consistency/consistency.py b/metis/metric/consistency/consistency.py index 35c7234..d5eb4c6 100644 --- a/metis/metric/consistency/consistency.py +++ b/metis/metric/consistency/consistency.py @@ -1,12 +1,21 @@ -import pandas as pd -from typing import List, Union import json +from typing import List -from metis.utils.result import DQResult +import pandas as pd + +from metis.metric.config import MetricConfig from metis.metric.metric import Metric +from metis.utils.dq_dimension import DQDimension +from metis.utils.result import DQResult + class Consistency(Metric): - def assess(self, data: pd.DataFrame, reference: Union[pd.DataFrame, None] = None, metric_config: Union[str, None] = None) -> List[DQResult]: + def assess( + self, + data: pd.DataFrame, + reference: pd.DataFrame | None = None, + metric_config: str | MetricConfig | None = None, + ) -> List[DQResult]: """ Assess the consistency of a dataset by checking the compliance of a functional dependency specified in the metric_config. @@ -20,8 +29,18 @@ def assess(self, data: pd.DataFrame, reference: Union[pd.DataFrame, None] = None if total_rows == 0: return results + if metric_config is None: + raise ValueError( + "Metric configuration is required for consistency assessment." + ) + if not isinstance(metric_config, str): + raise ValueError( + "Metric configuration must be a file path to a JSON configuration or a JSON string." 
+ ) + with open(metric_config, "r") as f: metric_conf = json.load(f) + for determinant, dependents in metric_conf.items(): if determinant not in data.columns: continue @@ -42,10 +61,10 @@ def assess(self, data: pd.DataFrame, reference: Union[pd.DataFrame, None] = None result = DQResult( mesTime=pd.Timestamp.now(), DQvalue=consistency, - DQdimension="Consistency", + DQdimension=DQDimension.CONSISTENCY, DQmetric="Consistency", columnNames=[determinant], - DQannotations={f"{determinant}:{dependent}":violations} # FD + DQannotations={f"{determinant}:{dependent}": violations}, # FD ) results.append(result) diff --git a/metis/metric/consistency/rule_consistency.py b/metis/metric/consistency/rule_consistency.py index d51ac5f..3cb4adc 100644 --- a/metis/metric/consistency/rule_consistency.py +++ b/metis/metric/consistency/rule_consistency.py @@ -2,8 +2,10 @@ import pandas as pd +from metis.metric.config import MetricConfig from metis.metric.consistency.config import RuleConsistencyConfig from metis.metric.metric import Metric +from metis.utils.dq_dimension import DQDimension from metis.utils.result import DQResult @@ -12,7 +14,7 @@ def assess( self, data: pd.DataFrame, reference: Union[pd.DataFrame, None] = None, - metric_config: str | None | RuleConsistencyConfig = None, + metric_config: str | None | MetricConfig = None, ) -> List[DQResult]: """ Assess the consistency of the data by checking each value for the given rules. @@ -22,9 +24,17 @@ def assess( :return: List of DQResult objects containing consistency results. """ if metric_config is None: - raise ValueError("Metric configuration is required for rule-based consistency assessment.") + raise ValueError( + "Metric configuration is required for rule-based consistency assessment." + ) if isinstance(metric_config, str): - raise ValueError("Metric configuration must be a RuleConsistencyConfig instance. JSON loading is not supported.") + raise ValueError( + "Metric configuration must be a RuleConsistencyConfig instance. JSON loading is not supported." + ) + if not isinstance(metric_config, RuleConsistencyConfig): + raise ValueError( + "Metric configuration must be a RuleConsistencyConfig instance." + ) rules = metric_config.rules @@ -33,6 +43,12 @@ def assess( for col_name in data.columns: column_rules = rules.get(col_name, []) + if len(column_rules) == 0: + print( + f"No consistency rules defined for column '{col_name}'. Skipping." 
+ ) + continue + for row_index in range(total_rows): degree_of_violation = sum( rule(data.at[row_index, col_name]) for rule in column_rules @@ -42,7 +58,7 @@ def assess( result = DQResult( mesTime=pd.Timestamp.now(), DQvalue=measurement, - DQdimension="Consistency", + DQdimension=DQDimension.CONSISTENCY, DQmetric="RuleConsistency", columnNames=[col_name], rowIndex=row_index, diff --git a/metis/metric/correctness/correctness.py b/metis/metric/correctness/correctness.py index 842faf6..0b58c44 100644 --- a/metis/metric/correctness/correctness.py +++ b/metis/metric/correctness/correctness.py @@ -1,8 +1,10 @@ -from typing import List, Union +from typing import List import pandas as pd +from metis.metric.config import MetricConfig from metis.metric.metric import Metric +from metis.utils.dq_dimension import DQDimension from metis.utils.result import DQResult @@ -10,8 +12,8 @@ class Correctness(Metric): def assess( self, data: pd.DataFrame, - reference: Union[pd.DataFrame, None] = None, - metric_config: Union[str, None] = None, + reference: pd.DataFrame | None = None, + metric_config: str | MetricConfig | None = None, ) -> List[DQResult]: """ Assess the correctness of the data by calculating the deviation from the reference. @@ -38,8 +40,8 @@ def assess( result = DQResult( mesTime=pd.Timestamp.now(), - DQvalue=measurement, - DQdimension="Correctness", + DQvalue=float(measurement), + DQdimension=DQDimension.CORRECTNESS, DQmetric="Correctness", columnNames=[col_name], rowIndex=row_index, diff --git a/metis/metric/currency/currency.py b/metis/metric/currency/currency.py index d94708d..4ed5149 100644 --- a/metis/metric/currency/currency.py +++ b/metis/metric/currency/currency.py @@ -6,6 +6,7 @@ from metis.metric.config import MetricConfig from metis.metric.currency.config import CurrencyConfig from metis.metric.metric import Metric +from metis.utils.dq_dimension import DQDimension from metis.utils.result import DQResult @@ -48,16 +49,16 @@ def assess( for row_index in range(total_rows): ingestion_date = pd.to_datetime( - data.at[row_index, ingestion_date_column], dayfirst=True + str(data.at[row_index, ingestion_date_column]), dayfirst=True ) - delta: pd.Timedelta = (assessment_date - ingestion_date) - age = delta.days // 365 + delta = (assessment_date - ingestion_date) + age = delta.days / 365 measurement = exp(-decline_rate * age) if pd.notna(age) else 0 result = DQResult( mesTime=pd.Timestamp.now(), DQvalue=measurement, - DQdimension="Currency", + DQdimension=DQDimension.CURRENCY, DQmetric="Currency", columnNames=[col_name], rowIndex=row_index, diff --git a/metis/metric/metric.py b/metis/metric/metric.py index 4bdadd5..0ba5a5a 100644 --- a/metis/metric/metric.py +++ b/metis/metric/metric.py @@ -1,17 +1,21 @@ import json from abc import ABC, abstractmethod -from typing import List, Union +from typing import Any, List, TypeVar, cast, overload import pandas as pd +from metis.metric.config import MetricConfig from metis.utils.result import DQResult +C = TypeVar("C", bound=MetricConfig) + class Metric(ABC): """ Abstract base class for metrics. All metric classes should inherit from this class and implement the `compute` method. """ + registry = {} def __init_subclass__(cls): @@ -19,65 +23,73 @@ def __init_subclass__(cls): Metric.registry[cls.__name__] = cls @abstractmethod - def assess(self, - data: pd.DataFrame, - reference: Union[pd.DataFrame, None] = None, - metric_config: Union[str, None] = None) -> List[DQResult]: - """Assess data using this metric and return the results. 
- - Parameters - - data: pd.DataFrame - The DataFrame that should be assessed by this metric. This is - the primary dataset under inspection. - - - reference: Optional[pd.DataFrame] - An optional, cleaned reference DataFrame that can act as a - clean version of the dataset. Metrics that need a canonical or - expected version of the data (for example correctness against a - known-good source) should accept and use this DataFrame. If not - needed by a metric, `None` is allowed. - - - metric_config: Optional[str] - Optional path or JSON string containing metric-specific - configuration. Use this to keep the method signature compact; - all metric-specific parameters (thresholds, aggregation options, - etc.) can be stored here. - - Returns - - List[DQResult] - A list of `DQResult` objects. Each `DQResult` instance captures - one assessed value produced by the metric. For metrics that - operate at the column level, there should be one `DQResult` per - column; for table-level metrics typically a single `DQResult` - is returned. Implementations are free to return multiple - results for any logical decomposition the metric provides - (e.g., per-column, per-partition, per-check). - - Notes - - Implementations must avoid mutating the - input `data` and `reference` DataFrames in-place. - - `metric_config` should be parsed by the implementation and any - invalid config should raise a clear exception describing the - expected format. - - Examples - - Column-level completeness metric: returns one `DQResult` per - column with the fraction of non-null values. - - Correctness metric against a reference: compares `data` to - `reference` and returns one `DQResult` per cell in the input table containing the - agreement score. - """ - raise NotImplementedError() - - def load_config(self, config: str) -> dict: + def assess( + self, + data: pd.DataFrame, + reference: pd.DataFrame | None = None, + metric_config: str | MetricConfig | None = None, + ) -> List[DQResult]: + """Assess data using this metric and return the results. + + Parameters + - data: pd.DataFrame + The DataFrame that should be assessed by this metric. This is + the primary dataset under inspection. + + - reference: Optional[pd.DataFrame] + An optional, cleaned reference DataFrame that can act as a + clean version of the dataset. Metrics that need a canonical or + expected version of the data (for example correctness against a + known-good source) should accept and use this DataFrame. If not + needed by a metric, `None` is allowed. + + - metric_config: Optional[str] + Optional path or JSON string containing metric-specific + configuration. Use this to keep the method signature compact; + all metric-specific parameters (thresholds, aggregation options, + etc.) can be stored here. + + Returns + - List[DQResult] + A list of `DQResult` objects. Each `DQResult` instance captures + one assessed value produced by the metric. For metrics that + operate at the column level, there should be one `DQResult` per + column; for table-level metrics typically a single `DQResult` + is returned. Implementations are free to return multiple + results for any logical decomposition the metric provides + (e.g., per-column, per-partition, per-check). + + Notes + - Implementations must avoid mutating the + input `data` and `reference` DataFrames in-place. + - `metric_config` should be parsed by the implementation and any + invalid config should raise a clear exception describing the + expected format. 
+ + Examples + - Column-level completeness metric: returns one `DQResult` per + column with the fraction of non-null values. + - Correctness metric against a reference: compares `data` to + `reference` and returns one `DQResult` per cell in the input table containing the + agreement score. + """ + raise NotImplementedError() + + def load_config(self, config: Any, model: type[C]) -> C: """ Load metric-specific configuration from a JSON file. :param config: Path to the JSON configuration file or a JSON string. - :return: Dictionary containing the configuration parameters. + :return: An instance of the metric-specific configuration class. """ - if config.endswith(".json"): - with open(config, 'r') as f: - return json.load(f) + if isinstance(config, model): + return config + + if isinstance(config, str) and config.endswith(".json"): + with open(config, "r") as f: + return model(**json.load(f)) + + if isinstance(config, str): + return model(**json.loads(config)) - return json.loads(config) + raise TypeError(f"Invalid config type: {type(config)}. Expected str or {model}.") diff --git a/metis/utils/dq_dimension.py b/metis/utils/dq_dimension.py new file mode 100644 index 0000000..4b423e4 --- /dev/null +++ b/metis/utils/dq_dimension.py @@ -0,0 +1,8 @@ +from enum import StrEnum + + +class DQDimension(StrEnum): + CONSISTENCY = "Consistency" + CORRECTNESS = "Correctness" + COMPLETENESS = "Completeness" + CURRENCY = "Currency" diff --git a/metis/utils/result.py b/metis/utils/result.py index 2df5f41..62a32a2 100644 --- a/metis/utils/result.py +++ b/metis/utils/result.py @@ -1,12 +1,14 @@ from typing import List, Union import pandas as pd +from metis.utils.dq_dimension import DQDimension + class DQResult: def __init__( self, mesTime: pd.Timestamp, DQvalue: float, - DQdimension: str, + DQdimension: DQDimension, DQmetric: str, columnNames: Union[List[str], None] = None, rowIndex: Union[int, None] = None, diff --git a/metis/writer/console_writer.py b/metis/writer/console_writer.py index c5eb6f2..4cbc88e 100644 --- a/metis/writer/console_writer.py +++ b/metis/writer/console_writer.py @@ -3,9 +3,9 @@ from metis.utils.result import DQResult class ConsoleWriter: - def __init__(self, writer_config: Dict = None) -> None: + def __init__(self, writer_config: Dict | None = None) -> None: pass def write(self, results: List[DQResult]) -> None: for result in results: - print(result.as_json()) \ No newline at end of file + print(result.as_json()) From 379641fc7c28736b2c0e249adb6009f9da19d058 Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Sun, 19 Oct 2025 20:42:53 +0200 Subject: [PATCH 05/32] prototypical assessment of certainty of the rule-consistency metric --- metis/metric/consistency/rule_consistency.py | 23 +++++++++++++++++--- metis/metric/correctness/correctness.py | 4 ++-- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/metis/metric/consistency/rule_consistency.py b/metis/metric/consistency/rule_consistency.py index 3cb4adc..e5dd4bf 100644 --- a/metis/metric/consistency/rule_consistency.py +++ b/metis/metric/consistency/rule_consistency.py @@ -1,3 +1,4 @@ +from math import sqrt from typing import List, Union import pandas as pd @@ -38,7 +39,7 @@ def assess( rules = metric_config.rules - results = [] + results: List[DQResult] = [] total_rows = len(data) for col_name in data.columns: @@ -49,20 +50,36 @@ def assess( ) continue + max_violation = 0.0 + column_results: List[DQResult] = [] + for row_index in range(total_rows): degree_of_violation = sum( rule(data.at[row_index, col_name]) for rule in 
column_rules ) measurement = 1 / (1 + degree_of_violation) + max_violation = max(max_violation, degree_of_violation) result = DQResult( mesTime=pd.Timestamp.now(), DQvalue=measurement, DQdimension=DQDimension.CONSISTENCY, - DQmetric="RuleConsistency", + DQmetric=self.__class__.__name__, columnNames=[col_name], rowIndex=row_index, ) - results.append(result) + column_results.append(result) + + maximum_rules_coverage = 1 / (1 + max_violation) + for result in column_results: + certainty = sqrt( + (1 - result.DQvalue + maximum_rules_coverage) + * maximum_rules_coverage + ) + result.DQannotations = { + "certainty": certainty, + } + + results.extend(column_results) return results diff --git a/metis/metric/correctness/correctness.py b/metis/metric/correctness/correctness.py index 0b58c44..b1b42f4 100644 --- a/metis/metric/correctness/correctness.py +++ b/metis/metric/correctness/correctness.py @@ -58,7 +58,7 @@ def measure_correctness(self, value, *, reference_value, dtype): if dtype == "int64" or dtype == "float64": return abs(value - reference_value) / max(abs(reference_value), abs(value)) if dtype == "object": - return self.levenshteinDistance(str(value), str(reference_value)) / max( + return self.levenshtein_distance(str(value), str(reference_value)) / max( len(str(value)), len(str(reference_value)) ) raise ValueError( @@ -66,7 +66,7 @@ def measure_correctness(self, value, *, reference_value, dtype): ) # https://stackoverflow.com/a/32558749 - def levenshteinDistance(self, s1, s2): + def levenshtein_distance(self, s1, s2): if len(s1) > len(s2): s1, s2 = s2, s1 From c823e937225c42a2c35a14c40d40d02a14648b74 Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Sun, 19 Oct 2025 21:36:47 +0200 Subject: [PATCH 06/32] fix correctness metric and adjust docstring --- metis/metric/consistency/consistency.py | 2 +- metis/metric/correctness/correctness.py | 4 ++-- metis/utils/result.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/metis/metric/consistency/consistency.py b/metis/metric/consistency/consistency.py index d5eb4c6..7b7b70c 100644 --- a/metis/metric/consistency/consistency.py +++ b/metis/metric/consistency/consistency.py @@ -35,7 +35,7 @@ def assess( ) if not isinstance(metric_config, str): raise ValueError( - "Metric configuration must be a file path to a JSON configuration or a JSON string." + "Metric configuration must be a file path to a JSON configuration." ) with open(metric_config, "r") as f: diff --git a/metis/metric/correctness/correctness.py b/metis/metric/correctness/correctness.py index b1b42f4..272ea4a 100644 --- a/metis/metric/correctness/correctness.py +++ b/metis/metric/correctness/correctness.py @@ -56,9 +56,9 @@ def measure_correctness(self, value, *, reference_value, dtype): if pd.isna(value) or pd.isna(reference_value): return 0 if dtype == "int64" or dtype == "float64": - return abs(value - reference_value) / max(abs(reference_value), abs(value)) + return 1 - abs(value - reference_value) / max(abs(reference_value), abs(value)) if dtype == "object": - return self.levenshtein_distance(str(value), str(reference_value)) / max( + return 1 - self.levenshtein_distance(str(value), str(reference_value)) / max( len(str(value)), len(str(reference_value)) ) raise ValueError( diff --git a/metis/utils/result.py b/metis/utils/result.py index 62a32a2..be114f2 100644 --- a/metis/utils/result.py +++ b/metis/utils/result.py @@ -21,7 +21,7 @@ def __init__( Required arguments - `mesTime: pd.Timestamp`: The time at which the result was assessed. 
- `DQvalue: float`: Numeric outcome of the assessment (quantitative only). - - `DQdimension: str`: Data quality dimension assessed (e.g. 'completeness', 'accuracy'). + - `DQdimension: DQDimension`: Data quality dimension assessed (e.g. DQDimension.COMPLETENESS, DQDimension.ACCURACY). - `DQmetric: str`: Name of the specific metric within the dimension. Optional arguments From 8947834ef2ea4873b64bebcea2e08afda18fe7db Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Mon, 1 Dec 2025 22:59:13 +0100 Subject: [PATCH 07/32] add prototypical certainty calculation to correctness and currency --- metis/metric/correctness/correctness.py | 34 +++++++++++++++++-------- metis/metric/currency/currency.py | 9 +++++-- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/metis/metric/correctness/correctness.py b/metis/metric/correctness/correctness.py index 272ea4a..db85246 100644 --- a/metis/metric/correctness/correctness.py +++ b/metis/metric/correctness/correctness.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Tuple import pandas as pd @@ -32,7 +32,7 @@ def assess( for col_name in data.columns: for row_index in range(total_rows): - measurement = self.measure_correctness( + measurement, certainty = self.measure_correctness( data.at[row_index, col_name], reference_value=reference.at[row_index, col_name], dtype=data[col_name].dtype, @@ -42,31 +42,43 @@ def assess( mesTime=pd.Timestamp.now(), DQvalue=float(measurement), DQdimension=DQDimension.CORRECTNESS, - DQmetric="Correctness", + DQmetric=self.__class__.__name__, columnNames=[col_name], rowIndex=row_index, + DQannotations={ + "certainty": certainty, + }, ) results.append(result) return results - def measure_correctness(self, value, *, reference_value, dtype): + def measure_correctness( + self, value, *, reference_value, dtype + ) -> Tuple[float, float]: if value == reference_value: - return 1 + return 1, 1 if pd.isna(value) or pd.isna(reference_value): - return 0 + return 0, 1 if dtype == "int64" or dtype == "float64": - return 1 - abs(value - reference_value) / max(abs(reference_value), abs(value)) + return ( + 1 + - abs(value - reference_value) / max(abs(reference_value), abs(value)), + 1, + ) if dtype == "object": - return 1 - self.levenshtein_distance(str(value), str(reference_value)) / max( - len(str(value)), len(str(reference_value)) + max_len = max(len(str(value)), len(str(reference_value))) + correctness = ( + 1 + - self.levenshtein_distance(str(value), str(reference_value)) / max_len ) + return correctness, 1 / (1 + max_len / 10) # certainty decreases with length raise ValueError( f"Unsupported dtype for correctness measurement: {dtype} (value: {value}, reference_value: {reference_value})" ) # https://stackoverflow.com/a/32558749 - def levenshtein_distance(self, s1, s2): + def levenshtein_distance(self, s1: str, s2: str) -> int: if len(s1) > len(s2): s1, s2 = s2, s1 @@ -78,7 +90,7 @@ def levenshtein_distance(self, s1, s2): distances_.append(distances[i1]) else: distances_.append( - 1 + min((distances[i1], distances[i1 + 1], distances_[-1])) + 1 + min(distances[i1], distances[i1 + 1], distances_[-1]) ) distances = distances_ return distances[-1] diff --git a/metis/metric/currency/currency.py b/metis/metric/currency/currency.py index 4ed5149..a3f4925 100644 --- a/metis/metric/currency/currency.py +++ b/metis/metric/currency/currency.py @@ -38,6 +38,7 @@ def assess( results = [] total_rows = len(data) + decline_rate_variance = 0.1 for col_name in data.columns: decline_rate = config.decline_rate_per_column.get(col_name) 
@@ -51,17 +52,21 @@ def assess( ingestion_date = pd.to_datetime( str(data.at[row_index, ingestion_date_column]), dayfirst=True ) - delta = (assessment_date - ingestion_date) + delta = assessment_date - ingestion_date age = delta.days / 365 measurement = exp(-decline_rate * age) if pd.notna(age) else 0 + certainty = 1 / (1 + decline_rate * decline_rate_variance * age) result = DQResult( mesTime=pd.Timestamp.now(), DQvalue=measurement, DQdimension=DQDimension.CURRENCY, - DQmetric="Currency", + DQmetric=self.__class__.__name__, columnNames=[col_name], rowIndex=row_index, + DQannotations={ + "certainty": certainty, + }, ) results.append(result) From cf3305caa8028d1fd038c3aba9a344b1e8b671cf Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Mon, 1 Dec 2025 22:59:32 +0100 Subject: [PATCH 08/32] add postgres docker setup --- docker_compose.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 docker_compose.yaml diff --git a/docker_compose.yaml b/docker_compose.yaml new file mode 100644 index 0000000..c36b44b --- /dev/null +++ b/docker_compose.yaml @@ -0,0 +1,14 @@ +services: + db: + image: postgres:18 + environment: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: metis_db + ports: + - "5432:5432" + volumes: + - pgdata:/var/lib/postgresql + +volumes: + pgdata: From 33ad9bbfeab5f61290f85ae68ae506b961645cc4 Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Sun, 7 Dec 2025 16:35:01 +0100 Subject: [PATCH 09/32] add basic logging --- metis/database_models.py | 12 +++++++----- metis/dq_orchestrator.py | 6 +++++- metis/utils/logger.py | 4 ++++ metis/writer/database_writer.py | 17 +++++++++++++---- requirements.txt | 3 ++- 5 files changed, 31 insertions(+), 11 deletions(-) create mode 100644 metis/utils/logger.py diff --git a/metis/database_models.py b/metis/database_models.py index 5e8d94d..6b9c390 100644 --- a/metis/database_models.py +++ b/metis/database_models.py @@ -5,16 +5,18 @@ from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column -class Base(DeclarativeBase): - pass - def register_models(results_table_name: str): + class Base(DeclarativeBase): + pass + class DQResultModel(Base): __tablename__ = results_table_name __table_args__ = {"extend_existing": True} id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True) - mes_time: Mapped[datetime] = mapped_column(DateTime(timezone=True), server_default=func.now()) + mes_time: Mapped[datetime] = mapped_column( + DateTime(timezone=True), server_default=func.now() + ) dq_value: Mapped[float] = mapped_column(Double) dq_dimension: Mapped[str] dq_metric: Mapped[str] @@ -24,4 +26,4 @@ class DQResultModel(Base): dataset: Mapped[str | None] table_name: Mapped[str | None] - return DQResultModel + return Base, DQResultModel diff --git a/metis/dq_orchestrator.py b/metis/dq_orchestrator.py index 8a652e6..3c57227 100644 --- a/metis/dq_orchestrator.py +++ b/metis/dq_orchestrator.py @@ -7,6 +7,7 @@ from metis.metric import Metric from metis.metric.config import MetricConfig from metis.utils.data_config import DataConfig +from metis.utils.logger import logger from metis.utils.result import DQResult from metis.writer.console_writer import ConsoleWriter from metis.writer.postgres_writer import PostgresWriter @@ -55,7 +56,9 @@ def load(self, data_loader_configs: List[str]) -> None: f"Unsupported loader type: {config_data.get('loader', None)}" ) - def assess(self, metrics: List[str], metric_configs: List[str | MetricConfig | None]) -> None: + def assess( + self, metrics: List[str], metric_configs: List[str | 
MetricConfig | None] + ) -> None: results = [] for metric, metric_config in zip(metrics, metric_configs): @@ -64,6 +67,7 @@ def assess(self, metrics: List[str], metric_configs: List[str | MetricConfig | N raise ValueError(f"Metric {metric} is not registered.") metric_instance: Metric = metric_class() for df_name, df in self.dataframes.items(): + logger.info(f"Assessing metric '{metric}'") incomplete_metric_results = metric_instance.assess( data=df, reference=self.reference_dataframes.get(df_name), diff --git a/metis/utils/logger.py b/metis/utils/logger.py new file mode 100644 index 0000000..4f1f38d --- /dev/null +++ b/metis/utils/logger.py @@ -0,0 +1,4 @@ +import logging + +logger = logging.getLogger("metis") +logging.basicConfig(level=logging.INFO) diff --git a/metis/writer/database_writer.py b/metis/writer/database_writer.py index 7a07c10..d639c12 100644 --- a/metis/writer/database_writer.py +++ b/metis/writer/database_writer.py @@ -2,8 +2,10 @@ from sqlalchemy import Engine from sqlalchemy.orm import Session +from tqdm import tqdm -from metis.database_models import Base, register_models +from metis.database_models import register_models +from metis.utils.numbers import format_count from metis.utils.result import DQResult from metis.writer.writer import DQResultWriter @@ -12,7 +14,9 @@ class DatabaseWriter(DQResultWriter): def __init__(self, writer_config: Dict) -> None: self.engine = self.create_engine(writer_config) - self.DQResultModel = register_models(writer_config.get("table_name", "dq_results")) + Base, self.DQResultModel = register_models( + writer_config.get("table_name", "dq_results") + ) Base.metadata.create_all(self.engine) def create_engine(self, writer_config: Dict) -> Engine: @@ -34,5 +38,10 @@ def write(self, results: List[DQResult]) -> None: ) for result in results ] - session.add_all(db_entities) - session.commit() + for batch in tqdm( + range(0, len(db_entities), 1000), + desc=f"Writing {format_count(len(db_entities))} DQ results to database", + unit="k results", + ): + session.add_all(db_entities[batch : batch + 1000]) + session.commit() diff --git a/requirements.txt b/requirements.txt index 8d1a079..ef9171c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ pandas psycopg2-binary sqlite3 ; sys_platform == "win32" # sqlite3 is included with Python, but this line is for completeness sqlalchemy==2.0.44 -nltk +nltk==3.9.2 +tqdm From a77a8904e0ecf92910c5fd683be1046bdd9b038e Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Sun, 7 Dec 2025 16:35:40 +0100 Subject: [PATCH 10/32] move common utilities into utils folder --- metis/metric/correctness/correctness.py | 36 +++++++++---------------- metis/utils/numbers.py | 12 +++++++++ metis/utils/strings.py | 17 ++++++++++++ 3 files changed, 42 insertions(+), 23 deletions(-) create mode 100644 metis/utils/numbers.py create mode 100644 metis/utils/strings.py diff --git a/metis/metric/correctness/correctness.py b/metis/metric/correctness/correctness.py index db85246..bd04e65 100644 --- a/metis/metric/correctness/correctness.py +++ b/metis/metric/correctness/correctness.py @@ -5,7 +5,9 @@ from metis.metric.config import MetricConfig from metis.metric.metric import Metric from metis.utils.dq_dimension import DQDimension +from metis.utils.numbers import clamp from metis.utils.result import DQResult +from metis.utils.strings import levenshtein_distance class Correctness(Metric): @@ -62,35 +64,23 @@ def measure_correctness( return 0, 1 if dtype == "int64" or dtype == "float64": return ( - 1 - - abs(value - reference_value) / 
max(abs(reference_value), abs(value)), + clamp( + 1 + - abs(value - reference_value) + / max(abs(reference_value), abs(value)), + 0, + 1, + ), 1, ) if dtype == "object": max_len = max(len(str(value)), len(str(reference_value))) correctness = ( - 1 - - self.levenshtein_distance(str(value), str(reference_value)) / max_len + 1 - levenshtein_distance(str(value), str(reference_value)) / max_len ) - return correctness, 1 / (1 + max_len / 10) # certainty decreases with length + return correctness, 1 / ( + 1 + max_len / 10 + ) # certainty decreases with length raise ValueError( f"Unsupported dtype for correctness measurement: {dtype} (value: {value}, reference_value: {reference_value})" ) - - # https://stackoverflow.com/a/32558749 - def levenshtein_distance(self, s1: str, s2: str) -> int: - if len(s1) > len(s2): - s1, s2 = s2, s1 - - distances = range(len(s1) + 1) - for i2, c2 in enumerate(s2): - distances_ = [i2 + 1] - for i1, c1 in enumerate(s1): - if c1 == c2: - distances_.append(distances[i1]) - else: - distances_.append( - 1 + min(distances[i1], distances[i1 + 1], distances_[-1]) - ) - distances = distances_ - return distances[-1] diff --git a/metis/utils/numbers.py b/metis/utils/numbers.py new file mode 100644 index 0000000..50a1e6e --- /dev/null +++ b/metis/utils/numbers.py @@ -0,0 +1,12 @@ +def clamp( + value: int | float, min_value: int | float, max_value: int | float +) -> int | float: + return max(min(value, max_value), min_value) + + +def format_count(value: int | float) -> str: + """Formats a large number with appropriate suffixes (K, M, B, T) for thousands, millions, billions, and trillions.""" + suffixes = ["", "K", "M", "B", "T"] + string_value = str(int(value)) + suffix = min((len(string_value) - 1) // 3, len(suffixes) - 1) + return f"{string_value[: -3 * suffix] if suffix > 0 else string_value}{suffixes[suffix]}" diff --git a/metis/utils/strings.py b/metis/utils/strings.py new file mode 100644 index 0000000..7be4035 --- /dev/null +++ b/metis/utils/strings.py @@ -0,0 +1,17 @@ +# https://stackoverflow.com/a/32558749 +def levenshtein_distance(s1: str, s2: str) -> int: + if len(s1) > len(s2): + s1, s2 = s2, s1 + + distances = range(len(s1) + 1) + for i2, c2 in enumerate(s2): + distances_ = [i2 + 1] + for i1, c1 in enumerate(s1): + if c1 == c2: + distances_.append(distances[i1]) + else: + distances_.append( + 1 + min(distances[i1], distances[i1 + 1], distances_[-1]) + ) + distances = distances_ + return distances[-1] From bf8e8c90e63bed2f17e8466f57375434f16a2fe2 Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Mon, 8 Dec 2025 12:18:49 +0100 Subject: [PATCH 11/32] add fallback writing to csv in case the writer errors --- metis/dq_orchestrator.py | 20 +++++++++++++++++++- metis/metric/consistency/rule_consistency.py | 2 +- metis/writer/database_writer.py | 3 ++- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/metis/dq_orchestrator.py b/metis/dq_orchestrator.py index 3c57227..edba47f 100644 --- a/metis/dq_orchestrator.py +++ b/metis/dq_orchestrator.py @@ -1,4 +1,6 @@ import json +import traceback +from pathlib import Path from typing import Dict, List, Type import pandas as pd @@ -78,7 +80,23 @@ def assess( result.dataset = self.data_paths[df_name] results.append(result) - self.writer.write(results) + try: + logger.info( + f"Writing {len(results)} results using {self.writer.__class__.__name__}" + ) + self.writer.write(results) + except Exception as e: + traceback.print_exc() + logger.error(f"Error writing results: {e}") + try: + logger.warning(f"Trying to save results to csv 
as fallback...") + fallback_df = pd.DataFrame([result.as_json() for result in results]) + fallback_file = Path("dq_results_fallback.csv") + fallback_df.to_csv(fallback_file, index=False) + logger.warning(f"Results saved to {fallback_file.absolute()}") + except Exception as e: + logger.error(f"Failed to save results to csv: {e}") + raise e def get_dq_result(self, query: str) -> List[DQResult]: return [] diff --git a/metis/metric/consistency/rule_consistency.py b/metis/metric/consistency/rule_consistency.py index e5dd4bf..f6e44c3 100644 --- a/metis/metric/consistency/rule_consistency.py +++ b/metis/metric/consistency/rule_consistency.py @@ -62,7 +62,7 @@ def assess( result = DQResult( mesTime=pd.Timestamp.now(), - DQvalue=measurement, + DQvalue=float(measurement), DQdimension=DQDimension.CONSISTENCY, DQmetric=self.__class__.__name__, columnNames=[col_name], diff --git a/metis/writer/database_writer.py b/metis/writer/database_writer.py index d639c12..c9e649c 100644 --- a/metis/writer/database_writer.py +++ b/metis/writer/database_writer.py @@ -44,4 +44,5 @@ def write(self, results: List[DQResult]) -> None: unit="k results", ): session.add_all(db_entities[batch : batch + 1000]) - session.commit() + session.flush() + session.commit() From cd9a0a7515498dad8d80f3d6a80b86bbd862e80f Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Mon, 8 Dec 2025 13:22:55 +0100 Subject: [PATCH 12/32] rename metrics according to naming scheme --- README.md | 19 +++++++++++-------- metis/dq_orchestrator.py | 2 +- metis/metric/__init__.py | 6 +++--- metis/metric/consistency/config.py | 5 +++-- ...cy.py => consistency_RuleBasedHinrichs.py} | 10 +++++----- ...correctness.py => correctness_Heinrich.py} | 2 +- metis/metric/metric.py | 6 ++++-- .../metric/{currency => timeliness}/config.py | 4 ++-- .../timeliness_Heinrich.py} | 15 ++++++++------- metis/utils/{logger.py => logging.py} | 0 10 files changed, 38 insertions(+), 31 deletions(-) rename metis/metric/consistency/{rule_consistency.py => consistency_RuleBasedHinrichs.py} (86%) rename metis/metric/correctness/{correctness.py => correctness_Heinrich.py} (98%) rename metis/metric/{currency => timeliness}/config.py (72%) rename metis/metric/{currency/currency.py => timeliness/timeliness_Heinrich.py} (82%) rename metis/utils/{logger.py => logging.py} (100%) diff --git a/README.md b/README.md index ce82852..4cef871 100644 --- a/README.md +++ b/README.md @@ -11,11 +11,11 @@ python -m demo.getting_started ## How to implement new metrics -To extend the Metis framework and add new data quality metrics, please check our interface for easy integration. +To extend the Metis framework and add new data quality metrics, please check our interface for easy integration. ````python -def assess(self, - data: pd.DataFrame, - reference: Union[pd.DataFrame, None] = None, +def assess(self, + data: pd.DataFrame, + reference: Union[pd.DataFrame, None] = None, metric_config: Union[str, None] = None) -> List[DQResult]: ```` Each metric should be a subclass of ```metis.metric.metric.Metric``` and implement the assess method. This method takes three arguments: @@ -29,14 +29,17 @@ The metric should return a list of ```metis.utils.result.DQResult```. This can b ### Metric naming convention -Metrics are organized by dimension (e.g., `completeness`, `minimality`), where one folder exists for each. -New metrics should follow the naming format: `{Granularity}_{DimensionName}_{Technique}` +Metrics are organized by dimension (e.g., `completeness`, `minimality`), where one folder exists for each. 
+New metrics inside those folders should follow the naming format: `{DimensionName}_{Technique}` -- **Granularity**: The level of analysis (e.g., `cell`, `row`, `column`, `table`) - **DimensionName**: The quality dimension being measured (e.g., `Completeness`, `Minimality`) - **Technique**: The calculation or method used (e.g., `MissingRatio`, `HierarchicalClustering`) -Examples: `column_completeness_MissingRatio`, `row_minimality_DuplicateRatio` +Examples: `completeness_MissingRatio`, `minimality_DuplicateRatio` + +The granularities at which a metric can operate are handled inside the metric itself and are configurable through the metric config. This enables each metric to handle reusing results for coarser granularities individually. + +- **Granularity**: The level of analysis (e.g., `cell`, `row`, `column`, `table`) ## Output: creating a DQResult diff --git a/metis/dq_orchestrator.py b/metis/dq_orchestrator.py index edba47f..d926873 100644 --- a/metis/dq_orchestrator.py +++ b/metis/dq_orchestrator.py @@ -9,7 +9,7 @@ from metis.metric import Metric from metis.metric.config import MetricConfig from metis.utils.data_config import DataConfig -from metis.utils.logger import logger +from metis.utils.logging import logger from metis.utils.result import DQResult from metis.writer.console_writer import ConsoleWriter from metis.writer.postgres_writer import PostgresWriter diff --git a/metis/metric/__init__.py b/metis/metric/__init__.py index 7c9ebde..deb907e 100644 --- a/metis/metric/__init__.py +++ b/metis/metric/__init__.py @@ -1,10 +1,10 @@ from .completeness.completeness import Completeness from .consistency.consistency import Consistency +from .consistency.consistency_RuleBasedHinrichs import ConsistencyRuleBasedHinrichs +from .correctness.correctness_Heinrich import CorrectnessHeinrich from .metric import Metric from .minimality.column_minimality_duplicateCount import ( column_minimality_duplicateCount, ) +from .timeliness.timeliness_Heinrich import TimelinessHeinrich from .validity.out_of_vocabulary import OutOfVocabulary -from .consistency.rule_consistency import RuleConsistency -from .correctness.correctness import Correctness -from .currency.currency import Currency diff --git a/metis/metric/consistency/config.py b/metis/metric/consistency/config.py index 2492df1..28ea43c 100644 --- a/metis/metric/consistency/config.py +++ b/metis/metric/consistency/config.py @@ -14,10 +14,11 @@ class ConsistencyConfig(MetricConfig): str, List[Callable[[Any], float]] ] # Dictionary of functions that define consistency rules for each column given by the key + @dataclass -class RuleConsistencyConfig(MetricConfig): +class ConsistencyRuleBasedHinrichsConfig(MetricConfig): """ - Configuration class for the RuleConsistency metric. + Configuration class for the RuleBasedHinrichs metric. 
""" rules: Dict[ diff --git a/metis/metric/consistency/rule_consistency.py b/metis/metric/consistency/consistency_RuleBasedHinrichs.py similarity index 86% rename from metis/metric/consistency/rule_consistency.py rename to metis/metric/consistency/consistency_RuleBasedHinrichs.py index f6e44c3..fcb5cc4 100644 --- a/metis/metric/consistency/rule_consistency.py +++ b/metis/metric/consistency/consistency_RuleBasedHinrichs.py @@ -4,13 +4,13 @@ import pandas as pd from metis.metric.config import MetricConfig -from metis.metric.consistency.config import RuleConsistencyConfig +from metis.metric.consistency.config import ConsistencyRuleBasedHinrichsConfig from metis.metric.metric import Metric from metis.utils.dq_dimension import DQDimension from metis.utils.result import DQResult -class RuleConsistency(Metric): +class ConsistencyRuleBasedHinrichs(Metric): def assess( self, data: pd.DataFrame, @@ -30,11 +30,11 @@ def assess( ) if isinstance(metric_config, str): raise ValueError( - "Metric configuration must be a RuleConsistencyConfig instance. JSON loading is not supported." + "Metric configuration must be a ConsistencyRuleBasedHinrichsConfig instance. JSON loading is not supported." ) - if not isinstance(metric_config, RuleConsistencyConfig): + if not isinstance(metric_config, ConsistencyRuleBasedHinrichsConfig): raise ValueError( - "Metric configuration must be a RuleConsistencyConfig instance." + "Metric configuration must be a ConsistencyRuleBasedHinrichsConfig instance." ) rules = metric_config.rules diff --git a/metis/metric/correctness/correctness.py b/metis/metric/correctness/correctness_Heinrich.py similarity index 98% rename from metis/metric/correctness/correctness.py rename to metis/metric/correctness/correctness_Heinrich.py index bd04e65..231c569 100644 --- a/metis/metric/correctness/correctness.py +++ b/metis/metric/correctness/correctness_Heinrich.py @@ -10,7 +10,7 @@ from metis.utils.strings import levenshtein_distance -class Correctness(Metric): +class CorrectnessHeinrich(Metric): def assess( self, data: pd.DataFrame, diff --git a/metis/metric/metric.py b/metis/metric/metric.py index 0ba5a5a..8350baf 100644 --- a/metis/metric/metric.py +++ b/metis/metric/metric.py @@ -1,6 +1,6 @@ import json from abc import ABC, abstractmethod -from typing import Any, List, TypeVar, cast, overload +from typing import Any, List, TypeVar import pandas as pd @@ -92,4 +92,6 @@ def load_config(self, config: Any, model: type[C]) -> C: if isinstance(config, str): return model(**json.loads(config)) - raise TypeError(f"Invalid config type: {type(config)}. Expected str or {model}.") + raise TypeError( + f"Invalid config type: {type(config)}. Expected str or {model}." + ) diff --git a/metis/metric/currency/config.py b/metis/metric/timeliness/config.py similarity index 72% rename from metis/metric/currency/config.py rename to metis/metric/timeliness/config.py index 62eb0ce..fe951bb 100644 --- a/metis/metric/currency/config.py +++ b/metis/metric/timeliness/config.py @@ -5,9 +5,9 @@ @dataclass -class CurrencyConfig(MetricConfig): +class TimelinessConfig(MetricConfig): """ - Configuration class for the Currency metric. + Configuration class for the TimelinessHeinrich metric. 
""" decline_rate_per_column: Dict[str, float] diff --git a/metis/metric/currency/currency.py b/metis/metric/timeliness/timeliness_Heinrich.py similarity index 82% rename from metis/metric/currency/currency.py rename to metis/metric/timeliness/timeliness_Heinrich.py index a3f4925..92006d1 100644 --- a/metis/metric/currency/currency.py +++ b/metis/metric/timeliness/timeliness_Heinrich.py @@ -4,13 +4,14 @@ import pandas as pd from metis.metric.config import MetricConfig -from metis.metric.currency.config import CurrencyConfig from metis.metric.metric import Metric +from metis.metric.timeliness.config import TimelinessConfig from metis.utils.dq_dimension import DQDimension +from metis.utils.logging import logger from metis.utils.result import DQResult -class Currency(Metric): +class TimelinessHeinrich(Metric): def assess( self, data: pd.DataFrame, @@ -18,18 +19,18 @@ def assess( metric_config: str | MetricConfig | None = None, ) -> List[DQResult]: """ - Assess the currency of the data by calculating the deviation from the reference. + Assess the timeliness of the data by calculating the deviation from the reference. :param data: DataFrame to assess. :param metric_config: Optional configuration for the metric. - :return: List of DQResult objects containing currency results. + :return: List of DQResult objects containing timeliness results. """ if not metric_config: raise ValueError( - "Metric configuration is required for currency assessment." + "Metric configuration is required for timeliness assessment." ) - config = self.load_config(metric_config, CurrencyConfig) + config = self.load_config(metric_config, TimelinessConfig) ingestion_date_column = config.ingestion_date_column assessment_date = pd.to_datetime( @@ -43,7 +44,7 @@ def assess( for col_name in data.columns: decline_rate = config.decline_rate_per_column.get(col_name) if decline_rate is None: - print( + logger.info( f"Decline rate for column '{col_name}' is not specified in the configuration. Skipping." 
) continue diff --git a/metis/utils/logger.py b/metis/utils/logging.py similarity index 100% rename from metis/utils/logger.py rename to metis/utils/logging.py From 164ae7fc1cf680a6069720b8e299f4cfcc907835 Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Mon, 8 Dec 2025 14:33:35 +0100 Subject: [PATCH 13/32] add csv writer --- metis/dq_orchestrator.py | 15 +++++++------- metis/metric/consistency/config.py | 10 +++++++++ metis/metric/timeliness/config.py | 8 ++++++++ metis/writer/csv_writer.py | 33 ++++++++++++++++++++++++++++++ 4 files changed, 59 insertions(+), 7 deletions(-) create mode 100644 metis/writer/csv_writer.py diff --git a/metis/dq_orchestrator.py b/metis/dq_orchestrator.py index d926873..3393183 100644 --- a/metis/dq_orchestrator.py +++ b/metis/dq_orchestrator.py @@ -1,6 +1,5 @@ import json import traceback -from pathlib import Path from typing import Dict, List, Type import pandas as pd @@ -12,9 +11,12 @@ from metis.utils.logging import logger from metis.utils.result import DQResult from metis.writer.console_writer import ConsoleWriter +from metis.writer.csv_writer import CSVWriter from metis.writer.postgres_writer import PostgresWriter from metis.writer.sqlite_writer import SQLiteWriter +FALLBACK_RESULTS_FILE = "dq_results_fallback.csv" + class DQOrchestrator: def __init__(self, writer_config_path: str | None = None) -> None: @@ -29,12 +31,14 @@ def __init__(self, writer_config_path: str | None = None) -> None: if writer_config_path: with open(writer_config_path, "r") as f: writer_config = json.load(f) - if not "writer_name" in writer_config: + if "writer_name" not in writer_config: raise ValueError("Writer config must include 'writer_name' field.") if writer_config["writer_name"] == "sqlite": self.writer = SQLiteWriter(writer_config) elif writer_config["writer_name"] == "postgres": self.writer = PostgresWriter(writer_config) + elif writer_config["writer_name"] == "csv": + self.writer = CSVWriter(writer_config) def load(self, data_loader_configs: List[str]) -> None: for config_path in data_loader_configs: @@ -89,11 +93,8 @@ def assess( traceback.print_exc() logger.error(f"Error writing results: {e}") try: - logger.warning(f"Trying to save results to csv as fallback...") - fallback_df = pd.DataFrame([result.as_json() for result in results]) - fallback_file = Path("dq_results_fallback.csv") - fallback_df.to_csv(fallback_file, index=False) - logger.warning(f"Results saved to {fallback_file.absolute()}") + logger.warning("Trying to save results to csv as fallback...") + CSVWriter({"path": FALLBACK_RESULTS_FILE}).write(results) except Exception as e: logger.error(f"Failed to save results to csv: {e}") raise e diff --git a/metis/metric/consistency/config.py b/metis/metric/consistency/config.py index 28ea43c..1b1ae53 100644 --- a/metis/metric/consistency/config.py +++ b/metis/metric/consistency/config.py @@ -1,3 +1,4 @@ +import inspect from dataclasses import dataclass from typing import Any, Callable, Dict, List @@ -24,3 +25,12 @@ class ConsistencyRuleBasedHinrichsConfig(MetricConfig): rules: Dict[ str, List[Callable[[Any], float]] ] # Dictionary of functions that define consistency rules for each column given by the key + + def to_json(self): + return { + "name": self.__class__.__name__, + "rules": { + column: [inspect.getsource(rule).strip() for rule in rules] + for column, rules in self.rules.items() + }, + } diff --git a/metis/metric/timeliness/config.py b/metis/metric/timeliness/config.py index fe951bb..c3eb358 100644 --- a/metis/metric/timeliness/config.py +++ 
b/metis/metric/timeliness/config.py @@ -13,3 +13,11 @@ class TimelinessConfig(MetricConfig): decline_rate_per_column: Dict[str, float] ingestion_date_column: str simulated_assessment_date: str | None = None + + def to_json(self): + return { + "name": self.__class__.__name__, + "decline_rate_per_column": self.decline_rate_per_column, + "ingestion_date_column": self.ingestion_date_column, + "simulated_assessment_date": self.simulated_assessment_date, + } diff --git a/metis/writer/csv_writer.py b/metis/writer/csv_writer.py new file mode 100644 index 0000000..cceb7b9 --- /dev/null +++ b/metis/writer/csv_writer.py @@ -0,0 +1,33 @@ +from pathlib import Path +from typing import Dict, List + +import pandas as pd + +from metis.utils.logging import logger +from metis.utils.result import DQResult + + +class CSVWriter: + def __init__(self, writer_config: Dict) -> None: + if "path" not in writer_config: + raise ValueError( + f"{self.__class__.__name__} requires a 'path' in the configuration." + ) + + self.path = Path(writer_config["path"]) + if not self.path.suffix == ".csv": + raise ValueError( + f"{self.__class__.__name__} path must end with .csv extension." + ) + + if self.path.exists(): + logger.warning( + f"{self.__class__.__name__} path {self.path} already exists and will be overwritten." + ) + + def write(self, results: List[DQResult]) -> None: + self.path.parent.mkdir(parents=True, exist_ok=True) + pd.DataFrame([result.as_json() for result in results]).to_csv( + self.path, index=False + ) + logger.info(f"Results saved to {self.path.absolute()}") From 1c3d0a5adc80f6d6713ebc6fe7671f6efe8fc035 Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Fri, 12 Dec 2025 15:47:12 +0100 Subject: [PATCH 14/32] rename metric files --- metis/metric/__init__.py | 6 +++--- ...uleBasedHinrichs.py => consistency_ruleBasedHinrichs.py} | 0 .../{correctness_Heinrich.py => correctness_heinrich.py} | 0 .../{timeliness_Heinrich.py => timeliness_heinrich.py} | 0 4 files changed, 3 insertions(+), 3 deletions(-) rename metis/metric/consistency/{consistency_RuleBasedHinrichs.py => consistency_ruleBasedHinrichs.py} (100%) rename metis/metric/correctness/{correctness_Heinrich.py => correctness_heinrich.py} (100%) rename metis/metric/timeliness/{timeliness_Heinrich.py => timeliness_heinrich.py} (100%) diff --git a/metis/metric/__init__.py b/metis/metric/__init__.py index deb907e..616189c 100644 --- a/metis/metric/__init__.py +++ b/metis/metric/__init__.py @@ -1,10 +1,10 @@ from .completeness.completeness import Completeness from .consistency.consistency import Consistency -from .consistency.consistency_RuleBasedHinrichs import ConsistencyRuleBasedHinrichs -from .correctness.correctness_Heinrich import CorrectnessHeinrich +from .consistency.consistency_ruleBasedHinrichs import ConsistencyRuleBasedHinrichs +from .correctness.correctness_heinrich import CorrectnessHeinrich from .metric import Metric from .minimality.column_minimality_duplicateCount import ( column_minimality_duplicateCount, ) -from .timeliness.timeliness_Heinrich import TimelinessHeinrich +from .timeliness.timeliness_heinrich import TimelinessHeinrich from .validity.out_of_vocabulary import OutOfVocabulary diff --git a/metis/metric/consistency/consistency_RuleBasedHinrichs.py b/metis/metric/consistency/consistency_ruleBasedHinrichs.py similarity index 100% rename from metis/metric/consistency/consistency_RuleBasedHinrichs.py rename to metis/metric/consistency/consistency_ruleBasedHinrichs.py diff --git a/metis/metric/correctness/correctness_Heinrich.py 
b/metis/metric/correctness/correctness_heinrich.py similarity index 100% rename from metis/metric/correctness/correctness_Heinrich.py rename to metis/metric/correctness/correctness_heinrich.py diff --git a/metis/metric/timeliness/timeliness_Heinrich.py b/metis/metric/timeliness/timeliness_heinrich.py similarity index 100% rename from metis/metric/timeliness/timeliness_Heinrich.py rename to metis/metric/timeliness/timeliness_heinrich.py From e1362eabaf2a7cc5df789e8746cc0ff81fcf4957 Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Mon, 15 Dec 2025 15:41:46 +0100 Subject: [PATCH 15/32] add tuple rules to metric ConsistencyRuleBasedHinrichs --- metis/metric/consistency/config.py | 44 +++++---- .../consistency_ruleBasedHinrichs.py | 98 ++++++++++++------- .../metric/timeliness/timeliness_heinrich.py | 8 +- 3 files changed, 93 insertions(+), 57 deletions(-) diff --git a/metis/metric/consistency/config.py b/metis/metric/consistency/config.py index 1b1ae53..6f5f423 100644 --- a/metis/metric/consistency/config.py +++ b/metis/metric/consistency/config.py @@ -2,35 +2,39 @@ from dataclasses import dataclass from typing import Any, Callable, Dict, List -from metis.metric.config import MetricConfig - - -@dataclass -class ConsistencyConfig(MetricConfig): - """ - Configuration class for the Consistency metric. - """ +import pandas as pd - rules: Dict[ - str, List[Callable[[Any], float]] - ] # Dictionary of functions that define consistency rules for each column given by the key +from metis.metric.config import MetricConfig -@dataclass +@dataclass(kw_only=True) class ConsistencyRuleBasedHinrichsConfig(MetricConfig): """ - Configuration class for the RuleBasedHinrichs metric. + Configuration class for the ConsistencyRuleBasedHinrichs metric. + + Accepts a dictionary mapping attribute names to lists of functions that define consistency rules. 
+ :param attribute_rules: Dictionary of functions that define consistency rules for each column given by the key + :param tuple_rules: List of functions that define consistency rules for entire tuples """ - rules: Dict[ - str, List[Callable[[Any], float]] - ] # Dictionary of functions that define consistency rules for each column given by the key + attribute_rules: Dict[str, List[Callable[[Any], float]]] | None = None + + tuple_rules: List[Callable[[pd.Series], float]] | None = None def to_json(self): return { "name": self.__class__.__name__, - "rules": { - column: [inspect.getsource(rule).strip() for rule in rules] - for column, rules in self.rules.items() - }, + "attribute_rules": ( + { + column: [inspect.getsource(rule).strip() for rule in rules] + for column, rules in self.attribute_rules.items() + } + if self.attribute_rules + else {} + ), + "tuple_rules": ( + [inspect.getsource(rule).strip() for rule in self.tuple_rules] + if self.tuple_rules + else [] + ), } diff --git a/metis/metric/consistency/consistency_ruleBasedHinrichs.py b/metis/metric/consistency/consistency_ruleBasedHinrichs.py index fcb5cc4..31053a8 100644 --- a/metis/metric/consistency/consistency_ruleBasedHinrichs.py +++ b/metis/metric/consistency/consistency_ruleBasedHinrichs.py @@ -1,5 +1,5 @@ from math import sqrt -from typing import List, Union +from typing import Any, Callable, List, Union import pandas as pd @@ -7,10 +7,15 @@ from metis.metric.consistency.config import ConsistencyRuleBasedHinrichsConfig from metis.metric.metric import Metric from metis.utils.dq_dimension import DQDimension +from metis.utils.logging import logger as main_logger from metis.utils.result import DQResult class ConsistencyRuleBasedHinrichs(Metric): + def __init__(self) -> None: + super().__init__() + self.logger = main_logger.getChild(self.__class__.__name__) + def assess( self, data: pd.DataFrame, @@ -26,60 +31,83 @@ def assess( """ if metric_config is None: raise ValueError( - "Metric configuration is required for rule-based consistency assessment." + f"Metric configuration is required for metric {ConsistencyRuleBasedHinrichs.__name__} but None was provided." ) if isinstance(metric_config, str): raise ValueError( - "Metric configuration must be a ConsistencyRuleBasedHinrichsConfig instance. JSON loading is not supported." + f"Metric configuration must be an instance of {ConsistencyRuleBasedHinrichsConfig.__name__}. JSON loading is not supported." ) if not isinstance(metric_config, ConsistencyRuleBasedHinrichsConfig): raise ValueError( - "Metric configuration must be a ConsistencyRuleBasedHinrichsConfig instance." + f"Metric configuration must be an instance of {ConsistencyRuleBasedHinrichsConfig.__name__} but was of type {type(metric_config)}." 
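A minimal end-to-end sketch of how these rules can be wired up; the column names, rule bodies, and data are made-up assumptions:

```python
import pandas as pd

from metis.metric import ConsistencyRuleBasedHinrichs
from metis.metric.consistency.config import ConsistencyRuleBasedHinrichsConfig


def age_is_plausible(value) -> float:
    # Degree of violation: 0.0 means fully consistent.
    return 0.0 if 0 <= value <= 120 else 1.0


def start_before_end(row: pd.Series) -> float:
    return 0.0 if row["start"] <= row["end"] else 1.0


df = pd.DataFrame({"age": [25, 140], "start": [1, 5], "end": [2, 4]})
config = ConsistencyRuleBasedHinrichsConfig(
    attribute_rules={"age": [age_is_plausible]},
    tuple_rules=[start_before_end],
)
results = ConsistencyRuleBasedHinrichs().assess(df, metric_config=config)
```

Since `to_json` serializes rules via `inspect.getsource`, rules should be defined in a source file rather than an interactive session.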
) - rules = metric_config.rules + attribute_rules = metric_config.attribute_rules or {} + tuple_rules = metric_config.tuple_rules or [] results: List[DQResult] = [] - total_rows = len(data) + + if tuple_rules: + degree_of_violation: pd.Series[float] = data.apply( + lambda x: self.sum_rules(tuple_rules, x), axis="columns" + ) + + dq_measurements = 1 / (1 + degree_of_violation) + min_quality = dq_measurements.min() + for row_index, dq_value in dq_measurements.items(): + certainty = sqrt( + (1 - dq_value + min_quality) * min_quality + ) + + results.append( + DQResult( + mesTime=pd.Timestamp.now(), + DQvalue=dq_value, + DQdimension=DQDimension.CONSISTENCY, + DQmetric=self.__class__.__name__, + columnNames=[], + rowIndex=int(str(row_index)), + DQannotations={ + "certainty": certainty, + }, + ) + ) for col_name in data.columns: - column_rules = rules.get(col_name, []) - if len(column_rules) == 0: - print( + column_rules = attribute_rules.get(col_name, []) + if not column_rules: + self.logger.info( f"No consistency rules defined for column '{col_name}'. Skipping." ) continue - max_violation = 0.0 - column_results: List[DQResult] = [] + degree_of_violation: pd.Series[float] = data[col_name].apply( + lambda x: self.sum_rules(column_rules, x) + ) - for row_index in range(total_rows): - degree_of_violation = sum( - rule(data.at[row_index, col_name]) for rule in column_rules - ) - measurement = 1 / (1 + degree_of_violation) - max_violation = max(max_violation, degree_of_violation) - - result = DQResult( - mesTime=pd.Timestamp.now(), - DQvalue=float(measurement), - DQdimension=DQDimension.CONSISTENCY, - DQmetric=self.__class__.__name__, - columnNames=[col_name], - rowIndex=row_index, - ) - column_results.append(result) + dq_measurements = 1 / (1 + degree_of_violation) + min_quality = dq_measurements.min() - maximum_rules_coverage = 1 / (1 + max_violation) - for result in column_results: + for row_index, dq_value in dq_measurements.items(): certainty = sqrt( - (1 - result.DQvalue + maximum_rules_coverage) - * maximum_rules_coverage + (1 - dq_value + min_quality) * min_quality ) - result.DQannotations = { - "certainty": certainty, - } - results.extend(column_results) + results.append( + DQResult( + mesTime=pd.Timestamp.now(), + DQvalue=dq_value, + DQdimension=DQDimension.CONSISTENCY, + DQmetric=self.__class__.__name__, + columnNames=[col_name], + rowIndex=int(str(row_index)), + DQannotations={ + "certainty": certainty, + }, + ) + ) return results + + def sum_rules(self, rules: List[Callable], value: Any) -> float: + return float(sum(rule(value) for rule in rules)) diff --git a/metis/metric/timeliness/timeliness_heinrich.py b/metis/metric/timeliness/timeliness_heinrich.py index 92006d1..4194d02 100644 --- a/metis/metric/timeliness/timeliness_heinrich.py +++ b/metis/metric/timeliness/timeliness_heinrich.py @@ -7,11 +7,15 @@ from metis.metric.metric import Metric from metis.metric.timeliness.config import TimelinessConfig from metis.utils.dq_dimension import DQDimension -from metis.utils.logging import logger +from metis.utils.logging import logger as main_logger from metis.utils.result import DQResult class TimelinessHeinrich(Metric): + def __init__(self) -> None: + super().__init__() + self.logger = main_logger.getChild(self.__class__.__name__) + def assess( self, data: pd.DataFrame, @@ -44,7 +48,7 @@ def assess( for col_name in data.columns: decline_rate = config.decline_rate_per_column.get(col_name) if decline_rate is None: - logger.info( + self.logger.info( f"Decline rate for column '{col_name}' is not 
specified in the configuration. Skipping." ) continue From 3f22d9d94c94b0210af7ec41b4c6f99f98877718 Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Mon, 15 Dec 2025 20:23:59 +0100 Subject: [PATCH 16/32] remove certainty from correctness and timeliness for now again --- .../correctness/correctness_heinrich.py | 33 +++++++------------ .../metric/timeliness/timeliness_heinrich.py | 5 --- 2 files changed, 11 insertions(+), 27 deletions(-) diff --git a/metis/metric/correctness/correctness_heinrich.py b/metis/metric/correctness/correctness_heinrich.py index 231c569..0a3ddad 100644 --- a/metis/metric/correctness/correctness_heinrich.py +++ b/metis/metric/correctness/correctness_heinrich.py @@ -1,4 +1,4 @@ -from typing import List, Tuple +from typing import List import pandas as pd @@ -34,7 +34,7 @@ def assess( for col_name in data.columns: for row_index in range(total_rows): - measurement, certainty = self.measure_correctness( + measurement = self.measure_correctness( data.at[row_index, col_name], reference_value=reference.at[row_index, col_name], dtype=data[col_name].dtype, @@ -42,35 +42,26 @@ def assess( result = DQResult( mesTime=pd.Timestamp.now(), - DQvalue=float(measurement), + DQvalue=measurement, DQdimension=DQDimension.CORRECTNESS, DQmetric=self.__class__.__name__, columnNames=[col_name], rowIndex=row_index, - DQannotations={ - "certainty": certainty, - }, ) results.append(result) return results - def measure_correctness( - self, value, *, reference_value, dtype - ) -> Tuple[float, float]: + def measure_correctness(self, value, *, reference_value, dtype) -> float: if value == reference_value: - return 1, 1 + return 1 if pd.isna(value) or pd.isna(reference_value): - return 0, 1 + return 0 if dtype == "int64" or dtype == "float64": - return ( - clamp( - 1 - - abs(value - reference_value) - / max(abs(reference_value), abs(value)), - 0, - 1, - ), + return clamp( + 1 + - abs(value - reference_value) / max(abs(reference_value), abs(value)), + 0, 1, ) if dtype == "object": @@ -78,9 +69,7 @@ def measure_correctness( correctness = ( 1 - levenshtein_distance(str(value), str(reference_value)) / max_len ) - return correctness, 1 / ( - 1 + max_len / 10 - ) # certainty decreases with length + return correctness raise ValueError( f"Unsupported dtype for correctness measurement: {dtype} (value: {value}, reference_value: {reference_value})" ) diff --git a/metis/metric/timeliness/timeliness_heinrich.py b/metis/metric/timeliness/timeliness_heinrich.py index 4194d02..f0f4d17 100644 --- a/metis/metric/timeliness/timeliness_heinrich.py +++ b/metis/metric/timeliness/timeliness_heinrich.py @@ -43,7 +43,6 @@ def assess( results = [] total_rows = len(data) - decline_rate_variance = 0.1 for col_name in data.columns: decline_rate = config.decline_rate_per_column.get(col_name) @@ -60,7 +59,6 @@ def assess( delta = assessment_date - ingestion_date age = delta.days / 365 measurement = exp(-decline_rate * age) if pd.notna(age) else 0 - certainty = 1 / (1 + decline_rate * decline_rate_variance * age) result = DQResult( mesTime=pd.Timestamp.now(), @@ -69,9 +67,6 @@ def assess( DQmetric=self.__class__.__name__, columnNames=[col_name], rowIndex=row_index, - DQannotations={ - "certainty": certainty, - }, ) results.append(result) From 5bd62cb2e1f916d367b7e75c1b3f3c16c6a59759 Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Wed, 17 Dec 2025 12:03:44 +0100 Subject: [PATCH 17/32] rename metrics and configs and add some documentation --- README.md | 4 ++- metis/metric/__init__.py | 6 ++--- .../consistency_ruleBasedHinrichs.py | 26 
+++++++++---------- ...> consistency_ruleBasedHinrichs_config.py} | 4 +-- .../correctness/correctness_heinrich.py | 4 +-- .../metric/timeliness/timeliness_heinrich.py | 20 +++++++++----- ...onfig.py => timeliness_heinrich_config.py} | 8 ++++-- metis/utils/dq_dimension.py | 4 ++- .../levenshtein_distance.py} | 1 + 9 files changed, 47 insertions(+), 30 deletions(-) rename metis/metric/consistency/{config.py => consistency_ruleBasedHinrichs_config.py} (90%) rename metis/metric/timeliness/{config.py => timeliness_heinrich_config.py} (57%) rename metis/utils/{strings.py => similarity_measures/levenshtein_distance.py} (89%) diff --git a/README.md b/README.md index 4cef871..ff8f351 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,9 @@ New metrics inside those folders should follow the naming format: `{DimensionNam - **DimensionName**: The quality dimension being measured (e.g., `Completeness`, `Minimality`) - **Technique**: The calculation or method used (e.g., `MissingRatio`, `HierarchicalClustering`) -Examples: `completeness_MissingRatio`, `minimality_DuplicateRatio` +Examples: `completeness_missingRatio`, `minimality_duplicateRatio` + +The file name and class name of each metric should be equal. If a metric has a specific config class, the name of the config class should be `{MetricName}_config` (e.g., `completeness_missingRatio_config`). The granularities at which a metric can operate are handled inside the metric itself and are configurable through the metric config. This enables each metric to handle reusing results for coarser granularities individually. diff --git a/metis/metric/__init__.py b/metis/metric/__init__.py index 616189c..1b93adc 100644 --- a/metis/metric/__init__.py +++ b/metis/metric/__init__.py @@ -1,10 +1,10 @@ from .completeness.completeness import Completeness from .consistency.consistency import Consistency -from .consistency.consistency_ruleBasedHinrichs import ConsistencyRuleBasedHinrichs -from .correctness.correctness_heinrich import CorrectnessHeinrich +from .consistency.consistency_ruleBasedHinrichs import consistency_ruleBasedHinrichs +from .correctness.correctness_heinrich import correctness_heinrich from .metric import Metric from .minimality.column_minimality_duplicateCount import ( column_minimality_duplicateCount, ) -from .timeliness.timeliness_heinrich import TimelinessHeinrich +from .timeliness.timeliness_heinrich import timeliness_heinrich from .validity.out_of_vocabulary import OutOfVocabulary diff --git a/metis/metric/consistency/consistency_ruleBasedHinrichs.py b/metis/metric/consistency/consistency_ruleBasedHinrichs.py index 31053a8..e4fafcd 100644 --- a/metis/metric/consistency/consistency_ruleBasedHinrichs.py +++ b/metis/metric/consistency/consistency_ruleBasedHinrichs.py @@ -4,14 +4,16 @@ import pandas as pd from metis.metric.config import MetricConfig -from metis.metric.consistency.config import ConsistencyRuleBasedHinrichsConfig +from metis.metric.consistency.consistency_ruleBasedHinrichs_config import ( + consistency_ruleBasedHinrichs_config, +) from metis.metric.metric import Metric from metis.utils.dq_dimension import DQDimension from metis.utils.logging import logger as main_logger from metis.utils.result import DQResult -class ConsistencyRuleBasedHinrichs(Metric): +class consistency_ruleBasedHinrichs(Metric): def __init__(self) -> None: super().__init__() self.logger = main_logger.getChild(self.__class__.__name__) @@ -23,23 +25,25 @@ def assess( metric_config: str | None | MetricConfig = None, ) -> List[DQResult]: """ - Assess the 
consistency of the data by checking each value for the given rules. + Assess the consistency of the data by checking the given rules for each value. The rules are defined in the metric configuration. There are attribute rules that apply to individual columns and tuple rules that apply to entire rows. The quality measurement is calculated as 1 / (1 + degree_of_violation), where degree_of_violation is the sum of the result of all applicable rules for a given value/row. + Additionally, this metric assesses the certainty of the measurement based on the minimum quality in the assessed data. The certainty is calculated as sqrt((1 - dq_value) * (1 - min_quality)), where dq_value is the quality measurement for the specific value/row and min_quality is the lowest quality measurement observed in the dataset. :param data: DataFrame to assess. + :param reference: Optional reference DataFrame (not used in this metric). :param metric_config: Optional configuration for the metric. :return: List of DQResult objects containing consistency results. """ if metric_config is None: raise ValueError( - f"Metric configuration is required for metric {ConsistencyRuleBasedHinrichs.__name__} but None was provided." + f"Metric configuration is required for metric {consistency_ruleBasedHinrichs.__name__} but None was provided." ) if isinstance(metric_config, str): raise ValueError( - f"Metric configuration must be an instance of {ConsistencyRuleBasedHinrichsConfig.__name__}. JSON loading is not supported." + f"Metric configuration must be an instance of {consistency_ruleBasedHinrichs_config.__name__}. JSON loading is not supported." ) - if not isinstance(metric_config, ConsistencyRuleBasedHinrichsConfig): + if not isinstance(metric_config, consistency_ruleBasedHinrichs_config): raise ValueError( - f"Metric configuration must be an instance of {ConsistencyRuleBasedHinrichsConfig.__name__} but was of type {type(metric_config)}." + f"Metric configuration must be an instance of {consistency_ruleBasedHinrichs_config.__name__} but was of type {type(metric_config)}." ) attribute_rules = metric_config.attribute_rules or {} @@ -55,9 +59,7 @@ def assess( dq_measurements = 1 / (1 + degree_of_violation) min_quality = dq_measurements.min() for row_index, dq_value in dq_measurements.items(): - certainty = sqrt( - (1 - dq_value + min_quality) * min_quality - ) + certainty = sqrt((1 - dq_value) * (1 - min_quality)) results.append( DQResult( @@ -89,9 +91,7 @@ def assess( min_quality = dq_measurements.min() for row_index, dq_value in dq_measurements.items(): - certainty = sqrt( - (1 - dq_value + min_quality) * min_quality - ) + certainty = sqrt((1 - dq_value + min_quality) * min_quality) results.append( DQResult( diff --git a/metis/metric/consistency/config.py b/metis/metric/consistency/consistency_ruleBasedHinrichs_config.py similarity index 90% rename from metis/metric/consistency/config.py rename to metis/metric/consistency/consistency_ruleBasedHinrichs_config.py index 6f5f423..6eef08a 100644 --- a/metis/metric/consistency/config.py +++ b/metis/metric/consistency/consistency_ruleBasedHinrichs_config.py @@ -8,9 +8,9 @@ @dataclass(kw_only=True) -class ConsistencyRuleBasedHinrichsConfig(MetricConfig): +class consistency_ruleBasedHinrichs_config(MetricConfig): """ - Configuration class for the ConsistencyRuleBasedHinrichs metric. + Configuration class for the consistency_ruleBasedHinrichs metric. Accepts a dictionary mapping attribute names to lists of functions that define consistency rules. 
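For intuition, the certainty formula from this docstring can be traced by hand on assumed numbers:

```python
from math import sqrt

dq_value = 1 / (1 + 1)  # one fully violated rule -> quality 0.5
min_quality = 0.25      # assumed worst measurement in the dataset
certainty = sqrt((1 - dq_value) * (1 - min_quality))
print(round(certainty, 3))  # 0.612
```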
:param attribute_rules: Dictionary of functions that define consistency rules for each column given by the key diff --git a/metis/metric/correctness/correctness_heinrich.py b/metis/metric/correctness/correctness_heinrich.py index 0a3ddad..2c98de2 100644 --- a/metis/metric/correctness/correctness_heinrich.py +++ b/metis/metric/correctness/correctness_heinrich.py @@ -7,10 +7,10 @@ from metis.utils.dq_dimension import DQDimension from metis.utils.numbers import clamp from metis.utils.result import DQResult -from metis.utils.strings import levenshtein_distance +from metis.utils.similarity_measures.levenshtein_distance import levenshtein_distance -class CorrectnessHeinrich(Metric): +class correctness_heinrich(Metric): def assess( self, data: pd.DataFrame, diff --git a/metis/metric/timeliness/timeliness_heinrich.py b/metis/metric/timeliness/timeliness_heinrich.py index f0f4d17..3f3c709 100644 --- a/metis/metric/timeliness/timeliness_heinrich.py +++ b/metis/metric/timeliness/timeliness_heinrich.py @@ -5,13 +5,15 @@ from metis.metric.config import MetricConfig from metis.metric.metric import Metric -from metis.metric.timeliness.config import TimelinessConfig +from metis.metric.timeliness.timeliness_heinrich_config import ( + timeliness_heinrich_config, +) from metis.utils.dq_dimension import DQDimension from metis.utils.logging import logger as main_logger from metis.utils.result import DQResult -class TimelinessHeinrich(Metric): +class timeliness_heinrich(Metric): def __init__(self) -> None: super().__init__() self.logger = main_logger.getChild(self.__class__.__name__) @@ -23,10 +25,12 @@ def assess( metric_config: str | MetricConfig | None = None, ) -> List[DQResult]: """ - Assess the timeliness of the data by calculating the deviation from the reference. + Assess the timeliness of the data by calculating how likely each cell is to be out of date based on a reference date and a decline rate. The reference date is either provided in the configuration or defaults to the current date. + The formula used is: timeliness = exp(-decline_rate * age), where age and decline_rate are measured in years. The age is calculated as the difference between the reference date and the ingestion date of the tuple (defined by the ingestion_date_column in the configuration). :param data: DataFrame to assess. - :param metric_config: Optional configuration for the metric. + :param reference: Optional reference DataFrame (not used in this metric). + :param metric_config: Configuration for the metric (required). :return: List of DQResult objects containing timeliness results. """ if not metric_config: raise ValueError( "Metric configuration is required for timeliness assessment." 
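A hand-run of the decline formula from the docstring, on assumed numbers:

```python
from math import exp

decline_rate = 0.4  # assumed decline per year for one column
age_years = 2.5     # assessment date minus ingestion date, in years
timeliness = exp(-decline_rate * age_years)
print(round(timeliness, 3))  # 0.368
```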
) - config = self.load_config(metric_config, TimelinessConfig) + config = self.load_config(metric_config, timeliness_heinrich_config) ingestion_date_column = config.ingestion_date_column assessment_date = pd.to_datetime( @@ -63,7 +67,7 @@ def assess( result = DQResult( mesTime=pd.Timestamp.now(), DQvalue=measurement, - DQdimension=DQDimension.CURRENCY, + DQdimension=DQDimension.TIMELINESS, DQmetric=self.__class__.__name__, columnNames=[col_name], rowIndex=row_index, @@ -71,3 +75,7 @@ def assess( results.append(result) return results + + return results + + return results diff --git a/metis/metric/timeliness/config.py b/metis/metric/timeliness/timeliness_heinrich_config.py similarity index 57% rename from metis/metric/timeliness/config.py rename to metis/metric/timeliness/timeliness_heinrich_config.py index c3eb358..5797751 100644 --- a/metis/metric/timeliness/config.py +++ b/metis/metric/timeliness/timeliness_heinrich_config.py @@ -5,9 +5,13 @@ @dataclass -class TimelinessConfig(MetricConfig): +class timeliness_heinrich_config(MetricConfig): """ - Configuration class for the TimelinessHeinrich metric. + Configuration class for the timeliness_heinrich metric. + + :param decline_rate_per_column: Decline rate specific to each column + :param ingestion_date_column: Name of the column containing the ingestion date of each tuple + :param simulated_assessment_date: Optional simulated assessment date in string format. If not provided, the current date will be used. """ decline_rate_per_column: Dict[str, float] diff --git a/metis/utils/dq_dimension.py b/metis/utils/dq_dimension.py index 4b423e4..0566d22 100644 --- a/metis/utils/dq_dimension.py +++ b/metis/utils/dq_dimension.py @@ -2,7 +2,9 @@ class DQDimension(StrEnum): + """Data Quality Dimensions Enum. Primarily used for labeling DQResults inside each metric implementation with the DQ Dimension they assess.""" + CONSISTENCY = "Consistency" CORRECTNESS = "Correctness" COMPLETENESS = "Completeness" - CURRENCY = "Currency" + TIMELINESS = "Timeliness" diff --git a/metis/utils/strings.py b/metis/utils/similarity_measures/levenshtein_distance.py similarity index 89% rename from metis/utils/strings.py rename to metis/utils/similarity_measures/levenshtein_distance.py index 7be4035..ef2a011 100644 --- a/metis/utils/strings.py +++ b/metis/utils/similarity_measures/levenshtein_distance.py @@ -1,5 +1,6 @@ # https://stackoverflow.com/a/32558749 def levenshtein_distance(s1: str, s2: str) -> int: + """Calculate the Levenshtein distance between s1 and s2.""" if len(s1) > len(s2): s1, s2 = s2, s1 From 9e0c2ff1f3d7fd61b96b8d3da0a939f541b74e1c Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Mon, 22 Dec 2025 12:36:39 +0100 Subject: [PATCH 18/32] add consistency_ruleBasedPipino metric --- metis/metric/__init__.py | 1 + .../consistency_ruleBasedHinrichs.py | 31 +++-- .../consistency_ruleBasedPipino.py | 121 ++++++++++++++++++ .../consistency_ruleBasedPipino_config.py | 40 ++++++ metis/metric/metric.py | 6 +- .../metric/timeliness/timeliness_heinrich.py | 4 - 6 files changed, 184 insertions(+), 19 deletions(-) create mode 100644 metis/metric/consistency/consistency_ruleBasedPipino.py create mode 100644 metis/metric/consistency/consistency_ruleBasedPipino_config.py diff --git a/metis/metric/__init__.py b/metis/metric/__init__.py index 1b93adc..45e1731 100644 --- a/metis/metric/__init__.py +++ b/metis/metric/__init__.py @@ -1,6 +1,7 @@ from .completeness.completeness import Completeness from .consistency.consistency import Consistency from 
.consistency.consistency_ruleBasedHinrichs import consistency_ruleBasedHinrichs +from .consistency.consistency_ruleBasedPipino import consistency_ruleBasedPipino from .correctness.correctness_heinrich import correctness_heinrich from .metric import Metric from .minimality.column_minimality_duplicateCount import ( diff --git a/metis/metric/consistency/consistency_ruleBasedHinrichs.py b/metis/metric/consistency/consistency_ruleBasedHinrichs.py index e4fafcd..603573e 100644 --- a/metis/metric/consistency/consistency_ruleBasedHinrichs.py +++ b/metis/metric/consistency/consistency_ruleBasedHinrichs.py @@ -9,15 +9,10 @@ ) from metis.metric.metric import Metric from metis.utils.dq_dimension import DQDimension -from metis.utils.logging import logger as main_logger from metis.utils.result import DQResult class consistency_ruleBasedHinrichs(Metric): - def __init__(self) -> None: - super().__init__() - self.logger = main_logger.getChild(self.__class__.__name__) - def assess( self, data: pd.DataFrame, @@ -59,8 +54,6 @@ def assess( dq_measurements = 1 / (1 + degree_of_violation) min_quality = dq_measurements.min() for row_index, dq_value in dq_measurements.items(): - certainty = sqrt((1 - dq_value) * (1 - min_quality)) - results.append( DQResult( mesTime=pd.Timestamp.now(), @@ -70,17 +63,26 @@ def assess( columnNames=[], rowIndex=int(str(row_index)), DQannotations={ - "certainty": certainty, + "certainty": self.certainty(dq_value, min_quality) }, ) ) + extraneous_rules = set(attribute_rules.keys()) - set(data.columns) + if extraneous_rules: + self.logger.warning( + f"The following columns have consistency rules defined but are not present in the data: {extraneous_rules}. These rules will be ignored." + ) + + extraneous_columns = set(data.columns) - set(attribute_rules.keys()) + if extraneous_columns: + self.logger.info( + f"The following columns are present in the data but have no consistency rules defined: {extraneous_columns}. These columns will be skipped." + ) + for col_name in data.columns: column_rules = attribute_rules.get(col_name, []) if not column_rules: - self.logger.info( - f"No consistency rules defined for column '{col_name}'. Skipping." 
- ) continue degree_of_violation: pd.Series[float] = data[col_name].apply( @@ -91,8 +93,6 @@ def assess( min_quality = dq_measurements.min() for row_index, dq_value in dq_measurements.items(): - certainty = sqrt((1 - dq_value + min_quality) * min_quality) - results.append( DQResult( mesTime=pd.Timestamp.now(), @@ -102,7 +102,7 @@ def assess( columnNames=[col_name], rowIndex=int(str(row_index)), DQannotations={ - "certainty": certainty, + "certainty": self.certainty(dq_value, min_quality) }, ) ) @@ -111,3 +111,6 @@ def assess( def sum_rules(self, rules: List[Callable], value: Any) -> float: return float(sum(rule(value) for rule in rules)) + + def certainty(self, dq_value: float, min_quality: float) -> float: + return sqrt((1 - dq_value) * (1 - min_quality)) diff --git a/metis/metric/consistency/consistency_ruleBasedPipino.py b/metis/metric/consistency/consistency_ruleBasedPipino.py new file mode 100644 index 0000000..291ecf2 --- /dev/null +++ b/metis/metric/consistency/consistency_ruleBasedPipino.py @@ -0,0 +1,121 @@ +from math import sqrt +from typing import Any, Callable, List, Union + +import pandas as pd + +from metis.metric.config import MetricConfig +from metis.metric.consistency.consistency_ruleBasedPipino_config import ( + consistency_ruleBasedPipino_config, +) +from metis.metric.metric import Metric +from metis.utils.dq_dimension import DQDimension +from metis.utils.result import DQResult + + +class consistency_ruleBasedPipino(Metric): + def assess( + self, + data: pd.DataFrame, + reference: Union[pd.DataFrame, None] = None, + metric_config: str | None | MetricConfig = None, + ) -> List[DQResult]: + """ + Assess the consistency of the data by checking the given rules for each value. The rules are defined in the metric configuration. There are attribute rules that apply to individual columns and tuple rules that apply to entire rows. The quality measurement is calculated as 1 - degree_of_violation / N, where degree_of_violation is the sum of the result of all applicable rules for a given value/row and N is the total number of rules. + Additionally, this metric assesses the certainty of the measurement based on the minimum quality in the assessed data. The certainty is calculated as sqrt((1 - dq_value) * (1 - min_quality)), where dq_value is the quality measurement for the specific value/row and min_quality is the lowest quality measurement observed in the dataset. + + :param data: DataFrame to assess. + :param reference: Optional reference DataFrame (not used in this metric). + :param metric_config: Mandatory configuration for the metric. + :return: List of DQResult objects containing consistency results. + """ + if metric_config is None: + raise ValueError( + f"Metric configuration is required for metric {consistency_ruleBasedPipino.__name__} but None was provided." + ) + if isinstance(metric_config, str): + raise ValueError( + f"Metric configuration must be an instance of {consistency_ruleBasedPipino_config.__name__}. JSON loading is not supported." + ) + if not isinstance(metric_config, consistency_ruleBasedPipino_config): + raise ValueError( + f"Metric configuration must be an instance of {consistency_ruleBasedPipino_config.__name__} but was of type {type(metric_config)}." 
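The difference to the Hinrichs variant is only the scoring function, which a small sketch with assumed counts makes visible:

```python
violations = 2.0  # summed degree of violation for one value/row
n_rules = 4       # number of applicable rules; Pipino normalizes by this

pipino_score = 1 - violations / n_rules  # 0.5: linear in the violation share
hinrichs_score = 1 / (1 + violations)    # ~0.333: hyperbolic in the raw degree
```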
+ ) + + attribute_rules = metric_config.attribute_rules or {} + tuple_rules = metric_config.tuple_rules or [] + + results: List[DQResult] = [] + + if tuple_rules: + degree_of_violation: pd.Series[float] = data.apply( + lambda x: self.sum_rules(tuple_rules, x), axis="columns" + ) + + dq_measurements = 1 - degree_of_violation / len(tuple_rules) + min_quality = dq_measurements.min() + for row_index, dq_value in dq_measurements.items(): + results.append( + self.create_result( + dq_value, + None, + int(str(row_index)), + self.certainty(dq_value, min_quality), + ) + ) + + extraneous_rules = set(attribute_rules.keys()) - set(data.columns) + if extraneous_rules: + self.logger.warning( + f"The following columns have consistency rules defined but are not present in the data: {extraneous_rules}. These rules will be ignored." + ) + + extraneous_columns = set(data.columns) - set(attribute_rules.keys()) + if extraneous_columns: + self.logger.info( + f"The following columns are present in the data but have no consistency rules defined: {extraneous_columns}. These columns will be skipped." + ) + + for col_name in data.columns: + column_rules = attribute_rules.get(col_name, []) + if not column_rules: + continue + + degree_of_violation: pd.Series[float] = data[col_name].apply( + lambda x: self.sum_rules(column_rules, x) + ) + + dq_measurements = 1 - degree_of_violation / len(column_rules) + min_quality = dq_measurements.min() + + for row_index, dq_value in dq_measurements.items(): + results.append( + self.create_result( + dq_value, + col_name, + int(str(row_index)), + self.certainty(dq_value, min_quality), + ) + ) + + return results + + def sum_rules(self, rules: List[Callable], value: Any) -> float: + return float(sum(rule(value) for rule in rules)) + + def certainty(self, dq_value: float, min_quality: float) -> float: + return sqrt((1 - dq_value) * (1 - min_quality)) + + def create_result( + self, dq_value: float, col_name: str | None, row_index: int, certainty: float + ) -> DQResult: + return DQResult( + mesTime=pd.Timestamp.now(), + DQvalue=dq_value, + DQdimension=DQDimension.CONSISTENCY, + DQmetric=self.__class__.__name__, + columnNames=[col_name] if col_name else [], + rowIndex=row_index, + DQannotations={ + "certainty": certainty, + }, + ) diff --git a/metis/metric/consistency/consistency_ruleBasedPipino_config.py b/metis/metric/consistency/consistency_ruleBasedPipino_config.py new file mode 100644 index 0000000..984db67 --- /dev/null +++ b/metis/metric/consistency/consistency_ruleBasedPipino_config.py @@ -0,0 +1,40 @@ +import inspect +from dataclasses import dataclass +from typing import Any, Callable, Dict, List + +import pandas as pd + +from metis.metric.config import MetricConfig + + +@dataclass(kw_only=True) +class consistency_ruleBasedPipino_config(MetricConfig): + """ + Configuration class for the consistency_ruleBasedPipino metric. + + Accepts a dictionary mapping attribute names to lists of functions that define consistency rules. 
+ :param attribute_rules: Dictionary of functions that define consistency rules for each column given by the key + :param tuple_rules: List of functions that define consistency rules for entire tuples + """ + + attribute_rules: Dict[str, List[Callable[[Any], float]]] | None = None + + tuple_rules: List[Callable[[pd.Series], float]] | None = None + + def to_json(self): + return { + "name": self.__class__.__name__, + "attribute_rules": ( + { + column: [inspect.getsource(rule).strip() for rule in rules] + for column, rules in self.attribute_rules.items() + } + if self.attribute_rules + else {} + ), + "tuple_rules": ( + [inspect.getsource(rule).strip() for rule in self.tuple_rules] + if self.tuple_rules + else [] + ), + } diff --git a/metis/metric/metric.py b/metis/metric/metric.py index 8350baf..bf355bf 100644 --- a/metis/metric/metric.py +++ b/metis/metric/metric.py @@ -5,6 +5,7 @@ import pandas as pd from metis.metric.config import MetricConfig +from metis.utils.logging import logger as main_logger from metis.utils.result import DQResult C = TypeVar("C", bound=MetricConfig) @@ -13,7 +14,7 @@ class Metric(ABC): """ Abstract base class for metrics. - All metric classes should inherit from this class and implement the `compute` method. + All metric classes should inherit from this class and implement the `assess` method. """ registry = {} @@ -22,6 +23,9 @@ def __init_subclass__(cls): super().__init_subclass__() Metric.registry[cls.__name__] = cls + def __init__(self) -> None: + self.logger = main_logger.getChild(self.__class__.__name__) + @abstractmethod def assess( self, diff --git a/metis/metric/timeliness/timeliness_heinrich.py b/metis/metric/timeliness/timeliness_heinrich.py index 3f3c709..74bbc21 100644 --- a/metis/metric/timeliness/timeliness_heinrich.py +++ b/metis/metric/timeliness/timeliness_heinrich.py @@ -75,7 +75,3 @@ def assess( results.append(result) return results - - return results - - return results From 4131dad66fc632000ad274746500861ee6ea4c47 Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Tue, 20 Jan 2026 21:13:43 +0100 Subject: [PATCH 19/32] Add completeness metrics based on null values and dmvs detected by FAHES --- metis/metric/completeness/completeness.py | 3 +- .../completeness_nullAndDMVRate.py | 126 ++++++++++++++++++ .../completeness_nullAndDMVRate_config.py | 34 +++++ .../completeness/completeness_nullRate.py | 83 ++++++++++++ .../completeness_nullRate_config.py | 34 +++++ metis/utils/__init__.py | 0 .../disguised_missing_values/__init__.py | 0 .../fahes/__init__.py | 0 .../disguised_missing_values/fahes/fahes.py | 83 ++++++++++++ .../fahes/lib/makefile | 14 ++ 10 files changed, 376 insertions(+), 1 deletion(-) create mode 100644 metis/metric/completeness/completeness_nullAndDMVRate.py create mode 100644 metis/metric/completeness/completeness_nullAndDMVRate_config.py create mode 100644 metis/metric/completeness/completeness_nullRate.py create mode 100644 metis/metric/completeness/completeness_nullRate_config.py create mode 100644 metis/utils/__init__.py create mode 100644 metis/utils/disguised_missing_values/__init__.py create mode 100644 metis/utils/disguised_missing_values/fahes/__init__.py create mode 100644 metis/utils/disguised_missing_values/fahes/fahes.py create mode 100755 metis/utils/disguised_missing_values/fahes/lib/makefile diff --git a/metis/metric/completeness/completeness.py b/metis/metric/completeness/completeness.py index f08a3cd..bfe17f9 100644 --- a/metis/metric/completeness/completeness.py +++ b/metis/metric/completeness/completeness.py @@ -19,6 
+19,7 @@ def assess( Assess the completeness of the data by checking for missing values. :param data: DataFrame to assess. + :param reference: Optional reference DataFrame (not used in this metric). :param metric_config: Optional configuration for the metric. :return: List of DQResult objects containing completeness results. """ @@ -26,7 +27,7 @@ def assess( total_rows = len(data) for column in data.columns: - missing_count = data[column].isnull().sum() + missing_count = data[column].isna().sum() completeness = (total_rows - int(missing_count)) / total_rows result = DQResult( diff --git a/metis/metric/completeness/completeness_nullAndDMVRate.py b/metis/metric/completeness/completeness_nullAndDMVRate.py new file mode 100644 index 0000000..febb71d --- /dev/null +++ b/metis/metric/completeness/completeness_nullAndDMVRate.py @@ -0,0 +1,126 @@ +from typing import List + +import pandas as pd + +from metis.metric.completeness.completeness_nullAndDMVRate_config import ( + completeness_nullAndDMVRate_config, +) +from metis.metric.config import MetricConfig +from metis.metric.metric import Metric +from metis.utils.disguised_missing_values.fahes.fahes import ( + FAHES_PRECISION, + FAHES_RECALL, + run_fahes, +) +from metis.utils.dq_dimension import DQDimension +from metis.utils.result import DQResult + +IS_VALID_MARKER = 0 +IS_NULL_MARKER = 1 +IS_DMV_MARKER = 2 + + +class completeness_nullAndDMVRate(Metric): + def assess( + self, + data: pd.DataFrame, + reference: pd.DataFrame | None = None, + metric_config: str | MetricConfig | None = None, + ) -> List[DQResult]: + """ + Assess the completeness of the data by checking for null values and disguised missing values. + + :param data: DataFrame to assess. + :param reference: Optional reference DataFrame (not used in this metric). + :param metric_config: Optional configuration for the metric. + :return: List of DQResult objects containing completeness results. 
+ """ + + config = self.load_config(metric_config, completeness_nullAndDMVRate_config) + + results = [] + + dmvs = run_fahes(data) + self.logger.info(f"Detected DMVs:\n{dmvs}") + + marked_cells = pd.DataFrame( + IS_VALID_MARKER, index=data.index, columns=data.columns + ) + marked_cells[data.isna()] = IS_NULL_MARKER + for _, dmv_row in dmvs.iterrows(): + col = dmv_row["Attribute Name"] + val = dmv_row["DMV"] + marked_cells.loc[data[col] == val, col] = IS_DMV_MARKER + + def counts(marks: pd.Series): + return ( + (marks == IS_NULL_MARKER).sum(), + (marks == IS_DMV_MARKER).sum(), + len(marks), + ) + + def completeness(marks: pd.Series): + null_count, dmv_count, total_count = counts(marks) + return (total_count - null_count - dmv_count) / total_count + + def certainty(marks: pd.Series): + null_count, dmv_count, total_count = counts(marks) + return self.certainty(null_count, dmv_count, total_count) + + aggregated_marks = marked_cells.agg( + [completeness, certainty], + axis=config.aggregation_axis, + ) + + if config.aggregation_axis == "index": + aggregated_marks = aggregated_marks.T + + if config.aggregate_all: + table_completeness = aggregated_marks["completeness"].mean() + table_certainty = aggregated_marks["certainty"].mean() + result = DQResult( + mesTime=pd.Timestamp.now(), + DQvalue=table_completeness, + DQdimension=DQDimension.COMPLETENESS, + DQmetric=self.__class__.__name__, + columnNames=data.columns.tolist(), + DQannotations={"certainty": float(table_certainty)}, + ) + results.append(result) + return results + + for index, row in aggregated_marks.iterrows(): + row_index = ( + int(str(index)) if config.aggregation_axis == "columns" else None + ) + col_names = ( + data.columns.tolist() + if config.aggregation_axis == "columns" + else [str(index)] + ) + + result = DQResult( + mesTime=pd.Timestamp.now(), + DQvalue=row["completeness"], + DQdimension=DQDimension.COMPLETENESS, + DQmetric=self.__class__.__name__, + columnNames=col_names, + rowIndex=row_index, + DQannotations={"certainty": float(row["certainty"])}, + ) + results.append(result) + + return results + + # TODO: assess the confidence of FAHES using publicly available datasets with DMVs and checking the precision and recall of the detected DMVs (or some other metric, because given the detected dmvs, we actually only look at true positives and false positives); confidence per datatype probably useful + def certainty(self, null_count: int, dmv_count: int, total_count: int): + # certainty = P(missing rate is correct) = P(nulls are correct) * P(DMVs are correct) = P(detected nulls are correct) * P(detected DMVs are correct) * P(all DMVs found) = (1 - NULLABLE_COLUMN_RATE)^(null_count) * (1)^(null_count) * (FAHES_PRECISION)^(dmv_count) * (FAHES_RECALL)^(unflagged_count) + minimum = min(FAHES_PRECISION, FAHES_RECALL) ** total_count + # maximum = max(FAHES_PRECISION, FAHES_RECALL) ** total_count + certainty = float( + ( + FAHES_PRECISION**dmv_count + * FAHES_RECALL ** (total_count - null_count - dmv_count) + ) + ) + return (certainty - minimum) / (1 - minimum) diff --git a/metis/metric/completeness/completeness_nullAndDMVRate_config.py b/metis/metric/completeness/completeness_nullAndDMVRate_config.py new file mode 100644 index 0000000..c063343 --- /dev/null +++ b/metis/metric/completeness/completeness_nullAndDMVRate_config.py @@ -0,0 +1,34 @@ +from dataclasses import dataclass +from typing import Literal + +from metis.metric.config import MetricConfig + + +@dataclass +class completeness_nullAndDMVRate_config(MetricConfig): + """ + 
Configuration class for the completeness_nullAndDMVRate metric. + + :param aggregation_axis: Axis along which to aggregate completeness ('index': aggregate each column; 'columns': aggregate each row). + :param aggregate_all: Whether to aggregate all completeness results into a single value for the whole input data. + """ + + aggregation_axis: Literal["index", "columns"] = "columns" + aggregate_all: bool = False + + def to_json(self): + return { + "name": self.__class__.__name__, + "aggregation_axis": self.aggregation_axis, + "aggregate_all": self.aggregate_all, + } + + def validate(self): + if self.aggregation_axis not in ["index", "columns"]: + raise ValueError( + f"aggregation_axis must be either 'index' or 'columns' but was {self.aggregation_axis}" + ) + if not isinstance(self.aggregate_all, bool): + raise ValueError( + f"aggregate_all must be a boolean value but was {type(self.aggregate_all)}" + ) diff --git a/metis/metric/completeness/completeness_nullRate.py b/metis/metric/completeness/completeness_nullRate.py new file mode 100644 index 0000000..e8b781f --- /dev/null +++ b/metis/metric/completeness/completeness_nullRate.py @@ -0,0 +1,83 @@ +from typing import List + +import pandas as pd + +from metis.metric.completeness.completeness_nullRate_config import ( + completeness_nullRate_config, +) +from metis.metric.config import MetricConfig +from metis.metric.metric import Metric +from metis.utils.dq_dimension import DQDimension +from metis.utils.result import DQResult + + +class completeness_nullRate(Metric): + def assess( + self, + data: pd.DataFrame, + reference: pd.DataFrame | None = None, + metric_config: str | MetricConfig | None = None, + ) -> List[DQResult]: + """ + Assess the completeness of the data by checking for null values. + + :param data: DataFrame to assess. + :param reference: Optional reference DataFrame (not used in this metric). + :param metric_config: Optional configuration for the metric. + :return: List of DQResult objects containing completeness results. 
+ """ + + config = self.load_config(metric_config, completeness_nullRate_config) + + results = [] + + na_mask = data.isna() + + def counts(marks: pd.Series): + return marks.sum(), len(marks) + + def completeness(marks: pd.Series): + null_count, total_count = counts(marks) + return (total_count - null_count) / total_count + + aggregated_marks = na_mask.agg( + [completeness], + axis=config.aggregation_axis, + ) + + if config.aggregation_axis == "index": + aggregated_marks = aggregated_marks.T + + if config.aggregate_all: + table_completeness = aggregated_marks["completeness"].mean() + result = DQResult( + mesTime=pd.Timestamp.now(), + DQvalue=table_completeness, + DQdimension=DQDimension.COMPLETENESS, + DQmetric=self.__class__.__name__, + columnNames=data.columns.tolist(), + ) + results.append(result) + return results + + for index, row in aggregated_marks.iterrows(): + row_index = ( + int(str(index)) if config.aggregation_axis == "columns" else None + ) + col_names = ( + data.columns.tolist() + if config.aggregation_axis == "columns" + else [str(index)] + ) + + result = DQResult( + mesTime=pd.Timestamp.now(), + DQvalue=row["completeness"], + DQdimension=DQDimension.COMPLETENESS, + DQmetric=self.__class__.__name__, + columnNames=col_names, + rowIndex=row_index, + ) + results.append(result) + + return results diff --git a/metis/metric/completeness/completeness_nullRate_config.py b/metis/metric/completeness/completeness_nullRate_config.py new file mode 100644 index 0000000..25edee0 --- /dev/null +++ b/metis/metric/completeness/completeness_nullRate_config.py @@ -0,0 +1,34 @@ +from dataclasses import dataclass +from typing import Literal + +from metis.metric.config import MetricConfig + + +@dataclass +class completeness_nullRate_config(MetricConfig): + """ + Configuration class for the completeness_nullRate metric. + + :param aggregation_axis: Axis along which to aggregate completeness ('index': aggregate each column; 'columns': aggregate each row). + :param aggregate_all: Whether to aggregate all completeness results into a single value for the whole input data. 
+ """ + + aggregation_axis: Literal["index", "columns"] = "columns" + aggregate_all: bool = False + + def to_json(self): + return { + "name": self.__class__.__name__, + "aggregation_axis": self.aggregation_axis, + "aggregate_all": self.aggregate_all, + } + + def validate(self): + if self.aggregation_axis not in ["index", "columns"]: + raise ValueError( + f"aggregation_axis must be either 'index' or 'columns' but was {self.aggregation_axis}" + ) + if not isinstance(self.aggregate_all, bool): + raise ValueError( + f"aggregate_all must be a boolean value but was {type(self.aggregate_all)}" + ) diff --git a/metis/utils/__init__.py b/metis/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/metis/utils/disguised_missing_values/__init__.py b/metis/utils/disguised_missing_values/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/metis/utils/disguised_missing_values/fahes/__init__.py b/metis/utils/disguised_missing_values/fahes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/metis/utils/disguised_missing_values/fahes/fahes.py b/metis/utils/disguised_missing_values/fahes/fahes.py new file mode 100644 index 0000000..a89b9a9 --- /dev/null +++ b/metis/utils/disguised_missing_values/fahes/fahes.py @@ -0,0 +1,83 @@ +import ctypes +import os +import tempfile +from pathlib import Path +from statistics import mean + +import pandas as pd + +FAHES_PRECISION = mean([0.384, 0.484, 0.385, 0.371, 0.522]) +FAHES_RECALL = mean([0.952, 0.978, 0.87, 0.929, 0.725]) +FAHES_F1 = 2 * FAHES_PRECISION * FAHES_RECALL / (FAHES_PRECISION + FAHES_RECALL) + + +def call_fahes(tab_full_name, output_dir): + path = Path(__file__).parent.resolve() / "lib" / "FAHES_Code" / "libFahes.so" + if not path.exists(): + raise FileNotFoundError( + f"Fahes shared library not found at: {path}. Please clone https://github.com/qcri/FAHES_Code.git into {path.parent} and compile it using the provided makefile at {path.parent.parent / 'makefile'}." + ) + + LP_c_char = ctypes.POINTER(ctypes.c_char) + LP_LP_c_char = ctypes.POINTER(LP_c_char) + try: + Fahes = ctypes.CDLL(str(path), use_errno=True) + except OSError as e: + raise ImportError(f"Failed to load Fahes shared library: {path}") from e + + try: + Fahes.main.argtypes = (ctypes.c_int, LP_LP_c_char) + except AttributeError as e: + raise AttributeError( + "Fahes library missing 'main' or has unexpected signature" + ) from e + + ctypes.set_errno(0) + args = [str(path), tab_full_name, output_dir, "4"] + argc = len(args) + argv = (LP_c_char * (argc + 1))() + for i, arg in enumerate(args): + enc_arg = arg.encode("utf-8") + argv[i] = ctypes.create_string_buffer(enc_arg) + + rc = Fahes.main(argc, argv) + if rc != 0: + err = ctypes.get_errno() + err_msg = os.strerror(err) if err else "Unknown C error" + raise RuntimeError(f"Fahes.main failed (rc={rc}, errno={err}: {err_msg})") + + +# Based on https://github.com/qcri/Fahes_Demo.git +def run_fahes(data: Path | str | pd.DataFrame) -> pd.DataFrame: + """ + Run FAHES on the given data file and return the resulting DataFrame. The resulting DataFrame contains the disguised missing values identified by FAHES. + Example resulting DataFrame structure: + + | Table Name | Attribute Name | DMV | Frequency | Detecting Tool | + |------------|----------------|-------------------|-----------|----------------| + | adult.csv | workclass | ? | 183 | Rand | + + :param data: Path to the input CSV data file or DataFrame containing the data. 
Warning: if a DataFrame is provided, it will be saved to a temporary CSV file before processing. + :return: DataFrame with disguised missing values identified by FAHES. + """ + tmp_file = None + + try: + if isinstance(data, pd.DataFrame): + tmp_file = tempfile.NamedTemporaryFile(suffix=".csv") + data.to_csv(tmp_file.name, index=False) + data_file_path = Path(tmp_file.name) + else: + data_file_path = Path(data) + + if not data_file_path.exists(): + raise FileNotFoundError(f"Data file not found: {data_file_path}") + + with tempfile.TemporaryDirectory() as results_dir: + call_fahes(str(data_file_path.absolute()), results_dir) + result_file = Path(results_dir) / ("DMV_" + data_file_path.name) + + return pd.read_csv(result_file) + finally: + if tmp_file is not None: + tmp_file.close() diff --git a/metis/utils/disguised_missing_values/fahes/lib/makefile b/metis/utils/disguised_missing_values/fahes/lib/makefile new file mode 100755 index 0000000..aafd392 --- /dev/null +++ b/metis/utils/disguised_missing_values/fahes/lib/makefile @@ -0,0 +1,14 @@ +LDFLAGS = -shared +TARGET_LIB = ../libFahes.so +CHILD_MAKEFILE_DIR = FAHES_Code/src +ARGS = -C $(CHILD_MAKEFILE_DIR) TARGET=$(TARGET_LIB) LFLAGS+=$(LDFLAGS) + +.PHONY: all +all: + $(MAKE) $(ARGS) $@ + +.PHONY: clean +clean: + $(MAKE) $(ARGS) clean $@ +rmo: + $(MAKE) $(ARGS) rmo $@ From 557e400ee226e03b554c8a5062cf57827132698c Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Tue, 20 Jan 2026 21:15:56 +0100 Subject: [PATCH 20/32] Add certainty to timeliness metric --- metis/__init__.py | 0 .../completeness_nullAndDMVRate.py | 2 - metis/metric/config.py | 7 ++ metis/metric/metric.py | 25 +++-- .../metric/timeliness/timeliness_heinrich.py | 91 ++++++++++++++++--- metis/utils/datetime/datetime_precision.py | 31 +++++++ metis/utils/logging.py | 19 ++++ 7 files changed, 151 insertions(+), 24 deletions(-) create mode 100644 metis/__init__.py create mode 100644 metis/utils/datetime/datetime_precision.py diff --git a/metis/__init__.py b/metis/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/metis/metric/completeness/completeness_nullAndDMVRate.py b/metis/metric/completeness/completeness_nullAndDMVRate.py index febb71d..09a6f6a 100644 --- a/metis/metric/completeness/completeness_nullAndDMVRate.py +++ b/metis/metric/completeness/completeness_nullAndDMVRate.py @@ -112,11 +112,9 @@ def certainty(marks: pd.Series): return results - # TODO: assess the confidence of FAHES using publicly available datasets with DMVs and checking the precision and recall of the detected DMVs (or some other metric, because given the detected dmvs, we actually only look at true positives and false positives); confidence per datatype probably useful def certainty(self, null_count: int, dmv_count: int, total_count: int): # certainty = P(missing rate is correct) = P(nulls are correct) * P(DMVs are correct) = P(detected nulls are correct) * P(detected DMVs are correct) * P(all DMVs found) = (1 - NULLABLE_COLUMN_RATE)^(null_count) * (1)^(null_count) * (FAHES_PRECISION)^(dmv_count) * (FAHES_RECALL)^(unflagged_count) minimum = min(FAHES_PRECISION, FAHES_RECALL) ** total_count - # maximum = max(FAHES_PRECISION, FAHES_RECALL) ** total_count certainty = float( ( FAHES_PRECISION**dmv_count diff --git a/metis/metric/config.py b/metis/metric/config.py index 3fec237..9cdbb39 100644 --- a/metis/metric/config.py +++ b/metis/metric/config.py @@ -18,3 +18,10 @@ def from_dict(cls, config_dict: dict): :return: An instance of the configuration class. 
""" return cls(**config_dict) + + def validate(self): + """ + Validate the configuration parameters. + This method should be overridden by subclasses to implement specific validation logic. + """ + pass diff --git a/metis/metric/metric.py b/metis/metric/metric.py index bf355bf..f858bbb 100644 --- a/metis/metric/metric.py +++ b/metis/metric/metric.py @@ -81,20 +81,31 @@ def assess( def load_config(self, config: Any, model: type[C]) -> C: """ - Load metric-specific configuration from a JSON file. + Load metric-specific configuration from a JSON file path, JSON string or the correct config model instance. Also validates the configuration using its validate method. - :param config: Path to the JSON configuration file or a JSON string. + :param config: Path to the JSON configuration file, a JSON string or an instance of the config model. :return: An instance of the metric-specific configuration class. """ if isinstance(config, model): + config.validate() return config - if isinstance(config, str) and config.endswith(".json"): - with open(config, "r") as f: - return model(**json.load(f)) - if isinstance(config, str): - return model(**json.loads(config)) + try: + if config.endswith(".json"): + with open(config, "r") as f: + config_dict = json.load(f) + else: + config_dict = json.loads(config) + + parsed_config = model(**config_dict) + except Exception as e: + raise ValueError( + f"Failed to load metric configuration from {config}: {e}" + ) from e + + parsed_config.validate() + return parsed_config raise TypeError( f"Invalid config type: {type(config)}. Expected str or {model}." diff --git a/metis/metric/timeliness/timeliness_heinrich.py b/metis/metric/timeliness/timeliness_heinrich.py index 74bbc21..8fd5acb 100644 --- a/metis/metric/timeliness/timeliness_heinrich.py +++ b/metis/metric/timeliness/timeliness_heinrich.py @@ -1,6 +1,7 @@ -from math import exp +from math import exp, floor from typing import List +import numpy as np import pandas as pd from metis.metric.config import MetricConfig @@ -8,8 +9,10 @@ from metis.metric.timeliness.timeliness_heinrich_config import ( timeliness_heinrich_config, ) +from metis.utils.datetime.datetime_precision import determine_datetime_precision from metis.utils.dq_dimension import DQDimension from metis.utils.logging import logger as main_logger +from metis.utils.logging import warn_unconfigured_columns from metis.utils.result import DQResult @@ -46,32 +49,90 @@ def assess( ) results = [] - total_rows = len(data) + + warn_unconfigured_columns( + self.logger, + set(data.columns), + set(config.decline_rate_per_column.keys()), + "decline rates", + ) + + ingestion_dates = pd.to_datetime(data[ingestion_date_column]) + ages_in_days = ( + (assessment_date - ingestion_dates).dt.total_seconds() / 60 / 60 / 24 + ) + precision_of_dates = data[ingestion_date_column].apply( + determine_datetime_precision + ) + age_and_precision = pd.DataFrame( + {"age": ages_in_days, "precision": precision_of_dates} + ) for col_name in data.columns: decline_rate = config.decline_rate_per_column.get(col_name) if decline_rate is None: - self.logger.info( - f"Decline rate for column '{col_name}' is not specified in the configuration. Skipping." 
- )
continue

- for row_index in range(total_rows):
- ingestion_date = pd.to_datetime(
- str(data.at[row_index, ingestion_date_column]), dayfirst=True
- )
- delta = assessment_date - ingestion_date
- age = delta.days / 365
- measurement = exp(-decline_rate * age) if pd.notna(age) else 0
-
+ timeliness = pd.Series(np.exp(-decline_rate * ages_in_days))
+ certainty = age_and_precision.apply(
+ lambda row: self.certainty(
+ row["age"],
+ decline_rate or 0,
+ row["precision"],
+ ),
+ axis=1,
+ )
+ for (index, timeliness_value), (_, certainty_value) in zip(
+ timeliness.items(), certainty.items()
+ ):
result = DQResult(
mesTime=pd.Timestamp.now(),
- DQvalue=measurement,
+ DQvalue=timeliness_value,
DQdimension=DQDimension.TIMELINESS,
DQmetric=self.__class__.__name__,
columnNames=[col_name],
- rowIndex=row_index,
+ rowIndex=int(str(index)),
+ DQannotations={
+ "certainty": certainty_value,
+ },
)
results.append(result)

return results
+
+ def certainty(self, age: float, decline_rate: float, precision: str) -> float:
+ """
+ Calculate the certainty of the timeliness measurement based on age, decline rate, and datetime precision.
+
+ :param age: The age of the data in days.
+ :param decline_rate: The decline rate per day.
+ :param precision: The precision of the datetime ('year', 'month', 'day', 'hour', 'minute', 'second', 'microsecond').
+ :return: The certainty of the measurement.
+ """
+ lower_age_bound, upper_age_bound = self.age_precision_bounds(age, precision)
+ # max_quality_difference = abs(exp(-decline_rate) - 1)
+ unscaled_difference = abs(
+ exp(-decline_rate * upper_age_bound) - exp(-decline_rate * lower_age_bound)
+ )
+ return 1 - unscaled_difference
+
+ def age_precision_bounds(self, age: float, precision: str):
+ """
+ Compute the lower and upper bound of the age implied by the precision of the underlying timestamp.
+
+ :param age: The age of the data in days.
+ :param precision: The precision of the datetime ('year', 'month', 'day', 'hour', 'minute', 'second', 'microsecond').
+ :return: A (lower_bound, upper_bound) tuple of ages in days that are indistinguishable at the given precision.
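+ Example (illustrative): age=10.3 days at precision 'day' yields the bounds (10.0, 11.0).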
+ """ + precision_factors = { + "year": 365.25, + "month": 30, + "day": 1, + "hour": 1.0 / 24, + "minute": 1.0 / (24 * 60), + "second": 1.0 / (24 * 60 * 60), + "microsecond": 1.0 / (24 * 60 * 60 * 1_000_000), + } + factor = precision_factors.get(precision, 1) + lower_bound = floor(age / factor) * factor + upper_bound = (floor(age / factor) + 1) * factor + return lower_bound, upper_bound diff --git a/metis/utils/datetime/datetime_precision.py b/metis/utils/datetime/datetime_precision.py new file mode 100644 index 0000000..dfd0639 --- /dev/null +++ b/metis/utils/datetime/datetime_precision.py @@ -0,0 +1,31 @@ +import datetime + +from dateutil import parser + + +class datetimespy(datetime.datetime): + def replace(self, *args, **kwargs): + self._replaced_args = args + self._replaced_kwargs = kwargs + return super().replace(*args, **kwargs) + + +def determine_datetime_precision(dt_str): + default = datetimespy.now() + parser.parse(dt_str, default=default) + + replaced_fields = getattr(default, "_replaced_kwargs", {}) + + if "microsecond" in replaced_fields: + return "microsecond" + if "second" in replaced_fields: + return "second" + if "minute" in replaced_fields: + return "minute" + if "hour" in replaced_fields: + return "hour" + if "day" in replaced_fields: + return "day" + if "month" in replaced_fields: + return "month" + return "year" diff --git a/metis/utils/logging.py b/metis/utils/logging.py index 4f1f38d..7a3ab7a 100644 --- a/metis/utils/logging.py +++ b/metis/utils/logging.py @@ -2,3 +2,22 @@ logger = logging.getLogger("metis") logging.basicConfig(level=logging.INFO) + + +def warn_unconfigured_columns( + logger: logging.Logger, + data_columns: set[str] | list[str], + configured_columns: set[str] | list[str], + config_type: str, +): + extraneous_rules = set(configured_columns) - set(data_columns) + if extraneous_rules: + logger.warning( + f"The following columns have {config_type} defined but are not present in the data: {extraneous_rules}. These {config_type} will be ignored." + ) + + extraneous_columns = set(data_columns) - set(configured_columns) + if extraneous_columns: + logger.warning( + f"The following columns are present in the data but have no {config_type} defined: {extraneous_columns}. These columns will be skipped." + ) From 77f18cdba9101cded193fd6ab30680d6c8da3884 Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Tue, 20 Jan 2026 22:30:00 +0100 Subject: [PATCH 21/32] Add consistency rules certainty --- .../consistency_ruleBasedHinrichs.py | 18 +++---- .../consistency_ruleBasedPipino.py | 54 ++++++++++--------- .../consistency_ruleBasedPipino_config.py | 4 +- 3 files changed, 39 insertions(+), 37 deletions(-) diff --git a/metis/metric/consistency/consistency_ruleBasedHinrichs.py b/metis/metric/consistency/consistency_ruleBasedHinrichs.py index 603573e..f90f86f 100644 --- a/metis/metric/consistency/consistency_ruleBasedHinrichs.py +++ b/metis/metric/consistency/consistency_ruleBasedHinrichs.py @@ -9,6 +9,7 @@ ) from metis.metric.metric import Metric from metis.utils.dq_dimension import DQDimension +from metis.utils.logging import warn_unconfigured_columns from metis.utils.result import DQResult @@ -68,17 +69,12 @@ def assess( ) ) - extraneous_rules = set(attribute_rules.keys()) - set(data.columns) - if extraneous_rules: - self.logger.warning( - f"The following columns have consistency rules defined but are not present in the data: {extraneous_rules}. These rules will be ignored." 
- ) - - extraneous_columns = set(data.columns) - set(attribute_rules.keys()) - if extraneous_columns: - self.logger.info( - f"The following columns are present in the data but have no consistency rules defined: {extraneous_columns}. These columns will be skipped." - ) + warn_unconfigured_columns( + self.logger, + set(data.columns), + set(attribute_rules.keys()), + "consistency rules", + ) for col_name in data.columns: column_rules = attribute_rules.get(col_name, []) diff --git a/metis/metric/consistency/consistency_ruleBasedPipino.py b/metis/metric/consistency/consistency_ruleBasedPipino.py index 291ecf2..564b6ee 100644 --- a/metis/metric/consistency/consistency_ruleBasedPipino.py +++ b/metis/metric/consistency/consistency_ruleBasedPipino.py @@ -1,5 +1,4 @@ -from math import sqrt -from typing import Any, Callable, List, Union +from typing import List, Union import pandas as pd @@ -9,6 +8,7 @@ ) from metis.metric.metric import Metric from metis.utils.dq_dimension import DQDimension +from metis.utils.logging import warn_unconfigured_columns from metis.utils.result import DQResult @@ -63,47 +63,53 @@ def assess( ) ) - extraneous_rules = set(attribute_rules.keys()) - set(data.columns) - if extraneous_rules: - self.logger.warning( - f"The following columns have consistency rules defined but are not present in the data: {extraneous_rules}. These rules will be ignored." - ) - - extraneous_columns = set(data.columns) - set(attribute_rules.keys()) - if extraneous_columns: - self.logger.info( - f"The following columns are present in the data but have no consistency rules defined: {extraneous_columns}. These columns will be skipped." - ) + warn_unconfigured_columns( + self.logger, + set(data.columns), + set(attribute_rules.keys()), + "consistency rules", + ) for col_name in data.columns: column_rules = attribute_rules.get(col_name, []) if not column_rules: continue - degree_of_violation: pd.Series[float] = data[col_name].apply( - lambda x: self.sum_rules(column_rules, x) + fulfilled_rules_mask = pd.DataFrame( + { + f"rule_{i}": data[col_name].dropna().apply(rule) + for i, rule in enumerate(column_rules) + } ) - dq_measurements = 1 - degree_of_violation / len(column_rules) - min_quality = dq_measurements.min() + dq_measurements = fulfilled_rules_mask.sum(axis=1) / len(column_rules) + certainties = self.certainties(fulfilled_rules_mask) - for row_index, dq_value in dq_measurements.items(): + for (row_index, dq_value), certainty in zip( + dq_measurements.items(), certainties.values + ): results.append( self.create_result( dq_value, col_name, int(str(row_index)), - self.certainty(dq_value, min_quality), + float(certainty), ) ) return results - def sum_rules(self, rules: List[Callable], value: Any) -> float: - return float(sum(rule(value) for rule in rules)) - - def certainty(self, dq_value: float, min_quality: float) -> float: - return sqrt((1 - dq_value) * (1 - min_quality)) + def certainties(self, fulfilled_rules_mask: pd.DataFrame): + rule_fulfillment_percentage = fulfilled_rules_mask.mean(axis=0) + rule_distance_to_mean_percentage = ( + rule_fulfillment_percentage - rule_fulfillment_percentage.mean() + ) + rule_certainties = ~fulfilled_rules_mask * rule_distance_to_mean_percentage.where( + rule_distance_to_mean_percentage >= 0, 0 + ) + fulfilled_rules_mask * rule_distance_to_mean_percentage.where( + rule_distance_to_mean_percentage < 0, 0 + ) + return 1 - rule_certainties.abs().sum(axis=1) def create_result( self, dq_value: float, col_name: str | None, row_index: int, certainty: float diff --git 
a/metis/metric/consistency/consistency_ruleBasedPipino_config.py b/metis/metric/consistency/consistency_ruleBasedPipino_config.py index 984db67..b930b19 100644 --- a/metis/metric/consistency/consistency_ruleBasedPipino_config.py +++ b/metis/metric/consistency/consistency_ruleBasedPipino_config.py @@ -17,9 +17,9 @@ class consistency_ruleBasedPipino_config(MetricConfig): :param tuple_rules: List of functions that define consistency rules for entire tuples """ - attribute_rules: Dict[str, List[Callable[[Any], float]]] | None = None + attribute_rules: Dict[str, List[Callable[[Any], bool]]] | None = None - tuple_rules: List[Callable[[pd.Series], float]] | None = None + tuple_rules: List[Callable[[pd.Series], bool]] | None = None def to_json(self): return { From aa348918f62f7f535001b39e17bdbf47e0b438cd Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Thu, 22 Jan 2026 22:54:07 +0100 Subject: [PATCH 22/32] Update consistency certainty calculation --- .../consistency_ruleBasedPipino.py | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/metis/metric/consistency/consistency_ruleBasedPipino.py b/metis/metric/consistency/consistency_ruleBasedPipino.py index 564b6ee..35f4982 100644 --- a/metis/metric/consistency/consistency_ruleBasedPipino.py +++ b/metis/metric/consistency/consistency_ruleBasedPipino.py @@ -47,19 +47,24 @@ def assess( results: List[DQResult] = [] if tuple_rules: - degree_of_violation: pd.Series[float] = data.apply( - lambda x: self.sum_rules(tuple_rules, x), axis="columns" + fulfilled_rules_mask = pd.DataFrame( + { + f"rule_{i}": data.apply(rule, axis="columns") + for i, rule in enumerate(tuple_rules) + } ) - dq_measurements = 1 - degree_of_violation / len(tuple_rules) - min_quality = dq_measurements.min() - for row_index, dq_value in dq_measurements.items(): + dq_measurements = fulfilled_rules_mask.sum(axis=1) / len(tuple_rules) + certainties = self.certainties(fulfilled_rules_mask) + for (row_index, dq_value), certainty in zip( + dq_measurements.items(), certainties.values + ): results.append( self.create_result( dq_value, None, int(str(row_index)), - self.certainty(dq_value, min_quality), + float(certainty), ) ) @@ -101,15 +106,9 @@ def assess( def certainties(self, fulfilled_rules_mask: pd.DataFrame): rule_fulfillment_percentage = fulfilled_rules_mask.mean(axis=0) - rule_distance_to_mean_percentage = ( - rule_fulfillment_percentage - rule_fulfillment_percentage.mean() - ) - rule_certainties = ~fulfilled_rules_mask * rule_distance_to_mean_percentage.where( - rule_distance_to_mean_percentage >= 0, 0 - ) + fulfilled_rules_mask * rule_distance_to_mean_percentage.where( - rule_distance_to_mean_percentage < 0, 0 + return ( + (1 - fulfilled_rules_mask - rule_fulfillment_percentage).abs().mean(axis=1) ) - return 1 - rule_certainties.abs().sum(axis=1) def create_result( self, dq_value: float, col_name: str | None, row_index: int, certainty: float From 40701f9e22cb9465082b4adf28e6fe931d7f9fd5 Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Mon, 2 Feb 2026 12:02:21 +0100 Subject: [PATCH 23/32] Add simulated precision to timeliness metric --- metis/metric/timeliness/timeliness_heinrich.py | 14 ++++++++++++-- .../timeliness/timeliness_heinrich_config.py | 4 ++++ metis/utils/datetime/datetime_precision.py | 5 ++++- 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/metis/metric/timeliness/timeliness_heinrich.py b/metis/metric/timeliness/timeliness_heinrich.py index 8fd5acb..72a8e2a 100644 --- a/metis/metric/timeliness/timeliness_heinrich.py +++ 
b/metis/metric/timeliness/timeliness_heinrich.py
@@ -57,12 +57,22 @@ def assess(
"decline rates",
)

+ if not ingestion_date_column or ingestion_date_column not in data.columns:
+ self.logger.warning(
+ f"Ingestion date column '{ingestion_date_column}' is not present in the data."
+ )
+ return results
+
ingestion_dates = pd.to_datetime(data[ingestion_date_column])
ages_in_days = (
(assessment_date - ingestion_dates).dt.total_seconds() / 60 / 60 / 24
)
- precision_of_dates = data[ingestion_date_column].apply(
- determine_datetime_precision
+ precision_of_dates = (
+ pd.Series(
+ [config.simulated_timestamp_precision] * len(data), index=data.index
+ )
+ if config.simulated_timestamp_precision
+ else data[ingestion_date_column].apply(determine_datetime_precision)
)
age_and_precision = pd.DataFrame(
{"age": ages_in_days, "precision": precision_of_dates}
diff --git a/metis/metric/timeliness/timeliness_heinrich_config.py b/metis/metric/timeliness/timeliness_heinrich_config.py
index 5797751..a92fc81 100644
--- a/metis/metric/timeliness/timeliness_heinrich_config.py
+++ b/metis/metric/timeliness/timeliness_heinrich_config.py
@@ -2,6 +2,7 @@
from typing import Dict

from metis.metric.config import MetricConfig
+from metis.utils.datetime.datetime_precision import DTPrecision


@dataclass
@@ -12,11 +13,13 @@ class timeliness_heinrich_config(MetricConfig):

:param decline_rate_per_column: Decline rate specific to each column
:param ingestion_date_column: Name of the column containing the ingestion date of each tuple
:param simulated_assessment_date: Optional simulated assessment date in string format. If not provided, the current date will be used.
+ :param simulated_timestamp_precision: Optional simulated precision of the timestamps in ingestion_date_column. If not provided, the precision is detected automatically.
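+ Example (illustrative): simulated_timestamp_precision="day" treats every ingestion timestamp as precise to the day.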
""" decline_rate_per_column: Dict[str, float] ingestion_date_column: str simulated_assessment_date: str | None = None + simulated_timestamp_precision: DTPrecision | None = None def to_json(self): return { @@ -24,4 +27,5 @@ def to_json(self): "decline_rate_per_column": self.decline_rate_per_column, "ingestion_date_column": self.ingestion_date_column, "simulated_assessment_date": self.simulated_assessment_date, + "simulated_timestamp_precision": self.simulated_timestamp_precision, } diff --git a/metis/utils/datetime/datetime_precision.py b/metis/utils/datetime/datetime_precision.py index dfd0639..e6556d6 100644 --- a/metis/utils/datetime/datetime_precision.py +++ b/metis/utils/datetime/datetime_precision.py @@ -1,7 +1,10 @@ import datetime +from typing import Literal from dateutil import parser +DTPrecision = Literal["year", "month", "day", "hour", "minute", "second", "microsecond"] + class datetimespy(datetime.datetime): def replace(self, *args, **kwargs): @@ -10,7 +13,7 @@ def replace(self, *args, **kwargs): return super().replace(*args, **kwargs) -def determine_datetime_precision(dt_str): +def determine_datetime_precision(dt_str: str) -> DTPrecision: default = datetimespy.now() parser.parse(dt_str, default=default) From ad591fcb0ae88f63c00acbd2af2599540ddb70cd Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Mon, 2 Feb 2026 16:36:43 +0100 Subject: [PATCH 24/32] Update DQResult usages to newer interface --- README.md | 37 ++++++++++++------- metis/dq_orchestrator.py | 5 ++- metis/metric/__init__.py | 1 + .../completeness_nullAndDMVRate.py | 9 +++-- .../completeness/completeness_nullRate.py | 4 ++ .../consistency_ruleBasedHinrichs.py | 6 ++- .../consistency_ruleBasedPipino.py | 3 +- .../correctness/correctness_heinrich.py | 1 + .../minimality/minimality_duplicateCount.py | 18 +++++---- .../metric/timeliness/timeliness_heinrich.py | 3 +- .../validity/validity_outOfVocabulary.py | 6 ++- metis/utils/dq_dimension.py | 2 + metis/utils/result.py | 2 +- 13 files changed, 64 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 30dd861..5d62177 100644 --- a/README.md +++ b/README.md @@ -34,9 +34,10 @@ New metrics should follow the naming format: `{DimensionName}_{Technique}` - **DimensionName**: The quality dimension being measured (e.g., `Completeness`, `Minimality`) - **Technique**: The calculation or method used (e.g., `NullRatio`, `DuplicateCount`) -- **Granularity**: The level of analysis (e.g., `cell`, `row`, `column`, `table`) should be passed as a parameter through the metric config file if the metric can be applied at different granularity levels. -Examples: `completeness_NullRatio`, `minimality_DuplicateCount` +Examples: `completeness_nullRatio`, `minimality_duplicateCount` + +The file name and class name of each metric should be equal. If a metric has a specific config class, the name of the config class should be `{MetricName}_config` (e.g., `completeness_missingRatio_config`). - **Granularity**: The level of analysis (e.g., `cell`, `row`, `column`, `table`) should be passed as a parameter through the metric config file if the metric can be applied at different granularity levels. 
@@ -47,27 +48,35 @@ class DQResult: def __init__( self, mesTime: pd.Timestamp, - DQvalue: float, - DQdimension: str, + DQdimension: DQDimension, DQmetric: str, + DQgranularity: str, + DQvalue: float, + DQexplanation: Union[dict, None] = None, + runtime: Union[float, None] = None, + tableName: Union[str, None] = None, columnNames: Union[List[str], None] = None, rowIndex: Union[int, None] = None, - DQannotations: Union[dict, None] = None, + experimentTag: Union[str, None] = None, dataset: Union[str, None] = None, - tableName: Union[str, None] = None, + configJson: Union[dict, None] = None, ): ```` To create a new instance of DQResult, one needs to provide at least the following arguments: - **mesTime: pd.Timestamp**: The time at which a result was assessed. -- **DQvalue: float**: The result of the assessment. This currently only supports quantitative assessments. -- **DQdimension: str**: The name of the data quality dimension that was assessed e.g. completeness, accuracy, etc. -- **DQmetric: str**: The name of the specific metric inside the given dimension that was assessed. - -Furthermore, there are more optional arguments that might need to be set depending on the nature of different metrics. ```dataset``` and ```tableName``` are automatically set by the ```metis.dq_orchestrator.DQOrchestrator``` class which controles the data quality assessment and takes care of calling the individual metrics and storing the results. -- **columnNames: Optional[List[str]]**: List of column names associated with the assessed result. For example for column level completeness, this would be a list with a single column name, for table level completeness this would be empty since the result is valid for the whole table. -- **rowIndex: Optional[int]**: Index of the row this result is associated with. This can either be used together with columnNames to assess data quality on a cell level or for row based metrics. -- **DQannotations: Optional[dict]**: To allow metrics to save additional information or annotations, this dictionary can store all additional information that might need to be saved. This currently does not need for follow a predefined structure. +- **DQdimension: DQDimension**: Data quality dimension assessed (e.g. `DQDimension.COMPLETENESS`, `DQDimension.ACCURACY`). +- **DQmetric: str**: Name of the specific metric within the dimension. +- **DQgranularity: str**: Granularity of the metric (e.g. 'column', 'table', 'cell', 'row'). +- **DQvalue: float**: Numeric outcome of the assessment. This currently only supports quantitative assessments. + +Furthermore, there are more optional arguments that might need to be set depending on the nature of different metrics. ```dataset``` and ```tableName``` are automatically set by the ```metis.dq_orchestrator.DQOrchestrator``` class which controls the data quality assessment and takes care of calling the individual metrics and storing the results. +- **DQexplanation: Optional[dict]**: Arbitrary additional information produced by the metric (no fixed schema required). +- **runtime: Optional[float]**: Time taken to compute the metric, in seconds. +- **columnNames: Optional[List[str]]**: Columns that this result pertains to. For a column-level metric this is typically a single-item list; for a table-level metric this may be `None` or an empty list. +- **rowIndex: Optional[int]**: Row index associated with the result. Use together with `columnNames` to represent a cell-level result, or for row-based metrics. 
+- **experimentTag: Optional[str]**: Tag to identify a specific run. +- **configJson: Optional[dict]**: Configuration used for the metric as a JSON object. diff --git a/metis/dq_orchestrator.py b/metis/dq_orchestrator.py index 6a5ed0d..12219f2 100644 --- a/metis/dq_orchestrator.py +++ b/metis/dq_orchestrator.py @@ -114,10 +114,13 @@ def assess( def get_dq_result(self, query: str) -> List[DQResult]: return [] - def _should_measure_runtime(self, metric_config: str | None) -> bool: + def _should_measure_runtime(self, metric_config: MetricConfig | str | None) -> bool: if metric_config is None: return False + if isinstance(metric_config, MetricConfig): + return getattr(metric_config, "measure_runtime", False) + try: parsed = json.loads(metric_config) except Exception: diff --git a/metis/metric/__init__.py b/metis/metric/__init__.py index ef3d5d2..0e3b841 100644 --- a/metis/metric/__init__.py +++ b/metis/metric/__init__.py @@ -1,4 +1,5 @@ from .completeness.completeness_nullRatio import completeness_nullRatio +from .completeness.completeness_nullAndDMVRate import completeness_nullAndDMVRate from .consistency.consistency_countFDViolations import consistency_countFDViolations from .consistency.consistency_ruleBasedHinrichs import consistency_ruleBasedHinrichs from .consistency.consistency_ruleBasedPipino import consistency_ruleBasedPipino diff --git a/metis/metric/completeness/completeness_nullAndDMVRate.py b/metis/metric/completeness/completeness_nullAndDMVRate.py index 09a6f6a..a1c62ca 100644 --- a/metis/metric/completeness/completeness_nullAndDMVRate.py +++ b/metis/metric/completeness/completeness_nullAndDMVRate.py @@ -84,7 +84,8 @@ def certainty(marks: pd.Series): DQdimension=DQDimension.COMPLETENESS, DQmetric=self.__class__.__name__, columnNames=data.columns.tolist(), - DQannotations={"certainty": float(table_certainty)}, + DQexplanation={"certainty": float(table_certainty)}, + DQgranularity="table", ) results.append(result) return results @@ -106,14 +107,16 @@ def certainty(marks: pd.Series): DQmetric=self.__class__.__name__, columnNames=col_names, rowIndex=row_index, - DQannotations={"certainty": float(row["certainty"])}, + DQexplanation={"certainty": float(row["certainty"])}, + DQgranularity=( + "row" if config.aggregation_axis == "columns" else "column" + ), ) results.append(result) return results def certainty(self, null_count: int, dmv_count: int, total_count: int): - # certainty = P(missing rate is correct) = P(nulls are correct) * P(DMVs are correct) = P(detected nulls are correct) * P(detected DMVs are correct) * P(all DMVs found) = (1 - NULLABLE_COLUMN_RATE)^(null_count) * (1)^(null_count) * (FAHES_PRECISION)^(dmv_count) * (FAHES_RECALL)^(unflagged_count) minimum = min(FAHES_PRECISION, FAHES_RECALL) ** total_count certainty = float( ( diff --git a/metis/metric/completeness/completeness_nullRate.py b/metis/metric/completeness/completeness_nullRate.py index e8b781f..512c32d 100644 --- a/metis/metric/completeness/completeness_nullRate.py +++ b/metis/metric/completeness/completeness_nullRate.py @@ -56,6 +56,7 @@ def completeness(marks: pd.Series): DQdimension=DQDimension.COMPLETENESS, DQmetric=self.__class__.__name__, columnNames=data.columns.tolist(), + DQgranularity="table", ) results.append(result) return results @@ -77,6 +78,9 @@ def completeness(marks: pd.Series): DQmetric=self.__class__.__name__, columnNames=col_names, rowIndex=row_index, + DQgranularity=( + "row" if config.aggregation_axis == "columns" else "column" + ), ) results.append(result) diff --git 
a/metis/metric/consistency/consistency_ruleBasedHinrichs.py b/metis/metric/consistency/consistency_ruleBasedHinrichs.py
index f90f86f..5cc9fc0 100644
--- a/metis/metric/consistency/consistency_ruleBasedHinrichs.py
+++ b/metis/metric/consistency/consistency_ruleBasedHinrichs.py
@@ -63,9 +63,10 @@ def assess(
DQmetric=self.__class__.__name__,
columnNames=[],
rowIndex=int(str(row_index)),
- DQannotations={
+ DQexplanation={
"certainty": self.certainty(dq_value, min_quality)
},
+ DQgranularity="row",
)
)

@@ -97,9 +98,10 @@ def assess(
DQmetric=self.__class__.__name__,
columnNames=[col_name],
rowIndex=int(str(row_index)),
- DQannotations={
+ DQexplanation={
"certainty": self.certainty(dq_value, min_quality)
},
+ DQgranularity="cell",
)
)

diff --git a/metis/metric/consistency/consistency_ruleBasedPipino.py b/metis/metric/consistency/consistency_ruleBasedPipino.py
index 35f4982..c5f691a 100644
--- a/metis/metric/consistency/consistency_ruleBasedPipino.py
+++ b/metis/metric/consistency/consistency_ruleBasedPipino.py
@@ -120,7 +120,8 @@ def create_result(
DQmetric=self.__class__.__name__,
columnNames=[col_name] if col_name else [],
rowIndex=row_index,
- DQannotations={
+ DQexplanation={
"certainty": certainty,
},
+ DQgranularity="cell" if col_name else "row",
)
diff --git a/metis/metric/correctness/correctness_heinrich.py b/metis/metric/correctness/correctness_heinrich.py
index 2c98de2..5e361a8 100644
--- a/metis/metric/correctness/correctness_heinrich.py
+++ b/metis/metric/correctness/correctness_heinrich.py
@@ -47,6 +47,7 @@ def assess(
DQmetric=self.__class__.__name__,
columnNames=[col_name],
rowIndex=row_index,
+ DQgranularity="cell",
)
results.append(result)

diff --git a/metis/metric/minimality/minimality_duplicateCount.py b/metis/metric/minimality/minimality_duplicateCount.py
index 5dd59d7..879ebb4 100644
--- a/metis/metric/minimality/minimality_duplicateCount.py
+++ b/metis/metric/minimality/minimality_duplicateCount.py
@@ -1,26 +1,28 @@
import pandas as pd
from typing import List, Union
+from metis.metric.config import MetricConfig
+from metis.utils.dq_dimension import DQDimension
from metis.utils.result import DQResult
from metis.metric.metric import Metric


class minimality_duplicateCount(Metric):
- def assess(self, data: pd.DataFrame, reference: Union[pd.DataFrame, None] = None, metric_config: Union[str, None] = None) -> List[DQResult]:
+ def assess(self, data: pd.DataFrame, reference: Union[pd.DataFrame, None] = None, metric_config: Union[MetricConfig, str, None] = None) -> List[DQResult]:
"""
- Assess the minimality for each attribute of a dataset by checking for unique values.
-
+ Assess the minimality for each attribute of a dataset by checking for unique values.

:param data: DataFrame to assess.
:param metric_config: Optional configuration for the metric.
:return: List of DQResult objects containing minimality results.
""" results = [] total_rows = len(data) - + for column in data.columns: # Count values that appear exactly once (not duplicated) unique_count = (~data[column].duplicated(keep=False)).sum() minimality = unique_count / total_rows if total_rows > 0 else 0 - + # Attributes with 100% unique values are candidate keys annotations = {} if minimality == 1.0: @@ -28,7 +30,7 @@ def assess(self, data: pd.DataFrame, reference: Union[pd.DataFrame, None] = None result = DQResult( mesTime=pd.Timestamp.now(), - DQdimension="Minimality", + DQdimension=DQDimension.MINIMALITY, DQmetric="DuplicateCount", DQgranularity="column", DQvalue=minimality, @@ -36,5 +38,5 @@ def assess(self, data: pd.DataFrame, reference: Union[pd.DataFrame, None] = None columnNames=[column], ) results.append(result) - - return results \ No newline at end of file + + return results diff --git a/metis/metric/timeliness/timeliness_heinrich.py b/metis/metric/timeliness/timeliness_heinrich.py index 72a8e2a..16630f0 100644 --- a/metis/metric/timeliness/timeliness_heinrich.py +++ b/metis/metric/timeliness/timeliness_heinrich.py @@ -102,9 +102,10 @@ def assess( DQmetric=self.__class__.__name__, columnNames=[col_name], rowIndex=int(str(index)), - DQannotations={ + DQexplanation={ "certainty": certainty_value, }, + DQgranularity="cell", ) results.append(result) diff --git a/metis/metric/validity/validity_outOfVocabulary.py b/metis/metric/validity/validity_outOfVocabulary.py index b918d25..791c63f 100644 --- a/metis/metric/validity/validity_outOfVocabulary.py +++ b/metis/metric/validity/validity_outOfVocabulary.py @@ -1,6 +1,8 @@ import re import pandas as pd from typing import List, Union +from metis.metric.config import MetricConfig +from metis.utils.dq_dimension import DQDimension from metis.utils.result import DQResult from metis.metric.metric import Metric @@ -9,7 +11,7 @@ nltk.download("words", quiet=True) class validity_outOfVocabulary(Metric): - def assess(self, data: pd.DataFrame, reference: Union[pd.DataFrame, set, None] = None, metric_config: Union[str, None] = None) -> List[DQResult]: + def assess(self, data: pd.DataFrame, reference: Union[pd.DataFrame, set, None] = None, metric_config: Union[MetricConfig, str, None] = None) -> List[DQResult]: """ General vocabulary check at token level. Any alphabetic token not in the standard vocab is OOV. @@ -63,7 +65,7 @@ def is_valid(text: str) -> bool: result = DQResult( mesTime=pd.Timestamp.now(), - DQdimension="Validity", + DQdimension=DQDimension.VALIDITY, DQmetric="OutOfVocabulary", DQgranularity="column", DQvalue=dq_value, diff --git a/metis/utils/dq_dimension.py b/metis/utils/dq_dimension.py index 0566d22..05e3985 100644 --- a/metis/utils/dq_dimension.py +++ b/metis/utils/dq_dimension.py @@ -8,3 +8,5 @@ class DQDimension(StrEnum): CORRECTNESS = "Correctness" COMPLETENESS = "Completeness" TIMELINESS = "Timeliness" + MINIMALITY = "Minimality" + VALIDITY = "Validity" diff --git a/metis/utils/result.py b/metis/utils/result.py index 6fe89af..696e449 100644 --- a/metis/utils/result.py +++ b/metis/utils/result.py @@ -26,7 +26,7 @@ def __init__( - `mesTime: pd.Timestamp`: The time at which the result was assessed. - `DQdimension: DQDimension`: Data quality dimension assessed (e.g. DQDimension.COMPLETENESS, DQDimension.ACCURACY). - `DQmetric: str`: Name of the specific metric within the dimension. - - `DQgranularity: str`: Granularity of the metric (e.g. 'column', 'table', 'cell'). + - `DQgranularity: str`: Granularity of the metric (e.g. 'column', 'table', 'cell', 'row'). 
- `DQvalue: float`: Numeric outcome of the assessment (quantitative only). Optional arguments From 742f8fa52e6912bda15c6a6dd6cd44addd440066 Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Tue, 10 Feb 2026 22:56:09 +0100 Subject: [PATCH 25/32] Rename and merge completeness metrics --- metis/metric/__init__.py | 2 +- ...ate.py => completeness_nullAndDMVRatio.py} | 32 +++---- ...=> completeness_nullAndDMVRatio_config.py} | 4 +- .../completeness/completeness_nullRate.py | 87 ------------------- .../completeness/completeness_nullRatio.py | 81 ++++++++++++++--- ...ig.py => completeness_nullRatio_config.py} | 4 +- .../metric/timeliness/timeliness_heinrich.py | 16 ++-- .../timeliness/timeliness_heinrich_config.py | 3 + .../disguised_missing_values/fahes/fahes.py | 8 +- 9 files changed, 109 insertions(+), 128 deletions(-) rename metis/metric/completeness/{completeness_nullAndDMVRate.py => completeness_nullAndDMVRatio.py} (76%) rename metis/metric/completeness/{completeness_nullAndDMVRate_config.py => completeness_nullAndDMVRatio_config.py} (89%) delete mode 100644 metis/metric/completeness/completeness_nullRate.py rename metis/metric/completeness/{completeness_nullRate_config.py => completeness_nullRatio_config.py} (90%) diff --git a/metis/metric/__init__.py b/metis/metric/__init__.py index 0e3b841..a6bf785 100644 --- a/metis/metric/__init__.py +++ b/metis/metric/__init__.py @@ -1,5 +1,5 @@ +from .completeness.completeness_nullAndDMVRatio import completeness_nullAndDMVRatio from .completeness.completeness_nullRatio import completeness_nullRatio -from .completeness.completeness_nullAndDMVRate import completeness_nullAndDMVRate from .consistency.consistency_countFDViolations import consistency_countFDViolations from .consistency.consistency_ruleBasedHinrichs import consistency_ruleBasedHinrichs from .consistency.consistency_ruleBasedPipino import consistency_ruleBasedPipino diff --git a/metis/metric/completeness/completeness_nullAndDMVRate.py b/metis/metric/completeness/completeness_nullAndDMVRatio.py similarity index 76% rename from metis/metric/completeness/completeness_nullAndDMVRate.py rename to metis/metric/completeness/completeness_nullAndDMVRatio.py index a1c62ca..c3a04f9 100644 --- a/metis/metric/completeness/completeness_nullAndDMVRate.py +++ b/metis/metric/completeness/completeness_nullAndDMVRatio.py @@ -2,8 +2,8 @@ import pandas as pd -from metis.metric.completeness.completeness_nullAndDMVRate_config import ( - completeness_nullAndDMVRate_config, +from metis.metric.completeness.completeness_nullAndDMVRatio_config import ( + completeness_nullAndDMVRatio_config, ) from metis.metric.config import MetricConfig from metis.metric.metric import Metric @@ -20,7 +20,7 @@ IS_DMV_MARKER = 2 -class completeness_nullAndDMVRate(Metric): +class completeness_nullAndDMVRatio(Metric): def assess( self, data: pd.DataFrame, @@ -28,7 +28,7 @@ def assess( metric_config: str | MetricConfig | None = None, ) -> List[DQResult]: """ - Assess the completeness of the data by checking for null values and disguised missing values. + Assess the completeness of the data by checking for null values and disguised missing values. To detect disguised missing values, the FAHES algorithm by Qahtan et al. is applied to the data (paper: https://doi.org/10.1145/3219819.3220109). The completeness quality measurement is calculated as the ratio of valid values (non-null and non-disguised missing) to the total number of values. 
The metric can be configured using `completeness_nullAndDMVRatio_config` to calculate the completeness on column, row level, or table-level granularity. :param data: DataFrame to assess. :param reference: Optional reference DataFrame (not used in this metric). @@ -36,7 +36,7 @@ def assess( :return: List of DQResult objects containing completeness results. """ - config = self.load_config(metric_config, completeness_nullAndDMVRate_config) + config = self.load_config(metric_config, completeness_nullAndDMVRatio_config) results = [] @@ -47,10 +47,11 @@ def assess( IS_VALID_MARKER, index=data.index, columns=data.columns ) marked_cells[data.isna()] = IS_NULL_MARKER - for _, dmv_row in dmvs.iterrows(): - col = dmv_row["Attribute Name"] - val = dmv_row["DMV"] - marked_cells.loc[data[col] == val, col] = IS_DMV_MARKER + if dmvs is not None: + for _, dmv_row in dmvs.iterrows(): + col = dmv_row["Attribute Name"] + val = dmv_row["DMV"] + marked_cells.loc[data[col] == val, col] = IS_DMV_MARKER def counts(marks: pd.Series): return ( @@ -117,11 +118,12 @@ def certainty(marks: pd.Series): return results def certainty(self, null_count: int, dmv_count: int, total_count: int): - minimum = min(FAHES_PRECISION, FAHES_RECALL) ** total_count - certainty = float( - ( - FAHES_PRECISION**dmv_count - * FAHES_RECALL ** (total_count - null_count - dmv_count) + minimum = (1 - FAHES_PRECISION) + (1 - FAHES_RECALL) + return ( + 1 + - ( + (1 - FAHES_PRECISION) * (dmv_count / total_count) + + (1 - FAHES_RECALL) * (null_count / total_count) ) + / minimum ) - return (certainty - minimum) / (1 - minimum) diff --git a/metis/metric/completeness/completeness_nullAndDMVRate_config.py b/metis/metric/completeness/completeness_nullAndDMVRatio_config.py similarity index 89% rename from metis/metric/completeness/completeness_nullAndDMVRate_config.py rename to metis/metric/completeness/completeness_nullAndDMVRatio_config.py index c063343..dfaa540 100644 --- a/metis/metric/completeness/completeness_nullAndDMVRate_config.py +++ b/metis/metric/completeness/completeness_nullAndDMVRatio_config.py @@ -5,9 +5,9 @@ @dataclass -class completeness_nullAndDMVRate_config(MetricConfig): +class completeness_nullAndDMVRatio_config(MetricConfig): """ - Configuration class for the completeness_nullAndDMVRate metric. + Configuration class for the completeness_nullAndDMVRatio metric. :param aggregation_axis: Axis along which to aggregate completeness ('index': aggregate each column; 'columns': aggregate each row). :param aggregate_all: Whether to aggregate all completeness results into a single value for the whole input data. diff --git a/metis/metric/completeness/completeness_nullRate.py b/metis/metric/completeness/completeness_nullRate.py deleted file mode 100644 index 512c32d..0000000 --- a/metis/metric/completeness/completeness_nullRate.py +++ /dev/null @@ -1,87 +0,0 @@ -from typing import List - -import pandas as pd - -from metis.metric.completeness.completeness_nullRate_config import ( - completeness_nullRate_config, -) -from metis.metric.config import MetricConfig -from metis.metric.metric import Metric -from metis.utils.dq_dimension import DQDimension -from metis.utils.result import DQResult - - -class completeness_nullRate(Metric): - def assess( - self, - data: pd.DataFrame, - reference: pd.DataFrame | None = None, - metric_config: str | MetricConfig | None = None, - ) -> List[DQResult]: - """ - Assess the completeness of the data by checking for null values. - - :param data: DataFrame to assess. 
- :param reference: Optional reference DataFrame (not used in this metric). - :param metric_config: Optional configuration for the metric. - :return: List of DQResult objects containing completeness results. - """ - - config = self.load_config(metric_config, completeness_nullRate_config) - - results = [] - - na_mask = data.isna() - - def counts(marks: pd.Series): - return marks.sum(), len(marks) - - def completeness(marks: pd.Series): - null_count, total_count = counts(marks) - return (total_count - null_count) / total_count - - aggregated_marks = na_mask.agg( - [completeness], - axis=config.aggregation_axis, - ) - - if config.aggregation_axis == "index": - aggregated_marks = aggregated_marks.T - - if config.aggregate_all: - table_completeness = aggregated_marks["completeness"].mean() - result = DQResult( - mesTime=pd.Timestamp.now(), - DQvalue=table_completeness, - DQdimension=DQDimension.COMPLETENESS, - DQmetric=self.__class__.__name__, - columnNames=data.columns.tolist(), - DQgranularity="table", - ) - results.append(result) - return results - - for index, row in aggregated_marks.iterrows(): - row_index = ( - int(str(index)) if config.aggregation_axis == "columns" else None - ) - col_names = ( - data.columns.tolist() - if config.aggregation_axis == "columns" - else [str(index)] - ) - - result = DQResult( - mesTime=pd.Timestamp.now(), - DQvalue=row["completeness"], - DQdimension=DQDimension.COMPLETENESS, - DQmetric=self.__class__.__name__, - columnNames=col_names, - rowIndex=row_index, - DQgranularity=( - "row" if config.aggregation_axis == "columns" else "column" - ), - ) - results.append(result) - - return results diff --git a/metis/metric/completeness/completeness_nullRatio.py b/metis/metric/completeness/completeness_nullRatio.py index 7a87c57..10b74f6 100644 --- a/metis/metric/completeness/completeness_nullRatio.py +++ b/metis/metric/completeness/completeness_nullRatio.py @@ -1,7 +1,10 @@ -from typing import List, Union +from typing import List import pandas as pd +from metis.metric.completeness.completeness_nullRatio_config import ( + completeness_nullRatio_config, +) from metis.metric.config import MetricConfig from metis.metric.metric import Metric from metis.utils.dq_dimension import DQDimension @@ -9,29 +12,85 @@ class completeness_nullRatio(Metric): - def assess(self, data: pd.DataFrame, reference: Union[pd.DataFrame, None] = None, metric_config: Union[MetricConfig, str, None] = None) -> List[DQResult]: + def assess( + self, + data: pd.DataFrame, + reference: pd.DataFrame | None = None, + metric_config: str | MetricConfig | None = None, + ) -> List[DQResult]: """ - Assess the completeness of the data by checking for null values. + Assess the completeness of the data by calculating the ratio and count of null values on different granularities. The ratio of non-null values is stored as the completeness quality measurement, while the count of null values is stored in the explanation for better interpretability. The metric can be configured using `completeness_nullRatio_config` to calculate the completeness on column, row level, or table-level granularity. :param data: DataFrame to assess. :param reference: Optional reference DataFrame (not used in this metric). :param metric_config: Optional configuration for the metric. :return: List of DQResult objects containing completeness results. 
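+
+ Example (illustrative): the default config yields one result per row; aggregation_axis="index" yields one per column; aggregate_all=True yields a single table-level result.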
""" + + config = self.load_config(metric_config, completeness_nullRatio_config) + results = [] - total_rows = len(data) - for column in data.columns: - null_count = data[column].isnull().sum() - completeness = (total_rows - int(null_count)) / total_rows + na_mask = data.isna() + + def counts(null_mask: pd.Series): + return null_mask.sum(), len(null_mask) + + def not_null_ratio(null_mask: pd.Series): + null_count, total_count = counts(null_mask) + return (total_count - null_count) / total_count + + def null_count(null_mask: pd.Series): + null_count, _ = counts(null_mask) + return null_count + + not_null_ratios = na_mask.agg( + [not_null_ratio, null_count], + axis=config.aggregation_axis, + ) + + if config.aggregation_axis == "index": + not_null_ratios = not_null_ratios.T + + if config.aggregate_all: + table_completeness = not_null_ratios["not_null_ratio"].mean() + result = DQResult( + mesTime=pd.Timestamp.now(), + DQvalue=table_completeness, + DQdimension=DQDimension.COMPLETENESS, + DQmetric=self.__class__.__name__, + columnNames=data.columns.tolist(), + DQgranularity="table", + DQexplanation={ + "null_count": not_null_ratios["null_count"].sum(), + } + ) + results.append(result) + return results + + for index, row in not_null_ratios.iterrows(): + row_index = ( + int(str(index)) if config.aggregation_axis == "columns" else None + ) + col_names = ( + data.columns.tolist() + if config.aggregation_axis == "columns" + else [str(index)] + ) result = DQResult( mesTime=pd.Timestamp.now(), + DQvalue=row["not_null_ratio"], DQdimension=DQDimension.COMPLETENESS, - DQmetric="NullRatio", - DQgranularity="column", - DQvalue=completeness, - columnNames=[column], + DQmetric=self.__class__.__name__, + columnNames=col_names, + rowIndex=row_index, + DQgranularity=( + "row" if config.aggregation_axis == "columns" else "column" + ), + DQexplanation={ + "null_count": row["null_count"], + } ) results.append(result) diff --git a/metis/metric/completeness/completeness_nullRate_config.py b/metis/metric/completeness/completeness_nullRatio_config.py similarity index 90% rename from metis/metric/completeness/completeness_nullRate_config.py rename to metis/metric/completeness/completeness_nullRatio_config.py index 25edee0..a255e03 100644 --- a/metis/metric/completeness/completeness_nullRate_config.py +++ b/metis/metric/completeness/completeness_nullRatio_config.py @@ -5,9 +5,9 @@ @dataclass -class completeness_nullRate_config(MetricConfig): +class completeness_nullRatio_config(MetricConfig): """ - Configuration class for the completeness_nullRate metric. + Configuration class for the completeness_nullRatio metric. :param aggregation_axis: Axis along which to aggregate completeness ('index': aggregate each column; 'columns': aggregate each row). :param aggregate_all: Whether to aggregate all completeness results into a single value for the whole input data. diff --git a/metis/metric/timeliness/timeliness_heinrich.py b/metis/metric/timeliness/timeliness_heinrich.py index 16630f0..4c0826a 100644 --- a/metis/metric/timeliness/timeliness_heinrich.py +++ b/metis/metric/timeliness/timeliness_heinrich.py @@ -50,6 +50,12 @@ def assess( results = [] + if not ingestion_date_column or ingestion_date_column not in data.columns: + self.logger.warning( + f"Ingestion date column '{ingestion_date_column}' is not present in the data." 
+ )
+ return results
+
warn_unconfigured_columns(
self.logger,
set(data.columns),
@@ -57,13 +63,9 @@ def assess(
"decline rates",
)

- if not ingestion_date_column or ingestion_date_column not in data.columns:
- self.logger.warning(
- f"Ingestion date column '{ingestion_date_column}' is not present in the data."
- )
- return results
-
- ingestion_dates = pd.to_datetime(data[ingestion_date_column])
+ ingestion_dates = pd.to_datetime(
+ data[ingestion_date_column], **(config.to_datetime_kwargs or {})
+ )
ages_in_days = (
(assessment_date - ingestion_dates).dt.total_seconds() / 60 / 60 / 24
)
diff --git a/metis/metric/timeliness/timeliness_heinrich_config.py b/metis/metric/timeliness/timeliness_heinrich_config.py
index a92fc81..b5b47a1 100644
--- a/metis/metric/timeliness/timeliness_heinrich_config.py
+++ b/metis/metric/timeliness/timeliness_heinrich_config.py
@@ -12,12 +12,14 @@ class timeliness_heinrich_config(MetricConfig):

:param decline_rate_per_column: Decline rate specific to each column
:param ingestion_date_column: Name of the column containing the ingestion date of each tuple
+ :param to_datetime_kwargs: Optional keyword arguments for pandas.to_datetime when parsing the ingestion date column.
:param simulated_assessment_date: Optional simulated assessment date in string format. If not provided, the current date will be used.
:param simulated_timestamp_precision: Optional simulated precision of the timestamps in ingestion_date_column. If not provided, the precision is detected automatically.
"""

decline_rate_per_column: Dict[str, float]
ingestion_date_column: str
+ to_datetime_kwargs: Dict | None = None
simulated_assessment_date: str | None = None
simulated_timestamp_precision: DTPrecision | None = None

def to_json(self):
@@ -28,4 +30,5 @@ def to_json(self):
"ingestion_date_column": self.ingestion_date_column,
"simulated_assessment_date": self.simulated_assessment_date,
"simulated_timestamp_precision": self.simulated_timestamp_precision,
+ "to_datetime_kwargs": self.to_datetime_kwargs,
}
diff --git a/metis/utils/disguised_missing_values/fahes/fahes.py b/metis/utils/disguised_missing_values/fahes/fahes.py
index a89b9a9..67b9381 100644
--- a/metis/utils/disguised_missing_values/fahes/fahes.py
+++ b/metis/utils/disguised_missing_values/fahes/fahes.py
@@ -6,6 +6,8 @@

import pandas as pd

+"""FAHES paper: https://raulcastrofernandez.com/papers/kdd18-fahes.pdf, Code: https://github.com/qcri/FAHES_Code.git, and Demo: https://github.com/qcri/Fahes_Demo.git"""
+
FAHES_PRECISION = mean([0.384, 0.484, 0.385, 0.371, 0.522])
FAHES_RECALL = mean([0.952, 0.978, 0.87, 0.929, 0.725])
FAHES_F1 = 2 * FAHES_PRECISION * FAHES_RECALL / (FAHES_PRECISION + FAHES_RECALL)
@@ -48,7 +50,7 @@ def call_fahes(tab_full_name, output_dir):


# Based on https://github.com/qcri/Fahes_Demo.git
-def run_fahes(data: Path | str | pd.DataFrame) -> pd.DataFrame:
+def run_fahes(data: Path | str | pd.DataFrame) -> pd.DataFrame | None:
"""
Run FAHES on the given data file and return the resulting DataFrame. The resulting DataFrame contains the disguised missing values identified by FAHES.
Example resulting DataFrame structure: @@ -76,8 +78,8 @@ def run_fahes(data: Path | str | pd.DataFrame) -> pd.DataFrame: with tempfile.TemporaryDirectory() as results_dir: call_fahes(str(data_file_path.absolute()), results_dir) result_file = Path(results_dir) / ("DMV_" + data_file_path.name) - - return pd.read_csv(result_file) + if result_file.stat().st_size > 0: + return pd.read_csv(result_file) finally: if tmp_file is not None: tmp_file.close() From 7205bd106105a472935dd9a4fad0555e1e7bec2e Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Sun, 22 Feb 2026 18:32:56 +0100 Subject: [PATCH 26/32] Store values as floats instead of numpy floats --- metis/metric/completeness/completeness_nullRatio.py | 4 ++-- metis/metric/consistency/consistency_ruleBasedPipino.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/metis/metric/completeness/completeness_nullRatio.py b/metis/metric/completeness/completeness_nullRatio.py index 10b74f6..421add1 100644 --- a/metis/metric/completeness/completeness_nullRatio.py +++ b/metis/metric/completeness/completeness_nullRatio.py @@ -62,7 +62,7 @@ def null_count(null_mask: pd.Series): columnNames=data.columns.tolist(), DQgranularity="table", DQexplanation={ - "null_count": not_null_ratios["null_count"].sum(), + "null_count": float(not_null_ratios["null_count"].sum()), } ) results.append(result) @@ -89,7 +89,7 @@ def null_count(null_mask: pd.Series): "row" if config.aggregation_axis == "columns" else "column" ), DQexplanation={ - "null_count": row["null_count"], + "null_count": float(row["null_count"]), } ) results.append(result) diff --git a/metis/metric/consistency/consistency_ruleBasedPipino.py b/metis/metric/consistency/consistency_ruleBasedPipino.py index c5f691a..f8660b1 100644 --- a/metis/metric/consistency/consistency_ruleBasedPipino.py +++ b/metis/metric/consistency/consistency_ruleBasedPipino.py @@ -82,12 +82,12 @@ def assess( fulfilled_rules_mask = pd.DataFrame( { - f"rule_{i}": data[col_name].dropna().apply(rule) + f"rule_{i}": data[col_name].apply(rule) for i, rule in enumerate(column_rules) } ) - dq_measurements = fulfilled_rules_mask.sum(axis=1) / len(column_rules) + dq_measurements = fulfilled_rules_mask.mean(axis=1) certainties = self.certainties(fulfilled_rules_mask) for (row_index, dq_value), certainty in zip( From 68e96848abd7e056b7fa094c8be1960459cbce44 Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Sun, 1 Mar 2026 15:39:46 +0100 Subject: [PATCH 27/32] Allow completeness assessment on cell granularity --- .../completeness_nullAndDMVRatio.py | 131 ++++++++++-------- .../completeness_nullAndDMVRatio_config.py | 10 +- .../completeness/completeness_nullRatio.py | 120 +++++++++------- .../completeness_nullRatio_config.py | 7 +- .../consistency_ruleBasedPipino.py | 2 +- 5 files changed, 152 insertions(+), 118 deletions(-) diff --git a/metis/metric/completeness/completeness_nullAndDMVRatio.py b/metis/metric/completeness/completeness_nullAndDMVRatio.py index c3a04f9..9132c27 100644 --- a/metis/metric/completeness/completeness_nullAndDMVRatio.py +++ b/metis/metric/completeness/completeness_nullAndDMVRatio.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Literal import pandas as pd @@ -38,8 +38,6 @@ def assess( config = self.load_config(metric_config, completeness_nullAndDMVRatio_config) - results = [] - dmvs = run_fahes(data) self.logger.info(f"Detected DMVs:\n{dmvs}") @@ -53,77 +51,90 @@ def assess( val = dmv_row["DMV"] marked_cells.loc[data[col] == val, col] = IS_DMV_MARKER - def counts(marks: pd.Series): - return ( 
- (marks == IS_NULL_MARKER).sum(), - (marks == IS_DMV_MARKER).sum(), - len(marks), + completeness = (marked_cells == IS_VALID_MARKER).astype(int) + certainty = self.certainty(marked_cells) + + if config.aggregation_axis is not None: + mean_completeness = completeness.mean(axis=config.aggregation_axis) + mean_certainty = certainty.mean(axis=config.aggregation_axis) + + if config.aggregate_all: + table_completeness = mean_completeness.mean() + table_certainty = mean_certainty.mean() + return [ + DQResult( + mesTime=pd.Timestamp.now(), + DQvalue=table_completeness, + DQdimension=DQDimension.COMPLETENESS, + DQmetric=self.__class__.__name__, + columnNames=data.columns.tolist(), + DQexplanation={"certainty": float(table_certainty)}, + DQgranularity="table", + ) + ] + + return self.create_aggregated_results( + mean_completeness, + mean_certainty, + config.aggregation_axis, + data.columns.tolist(), ) - def completeness(marks: pd.Series): - null_count, dmv_count, total_count = counts(marks) - return (total_count - null_count - dmv_count) / total_count - - def certainty(marks: pd.Series): - null_count, dmv_count, total_count = counts(marks) - return self.certainty(null_count, dmv_count, total_count) + return self.create_flat_results(completeness, certainty) - aggregated_marks = marked_cells.agg( - [completeness, certainty], - axis=config.aggregation_axis, + def certainty(self, marks: pd.DataFrame): + # .replace with a dict sometimes throws an IndexError during pandas memory cleanup. Reason not yet identified, but using chained .replace calls seems to mitigate the issue. + return ( + marks.replace(IS_VALID_MARKER, FAHES_RECALL) + .replace(IS_NULL_MARKER, 1) + .replace(IS_DMV_MARKER, FAHES_PRECISION) ) - if config.aggregation_axis == "index": - aggregated_marks = aggregated_marks.T - - if config.aggregate_all: - table_completeness = aggregated_marks["completeness"].mean() - table_certainty = aggregated_marks["certainty"].mean() - result = DQResult( - mesTime=pd.Timestamp.now(), - DQvalue=table_completeness, - DQdimension=DQDimension.COMPLETENESS, - DQmetric=self.__class__.__name__, - columnNames=data.columns.tolist(), - DQexplanation={"certainty": float(table_certainty)}, - DQgranularity="table", - ) - results.append(result) - return results - - for index, row in aggregated_marks.iterrows(): - row_index = ( - int(str(index)) if config.aggregation_axis == "columns" else None - ) - col_names = ( - data.columns.tolist() - if config.aggregation_axis == "columns" - else [str(index)] - ) + def create_aggregated_results( + self, + mean_completeness: pd.Series, + mean_certainty: pd.Series, + aggregation_axis: Literal["index", "columns"], + columns: List[str], + ) -> List[DQResult]: + results = [] + for (index, completeness), certainty in zip( + mean_completeness.items(), mean_certainty.values + ): + row_index = int(str(index)) if aggregation_axis == "columns" else None + col_names = columns if aggregation_axis == "columns" else [str(index)] result = DQResult( mesTime=pd.Timestamp.now(), - DQvalue=row["completeness"], + DQvalue=completeness, DQdimension=DQDimension.COMPLETENESS, DQmetric=self.__class__.__name__, columnNames=col_names, rowIndex=row_index, - DQexplanation={"certainty": float(row["certainty"])}, - DQgranularity=( - "row" if config.aggregation_axis == "columns" else "column" - ), + DQexplanation={"certainty": float(certainty)}, + DQgranularity=("row" if aggregation_axis == "columns" else "column"), ) results.append(result) return results - def certainty(self, null_count: int, dmv_count: int, 
total_count: int): - minimum = (1 - FAHES_PRECISION) + (1 - FAHES_RECALL) - return ( - 1 - - ( - (1 - FAHES_PRECISION) * (dmv_count / total_count) - + (1 - FAHES_RECALL) * (null_count / total_count) - ) - / minimum - ) + def create_flat_results( + self, completeness: pd.DataFrame, certainty: pd.DataFrame + ) -> List[DQResult]: + results = [] + for col in completeness.columns: + for (row_index, completeness_value), certainty_value in zip( + completeness[col].items(), certainty[col].values + ): + result = DQResult( + mesTime=pd.Timestamp.now(), + DQvalue=float(completeness_value), + DQdimension=DQDimension.COMPLETENESS, + DQmetric=self.__class__.__name__, + columnNames=[col], + rowIndex=int(str(row_index)), + DQexplanation={"certainty": float(certainty_value)}, + DQgranularity="cell", + ) + results.append(result) + return results diff --git a/metis/metric/completeness/completeness_nullAndDMVRatio_config.py b/metis/metric/completeness/completeness_nullAndDMVRatio_config.py index dfaa540..1624307 100644 --- a/metis/metric/completeness/completeness_nullAndDMVRatio_config.py +++ b/metis/metric/completeness/completeness_nullAndDMVRatio_config.py @@ -3,17 +3,19 @@ from metis.metric.config import MetricConfig +VALID_AGGREGATION_AXES = ["index", "columns", None] + @dataclass class completeness_nullAndDMVRatio_config(MetricConfig): """ Configuration class for the completeness_nullAndDMVRatio metric. - :param aggregation_axis: Axis along which to aggregate completeness ('index': aggregate each column; 'columns': aggregate each row). + :param aggregation_axis: Axis along which to aggregate completeness ('index': aggregate each column; 'columns': aggregate each row, None (default): no aggregation). :param aggregate_all: Whether to aggregate all completeness results into a single value for the whole input data. 
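+
+    Example (illustrative values): ``completeness_nullAndDMVRatio_config(aggregation_axis="index")``
+    aggregates each column into one column-level result; the default ``aggregation_axis=None``
+    keeps one cell-level result per value.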
""" - aggregation_axis: Literal["index", "columns"] = "columns" + aggregation_axis: Literal["index", "columns", None] = None aggregate_all: bool = False def to_json(self): @@ -24,9 +26,9 @@ def to_json(self): } def validate(self): - if self.aggregation_axis not in ["index", "columns"]: + if self.aggregation_axis not in VALID_AGGREGATION_AXES: raise ValueError( - f"aggregation_axis must be either 'index' or 'columns' but was {self.aggregation_axis}" + f"aggregation_axis must be one of {VALID_AGGREGATION_AXES} but was {self.aggregation_axis}" ) if not isinstance(self.aggregate_all, bool): raise ValueError( diff --git a/metis/metric/completeness/completeness_nullRatio.py b/metis/metric/completeness/completeness_nullRatio.py index 421add1..cdac396 100644 --- a/metis/metric/completeness/completeness_nullRatio.py +++ b/metis/metric/completeness/completeness_nullRatio.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Literal import pandas as pd @@ -33,65 +33,85 @@ def assess( na_mask = data.isna() - def counts(null_mask: pd.Series): - return null_mask.sum(), len(null_mask) - - def not_null_ratio(null_mask: pd.Series): - null_count, total_count = counts(null_mask) - return (total_count - null_count) / total_count - - def null_count(null_mask: pd.Series): - null_count, _ = counts(null_mask) - return null_count - - not_null_ratios = na_mask.agg( - [not_null_ratio, null_count], - axis=config.aggregation_axis, - ) - - if config.aggregation_axis == "index": - not_null_ratios = not_null_ratios.T - - if config.aggregate_all: - table_completeness = not_null_ratios["not_null_ratio"].mean() - result = DQResult( - mesTime=pd.Timestamp.now(), - DQvalue=table_completeness, - DQdimension=DQDimension.COMPLETENESS, - DQmetric=self.__class__.__name__, - columnNames=data.columns.tolist(), - DQgranularity="table", - DQexplanation={ - "null_count": float(not_null_ratios["null_count"].sum()), - } + completeness = (~na_mask).astype(int) + null_count = na_mask.astype(int) + + if config.aggregation_axis is not None: + mean_completeness = completeness.mean(axis=config.aggregation_axis) + mean_null_count = null_count.sum(axis=config.aggregation_axis) + + if config.aggregate_all: + table_completeness = mean_completeness.mean() + table_null_count = mean_null_count.sum() + + return [ + DQResult( + mesTime=pd.Timestamp.now(), + DQvalue=table_completeness, + DQdimension=DQDimension.COMPLETENESS, + DQmetric=self.__class__.__name__, + columnNames=data.columns.tolist(), + DQgranularity="table", + DQexplanation={ + "null_count": float(table_null_count), + }, + ) + ] + + return self.create_aggregated_results( + mean_completeness, + mean_null_count, + config.aggregation_axis, + data.columns.tolist(), ) - results.append(result) - return results - for index, row in not_null_ratios.iterrows(): - row_index = ( - int(str(index)) if config.aggregation_axis == "columns" else None - ) - col_names = ( - data.columns.tolist() - if config.aggregation_axis == "columns" - else [str(index)] - ) + return self.create_flat_results(completeness, null_count) + + def create_aggregated_results( + self, + mean_completeness: pd.Series, + mean_null_count: pd.Series, + aggregation_axis: Literal["index", "columns"], + columns: List[str], + ) -> List[DQResult]: + results = [] + for (index, completeness), null_count in zip( + mean_completeness.items(), mean_null_count.values + ): + row_index = int(str(index)) if aggregation_axis == "columns" else None + col_names = columns if aggregation_axis == "columns" else [str(index)] result = DQResult( 
mesTime=pd.Timestamp.now(), - DQvalue=row["not_null_ratio"], + DQvalue=completeness, DQdimension=DQDimension.COMPLETENESS, DQmetric=self.__class__.__name__, columnNames=col_names, rowIndex=row_index, - DQgranularity=( - "row" if config.aggregation_axis == "columns" else "column" - ), - DQexplanation={ - "null_count": float(row["null_count"]), - } + DQexplanation={"null_count": float(null_count)}, + DQgranularity=("row" if aggregation_axis == "columns" else "column"), ) results.append(result) return results + + def create_flat_results( + self, completeness: pd.DataFrame, null_count: pd.DataFrame + ) -> List[DQResult]: + results = [] + for col in completeness.columns: + for (row_index, completeness_value), null_count_value in zip( + completeness[col].items(), null_count[col].values + ): + result = DQResult( + mesTime=pd.Timestamp.now(), + DQvalue=float(completeness_value), + DQdimension=DQDimension.COMPLETENESS, + DQmetric=self.__class__.__name__, + columnNames=[col], + rowIndex=int(str(row_index)), + DQexplanation={"null_count": float(null_count_value)}, + DQgranularity="cell", + ) + results.append(result) + return results diff --git a/metis/metric/completeness/completeness_nullRatio_config.py b/metis/metric/completeness/completeness_nullRatio_config.py index a255e03..20cc60d 100644 --- a/metis/metric/completeness/completeness_nullRatio_config.py +++ b/metis/metric/completeness/completeness_nullRatio_config.py @@ -3,6 +3,7 @@ from metis.metric.config import MetricConfig +VALID_AGGREGATION_AXES = ["index", "columns", None] @dataclass class completeness_nullRatio_config(MetricConfig): @@ -13,7 +14,7 @@ class completeness_nullRatio_config(MetricConfig): :param aggregate_all: Whether to aggregate all completeness results into a single value for the whole input data. 
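+
+    Example (illustrative values): ``completeness_nullRatio_config(aggregation_axis="columns", aggregate_all=True)``
+    first aggregates each row and then collapses all row values into a single
+    table-level completeness value.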
""" - aggregation_axis: Literal["index", "columns"] = "columns" + aggregation_axis: Literal["index", "columns", None] = None aggregate_all: bool = False def to_json(self): @@ -24,9 +25,9 @@ def to_json(self): } def validate(self): - if self.aggregation_axis not in ["index", "columns"]: + if self.aggregation_axis not in VALID_AGGREGATION_AXES: raise ValueError( - f"aggregation_axis must be either 'index' or 'columns' but was {self.aggregation_axis}" + f"aggregation_axis must be one of {VALID_AGGREGATION_AXES} but was {self.aggregation_axis}" ) if not isinstance(self.aggregate_all, bool): raise ValueError( diff --git a/metis/metric/consistency/consistency_ruleBasedPipino.py b/metis/metric/consistency/consistency_ruleBasedPipino.py index f8660b1..0b86686 100644 --- a/metis/metric/consistency/consistency_ruleBasedPipino.py +++ b/metis/metric/consistency/consistency_ruleBasedPipino.py @@ -54,7 +54,7 @@ def assess( } ) - dq_measurements = fulfilled_rules_mask.sum(axis=1) / len(tuple_rules) + dq_measurements = fulfilled_rules_mask.mean(axis=1) certainties = self.certainties(fulfilled_rules_mask) for (row_index, dq_value), certainty in zip( dq_measurements.items(), certainties.values From 4030b2078891e7ae165e67505d11d0beeddde385 Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Fri, 6 Mar 2026 16:43:09 +0100 Subject: [PATCH 28/32] Create central Database class to use in different modules --- metis/database.py | 71 ++++++++++++++ metis/database_models.py | 1 + .../completeness_nullAndDMVRatio.py | 6 +- .../consistency_countFDViolations.py | 24 ++--- .../consistency_ruleBasedHinrichs.py | 4 +- .../consistency_ruleBasedPipino.py | 2 +- .../correctness/correctness_heinrich.py | 2 +- .../minimality/minimality_duplicateCount.py | 2 +- .../metric/timeliness/timeliness_heinrich.py | 7 +- .../validity/validity_outOfVocabulary.py | 17 ++-- metis/profiling/data_profile_manager.py | 95 ++++++++++--------- metis/writer/database_writer.py | 18 +--- metis/writer/postgres_writer.py | 17 ---- metis/writer/sqlite_writer.py | 14 --- 14 files changed, 160 insertions(+), 120 deletions(-) create mode 100644 metis/database.py delete mode 100644 metis/writer/postgres_writer.py delete mode 100644 metis/writer/sqlite_writer.py diff --git a/metis/database.py b/metis/database.py new file mode 100644 index 0000000..0f7b9ad --- /dev/null +++ b/metis/database.py @@ -0,0 +1,71 @@ +from typing import Dict, Literal + +from sqlalchemy import create_engine + +from metis.database_models import register_models + + +class Database: + """Provides a singleton reference for the database connection and models. Can be used by different modules to access the database without risking conflicts caused by multiple bases or engines.""" + + _instance: Database | None = None + + def __init__(self, db_type: Literal["sqlite", "postgres"], db_config: Dict): + if Database._instance is not None: + raise RuntimeError( + "Database has already been initialized. Use Database.get_instance() to access the singleton." + ) + + self.engine = self.create_engine(db_type, db_config) + + Base, self.DQResultModel, self.DataProfile = register_models( + db_config.get("table_name", "dq_results") + ) + Base.metadata.create_all(self.engine) + + Database._instance = self + + @classmethod + def get_instance(cls) -> Database: + """Return the current singleton. Raises if not initialized.""" + if cls._instance is None: + raise RuntimeError( + "Database has not been initialized. " + "Call Database.initialize(engine) first." 
+ ) + return cls._instance + + @classmethod + def is_initialized(cls) -> bool: + return cls._instance is not None + + def create_engine(self, db_type: Literal["sqlite", "postgres"], db_config: Dict): + if db_type == "sqlite": + return self.create_sqlite_engine(db_config) + elif db_type == "postgres": + return self.create_postgres_engine(db_config) + raise ValueError(f"Unsupported database type: {db_type}") + + def create_sqlite_engine(self, db_config: Dict): + required_keys = "db_name" + if not all(k in db_config for k in required_keys): + raise ValueError( + f"SQLite database config must include the following fields: {required_keys}." + ) + + return create_engine( + f"sqlite:///{db_config['db_name']}", + echo=db_config.get("echo", False), + ) + + def create_postgres_engine(self, db_config: Dict): + required_keys = ("db_user", "db_pass", "db_name", "db_host", "db_port") + if not all(k in db_config for k in required_keys): + raise ValueError( + f"Postgres database config must include the following fields: {required_keys}." + ) + + return create_engine( + f"postgresql://{db_config['db_user']}:{db_config['db_pass']}@{db_config['db_host']}:{db_config['db_port']}/{db_config['db_name']}", + echo=db_config.get("echo", False), + ) diff --git a/metis/database_models.py b/metis/database_models.py index b33f8d8..04a5495 100644 --- a/metis/database_models.py +++ b/metis/database_models.py @@ -7,6 +7,7 @@ def register_models(results_table_name: str): """Register the SQLAlchemy models for the database tables based on initial configuration. Every call creates a new SQLAlchemy base, which is not bound to any engine yet. Use the Database singleton for stable references to the models and engine.""" + class Base(DeclarativeBase): pass diff --git a/metis/metric/completeness/completeness_nullAndDMVRatio.py b/metis/metric/completeness/completeness_nullAndDMVRatio.py index 9132c27..814f475 100644 --- a/metis/metric/completeness/completeness_nullAndDMVRatio.py +++ b/metis/metric/completeness/completeness_nullAndDMVRatio.py @@ -63,7 +63,7 @@ def assess( table_certainty = mean_certainty.mean() return [ DQResult( - mesTime=pd.Timestamp.now(), + timestamp=pd.Timestamp.now(), DQvalue=table_completeness, DQdimension=DQDimension.COMPLETENESS, DQmetric=self.__class__.__name__, @@ -105,7 +105,7 @@ def create_aggregated_results( col_names = columns if aggregation_axis == "columns" else [str(index)] result = DQResult( - mesTime=pd.Timestamp.now(), + timestamp=pd.Timestamp.now(), DQvalue=completeness, DQdimension=DQDimension.COMPLETENESS, DQmetric=self.__class__.__name__, @@ -127,7 +127,7 @@ def create_flat_results( completeness[col].items(), certainty[col].values ): result = DQResult( - mesTime=pd.Timestamp.now(), + timestamp=pd.Timestamp.now(), DQvalue=float(completeness_value), DQdimension=DQDimension.COMPLETENESS, DQmetric=self.__class__.__name__, diff --git a/metis/metric/consistency/consistency_countFDViolations.py b/metis/metric/consistency/consistency_countFDViolations.py index eb64658..ea600db 100644 --- a/metis/metric/consistency/consistency_countFDViolations.py +++ b/metis/metric/consistency/consistency_countFDViolations.py @@ -56,18 +56,18 @@ def assess( # for the same determinant (FD violation) violations = grouped[grouped > 1].index.tolist() - consistency = 1 - (len(violations) / len(data[determinant])) + consistency = 1 - (len(violations) / len(data[determinant])) - result = DQResult( - timestamp=pd.Timestamp.now(), - DQdimension=DQDimension.CONSISTENCY, - DQmetric="CountFDViolations", - DQgranularity="table", 
- DQvalue=consistency, - DQexplanation={f"{determinant}:{dependent}": violations}, # FD - columnNames=[determinant], - configJson=metric_conf, - ) - results.append(result) + result = DQResult( + timestamp=pd.Timestamp.now(), + DQdimension=DQDimension.CONSISTENCY, + DQmetric=self.__class__.__name__, + DQgranularity="table", + DQvalue=consistency, + DQexplanation={f"{determinant}:{dependent}": violations}, # FD + columnNames=[determinant], + configJson=metric_conf, + ) + results.append(result) return results diff --git a/metis/metric/consistency/consistency_ruleBasedHinrichs.py b/metis/metric/consistency/consistency_ruleBasedHinrichs.py index 5cc9fc0..ab43931 100644 --- a/metis/metric/consistency/consistency_ruleBasedHinrichs.py +++ b/metis/metric/consistency/consistency_ruleBasedHinrichs.py @@ -57,7 +57,7 @@ def assess( for row_index, dq_value in dq_measurements.items(): results.append( DQResult( - mesTime=pd.Timestamp.now(), + timestamp=pd.Timestamp.now(), DQvalue=dq_value, DQdimension=DQDimension.CONSISTENCY, DQmetric=self.__class__.__name__, @@ -92,7 +92,7 @@ def assess( for row_index, dq_value in dq_measurements.items(): results.append( DQResult( - mesTime=pd.Timestamp.now(), + timestamp=pd.Timestamp.now(), DQvalue=dq_value, DQdimension=DQDimension.CONSISTENCY, DQmetric=self.__class__.__name__, diff --git a/metis/metric/consistency/consistency_ruleBasedPipino.py b/metis/metric/consistency/consistency_ruleBasedPipino.py index 0b86686..0f15da0 100644 --- a/metis/metric/consistency/consistency_ruleBasedPipino.py +++ b/metis/metric/consistency/consistency_ruleBasedPipino.py @@ -114,7 +114,7 @@ def create_result( self, dq_value: float, col_name: str | None, row_index: int, certainty: float ) -> DQResult: return DQResult( - mesTime=pd.Timestamp.now(), + timestamp=pd.Timestamp.now(), DQvalue=dq_value, DQdimension=DQDimension.CONSISTENCY, DQmetric=self.__class__.__name__, diff --git a/metis/metric/correctness/correctness_heinrich.py b/metis/metric/correctness/correctness_heinrich.py index 5e361a8..0ca4b9c 100644 --- a/metis/metric/correctness/correctness_heinrich.py +++ b/metis/metric/correctness/correctness_heinrich.py @@ -41,7 +41,7 @@ def assess( ) result = DQResult( - mesTime=pd.Timestamp.now(), + timestamp=pd.Timestamp.now(), DQvalue=measurement, DQdimension=DQDimension.CORRECTNESS, DQmetric=self.__class__.__name__, diff --git a/metis/metric/minimality/minimality_duplicateCount.py b/metis/metric/minimality/minimality_duplicateCount.py index 6ecb8a8..2800e6e 100644 --- a/metis/metric/minimality/minimality_duplicateCount.py +++ b/metis/metric/minimality/minimality_duplicateCount.py @@ -31,7 +31,7 @@ def assess(self, data: pd.DataFrame, reference: Union[pd.DataFrame, None] = None result = DQResult( timestamp=pd.Timestamp.now(), DQdimension=DQDimension.MINIMALITY, - DQmetric="DuplicateCount", + DQmetric=self.__class__.__name__, DQgranularity="column", DQvalue=minimality, DQexplanation=annotations, diff --git a/metis/metric/timeliness/timeliness_heinrich.py b/metis/metric/timeliness/timeliness_heinrich.py index 4c0826a..753a02f 100644 --- a/metis/metric/timeliness/timeliness_heinrich.py +++ b/metis/metric/timeliness/timeliness_heinrich.py @@ -11,16 +11,11 @@ ) from metis.utils.datetime.datetime_precision import determine_datetime_precision from metis.utils.dq_dimension import DQDimension -from metis.utils.logging import logger as main_logger from metis.utils.logging import warn_unconfigured_columns from metis.utils.result import DQResult class timeliness_heinrich(Metric): - def __init__(self) 
-> None: - super().__init__() - self.logger = main_logger.getChild(self.__class__.__name__) - def assess( self, data: pd.DataFrame, @@ -98,7 +93,7 @@ def assess( timeliness.items(), certainty.items() ): result = DQResult( - mesTime=pd.Timestamp.now(), + timestamp=pd.Timestamp.now(), DQvalue=timeliness_value, DQdimension=DQDimension.TIMELINESS, DQmetric=self.__class__.__name__, diff --git a/metis/metric/validity/validity_outOfVocabulary.py b/metis/metric/validity/validity_outOfVocabulary.py index cf10769..7bf9a99 100644 --- a/metis/metric/validity/validity_outOfVocabulary.py +++ b/metis/metric/validity/validity_outOfVocabulary.py @@ -1,16 +1,21 @@ import re -import pandas as pd from typing import List, Union + +import nltk +import pandas as pd +from nltk.corpus import words as nltk_words + from metis.metric.config import MetricConfig +from metis.metric.metric import Metric from metis.utils.dq_dimension import DQDimension from metis.utils.result import DQResult -from metis.metric.metric import Metric -import nltk -from nltk.corpus import words as nltk_words -nltk.download("words", quiet=True) class validity_outOfVocabulary(Metric): + def __init__(self) -> None: + super().__init__() + nltk.download("words", quiet=True) + def assess(self, data: pd.DataFrame, reference: Union[pd.DataFrame, set, None] = None, metric_config: Union[MetricConfig, str, None] = None) -> List[DQResult]: """ General vocabulary check at token level. @@ -66,7 +71,7 @@ def is_valid(text: str) -> bool: result = DQResult( timestamp=pd.Timestamp.now(), DQdimension=DQDimension.VALIDITY, - DQmetric="OutOfVocabulary", + DQmetric=self.__class__.__name__, DQgranularity="column", DQvalue=dq_value, DQexplanation=annotations, diff --git a/metis/profiling/data_profile_manager.py b/metis/profiling/data_profile_manager.py index 946bc4a..3699c0b 100644 --- a/metis/profiling/data_profile_manager.py +++ b/metis/profiling/data_profile_manager.py @@ -4,10 +4,10 @@ import threading from typing import Any, Dict, List, Optional -from sqlalchemy import Engine, create_engine as sa_create_engine, delete, select +from sqlalchemy import delete, select from sqlalchemy.orm import Session -from metis.database_models import Base, DataProfile +from metis.database import Database class DataProfileManager: @@ -31,7 +31,7 @@ class DataProfileManager: @classmethod def initialize( cls, - engine_or_url: Engine | str, + database: Database, ignore_cache: bool = False, overwrite_cache: bool = False, clear_cache: bool = False, @@ -44,21 +44,20 @@ def initialize( clear_cache: Delete all stored profiles at startup, then cache normally. """ with cls._lock: - if isinstance(engine_or_url, str): - engine = sa_create_engine(engine_or_url) - else: - engine = engine_or_url - Base.metadata.create_all(engine) if clear_cache: - with Session(engine) as session: - session.execute(delete(DataProfile)) + with Session(database.engine) as session: + session.execute(delete(database.DataProfile)) session.commit() - cls._instance = cls(engine, ignore_cache=ignore_cache, overwrite_cache=overwrite_cache) + cls._instance = cls( + database=database, + ignore_cache=ignore_cache, + overwrite_cache=overwrite_cache, + ) return cls._instance @classmethod def get_instance(cls) -> DataProfileManager: - """Return the current singleton. Raises if not initialized.""" + """Return the current singleton. Raises if not initialized.""" if cls._instance is None: raise RuntimeError( "DataProfileManager has not been initialized. 
" @@ -75,16 +74,16 @@ def shutdown(cls) -> None: """Shutdown the singleton and dispose the engine.""" with cls._lock: if cls._instance is not None: - cls._instance._engine.dispose() + cls._instance._database.engine.dispose() cls._instance = None def __init__( self, - engine: Engine, + database: Database, ignore_cache: bool = False, overwrite_cache: bool = False, ) -> None: - self._engine = engine + self._database = database self._dataset: Optional[str] = None self._table: Optional[str] = None self._mem_cache: Dict[str, Any] = {} @@ -141,12 +140,12 @@ def lookup( return self._mem_cache[key] # slow path: DB - with Session(self._engine) as session: + with Session(self._database.engine) as session: stmt = ( - select(DataProfile) - .where(DataProfile.dataset == self._dataset) - .where(DataProfile.table_name == self._table) - .where(DataProfile.dp_task_name == dp_task_name) + select(self._database.DataProfile) + .where(self._database.DataProfile.dataset == self._dataset) + .where(self._database.DataProfile.table_name == self._table) + .where(self._database.DataProfile.dp_task_name == dp_task_name) ) for row in session.execute(stmt).scalars(): if sorted(row.column_names) == sorted(column_names): @@ -181,13 +180,13 @@ def store( serialized, result_type = self._serialize(value) - with Session(self._engine) as session: + with Session(self._database.engine) as session: # Find existing row with same logical key stmt = ( - select(DataProfile) - .where(DataProfile.dataset == ds) - .where(DataProfile.table_name == tbl) - .where(DataProfile.dp_task_name == dp_task_name) + select(self._database.DataProfile) + .where(self._database.DataProfile.dataset == ds) + .where(self._database.DataProfile.table_name == tbl) + .where(self._database.DataProfile.dp_task_name == dp_task_name) ) existing = None for row in session.execute(stmt).scalars(): @@ -203,17 +202,19 @@ def store( existing.profile_type = profile_type existing.source = source else: - session.add(DataProfile( - dataset=ds, - table_name=tbl, - column_names=column_names, - dp_task_name=dp_task_name, - task_config=task_config, - profile_type=profile_type, - dp_result_value=serialized, - result_type=result_type, - source=source, - )) + session.add( + self._database.DataProfile( + dataset=ds, + table_name=tbl, + column_names=column_names, + dp_task_name=dp_task_name, + task_config=task_config, + profile_type=profile_type, + dp_result_value=serialized, + result_type=result_type, + source=source, + ) + ) session.commit() # update in-memory cache @@ -316,12 +317,12 @@ def _query_by_task( tbl = table or self._table if ds is None or tbl is None: return [] - with Session(self._engine) as session: + with Session(self._database.engine) as session: stmt = ( - select(DataProfile) - .where(DataProfile.dataset == ds) - .where(DataProfile.table_name == tbl) - .where(DataProfile.dp_task_name == dp_task_name) + select(self._database.DataProfile) + .where(self._database.DataProfile.dataset == ds) + .where(self._database.DataProfile.table_name == tbl) + .where(self._database.DataProfile.dp_task_name == dp_task_name) ) return [ self._deserialize(row.dp_result_value, row.result_type) @@ -363,7 +364,11 @@ def to_json_safe(v: Any) -> Any: } }, "minhash" - if isinstance(value, dict) and value and isinstance(next(iter(value.values())), _MinHash): + if ( + isinstance(value, dict) + and value + and isinstance(next(iter(value.values())), _MinHash) + ): return { "v": { k: { @@ -392,16 +397,18 @@ def _deserialize(payload: Optional[dict], result_type: str) -> Any: if result_type == 
"series": return pd.Series(raw) if result_type == "minhash": - from datasketch import MinHash as _MinHash import numpy as np + from datasketch import MinHash as _MinHash + return _MinHash( num_perm=raw["num_perm"], seed=raw["seed"], hashvalues=np.array(raw["hashvalues"], dtype=np.uint64), ) if result_type == "minhash_dict": - from datasketch import MinHash as _MinHash import numpy as np + from datasketch import MinHash as _MinHash + return { k: _MinHash( num_perm=v["num_perm"], diff --git a/metis/writer/database_writer.py b/metis/writer/database_writer.py index 35ad81c..026345f 100644 --- a/metis/writer/database_writer.py +++ b/metis/writer/database_writer.py @@ -1,26 +1,18 @@ -from typing import Dict, List +from typing import List -from sqlalchemy import Engine from sqlalchemy.orm import Session from tqdm import tqdm -from metis.database_models import register_models +from metis.database import Database from metis.utils.numbers import format_count from metis.utils.result import DQResult from metis.writer.writer import DQResultWriter class DatabaseWriter(DQResultWriter): - def __init__(self, writer_config: Dict) -> None: - self.engine = self.create_engine(writer_config) - - Base, self.DQResultModel = register_models( - writer_config.get("table_name", "dq_results") - ) - Base.metadata.create_all(self.engine) - - def create_engine(self, writer_config: Dict) -> Engine: - raise NotImplementedError("Subclasses must implement the create_engine method.") + def __init__(self, db: Database) -> None: + self.engine = db.engine + self.DQResultModel = db.DQResultModel def write(self, results: List[DQResult]) -> None: with Session(self.engine) as session: diff --git a/metis/writer/postgres_writer.py b/metis/writer/postgres_writer.py deleted file mode 100644 index 3f0b65b..0000000 --- a/metis/writer/postgres_writer.py +++ /dev/null @@ -1,17 +0,0 @@ -from sqlalchemy import Engine, create_engine - -from metis.writer.database_writer import DatabaseWriter - - -class PostgresWriter(DatabaseWriter): - def create_engine(self, writer_config) -> Engine: - required_keys = ("db_user", "db_pass", "db_name", "db_host", "db_port") - if not all(k in writer_config for k in required_keys): - raise ValueError( - "Postgres writer config must include 'db_user', 'db_pass', 'db_name', 'db_host', and 'db_port' fields." 
- ) - - return create_engine( - f"postgresql://{writer_config['db_user']}:{writer_config['db_pass']}@{writer_config['db_host']}:{writer_config['db_port']}/{writer_config['db_name']}", - echo=writer_config.get("echo", False), - ) diff --git a/metis/writer/sqlite_writer.py b/metis/writer/sqlite_writer.py deleted file mode 100644 index aa8da27..0000000 --- a/metis/writer/sqlite_writer.py +++ /dev/null @@ -1,14 +0,0 @@ -from sqlalchemy import Engine, create_engine - -from metis.writer.database_writer import DatabaseWriter - - -class SQLiteWriter(DatabaseWriter): - def create_engine(self, writer_config) -> Engine: - if "db_name" not in writer_config: - raise ValueError("SQLite writer config must include 'db_name' field.") - - return create_engine( - f"sqlite:///{writer_config['db_name']}", - echo=writer_config.get("echo", False), - ) From 6da26ebe4f91f52f67f4ce41c9eb26c159d4f225 Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Fri, 6 Mar 2026 17:00:51 +0100 Subject: [PATCH 29/32] Add enum for data quality assessment granularity --- metis/database.py | 2 +- .../completeness/completeness_nullAndDMVRatio.py | 11 ++++++++--- .../completeness/completeness_nullRatio.py | 11 ++++++++--- .../consistency/consistency_countFDViolations.py | 3 ++- .../consistency/consistency_ruleBasedHinrichs.py | 5 +++-- .../consistency/consistency_ruleBasedPipino.py | 3 ++- metis/metric/correctness/correctness_heinrich.py | 3 ++- .../minimality/minimality_duplicateCount.py | 16 ++++++++++++---- metis/metric/timeliness/timeliness_heinrich.py | 3 ++- .../metric/validity/validity_outOfVocabulary.py | 3 ++- metis/utils/dq_granularity.py | 10 ++++++++++ metis/utils/result.py | 5 +++-- 12 files changed, 55 insertions(+), 20 deletions(-) create mode 100644 metis/utils/dq_granularity.py diff --git a/metis/database.py b/metis/database.py index 0f7b9ad..d78a0e6 100644 --- a/metis/database.py +++ b/metis/database.py @@ -47,7 +47,7 @@ def create_engine(self, db_type: Literal["sqlite", "postgres"], db_config: Dict) raise ValueError(f"Unsupported database type: {db_type}") def create_sqlite_engine(self, db_config: Dict): - required_keys = "db_name" + required_keys = ("db_name",) if not all(k in db_config for k in required_keys): raise ValueError( f"SQLite database config must include the following fields: {required_keys}." 
diff --git a/metis/metric/completeness/completeness_nullAndDMVRatio.py b/metis/metric/completeness/completeness_nullAndDMVRatio.py index 814f475..d7426fd 100644 --- a/metis/metric/completeness/completeness_nullAndDMVRatio.py +++ b/metis/metric/completeness/completeness_nullAndDMVRatio.py @@ -13,6 +13,7 @@ run_fahes, ) from metis.utils.dq_dimension import DQDimension +from metis.utils.dq_granularity import DQGranularity from metis.utils.result import DQResult IS_VALID_MARKER = 0 @@ -69,7 +70,7 @@ def assess( DQmetric=self.__class__.__name__, columnNames=data.columns.tolist(), DQexplanation={"certainty": float(table_certainty)}, - DQgranularity="table", + DQgranularity=DQGranularity.TABLE, ) ] @@ -112,7 +113,11 @@ def create_aggregated_results( columnNames=col_names, rowIndex=row_index, DQexplanation={"certainty": float(certainty)}, - DQgranularity=("row" if aggregation_axis == "columns" else "column"), + DQgranularity=( + DQGranularity.ROW + if aggregation_axis == "columns" + else DQGranularity.COLUMN + ), ) results.append(result) @@ -134,7 +139,7 @@ def create_flat_results( columnNames=[col], rowIndex=int(str(row_index)), DQexplanation={"certainty": float(certainty_value)}, - DQgranularity="cell", + DQgranularity=DQGranularity.CELL, ) results.append(result) return results diff --git a/metis/metric/completeness/completeness_nullRatio.py b/metis/metric/completeness/completeness_nullRatio.py index 925e0a7..9a380fb 100644 --- a/metis/metric/completeness/completeness_nullRatio.py +++ b/metis/metric/completeness/completeness_nullRatio.py @@ -8,6 +8,7 @@ from metis.metric.config import MetricConfig from metis.metric.metric import Metric from metis.utils.dq_dimension import DQDimension +from metis.utils.dq_granularity import DQGranularity from metis.utils.result import DQResult @@ -51,7 +52,7 @@ def assess( DQdimension=DQDimension.COMPLETENESS, DQmetric=self.__class__.__name__, columnNames=data.columns.tolist(), - DQgranularity="table", + DQgranularity=DQGranularity.TABLE, DQexplanation={ "null_count": float(table_null_count), }, @@ -89,7 +90,11 @@ def create_aggregated_results( columnNames=col_names, rowIndex=row_index, DQexplanation={"null_count": float(null_count)}, - DQgranularity=("row" if aggregation_axis == "columns" else "column"), + DQgranularity=( + DQGranularity.ROW + if aggregation_axis == "columns" + else DQGranularity.COLUMN + ), ) results.append(result) @@ -111,7 +116,7 @@ def create_flat_results( columnNames=[col], rowIndex=int(str(row_index)), DQexplanation={"null_count": float(null_count_value)}, - DQgranularity="cell", + DQgranularity=DQGranularity.CELL, ) results.append(result) return results diff --git a/metis/metric/consistency/consistency_countFDViolations.py b/metis/metric/consistency/consistency_countFDViolations.py index ea600db..5bae10f 100644 --- a/metis/metric/consistency/consistency_countFDViolations.py +++ b/metis/metric/consistency/consistency_countFDViolations.py @@ -6,6 +6,7 @@ from metis.metric.config import MetricConfig from metis.metric.metric import Metric from metis.utils.dq_dimension import DQDimension +from metis.utils.dq_granularity import DQGranularity from metis.utils.result import DQResult @@ -62,7 +63,7 @@ def assess( timestamp=pd.Timestamp.now(), DQdimension=DQDimension.CONSISTENCY, DQmetric=self.__class__.__name__, - DQgranularity="table", + DQgranularity=DQGranularity.TABLE, DQvalue=consistency, DQexplanation={f"{determinant}:{dependent}": violations}, # FD columnNames=[determinant], diff --git 
a/metis/metric/consistency/consistency_ruleBasedHinrichs.py b/metis/metric/consistency/consistency_ruleBasedHinrichs.py index ab43931..ee4ad75 100644 --- a/metis/metric/consistency/consistency_ruleBasedHinrichs.py +++ b/metis/metric/consistency/consistency_ruleBasedHinrichs.py @@ -9,6 +9,7 @@ ) from metis.metric.metric import Metric from metis.utils.dq_dimension import DQDimension +from metis.utils.dq_granularity import DQGranularity from metis.utils.logging import warn_unconfigured_columns from metis.utils.result import DQResult @@ -66,7 +67,7 @@ def assess( DQexplanation={ "certainty": self.certainty(dq_value, min_quality) }, - DQgranularity="row", + DQgranularity=DQGranularity.ROW, ) ) @@ -101,7 +102,7 @@ def assess( DQexplanation={ "certainty": self.certainty(dq_value, min_quality) }, - DQgranularity="cell", + DQgranularity=DQGranularity.CELL, ) ) diff --git a/metis/metric/consistency/consistency_ruleBasedPipino.py b/metis/metric/consistency/consistency_ruleBasedPipino.py index 0f15da0..45dcc86 100644 --- a/metis/metric/consistency/consistency_ruleBasedPipino.py +++ b/metis/metric/consistency/consistency_ruleBasedPipino.py @@ -8,6 +8,7 @@ ) from metis.metric.metric import Metric from metis.utils.dq_dimension import DQDimension +from metis.utils.dq_granularity import DQGranularity from metis.utils.logging import warn_unconfigured_columns from metis.utils.result import DQResult @@ -123,5 +124,5 @@ def create_result( DQexplanation={ "certainty": certainty, }, - DQgranularity="cell" if col_name else "row", + DQgranularity=DQGranularity.CELL if col_name else DQGranularity.ROW, ) diff --git a/metis/metric/correctness/correctness_heinrich.py b/metis/metric/correctness/correctness_heinrich.py index 0ca4b9c..c3e33a1 100644 --- a/metis/metric/correctness/correctness_heinrich.py +++ b/metis/metric/correctness/correctness_heinrich.py @@ -5,6 +5,7 @@ from metis.metric.config import MetricConfig from metis.metric.metric import Metric from metis.utils.dq_dimension import DQDimension +from metis.utils.dq_granularity import DQGranularity from metis.utils.numbers import clamp from metis.utils.result import DQResult from metis.utils.similarity_measures.levenshtein_distance import levenshtein_distance @@ -47,7 +48,7 @@ def assess( DQmetric=self.__class__.__name__, columnNames=[col_name], rowIndex=row_index, - DQgranularity="cell", + DQgranularity=DQGranularity.CELL, ) results.append(result) diff --git a/metis/metric/minimality/minimality_duplicateCount.py b/metis/metric/minimality/minimality_duplicateCount.py index 2800e6e..1320b65 100644 --- a/metis/metric/minimality/minimality_duplicateCount.py +++ b/metis/metric/minimality/minimality_duplicateCount.py @@ -1,13 +1,21 @@ -import pandas as pd from typing import List, Union +import pandas as pd + from metis.metric.config import MetricConfig +from metis.metric.metric import Metric from metis.utils.dq_dimension import DQDimension +from metis.utils.dq_granularity import DQGranularity from metis.utils.result import DQResult -from metis.metric.metric import Metric + class minimality_duplicateCount(Metric): - def assess(self, data: pd.DataFrame, reference: Union[pd.DataFrame, None] = None, metric_config: Union[MetricConfig, str, None] = None) -> List[DQResult]: + def assess( + self, + data: pd.DataFrame, + reference: Union[pd.DataFrame, None] = None, + metric_config: Union[MetricConfig, str, None] = None, + ) -> List[DQResult]: """ Assess the minimality for each attribute of a dataset by checking for unique values. 
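+
+        Example (illustrative): ``minimality_duplicateCount().assess(df)`` returns one
+        column-level DQResult per attribute of ``df``, with information about duplicated
+        values recorded in ``DQexplanation``.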
@@ -32,7 +40,7 @@ def assess(self, data: pd.DataFrame, reference: Union[pd.DataFrame, None] = None timestamp=pd.Timestamp.now(), DQdimension=DQDimension.MINIMALITY, DQmetric=self.__class__.__name__, - DQgranularity="column", + DQgranularity=DQGranularity.COLUMN, DQvalue=minimality, DQexplanation=annotations, columnNames=[column], diff --git a/metis/metric/timeliness/timeliness_heinrich.py b/metis/metric/timeliness/timeliness_heinrich.py index 753a02f..e89aa86 100644 --- a/metis/metric/timeliness/timeliness_heinrich.py +++ b/metis/metric/timeliness/timeliness_heinrich.py @@ -11,6 +11,7 @@ ) from metis.utils.datetime.datetime_precision import determine_datetime_precision from metis.utils.dq_dimension import DQDimension +from metis.utils.dq_granularity import DQGranularity from metis.utils.logging import warn_unconfigured_columns from metis.utils.result import DQResult @@ -102,7 +103,7 @@ def assess( DQexplanation={ "certainty": certainty_value, }, - DQgranularity="cell", + DQgranularity=DQGranularity.CELL, ) results.append(result) diff --git a/metis/metric/validity/validity_outOfVocabulary.py b/metis/metric/validity/validity_outOfVocabulary.py index 7bf9a99..2075124 100644 --- a/metis/metric/validity/validity_outOfVocabulary.py +++ b/metis/metric/validity/validity_outOfVocabulary.py @@ -8,6 +8,7 @@ from metis.metric.config import MetricConfig from metis.metric.metric import Metric from metis.utils.dq_dimension import DQDimension +from metis.utils.dq_granularity import DQGranularity from metis.utils.result import DQResult @@ -72,7 +73,7 @@ def is_valid(text: str) -> bool: timestamp=pd.Timestamp.now(), DQdimension=DQDimension.VALIDITY, DQmetric=self.__class__.__name__, - DQgranularity="column", + DQgranularity=DQGranularity.COLUMN, DQvalue=dq_value, DQexplanation=annotations, columnNames=[column], diff --git a/metis/utils/dq_granularity.py b/metis/utils/dq_granularity.py new file mode 100644 index 0000000..b8c87c2 --- /dev/null +++ b/metis/utils/dq_granularity.py @@ -0,0 +1,10 @@ +from enum import StrEnum + + +class DQGranularity(StrEnum): + """Data Quality Granularity Enum. Primarily used for labeling DQResults inside each metric implementation with the DQ Granularity they assess.""" + + CELL = "cell" + ROW = "row" + COLUMN = "column" + TABLE = "table" diff --git a/metis/utils/result.py b/metis/utils/result.py index f21fce9..bba02a6 100644 --- a/metis/utils/result.py +++ b/metis/utils/result.py @@ -2,6 +2,7 @@ import pandas as pd from metis.utils.dq_dimension import DQDimension +from metis.utils.dq_granularity import DQGranularity class DQResult: def __init__( @@ -9,7 +10,7 @@ def __init__( timestamp: pd.Timestamp, DQdimension: DQDimension, DQmetric: str, - DQgranularity: str, + DQgranularity: DQGranularity, DQvalue: float, DQexplanation: Union[dict, None] = None, runtime: Union[float, None] = None, @@ -26,7 +27,7 @@ def __init__( - `timestamp: pd.Timestamp`: The time at which the result was assessed. - `DQdimension: DQDimension`: Data quality dimension assessed (e.g. DQDimension.COMPLETENESS, DQDimension.ACCURACY). - `DQmetric: str`: Name of the specific metric within the dimension. - - `DQgranularity: str`: Granularity of the metric (e.g. 'column', 'table', 'cell', 'row'). + - `DQgranularity: DQGranularity`: Granularity of the metric (e.g. DQGranularity.COLUMN, DQGranularity.TABLE, DQGranularity.CELL, DQGranularity.ROW). - `DQvalue: float`: Numeric outcome of the assessment (quantitative only). 
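+
+        Example (illustrative values):
+        ``DQResult(timestamp=pd.Timestamp.now(), DQdimension=DQDimension.COMPLETENESS,
+        DQmetric="completeness_nullRatio", DQgranularity=DQGranularity.COLUMN,
+        DQvalue=0.98, columnNames=["age"])``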
Optional arguments From 15cbb0b53bdaa2ec570b64f81df4ee71391a04f0 Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Fri, 6 Mar 2026 17:27:13 +0100 Subject: [PATCH 30/32] Remove unused config parameter from demo --- demo/getting_started.py | 7 +++++-- metis/metric/metric.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/demo/getting_started.py b/demo/getting_started.py index 34b106f..244c74b 100644 --- a/demo/getting_started.py +++ b/demo/getting_started.py @@ -5,6 +5,9 @@ orchestrator.load(data_loader_configs=["data/adult.json"]) -orchestrator.assess(metrics=["completeness_nullRatio"], metric_configs=['{"measure_runtime": true}']) +orchestrator.assess(metrics=["completeness_nullRatio"], metric_configs=[""]) orchestrator.assess(metrics=["minimality_duplicateCount"], metric_configs=[None]) -orchestrator.assess(metrics=["validity_outOfVocabulary"], metric_configs=['{"use_nltk": true, "lowercase": true}']) \ No newline at end of file +orchestrator.assess( + metrics=["validity_outOfVocabulary"], + metric_configs=['{"use_nltk": true, "lowercase": true}'], +) diff --git a/metis/metric/metric.py b/metis/metric/metric.py index f858bbb..62a4da1 100644 --- a/metis/metric/metric.py +++ b/metis/metric/metric.py @@ -96,7 +96,7 @@ def load_config(self, config: Any, model: type[C]) -> C: with open(config, "r") as f: config_dict = json.load(f) else: - config_dict = json.loads(config) + config_dict = json.loads(config) if len(config) > 0 else {} parsed_config = model(**config_dict) except Exception as e: From ebb932b1145e12b5bdcf32acb979532d3adbc1cf Mon Sep 17 00:00:00 2001 From: jb3rndt Date: Mon, 9 Mar 2026 13:41:29 +0100 Subject: [PATCH 31/32] Allow column-specific timeliness configuration --- .../metric/timeliness/timeliness_heinrich.py | 64 +++++++++---------- .../timeliness/timeliness_heinrich_config.py | 37 +++++++---- 2 files changed, 54 insertions(+), 47 deletions(-) diff --git a/metis/metric/timeliness/timeliness_heinrich.py b/metis/metric/timeliness/timeliness_heinrich.py index e89aa86..dd1c73d 100644 --- a/metis/metric/timeliness/timeliness_heinrich.py +++ b/metis/metric/timeliness/timeliness_heinrich.py @@ -38,49 +38,45 @@ def assess( ) config = self.load_config(metric_config, timeliness_heinrich_config) - - ingestion_date_column = config.ingestion_date_column - assessment_date = pd.to_datetime( - config.simulated_assessment_date or pd.Timestamp.now() - ) - results = [] - - if not ingestion_date_column or ingestion_date_column not in data.columns: - self.logger.warning( - f"Ingestion date column '{ingestion_date_column}' is not present in the data." 
-            )
-            return results
-
         warn_unconfigured_columns(
             self.logger,
             set(data.columns),
-            set(config.decline_rate_per_column.keys()),
-            "decline rates",
+            set(config.timeliness_config_per_column.keys()),
+            "timeliness configuration",
         )

-        ingestion_dates = pd.to_datetime(
-            data[ingestion_date_column], **(config.to_datetime_kwargs or {})
-        )
-        ages_in_days = (
-            (assessment_date - ingestion_dates).dt.total_seconds() / 60 / 60 / 24
-        )
-        precision_of_dates = (
-            pd.Series(
-                [config.simulated_timestamp_precision] * len(data), index=data.index
-            )
-            if config.simulated_timestamp_precision
-            else data[ingestion_date_column].apply(determine_datetime_precision)
-        )
-        age_and_precision = pd.DataFrame(
-            {"age": ages_in_days, "precision": precision_of_dates}
-        )

-        for col_name in data.columns:
-            decline_rate = config.decline_rate_per_column.get(col_name)
-            if decline_rate is None:
-                continue
+        for col_name, col_config in config.timeliness_config_per_column.items():
+            ingestion_date_column = col_config.ingestion_date_column
+            assessment_date = pd.to_datetime(
+                col_config.simulated_assessment_date or pd.Timestamp.now()
+            )
+
+            if not ingestion_date_column or ingestion_date_column not in data.columns:
+                self.logger.warning(
+                    f"Ingestion date column '{ingestion_date_column}' is not present in the data. Skipping assessment for column '{col_name}'."
+                )
+                # Skip only this column, as the log message announces; other configured columns are still assessed.
+                continue
+
+            ingestion_dates = pd.to_datetime(
+                data[ingestion_date_column], **(col_config.to_datetime_kwargs or {})
+            )
+            ages_in_days = (
+                (assessment_date - ingestion_dates).dt.total_seconds() / 60 / 60 / 24
+            )
+            precision_of_dates = (
+                pd.Series(
+                    [col_config.simulated_timestamp_precision] * len(data),
+                    index=data.index,
+                )
+                if col_config.simulated_timestamp_precision
+                else data[ingestion_date_column].apply(determine_datetime_precision)
+            )
+            age_and_precision = pd.DataFrame(
+                {"age": ages_in_days, "precision": precision_of_dates}
+            )

+            decline_rate = col_config.decline_rate
             timeliness = pd.Series(np.exp(-decline_rate * ages_in_days))
             certainty = age_and_precision.apply(
                 lambda row: self.certainty(
diff --git a/metis/metric/timeliness/timeliness_heinrich_config.py b/metis/metric/timeliness/timeliness_heinrich_config.py
index b5b47a1..ee8a45b 100644
--- a/metis/metric/timeliness/timeliness_heinrich_config.py
+++ b/metis/metric/timeliness/timeliness_heinrich_config.py
@@ -1,3 +1,4 @@
+import dataclasses
 from dataclasses import dataclass
 from typing import Dict

@@ -6,29 +7,39 @@
 @dataclass
-class timeliness_heinrich_config(MetricConfig):
+class timeliness_heinrich_column_config:
     """
-    Configuration class for the timeliness_heinrich metric.
+    Configuration class for a single column in the timeliness_heinrich metric (used as part of timeliness_heinrich_config).

-    :param decline_rate_per_column: Decline rate specific to each column
-    :param ingestion_date_column: Name of the column containing the ingestion date of each tuple
-    :param to_datetime_kwargs: Optional keyword arguments for pandas.to_datetime when parsing the ingestion date column.
-    :param simulated_assessment_date: Optional simulated assessment date in string format. If not provided, the current date will be used.
-    :param simulated_timestamp_precision: Optional simulated precision of each the timestamps in ingestion_date_column. If not provided, the precision is detected automatically.
+    :param decline_rate: Decline rate for the column
+    :param ingestion_date_column: Name of the column containing the ingestion date that should be used to calculate the age of the data for this column
+    :param to_datetime_kwargs: Optional keyword arguments for pandas.to_datetime when parsing the date in ingestion_date_column.
+    :param simulated_assessment_date: Optional simulated assessment date in string format. If not provided, the current date will be used. This can be used to simulate the assessment of data at a specific point in time, which is especially useful for testing and evaluation purposes.
+    :param simulated_timestamp_precision: Optional simulated precision of each of the timestamps in ingestion_date_column. If not provided, the precision is detected automatically. The precision is used to assess the certainty of the timeliness measurements.
     """

-    decline_rate_per_column: Dict[str, float]
+    decline_rate: float
     ingestion_date_column: str
     to_datetime_kwargs: Dict | None = None
     simulated_assessment_date: str | None = None
     simulated_timestamp_precision: DTPrecision | None = None

+
+@dataclass
+class timeliness_heinrich_config(MetricConfig):
+    """
+    Configuration class for the timeliness_heinrich metric.
+
+    :param timeliness_config_per_column: Configuration for each column in the timeliness_heinrich metric. Each column can have a different decline rate and ingestion date column, which allows for a more fine-grained and accurate assessment of timeliness based on the specific characteristics of each column.
+    """
+
+    timeliness_config_per_column: Dict[str, timeliness_heinrich_column_config]
+
     def to_json(self):
         return {
             "name": self.__class__.__name__,
-            "decline_rate_per_column": self.decline_rate_per_column,
-            "ingestion_date_column": self.ingestion_date_column,
-            "simulated_assessment_date": self.simulated_assessment_date,
-            "simulated_timestamp_precision": self.simulated_timestamp_precision,
-            "to_datetime_kwargs": self.to_datetime_kwargs,
+            "timeliness_config_per_column": {
+                col: dataclasses.asdict(config)
+                for col, config in self.timeliness_config_per_column.items()
+            },
         }

From d3798d850cf9f848af6784c4e2f8264814aa449d Mon Sep 17 00:00:00 2001
From: jb3rndt
Date: Sun, 22 Mar 2026 23:59:43 +0100
Subject: [PATCH 32/32] Fix result creation index handling

---
 .../completeness_nullAndDMVRatio.py                |  6 ++--
 .../completeness/completeness_nullRatio.py         |  6 ++--
 .../consistency_ruleBasedHinrichs.py               | 10 +++----
 .../consistency_ruleBasedPipino.py                 | 28 +++++++++++--------
 .../correctness/correctness_heinrich.py            | 13 ++++++---
 .../metric/timeliness/timeliness_heinrich.py       |  8 +++---
 6 files changed, 41 insertions(+), 30 deletions(-)

diff --git a/metis/metric/completeness/completeness_nullAndDMVRatio.py b/metis/metric/completeness/completeness_nullAndDMVRatio.py
index d7426fd..c3e39cc 100644
--- a/metis/metric/completeness/completeness_nullAndDMVRatio.py
+++ b/metis/metric/completeness/completeness_nullAndDMVRatio.py
@@ -128,8 +128,8 @@ def create_flat_results(
     ) -> List[DQResult]:
         results = []
         for col in completeness.columns:
-            for (row_index, completeness_value), certainty_value in zip(
-                completeness[col].items(), certainty[col].values
+            for row_index, (completeness_value, certainty_value) in enumerate(
+                zip(completeness[col].values, certainty[col].values)
             ):
                 result = DQResult(
                     timestamp=pd.Timestamp.now(),
                     DQvalue=float(completeness_value),
                     DQdimension=DQDimension.COMPLETENESS,
                     DQmetric=self.__class__.__name__,
                     columnNames=[col],
-                    rowIndex=int(str(row_index)),
+
rowIndex=row_index, DQexplanation={"certainty": float(certainty_value)}, DQgranularity=DQGranularity.CELL, ) diff --git a/metis/metric/completeness/completeness_nullRatio.py b/metis/metric/completeness/completeness_nullRatio.py index 9a380fb..1dd64b3 100644 --- a/metis/metric/completeness/completeness_nullRatio.py +++ b/metis/metric/completeness/completeness_nullRatio.py @@ -105,8 +105,8 @@ def create_flat_results( ) -> List[DQResult]: results = [] for col in completeness.columns: - for (row_index, completeness_value), null_count_value in zip( - completeness[col].items(), null_count[col].values + for row_index, (completeness_value, null_count_value) in enumerate( + zip(completeness[col].values, null_count[col].values) ): result = DQResult( timestamp=pd.Timestamp.now(), @@ -114,7 +114,7 @@ def create_flat_results( DQdimension=DQDimension.COMPLETENESS, DQmetric=self.__class__.__name__, columnNames=[col], - rowIndex=int(str(row_index)), + rowIndex=row_index, DQexplanation={"null_count": float(null_count_value)}, DQgranularity=DQGranularity.CELL, ) diff --git a/metis/metric/consistency/consistency_ruleBasedHinrichs.py b/metis/metric/consistency/consistency_ruleBasedHinrichs.py index ee4ad75..2c98bf2 100644 --- a/metis/metric/consistency/consistency_ruleBasedHinrichs.py +++ b/metis/metric/consistency/consistency_ruleBasedHinrichs.py @@ -55,15 +55,15 @@ def assess( dq_measurements = 1 / (1 + degree_of_violation) min_quality = dq_measurements.min() - for row_index, dq_value in dq_measurements.items(): + for row_index, dq_value in enumerate(dq_measurements.values): results.append( DQResult( timestamp=pd.Timestamp.now(), DQvalue=dq_value, DQdimension=DQDimension.CONSISTENCY, DQmetric=self.__class__.__name__, - columnNames=[], - rowIndex=int(str(row_index)), + columnNames=data.columns.tolist(), + rowIndex=row_index, DQexplanation={ "certainty": self.certainty(dq_value, min_quality) }, @@ -90,7 +90,7 @@ def assess( dq_measurements = 1 / (1 + degree_of_violation) min_quality = dq_measurements.min() - for row_index, dq_value in dq_measurements.items(): + for row_index, dq_value in enumerate(dq_measurements.values): results.append( DQResult( timestamp=pd.Timestamp.now(), @@ -98,7 +98,7 @@ def assess( DQdimension=DQDimension.CONSISTENCY, DQmetric=self.__class__.__name__, columnNames=[col_name], - rowIndex=int(str(row_index)), + rowIndex=row_index, DQexplanation={ "certainty": self.certainty(dq_value, min_quality) }, diff --git a/metis/metric/consistency/consistency_ruleBasedPipino.py b/metis/metric/consistency/consistency_ruleBasedPipino.py index 45dcc86..b557ea2 100644 --- a/metis/metric/consistency/consistency_ruleBasedPipino.py +++ b/metis/metric/consistency/consistency_ruleBasedPipino.py @@ -57,14 +57,14 @@ def assess( dq_measurements = fulfilled_rules_mask.mean(axis=1) certainties = self.certainties(fulfilled_rules_mask) - for (row_index, dq_value), certainty in zip( - dq_measurements.items(), certainties.values + for row_index, (dq_value, certainty) in enumerate( + zip(dq_measurements.values, certainties.values) ): results.append( self.create_result( dq_value, - None, - int(str(row_index)), + data.columns.tolist(), + row_index, float(certainty), ) ) @@ -91,14 +91,14 @@ def assess( dq_measurements = fulfilled_rules_mask.mean(axis=1) certainties = self.certainties(fulfilled_rules_mask) - for (row_index, dq_value), certainty in zip( - dq_measurements.items(), certainties.values + for row_index, (dq_value, certainty) in enumerate( + zip(dq_measurements.values, certainties.values) ): results.append( 
self.create_result( dq_value, - col_name, - int(str(row_index)), + [col_name], + row_index, float(certainty), ) ) @@ -112,17 +112,23 @@ def certainties(self, fulfilled_rules_mask: pd.DataFrame): ) def create_result( - self, dq_value: float, col_name: str | None, row_index: int, certainty: float + self, + dq_value: float, + col_names: List[str], + row_index: int, + certainty: float, ) -> DQResult: return DQResult( timestamp=pd.Timestamp.now(), DQvalue=dq_value, DQdimension=DQDimension.CONSISTENCY, DQmetric=self.__class__.__name__, - columnNames=[col_name] if col_name else [], + columnNames=col_names, rowIndex=row_index, DQexplanation={ "certainty": certainty, }, - DQgranularity=DQGranularity.CELL if col_name else DQGranularity.ROW, + DQgranularity=( + DQGranularity.CELL if len(col_names) == 1 else DQGranularity.ROW + ), ) diff --git a/metis/metric/correctness/correctness_heinrich.py b/metis/metric/correctness/correctness_heinrich.py index c3e33a1..927ffa8 100644 --- a/metis/metric/correctness/correctness_heinrich.py +++ b/metis/metric/correctness/correctness_heinrich.py @@ -30,14 +30,19 @@ def assess( "Reference DataFrame is required for correctness assessment." ) + if data.shape != reference.shape: + raise ValueError( + f"Data and reference must have the same shape for correctness assessment. Got data shape {data.shape} and reference shape {reference.shape}." + ) + results = [] total_rows = len(data) for col_name in data.columns: for row_index in range(total_rows): measurement = self.measure_correctness( - data.at[row_index, col_name], - reference_value=reference.at[row_index, col_name], + data[col_name].iat[row_index], + reference_value=reference[col_name].iat[row_index], dtype=data[col_name].dtype, ) @@ -59,14 +64,14 @@ def measure_correctness(self, value, *, reference_value, dtype) -> float: return 1 if pd.isna(value) or pd.isna(reference_value): return 0 - if dtype == "int64" or dtype == "float64": + if pd.api.types.is_numeric_dtype(dtype): return clamp( 1 - abs(value - reference_value) / max(abs(reference_value), abs(value)), 0, 1, ) - if dtype == "object": + if pd.api.types.is_string_dtype(dtype): max_len = max(len(str(value)), len(str(reference_value))) correctness = ( 1 - levenshtein_distance(str(value), str(reference_value)) / max_len diff --git a/metis/metric/timeliness/timeliness_heinrich.py b/metis/metric/timeliness/timeliness_heinrich.py index dd1c73d..cf46e30 100644 --- a/metis/metric/timeliness/timeliness_heinrich.py +++ b/metis/metric/timeliness/timeliness_heinrich.py @@ -28,7 +28,7 @@ def assess( The formula used is: timeliness = exp(-decline_rate * age), where age and decline_rate are measured in years. The age is calculated as the difference between the reference date and the ingestion date of the tuple (defined by the ingestion_date_column in the configuration). :param data: DataFrame to assess. - : param reference: Optional reference DataFrame (not used in this metric). + :param reference: Optional reference DataFrame (not used in this metric). :param metric_config: Configuration for the metric (required). :return: List of DQResult objects containing timeliness results. 
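+
+        Example (illustrative values): with ``decline_rate = 0.2`` and an age of ``2`` in
+        the same time unit as the decline rate, ``timeliness = exp(-0.2 * 2) ≈ 0.67``.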
""" @@ -86,8 +86,8 @@ def assess( ), axis=1, ) - for (index, timeliness_value), (_, certainty_value) in zip( - timeliness.items(), certainty.items() + for row_index, (timeliness_value, certainty_value) in enumerate( + zip(timeliness.values, certainty.values) ): result = DQResult( timestamp=pd.Timestamp.now(), @@ -95,7 +95,7 @@ def assess( DQdimension=DQDimension.TIMELINESS, DQmetric=self.__class__.__name__, columnNames=[col_name], - rowIndex=int(str(index)), + rowIndex=row_index, DQexplanation={ "certainty": certainty_value, },