diff --git a/metis/dq_orchestrator.py b/metis/dq_orchestrator.py index ba51794..e4debab 100644 --- a/metis/dq_orchestrator.py +++ b/metis/dq_orchestrator.py @@ -24,7 +24,6 @@ class DQOrchestrator: def __init__(self, writer_config_path: str | None = None) -> None: self.dataframes: Dict[str, pd.DataFrame] = {} - self.reference_dataframes: Dict[str, pd.DataFrame] = {} self.data_paths: Dict[str, str] = {} self.results: Dict[str, DQResult] = ( {} @@ -61,12 +60,6 @@ def load(self, data_loader_configs: List[str]) -> None: self.dataframes[config.name] = dataframe self.data_paths[config.name] = config_path - if config.reference_file_name: - reference_config = DataConfig(config_data) - reference_config.file_name = config.reference_file_name - reference_dataframe = loader.load(reference_config) - self.reference_dataframes[config.name] = reference_dataframe - # Import pre-computed data profiles if config.data_profiles: self._import_data_profiles( @@ -98,7 +91,6 @@ def assess( start = time.perf_counter() incomplete_metric_results = metric_instance.assess( data=df, - reference=self.reference_dataframes.get(df_name), metric_config=metric_config, ) elapsed = time.perf_counter() - start @@ -107,7 +99,6 @@ def assess( else: incomplete_metric_results = metric_instance.assess( data=df, - reference=self.reference_dataframes.get(df_name), metric_config=metric_config, ) for result in incomplete_metric_results: diff --git a/metis/metric/completeness/completeness_nullAndDMVRatio.py b/metis/metric/completeness/completeness_nullAndDMVRatio.py index c3e39cc..7f6f6e4 100644 --- a/metis/metric/completeness/completeness_nullAndDMVRatio.py +++ b/metis/metric/completeness/completeness_nullAndDMVRatio.py @@ -25,14 +25,12 @@ class completeness_nullAndDMVRatio(Metric): def assess( self, data: pd.DataFrame, - reference: pd.DataFrame | None = None, metric_config: str | MetricConfig | None = None, ) -> List[DQResult]: """ Assess the completeness of the data by checking for null values and disguised missing values. To detect disguised missing values, the FAHES algorithm by Qahtan et al. is applied to the data (paper: https://doi.org/10.1145/3219819.3220109). The completeness quality measurement is calculated as the ratio of valid values (non-null and non-disguised missing) to the total number of values. The metric can be configured using `completeness_nullAndDMVRatio_config` to calculate the completeness on column, row level, or table-level granularity. :param data: DataFrame to assess. - :param reference: Optional reference DataFrame (not used in this metric). :param metric_config: Optional configuration for the metric. :return: List of DQResult objects containing completeness results. """ diff --git a/metis/metric/completeness/completeness_nullRatio.py b/metis/metric/completeness/completeness_nullRatio.py index 1dd64b3..0abd599 100644 --- a/metis/metric/completeness/completeness_nullRatio.py +++ b/metis/metric/completeness/completeness_nullRatio.py @@ -16,21 +16,20 @@ class completeness_nullRatio(Metric): def assess( self, data: pd.DataFrame, - reference: pd.DataFrame | None = None, metric_config: str | MetricConfig | None = None, ) -> List[DQResult]: """ Assess the completeness of the data by calculating the ratio and count of null values on different granularities. The ratio of non-null values is stored as the completeness quality measurement, while the count of null values is stored in the explanation for better interpretability. The metric can be configured using `completeness_nullRatio_config` to calculate the completeness on column, row level, or table-level granularity. :param data: DataFrame to assess. - :param reference: Optional reference DataFrame (not used in this metric). :param metric_config: Optional configuration for the metric. :return: List of DQResult objects containing completeness results. """ - config = self.load_config(metric_config, completeness_nullRatio_config) - - results = [] + if metric_config is None: + config = completeness_nullRatio_config() + else: + config = self.load_config(metric_config, completeness_nullRatio_config) na_mask = data.isna() diff --git a/metis/metric/consistency/consistency_countFDViolations.py b/metis/metric/consistency/consistency_countFDViolations.py index 5bae10f..1e8213a 100644 --- a/metis/metric/consistency/consistency_countFDViolations.py +++ b/metis/metric/consistency/consistency_countFDViolations.py @@ -14,7 +14,6 @@ class consistency_countFDViolations(Metric): def assess( self, data: pd.DataFrame, - reference: Union[pd.DataFrame, None] = None, metric_config: Union[MetricConfig, str, None] = None, ) -> List[DQResult]: """ diff --git a/metis/metric/consistency/consistency_ruleBasedHinrichs.py b/metis/metric/consistency/consistency_ruleBasedHinrichs.py index 2c98bf2..b328c85 100644 --- a/metis/metric/consistency/consistency_ruleBasedHinrichs.py +++ b/metis/metric/consistency/consistency_ruleBasedHinrichs.py @@ -18,7 +18,6 @@ class consistency_ruleBasedHinrichs(Metric): def assess( self, data: pd.DataFrame, - reference: Union[pd.DataFrame, None] = None, metric_config: str | None | MetricConfig = None, ) -> List[DQResult]: """ @@ -26,7 +25,6 @@ def assess( Additionally, this metric assesses the certainty of the measurement based on the minimum quality in the assessed data. The certainty is calculated as sqrt((1 - dq_value) * (1 - min_quality)), where dq_value is the quality measurement for the specific value/row and min_quality is the lowest quality measurement observed in the dataset. :param data: DataFrame to assess. - :param reference: Optional reference DataFrame (not used in this metric). :param metric_config: Optional configuration for the metric. :return: List of DQResult objects containing consistency results. """ diff --git a/metis/metric/consistency/consistency_ruleBasedPipino.py b/metis/metric/consistency/consistency_ruleBasedPipino.py index b557ea2..9ba83b2 100644 --- a/metis/metric/consistency/consistency_ruleBasedPipino.py +++ b/metis/metric/consistency/consistency_ruleBasedPipino.py @@ -1,4 +1,4 @@ -from typing import List, Union +from typing import List import pandas as pd @@ -17,7 +17,6 @@ class consistency_ruleBasedPipino(Metric): def assess( self, data: pd.DataFrame, - reference: Union[pd.DataFrame, None] = None, metric_config: str | None | MetricConfig = None, ) -> List[DQResult]: """ @@ -25,7 +24,6 @@ def assess( Additionally, this metric assesses the certainty of the measurement based on the minimum quality in the assessed data. The certainty is calculated as sqrt((1 - dq_value) * (1 - min_quality)), where dq_value is the quality measurement for the specific value/row and min_quality is the lowest quality measurement observed in the dataset. :param data: DataFrame to assess. - :param reference: Optional reference DataFrame (not used in this metric). :param metric_config: Mandatory configuration for the metric. :return: List of DQResult objects containing consistency results. """ diff --git a/metis/metric/correctness/correctness_heinrich.py b/metis/metric/correctness/correctness_heinrich.py index 927ffa8..87a0082 100644 --- a/metis/metric/correctness/correctness_heinrich.py +++ b/metis/metric/correctness/correctness_heinrich.py @@ -3,6 +3,9 @@ import pandas as pd from metis.metric.config import MetricConfig +from metis.metric.correctness.correctness_heinrich_config import ( + correctness_heinrich_config, +) from metis.metric.metric import Metric from metis.utils.dq_dimension import DQDimension from metis.utils.dq_granularity import DQGranularity @@ -15,7 +18,6 @@ class correctness_heinrich(Metric): def assess( self, data: pd.DataFrame, - reference: pd.DataFrame | None = None, metric_config: str | MetricConfig | None = None, ) -> List[DQResult]: """ @@ -25,14 +27,12 @@ def assess( :param metric_config: Optional configuration for the metric. :return: List of DQResult objects containing correctness results. """ - if reference is None: - raise ValueError( - "Reference DataFrame is required for correctness assessment." - ) + config = self.load_config(metric_config, correctness_heinrich_config) + reference_data = pd.read_csv(config.reference_file_path) - if data.shape != reference.shape: + if data.shape != reference_data.shape: raise ValueError( - f"Data and reference must have the same shape for correctness assessment. Got data shape {data.shape} and reference shape {reference.shape}." + f"Data and reference must have the same shape for correctness assessment. Got data shape {data.shape} and reference shape {reference_data.shape}." ) results = [] @@ -42,7 +42,7 @@ def assess( for row_index in range(total_rows): measurement = self.measure_correctness( data[col_name].iat[row_index], - reference_value=reference[col_name].iat[row_index], + reference_value=reference_data[col_name].iat[row_index], dtype=data[col_name].dtype, ) diff --git a/metis/metric/correctness/correctness_heinrich_config.py b/metis/metric/correctness/correctness_heinrich_config.py new file mode 100644 index 0000000..0937876 --- /dev/null +++ b/metis/metric/correctness/correctness_heinrich_config.py @@ -0,0 +1,17 @@ +from dataclasses import dataclass +from pathlib import Path + +from metis.metric.config import MetricConfig + + +@dataclass +class correctness_heinrich_config(MetricConfig): + """ + Configuration class for the correctness_heinrich metric. + + :param reference_file_path: Path to the reference file that contains the correct values for the data. This file is used to compare against the assessed data in order to determine the correctness of the data. Must be of the same shape as the assessed data. + :param superset_file_path: Optional path to a superset file that contains the full dataset beyond the reference file. The reference data will, in many cases, be a manually cleaned subset representing the real data. This allows the correctness measurements to be extrapolated on the full dataset. The superset data is used to calculate how well the reference data covers the superset data, to assess the certainty of correctness measurements. + """ + + reference_file_path: str | Path + superset_file_path: str | Path | None = None diff --git a/metis/metric/metric.py b/metis/metric/metric.py index 62a4da1..6b67a54 100644 --- a/metis/metric/metric.py +++ b/metis/metric/metric.py @@ -30,7 +30,6 @@ def __init__(self) -> None: def assess( self, data: pd.DataFrame, - reference: pd.DataFrame | None = None, metric_config: str | MetricConfig | None = None, ) -> List[DQResult]: """Assess data using this metric and return the results. @@ -40,13 +39,6 @@ def assess( The DataFrame that should be assessed by this metric. This is the primary dataset under inspection. - - reference: Optional[pd.DataFrame] - An optional, cleaned reference DataFrame that can act as a - clean version of the dataset. Metrics that need a canonical or - expected version of the data (for example correctness against a - known-good source) should accept and use this DataFrame. If not - needed by a metric, `None` is allowed. - - metric_config: Optional[str] Optional path or JSON string containing metric-specific configuration. Use this to keep the method signature compact; diff --git a/metis/metric/minimality/minimality_duplicateCount.py b/metis/metric/minimality/minimality_duplicateCount.py index 1320b65..ee97ad8 100644 --- a/metis/metric/minimality/minimality_duplicateCount.py +++ b/metis/metric/minimality/minimality_duplicateCount.py @@ -13,7 +13,6 @@ class minimality_duplicateCount(Metric): def assess( self, data: pd.DataFrame, - reference: Union[pd.DataFrame, None] = None, metric_config: Union[MetricConfig, str, None] = None, ) -> List[DQResult]: """ diff --git a/metis/metric/timeliness/timeliness_heinrich.py b/metis/metric/timeliness/timeliness_heinrich.py index cf46e30..1bb70f9 100644 --- a/metis/metric/timeliness/timeliness_heinrich.py +++ b/metis/metric/timeliness/timeliness_heinrich.py @@ -20,7 +20,6 @@ class timeliness_heinrich(Metric): def assess( self, data: pd.DataFrame, - reference: pd.DataFrame | None = None, metric_config: str | MetricConfig | None = None, ) -> List[DQResult]: """ @@ -28,7 +27,6 @@ def assess( The formula used is: timeliness = exp(-decline_rate * age), where age and decline_rate are measured in years. The age is calculated as the difference between the reference date and the ingestion date of the tuple (defined by the ingestion_date_column in the configuration). :param data: DataFrame to assess. - :param reference: Optional reference DataFrame (not used in this metric). :param metric_config: Configuration for the metric (required). :return: List of DQResult objects containing timeliness results. """ @@ -47,6 +45,9 @@ def assess( ) for col_name, col_config in config.timeliness_config_per_column.items(): + if col_name not in data.columns: + continue + ingestion_date_column = col_config.ingestion_date_column assessment_date = pd.to_datetime( col_config.simulated_assessment_date or pd.Timestamp.now() @@ -97,7 +98,7 @@ def assess( columnNames=[col_name], rowIndex=row_index, DQexplanation={ - "certainty": certainty_value, + "certainty": float(certainty_value), }, DQgranularity=DQGranularity.CELL, ) diff --git a/metis/metric/validity/validity_outOfVocabulary.py b/metis/metric/validity/validity_outOfVocabulary.py index 2075124..aa74692 100644 --- a/metis/metric/validity/validity_outOfVocabulary.py +++ b/metis/metric/validity/validity_outOfVocabulary.py @@ -1,5 +1,5 @@ import re -from typing import List, Union +from typing import List import nltk import pandas as pd @@ -7,6 +7,9 @@ from metis.metric.config import MetricConfig from metis.metric.metric import Metric +from metis.metric.validity.validity_outOfVocabulary_config import ( + validity_outOfVocabulary_config, +) from metis.utils.dq_dimension import DQDimension from metis.utils.dq_granularity import DQGranularity from metis.utils.result import DQResult @@ -17,27 +20,41 @@ def __init__(self) -> None: super().__init__() nltk.download("words", quiet=True) - def assess(self, data: pd.DataFrame, reference: Union[pd.DataFrame, set, None] = None, metric_config: Union[MetricConfig, str, None] = None) -> List[DQResult]: + def assess( + self, + data: pd.DataFrame, + metric_config: str | MetricConfig | None = None, + ) -> List[DQResult]: """ General vocabulary check at token level. Any alphabetic token not in the standard vocab is OOV. """ results: List[DQResult] = [] + if metric_config is None: + config = validity_outOfVocabulary_config() + else: + config = self.load_config(metric_config, validity_outOfVocabulary_config) + # Build vocabulary (lowercase) - if reference is None: + if config.reference is None: vocab_set = {w.lower() for w in nltk_words.words()} ref_src = "NLTK English words" - elif isinstance(reference, pd.DataFrame): - if reference.shape[1] != 1: + elif isinstance(config.reference, pd.DataFrame): + if config.reference.shape[1] != 1: raise ValueError("Reference DataFrame must have exactly one column.") - vocab_set = {str(x).strip().lower() for x in reference.iloc[:, 0].dropna().unique()} + vocab_set = { + str(x).strip().lower() + for x in config.reference.iloc[:, 0].dropna().unique() + } ref_src = "Custom vocabulary" - elif isinstance(reference, set): - vocab_set = {str(x).strip().lower() for x in reference} + elif isinstance(config.reference, set): + vocab_set = {str(x).strip().lower() for x in config.reference} ref_src = "Custom vocabulary" else: - raise ValueError("Reference must be a one column DataFrame, a set, or None.") + raise ValueError( + "Reference must be a one column DataFrame, a set, or None." + ) def tokenize(text: str): return re.findall(r"[A-Za-z]+", text.lower()) @@ -50,10 +67,12 @@ def tokenize(text: str): dq_value = 0.0 in_vocab_count = 0 else: + def is_valid(text: str) -> bool: tokens = tokenize(text) if not tokens: - return True # empty or numeric-like strings are treated as valid + # empty or numeric-like strings are treated as valid + return True # valid if *all* tokens are in vocabulary return all(token in vocab_set for token in tokens) @@ -66,7 +85,7 @@ def is_valid(text: str) -> bool: annotations = { "TotalNotNullValues": total_not_null_values, "InVocabValues": in_vocab_count, - "ReferenceSource": ref_src + "ReferenceSource": ref_src, } result = DQResult( diff --git a/metis/metric/validity/validity_outOfVocabulary_config.py b/metis/metric/validity/validity_outOfVocabulary_config.py new file mode 100644 index 0000000..6f015da --- /dev/null +++ b/metis/metric/validity/validity_outOfVocabulary_config.py @@ -0,0 +1,16 @@ +from dataclasses import dataclass +from typing import Set + +import pandas as pd + +from metis.metric.config import MetricConfig + +@dataclass +class validity_outOfVocabulary_config(MetricConfig): + """ + Configuration class for the validity_outOfVocabulary metric. + + :param reference: Reference vocabulary to use for the out-of-vocabulary check. This can be provided as a DataFrame with a single column, a set of strings, or None to use the default NLTK English word list. + """ + + reference: pd.DataFrame | Set | None = None diff --git a/metis/profiling/data_profile_manager.py b/metis/profiling/data_profile_manager.py index 3699c0b..708cf6f 100644 --- a/metis/profiling/data_profile_manager.py +++ b/metis/profiling/data_profile_manager.py @@ -4,6 +4,9 @@ import threading from typing import Any, Dict, List, Optional +import numpy as np +import pandas as pd +from datasketch import MinHash as _MinHash from sqlalchemy import delete, select from sqlalchemy.orm import Session @@ -332,8 +335,6 @@ def _query_by_task( @staticmethod def _serialize(value: Any) -> tuple[dict, str]: """Wrap *value* into a JSON-safe dict and return (payload, type_tag).""" - import numpy as np - import pandas as pd def to_json_safe(v: Any) -> Any: """Convert numpy types to native Python types.""" @@ -353,8 +354,6 @@ def to_json_safe(v: Any) -> Any: return {"v": to_json_safe(value.to_dict())}, "series" # MinHash support (for minhash_signature results) - from datasketch import MinHash as _MinHash - if isinstance(value, _MinHash): return { "v": { @@ -389,26 +388,19 @@ def to_json_safe(v: Any) -> Any: @staticmethod def _deserialize(payload: Optional[dict], result_type: str) -> Any: - if payload is None: + raw = (payload or {}).get("v") + if raw is None: return None - import pandas as pd - raw = payload.get("v") if result_type == "series": return pd.Series(raw) if result_type == "minhash": - import numpy as np - from datasketch import MinHash as _MinHash - return _MinHash( num_perm=raw["num_perm"], seed=raw["seed"], hashvalues=np.array(raw["hashvalues"], dtype=np.uint64), ) if result_type == "minhash_dict": - import numpy as np - from datasketch import MinHash as _MinHash - return { k: _MinHash( num_perm=v["num_perm"], diff --git a/metis/profiling/importers/jaccard_importer.py b/metis/profiling/importers/jaccard_importer.py index 8240103..34c9fbb 100644 --- a/metis/profiling/importers/jaccard_importer.py +++ b/metis/profiling/importers/jaccard_importer.py @@ -2,7 +2,7 @@ from typing import Any, Dict, List -from .base import BaseImporter, auto_detect_type +from .base import BaseImporter class JaccardImporter(BaseImporter): @@ -25,7 +25,7 @@ def parse_file(self, file_path: str, table_name: str) -> List[Dict[str, Any]]: col2 = row["column2"] value = float(row["value"]) - profile = { + profile: Dict[str, List[str] | float | Dict[str, int]] = { "column_names": sorted([col1, col2]), "value": value, } diff --git a/metis/utils/data_profiling/single_column/cardinalities/distinct_values.py b/metis/utils/data_profiling/single_column/cardinalities/distinct_values.py index df74e11..f0b6b5c 100644 --- a/metis/utils/data_profiling/single_column/cardinalities/distinct_values.py +++ b/metis/utils/data_profiling/single_column/cardinalities/distinct_values.py @@ -1,13 +1,14 @@ from typing import Union + import pandas as pd def distinct_count(data: Union[pd.Series, pd.DataFrame]) -> Union[int, pd.Series]: """ Count the number of distinct (unique) values, excluding nulls. - + :param data: Input Series (single column) or DataFrame (multiple columns). :return: Number of distinct values as int if Series input, Series of ints if DataFrame input. """ result = data.nunique() - return int(result) if isinstance(data, pd.Series) else result \ No newline at end of file + return result if isinstance(result, pd.Series) else int(result) diff --git a/metis/utils/data_profiling/single_column/cardinalities/null_values.py b/metis/utils/data_profiling/single_column/cardinalities/null_values.py index 99c6a06..04af46c 100644 --- a/metis/utils/data_profiling/single_column/cardinalities/null_values.py +++ b/metis/utils/data_profiling/single_column/cardinalities/null_values.py @@ -1,28 +1,28 @@ - from typing import Union + import pandas as pd def null_count(data: Union[pd.Series, pd.DataFrame]) -> Union[int, pd.Series]: """ Count the number of null/missing values. - + :param data: Input Series (single column) or DataFrame (multiple columns). :return: Number of null values as int if Series input, Series of ints if DataFrame input. """ result = data.isna().sum() - return int(result) if isinstance(data, pd.Series) else result + return result if isinstance(result, pd.Series) else int(result) def null_percentage(data: Union[pd.Series, pd.DataFrame]) -> Union[float, pd.Series]: """ Calculate the percentage of null/missing values. - + :param data: Input Series (single column) or DataFrame (multiple columns). :return: Percentage of null values (0-100) as float if Series input, Series of floats if DataFrame input. """ if len(data) == 0: - return 0.0 if isinstance(data, pd.Series) else pd.Series(dtype=float) - - result = (data.isna().sum() / len(data) * 100) - return float(result) if isinstance(data, pd.Series) else result \ No newline at end of file + result = data.isna().sum() + else: + result = data.isna().mean() * 100 + return result if isinstance(result, pd.Series) else float(result) diff --git a/metis/utils/data_profiling/single_column/domain_classification/domain.py b/metis/utils/data_profiling/single_column/domain_classification/domain.py index dd5f0b3..b5c954b 100644 --- a/metis/utils/data_profiling/single_column/domain_classification/domain.py +++ b/metis/utils/data_profiling/single_column/domain_classification/domain.py @@ -1,6 +1,6 @@ -from typing import Union, Optional, Dict, List -import pandas as pd +from typing import Dict, List, Optional, Union +import pandas as pd DOMAIN_PATTERNS: Dict[str, str] = { "email": r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$', @@ -8,7 +8,7 @@ "ssn": r'^(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}$', "date_iso": r'^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])$', "time": r'^([01]\d|2[0-3]):[0-5]\d(:[0-5]\d)?$', - "ip_address": r'^(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)$', + "ip_address": r'^(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)$', "zip_code": r'^\d{5}(-\d{4})?$', "credit_card": r'^(?:\d[ -]*){13,19}$', "phone": r'^(\+\d{1,3}[-.\s]?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,9}|\(\d{3}\)[-.\s]?\d{3}[-.\s]?\d{4}|\d{3}[-.\s]\d{3}[-.\s]\d{4})$', @@ -36,7 +36,7 @@ def _detect_domain_by_pattern(series: pd.Series, threshold: float = 0.8) -> Opti :return: Domain name if detected, None otherwise. """ clean_data = series.dropna() - + if len(clean_data) == 0: return None @@ -110,10 +110,12 @@ def domain(data: Union[pd.Series, pd.DataFrame]) -> Union[str, pd.Series]: DataFrame input. """ if isinstance(data, pd.Series): - column_name = data.name if hasattr(data, 'name') else None + column_name = ( + str(data.name) if getattr(data, "name", None) is not None else None + ) return _classify_domain(data, column_name) else: result = {} for col in data.columns: result[col] = _classify_domain(data[col], col) - return pd.Series(result) \ No newline at end of file + return pd.Series(result) diff --git a/metis/utils/data_profiling/single_column/patterns_and_data_types/numeric_precision.py b/metis/utils/data_profiling/single_column/patterns_and_data_types/numeric_precision.py index 64f9101..c166c4b 100644 --- a/metis/utils/data_profiling/single_column/patterns_and_data_types/numeric_precision.py +++ b/metis/utils/data_profiling/single_column/patterns_and_data_types/numeric_precision.py @@ -1,5 +1,6 @@ -from typing import Union from decimal import Decimal, InvalidOperation +from typing import Union + import pandas as pd @@ -7,16 +8,18 @@ def _to_decimal_string(val: float) -> str: """Convert float to full decimal string without scientific notation.""" try: d = Decimal(str(val)) - sign, digits, exponent = d.as_tuple() - + _, digits, exponent = d.as_tuple() + + if not isinstance(exponent, int): + exponent = 0 if exponent >= 0: - return ''.join(str(d) for d in digits) + '0' * exponent + return "".join(str(d) for d in digits) + "0" * exponent else: - digits_str = ''.join(str(d) for d in digits) + digits_str = "".join(str(d) for d in digits) if len(digits_str) <= -exponent: - digits_str = '0' * (-exponent - len(digits_str) + 1) + digits_str + digits_str = "0" * (-exponent - len(digits_str) + 1) + digits_str decimal_pos = len(digits_str) + exponent - return digits_str[:decimal_pos] + '.' + digits_str[decimal_pos:] + return digits_str[:decimal_pos] + "." + digits_str[decimal_pos:] except (InvalidOperation, ValueError): return str(val) @@ -35,7 +38,7 @@ def _calculate_size(series: pd.Series) -> int: if not pd.api.types.is_numeric_dtype(clean_data): try: - clean_data = pd.to_numeric(clean_data, errors='coerce').dropna() + clean_data = pd.to_numeric(clean_data, errors="coerce").dropna() if len(clean_data) == 0: return 0 except Exception: @@ -45,7 +48,7 @@ def _calculate_size(series: pd.Series) -> int: for val in clean_data: try: val_str = _to_decimal_string(abs(val)) - val_str = val_str.replace('.', '') + val_str = val_str.replace(".", "") max_digits = max(max_digits, len(val_str)) except Exception: continue @@ -67,7 +70,7 @@ def _calculate_decimals(series: pd.Series) -> int: if not pd.api.types.is_numeric_dtype(clean_data): try: - clean_data = pd.to_numeric(clean_data, errors='coerce').dropna() + clean_data = pd.to_numeric(clean_data, errors="coerce").dropna() if len(clean_data) == 0: return 0 except Exception: @@ -77,8 +80,8 @@ def _calculate_decimals(series: pd.Series) -> int: for val in clean_data: try: val_str = _to_decimal_string(val) - if '.' in val_str: - decimal_part = val_str.split('.')[1].rstrip('0') + if "." in val_str: + decimal_part = val_str.split(".")[1].rstrip("0") max_decimals = max(max_decimals, len(decimal_part)) except Exception: continue @@ -125,4 +128,4 @@ def decimals(data: Union[pd.Series, pd.DataFrame]) -> Union[int, pd.Series]: result = {} for col in data.columns: result[col] = _calculate_decimals(data[col]) - return pd.Series(result) \ No newline at end of file + return pd.Series(result) diff --git a/metis/utils/data_profiling/single_column/value_distribution/histogram.py b/metis/utils/data_profiling/single_column/value_distribution/histogram.py index fb1bc09..c613548 100644 --- a/metis/utils/data_profiling/single_column/value_distribution/histogram.py +++ b/metis/utils/data_profiling/single_column/value_distribution/histogram.py @@ -74,7 +74,7 @@ def equi_depth_histogram( return {"bin_edges": [], "frequencies": []} quantiles = np.linspace(0, 1, bins + 1) - bin_edges = clean_data.quantile(quantiles).values + bin_edges = clean_data.quantile(quantiles).to_numpy() bin_edges = np.unique(bin_edges) if len(bin_edges) <= 1: @@ -97,7 +97,7 @@ def equi_depth_histogram( continue quantiles = np.linspace(0, 1, bins + 1) - bin_edges = clean_data.quantile(quantiles).values + bin_edges = clean_data.quantile(quantiles).to_numpy() bin_edges = np.unique(bin_edges) if len(bin_edges) <= 1: diff --git a/metis/utils/data_profiling/single_column/value_distribution/quartiles.py b/metis/utils/data_profiling/single_column/value_distribution/quartiles.py index 30b382b..385a61e 100644 --- a/metis/utils/data_profiling/single_column/value_distribution/quartiles.py +++ b/metis/utils/data_profiling/single_column/value_distribution/quartiles.py @@ -1,8 +1,11 @@ -from typing import Union, Dict +from typing import Dict, Union + import pandas as pd -def quartiles(data: Union[pd.Series, pd.DataFrame]) -> Union[Dict[str, float], Dict[str, Dict[str, float]]]: +def quartiles( + data: Union[pd.Series, pd.DataFrame], +) -> Union[Dict[str, float | None], Dict[str, Dict[str, float | None]]]: """ Calculate quartiles (Q1, Q2/median, Q3) that divide numeric values into four equal groups. @@ -45,7 +48,9 @@ def quartiles(data: Union[pd.Series, pd.DataFrame]) -> Union[Dict[str, float], D return result -def interquartile_range(data: Union[pd.Series, pd.DataFrame]) -> Union[float, pd.Series]: +def interquartile_range( + data: Union[pd.Series, pd.DataFrame], +) -> Union[float, pd.Series, None]: """ Calculate the interquartile range (IQR = Q3 - Q1).