diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..3010086
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+data/restaurants.csv filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
index 264cca2..0d66586 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,21 @@ Metis is a framework to automatically assess the quality of tabular data across
 python -m demo.getting_started
 ```
 
+## Full demo (all metrics)
+
+To run every registered metric against the full TripAdvisor European Restaurants dataset, use the extended demo. **Note: this will take some time.**
+
+```
+python -m demo.run_demo
+```
+
+The demo uses `data/restaurants.csv`, the full
+[TripAdvisor European Restaurants dataset from Kaggle](https://www.kaggle.com/datasets/stefanoleone992/tripadvisor-european-restaurants)
+(~1.08 M rows). The CSV includes 42 original columns (ratings, cuisines,
+location, price level, …) plus two synthetic timestamp columns
+(`first_review_date` and `last_review_date`) with ~10 % intentional nulls
+to surface interesting completeness findings. The file is stored with Git LFS (~700 MB); install Git LFS and run `git lfs pull` if your checkout only contains the pointer file.
+
 ## How to implement new metrics
 
 To extend the Metis framework and add new data quality metrics, please check our interface for easy integration.
diff --git a/data/restaurants.csv b/data/restaurants.csv
new file mode 100644
index 0000000..7bd0bd5
--- /dev/null
+++ b/data/restaurants.csv
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:17bcb14f67996d50b77a8077539713424bc47b9e29b1150f79ac71a1fa74e498
+size 700264775
diff --git a/data/restaurants.json b/data/restaurants.json
new file mode 100644
index 0000000..5154c1a
--- /dev/null
+++ b/data/restaurants.json
@@ -0,0 +1,9 @@
+{
+    "loader": "CSV",
+    "name": "Restaurants",
+    "file_name": "restaurants.csv",
+    "delimiter": ",",
+    "encoding": "utf-8",
+    "header": 0,
+    "nrows": null
+}
diff --git a/demo/configs/consistency.json b/demo/configs/consistency.json
new file mode 100644
index 0000000..994d1ed
--- /dev/null
+++ b/demo/configs/consistency.json
@@ -0,0 +1,4 @@
+{
+    "province": ["region"],
+    "country": ["default_language"]
+}
diff --git a/demo/configs/sqlite.json b/demo/configs/sqlite.json
new file mode 100644
index 0000000..3c76885
--- /dev/null
+++ b/demo/configs/sqlite.json
@@ -0,0 +1,5 @@
+{
+    "writer_name": "sqlite",
+    "table_name": "dqresults",
+    "db_name": "dq_repository/demo.db"
+}
diff --git a/demo/run_demo.py b/demo/run_demo.py
new file mode 100644
index 0000000..e2b6a81
--- /dev/null
+++ b/demo/run_demo.py
@@ -0,0 +1,76 @@
+"""Run every registered Metis metric against the full restaurants dataset.
+
+Run from the repository root with ``python -m demo.run_demo``: the config and
+data paths below are relative to the current working directory.
+
+Each metric is assessed in its own call so that a failing metric is reported
+and skipped instead of aborting the remaining ones.
+"""
+
+import sys
+import traceback
+from pathlib import Path
+
+from metis.dq_orchestrator import DQOrchestrator
+from metis.metric import Metric
+
+_WRITER_CONFIG = "demo/configs/sqlite.json"
+_DATA_LOADER_CONFIG = "data/restaurants.json"
+
+# The dataset is tracked with Git LFS; without `git lfs pull` the checkout
+# only contains a small text pointer that starts with this prefix.
+_DATASET = Path("data/restaurants.csv")
+_LFS_POINTER_PREFIX = b"version https://git-lfs.github.com/spec/"
+
+# Metrics that need a config file; all others are assessed with None.
+_METRIC_CONFIGS = {
+    "consistency_countFDViolations": "demo/configs/consistency.json",
+}
+
+
+def _is_lfs_pointer(path):
+    """Return True if *path* is an unresolved Git LFS pointer file."""
+    try:
+        with open(path, "rb") as handle:
+            head = handle.read(len(_LFS_POINTER_PREFIX))
+    except OSError:
+        # Missing or unreadable: leave the error to the data loader.
+        return False
+    return head == _LFS_POINTER_PREFIX
+
+
+def main():
+    """Assess every registered metric; return the process exit status."""
+    if _is_lfs_pointer(_DATASET):
+        print(
+            f"{_DATASET} is a Git LFS pointer, not the dataset; "
+            "run `git lfs pull` first.",
+            file=sys.stderr,
+        )
+        return 1
+
+    orchestrator = DQOrchestrator(writer_config_path=_WRITER_CONFIG)
+    orchestrator.load(data_loader_configs=[_DATA_LOADER_CONFIG])
+
+    failed = []
+    # Snapshot the registry so the loop is unaffected if it changes while
+    # metrics are being assessed.
+    for metric_name in tuple(Metric.registry):
+        try:
+            orchestrator.assess(
+                metrics=[metric_name],
+                metric_configs=[_METRIC_CONFIGS.get(metric_name)],
+            )
+        except Exception as exc:
+            # Best effort: report the failure and continue with the rest.
+            failed.append(str(metric_name))
+            print(f"Metric {metric_name} failed: {exc}")
+            traceback.print_exc()
+
+    if failed:
+        print(f"{len(failed)} metric(s) failed: {', '.join(failed)}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())