From cd288e50a687aea1b4f23597d607d803c1453476 Mon Sep 17 00:00:00 2001
From: Tomic Riedel
Date: Tue, 24 Mar 2026 20:54:31 +0100
Subject: [PATCH] feat: add full restaurants demo with dynamic metric discovery

---
 .gitattributes                |  1 +
 README.md                     | 15 +++++++++++++++
 data/restaurants.csv          |  3 +++
 data/restaurants.json         |  9 +++++++++
 demo/configs/consistency.json |  4 ++++
 demo/configs/sqlite.json      |  5 +++++
 demo/run_demo.py              | 18 ++++++++++++++++++
 7 files changed, 55 insertions(+)
 create mode 100644 .gitattributes
 create mode 100644 data/restaurants.csv
 create mode 100644 data/restaurants.json
 create mode 100644 demo/configs/consistency.json
 create mode 100644 demo/configs/sqlite.json
 create mode 100644 demo/run_demo.py

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..3010086
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+data/restaurants.csv filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
index 264cca2..0d66586 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,21 @@ Metis is a framework to automatically assess the quality of tabular data across
 python -m demo.getting_started
 ```
 
+## Full demo (all metrics)
+
+To run every registered metric against the full TripAdvisor European Restaurants dataset, use the extended demo. **Note: this will take some time.**
+
+```
+python -m demo.run_demo
+```
+
+The demo uses `data/restaurants.csv`, the full
+[TripAdvisor European Restaurants dataset from Kaggle](https://www.kaggle.com/datasets/stefanoleone992/tripadvisor-european-restaurants)
+(~1.08 M rows). The CSV includes 42 original columns (ratings, cuisines,
+location, price level, …) plus two synthetic timestamp columns
+(`first_review_date` and `last_review_date`) with ~10 % intentional nulls
+to surface interesting completeness findings.
+
 ## How to implement new metrics
 
 To extend the Metis framework and add new data quality metrics, please check our interface for easy integration.
diff --git a/data/restaurants.csv b/data/restaurants.csv
new file mode 100644
index 0000000..7bd0bd5
--- /dev/null
+++ b/data/restaurants.csv
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:17bcb14f67996d50b77a8077539713424bc47b9e29b1150f79ac71a1fa74e498
+size 700264775
diff --git a/data/restaurants.json b/data/restaurants.json
new file mode 100644
index 0000000..5154c1a
--- /dev/null
+++ b/data/restaurants.json
@@ -0,0 +1,9 @@
+{
+    "loader": "CSV",
+    "name": "Restaurants",
+    "file_name": "restaurants.csv",
+    "delimiter": ",",
+    "encoding": "utf-8",
+    "header": 0,
+    "nrows": null
+}
diff --git a/demo/configs/consistency.json b/demo/configs/consistency.json
new file mode 100644
index 0000000..994d1ed
--- /dev/null
+++ b/demo/configs/consistency.json
@@ -0,0 +1,4 @@
+{
+    "province": ["region"],
+    "country": ["default_language"]
+}
diff --git a/demo/configs/sqlite.json b/demo/configs/sqlite.json
new file mode 100644
index 0000000..3c76885
--- /dev/null
+++ b/demo/configs/sqlite.json
@@ -0,0 +1,5 @@
+{
+    "writer_name": "sqlite",
+    "table_name": "dqresults",
+    "db_name": "dq_repository/demo.db"
+}
diff --git a/demo/run_demo.py b/demo/run_demo.py
new file mode 100644
index 0000000..e2b6a81
--- /dev/null
+++ b/demo/run_demo.py
@@ -0,0 +1,18 @@
+from metis.dq_orchestrator import DQOrchestrator
+from metis.metric import Metric
+
+_METRIC_CONFIGS = {
+    "consistency_countFDViolations": "demo/configs/consistency.json",
+}
+
+orchestrator = DQOrchestrator(writer_config_path="demo/configs/sqlite.json")
+orchestrator.load(data_loader_configs=["data/restaurants.json"])
+
+for metric_name in Metric.registry:
+    try:
+        orchestrator.assess(
+            metrics=[metric_name],
+            metric_configs=[_METRIC_CONFIGS.get(metric_name)],
+        )
+    except Exception as exc:
+        print(f"Metric {metric_name} failed: {exc}")