From b3d3eafd1c0904fc05a4c8497ac0ce0be7e0b54c Mon Sep 17 00:00:00 2001 From: Korawich Anuttra Date: Sun, 16 Mar 2025 22:28:12 +0700 Subject: [PATCH 1/4] :page_facing_up: docs: update readme for usage of data-quality. --- README.md | 24 ++++++++++++++++++++++++ src/sqlplate/__init__.py | 1 + 2 files changed, 25 insertions(+) diff --git a/README.md b/README.md index efc6ef3..b305313 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,8 @@ pip install -U sqlplate ## :fork_and_knife: Usage +### Generate SQL template + Start passing option parameters before generate the Delta ETL SQL statement that will use on the Azure Databricks service. @@ -108,6 +110,28 @@ WHEN NOT MATCHED THEN INSERT ; ``` +### Data Quality + +> [!IMPORTANT] +> This feature does not support yet!!! + +```python +from sqlplate import SQLity, Condition + +report: str = ( + SQLity.format('databricks') + .template('quality') + .option('catalog', 'catalog-name') + .option('schema', 'schema-name') + .option('table', 'table-name') + .check('unique', Condition(cols=['pk_col'], rule="unique")) + .check('not-null', Condition(cols=['col01', 'col02'], rule="not-null")) + .check('row-count', Contition(rule="count")) + .validate(output='html') +) +print(report.strip().strip('\n')) +``` + ## :chains: Support Systems | System | Progress Status | System Integration Test | Remark | diff --git a/src/sqlplate/__init__.py b/src/sqlplate/__init__.py index 73cf028..e208322 100644 --- a/src/sqlplate/__init__.py +++ b/src/sqlplate/__init__.py @@ -1 +1,2 @@ from .sqlplate import SQLPlate +from .sqlity import SQLity From 9e1111eff78d0a11635ccee258a7e5a6110c2026 Mon Sep 17 00:00:00 2001 From: Korawich Anuttra Date: Sun, 16 Mar 2025 22:40:47 +0700 Subject: [PATCH 2/4] :gear: fixed: remove usesage pattern of data-quality. 
--- README.md | 13 +++++++------ src/sqlplate/__init__.py | 1 - src/sqlplate/rules.py | 10 ++++++++++ src/sqlplate/sqlity.py | 10 ---------- src/sqlplate/sqlplate.py | 23 ++++++++++++++++++++++- 5 files changed, 39 insertions(+), 18 deletions(-) create mode 100644 src/sqlplate/rules.py delete mode 100644 src/sqlplate/sqlity.py diff --git a/README.md b/README.md index b305313..82efa46 100644 --- a/README.md +++ b/README.md @@ -116,17 +116,18 @@ WHEN NOT MATCHED THEN INSERT > This feature does not support yet!!! ```python -from sqlplate import SQLity, Condition +from sqlplate import SQLPlate +from sqlplate.rules import Unique, NotNull, Count report: str = ( - SQLity.format('databricks') - .template('quality') + SQLPlate.format('databricks') + .quality(mode="pushdown") .option('catalog', 'catalog-name') .option('schema', 'schema-name') .option('table', 'table-name') - .check('unique', Condition(cols=['pk_col'], rule="unique")) - .check('not-null', Condition(cols=['col01', 'col02'], rule="not-null")) - .check('row-count', Contition(rule="count")) + .check('unique', Unique(cols=['pk_col'])) + .check('not-null', NotNull(cols=['col01', 'col02'])) + .check('row-count', Count()) .validate(output='html') ) print(report.strip().strip('\n')) diff --git a/src/sqlplate/__init__.py b/src/sqlplate/__init__.py index e208322..73cf028 100644 --- a/src/sqlplate/__init__.py +++ b/src/sqlplate/__init__.py @@ -1,2 +1 @@ from .sqlplate import SQLPlate -from .sqlity import SQLity diff --git a/src/sqlplate/rules.py b/src/sqlplate/rules.py new file mode 100644 index 0000000..6c2f025 --- /dev/null +++ b/src/sqlplate/rules.py @@ -0,0 +1,10 @@ +class BaseRule: ... + + +class Unique(BaseRule): ... + + +class NotNull(BaseRule): ... + + +class Count(BaseRule): ... 
diff --git a/src/sqlplate/sqlity.py b/src/sqlplate/sqlity.py deleted file mode 100644 index 76d6ad7..0000000 --- a/src/sqlplate/sqlity.py +++ /dev/null @@ -1,10 +0,0 @@ -# ------------------------------------------------------------------------------ -# Copyright (c) 2022 Korawich Anuttra. All rights reserved. -# Licensed under the MIT License. See LICENSE in the project root for -# license information. -# ------------------------------------------------------------------------------ -from __future__ import annotations - - -class SQLity: - """A SQLity object for render data quality report by Jinja template.""" diff --git a/src/sqlplate/sqlplate.py b/src/sqlplate/sqlplate.py index 9ab5f33..1be5ed7 100644 --- a/src/sqlplate/sqlplate.py +++ b/src/sqlplate/sqlplate.py @@ -6,7 +6,7 @@ from __future__ import annotations from pathlib import Path -from typing import Any, Iterator, Optional, Callable +from typing import Any, Iterator, Optional, Callable, Literal from jinja2 import Template @@ -83,6 +83,9 @@ def template(self, name: str) -> 'SQLPlate': ) return self + def quality(self, mode: Literal["pushdown", "memory"]) -> 'SQLPlate': + return self + def option(self, key: str, value: Any) -> 'SQLPlate': """Pass an option key-value pair before generate template.""" self._option[key] = value @@ -144,3 +147,21 @@ def stream( ) if trim(s) != '' ) + + def check( + self, + name: str, + rule: Any, + ) -> 'SQLPlate': + return self + + def validate( + self, + output: Literal["json", "html"], + ): + return self + + def filter( + self, + ): + return self From a4b4a590bf9c1cde2a6de54ef4a3490f1d4ec899 Mon Sep 17 00:00:00 2001 From: Korawich Anuttra Date: Wed, 19 Mar 2025 21:58:29 +0700 Subject: [PATCH 3/4] :gear: fixed: change way to get dq. 
--- README.md | 24 +++++++++++++++--------- src/sqlplate/rules.py | 10 ---------- src/sqlplate/sqlplate.py | 18 ------------------ templates/databricks/quality.check.sql | 19 +++++++++++++++++++ tests/test_databricks.py | 16 ++++++++++++++++ 5 files changed, 50 insertions(+), 37 deletions(-) delete mode 100644 src/sqlplate/rules.py create mode 100644 templates/databricks/quality.check.sql diff --git a/README.md b/README.md index 82efa46..3064444 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,7 @@ print(statement.strip().strip('\n')) The result SQL statement: -```text +```sql MERGE INTO catalog-name.schema-name.table-name AS target USING ( WITH change_query AS ( @@ -117,20 +117,26 @@ WHEN NOT MATCHED THEN INSERT ```python from sqlplate import SQLPlate -from sqlplate.rules import Unique, NotNull, Count -report: str = ( +statement: str = ( SQLPlate.format('databricks') - .quality(mode="pushdown") + .template('quality.check') .option('catalog', 'catalog-name') .option('schema', 'schema-name') .option('table', 'table-name') - .check('unique', Unique(cols=['pk_col'])) - .check('not-null', NotNull(cols=['col01', 'col02'])) - .check('row-count', Count()) - .validate(output='html') + .option('filter', "load_date >= to_timestamp('20250201', 'yyyyMMdd')") + .option('unique', ['pk_col']) + .option('notnull', ['col01', 'col02']) + .option('row_count', True) + .load() ) -print(report.strip().strip('\n')) +print(statement.strip().strip('\n')) +``` + +The result SQL statement: + +```sql + ``` ## :chains: Support Systems diff --git a/src/sqlplate/rules.py b/src/sqlplate/rules.py deleted file mode 100644 index 6c2f025..0000000 --- a/src/sqlplate/rules.py +++ /dev/null @@ -1,10 +0,0 @@ -class BaseRule: ... - - -class Unique(BaseRule): ... - - -class NotNull(BaseRule): ... - - -class Count(BaseRule): ... 
diff --git a/src/sqlplate/sqlplate.py b/src/sqlplate/sqlplate.py index 1be5ed7..b2f3565 100644 --- a/src/sqlplate/sqlplate.py +++ b/src/sqlplate/sqlplate.py @@ -147,21 +147,3 @@ def stream( ) if trim(s) != '' ) - - def check( - self, - name: str, - rule: Any, - ) -> 'SQLPlate': - return self - - def validate( - self, - output: Literal["json", "html"], - ): - return self - - def filter( - self, - ): - return self diff --git a/templates/databricks/quality.check.sql b/templates/databricks/quality.check.sql new file mode 100644 index 0000000..1473f24 --- /dev/null +++ b/templates/databricks/quality.check.sql @@ -0,0 +1,19 @@ +{% extends "base.jinja" %} + +{% block statement %} +WITH source AS ( + SELECT + * + FROM {{ catalog }}.{{ schema }}.{{ table }} + {%+ if filter %}WHERE {{ filter }}{% endif +%} +) +SELECT + * + {%+ if row_count %}, (SELECT COUNT(1) FROM source) AS table_records{% endif +%} + {%+ if unique -%} + {%- for col in unique -%} + , (SELECT COUNT {{ col }} FROM (SELECT DISTINCT {{ col}} FROM source)) AS unique_{{ col }} + {%- endfor -%} + {%- endif +%} +FROM source +{% endblock statement %} diff --git a/tests/test_databricks.py b/tests/test_databricks.py index d1c1ee1..3593504 100644 --- a/tests/test_databricks.py +++ b/tests/test_databricks.py @@ -277,3 +277,19 @@ def test_sql_full_dump(template_path): FROM ( SELECT * FROM catalog-name.schema-name.source-name ) AS sub_query ; """).strip('\n') + + +def test_quality_check(template_path): + statement: SQLPlate = ( + SQLPlate.format('databricks', path=template_path) + .template('quality.check') + .option('catalog', 'catalog-name') + .option('schema', 'schema-name') + .option('table', 'table-name') + .option('filter', "load_date >= to_timestamp('20250201', 'yyyyMMdd')") + .option('unique', ['pk_col']) + .option('notnull', ['col01', 'col02']) + .option("row_count", True) + .load() + ) + print(statement) From 10dc388e0cfafd068fba31a2b8ed995f35f356a9 Mon Sep 17 00:00:00 2001 From: Korawich Anuttra Date: Wed, 19 
Mar 2025 23:47:33 +0700 Subject: [PATCH 4/4] :dart: feat: add tempate for quality check. --- README.md | 24 +++++++++++++++++++-- templates/databricks/quality.check.sql | 28 ++++++++++++++++++++----- templates/databricks/quality.metrix.sql | 11 ++++++++++ tests/test_databricks.py | 23 +++++++++++++++++++- 4 files changed, 78 insertions(+), 8 deletions(-) create mode 100644 templates/databricks/quality.metrix.sql diff --git a/README.md b/README.md index 3064444..fe6d0a4 100644 --- a/README.md +++ b/README.md @@ -112,6 +112,9 @@ WHEN NOT MATCHED THEN INSERT ### Data Quality +This package only generates SQL statements. For the data quality part, you can +use the quality template. + > [!IMPORTANT] > This feature does not support yet!!! @@ -127,7 +130,8 @@ statement: str = ( .option('filter', "load_date >= to_timestamp('20250201', 'yyyyMMdd')") .option('unique', ['pk_col']) .option('notnull', ['col01', 'col02']) - .option('row_count', True) + .option("contain", [("col01", ["A", "B", "C"])]) + .option("validate", [("col03", "> 10000")]) .load() ) print(statement.strip().strip('\n')) @@ -136,7 +140,23 @@ print(statement.strip().strip('\n')) The result SQL statement: ```sql - +WITH source AS ( + SELECT + * + FROM + catalog-name.schema-name.table-name + WHERE load_date >= to_timestamp('20250201', 'yyyyMMdd') +) +, records AS ( + SELECT COUNT(1) AS table_records + FROM source +) +SELECT + (SELECT table_records FROM records) AS table_records + , ((SELECT COUNT( DISTINCT pk_col ) FROM source) = (SELECT table_records FROM records)) AS unique_pk_col + , (SELECT COUNT(1) FROM source WHERE pk_col IS NULL) = 0 AS notnull_pk_col + , (SELECT COUNT(1) FROM source WHERE col01 NOT IN ['A', 'B', 'C']) = 0 AS contain_col01 + , ((SELECT COUNT(1) FROM source WHERE col03 > 10000) = (SELECT table_records FROM records)) AS validate_col03 ``` ## :chains: Support Systems diff --git a/templates/databricks/quality.check.sql b/templates/databricks/quality.check.sql index 1473f24..37de899 100644
--- a/templates/databricks/quality.check.sql +++ b/templates/databricks/quality.check.sql @@ -4,16 +4,34 @@ WITH source AS ( SELECT * - FROM {{ catalog }}.{{ schema }}.{{ table }} + FROM + {{ catalog }}.{{ schema }}.{{ table }} {%+ if filter %}WHERE {{ filter }}{% endif +%} ) +, records AS ( + SELECT COUNT(1) AS table_records + FROM source +) SELECT - * - {%+ if row_count %}, (SELECT COUNT(1) FROM source) AS table_records{% endif +%} + (SELECT table_records FROM records) AS table_records {%+ if unique -%} {%- for col in unique -%} - , (SELECT COUNT {{ col }} FROM (SELECT DISTINCT {{ col}} FROM source)) AS unique_{{ col }} + , ((SELECT COUNT( DISTINCT {{ col }} ) FROM source) = (SELECT table_records FROM records)) AS unique_{{ col }} + {%- endfor -%} + {%- endif +%} + {%+ if notnull -%} + {%- for col in notnull -%} + , (SELECT COUNT(1) FROM source WHERE {{ col }} IS NULL) = 0 AS notnull_{{ col }} + {%- endfor -%} + {%- endif +%} + {%+ if contain -%} + {%- for col in contain -%} + , (SELECT COUNT(1) FROM source WHERE {{ col[0] }} NOT IN {{ col[1] }}) = 0 AS contain_{{ col[0] }} + {%- endfor -%} + {%- endif +%} + {%+ if validate -%} + {%- for col in validate -%} + , ((SELECT COUNT(1) FROM source WHERE {{ col[0] }} {{ col[1] }}) = (SELECT table_records FROM records)) AS validate_{{ col[0] }} {%- endfor -%} {%- endif +%} -FROM source {% endblock statement %} diff --git a/templates/databricks/quality.metrix.sql b/templates/databricks/quality.metrix.sql new file mode 100644 index 0000000..5a21b34 --- /dev/null +++ b/templates/databricks/quality.metrix.sql @@ -0,0 +1,11 @@ +{% extends "base.jinja" %} + +{% block statement %} +WITH source AS ( + SELECT + * + FROM {{ catalog }}.{{ schema }}.{{ table }} + {%+ if filter %}WHERE {{ filter }}{% endif +%} +) +SELECT +{% endblock statement %} diff --git a/tests/test_databricks.py b/tests/test_databricks.py index 3593504..41aec71 100644 --- a/tests/test_databricks.py +++ b/tests/test_databricks.py @@ -289,7 +289,28 @@ def
test_quality_check(template_path): .option('filter', "load_date >= to_timestamp('20250201', 'yyyyMMdd')") .option('unique', ['pk_col']) .option('notnull', ['col01', 'col02']) - .option("row_count", True) + .option( + "contain", + [("col01", ["A", "B", "C"])], + ) + .option( + "validate", + [("col03", "> 10000")], + ) + .load() + ) + print(statement) + + +def test_quality_metrix(template_path): + statement: SQLPlate = ( + SQLPlate.format('databricks', path=template_path) + .template('quality.metrix') + .option('catalog', 'catalog-name') + .option('schema', 'schema-name') + .option('table', 'table-name') + .option('filter', "load_date >= to_timestamp('20250201', 'yyyyMMdd')") + .option("metrix", ["col1", "col2", "col3"]) .load() ) print(statement)