From 4e6455b3e370dc4952fb65be937d8dad8384bf10 Mon Sep 17 00:00:00 2001 From: venu-sambarapu-DS Date: Mon, 3 Jun 2024 19:14:12 +0530 Subject: [PATCH 1/5] Added code to read standard datasets only from google sheet --- app/api/api_v1/routers/dictionary.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/app/api/api_v1/routers/dictionary.py b/app/api/api_v1/routers/dictionary.py index c76030d..c59bcad 100644 --- a/app/api/api_v1/routers/dictionary.py +++ b/app/api/api_v1/routers/dictionary.py @@ -1,6 +1,8 @@ import pandas as pd from fastapi import APIRouter, HTTPException, status from fastapi.encoders import jsonable_encoder +import io +import requests from fastapi.responses import JSONResponse from app.core.config import CORE_FOLDER, Settings @@ -12,13 +14,26 @@ dictionary_router = router = APIRouter() +g_sheet_session = requests.Session() +g_sheet_response = g_sheet_session.get("https://docs.google.com/spreadsheets/d/1NEsFJGr5IHsrIakGgeNFUvz5zpLOadh_vDH7Apqmv9E/gviz/tq?tqx=out:csv&sheet=master_dictionaries") +g_sheet_bytes_data = g_sheet_response.content +data = pd.read_csv(io.StringIO(g_sheet_bytes_data.decode('utf-8'))) +print("reading data from google sheet@@@@") +# data.rename( +# columns={ +# "country_standard_name": "country", +# "unique_standard_airline_name": "airline", +# "standard_disease_name": "disease", +# "psu_companies": "psu", +# } +# ) +# print(data.columns.tolist()) + + @router.get("/", summary="Get all Saved Entities csv file name") async def get_entity_names(): # List down all the csv files present in the config folder - return [ - csv_file.name.replace(".csv", "") - for csv_file in CORE_FOLDER.glob("**/*.csv") - ] + return data.columns.tolist() @router.get( @@ -27,9 +42,9 @@ async def get_entity_names(): response_class=JSONResponse, ) async def get_entity_data(entity: str): - entity_df = pd.read_csv(CORE_FOLDER / f"{entity}.csv") + entity_df = data[[entity]].dropna() # to avoid json conversion error 
- entity_df = entity_df.fillna("") + # entity_df = entity_df.fillna("") # convert to json json_compatible_item_data = jsonable_encoder( From e38430869a5ced9c600f5a2c14a03db64ece1466 Mon Sep 17 00:00:00 2001 From: venu-sambarapu-DS Date: Tue, 4 Jun 2024 14:33:50 +0530 Subject: [PATCH 2/5] Modified the way of reading standard dataset --- app/api/api_v1/routers/dictionary.py | 25 +++++++++++++++---------- app/utils/airline.py | 12 +++++------- app/utils/common.py | 16 ++++++++-------- app/utils/geography.py | 13 +++++-------- app/utils/insurance.py | 10 ++++------ app/utils/metadata.py | 23 ++++++++++------------- 6 files changed, 47 insertions(+), 52 deletions(-) diff --git a/app/api/api_v1/routers/dictionary.py b/app/api/api_v1/routers/dictionary.py index c59bcad..f962ee5 100644 --- a/app/api/api_v1/routers/dictionary.py +++ b/app/api/api_v1/routers/dictionary.py @@ -18,16 +18,20 @@ g_sheet_response = g_sheet_session.get("https://docs.google.com/spreadsheets/d/1NEsFJGr5IHsrIakGgeNFUvz5zpLOadh_vDH7Apqmv9E/gviz/tq?tqx=out:csv&sheet=master_dictionaries") g_sheet_bytes_data = g_sheet_response.content data = pd.read_csv(io.StringIO(g_sheet_bytes_data.decode('utf-8'))) -print("reading data from google sheet@@@@") -# data.rename( -# columns={ -# "country_standard_name": "country", -# "unique_standard_airline_name": "airline", -# "standard_disease_name": "disease", -# "psu_companies": "psu", -# } -# ) -# print(data.columns.tolist()) + +standard_data_values = data.copy() +standard_data_values.rename( + columns={ + "country_standard_name": "country", + "unique_standard_airline_name": "airline", + "standard_disease_name": "diseases", + "psu_companies": "psu", + "standard_district_name": "district", + "standard_states": "state", + "insurance_standard_names": "insurance_companies" + }, + inplace=True, +) @router.get("/", summary="Get all Saved Entities csv file name") @@ -50,6 +54,7 @@ async def get_entity_data(entity: str): json_compatible_item_data = jsonable_encoder( 
entity_df.to_dict(orient="records") ) + print(json_compatible_item_data) return JSONResponse(content=json_compatible_item_data) diff --git a/app/utils/airline.py b/app/utils/airline.py index 6578e3f..be39bd3 100644 --- a/app/utils/airline.py +++ b/app/utils/airline.py @@ -1,9 +1,9 @@ import great_expectations as ge from fastapi.encoders import jsonable_encoder - -from app.core.config import APP_DIR, AirlineSettings, Settings +from app.api.api_v1.routers.dictionary import standard_data_values +from app.core.config import AirlineSettings, Settings from app.utils.column_mapping import find_airline_name_columns -from app.utils.common import modify_values_to_be_in_set, read_pandas_dataset +from app.utils.common import modify_values_to_be_in_set settings = Settings() airline_settings = AirlineSettings() @@ -14,10 +14,8 @@ async def modify_airline_name_expectation_suite( ): default_expectation_suite = airline_settings.AIRLINE_NAME_EXPECTATION - airline_names_dataset = await read_pandas_dataset( - APP_DIR / "core" / "airline_names.csv" - ) - airline_names_list = airline_names_dataset["airline_names"].tolist() + airline_names_dataset = standard_data_values[["airline"]].dropna() + airline_names_list = airline_names_dataset["airline"].tolist() changed_config = { "expect_column_values_to_be_in_set": { diff --git a/app/utils/common.py b/app/utils/common.py index e6c702e..28b4225 100644 --- a/app/utils/common.py +++ b/app/utils/common.py @@ -2,13 +2,13 @@ import re from io import BytesIO from typing import Union - +# from app.api.api_v1.routers.dictionary import data as dictionary_data import great_expectations as ge import pandas as pd from charset_normalizer import from_bytes from fastapi.logger import logger -from app.core.config import APP_DIR, GeographySettings +from app.core.config import GeographySettings logging.basicConfig(level=logging.INFO) geographic_settings = GeographySettings() @@ -79,12 +79,12 @@ async def read_pandas_dataset(source: str, **kwargs): return 
dataset -async def load_values_to_be_in_set(domain: str): - # this function is used to load csv files, consisting values - # for states or country that are required to be in specific set - set_values_file = APP_DIR / "core" / f"{domain}.csv" - set_values = pd.read_csv(set_values_file)[f"{domain}"].unique() - return set_values +# async def load_values_to_be_in_set(domain: str): +# # this function is used to load csv files, consisting values +# # for states or country that are required to be in specific set +# set_values_file = APP_DIR / "core" / f"{domain}.csv" +# set_values = pd.read_csv(set_values_file)[f"{domain}"].unique() +# return set_values async def modify_column_names_to_expectation_suite( diff --git a/app/utils/geography.py b/app/utils/geography.py index cceeacf..e876320 100644 --- a/app/utils/geography.py +++ b/app/utils/geography.py @@ -1,15 +1,14 @@ import asyncio from collections import ChainMap - +from app.api.api_v1.routers.dictionary import standard_data_values import great_expectations as ge from fastapi.encoders import jsonable_encoder -from app.core.config import APP_DIR, GeographySettings, Settings +from app.core.config import GeographySettings, Settings from app.utils.column_mapping import find_geography_columns from app.utils.common import ( modify_values_to_be_in_set, read_dataset, - read_pandas_dataset, ) settings = Settings() @@ -19,7 +18,7 @@ async def modify_city_expectation_suite(column_name: str, result_format: str): default_expectation_suite = geograhy_setting.STATE_EXPECTATION - city_dataset = await read_pandas_dataset(APP_DIR / "core" / "district.csv") + city_dataset = standard_data_values[["district"]].dropna() city_list = city_dataset["districts"].tolist() changed_config = { @@ -65,7 +64,7 @@ async def city_expectation_suite(dataset, result_format): async def modify_state_expectation_suite(column_name: str, result_format: str): default_expectation_suite = geograhy_setting.STATE_EXPECTATION - state_dataset = await 
read_pandas_dataset(APP_DIR / "core" / "state.csv") + state_dataset = standard_data_values[["state"]].dropna() state_list = state_dataset["state"].tolist() changed_config = { @@ -112,9 +111,7 @@ async def modify_country_expectation_suite( ): default_expectation_suite = geograhy_setting.COUNTRY_EXPECTATION - country_dataset = await read_pandas_dataset( - APP_DIR / "core" / "country.csv" - ) + country_dataset = standard_data_values[["country"]].dropna() country_list = country_dataset["country"].tolist() changed_config = { diff --git a/app/utils/insurance.py b/app/utils/insurance.py index c78dad9..93d64d3 100644 --- a/app/utils/insurance.py +++ b/app/utils/insurance.py @@ -1,9 +1,9 @@ import great_expectations as ge from fastapi.encoders import jsonable_encoder - -from app.core.config import APP_DIR, InsuranceCompanySettings, Settings +from app.api.api_v1.routers.dictionary import standard_data_values +from app.core.config import InsuranceCompanySettings, Settings from app.utils.column_mapping import find_insurance_company_columns -from app.utils.common import modify_values_to_be_in_set, read_pandas_dataset +from app.utils.common import modify_values_to_be_in_set settings = Settings() insurance_company_settings = InsuranceCompanySettings() @@ -16,9 +16,7 @@ async def modify_insurance_company_name_expectation_suite( insurance_company_settings.INSURANCE_COMPANY_NAME_EXPECTATION ) - insurance_company_names_dataset = await read_pandas_dataset( - APP_DIR / "core" / "insurance_companies.csv" - ) + insurance_company_names_dataset = standard_data_values[["insurance_companies"]] insurance_company_names_list = insurance_company_names_dataset[ "insurance_companies" ].tolist() diff --git a/app/utils/metadata.py b/app/utils/metadata.py index c9699be..1c0745c 100644 --- a/app/utils/metadata.py +++ b/app/utils/metadata.py @@ -4,20 +4,22 @@ import great_expectations as ge from fastapi.encoders import jsonable_encoder -from app.core.config import APP_DIR, MetadataSettings, Settings 
+from app.core.config import MetadataSettings, Settings from app.utils.column_mapping import find_metadata_columns from app.utils.common import ( modify_values_to_be_in_set, modify_values_to_match_regex_list, read_dataset, - read_pandas_dataset, ) +from app.api.api_v1.routers.dictionary import standard_data_values from app.utils.general import general_metadata_expectation_suite from app.utils.tags import tags_expectation_suite from app.utils.unit import unit_expectation_suite settings = Settings() meta_data_setting = MetadataSettings() +# todo: in future if we need short_form values from dictionary uncomment the following +# short_form_dataset = standard_data_values[["short_form"]].dropna() async def modify_sector_expectation_suite( @@ -26,7 +28,7 @@ async def modify_sector_expectation_suite( default_expectation_suite = meta_data_setting.SECTOR_EXPECTATION - sector_dataset = await read_pandas_dataset(APP_DIR / "core" / "sector.csv") + sector_dataset = standard_data_values[["sector"]].dropna() sector_list = sector_dataset["sector"].tolist() changed_config = { @@ -86,9 +88,7 @@ async def modify_organization_expectation_suite( ): default_expectation_suite = meta_data_setting.ORGANIZATION_EXPECTATION - organization_dataset = await read_pandas_dataset( - APP_DIR / "core" / "organization.csv" - ) + organization_dataset = standard_data_values[["organization"]].dropna() organization_list = organization_dataset["organization"].tolist() changed_config = { @@ -148,10 +148,9 @@ async def modify_short_form_expectation_suite( ): default_expectation_suite = meta_data_setting.SHORT_FORM_EXPECTATION - short_form_dataset = await read_pandas_dataset( - APP_DIR / "core" / "short_form.csv" - ) - short_form_list = short_form_dataset["short_form"].tolist() + # NOTE: Modify the short_form_expectation_suite to use short_form + short_form_dataset = {"short_form": ""} + short_form_list = short_form_dataset["short_form"] changed_config = { "expect_column_values_to_be_in_set": { @@ -210,9 
+209,7 @@ async def modify_frequency_of_update_expectation_suite( meta_data_setting.FREQUENCY_OF_UPDATE_EXPECTATION ) - frequency_of_update_dataset = await read_pandas_dataset( - APP_DIR / "core" / "frequency_of_update.csv" - ) + frequency_of_update_dataset = standard_data_values[["frequency_of_update"]].dropna() frequency_of_update_list = frequency_of_update_dataset[ "frequency_of_update" ].tolist() From 4e81cd978f7fc87dccecce60db3181796ab40e0e Mon Sep 17 00:00:00 2001 From: venu-sambarapu-DS Date: Wed, 5 Jun 2024 16:39:24 +0530 Subject: [PATCH 3/5] Modified the way of reading the standard dataset --- app/api/api_v1/routers/dictionary.py | 15 ++++++++++----- app/utils/airline.py | 1 + app/utils/common.py | 1 + app/utils/geography.py | 8 +++----- app/utils/insurance.py | 5 ++++- app/utils/metadata.py | 6 ++++-- 6 files changed, 23 insertions(+), 13 deletions(-) diff --git a/app/api/api_v1/routers/dictionary.py b/app/api/api_v1/routers/dictionary.py index f962ee5..85e876d 100644 --- a/app/api/api_v1/routers/dictionary.py +++ b/app/api/api_v1/routers/dictionary.py @@ -1,8 +1,9 @@ +import io + import pandas as pd +import requests from fastapi import APIRouter, HTTPException, status from fastapi.encoders import jsonable_encoder -import io -import requests from fastapi.responses import JSONResponse from app.core.config import CORE_FOLDER, Settings @@ -15,9 +16,13 @@ g_sheet_session = requests.Session() -g_sheet_response = g_sheet_session.get("https://docs.google.com/spreadsheets/d/1NEsFJGr5IHsrIakGgeNFUvz5zpLOadh_vDH7Apqmv9E/gviz/tq?tqx=out:csv&sheet=master_dictionaries") +common_g_sheet_link_format = "https://docs.google.com/spreadsheets/d/" +g_sheet_id = "1NEsFJGr5IHsrIakGgeNFUvz5zpLOadh_vDH7Apqmv9E" +download_sheet_name = "/gviz/tq?tqx=out:csv&sheet=master_dictionaries" +url_name = common_g_sheet_link_format + g_sheet_id + download_sheet_name +g_sheet_response = g_sheet_session.get(url_name) g_sheet_bytes_data = g_sheet_response.content -data = 
pd.read_csv(io.StringIO(g_sheet_bytes_data.decode('utf-8'))) +data = pd.read_csv(io.StringIO(g_sheet_bytes_data.decode("utf-8"))) standard_data_values = data.copy() standard_data_values.rename( @@ -28,7 +33,7 @@ "psu_companies": "psu", "standard_district_name": "district", "standard_states": "state", - "insurance_standard_names": "insurance_companies" + "insurance_standard_names": "insurance_companies", }, inplace=True, ) diff --git a/app/utils/airline.py b/app/utils/airline.py index be39bd3..5627303 100644 --- a/app/utils/airline.py +++ b/app/utils/airline.py @@ -1,5 +1,6 @@ import great_expectations as ge from fastapi.encoders import jsonable_encoder + from app.api.api_v1.routers.dictionary import standard_data_values from app.core.config import AirlineSettings, Settings from app.utils.column_mapping import find_airline_name_columns diff --git a/app/utils/common.py b/app/utils/common.py index 28b4225..fd313c9 100644 --- a/app/utils/common.py +++ b/app/utils/common.py @@ -2,6 +2,7 @@ import re from io import BytesIO from typing import Union + # from app.api.api_v1.routers.dictionary import data as dictionary_data import great_expectations as ge import pandas as pd diff --git a/app/utils/geography.py b/app/utils/geography.py index e876320..3b97368 100644 --- a/app/utils/geography.py +++ b/app/utils/geography.py @@ -1,15 +1,13 @@ import asyncio from collections import ChainMap -from app.api.api_v1.routers.dictionary import standard_data_values + import great_expectations as ge from fastapi.encoders import jsonable_encoder +from app.api.api_v1.routers.dictionary import standard_data_values from app.core.config import GeographySettings, Settings from app.utils.column_mapping import find_geography_columns -from app.utils.common import ( - modify_values_to_be_in_set, - read_dataset, -) +from app.utils.common import modify_values_to_be_in_set, read_dataset settings = Settings() geograhy_setting = GeographySettings() diff --git a/app/utils/insurance.py 
b/app/utils/insurance.py index 93d64d3..c87fb03 100644 --- a/app/utils/insurance.py +++ b/app/utils/insurance.py @@ -1,5 +1,6 @@ import great_expectations as ge from fastapi.encoders import jsonable_encoder + from app.api.api_v1.routers.dictionary import standard_data_values from app.core.config import InsuranceCompanySettings, Settings from app.utils.column_mapping import find_insurance_company_columns @@ -16,7 +17,9 @@ async def modify_insurance_company_name_expectation_suite( insurance_company_settings.INSURANCE_COMPANY_NAME_EXPECTATION ) - insurance_company_names_dataset = standard_data_values[["insurance_companies"]] + insurance_company_names_dataset = standard_data_values[ + ["insurance_companies"] + ] insurance_company_names_list = insurance_company_names_dataset[ "insurance_companies" ].tolist() diff --git a/app/utils/metadata.py b/app/utils/metadata.py index 1c0745c..ed868cd 100644 --- a/app/utils/metadata.py +++ b/app/utils/metadata.py @@ -4,6 +4,7 @@ import great_expectations as ge from fastapi.encoders import jsonable_encoder +from app.api.api_v1.routers.dictionary import standard_data_values from app.core.config import MetadataSettings, Settings from app.utils.column_mapping import find_metadata_columns from app.utils.common import ( @@ -11,7 +12,6 @@ modify_values_to_match_regex_list, read_dataset, ) -from app.api.api_v1.routers.dictionary import standard_data_values from app.utils.general import general_metadata_expectation_suite from app.utils.tags import tags_expectation_suite from app.utils.unit import unit_expectation_suite @@ -209,7 +209,9 @@ async def modify_frequency_of_update_expectation_suite( meta_data_setting.FREQUENCY_OF_UPDATE_EXPECTATION ) - frequency_of_update_dataset = standard_data_values[["frequency_of_update"]].dropna() + frequency_of_update_dataset = standard_data_values[ + ["frequency_of_update"] + ].dropna() frequency_of_update_list = frequency_of_update_dataset[ "frequency_of_update" ].tolist() From 
b9ba107c4b977e065719ecb9aca60588e3366aef Mon Sep 17 00:00:00 2001 From: venu-sambarapu-DS Date: Thu, 6 Jun 2024 16:48:28 +0530 Subject: [PATCH 4/5] ran test cases to check the code to load standard datasets from google sheet --- app/core/config.py | 80 ++++++++--- app/utils/airline.py | 2 +- app/utils/column_mapping.py | 14 +- app/utils/common.py | 30 +++- app/utils/geography.py | 6 +- app/utils/metadata.py | 268 ++++++++++++++++++++++++------------ 6 files changed, 284 insertions(+), 116 deletions(-) diff --git a/app/core/config.py b/app/core/config.py index 35011e6..5a66f3f 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -60,6 +60,9 @@ class Settings(BaseSettings): SERVICE_ACCOUNT_CONF: Dict[str, str] = {"": ""} GSHEET_SCOPES: List[str] = ["https://www.googleapis.com/auth/spreadsheets"] + # Metadata File Parameters + METADATA_COLUMN_ORDER_STRING = "" + class Config: env_file = ".env" @@ -397,7 +400,7 @@ class MetadataSettings(BaseSettings): SECTOR_KEYWORD = "sector" ORGANIZATION_KEYWORD = "organization" - SHORT_FORM_KEYWORD = "short_form" + # SHORT_FORM_KEYWORD = "short_form" DESCRIPTION_KEYWORD = "description" DATASET_NAME_FOR_FACTLY_KEYWORD = "dataset_name_for_factly" @@ -412,66 +415,107 @@ class MetadataSettings(BaseSettings): VARIABLE_MEASURED_KEYWORD = "variable_measured" DATA_NEXT_UPDATE_KEYWORD = "data_next_update" SOURCE_KEYWORD = "source" - SECTOR_EXPECTATION = { + DATASET_NAME_FOR_FACTLY_EXPECTATION = { "data_asset_type": None, - "expectation_suite_name": "sector_expectation_suite", + "expectation_suite_name": "dataset_name_for_factly_expectation_suite", "expectations": [ { - "expectation_type": "expect_column_values_to_be_in_set", + "expectation_type": "expect_column_value_lengths_to_be_between", "kwargs": { - "column": "sector", - "value_set": [], + "column": "dataset_name_for_factly", + "min_value": 5, + "max_value": 200, "result_format": "SUMMARY", }, "meta": { - "expectation_name": "Sector Name in set of values", + "expectation_name": 
"Dataset Name For Factly Length", "cleaning_pdf_link": "https://wp.me/ad1WQ9-dvg", - "expectation_error_message": "Sector Name should be from the Data Dictionary", + "expectation_error_message": "Dataset Name For Factly Length should be less than 200", }, } ], } - ORGANIZATION_EXPECTATION = { + DESCRIPTION_EXPECTATION = { "data_asset_type": None, - "expectation_suite_name": "organization_expectation_suite", + "expectation_suite_name": "description_expectation_suite", + "expectations": [ + { + "expectation_type": "expect_column_value_lengths_to_be_between", + "kwargs": { + "column": "description", + "min_value": 50, + "max_value": 5000, + "result_format": "SUMMARY", + }, + "meta": { + "expectation_name": "Description Length", + "cleaning_pdf_link": "https://wp.me/ad1WQ9-dvg", + "expectation_error_message": "Description should be greater than 50", + }, + } + ], + } + SECTOR_EXPECTATION = { + "data_asset_type": None, + "expectation_suite_name": "sector_expectation_suite", "expectations": [ { "expectation_type": "expect_column_values_to_be_in_set", "kwargs": { - "column": "organization", + "column": "sector", "value_set": [], "result_format": "SUMMARY", }, "meta": { - "expectation_name": "Organization Name in set of values", + "expectation_name": "Sector Name in set of values", "cleaning_pdf_link": "https://wp.me/ad1WQ9-dvg", - "expectation_error_message": "Organization Name should be from the Data Dictionary", + "expectation_error_message": "Sector Name should be from the Data Dictionary", }, } ], } - SHORT_FORM_EXPECTATION = { + ORGANIZATION_EXPECTATION = { "data_asset_type": None, - "expectation_suite_name": "short_form_expectation_suite", + "expectation_suite_name": "organization_expectation_suite", "expectations": [ { "expectation_type": "expect_column_values_to_be_in_set", "kwargs": { - "column": "short_form", + "column": "organization", "value_set": [], "result_format": "SUMMARY", }, "meta": { - "expectation_name": "Short Form in set of values", +
"expectation_name": "Organization Name in set of values", "cleaning_pdf_link": "https://wp.me/ad1WQ9-dvg", - "expectation_error_message": "Short Form should be from the Data Dictionary", + "expectation_error_message": "Organization Name should be from the Data Dictionary", }, } ], } + # SHORT_FORM_EXPECTATION = { + # "data_asset_type": None, + # "expectation_suite_name": "short_form_expectation_suite", + # "expectations": [ + # { + # "expectation_type": "expect_column_values_to_be_in_set", + # "kwargs": { + # "column": "short_form", + # "value_set": [], + # "result_format": "SUMMARY", + # }, + # "meta": { + # "expectation_name": "Short Form in set of values", + # "cleaning_pdf_link": "https://wp.me/ad1WQ9-dvg", + # "expectation_error_message": "Short Form should be from the Data Dictionary", + # }, + # } + # ], + # } + FREQUENCY_OF_UPDATE_EXPECTATION = { "data_asset_type": None, "expectation_suite_name": "frequency_of_update_expectation_suite", diff --git a/app/utils/airline.py b/app/utils/airline.py index 5627303..a3c2c3c 100644 --- a/app/utils/airline.py +++ b/app/utils/airline.py @@ -15,7 +15,7 @@ async def modify_airline_name_expectation_suite( ): default_expectation_suite = airline_settings.AIRLINE_NAME_EXPECTATION - airline_names_dataset = standard_data_values[["airline"]].dropna() + airline_names_dataset = standard_data_values[["airline"]].dropna().copy() airline_names_list = airline_names_dataset["airline"].tolist() changed_config = { diff --git a/app/utils/column_mapping.py b/app/utils/column_mapping.py index 098e0bb..cedf571 100644 --- a/app/utils/column_mapping.py +++ b/app/utils/column_mapping.py @@ -170,9 +170,9 @@ async def find_metadata_columns(columns: set): organization_pattern = re.compile( r".*({}).*".format(metadata_settings.ORGANIZATION_KEYWORD) ) - short_form_pattern = re.compile( - r".*({}).*".format(metadata_settings.SHORT_FORM_KEYWORD) - ) + # short_form_pattern = re.compile( + # r".*({}).*".format(metadata_settings.SHORT_FORM_KEYWORD) + # 
) description_pattern = re.compile( r".*({}).*".format(metadata_settings.DESCRIPTION_KEYWORD) ) @@ -217,9 +217,9 @@ async def find_metadata_columns(columns: set): organization_column, columns = extract_pattern_from_columns( columns, organization_pattern ) - short_form_column, columns = extract_pattern_from_columns( - columns, short_form_pattern - ) + # short_form_column, columns = extract_pattern_from_columns( + # columns, short_form_pattern + # ) description_column, columns = extract_pattern_from_columns( columns, description_pattern ) @@ -261,7 +261,7 @@ async def find_metadata_columns(columns: set): return { "sector": list(sector_column), "organization": list(organization_column), - "short_form": list(short_form_column), + # "short_form": list(short_form_column), "description": list(description_column), "tags": list(tags_column), "temporal_coverage": list(temporal_coverage_column), diff --git a/app/utils/common.py b/app/utils/common.py index fd313c9..90a5f25 100644 --- a/app/utils/common.py +++ b/app/utils/common.py @@ -3,7 +3,6 @@ from io import BytesIO from typing import Union -# from app.api.api_v1.routers.dictionary import data as dictionary_data import great_expectations as ge import pandas as pd from charset_normalizer import from_bytes @@ -113,6 +112,21 @@ async def modify_default_expectation_suite( return expectation_suite +async def modify_column_order_expectation_suite( + expectation_suite: dict, column_order: list +): + modified_expectations = [] + for expectation in expectation_suite["expectations"]: + if ( + expectation["expectation_type"] + == "expect_table_columns_to_match_ordered_list" + ): + expectation["kwargs"]["column_list"] = column_order + modified_expectations.append(expectation) + expectation_suite["expectations"] = modified_expectations + return expectation_suite + + async def modify_values_to_be_in_between( changed_config: dict, default_config: str ): @@ -127,6 +141,20 @@ async def modify_values_to_be_in_between( return default_config 
+async def modify_values_length_to_be_between( + changed_config: dict, default_config: str +): + for expectation in default_config["expectations"]: + if ( + expectation["expectation_type"] + == "expect_column_value_lengths_to_be_between" + ): + expectation["kwargs"].update( + changed_config["expect_column_value_lengths_to_be_between"] + ) + return default_config + + async def modify_values_to_be_in_set( changed_config: dict, default_config: str ): diff --git a/app/utils/geography.py b/app/utils/geography.py index 3b97368..8722d01 100644 --- a/app/utils/geography.py +++ b/app/utils/geography.py @@ -16,7 +16,7 @@ async def modify_city_expectation_suite(column_name: str, result_format: str): default_expectation_suite = geograhy_setting.STATE_EXPECTATION - city_dataset = standard_data_values[["district"]].dropna() + city_dataset = standard_data_values[["district"]].dropna().copy() city_list = city_dataset["districts"].tolist() changed_config = { @@ -62,7 +62,7 @@ async def city_expectation_suite(dataset, result_format): async def modify_state_expectation_suite(column_name: str, result_format: str): default_expectation_suite = geograhy_setting.STATE_EXPECTATION - state_dataset = standard_data_values[["state"]].dropna() + state_dataset = standard_data_values[["state"]].dropna().copy() state_list = state_dataset["state"].tolist() changed_config = { @@ -109,7 +109,7 @@ async def modify_country_expectation_suite( ): default_expectation_suite = geograhy_setting.COUNTRY_EXPECTATION - country_dataset = standard_data_values[["country"]].dropna() + country_dataset = standard_data_values[["country"]].dropna().copy() country_list = country_dataset["country"].tolist() changed_config = { diff --git a/app/utils/metadata.py b/app/utils/metadata.py index ed868cd..ad7a12b 100644 --- a/app/utils/metadata.py +++ b/app/utils/metadata.py @@ -7,7 +7,8 @@ from app.api.api_v1.routers.dictionary import standard_data_values from app.core.config import MetadataSettings, Settings from 
app.utils.column_mapping import find_metadata_columns -from app.utils.common import ( +from app.utils.common import ( # modify_column_order_expectation_suite, + modify_values_length_to_be_between, modify_values_to_be_in_set, modify_values_to_match_regex_list, read_dataset, @@ -18,18 +19,151 @@ settings = Settings() meta_data_setting = MetadataSettings() -# todo: in future if we need short_form values from dictionary uncomment the following -# short_form_dataset = standard_data_values[["short_form"]].dropna() + + +async def check_column_order(dataset): + results = {} + settings.METADATA_COLUMN_ORDER_STRING.split(",") + column_order_list = settings.METADATA_COLUMN_ORDER_STRING.split(",") + validation = dataset.expect_table_columns_to_match_ordered_list( + column_order_list + ) + results["Expect Table Columns To Match The Given List"] = validation + return jsonable_encoder(results) + + +async def modify_dataset_name_for_factly_expectation_suite( + column_name: str, result_format: str +): + default_expectation_suite = ( + meta_data_setting.DATASET_NAME_FOR_FACTLY_EXPECTATION + ) + changed_config = { + "expect_column_value_lengths_to_be_between": { + "min_value": 5, + "max_value": 200, + "column": column_name, + "result_format": result_format, + } + } + changed_expectation_suite = await modify_values_length_to_be_between( + changed_config, default_expectation_suite + ) + return changed_expectation_suite + + +async def dataset_name_for_factly_expectation_suite(dataset, result_format): + """Expectation to check description in specific range + + Expectation is on whether description lies in the range of 50 to 5000 characters + Flag if its outside the range. 
+ + Args: + dataset (Data-frame): Read metadata csv using Pandas Data-frame + result_format (str): SUMMARY + + Returns: + Dict: Dictionary of Expectations + """ + results = {} + mapped_columns = await find_metadata_columns(set(dataset.columns)) + sector_column = mapped_columns["dataset_name_for_factly"][0] + + expectation_suite = await modify_dataset_name_for_factly_expectation_suite( + sector_column, result_format + ) + # convert pandas dataset to great_expectations dataset + ge_pandas_dataset = ge.from_pandas( + dataset, expectation_suite=expectation_suite + ) + validation = ge_pandas_dataset.validate() + validation_ui_name = ( + validation["results"][0]["expectation_config"]["meta"][ + "expectation_name" + ] + + " - " + + validation["results"][0]["expectation_config"]["_kwargs"]["column"] + ) + results[validation_ui_name] = validation + + return jsonable_encoder(results) + + +async def modify_description_expectation_suite( + column_name: str, result_format: str +): + default_expectation_suite = meta_data_setting.DESCRIPTION_EXPECTATION + changed_config = { + "expect_column_value_lengths_to_be_between": { + "min_value": 50, + "max_value": 5000, + "column": column_name, + "result_format": result_format, + } + } + changed_expectation_suite = await modify_values_length_to_be_between( + changed_config, default_expectation_suite + ) + return changed_expectation_suite + + +async def description_expectation_suite(dataset, result_format): + """Expectation to check description in specific range + + Expectation is on whether description lies in the range of 50 to 5000 characters + Flag if its outside the range. 
+ + Args: + dataset (Data-frame): Read metadata csv using Pandas Data-frame + result_format (str): SUMMARY + + Returns: + Dict: Dictionary of Expectations + """ + results = {} + mapped_columns = await find_metadata_columns(set(dataset.columns)) + sector_column = mapped_columns["description"][0] + + expectation_suite = await modify_description_expectation_suite( + sector_column, result_format + ) + # convert pandas dataset to great_expectations dataset + ge_pandas_dataset = ge.from_pandas( + dataset, expectation_suite=expectation_suite + ) + + validation = ge_pandas_dataset.validate() + validation_ui_name = ( + validation["results"][0]["expectation_config"]["meta"][ + "expectation_name" + ] + + " - " + + validation["results"][0]["expectation_config"]["_kwargs"]["column"] + ) + results[validation_ui_name] = validation + + return jsonable_encoder(results) async def modify_sector_expectation_suite( column_name: str, result_format: str ): + """ + Summary: Modify the default sector expectation suite using + sector.csv file in app.core + + Args: + column_name (str): _description_ + result_format (str): _description_ + + Returns: + _type_: _description_ + """ default_expectation_suite = meta_data_setting.SECTOR_EXPECTATION - sector_dataset = standard_data_values[["sector"]].dropna() - sector_list = sector_dataset["sector"].tolist() + sector_dataset = standard_data_values[["sectors"]].dropna().copy() + sector_list = sector_dataset["sectors"].tolist() changed_config = { "expect_column_values_to_be_in_set": { @@ -88,8 +222,10 @@ async def modify_organization_expectation_suite( ): default_expectation_suite = meta_data_setting.ORGANIZATION_EXPECTATION - organization_dataset = standard_data_values[["organization"]].dropna() - organization_list = organization_dataset["organization"].tolist() + organization_dataset = ( + standard_data_values[["organisation"]].dropna().copy() + ) + organization_list = organization_dataset["organisation"].tolist() changed_config = { 
"expect_column_values_to_be_in_set": { @@ -139,7 +275,7 @@ async def organization_expectation_suite(dataset, result_format): + validation["results"][0]["expectation_config"]["_kwargs"]["column"] ) results[validation_ui_name] = validation - + # print(jsonable_encoder(results)) return jsonable_encoder(results) @@ -165,41 +301,41 @@ async def modify_short_form_expectation_suite( return changed_expectation_suite -async def short_form_expectation_suite(dataset, result_format): - """Expectation to check if Short Form values are in short_form.csv +# async def short_form_expectation_suite(dataset, result_format): +# """Expectation to check if Short Form values are in short_form.csv - Expectation is on whether every value present in short form column of metadata - csv is in short_form.csv file or not +# Expectation is on whether every value present in short form column of metadata +# csv is in short_form.csv file or not - Args: - dataset (Dataframe): Read metadata csv using Pandas Dataframe - result_format (str): SUMMARY +# Args: +# dataset (Dataframe): Read metadata csv using Pandas Dataframe +# result_format (str): SUMMARY - Returns: - Dict: Dictionary of Expectations - """ - results = {} - mapped_columns = await find_metadata_columns(set(dataset.columns)) - short_form_column = mapped_columns["short_form"][0] +# Returns: +# Dict: Dictionary of Expectations +# """ +# results = {} +# mapped_columns = await find_metadata_columns(set(dataset.columns)) +# short_form_column = mapped_columns["short_form"][0] - expectation_suite = await modify_short_form_expectation_suite( - short_form_column, result_format - ) - # convert pandas dataset to great_expectations dataset - ge_pandas_dataset = ge.from_pandas( - dataset, expectation_suite=expectation_suite - ) - validation = ge_pandas_dataset.validate() - validation_ui_name = ( - validation["results"][0]["expectation_config"]["meta"][ - "expectation_name" - ] - + " - " - + 
validation["results"][0]["expectation_config"]["_kwargs"]["column"] - ) - results[validation_ui_name] = validation +# expectation_suite = await modify_short_form_expectation_suite( +# short_form_column, result_format +# ) +# # convert pandas dataset to great_expectations dataset +# ge_pandas_dataset = ge.from_pandas( +# dataset, expectation_suite=expectation_suite +# ) +# validation = ge_pandas_dataset.validate() +# validation_ui_name = ( +# validation["results"][0]["expectation_config"]["meta"][ +# "expectation_name" +# ] +# + " - " +# + validation["results"][0]["expectation_config"]["_kwargs"]["column"] +# ) +# results[validation_ui_name] = validation - return jsonable_encoder(results) +# return jsonable_encoder(results) async def modify_frequency_of_update_expectation_suite( @@ -209,9 +345,9 @@ async def modify_frequency_of_update_expectation_suite( meta_data_setting.FREQUENCY_OF_UPDATE_EXPECTATION ) - frequency_of_update_dataset = standard_data_values[ - ["frequency_of_update"] - ].dropna() + frequency_of_update_dataset = ( + standard_data_values[["frequency_of_update"]].dropna().copy() + ) frequency_of_update_list = frequency_of_update_dataset[ "frequency_of_update" ].tolist() @@ -418,48 +554,6 @@ async def time_saved_in_hours_expectation_suite(dataset, result_format): return response -async def description_expectation_suite(dataset, result_format): - """Expectation to check description in specific range - - Expectation is on whether description lies in the range of 50 to 5000 characters - Flag if its outside the range. 
- - Args: - dataset (Dataframe): Read metadata csv using Pandas Dataframe - result_format (str): SUMMARY - - Returns: - Dict: Dictionary of Expectations - """ - mapped_columns = await find_metadata_columns(set(dataset.columns)) - description_column = mapped_columns["description"][0] - expectation_name = meta_data_setting.DESCRIPTION_KEYWORD.format( - column=description_column - ) - - ge_pandas_dataset = ge.from_pandas(dataset) - - expectation = ge_pandas_dataset.expect_column_values_to_be_between( - column=description_column, - min_value=50, - max_value=5000, - catch_exceptions=True, - result_format=result_format, - ) - - expectation_dict = expectation.to_json_dict() - expectation_dict["expectation_config"]["meta"] = { - "cleaning_pdf_link": settings.DATA_CLEANING_GUIDE_LINK, - "expectation_name": expectation_name, - } - response = { - expectation_dict["expectation_config"]["meta"][ - "expectation_name" - ]: expectation_dict - } - return response - - async def metadata_expectation_suite( dataset, result_format, dataset_name: str ): @@ -476,7 +570,7 @@ async def metadata_expectation_suite( """ if isinstance(dataset, str): dataset = await read_dataset(dataset) - + # print(dir(dataset)) # Dataset modification for sector expectation suite dataset_sector = dataset.copy() # explode the dataset based on sector column @@ -487,10 +581,12 @@ async def metadata_expectation_suite( dataset_sector["sectors"] = dataset_sector["sectors"].str.strip() expectations = await asyncio.gather( + check_column_order(dataset), sector_expectation_suite(dataset_sector, result_format), organization_expectation_suite(dataset, result_format), - short_form_expectation_suite(dataset, result_format), - # description_expectation_suite(dataset, result_format), + # short_form_expectation_suite(dataset, result_format), + description_expectation_suite(dataset, result_format), + dataset_name_for_factly_expectation_suite(dataset, result_format), unit_expectation_suite(dataset, result_format), 
tags_expectation_suite(dataset, result_format), frequency_of_update_expectation_suite(dataset, result_format), From eb3e9fd584ce116596f52b133887bd3cc9061674 Mon Sep 17 00:00:00 2001 From: venu-sambarapu-DS Date: Thu, 13 Jun 2024 15:24:51 +0530 Subject: [PATCH 5/5] Removed unnecessary comments and print statements; also added code to pick gsheet params from env --- app/api/api_v1/routers/dataset.py | 7 --- app/api/api_v1/routers/dictionary.py | 10 ++- app/core/config.py | 45 ++----------- app/main.py | 1 - app/models/date_strftime_pattern.py | 63 ------------------- .../expect_column_values_to_be_in_set.py | 36 ----------- app/models/regex_list_pattern.py | 53 ---------------- app/models/regex_pattern.py | 52 --------------- app/utils/column_mapping.py | 11 +--- app/utils/common.py | 8 --- app/utils/general.py | 4 -- app/utils/metadata.py | 42 +------------ app/utils/minio_transfer.py | 5 -- app/utils/tags.py | 2 - 14 files changed, 16 insertions(+), 323 deletions(-) diff --git a/app/api/api_v1/routers/dataset.py b/app/api/api_v1/routers/dataset.py index 486eea3..70e5899 100644 --- a/app/api/api_v1/routers/dataset.py +++ b/app/api/api_v1/routers/dataset.py @@ -16,14 +16,7 @@ from fastapi.templating import Jinja2Templates from app.core.config import Settings - -# from app.models.date_strftime_pattern import DateStrftimePattern from app.models.enums import ExpectationResultFormat, ExpectationResultType - -# from app.models.expect_column_values_to_be_in_set import ColumnValuesToBeInSet -# from app.models.general import GeneralTableExpectation -# from app.models.regex_list_pattern import RegexMatchList -# from app.models.regex_pattern import RegexPatternExpectation from app.utils.dataset import ( datasets_expectation, datasets_expectation_from_url, diff --git a/app/api/api_v1/routers/dictionary.py b/app/api/api_v1/routers/dictionary.py index 85e876d..e1789b6 100644 --- a/app/api/api_v1/routers/dictionary.py +++ b/app/api/api_v1/routers/dictionary.py @@ -14,11 +14,16 @@
dictionary_router = router = APIRouter() +# reading sheet name from env +google_spread_sheet_sheet_name = settings.GOOGLE_SPREAD_SHEET_SHEET_NAME +google_sheet_id = settings.GOOGLE_SHEET_ID g_sheet_session = requests.Session() common_g_sheet_link_format = "https://docs.google.com/spreadsheets/d/" -g_sheet_id = "1NEsFJGr5IHsrIakGgeNFUvz5zpLOadh_vDH7Apqmv9E" -download_sheet_name = "/gviz/tq?tqx=out:csv&sheet=master_dictionaries" +g_sheet_id = f"{google_sheet_id}" +download_sheet_name = ( + f"/gviz/tq?tqx=out:csv&sheet={google_spread_sheet_sheet_name}" +) url_name = common_g_sheet_link_format + g_sheet_id + download_sheet_name g_sheet_response = g_sheet_session.get(url_name) g_sheet_bytes_data = g_sheet_response.content @@ -59,7 +64,6 @@ async def get_entity_data(entity: str): json_compatible_item_data = jsonable_encoder( entity_df.to_dict(orient="records") ) - print(json_compatible_item_data) return JSONResponse(content=json_compatible_item_data) diff --git a/app/core/config.py b/app/core/config.py index 5a66f3f..c851b3e 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -63,6 +63,12 @@ class Settings(BaseSettings): # Metadata File Parameters METADATA_COLUMN_ORDER_STRING = "" + # Google spread-sheet sheet name + GOOGLE_SPREAD_SHEET_SHEET_NAME: str = "" + + # Google sheet id + GOOGLE_SHEET_ID: str = "" + class Config: env_file = ".env" @@ -496,26 +502,6 @@ class MetadataSettings(BaseSettings): ], } - # SHORT_FORM_EXPECTATION = { - # "data_asset_type": None, - # "expectation_suite_name": "short_form_expectation_suite", - # "expectations": [ - # { - # "expectation_type": "expect_column_values_to_be_in_set", - # "kwargs": { - # "column": "short_form", - # "value_set": [], - # "result_format": "SUMMARY", - # }, - # "meta": { - # "expectation_name": "Short Form in set of values", - # "cleaning_pdf_link": "https://wp.me/ad1WQ9-dvg", - # "expectation_error_message": "Short Form should be from the Data Dictionary", - # }, - # } - # ], - # } - 
FREQUENCY_OF_UPDATE_EXPECTATION = { "data_asset_type": None, "expectation_suite_name": "frequency_of_update_expectation_suite", @@ -589,25 +575,6 @@ class MetadataSettings(BaseSettings): TIME_SAVED_IN_HOURS_MSG: str = ( "Null values should not present in these columns" ) - # TIME_SAVED_IN_HOURS_EXPECTATION = { - # "data_asset_type": None, - # "expectation_suite_name": "time_saved_in_hours_expectation_suite", - # "expectations": [ - # { - # "expectation_type": "expect_column_values_to_be_in_set", - # "kwargs": { - # "column": "time_saved_in_hours", - # "value_set": [], - # "result_format": "SUMMARY", - # }, - # "meta": { - # "expectation_name": "Time Saved In Hours", - # "cleaning_pdf_link": "https://wp.me/ad1WQ9-dvg", - # "expectation_error_message": "Time Saved in Hours should be from the range of 2 to 6 hours", - # }, - # } - # ], - # } class TagsSettings(BaseSettings): diff --git a/app/main.py b/app/main.py index 335e4a5..ff4ea73 100644 --- a/app/main.py +++ b/app/main.py @@ -28,7 +28,6 @@ @app.get(settings.API_V1_STR) async def home(request: Request): return templates.TemplateResponse("base.html", {"request": request}) - # return {"message": "Server is up"} app.include_router(dataset_router, prefix="", tags=["Compare Datasets"]) diff --git a/app/models/date_strftime_pattern.py b/app/models/date_strftime_pattern.py index b1d6932..4c2839c 100644 --- a/app/models/date_strftime_pattern.py +++ b/app/models/date_strftime_pattern.py @@ -4,69 +4,6 @@ from pydantic import BaseModel -# class _Kwargs(BaseModel): -# column: str -# strftime_format: str -# result_format: str - -# class Config: -# underscore_attrs_are_private = True - -# class ExpectationConfig(BaseModel): -# _expectation_type: str -# _kwargs: _Kwargs -# _raw_kwargs: Any -# meta: Dict[str, Any] -# success_on_last_run: Any -# _ge_cloud_id: Any -# _expectation_context: Any - -# class Config: -# underscore_attrs_are_private = True - -# class PartialUnexpectedCount(BaseModel): -# value: str -# count: int - -# 
class Config: -# underscore_attrs_are_private = True - - -# class Result(BaseModel): -# element_count: Optional[int] -# missing_count: Optional[int] -# missing_percent: Optional[int] -# unexpected_count: Optional[int] -# unexpected_percent: Optional[int] -# unexpected_percent_total: Optional[int] -# unexpected_percent_nonmissing: Optional[int] -# partial_unexpected_list: Optional[List[str]] -# partial_unexpected_index_list: Optional[List[int]] -# partial_unexpected_counts: Optional[List[PartialUnexpectedCount]] -# unexpected_list: Optional[List[str]] -# unexpected_index_list: Optional[List[int]] - - -# class ExceptionInfo(BaseModel): -# raised_exception: Optional[bool] -# exception_message: Optional[Any] -# exception_traceback: Optional[Any] - -# class Config: -# underscore_attrs_are_private = True - - -# class DateStrftimePattern(BaseModel): -# success: bool -# expectation_config: Optional[ExpectationConfig] -# result: Optional[Result] -# _meta: Optional[Dict[str, Any]] -# exception_info: Optional[ExceptionInfo] - -# class Config: -# underscore_attrs_are_private = True -# # response_model_exclude_unset = True - class _Kwargs(BaseModel): column: str diff --git a/app/models/expect_column_values_to_be_in_set.py b/app/models/expect_column_values_to_be_in_set.py index 183e251..c2801e4 100644 --- a/app/models/expect_column_values_to_be_in_set.py +++ b/app/models/expect_column_values_to_be_in_set.py @@ -4,42 +4,6 @@ from pydantic import BaseModel -# class PartialUnexpectedCount(BaseModel): -# value: str -# count: int - - -# class Result(BaseModel): -# element_count: Optional[int] -# missing_count: Optional[int] -# missing_percent: Optional[int] -# unexpected_count: Optional[int] -# unexpected_percent: Optional[int] -# unexpected_percent_total: Optional[int] -# unexpected_percent_nonmissing: Optional[int] -# partial_unexpected_list: Optional[List[str]] -# partial_unexpected_index_list: Optional[List[int]] -# partial_unexpected_counts: 
Optional[List[PartialUnexpectedCount]] -# unexpected_list: Optional[List[str]] -# unexpected_index_list: Optional[List[int]] - - -# class ExceptionInfo(BaseModel): -# raised_exception: bool -# exception_message: Optional[Any] -# exception_traceback: Optional[Any] - - -# class ColumnValuesToBeInSet(BaseModel): -# success: bool -# _expectation_config: Optional[Any] -# result: Optional[Result] -# meta: Optional[Dict[str, Any]] -# _exception_info: Optional[ExceptionInfo] - -# class Config: -# underscore_attrs_are_private = True - class _Kwargs(BaseModel): column: str diff --git a/app/models/regex_list_pattern.py b/app/models/regex_list_pattern.py index 950c20d..8941fee 100644 --- a/app/models/regex_list_pattern.py +++ b/app/models/regex_list_pattern.py @@ -4,59 +4,6 @@ from pydantic import BaseModel -# class _Kwargs(BaseModel): -# column: str -# regex_list: List[str] -# match_on: str -# result_format: str - - -# class ExpectationConfig(BaseModel): -# _expectation_type: str -# _kwargs: _Kwargs -# _raw_kwargs: Any -# meta: Dict[str, Any] -# success_on_last_run: Any -# _ge_cloud_id: Any -# _expectation_context: Any - - -# class PartialUnexpectedCount(BaseModel): -# value: str -# count: int - - -# class Result(BaseModel): -# element_count: Optional[int] -# missing_count: Optional[int] -# missing_percent: Optional[int] -# unexpected_count: Optional[int] -# unexpected_percent: Optional[int] -# unexpected_percent_total: Optional[int] -# unexpected_percent_nonmissing: Optional[int] -# partial_unexpected_list: Optional[List[str]] -# partial_unexpected_index_list: Optional[List[int]] -# partial_unexpected_counts: Optional[List[PartialUnexpectedCount]] -# unexpected_list: Optional[List[str]] -# unexpected_index_list: Optional[List[int]] - - -# class ExceptionInfo(BaseModel): -# raised_exception: Optional[bool] -# exception_message: Optional[Any] -# exception_traceback: Optional[Any] - - -# class RegexMatchList(BaseModel): -# success: Optional[bool] -# expectation_config: 
Optional[ExpectationConfig] -# result: Optional[Result] -# meta: Optional[Dict[str, Any]] -# exception_info: Optional[ExceptionInfo] - -# class Config: -# underscore_attrs_are_private = True - class _Kwargs(BaseModel): column: str diff --git a/app/models/regex_pattern.py b/app/models/regex_pattern.py index 949ae5c..466ec00 100644 --- a/app/models/regex_pattern.py +++ b/app/models/regex_pattern.py @@ -4,58 +4,6 @@ from pydantic import BaseModel -# class _Kwargs(BaseModel): -# column: str -# regex: str -# result_format: str - - -# class ExpectationConfig(BaseModel): -# _expectation_type: str -# _kwargs: _Kwargs -# _raw_kwargs: Any -# meta: Dict[str, Any] -# success_on_last_run: Any -# _ge_cloud_id: Any -# _expectation_context: Any - - -# class PartialUnexpectedCount(BaseModel): -# value: str -# count: int - - -# class Result(BaseModel): -# element_count: Optional[int] -# missing_count: Optional[int] -# missing_percent: Optional[int] -# unexpected_count: Optional[int] -# unexpected_percent: Optional[int] -# unexpected_percent_total: Optional[int] -# unexpected_percent_nonmissing: Optional[int] -# partial_unexpected_list: Optional[List[str]] -# partial_unexpected_index_list: Optional[List[int]] -# partial_unexpected_counts: Optional[List[PartialUnexpectedCount]] -# unexpected_list: Optional[List[str]] -# unexpected_index_list: Optional[List[int]] - - -# class ExceptionInfo(BaseModel): -# raised_exception: Optional[bool] -# exception_message: Optional[Any] -# exception_traceback: Optional[Any] - - -# class RegexPatternExpectation(BaseModel): -# success: bool -# expectation_config: Optional[ExpectationConfig] -# result: Optional[Result] -# meta: Optional[Dict[str, Any]] -# exception_info: Optional[ExceptionInfo] - -# class Config: -# underscore_attrs_are_private = True - class _Kwargs(BaseModel): column: str diff --git a/app/utils/column_mapping.py b/app/utils/column_mapping.py index cedf571..addc80e 100644 --- a/app/utils/column_mapping.py +++ 
b/app/utils/column_mapping.py @@ -170,9 +170,7 @@ async def find_metadata_columns(columns: set): organization_pattern = re.compile( r".*({}).*".format(metadata_settings.ORGANIZATION_KEYWORD) ) - # short_form_pattern = re.compile( - # r".*({}).*".format(metadata_settings.SHORT_FORM_KEYWORD) - # ) + description_pattern = re.compile( r".*({}).*".format(metadata_settings.DESCRIPTION_KEYWORD) ) @@ -217,9 +215,7 @@ async def find_metadata_columns(columns: set): organization_column, columns = extract_pattern_from_columns( columns, organization_pattern ) - # short_form_column, columns = extract_pattern_from_columns( - # columns, short_form_pattern - # ) + description_column, columns = extract_pattern_from_columns( columns, description_pattern ) @@ -261,7 +257,6 @@ async def find_metadata_columns(columns: set): return { "sector": list(sector_column), "organization": list(organization_column), - # "short_form": list(short_form_column), "description": list(description_column), "tags": list(tags_column), "temporal_coverage": list(temporal_coverage_column), @@ -271,7 +266,6 @@ async def find_metadata_columns(columns: set): "file_path": list(file_path_column), "frequency_of_update": list(frequency_of_update_column), "source_link": list(source_link_column), - # "archive": list(archive_column), "spacial_coverage": list(spacial_coverage_column), "variable_measured": list(variable_measured_column), "data_next_update": list(data_next_update_column), @@ -305,5 +299,4 @@ async def find_mapped_columns(columns): list(chain.from_iterable(mapped_columns.values())) ) ) - print({**mapped_columns, "unmapped": not_mapped_columns}) return {**mapped_columns, "unmapped": not_mapped_columns} diff --git a/app/utils/common.py b/app/utils/common.py index 90a5f25..2c958ae 100644 --- a/app/utils/common.py +++ b/app/utils/common.py @@ -79,14 +79,6 @@ async def read_pandas_dataset(source: str, **kwargs): return dataset -# async def load_values_to_be_in_set(domain: str): -# # this function is used to load 
csv files, consisting values -# # for states or country that are required to be in specific set -# set_values_file = APP_DIR / "core" / f"{domain}.csv" -# set_values = pd.read_csv(set_values_file)[f"{domain}"].unique() -# return set_values - - async def modify_column_names_to_expectation_suite( expectation_suite: dict, expectation_config: dict ): diff --git a/app/utils/general.py b/app/utils/general.py index 9918c3d..8fb429e 100644 --- a/app/utils/general.py +++ b/app/utils/general.py @@ -157,9 +157,6 @@ async def null_not_in_columns(dataset, result_format, column, column_type): catch_exceptions=True, result_format=result_format, ) - # expectation = ge_pandas_dataset.expect_column_values_to_not_be_null( - # column=column, result_format=result_format, catch_exceptions=True - # ) expectation_dict = expectation.to_json_dict() expectation_dict["expectation_config"]["meta"] = { @@ -384,7 +381,6 @@ async def general_table_expectation_suite(dataset, result_format): multispaces_between_text_expectation_suite(dataset, result_format), bracket_values_expectation_suite(dataset, result_format), special_character_expectation_suite(dataset, result_format), - # null_not_in_columns(dataset, result_format, "price"), *[ null_not_in_columns(dataset, result_format, col, "numeric") for col in numeric_columns diff --git a/app/utils/metadata.py b/app/utils/metadata.py index ad7a12b..eeea191 100644 --- a/app/utils/metadata.py +++ b/app/utils/metadata.py @@ -7,7 +7,7 @@ from app.api.api_v1.routers.dictionary import standard_data_values from app.core.config import MetadataSettings, Settings from app.utils.column_mapping import find_metadata_columns -from app.utils.common import ( # modify_column_order_expectation_suite, +from app.utils.common import ( modify_values_length_to_be_between, modify_values_to_be_in_set, modify_values_to_match_regex_list, @@ -275,7 +275,6 @@ async def organization_expectation_suite(dataset, result_format): + 
validation["results"][0]["expectation_config"]["_kwargs"]["column"] ) results[validation_ui_name] = validation - # print(jsonable_encoder(results)) return jsonable_encoder(results) @@ -301,43 +300,6 @@ async def modify_short_form_expectation_suite( return changed_expectation_suite -# async def short_form_expectation_suite(dataset, result_format): -# """Expectation to check if Short Form values are in short_form.csv - -# Expectation is on whether every value present in short form column of metadata -# csv is in short_form.csv file or not - -# Args: -# dataset (Dataframe): Read metadata csv using Pandas Dataframe -# result_format (str): SUMMARY - -# Returns: -# Dict: Dictionary of Expectations -# """ -# results = {} -# mapped_columns = await find_metadata_columns(set(dataset.columns)) -# short_form_column = mapped_columns["short_form"][0] - -# expectation_suite = await modify_short_form_expectation_suite( -# short_form_column, result_format -# ) -# # convert pandas dataset to great_expectations dataset -# ge_pandas_dataset = ge.from_pandas( -# dataset, expectation_suite=expectation_suite -# ) -# validation = ge_pandas_dataset.validate() -# validation_ui_name = ( -# validation["results"][0]["expectation_config"]["meta"][ -# "expectation_name" -# ] -# + " - " -# + validation["results"][0]["expectation_config"]["_kwargs"]["column"] -# ) -# results[validation_ui_name] = validation - -# return jsonable_encoder(results) - - async def modify_frequency_of_update_expectation_suite( column_name: str, result_format: str ): @@ -570,7 +532,6 @@ async def metadata_expectation_suite( """ if isinstance(dataset, str): dataset = await read_dataset(dataset) - # print(dir(dataset)) # Dataset modification for sector expectation suite dataset_sector = dataset.copy() # explode the dataset based on sector column @@ -584,7 +545,6 @@ async def metadata_expectation_suite( check_column_order(dataset), sector_expectation_suite(dataset_sector, result_format), 
organization_expectation_suite(dataset, result_format), - # short_form_expectation_suite(dataset, result_format), description_expectation_suite(dataset, result_format), dataset_name_for_factly_expectation_suite(dataset, result_format), unit_expectation_suite(dataset, result_format), diff --git a/app/utils/minio_transfer.py b/app/utils/minio_transfer.py index 34849a4..d4a7633 100644 --- a/app/utils/minio_transfer.py +++ b/app/utils/minio_transfer.py @@ -103,8 +103,3 @@ async def get_files_inside_folder(folder_name: str): raise Exception(f"Could not get files inside folder: {e}") else: return file_keys - - -# async def save_expectation_to_minio_folder(expectation, s3_folder: str): - -# pass diff --git a/app/utils/tags.py b/app/utils/tags.py index 62cf155..47e39f8 100644 --- a/app/utils/tags.py +++ b/app/utils/tags.py @@ -4,8 +4,6 @@ from fastapi.encoders import jsonable_encoder from app.core.config import TagsSettings - -# from app.utils.column_mapping import find_tags_columns from app.utils.column_mapping import find_metadata_columns from app.utils.common import modify_values_to_match_regex_list