From a01eda7931379e63f85a505fa2d41d932f93227c Mon Sep 17 00:00:00 2001 From: nskazmi <142994595+nskazmi@users.noreply.github.com> Date: Thu, 1 Feb 2024 11:37:07 +0100 Subject: [PATCH 01/13] conformity to python style guide --- sweep.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/sweep.py b/sweep.py index 9bd461e..45dd388 100644 --- a/sweep.py +++ b/sweep.py @@ -2,11 +2,9 @@ from pathlib import Path import argparse import wandb -import warnings -warnings.filterwarnings("ignore") -def train(): +def train() -> None: """ This function initializes a run with Weights & Biases (wandb), updates the configuration with the model configuration, constructs a run name based on the sweep parameters, retrains the model with transformed datasets, evaluates the model, @@ -28,7 +26,6 @@ def train(): for para in model_paras: run_name += f'_{para}_{run.config[para]}' wandb.run.name = run_name - # print(run_name) wandb.config.update(common_config, allow_val_change=True) wandb.config.update(model_config, allow_val_change=True) @@ -39,10 +36,15 @@ def train(): if __name__ == '__main__': + # this is the main function that will be called when running the script from the command line with arguments + parser = argparse.ArgumentParser(description='Method for sweeping on W&B') - parser.add_argument('-l', metavar='level', type=str, required=True, choices=['cm', 'pgm']) - parser.add_argument('-c', metavar='config', type=str, required=True, help='Path to the configuration directory') - parser.add_argument('-m', metavar='modelname', help='Name of the model to implement') + parser.add_argument('-l', metavar='level', type=str, + required=True, choices=['cm', 'pgm']) + parser.add_argument('-c', metavar='config', type=str, + required=True, help='Path to the configuration directory') + parser.add_argument('-m', metavar='modelname', + help='Name of the model to implement') args = parser.parse_args() level = args.l @@ -53,9 +55,11 @@ def train(): Datasets_transformed = {} para_transformed = {} qslist, Datasets = i_fetch_data(level) - Datasets_transformed, para_transformed = transform_data(Datasets, transforms, level=level, by_group=True) + Datasets_transformed, para_transformed = transform_data( + Datasets, transforms, level=level, by_group=True) - common_config_path, wandb_config_path, model_config_path, sweep_config_path = get_config_path(config_path) + common_config_path, wandb_config_path, model_config_path, sweep_config_path = get_config_path( + config_path) common_config = get_config_from_path(common_config_path, 'common') wandb_config = get_config_from_path(wandb_config_path, 'wandb') @@ -68,14 +72,16 @@ def train(): model_file = model_config_path / sweep_file.name if not model_file.is_file(): - raise FileNotFoundError(f'The corresponding model configuration file {model_file} does not exist.') + raise FileNotFoundError( + f'The corresponding model configuration file {model_file} does not exist.') sweep_config = get_config_from_path(sweep_file, 'sweep') model_config = get_config_from_path(model_file, 'model') - + model = sweep_file.stem.split('_')[-1] - sweep_id = wandb.sweep(sweep_config, project=wandb_config['project'], entity=wandb_config['entity']) + sweep_id = wandb.sweep( + sweep_config, project=wandb_config['project'], entity=wandb_config['entity']) wandb.agent(sweep_id, function=train) print(f'Finish sweeping over model {sweep_file.stem}') - print('**************************************************************') \ No newline at end of file + 
print('**************************************************************') From 92a1d1ffbfee052c1216eb7bf3119f05dbf901ae Mon Sep 17 00:00:00 2001 From: nskazmi <142994595+nskazmi@users.noreply.github.com> Date: Thu, 1 Feb 2024 13:22:00 +0100 Subject: [PATCH 02/13] more docstrings, removed commented code and added return types --- ViewsEstimators.py | 84 +++++++++++++++++++++++----------------------- sweep.py | 2 +- 2 files changed, 43 insertions(+), 43 deletions(-) diff --git a/ViewsEstimators.py b/ViewsEstimators.py index 6cfc66d..e285b76 100644 --- a/ViewsEstimators.py +++ b/ViewsEstimators.py @@ -16,8 +16,6 @@ from xgboost import XGBRFRegressor, XGBRFClassifier from lightgbm import LGBMClassifier, LGBMRegressor -#from lightgbm import LGBMClassifier, LGBMRegressor - class HurdleRegression(BaseEstimator): """ Regression model which handles excessive zeros by fitting a two-part model and combining predictions: @@ -35,7 +33,7 @@ def __init__(self, clf_name: str = 'logistic', reg_name: str = 'linear', clf_params: Optional[dict] = None, - reg_params: Optional[dict] = None): + reg_params: Optional[dict] = None) -> None: self.clf_name = clf_name self.reg_name = reg_name @@ -44,31 +42,29 @@ def __init__(self, self.clf_fi = [] self.reg_fi = [] - @staticmethod - def _resolve_estimator(func_name: str): - """ Lookup table for supported estimators. - This is necessary because sklearn estimator default arguments - must pass equality test, and instantiated sub-estimators are not equal. """ + def fit(self, + X: Union[np.ndarray, pd.DataFrame], + y: Union[np.ndarray, pd.Series]) -> BaseEstimator: + """ + Fit the model using the provided features and target. - funcs = {'linear': LinearRegression(), - 'logistic': LogisticRegression(solver='liblinear'), - 'LGBMRegressor': LGBMRegressor(n_estimators=250), - 'LGBMClassifier': LGBMClassifier(n_estimators=250), - 'RFRegressor': XGBRFRegressor(n_estimators=250,n_jobs=-2), - 'RFClassifier': XGBRFClassifier(n_estimators=250,n_jobs=-2), - 'GBMRegressor': GradientBoostingRegressor(n_estimators=200), - 'GBMClassifier': GradientBoostingClassifier(n_estimators=200), - 'XGBRegressor': XGBRegressor(n_estimators=100,learning_rate=0.05,n_jobs=-2), - 'XGBClassifier': XGBClassifier(n_estimators=100,learning_rate=0.05,n_jobs=-2), - 'HGBRegressor': HistGradientBoostingRegressor(max_iter=200), - 'HGBClassifier': HistGradientBoostingClassifier(max_iter=200), - } + Parameters: + - X (Union[np.ndarray, pd.DataFrame]): Features for training the model. + - y (Union[np.ndarray, pd.Series]): Target variable. - return funcs[func_name] + Raises: + - ValueError: If the number of features in X is less than 2. - def fit(self, - X: Union[np.ndarray, pd.DataFrame], - y: Union[np.ndarray, pd.Series]): + Returns: + - self: The fitted model. + + The `fit` method trains both a classification (`clf_`) and regression (`reg_`) model. + The classification model is used to predict whether y is greater than 0, while the regression + model is trained on instances where y is greater than 0. Feature importances for both models + are stored in `clf_fi` and `reg_fi`, respectively. + + Note: The provided features and target are checked for validity, and the fitted status is updated. 
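+
+        Example (a minimal usage sketch, assuming synthetic data; any estimator
+        names from the lookup table in `_resolve_estimator` could be substituted):
+            >>> from sklearn.datasets import make_regression
+            >>> X, y = make_regression(n_samples=500, n_features=5, random_state=0)
+            >>> y = np.where(y > 0, y, 0)  # introduce the excess zeros the model expects
+            >>> hr = HurdleRegression(clf_name='LGBMClassifier', reg_name='LGBMRegressor')
+            >>> _ = hr.fit(X, y)
+            >>> preds = hr.predict(X)  # P(y > 0) * predicted positive outcome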
+ """ X, y = check_X_y(X, y, dtype=None, accept_sparse=False, accept_large_sparse=False, @@ -92,32 +88,36 @@ def fit(self, self.is_fitted_ = True return self - -# def predict(self, X: Union[np.ndarray, pd.DataFrame]): - def predict_bck(self, X: Union[np.ndarray, pd.DataFrame]): + def predict_bck(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray: """ Predict combined response using binary classification outcome """ X = check_array(X, accept_sparse=False, accept_large_sparse=False) check_is_fitted(self, 'is_fitted_') return self.clf_.predict(X) * self.reg_.predict(X) - def predict(self, X: Union[np.ndarray, pd.DataFrame]): -# def predict_expected_value(self, X: Union[np.ndarray, pd.DataFrame]): + def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray: """ Predict combined response using probabilistic classification outcome """ X = check_array(X, accept_sparse=False, accept_large_sparse=False) check_is_fitted(self, 'is_fitted_') return self.clf_.predict_proba(X)[:, 1] * self.reg_.predict(X) -def manual_test(): - """ Validate estimator using sklearn's provided utility and ensure it can fit and predict on fake dataset. """ - check_estimator(HurdleRegression) - from sklearn.datasets import make_regression - X, y = make_regression() - reg = HurdleRegression() - reg.fit(X, y) - reg.predict(X) - - + @staticmethod + def _resolve_estimator(func_name: str) -> BaseEstimator: + """ Lookup table for supported estimators. + This is necessary because sklearn estimator default arguments + must pass equality test, and instantiated sub-estimators are not equal. """ + funcs = {'linear': LinearRegression(), + 'logistic': LogisticRegression(solver='liblinear'), + 'LGBMRegressor': LGBMRegressor(n_estimators=250), + 'LGBMClassifier': LGBMClassifier(n_estimators=250), + 'RFRegressor': XGBRFRegressor(n_estimators=250, n_jobs=-2), + 'RFClassifier': XGBRFClassifier(n_estimators=250, n_jobs=-2), + 'GBMRegressor': GradientBoostingRegressor(n_estimators=200), + 'GBMClassifier': GradientBoostingClassifier(n_estimators=200), + 'XGBRegressor': XGBRegressor(n_estimators=100, learning_rate=0.05, n_jobs=-2), + 'XGBClassifier': XGBClassifier(n_estimators=100, learning_rate=0.05, n_jobs=-2), + 'HGBRegressor': HistGradientBoostingRegressor(max_iter=200), + 'HGBClassifier': HistGradientBoostingClassifier(max_iter=200), + } -#if __name__ == '__main__': -# manual_test() \ No newline at end of file + return funcs[func_name] diff --git a/sweep.py b/sweep.py index 45dd388..2fc5d46 100644 --- a/sweep.py +++ b/sweep.py @@ -36,7 +36,7 @@ def train() -> None: if __name__ == '__main__': - # this is the main function that will be called when running the script from the command line with arguments + # this is the main block of code that will be called when running the script from the command line with arguments parser = argparse.ArgumentParser(description='Method for sweeping on W&B') parser.add_argument('-l', metavar='level', type=str, From cf1ba5eada51de7fea55f64d76d858f1f9db07e2 Mon Sep 17 00:00:00 2001 From: nskazmi <142994595+nskazmi@users.noreply.github.com> Date: Thu, 1 Feb 2024 14:11:45 +0100 Subject: [PATCH 03/13] added docstring and return type --- dataloader/cm_querysets.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/dataloader/cm_querysets.py b/dataloader/cm_querysets.py index 749d6c9..e4b96c1 100644 --- a/dataloader/cm_querysets.py +++ b/dataloader/cm_querysets.py @@ -24,7 +24,18 @@ 
####################################################################################################################### -def get_cm_querysets(): +def get_cm_querysets() -> list: + """ + This function generates a list of Queryset objects for the 'country_month' table. + + Each Queryset represents a specific query to be executed. The Queryset is defined with a name and a table. + It includes several Columns, each representing a column in the database. + The Column is defined with a name, a source table, and a source column. + Each Column can have transformations applied to it, such as filling missing values. + + Returns: + list: A list of Queryset objects. + """ qs_baseline = (Queryset("fatalities003_baseline", "country_month") @@ -3878,7 +3889,7 @@ def get_cm_querysets(): qs_conflict_stub.operations[0:] qs_conflict.publish() - + ################################################################################################################### # Conflict history model and baseline, nonlog formulation @@ -3963,8 +3974,6 @@ def get_cm_querysets(): qs_joint_broad_stub.operations[0:] qs_joint_broad.publish() - - ################################################################################################################### # joint broad model and baseline @@ -3982,7 +3991,6 @@ def get_cm_querysets(): qs_joint_broad_stub.operations[0:] qs_joint_broad_nonlog.publish() - ################################################################################################################### # faostat model and baseline From 72b408372eb95f4e82ab1fcde27e78bc4c701a13 Mon Sep 17 00:00:00 2001 From: nskazmi <142994595+nskazmi@users.noreply.github.com> Date: Thu, 1 Feb 2024 14:24:54 +0100 Subject: [PATCH 04/13] added return types --- util/new_metrics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/util/new_metrics.py b/util/new_metrics.py index c5b5f56..5f17310 100644 --- a/util/new_metrics.py +++ b/util/new_metrics.py @@ -2,7 +2,7 @@ import matplotlib.pyplot as plt -def tweedie_loss(p, q, pow=1.5, eps=np.exp(-100)): +def tweedie_loss(p, q, pow=1.5, eps=np.exp(-100)) -> float: """ The Tweedie loss function is defined as: $L(p, q) = -p q^{1-pow} / (1-pow) + q^{2-pow} / (2-pow)$. I is used to evaluate the performance of a model that predicts the mean of a Tweedie distribution. @@ -19,7 +19,7 @@ def tweedie_loss(p, q, pow=1.5, eps=np.exp(-100)): return np.mean(loss) -def kl_divergence(p, q, eps=np.exp(-100)): +def kl_divergence(p, q, eps=np.exp(-100)) -> float: """ The KL divergence between two discrete distributions p and q is defined as: $\sum_i p_i \log(p_i / q_i)$. It describes the difference between two distributions in terms of information lost when q is used to approximate p. @@ -32,7 +32,7 @@ def kl_divergence(p, q, eps=np.exp(-100)): return np.sum(p * np.log(p / q)) -def jeffreys_divergence(p, q, eps=np.exp(-100)): +def jeffreys_divergence(p, q, eps=np.exp(-100)) -> float: """ Jeffreys divergence is a symmetrized version of KL divergence. See https://en.wikipedia.org/wiki/Hellinger_distance The parameter $eps$ is used to avoid numerical issues when $pow < 1$. @@ -44,7 +44,7 @@ def jeffreys_divergence(p, q, eps=np.exp(-100)): return 0.5 * np.sum((p - q) * np.log(p / q)) -def jenson_shannon_divergence(p, q, eps=np.exp(-100)): +def jenson_shannon_divergence(p, q, eps=np.exp(-100)) -> float: """ Jenson-Shannon divergence is also a symmetrized version of KL divergence. 
See https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence JSD = 0.5 * KL(p, m) + 0.5 * KL(q, m) From 04f7d2732a499caba0cb991f7c94f2be5b2be1ac Mon Sep 17 00:00:00 2001 From: nskazmi <142994595+nskazmi@users.noreply.github.com> Date: Thu, 1 Feb 2024 14:32:18 +0100 Subject: [PATCH 05/13] removed unused imports --- dataloader/cm_querysets.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/dataloader/cm_querysets.py b/dataloader/cm_querysets.py index e4b96c1..68b9feb 100644 --- a/dataloader/cm_querysets.py +++ b/dataloader/cm_querysets.py @@ -3,23 +3,8 @@ # ## cm level # -# ## Importing modules - -# Basics -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -import matplotlib.cbook as cbook -# sklearn -from sklearn.ensemble import RandomForestRegressor -from sklearn import linear_model -# Views 3 -from viewser.operations import fetch from viewser import Queryset, Column -import views_runs -from views_partitioning import data_partitioner, legacy -from stepshift import views -# import views_dataviz + ####################################################################################################################### From 66ebaa1e124f2655e3bd9427e5aee850941fc9fb Mon Sep 17 00:00:00 2001 From: nskazmi <142994595+nskazmi@users.noreply.github.com> Date: Thu, 1 Feb 2024 14:32:49 +0100 Subject: [PATCH 06/13] removed unused function report() and added docstrings --- dataloader/pgm_querysets.py | 47 ++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/dataloader/pgm_querysets.py b/dataloader/pgm_querysets.py index 69ebd62..2f6d48c 100644 --- a/dataloader/pgm_querysets.py +++ b/dataloader/pgm_querysets.py @@ -1,29 +1,22 @@ # # Specifying querysets for use in Predicting Fatalities project # ## pgm level -# -# -# ## Importing modules - -# Basics -import numpy as np from viewser import Queryset, Column -def report(df): - print() - print(f"A dataset with {len(df.columns)} columns, with " - f"data between t {min(df.index.get_level_values(0))} " - f"and {max(df.index.get_level_values(0))}. " - f"({len(np.unique(df.index.get_level_values(1)))} units)" - ) - return - -# GED, baseline, ln versions of predictors - +def get_pgm_querysets() -> list: + """ + This function generates a Queryset object for the 'priogrid_month' table. -def get_pgm_querysets(): + The Queryset represents a specific query to be executed. It is defined with a name and a table. + It includes several Columns, each representing a column in the database. + The Column is defined with a name, a source table, and a source column. + Each Column can have transformations applied to it, such as replacing missing values, applying a natural logarithm, + applying a boolean condition, calculating the time since an event, applying a decay function, etc. + Returns: + Queryset: A Queryset object. 
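+
+    Example (a minimal usage sketch, assuming a configured viewser connection;
+    it follows the publish/fetch pattern used elsewhere in this project):
+        >>> qslist = get_pgm_querysets()
+        >>> df_baseline = qslist[0].publish().fetch()  # qs_baseline is the first entry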
+ """ qs_baseline = (Queryset("fatalities003_pgm_baseline", "priogrid_month") # target variable @@ -1181,14 +1174,14 @@ def get_pgm_querysets(): # report(data) qslist = [ - qs_baseline, - qs_conflict_long, - qs_escwa_drought, - qs_natsoc, - qs_broad, - qs_conf_history, - qs_treelag, - qs_sptime_dist - ] + qs_baseline, + qs_conflict_long, + qs_escwa_drought, + qs_natsoc, + qs_broad, + qs_conf_history, + qs_treelag, + qs_sptime_dist + ] return qslist From 9238cc0372e1939b95768fff693c5aa49d6f87b3 Mon Sep 17 00:00:00 2001 From: nskazmi <142994595+nskazmi@users.noreply.github.com> Date: Thu, 1 Feb 2024 14:38:48 +0100 Subject: [PATCH 07/13] added docstrings, return types --- dataloader/ModelDefinitions.py | 93 +++++++++++++++------------------- 1 file changed, 41 insertions(+), 52 deletions(-) diff --git a/dataloader/ModelDefinitions.py b/dataloader/ModelDefinitions.py index 5a651c2..3644317 100644 --- a/dataloader/ModelDefinitions.py +++ b/dataloader/ModelDefinitions.py @@ -1,11 +1,5 @@ -# The ModelList is a list of dictionaries that define a range of models for the project +from dataloader.ViewsEstimators import * -# sys.path.append('../') -# sys.path.append('../Tools') -#sys.path.append('../Intermediates') -# sklearn - -from ViewsEstimators import * class FixedFirstSplitRegression(BaseEstimator): """ Regression model which makes the first split according to a specified feature and then splits according to other @@ -25,7 +19,7 @@ def __init__(self, zeros_name: str = 'RFRegressor', ones_indicator: str = '', ones_params: Optional[dict] = None, - zeros_params: Optional[dict] = None): + zeros_params: Optional[dict] = None) -> None: self.ones_name = ones_name self.zeros_name = zeros_name @@ -36,7 +30,7 @@ def __init__(self, self.zeros_fi = [] @staticmethod - def _resolve_estimator(func_name: str): + def _resolve_estimator(func_name: str) -> BaseEstimator: """ Lookup table for supported estimators. This is necessary because sklearn estimator default arguments must pass equality test, and instantiated sub-estimators are not equal. 
""" @@ -45,22 +39,22 @@ def _resolve_estimator(func_name: str): 'logistic': LogisticRegression(solver='liblinear'), 'LGBMRegressor': LGBMRegressor(n_estimators=250), 'LGBMClassifier': LGBMClassifier(n_estimators=250), - 'RFRegressor': XGBRFRegressor(n_estimators=250,n_jobs=-2), - 'RFClassifier': XGBRFClassifier(n_estimators=250,n_jobs=-2), + 'RFRegressor': XGBRFRegressor(n_estimators=250, n_jobs=-2), + 'RFClassifier': XGBRFClassifier(n_estimators=250, n_jobs=-2), 'GBMRegressor': GradientBoostingRegressor(n_estimators=200), 'GBMClassifier': GradientBoostingClassifier(n_estimators=200), - 'XGBRegressor': XGBRegressor(n_estimators=100,learning_rate=0.05,n_jobs=-2), - 'XGBClassifier': XGBClassifier(n_estimators=100,learning_rate=0.05,n_jobs=-2), + 'XGBRegressor': XGBRegressor(n_estimators=100, learning_rate=0.05, n_jobs=-2), + 'XGBClassifier': XGBClassifier(n_estimators=100, learning_rate=0.05, n_jobs=-2), 'HGBRegressor': HistGradientBoostingRegressor(max_iter=200), 'HGBClassifier': HistGradientBoostingClassifier(max_iter=200), - } + } return funcs[func_name] def fit(self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series], - z: Union[np.ndarray, pd.Series]): + z: Union[np.ndarray, pd.Series]) -> BaseEstimator: X, y = check_X_y(X, y, dtype=None, accept_sparse=False, accept_large_sparse=False, @@ -73,46 +67,42 @@ def fit(self, self.ones_ = self._resolve_estimator(self.ones_name) if self.ones_params: self.ones_.set_params(**self.ones_params) - self.ones_.fit(X[z==1], y[z==1]) + self.ones_.fit(X[z == 1], y[z == 1]) self.ones_fi = self.ones_.feature_importances_ self.zeros_ = self._resolve_estimator(self.zeros_name) if self.zeros_params: self.zeros_.set_params(**self.zeros_params) - self.zeros_.fit(X[z==0], y[z==0]) + self.zeros_.fit(X[z == 0], y[z == 0]) self.zeros_fi = self.zeros_.feature_importances_ self.is_fitted_ = True return self - - def predict(self, X: Union[np.ndarray, pd.DataFrame]): -# def predict_expected_value(self, X: Union[np.ndarray, pd.DataFrame]): + def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray: + # def predict_expected_value(self, X: Union[np.ndarray, pd.DataFrame]): """ Predict combined response using probabilistic classification outcome """ X = check_array(X, accept_sparse=False, accept_large_sparse=False) check_is_fitted(self, 'is_fitted_') -# predict = +# predict = return self.clf_.predict_proba(X)[:, 1] * self.reg_.predict(X) -def manual_test(): - """ Validate estimator using sklearn's provided utility and ensure it can fit and predict on fake dataset. """ - check_estimator(HurdleRegression) - from sklearn.datasets import make_regression - X, y = make_regression() - reg = FixedFirstSplitRegression() - reg.fit(X, y) - reg.predict(X) - - +def DefineEnsembleModels(level) -> list: + """ + This function defines an ensemble of models for VIEWS ensembling. + An ensemble model is a predictive model that combines the predictions from multiple other models. + It is used to reduce the variance, bias, or improve predictions. 
-def DefineEnsembleModels(level): + Returns: + list: a list object based on level of analysis (cm or pgm) + """ ModelList = [] if level == 'cm': nj = 12 - + model = { 'modelname': 'fatalities003_nl_baseline_rf', 'algorithm': XGBRFRegressor(n_estimators=300, n_jobs=nj), @@ -134,11 +124,10 @@ def DefineEnsembleModels(level): 'queryset': "fatalities003_conflict_history", 'preprocessing': 'float_it', 'level': 'cm', - 'description': 'A collection of variables that together map the conflict history of a country, random forests regression model.' , + 'description': 'A collection of variables that together map the conflict history of a country, random forests regression model.', 'long_description': 'A collection of variables that together map the conflict history of a country. The features include lagged dependent variables for each conflict type as coded by the UCDP (state-based, one-sided, or non-state) for up to each of the preceding six months, decay functions of time since conflict caused 5, 100, and 500 deaths in a month, for each type of violence, whether ACLED (https://doi.org/10.1177/0022343310378914 recorded similar violence, and whether there was recent violence in any neighboring countries.' } ModelList.append(model) - # Model: GED logged dependent variable, logged conflict history variables, gradient boosting model = { @@ -152,7 +141,7 @@ def DefineEnsembleModels(level): 'description': 'A collection of variables that together map the conflict history of a country, scikit gradient boosting regression model.', 'long_description': '' } - #ModelList.append(model) + # ModelList.append(model) model = { 'modelname': 'fatalities003_nl_conflicthistory_hurdle_lgb', @@ -179,9 +168,7 @@ def DefineEnsembleModels(level): 'long_description': '' } ModelList.append(model) - - - + model = { 'modelname': 'fatalities003_nl_conflicthistory_nonlog_hurdle_lgb', 'algorithm': HurdleRegression(clf_name='LGBMClassifier', reg_name='LGBMRegressor'), @@ -190,10 +177,10 @@ def DefineEnsembleModels(level): 'queryset': "fatalities003_conflict_history_nonlog", 'preprocessing': 'float_it', 'level': 'cm', - 'description': 'A collection of variables that together map the conflict history of a country, random forests regression model.' , + 'description': 'A collection of variables that together map the conflict history of a country, random forests regression model.', 'long_description': 'A collection of variables that together map the conflict history of a country. The features include lagged dependent variables for each conflict type as coded by the UCDP (state-based, one-sided, or non-state) for up to each of the preceding six months, decay functions of time since conflict caused 5, 100, and 500 deaths in a month, for each type of violence, whether ACLED (https://doi.org/10.1177/0022343310378914 recorded similar violence, and whether there was recent violence in any neighboring countries.' 
} - #ModelList.append(model) + # ModelList.append(model) model = { 'modelname': 'fatalities003_nl_vdem_hurdle_xgb', @@ -285,7 +272,7 @@ def DefineEnsembleModels(level): 'long_description': '' } ModelList.append(model) - + model = { 'modelname': 'fatalities003_joint_narrow_xgb', 'algorithm': XGBRFRegressor(n_estimators=250, n_jobs=nj), @@ -350,7 +337,7 @@ def DefineEnsembleModels(level): 'description': '', 'long_description': '' } - #ModelList.append(model) + # ModelList.append(model) model = { 'modelname': 'fatalities003_nl_faostat_rf', @@ -363,7 +350,7 @@ def DefineEnsembleModels(level): 'description': '', 'long_description': '' } - #ModelList.append(model) + # ModelList.append(model) model = { 'modelname': 'fatalities003_faoprices_rf', @@ -376,7 +363,7 @@ def DefineEnsembleModels(level): 'description': '', 'long_description': '' } - #ModelList.append(model) + # ModelList.append(model) model = { 'modelname': 'fatalities003_imfweo_rf', @@ -389,7 +376,7 @@ def DefineEnsembleModels(level): 'description': '', 'long_description': '' } - #ModelList.append(model) + # ModelList.append(model) model = { 'modelname': 'fat_hh20_Markov_glm', @@ -402,7 +389,7 @@ def DefineEnsembleModels(level): 'description': '', 'long_description': '' } - #ModelList.append(model) + # ModelList.append(model) model = { 'modelname': 'fat_hh20_Markov_rf', @@ -415,16 +402,18 @@ def DefineEnsembleModels(level): 'description': '', 'long_description': '' } - #ModelList.append(model) + # ModelList.append(model) elif level == 'pgm': nj = 12 n_estimators = 200 - rf_regressor = RandomForestRegressor(n_estimators=n_estimators, n_jobs=nj) + rf_regressor = RandomForestRegressor( + n_estimators=n_estimators, n_jobs=nj) - xgb_regressor = XGBRegressor(n_estimators=n_estimators, tree_method='hist', n_jobs=nj) + xgb_regressor = XGBRegressor( + n_estimators=n_estimators, tree_method='hist', n_jobs=nj) lgbm_regressor = LGBMRegressor(n_estimators=n_estimators) @@ -608,11 +597,11 @@ def DefineEnsembleModels(level): } ModelList.append(model) else: - raise Exception(f"Unrecognised level {level}: allowed values are cm or pgm") + raise Exception( + f"Unrecognised level {level}: allowed values are cm or pgm") for model in ModelList: model['predstore_calib'] = level + '_' + model['modelname'] + '_calib' model['predstore_test'] = level + '_' + model['modelname'] + '_test' return ModelList - From c2bf13ba99654daccc0af090942bd67d7bf8542e Mon Sep 17 00:00:00 2001 From: nskazmi <142994595+nskazmi@users.noreply.github.com> Date: Thu, 1 Feb 2024 14:59:17 +0100 Subject: [PATCH 08/13] added docstrings and return types --- dataloader/FetchData.py | 400 ++++++++++++++++++---------------------- 1 file changed, 183 insertions(+), 217 deletions(-) diff --git a/dataloader/FetchData.py b/dataloader/FetchData.py index b565cea..dda1cf4 100644 --- a/dataloader/FetchData.py +++ b/dataloader/FetchData.py @@ -7,7 +7,7 @@ from dataloader import cm_querysets, pgm_querysets, ModelDefinitions -def ReturnQsList(level): +def ReturnQsList(level) -> list: if level == 'cm': return cm_querysets.get_cm_querysets() elif level == 'pgm': @@ -16,7 +16,7 @@ def ReturnQsList(level): raise Exception(f'unrecognised level {level}') -def SummarizeTable(dfname,df): +def SummarizeTable(dfname, df) -> None: print(f"{dfname}: A dataset with {len(df.columns)} columns, with " f"data between t = {min(df.index.get_level_values(0))} " f"and {max(df.index.get_level_values(0))}; " @@ -24,127 +24,57 @@ def SummarizeTable(dfname,df): ) -def FetchTable(Queryset, name): +def FetchTable(Queryset, name) -> 
dict: df = Queryset.fetch().astype(float) df.name = name # SummarizeTable(name,df) data = { - 'Name': name, - 'df': df - } + 'Name': name, + 'df': df + } return data -def FetchData(run_id): - print(f'Fetching data using querysets; {run_id}; returns as list of dictionaries containing datasets') +def FetchData(run_id) -> list: + print( + f'Fetching data using querysets; {run_id}; returns as list of dictionaries containing datasets') Datasets = [] - if run_id == 'Fatalities001': - Datasets.append(FetchTable((Queryset("hh_fatalities_ged_ln_ultrashort", "country_month")), 'baseline')) - Datasets.append(FetchTable((Queryset("hh_fatalities_ged_acled_ln", "country_month")), 'conflictlong_ln')) - Datasets.append(FetchTable((Queryset("fat_cm_conflict_history", "country_month")), 'conflict_ln')) - Datasets.append(FetchTable((Queryset("fat_cm_conflict_history_exp", "country_month")), 'conflict_nolog')) - Datasets.append(FetchTable((Queryset("hh_fatalities_wdi_short", "country_month")), 'wdi_short')) - Datasets.append(FetchTable((Queryset("hh_fatalities_vdem_short", "country_month")), 'vdem_short')) - Datasets.append(FetchTable((Queryset("hh_topic_model_short", "country_month")), 'topics_short')) - Datasets.append(FetchTable((Queryset("hh_broad", "country_month")), 'broad')) - # Datasets.append(FetchTable((Queryset("hh_prs", "country_month")),'prs')) - Datasets.append(FetchTable((Queryset("hh_greatest_hits", "country_month")), 'gh')) - Datasets.append(FetchTable((Queryset("hh_20_features", "country_month")), 'hh20')) - Datasets.append(FetchTable((Queryset("hh_all_features", "country_month")), 'all_features')) - - # PCA - Standard_features = ['ged_sb_dep', 'ged_sb', 'decay_ged_sb_5', 'decay_ged_os_5', 'splag_1_decay_ged_sb_5', - 'wdi_sp_pop_totl'] - - sources = [] - af = { - 'name': 'all features', - 'dataset': Datasets[10]['df'], - 'n_comp': 20 - } - sources.append(af) - topics = { - 'name': 'topics', - 'dataset': Datasets[6]['df'], - 'n_comp': 10 - } - sources.append(topics) - vdem = { - 'name': 'vdem', - 'dataset': Datasets[5]['df'], - 'n_comp': 15 - } - sources.append(vdem) - wdi = { - 'name': 'wdi', - 'dataset': Datasets[4]['df'], - 'n_comp': 15 - } - sources.append(wdi) - - EndOfPCAData = 516 - for source in sources: - source = PCA(source, Standard_features, EndOfPCAData) - - Data = { - 'Name': 'pca_all', - 'df': af['result'] - } - Datasets.append(Data) - - Data = { - 'Name': 'pca_topics', - 'df': topics['result'] - } - Datasets.append(Data) - - Data = { - 'Name': 'pca_vdem', - 'df': vdem['result'] - } - Datasets.append(Data) - - Data = { - 'Name': 'pca_wdi', - 'df': wdi['result'] - } - Datasets.append(Data) - - elif run_id == 'Fatalities002': - Datasets.append(FetchTable((Queryset("fatalities002_baseline", "country_month")),'baseline002')) - Datasets.append(FetchTable((Queryset("fatalities002_conflict_history_long", "country_month")),'conflictlong_ln')) - Datasets.append(FetchTable((Queryset("fatalities002_conflict_history", "country_month")),'conflict_ln')) - Datasets.append(FetchTable((Queryset("fatalities002_wdi_short", "country_month")),'wdi_short')) - Datasets.append(FetchTable((Queryset("fatalities002_vdem_short", "country_month")),'vdem_short')) - Datasets.append(FetchTable((Queryset("fatalities002_topics", "country_month")),'topics_002')) - Datasets.append(FetchTable((Queryset("fatalities002_joint_broad", "country_month")),'joint_broad')) - Datasets.append(FetchTable((Queryset("fatalities002_joint_narrow", "country_month")),'joint_narrow')) - 
Datasets.append(FetchTable((Queryset("fatalities002_all_features", "country_month")),'all_features')) - Datasets.append(FetchTable((Queryset("fatalities002_aquastat", "country_month")),'aquastat')) - Datasets.append(FetchTable((Queryset("fatalities002_faostat", "country_month")),'faostat')) - Datasets.append(FetchTable((Queryset("fatalities002_faoprices", "country_month")),'faoprices')) - Datasets.append(FetchTable((Queryset("fatalities002_imfweo", "country_month")),'imfweo')) - - elif run_id == 'Fatalities003': - Datasets.append(FetchTable((Queryset("fatalities003_baseline", "country_month")),'baseline003')) - Datasets.append(FetchTable((Queryset("fatalities003_conflict_history_long", "country_month")),'conflictlong_ln')) - Datasets.append(FetchTable((Queryset("fatalities003_conflict_history", "country_month")),'conflict_ln')) - Datasets.append(FetchTable((Queryset("fatalities003_conflict_history_nonlog", "country_month")),'conflict_nonlog')) - Datasets.append(FetchTable((Queryset("fatalities003_wdi_short", "country_month")),'wdi_short')) - Datasets.append(FetchTable((Queryset("fatalities003_vdem_short", "country_month")),'vdem_short')) - Datasets.append(FetchTable((Queryset("fatalities003_topics", "country_month")),'topics_003')) - Datasets.append(FetchTable((Queryset("fatalities003_joint_broad", "country_month")),'joint_broad')) - Datasets.append(FetchTable((Queryset("fatalities003_joint_broad_nonlog", "country_month")),'joint_broad_nonlog')) - Datasets.append(FetchTable((Queryset("fatalities003_joint_narrow", "country_month")),'joint_narrow')) - Datasets.append(FetchTable((Queryset("fatalities003_all_features", "country_month")),'all_features')) - Datasets.append(FetchTable((Queryset("fatalities003_aquastat", "country_month")),'aquastat')) - Datasets.append(FetchTable((Queryset("fatalities003_faostat", "country_month")),'faostat')) - Datasets.append(FetchTable((Queryset("fatalities003_faoprices", "country_month")),'faoprices')) - Datasets.append(FetchTable((Queryset("fatalities003_imfweo", "country_month")),'imfweo')) + if run_id == 'Fatalities003': + Datasets.append(FetchTable( + (Queryset("fatalities003_baseline", "country_month")), 'baseline003')) + Datasets.append(FetchTable((Queryset( + "fatalities003_conflict_history_long", "country_month")), 'conflictlong_ln')) + Datasets.append(FetchTable( + (Queryset("fatalities003_conflict_history", "country_month")), 'conflict_ln')) + Datasets.append(FetchTable((Queryset( + "fatalities003_conflict_history_nonlog", "country_month")), 'conflict_nonlog')) + Datasets.append(FetchTable( + (Queryset("fatalities003_wdi_short", "country_month")), 'wdi_short')) + Datasets.append(FetchTable( + (Queryset("fatalities003_vdem_short", "country_month")), 'vdem_short')) + Datasets.append(FetchTable( + (Queryset("fatalities003_topics", "country_month")), 'topics_003')) + Datasets.append(FetchTable( + (Queryset("fatalities003_joint_broad", "country_month")), 'joint_broad')) + Datasets.append(FetchTable((Queryset( + "fatalities003_joint_broad_nonlog", "country_month")), 'joint_broad_nonlog')) + Datasets.append(FetchTable( + (Queryset("fatalities003_joint_narrow", "country_month")), 'joint_narrow')) + Datasets.append(FetchTable( + (Queryset("fatalities003_all_features", "country_month")), 'all_features')) + Datasets.append(FetchTable( + (Queryset("fatalities003_aquastat", "country_month")), 'aquastat')) + Datasets.append(FetchTable( + (Queryset("fatalities003_faostat", "country_month")), 'faostat')) + Datasets.append(FetchTable( + 
(Queryset("fatalities003_faoprices", "country_month")), 'faoprices')) + Datasets.append(FetchTable( + (Queryset("fatalities003_imfweo", "country_month")), 'imfweo')) # PCA - Standard_features = ['ged_sb_dep','ged_sb', 'decay_ged_sb_5', 'decay_ged_os_5', 'splag_1_decay_ged_sb_5', 'wdi_sp_pop_totl'] + Standard_features = ['ged_sb_dep', 'ged_sb', 'decay_ged_sb_5', + 'decay_ged_os_5', 'splag_1_decay_ged_sb_5', 'wdi_sp_pop_totl'] sources = [] af = { @@ -177,7 +107,7 @@ def FetchData(run_id): EndOfPCAData = 516 for source in sources: - source = PCA(source, Standard_features,EndOfPCAData) + source = PCA(source, Standard_features, EndOfPCAData) Data = { 'Name': 'pca_all', @@ -206,38 +136,40 @@ def FetchData(run_id): else: raise Exception(f"run_id {run_id} not recognised") - return(Datasets) + return (Datasets) -def get_df_from_datasets_by_name(Datasets,name): +def get_df_from_datasets_by_name(Datasets, name) -> pd.DataFrame: for ds in Datasets: if name in ds['Name']: return ds['df'] else: - raise Exception('No Dataset similar to ',name,'found') + raise Exception('No Dataset similar to ', name, 'found') -def fetch_cm_data_from_model_def(qslist): +def fetch_cm_data_from_model_def(qslist) -> list: level = 'cm' ModelList = ModelDefinitions.DefineEnsembleModels(level) - defined_querysets=[qs.name for qs in qslist] + defined_querysets = [qs.name for qs in qslist] - model_querysets=list(set([model['queryset'] for model in ModelList])) + model_querysets = list(set([model['queryset'] for model in ModelList])) - qs_short_names={} + qs_short_names = {} for model in ModelList: - qs_short_names[model['queryset']]=model['data_train'] + qs_short_names[model['queryset']] = model['data_train'] - Datasets=[] + Datasets = [] for model_qs in model_querysets: if model_qs not in defined_querysets: - raise Exception(f'queryset',model_qs,'is not defined in the imported queryset definitions file') + raise Exception( + f'queryset', model_qs, 'is not defined in the imported queryset definitions file') - Datasets.append(FetchTable((Queryset(model_qs, "country_month")), qs_short_names[model_qs])) + Datasets.append(FetchTable( + (Queryset(model_qs, "country_month")), qs_short_names[model_qs])) # PCA Standard_features = ['ged_sb_dep', 'ged_sb', 'decay_ged_sb_5', 'decay_ged_os_5', 'splag_1_decay_ged_sb_5', @@ -248,117 +180,136 @@ def fetch_cm_data_from_model_def(qslist): name = 'all_features' af = { 'name': name, - 'dataset': get_df_from_datasets_by_name(Datasets,name), + 'dataset': get_df_from_datasets_by_name(Datasets, name), 'n_comp': 20 - } + } sources.append(af) name = 'topics' topics = { - 'name': name, - 'dataset': get_df_from_datasets_by_name(Datasets, name), - 'n_comp': 10 - } + 'name': name, + 'dataset': get_df_from_datasets_by_name(Datasets, name), + 'n_comp': 10 + } sources.append(topics) name = 'vdem' vdem = { - 'name': name, - 'dataset': get_df_from_datasets_by_name(Datasets, name), - 'n_comp': 15 - } + 'name': name, + 'dataset': get_df_from_datasets_by_name(Datasets, name), + 'n_comp': 15 + } sources.append(vdem) name = 'wdi' wdi = { - 'name': name, - 'dataset': get_df_from_datasets_by_name(Datasets, name), - 'n_comp': 15 - } + 'name': name, + 'dataset': get_df_from_datasets_by_name(Datasets, name), + 'n_comp': 15 + } sources.append(wdi) EndOfPCAData = 516 for source in sources: - source = PCA(source, Standard_features,EndOfPCAData) + source = PCA(source, Standard_features, EndOfPCAData) Data = { - 'Name': 'pca_all', - 'df': af['result'] - } + 'Name': 'pca_all', + 'df': af['result'] + } Datasets.append(Data) Data 
= { - 'Name': 'pca_topics', - 'df': topics['result'] - } + 'Name': 'pca_topics', + 'df': topics['result'] + } Datasets.append(Data) Data = { - 'Name': 'pca_vdem', - 'df': vdem['result'] - } + 'Name': 'pca_vdem', + 'df': vdem['result'] + } Datasets.append(Data) Data = { - 'Name': 'pca_wdi', - 'df': wdi['result'] - } + 'Name': 'pca_wdi', + 'df': wdi['result'] + } Datasets.append(Data) return Datasets -def FetchData_pgm(run_id): +def FetchData_pgm(run_id) -> list: # print('Fetching data using querysets; returns as list of dictionaries containing datasets') Datasets = [] if run_id == 'Fatalities001': - Datasets.append(FetchTable((Queryset("hh_fat_pgm_baseline", "priogrid_month")),'baseline')) - Datasets.append(FetchTable((Queryset("hh_fat_pgm_conflictlong", "priogrid_month")),'conflictlong')) - Datasets.append(FetchTable((Queryset("fat_escwa_drought_vulnerability_pgm", "priogrid_month")),'escwa_drought')) - Datasets.append(FetchTable((Queryset("hh_fat_pgm_natsoc", "priogrid_month")),'natsoc')) - Datasets.append(FetchTable((Queryset("hh_fat_pgm_broad", "priogrid_month")),'broad')) - Datasets.append(FetchTable((Queryset("paola_fatalities_conflict_history", "priogrid_month")),'paola_conf_hist')) - Datasets.append(FetchTable((Queryset("jim_pgm_conflict_treelag_d_1_d_2", "priogrid_month")),'conf_treelag')) - Datasets.append(FetchTable((Queryset("jim_pgm_conflict_target_sptime_dist_nu1_10_001", "priogrid_month")),'conf_sptime_dist')) + Datasets.append(FetchTable( + (Queryset("hh_fat_pgm_baseline", "priogrid_month")), 'baseline')) + Datasets.append(FetchTable( + (Queryset("hh_fat_pgm_conflictlong", "priogrid_month")), 'conflictlong')) + Datasets.append(FetchTable((Queryset( + "fat_escwa_drought_vulnerability_pgm", "priogrid_month")), 'escwa_drought')) + Datasets.append(FetchTable( + (Queryset("hh_fat_pgm_natsoc", "priogrid_month")), 'natsoc')) + Datasets.append(FetchTable( + (Queryset("hh_fat_pgm_broad", "priogrid_month")), 'broad')) + Datasets.append(FetchTable((Queryset( + "paola_fatalities_conflict_history", "priogrid_month")), 'paola_conf_hist')) + Datasets.append(FetchTable( + (Queryset("jim_pgm_conflict_treelag_d_1_d_2", "priogrid_month")), 'conf_treelag')) + Datasets.append(FetchTable((Queryset( + "jim_pgm_conflict_target_sptime_dist_nu1_10_001", "priogrid_month")), 'conf_sptime_dist')) if run_id == 'Fatalities003': - Datasets.append(FetchTable((Queryset("fatalities003_pgm_baseline", "priogrid_month")), 'baseline')) - Datasets.append(FetchTable((Queryset("fatalities003_pgm_conflictlong", "priogrid_month")), 'conflictlong')) - Datasets.append(FetchTable((Queryset("fatalities003_pgm_escwa_drought", "priogrid_month")), 'escwa_drought')) - Datasets.append(FetchTable((Queryset("fatalities003_pgm_natsoc", "priogrid_month")), 'natsoc')) - Datasets.append(FetchTable((Queryset("fatalities003_pgm_broad", "priogrid_month")), 'broad')) - Datasets.append(FetchTable((Queryset("fatalities003_pgm_conflict_history", "priogrid_month")), 'conflict_hist')) - Datasets.append(FetchTable((Queryset("fatalities003_pgm_conflict_treelag", "priogrid_month")), 'conflict_treelag')) - Datasets.append(FetchTable((Queryset("fatalities003_pgm_conflict_sptime_dist", "priogrid_month")),'conflict_sptime_dist')) - return(Datasets) - - -def fetch_pgm_data_from_model_def(qslist): + Datasets.append(FetchTable( + (Queryset("fatalities003_pgm_baseline", "priogrid_month")), 'baseline')) + Datasets.append(FetchTable( + (Queryset("fatalities003_pgm_conflictlong", "priogrid_month")), 'conflictlong')) + Datasets.append(FetchTable( + 
(Queryset("fatalities003_pgm_escwa_drought", "priogrid_month")), 'escwa_drought')) + Datasets.append(FetchTable( + (Queryset("fatalities003_pgm_natsoc", "priogrid_month")), 'natsoc')) + Datasets.append(FetchTable( + (Queryset("fatalities003_pgm_broad", "priogrid_month")), 'broad')) + Datasets.append(FetchTable((Queryset( + "fatalities003_pgm_conflict_history", "priogrid_month")), 'conflict_hist')) + Datasets.append(FetchTable((Queryset( + "fatalities003_pgm_conflict_treelag", "priogrid_month")), 'conflict_treelag')) + Datasets.append(FetchTable((Queryset( + "fatalities003_pgm_conflict_sptime_dist", "priogrid_month")), 'conflict_sptime_dist')) + return (Datasets) + + +def fetch_pgm_data_from_model_def(qslist) -> list: level = 'pgm' ModelList = ModelDefinitions.DefineEnsembleModels(level) - defined_querysets=[qs.name for qs in qslist] + defined_querysets = [qs.name for qs in qslist] - model_querysets=list(set([model['queryset'] for model in ModelList])) + model_querysets = list(set([model['queryset'] for model in ModelList])) - qs_short_names={} + qs_short_names = {} for model in ModelList: - qs_short_names[model['queryset']]=model['data_train'] + qs_short_names[model['queryset']] = model['data_train'] - Datasets=[] + Datasets = [] for model_qs in model_querysets: if model_qs not in defined_querysets: - raise Exception(f'queryset',model_qs,'is not defined in the imported queryset definitions file') + raise Exception( + f'queryset', model_qs, 'is not defined in the imported queryset definitions file') - Datasets.append(FetchTable((Queryset(model_qs, "priogrid_month")), qs_short_names[model_qs])) + Datasets.append(FetchTable( + (Queryset(model_qs, "priogrid_month")), qs_short_names[model_qs])) return Datasets -def get_training_data(Datasets, ModelList, model_name): + +def get_training_data(Datasets, ModelList, model_name) -> pd.DataFrame: for model in ModelList: if model['modelname'] == model_name: ds_name = model['data_train'] @@ -375,7 +326,7 @@ def get_training_data(Datasets, ModelList, model_name): return None -def data_integrity_check(dataset, depvar): +def data_integrity_check(dataset, depvar) -> None: if depvar not in dataset['df'].columns: print(depvar, 'not found in', dataset['Name']) return @@ -386,17 +337,19 @@ def data_integrity_check(dataset, depvar): dataset['df'].insert(0, depvar, depvar_column) if 'country_id' in dataset['df'].columns: - print('country_id found in dataset for ', dataset['Name'], '- dropping') + print('country_id found in dataset for ', + dataset['Name'], '- dropping') dataset['df'] = dataset['df'].drop(['country_id', ], 1) for column in dataset['df'].columns: if dataset['df'][column].isna().sum() != 0: - print('WARNING - NaN/Null data detected in', dataset['Name'], 'column', column) + print('WARNING - NaN/Null data detected in', + dataset['Name'], 'column', column) return -def index_check(model, df_with_wanted_index): +def index_check(model, df_with_wanted_index) -> None: level0_name_wanted, level1_name_wanted = df_with_wanted_index.index.names for key in model.keys(): @@ -410,7 +363,8 @@ def index_check(model, df_with_wanted_index): print('original:', level0_name_have, level1_name_have) print('fixed:', level0_name_wanted, level1_name_wanted) - model[key].index.set_names([level0_name_wanted, level1_name_wanted], inplace=True) + model[key].index.set_names( + [level0_name_wanted, level1_name_wanted], inplace=True) except: pass @@ -418,9 +372,9 @@ def index_check(model, df_with_wanted_index): return -def PCA(source, Standard_features, EndOfPCAData): +def 
PCA(source, Standard_features, EndOfPCAData) -> dict: df = source['dataset'].loc[121:EndOfPCAData].copy() - df = df.replace([np.inf, -np.inf], 0) + df = df.replace([np.inf, -np.inf], 0) df = df.fillna(0) pca = decomposition.PCA(n_components=source['n_comp']) pca.fit(df) @@ -437,11 +391,12 @@ def PCA(source, Standard_features, EndOfPCAData): colnames.append(colname) source['result'].columns = colnames source['result'].head() - return(source) + return (source) -def find_index(dicts, key, value): - class Null: pass +def find_index(dicts, key, value) -> int: + class Null: + pass for i, d in enumerate(dicts): if d.get(key, Null) == value: return i @@ -449,27 +404,27 @@ class Null: pass raise ValueError('no dict with the key and value combination found') -def RetrieveFromList(Datasets,name): +def RetrieveFromList(Datasets, name) -> pd.DataFrame: return Datasets[find_index(Datasets, 'Name', name)]['df'] -def find_between(s, start, end): +def find_between(s, start, end) -> str: return (s.split(start))[1].split(end)[0] -def find_between_brackets(s): +def find_between_brackets(s) -> str: return (s.split('['))[1].split(']')[0] -def document_queryset(qslist,dev_id): +def document_queryset(qslist, dev_id) -> None: ''' Writes a markdown file listing the variables in the querysets passed in the list of querysets ''' - file = open("../Documentation/Querysets.md","w") + file = open("../Documentation/Querysets.md", "w") file.write('# Documentation of querysets') file.write(dev_id) for qs in qslist: - print('Model: ',qs.name) + print('Model: ', qs.name) file.write(qs.name) ModelMetaData = [] i = 0 @@ -477,21 +432,21 @@ def document_queryset(qslist,dev_id): VarMetaData = { 'Model': qs.name, 'Included variable name': find_between_brackets(str(var[0])), - 'Database variable name': find_between(str(var[-1]),'name=',' ') + 'Database variable name': find_between(str(var[-1]), 'name=', ' ') } Transformations = [] for line in var: - # print('line:',line) - item = str(line) + # print('line:',line) + item = str(line) if 'trf' in item: - trf = find_between(item,'name=',' ') + trf = find_between(item, 'name=', ' ') # print('trf:', trf) if 'util.rename' not in trf: Transformations.append(trf) # print(Transformations) VarMetaData['Transformations'] = Transformations ModelMetaData.append(VarMetaData) - i= i + 1 + i = i + 1 ModelMetaData_df = pd.DataFrame(ModelMetaData) filename = '../Documentation/Model_' + qs.name + '.md' ModelMetaData_df.to_markdown(index=False, buf=filename) @@ -501,9 +456,9 @@ def document_queryset(qslist,dev_id): file.close() -def document_ensemble(ModelList, outcome): +def document_ensemble(ModelList, outcome) -> None: ''' Writes a markdown file listing the models passed in the list of models ''' - + i = 0 EnsembleMetaData = [] for model in ModelList: @@ -520,25 +475,29 @@ def document_ensemble(ModelList, outcome): ModelMetaData['PCA'] = 'True' else: ModelMetaData['PCA'] = 'False' - + EnsembleMetaData.append(ModelMetaData) i = i + 1 EnsembleMetaData_df = pd.DataFrame(EnsembleMetaData) filename = f'../Documentation/Ensemble_{outcome}.md' EnsembleMetaData_df.to_markdown(index=False, buf=filename) - return(EnsembleMetaData_df) + return (EnsembleMetaData_df) # calibration of pgm predictions using cm predictions: -def calibrate_pg_with_c(df_pgm, df_cm, column, df_pg_id_c_id=None, log_feature=False, super_calibrate=False): + + +def calibrate_pg_with_c(df_pgm, df_cm, column, df_pg_id_c_id=None, log_feature=False, super_calibrate=False) -> pd.DataFrame: try: assert df_pgm.index.names[0] == 'month_id' except 
AssertionError: raise ValueError(f"Expected pgm df to have month_id as 1st index") try: - assert df_pgm.index.names[1] in ['priogrid_gid', 'priogrid_id', 'pg_id'] + assert df_pgm.index.names[1] in [ + 'priogrid_gid', 'priogrid_id', 'pg_id'] except AssertionError: - raise ValueError(f"Expected pgm df to have one of priogrid_gid, priogrid_id, pg_id as 2nd index") + raise ValueError( + f"Expected pgm df to have one of priogrid_gid, priogrid_id, pg_id as 2nd index") try: assert df_cm.index.names[0] == 'month_id' @@ -548,7 +507,8 @@ def calibrate_pg_with_c(df_pgm, df_cm, column, df_pg_id_c_id=None, log_feature=F try: assert df_cm.index.names[1] in ['country_id', 'c_id'] except AssertionError: - raise ValueError(f"Expected cm df to have one of country_id, c_id as 2nd index") + raise ValueError( + f"Expected cm df to have one of country_id, c_id as 2nd index") try: assert column in df_pgm.columns @@ -584,11 +544,15 @@ def calibrate_pg_with_c(df_pgm, df_cm, column, df_pg_id_c_id=None, log_feature=F normalised = np.zeros((df_pgm[column].size)) if log_feature: - df_to_calib = pd.DataFrame(index=df_pgm.index, columns=[column, ], data=np.exp(df_pgm[column].values) - 1) - df_calib_from = pd.DataFrame(index=df_cm.index, columns=[column, ], data=np.exp(df_cm[column].values) - 1) + df_to_calib = pd.DataFrame(index=df_pgm.index, columns=[ + column, ], data=np.exp(df_pgm[column].values) - 1) + df_calib_from = pd.DataFrame(index=df_cm.index, columns=[ + column, ], data=np.exp(df_cm[column].values) - 1) else: - df_to_calib = pd.DataFrame(index=df_pgm.index, columns=[column, ], data=df_pgm[column].values) - df_calib_from = pd.DataFrame(index=df_cm.index, columns=[column, ], data=df_cm[column].values) + df_to_calib = pd.DataFrame(index=df_pgm.index, columns=[ + column, ], data=df_pgm[column].values) + df_calib_from = pd.DataFrame(index=df_cm.index, columns=[ + column, ], data=df_cm[column].values) for imonth, month in enumerate(input_months_pgm): @@ -599,7 +563,8 @@ def calibrate_pg_with_c(df_pgm, df_cm, column, df_pg_id_c_id=None, log_feature=F df_data_month_pgm = pd.DataFrame(df_to_calib[column].loc[month]) - values_month_pgm = df_to_calib[column].loc[month].values.reshape(pg_size) + values_month_pgm = df_to_calib[column].loc[month].values.reshape( + pg_size) df_data_month_cm = pd.DataFrame(df_calib_from[column].loc[month]) @@ -633,13 +598,14 @@ def calibrate_pg_with_c(df_pgm, df_cm, column, df_pg_id_c_id=None, log_feature=F if log_feature: normalised = np.log(normalised + 1) - df_out = pd.DataFrame(index=df_pgm.index, columns=[column, ], data=normalised) + df_out = pd.DataFrame(index=df_pgm.index, columns=[ + column, ], data=normalised) return df_out # helper function for pgm-cm calibration, which fetches country-ids for pg-ids -def fetch_df_pg_id_c_id(): +def fetch_df_pg_id_c_id() -> pd.DataFrame: qs = (Queryset("jed_pgm_cm", "priogrid_month") .with_column(Column("country_id", from_table="country_month", from_column="country_id") @@ -648,4 +614,4 @@ def fetch_df_pg_id_c_id(): df_pg_id_c_id = qs.publish().fetch() - return df_pg_id_c_id \ No newline at end of file + return df_pg_id_c_id From 30b4e6de3a39c5bf594d836d551efde2caac4f26 Mon Sep 17 00:00:00 2001 From: nskazmi <142994595+nskazmi@users.noreply.github.com> Date: Fri, 2 Feb 2024 10:44:48 +0100 Subject: [PATCH 09/13] imported os --- sweep.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sweep.py b/sweep.py index 2fc5d46..9b89340 100644 --- a/sweep.py +++ b/sweep.py @@ -2,6 +2,10 @@ from pathlib import Path import argparse import wandb +import 
os +from diskcache import Cache +import warnings +warnings.filterwarnings("ignore") def train() -> None: @@ -37,6 +41,7 @@ def train() -> None: if __name__ == '__main__': # this is the main block of code that will be called when running the script from the command line with arguments + os.environ['WANDB_SILENT'] = 'true' parser = argparse.ArgumentParser(description='Method for sweeping on W&B') parser.add_argument('-l', metavar='level', type=str, From f5a44035b1b42f0cd13d672aad732043b5226f91 Mon Sep 17 00:00:00 2001 From: nskazmi <142994595+nskazmi@users.noreply.github.com> Date: Fri, 2 Feb 2024 10:45:17 +0100 Subject: [PATCH 10/13] removed import cache --- sweep.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sweep.py b/sweep.py index 9b89340..7edee63 100644 --- a/sweep.py +++ b/sweep.py @@ -3,7 +3,6 @@ import argparse import wandb import os -from diskcache import Cache import warnings warnings.filterwarnings("ignore") From 6a2e816cf9b547e02bf8899df8995d52cd66d27a Mon Sep 17 00:00:00 2001 From: nskazmi <142994595+nskazmi@users.noreply.github.com> Date: Fri, 2 Feb 2024 10:46:45 +0100 Subject: [PATCH 11/13] added docstrings, removed unused libraries --- util/utils_map.py | 155 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 109 insertions(+), 46 deletions(-) diff --git a/util/utils_map.py b/util/utils_map.py index 74ab1a3..954adb1 100644 --- a/util/utils_map.py +++ b/util/utils_map.py @@ -1,28 +1,44 @@ +import warnings +from pathlib import Path +import matplotlib.ticker as ticker +import matplotlib.colors as colors import numpy as np import pandas as pd import geopandas as gpd import matplotlib.pyplot as plt from mpl_toolkits.axes_grid1 import make_axes_locatable import matplotlib -matplotlib.pyplot.switch_backend('Agg') # Important! Disable GUI windows so that thread won't break down -import matplotlib.colors as colors -import matplotlib.ticker as ticker -import requests -from PIL import Image -from pathlib import Path +# Important! Disable GUI windows so that thread won't break down +matplotlib.pyplot.switch_backend('Agg') +warnings.filterwarnings("ignore") + class GeoPlotter: + """ + This class is used to plot the maps for the predictions and the errors. It uses the geopandas library to read the shapefiles for the countries and the priogrids. The class has two methods, plot_cm_map and plot_pgm_map, which are used to plot the maps for the cm and pgm levels, respectively. The methods take the following parameters: + - df: A pandas DataFrame containing the predictions and the errors. + - month: An integer representing the month for which the maps are to be plotted. + - step: A string representing the step for which the maps are to be plotted. + - transform: A string representing the transformation to be used for the predictions and the errors. + """ # Define the bounding box coordinates for Africa and the Middle East (latitude and longitude) XMIN, XMAX, YMIN, YMAX = -18.5, 64.0, -35.5, 43.0 - def __init__(self, cm_path='./GeospatialData/countries.shp', pgm_path='./GeospatialData/priogrid.shp'): + def __init__(self, cm_path='./GeospatialData/countries.shp', pgm_path='./GeospatialData/priogrid.shp') -> None: self.cm_path = Path(cm_path) self.pgm_path = Path(pgm_path) self.cm_file = gpd.read_file(self.cm_path) self.pgm_file = gpd.read_file(self.pgm_path) - def plot_cm_map(self, df, month, step, transform): + """ + This method is used to plot the maps for the cm level. It takes the following parameters: + - df: A pandas DataFrame containing the predictions and the errors. 
+ - month: An integer representing the month for which the maps are to be plotted. + - step: A string representing the step for which the maps are to be plotted. + - transform: A string representing the transformation to be used for the predictions and the errors. + """ + df_cm = df.reset_index() df_cm = pd.merge(df_cm, self.cm_file, on='country_id') gdf_cm = gpd.GeoDataFrame(df_cm, crs=self.cm_file.crs) @@ -30,52 +46,63 @@ def plot_cm_map(self, df, month, step, transform): # Create subplot fig, axes = plt.subplots(2, 2, figsize=(24, 16)) - fig.suptitle(f'Level: cm, Month: {month}, Transform: {transform}', fontsize=25) - self.plot_fatality(gdf_cm_m, axes[0,0], step, 'cm') - self.plot_absolute_error(gdf_cm_m, axes[0,1], step, 'cm') - self.plot_squared_error(gdf_cm_m, axes[1,0], step, 'cm') - self.plot_squared_error(gdf_cm_m, axes[1,1], step, 'cm', if_log=True) - - # # Download and display the VIEWS logo image - # logo_url = "https://cdn.cloud.prio.org/images/c784369fb4ae42acb7ee882e91056d92.png?x=800&" - # response = requests.get(logo_url, stream=True) - # - # if response.status_code == 200: - # logo_img = Image.open(response.raw) - # logo_ax = fig.add_axes( - # [0.28, 0.18, 0.1, 0.1]) # Define the position and size of the logo [left, bottom, width, height] - # logo_ax.imshow(logo_img) - # logo_ax.axis('off') # Turn off axis labels and ticks for the logo - # else: - # print("Failed to download the logo image") - + fig.suptitle( + f'Level: cm, Month: {month}, Transform: {transform}', fontsize=25) + self.plot_fatality(gdf_cm_m, axes[0, 0], step, 'cm') + self.plot_absolute_error(gdf_cm_m, axes[0, 1], step, 'cm') + self.plot_squared_error(gdf_cm_m, axes[1, 0], step, 'cm') + self.plot_squared_error(gdf_cm_m, axes[1, 1], step, 'cm', if_log=True) plt.close() return fig def plot_pgm_map(self, df, month, step, transform): + """ + This method is used to plot the maps for the pgm level. It takes the following parameters: + - df: A pandas DataFrame containing the predictions and the errors. + - month: An integer representing the month for which the maps are to be plotted. + - step: A string representing the step for which the maps are to be plotted. + - transform: A string representing the transformation to be used for the predictions and the errors. 
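+
+        Example (a minimal usage sketch; `df_preds`, the month number and the step
+        column name are placeholders for whatever the prediction pipeline produced):
+            >>> plotter = GeoPlotter()
+            >>> fig = plotter.plot_pgm_map(df_preds, month=500, step='step_pred_3',
+            ...                            transform='log')
+            >>> fig.savefig('pgm_maps_month_500.png')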
+ """ df_pgm = df.reset_index() - df_pgm = pd.merge(df_pgm, self.pgm_file, left_on='priogrid_id', right_on='priogrid_i') # Note that in predictions the coloumn name is priogrid_id instead of priogrid_gid + # Note that in predictions the coloumn name is priogrid_id instead of priogrid_gid + df_pgm = pd.merge(df_pgm, self.pgm_file, + left_on='priogrid_id', right_on='priogrid_i') gdf_pgm = gpd.GeoDataFrame(df_pgm, crs=self.pgm_file.crs) gdf_pgm_m = gdf_pgm.loc[gdf_pgm['month_id'] == month] # Create subplot fig, axes = plt.subplots(2, 2, figsize=(24, 16)) - fig.suptitle(f'Level: pgm, Month: {month}, Transform: {transform}', fontsize=25) - self.plot_fatality(gdf_pgm_m, axes[0,0], step, 'pgm') - self.plot_absolute_error(gdf_pgm_m, axes[0,1], step, 'pgm') - self.plot_squared_error(gdf_pgm_m, axes[1,0], step, 'pgm') - self.plot_squared_error(gdf_pgm_m, axes[1,1], step, 'pgm', if_log=True) + fig.suptitle( + f'Level: pgm, Month: {month}, Transform: {transform}', fontsize=25) + self.plot_fatality(gdf_pgm_m, axes[0, 0], step, 'pgm') + self.plot_absolute_error(gdf_pgm_m, axes[0, 1], step, 'pgm') + self.plot_squared_error(gdf_pgm_m, axes[1, 0], step, 'pgm') + self.plot_squared_error( + gdf_pgm_m, axes[1, 1], step, 'pgm', if_log=True) plt.close() return fig def add_cax(self, ax): + """ + This method is used to add a colorbar to the map. It takes the following parameters: + - ax: The axes object to which the colorbar is to be added. + """ divider = make_axes_locatable(ax) cax = divider.append_axes("bottom", size="5%", pad=0.5) cax.tick_params(labelsize=20) return cax - def add_cm_plot(self, gdf, column, vmin, vmax, ax, title): + def add_cm_plot(self, gdf, column, vmin, vmax, ax, title) -> None: + """ + This method is used to add a plot for the cm level. It takes the following parameters: + - gdf: A geopandas GeoDataFrame containing the predictions and the errors. + - column: A string representing the column to be plotted. + - vmin: A float representing the minimum value for the colorbar. + - vmax: A float representing the maximum value for the colorbar. + - ax: The axes object to which the plot is to be added. + - title: A string representing the title of the plot. + """ cax = self.add_cax(ax) self.cm_file.boundary.plot(ax=ax, linewidth=0.3) plot = gdf.plot(column=column, cmap='viridis', legend=True, @@ -89,12 +116,23 @@ def add_cm_plot(self, gdf, column, vmin, vmax, ax, title): colorbar = plot.get_figure().get_axes()[1] colorbar.xaxis.set_major_formatter(ticker.ScalarFormatter()) - def add_pgm_plot(self, gdf, column, vmin, vmax, ax, title): + def add_pgm_plot(self, gdf, column, vmin, vmax, ax, title) -> None: + """ + This method is used to add a plot for the pgm level. It takes the following parameters: + - gdf: A geopandas GeoDataFrame containing the predictions and the errors. + - column: A string representing the column to be plotted. + - vmin: A float representing the minimum value for the colorbar. + - vmax: A float representing the maximum value for the colorbar. + - ax: The axes object to which the plot is to be added. + - title: A string representing the title of the plot. 
+ """ cax = self.add_cax(ax) - self.pgm_file.boundary.plot(ax=ax, linewidth=0.2) # Plot pgm boundaries - self.cm_file.boundary.plot(ax=ax, linewidth=1.1, color='grey') # use cm unit map + self.pgm_file.boundary.plot( + ax=ax, linewidth=0.2) # Plot pgm boundaries + self.cm_file.boundary.plot( + ax=ax, linewidth=1.1, color='grey') # use cm unit map plot = gdf.plot(column=column, cmap='viridis', legend=True, - norm=colors.SymLogNorm(linthresh=1, vmin=vmin,vmax=vmax), cax=cax, + norm=colors.SymLogNorm(linthresh=1, vmin=vmin, vmax=vmax), cax=cax, legend_kwds={"label": " ", "orientation": "horizontal"}, ax=ax, linewidth=0.2, edgecolor='#FF000000') ax.set_title(title, fontsize=15, y=1) @@ -105,9 +143,17 @@ def add_pgm_plot(self, gdf, column, vmin, vmax, ax, title): ax.set_ylim(self.YMIN, self.YMAX) colorbar = plot.get_figure().get_axes()[1] - colorbar.xaxis.set_major_formatter(ticker.ScalarFormatter()) # Customize tick labels - - def plot_fatality(self, gdf_m, ax, step, level): + colorbar.xaxis.set_major_formatter( + ticker.ScalarFormatter()) # Customize tick labels + + def plot_fatality(self, gdf_m, ax, step, level) -> None: + """ + This method is used to plot the predictions for the fatalities. It takes the following parameters: + - gdf_m: A geopandas GeoDataFrame containing the predictions and the errors. + - ax: The axes object to which the plot is to be added. + - step: A string representing the step for which the predictions are to be plotted. + - level: A string representing the level for which the predictions are to be plotted. + """ pred_min = gdf_m[step].min() pred_max = gdf_m[step].max() # print(pred_max, pred_min) @@ -119,7 +165,14 @@ def plot_fatality(self, gdf_m, ax, step, level): elif level == 'pgm': self.add_pgm_plot(gdf_m, column, pred_min, pred_max, ax, title) - def plot_absolute_error(self, gdf_m, ax, step, level): + def plot_absolute_error(self, gdf_m, ax, step, level) -> None: + """ + This method is used to plot the absolute error for the predictions. It takes the following parameters: + - gdf_m: A geopandas GeoDataFrame containing the predictions and the errors. + - ax: The axes object to which the plot is to be added. + - step: A string representing the step for which the absolute error is to be plotted. + - level: A string representing the level for which the absolute error is to be plotted. + """ gdf_m['absolute_error'] = abs(gdf_m['ged_sb_dep'] - gdf_m[step]) ae_min = gdf_m['absolute_error'].min() ae_max = gdf_m['absolute_error'].max() @@ -131,12 +184,22 @@ def plot_absolute_error(self, gdf_m, ax, step, level): elif level == 'pgm': self.add_pgm_plot(gdf_m, column, ae_min, ae_max, ax, title) - def plot_squared_error(self, gdf_m, ax, step, level, if_log=False): + def plot_squared_error(self, gdf_m, ax, step, level, if_log=False) -> None: + """ + This method is used to plot the squared error for the predictions. It takes the following parameters: + - gdf_m: A geopandas GeoDataFrame containing the predictions and the errors. + - ax: The axes object to which the plot is to be added. + - step: A string representing the step for which the squared error is to be plotted. + - level: A string representing the level for which the squared error is to be plotted. + - if_log: A boolean representing whether the squared error is to be plotted on a logarithmic scale. 
+ """ if if_log: - gdf_m['squared_error'] = np.square(np.log(gdf_m['ged_sb_dep']+1) - np.log(gdf_m[step]+1)) + gdf_m['squared_error'] = np.square( + np.log(gdf_m['ged_sb_dep']+1) - np.log(gdf_m[step]+1)) title = 'Squared Logarithmic Error' else: - gdf_m['squared_error'] = np.square(gdf_m['ged_sb_dep'] - gdf_m[step]) + gdf_m['squared_error'] = np.square( + gdf_m['ged_sb_dep'] - gdf_m[step]) title = 'Squared Error' se_min = gdf_m['squared_error'].min() se_max = gdf_m['squared_error'].max() @@ -145,4 +208,4 @@ def plot_squared_error(self, gdf_m, ax, step, level, if_log=False): if level == 'cm': self.add_cm_plot(gdf_m, column, se_min, se_max, ax, title) elif level == 'pgm': - self.add_pgm_plot(gdf_m, column, se_min, se_max, ax, title) \ No newline at end of file + self.add_pgm_plot(gdf_m, column, se_min, se_max, ax, title) From b3f5bdb90a5eef2faf6f543f425d4358eee0eed7 Mon Sep 17 00:00:00 2001 From: nskazmi <142994595+nskazmi@users.noreply.github.com> Date: Fri, 2 Feb 2024 10:47:09 +0100 Subject: [PATCH 12/13] added docstrings --- util/utils.py | 298 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 213 insertions(+), 85 deletions(-) diff --git a/util/utils.py b/util/utils.py index d9b0946..7889f71 100644 --- a/util/utils.py +++ b/util/utils.py @@ -1,27 +1,28 @@ -import os +from util.new_metrics import * +from util.utils_map import GeoPlotter +from dataloader.ViewsEstimators import * +from dataloader.FetchData import * +from views_forecasts.extensions import * +from views_runs.run_result import RunResult +from views_partitioning.data_partitioner import DataPartitioner +from views_runs import storage, StepshiftedModels +from diskcache import Cache import wandb import copy from pathlib import Path -import warnings - from sklearn.metrics import mean_squared_error -warnings.filterwarnings("ignore") -os.environ['WANDB_SILENT'] = 'true' -from diskcache import Cache -cache = Cache('./cache', size_limit=100000000000) - -from views_runs import storage, StepshiftedModels -from views_partitioning.data_partitioner import DataPartitioner -from views_runs.run_result import RunResult -from views_forecasts.extensions import * -from dataloader.FetchData import * -from ViewsEstimators import * -from util.utils_map import GeoPlotter -from util.new_metrics import * - def fetch_data(level: str) -> (Tuple[List[Queryset], List[Dict[str, pd.DataFrame]]]): + """ + This function fetches the query sets and datasets from the database based on the level of the data (cm or pgm). + The function uses the following global variables: + - level: A string indicating the level of the data (cm or pgm). + - ReturnQsList: A function that returns a list of query sets based on the level of the data. + - fetch_cm_data_from_model_def: A function that fetches the datasets from the database based on the query sets. + - fetch_pgm_data_from_model_def: A function that fetches the datasets from the database based on the query sets. + The function returns a tuple containing the query sets and datasets. + """ print('Fetching query sets') qslist = ReturnQsList(level) print('Fetching datasets') @@ -31,34 +32,93 @@ def fetch_data(level: str) -> (Tuple[List[Queryset], List[Dict[str, pd.DataFrame Datasets = fetch_pgm_data_from_model_def(qslist) return qslist, Datasets + +# 100GB cache. Change this to a smaller value if you have less space. 
+cache = Cache('./cache', size_limit=100000000000) + + +# Cache the result of this function @cache.memoize(typed=True, expire=None, tag='data') -def i_fetch_data(level): +def i_fetch_data(level) -> (Tuple[List[Queryset], List[Dict[str, pd.DataFrame]]]): + """ + This function fetches the query sets and datasets from the database based on the level of the data (cm or pgm). + The function uses the following global variables: + - level: A string indicating the level of the data (cm or pgm). + - ReturnQsList: A function that returns a list of query sets based on the level of the data. + - fetch_cm_data_from_model_def: A function that fetches the datasets from the database based on the query sets. + - fetch_pgm_data_from_model_def: A function that fetches the datasets from the database based on the query sets. + The function returns a tuple containing the query sets and datasets. + """ return fetch_data(level) -def standardize_raw_data(df): +def standardize_raw_data(df) -> pd.DataFrame: + """ + This function standardizes the raw data. + The function uses the following global variables: + - df: A pandas DataFrame containing the raw data. + The function returns a pandas DataFrame containing the standardized data. + """ mean_val = df.mean() std_val = df.std() standardized_df = (df - mean_val) / std_val - + return standardized_df -def normalize_raw_data(df, b, a): + +def normalize_raw_data(df, b, a) -> pd.DataFrame: + """ + This function normalizes the raw data. + The function uses the following global variables: + - df: A pandas DataFrame containing the raw data. + - b: A float indicating the maximum value of the normalized data. + - a: A float indicating the minimum value of the normalized data. + The function returns a pandas DataFrame containing the normalized data. + """ x_min = df.min() x_max = df.max() x_norm = (b - a) * (df - x_min) / (x_max - x_min) + a return x_norm -def normalize_retransform(x, min_val, max_val, b=1, a=0): +def normalize_retransform(x, min_val, max_val, b=1, a=0) -> float: + """ + This function retransforms the normalized data. + The function uses the following global variables: + - x: A float indicating the normalized value. + - min_val: A float indicating the minimum value of the normalized data. + - max_val: A float indicating the maximum value of the normalized data. + - b: A float indicating the maximum value of the retransformed data. + - a: A float indicating the minimum value of the retransformed data. + The function returns a float indicating the retransformed value. + """ return (x - a) / (b - a) * (max_val - min_val) + min_val -def standardize_retransform(x, mean_val, std_val): +def standardize_retransform(x, mean_val, std_val) -> float: + """ + This function retransforms the standardized data. + The function uses the following global variables: + - x: A float indicating the standardized value. + - mean_val: A float indicating the mean value of the standardized data. + - std_val: A float indicating the standard deviation of the standardized data. + The function returns a float indicating the retransformed value. + """ return x * std_val + mean_val -def transform_data(Datasets, transforms, level, b=1, a=0, by_group=True): +def transform_data(Datasets, transforms, level, b=1, a=0, by_group=True) -> (Dict[str, List[Dict[str, pd.DataFrame]]], Dict[str, Dict[str, pd.DataFrame]]): + """ + This function transforms the data based on the specified transformations. 
+ The function uses the following global variables: + - Datasets: A list of dictionaries containing the datasets. + - transforms: A list of strings indicating the transformations to be applied to the data. + - level: A string indicating the level of the data (cm or pgm). + - b: A float indicating the maximum value of the normalized data. + - a: A float indicating the minimum value of the normalized data. + - by_group: A boolean indicating whether the data should be transformed by group. + The function returns a tuple containing a dictionary of transformed datasets and a dictionary of transformed parameters. + """ Datasets_transformed = {} para_transformed = {} @@ -75,7 +135,8 @@ def transform_data(Datasets, transforms, level, b=1, a=0, by_group=True): elif transform == 'log': for dataset in Datasets_copy: - dataset['df']['ged_sb_dep'] = np.log(dataset['df']['ged_sb_dep'] + 1) + dataset['df']['ged_sb_dep'] = np.log( + dataset['df']['ged_sb_dep'] + 1) Datasets_transformed[transform] = Datasets_copy para_transformed[transform] = None @@ -83,18 +144,22 @@ def transform_data(Datasets, transforms, level, b=1, a=0, by_group=True): dict_max_min = {} for dataset in Datasets_copy: if by_group: - min_values = dataset['df'].groupby(level=target)['ged_sb_dep'].min() - max_values = dataset['df'].groupby(level=target)['ged_sb_dep'].max() + min_values = dataset['df'].groupby( + level=target)['ged_sb_dep'].min() + max_values = dataset['df'].groupby( + level=target)['ged_sb_dep'].max() - dict_max_min[dataset['Name']] = pd.DataFrame({'min_val': min_values, 'max_val': max_values}) + dict_max_min[dataset['Name']] = pd.DataFrame( + {'min_val': min_values, 'max_val': max_values}) else: min_values = dataset['df']['ged_sb_dep'].min() max_values = dataset['df']['ged_sb_dep'].max() - dict_max_min[dataset['Name']] = pd.DataFrame({'min_val': [min_values], 'max_val': [max_values]}) + dict_max_min[dataset['Name']] = pd.DataFrame( + {'min_val': [min_values], 'max_val': [max_values]}) dataset['df']['ged_sb_dep'] = (b - a) * (dataset['df']['ged_sb_dep'] - min_values) / ( - max_values - min_values) + a + max_values - min_values) + a dataset['df']['ged_sb_dep'].fillna(0, inplace=True) Datasets_transformed[transform] = Datasets_copy para_transformed[transform] = dict_max_min @@ -103,53 +168,85 @@ def transform_data(Datasets, transforms, level, b=1, a=0, by_group=True): dict_mean_std = {} for dataset in Datasets_copy: if by_group: - mean_values = dataset['df'].groupby(level=target)['ged_sb_dep'].mean() - std_values = dataset['df'].groupby(level=target)['ged_sb_dep'].std() - dict_mean_std[dataset['Name']] = pd.DataFrame({'mean_val': mean_values, 'std_val': std_values}) + mean_values = dataset['df'].groupby( + level=target)['ged_sb_dep'].mean() + std_values = dataset['df'].groupby( + level=target)['ged_sb_dep'].std() + dict_mean_std[dataset['Name']] = pd.DataFrame( + {'mean_val': mean_values, 'std_val': std_values}) else: mean_values = dataset['df']['ged_sb_dep'].mean() std_values = dataset['df']['ged_sb_dep'].std() - dict_mean_std[dataset['Name']] = pd.DataFrame({'mean_val': [mean_values], 'std_val': [std_values]}) + dict_mean_std[dataset['Name']] = pd.DataFrame( + {'mean_val': [mean_values], 'std_val': [std_values]}) - dataset['df']['ged_sb_dep'] = (dataset['df']['ged_sb_dep'] - mean_values) / std_values + dataset['df']['ged_sb_dep'] = ( + dataset['df']['ged_sb_dep'] - mean_values) / std_values dataset['df']['ged_sb_dep'].fillna(0, inplace=True) Datasets_transformed[transform] = Datasets_copy para_transformed[transform] = 
dict_mean_std else: - raise ValueError("Wrong transformation, only support 'raw', 'log', 'normalize', 'standardize'.") + raise ValueError( + "Wrong transformation, only support 'raw', 'log', 'normalize', 'standardize'.") return Datasets_transformed, para_transformed def get_config_path(config_path: Path) -> Path: + """ + This function gets the configuration path. + The function uses the following global variables: + - config_path: A Path object indicating the path to the configuration directory. + The function returns a tuple containing the paths to the common configuration file, the wandb configuration file, the model configuration directory, and the sweep configuration directory. + """ common_config_path = config_path / 'common_config.py' wandb_config_path = config_path / 'wandb_config.py' model_config_path = config_path / 'model_config' sweep_config_path = config_path / 'sweep_config' if not common_config_path.is_file(): - raise FileNotFoundError(f'The common configuration file {common_config_path} does not exist.') + raise FileNotFoundError( + f'The common configuration file {common_config_path} does not exist.') if not wandb_config_path.is_file(): - raise FileNotFoundError(f'The wandb configuration file {wandb_config_path} does not exist.') + raise FileNotFoundError( + f'The wandb configuration file {wandb_config_path} does not exist.') if not model_config_path.exists() or not model_config_path.is_dir(): - raise FileNotFoundError(f'The directory {model_config_path} does not exist or is not a directory.') + raise FileNotFoundError( + f'The directory {model_config_path} does not exist or is not a directory.') if not sweep_config_path.exists() or not sweep_config_path.is_dir(): - raise FileNotFoundError(f'The directory {sweep_config_path} does not exist or is not a directory.') + raise FileNotFoundError( + f'The directory {sweep_config_path} does not exist or is not a directory.') return common_config_path, wandb_config_path, model_config_path, sweep_config_path def get_config_from_path(path: Path, config_name: str) -> Dict: + """ + This function gets the configuration from the specified path. + The function uses the following global variables: + - path: A Path object indicating the path to the configuration file. + - config_name: A string indicating the name of the configuration. + The function returns a dictionary containing the configuration. + """ if config_name not in ['common', 'wandb', 'sweep', 'model']: - raise ValueError("Wrong configuration name, only support 'common', 'wandb', 'sweep', 'model'.") + raise ValueError( + "Wrong configuration name, only support 'common', 'wandb', 'sweep', 'model'.") config = {} exec(path.read_text(), config) config_name = config_name + '_config' return config[config_name] -def steps_evaluation(df, steps, transform='raw'): +def steps_evaluation(df, steps, transform='raw') -> None: + """ + This function evaluates the steps of the model. + The function uses the following global variables: + - df: A pandas DataFrame containing the data. + - steps: A list of integers indicating the steps to be evaluated. + - transform: A string indicating the transformation to be applied to the data. + The function returns None. 
+ """ for step_number in steps: wandb.log({f'mse_{transform}_step_{step_number}': mean_squared_error( df['ged_sb_dep'], df[f'step_pred_{step_number}'])}) @@ -164,11 +261,18 @@ def steps_evaluation(df, steps, transform='raw'): df['ged_sb_dep'], df[f'step_pred_{step_number}'])}) -def metrics_evaluation(df, pred_cols, transform='raw'): +def metrics_evaluation(df, pred_cols, transform='raw') -> None: + """ + This function evaluates the metrics of the model. + The function uses the following global variables: + - df: A pandas DataFrame containing the data. + - pred_cols: A list of strings indicating the columns containing the predictions. + - transform: A string indicating the transformation to be applied to the data. + The function returns None. + """ df['mse'] = df.apply(lambda row: mean_squared_error([row['ged_sb_dep']] * 36, [row[col] for col in pred_cols]), axis=1) - df['tloss'] = df.apply(lambda row: tweedie_loss([row['ged_sb_dep']] * 36, [row[col] for col in pred_cols], pow=1.5, eps=np.exp(-100)), axis=1) df['kld'] = df.apply(lambda row: kl_divergence([row['ged_sb_dep']] * 36, @@ -191,10 +295,21 @@ def metrics_evaluation(df, pred_cols, transform='raw'): jend_{transform}: {df["jend"].mean()}''') -def retransform_data(df, transform, para_transformed, by_group=True, b=1, a=0): +def retransform_data(df, transform, para_transformed, by_group=True, b=1, a=0) -> pd.DataFrame: + """ + This function retransforms the data based on the specified transformation. + The function uses the following global variables: + - df: A pandas DataFrame containing the data. + - transform: A string indicating the transformation to be applied to the data. + - para_transformed: A dictionary containing the transformed parameters. + - by_group: A boolean indicating whether the data should be retransformed by group. + - b: A float indicating the maximum value of the normalized data. + - a: A float indicating the minimum value of the normalized data. + The function returns a pandas DataFrame containing the retransformed data. + """ if transform == 'log': df = np.exp(df) - 1 - + elif transform == 'normalize': df_para_model = para_transformed[transform][wandb.config['data_train']] if by_group: @@ -208,9 +323,9 @@ def retransform_data(df, transform, para_transformed, by_group=True, b=1, a=0): elif transform == 'standardize': df_para_model = para_transformed[transform][wandb.config['data_train']] if by_group: - df = df.apply(lambda row: standardize_retransform(row, - df_para_model['mean_val'].loc[row.name[1]], - df_para_model['std_val'].loc[row.name[1]]), axis=1) + df = df.apply(lambda row: standardize_retransform(row, + df_para_model['mean_val'].loc[row.name[1]], + df_para_model['std_val'].loc[row.name[1]]), axis=1) else: mean = df_para_model['mean'].iloc[0] std = df_para_model['std'].iloc[0] @@ -218,7 +333,14 @@ def retransform_data(df, transform, para_transformed, by_group=True, b=1, a=0): return df -def retrain_transformed_sweep(Datasets_transformed, model_paras): +def retrain_transformed_sweep(Datasets_transformed, model_paras) -> None: + """ + This function retrains the model with the transformed datasets. + The function uses the following global variables: + - Datasets_transformed: A dictionary containing the transformed datasets. + - model_paras: A list of strings indicating the model parameters. + The function returns None. 
+ """ modelstore = storage.Storage() run_id = wandb.config['run_id'] steps = wandb.config['steps'] @@ -246,8 +368,10 @@ def retrain_transformed_sweep(Datasets_transformed, model_paras): retrain=force_retrain, store=modelstore, partitioner=DataPartitioner({"calib": calib_partitioner_dict}), - stepshifted_models=StepshiftedModels(model, steps, wandb.config['depvar']), - dataset=RetrieveFromList(Datasets_transformed[transform], wandb.config['data_train']), + stepshifted_models=StepshiftedModels( + model, steps, wandb.config['depvar']), + dataset=RetrieveFromList( + Datasets_transformed[transform], wandb.config['data_train']), queryset_name=wandb.config['queryset'], partition_name="calib", timespan_name="train", @@ -261,18 +385,23 @@ def retrain_transformed_sweep(Datasets_transformed, model_paras): predictions_calib = pd.DataFrame.forecasts.read_store(run=run_id, name=wandb.config[f'predstore_calib_{transform}']) except KeyError: - print(wandb.config[f'predstore_calib_{transform}'], ', run', run_id, 'does not exist, predicting') - predictions_calib = RunResult_calib.run.predict("calib", "predict", RunResult_calib.data) + print(wandb.config[f'predstore_calib_{transform}'], + ', run', run_id, 'does not exist, predicting') + predictions_calib = RunResult_calib.run.predict( + "calib", "predict", RunResult_calib.data) predictions_calib.forecasts.set_run(run_id) - predictions_calib.forecasts.to_store(name=wandb.config[f'predstore_calib_{transform}']) + predictions_calib.forecasts.to_store( + name=wandb.config[f'predstore_calib_{transform}']) print(f'Test partition ({transform})') RunResult_test = RunResult.retrain_or_retrieve( retrain=force_retrain, store=modelstore, partitioner=DataPartitioner({"test": test_partitioner_dict}), - stepshifted_models=StepshiftedModels(model, steps, wandb.config['depvar']), - dataset=RetrieveFromList(Datasets_transformed[transform], wandb.config['data_train']), + stepshifted_models=StepshiftedModels( + model, steps, wandb.config['depvar']), + dataset=RetrieveFromList( + Datasets_transformed[transform], wandb.config['data_train']), queryset_name=wandb.config['queryset'], partition_name="test", timespan_name="train", @@ -286,19 +415,25 @@ def retrain_transformed_sweep(Datasets_transformed, model_paras): predictions_test = pd.DataFrame.forecasts.read_store(run=run_id, name=wandb.config[f'predstore_test_{transform}']) except KeyError: - print(wandb.config[f'predstore_test_{transform}'], ', run', run_id, 'does not exist, predicting') - predictions_test = RunResult_test.run.predict("test", "predict", RunResult_test.data) + print(wandb.config[f'predstore_test_{transform}'], + ', run', run_id, 'does not exist, predicting') + predictions_test = RunResult_test.run.predict( + "test", "predict", RunResult_test.data) predictions_test.forecasts.set_run(run_id) - predictions_test.forecasts.to_store(name=wandb.config[f'predstore_test_{transform}']) + predictions_test.forecasts.to_store( + name=wandb.config[f'predstore_test_{transform}']) -def evaluate(target, para_transformed, retransform=True, by_group=False, b=1, a=0, plot_map=False): - ''' - :param target: 'calib' or 'test' - :param para_transformed: the dict that is generated by transform_data - :param retransform: transform the data back if True - :param retransform_by_group: transform the data back by country_id if True. Make sure it is the same value in transform_data - ''' +def evaluate(target, para_transformed, retransform=True, by_group=False, b=1, a=0, plot_map=False) -> None: + """ + This function evaluates the model. 
+ The function uses the following global variables: + - target: A string indicating the target of the evaluation (calib or test). + - para_transformed: A dictionary containing the transformed parameters. + - retransform: A boolean indicating whether the data should be retransformed. + - retransform_by_group: A boolean indicating whether the data should be retransformed by group. + The function returns None. + """ print(f'Evaluating model {wandb.config["modelname"]}') if target not in ['calib', 'test']: @@ -314,15 +449,17 @@ def evaluate(target, para_transformed, retransform=True, by_group=False, b=1, a= stepcols.append('step_pred_' + str(step)) pred_cols = [f'step_pred_{str(i)}' for i in steps] - step_eval = [1, 3, 6, 9, 12, 36] # target steps for evaluation + step_eval = [1, 3, 6, 9, 12, 36] # target steps for evaluation name = wandb.config[f'predstore_{target}_{transform}'] - df = pd.DataFrame.forecasts.read_store(run=run_id, name=name).replace([np.inf, -np.inf], 0)[stepcols] + df = pd.DataFrame.forecasts.read_store(run=run_id, name=name).replace([ + np.inf, -np.inf], 0)[stepcols] # Retransform the data (save for HH's idea) - df_retransform = retransform_data(df, transform, para_transformed, by_group, b, a) - - # raw outcomes evaluation + df_retransform = retransform_data( + df, transform, para_transformed, by_group, b, a) + + # raw outcomes evaluation if retransform: df = df_retransform.copy(deep=True) @@ -345,12 +482,9 @@ def evaluate(target, para_transformed, retransform=True, by_group=False, b=1, a= print('mse', df['mse'].mean()) wandb.log({'mse': df['mse'].mean()}) - # df.to_csv('raw.csv') + # As requested by HH, after retransforming the data back, we need to transform the predictions again using all the transformations, + # i.e., raw, log, normalize, standardize. Then we evaluate the metrics and steps again. - """ - As requested by HH, after retransforming the data back, we need to transform the predictions again using all the transformations, - i.e., raw, log, normalize, standardize. 
- """ # ++++++++++++++++++++ Log df = df_retransform.copy(deep=True) df = np.log(df+1) @@ -359,22 +493,16 @@ def evaluate(target, para_transformed, retransform=True, by_group=False, b=1, a= metrics_evaluation(df, pred_cols, transform='log') steps_evaluation(df, step_eval, transform='log') - # df.to_csv('log.csv') - # ++++++++++++++++++++ Standardize - df = df_retransform.copy(deep=True) + df = df_retransform.copy(deep=True) df = standardize_raw_data(df) metrics_evaluation(df, pred_cols, transform='standardize') steps_evaluation(df, step_eval, transform='standardize') - # df.to_csv('stan.csv') - - #++++++++++++++++++++ Normalize + # ++++++++++++++++++++ Normalize df = df_retransform.copy(deep=True) df = normalize_raw_data(df, b, a) metrics_evaluation(df, pred_cols, transform='normalize') steps_evaluation(df, step_eval, transform='normalize') - - # df.to_csv('norm.csv') From 612f738b3c48fc8f0ba884e3e093d514e803308b Mon Sep 17 00:00:00 2001 From: nskazmi <142994595+nskazmi@users.noreply.github.com> Date: Fri, 2 Feb 2024 13:48:57 +0100 Subject: [PATCH 13/13] some refactoring of imports --- dataloader/ModelDefinitions.py | 2 +- util/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dataloader/ModelDefinitions.py b/dataloader/ModelDefinitions.py index 3644317..ee523ae 100644 --- a/dataloader/ModelDefinitions.py +++ b/dataloader/ModelDefinitions.py @@ -1,4 +1,4 @@ -from dataloader.ViewsEstimators import * +from ViewsEstimators import * class FixedFirstSplitRegression(BaseEstimator): diff --git a/util/utils.py b/util/utils.py index 7889f71..e2da9e2 100644 --- a/util/utils.py +++ b/util/utils.py @@ -1,6 +1,6 @@ +from ViewsEstimators import * from util.new_metrics import * from util.utils_map import GeoPlotter -from dataloader.ViewsEstimators import * from dataloader.FetchData import * from views_forecasts.extensions import * from views_runs.run_result import RunResult