From af94f103537f41bd593e78adb02c1bae15c69587 Mon Sep 17 00:00:00 2001 From: yuvalrhino Date: Sun, 5 Jan 2025 22:21:56 +0200 Subject: [PATCH 1/6] Optimize docker-push.sh * Better default value handling of image_registry argument * Fix issue when image_registry isn't provided --- rhino-utils/docker-push.sh | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/rhino-utils/docker-push.sh b/rhino-utils/docker-push.sh index 889f9af..525e44c 100755 --- a/rhino-utils/docker-push.sh +++ b/rhino-utils/docker-push.sh @@ -71,16 +71,10 @@ fi # Set the container_image_uri based on rhino_domain if [[ "$rhino_domain" == "rhinohealth.com" ]]; then - if [[ "$image_registry" == "" ]]; then - image_registry="913123821419.dkr.ecr.us-east-1.amazonaws.com" - fi - container_image_uri="$image_registry/$image_repo_name:$docker_image_tag" + container_image_uri="${image_registry:-913123821419.dkr.ecr.us-east-1.amazonaws.com}/$image_repo_name:$docker_image_tag" else - if [[ "$image_registry" == "" ]]; then - # In gcp, the image registry is specific to the project (as opposed to AWS where it's all under the infra account). - image_registry="europe-west4-docker.pkg.dev/$gcp_project_id" - fi - container_image_uri="$image_registry/$image_repo_name/images:$docker_image_tag" + # In gcp, the image registry is specific to the project (as opposed to AWS where it's all under the infra account). + container_image_uri="${image_registry:-europe-west4-docker.pkg.dev}/$gcp_project_id/$image_repo_name/images:$docker_image_tag" fi docker_build_base_cmd=(docker build --platform linux/amd64) From 95a637a6dc30c304a40f0ba9dbf8753ff9f601f1 Mon Sep 17 00:00:00 2001 From: Noy-Maimon Date: Thu, 9 Jan 2025 23:45:52 +0200 Subject: [PATCH 2/6] better errors --- .../custom/coeff_optimizer.py | 5 ++- .../custom/regression_trainer.py | 37 +++++++++++++++---- 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/examples/nvflare/regression-glm-coeff/custom/coeff_optimizer.py b/examples/nvflare/regression-glm-coeff/custom/coeff_optimizer.py index d921384..28a5e52 100644 --- a/examples/nvflare/regression-glm-coeff/custom/coeff_optimizer.py +++ b/examples/nvflare/regression-glm-coeff/custom/coeff_optimizer.py @@ -234,6 +234,7 @@ def add(self, data, add_results, contribution_round): add_results["A_sum"] += data["site_ols_params"]['A'] add_results["B_sum"] += data["site_ols_params"]['B'] add_results["combined_hessian"] += data["site_hessian"] + add_results["exog_names"] = data["exog_names"] def get_result(self, add_results, contribution_round, target_accuracy, **kwargs): next_beta = np.linalg.inv(add_results["A_sum"]).dot(add_results["B_sum"]) @@ -246,13 +247,13 @@ def get_result(self, add_results, contribution_round, target_accuracy, **kwargs) # Stop if the result is already accurate enough if np.all(np.greater(target_accuracy, accuracy)): print(f"Reached accuracy threshold") - return None, {"beta": next_beta, "fed_stderror": fed_stderror, "signal": 'ABORT', "Reached accuracy threshold": True} + return None, {"beta": next_beta, "fed_stderror": fed_stderror, "variable_names_by_betas_order": data["exog_names"], "signal": 'ABORT', "Reached accuracy threshold": True} add_results["A_sum"] = 0 add_results["B_sum"] = 0 add_results["combined_hessian"] = 0 print(f"next beta after contribution round {contribution_round} is {next_beta}") - return None, {"site_info": {"params": next_beta}, "beta": next_beta, "fed_stderror": fed_stderror, "Reached accuracy threshold": False} + return None, {"site_info": {"params": next_beta}, "beta": next_beta, "fed_stderror": fed_stderror, "variable_names_by_betas_order": data["exog_names"], "Reached accuracy threshold": False} OPTIMIZERS = {"NR": NewtonRaphson, "IRLS": IRLS} diff --git a/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py b/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py index e149c95..c3580be 100644 --- a/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py +++ b/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py @@ -81,14 +81,6 @@ def __init__( self.site_info = dict() print(f"Initialized Federated Client. {x_values=}, {y_values=}, {formula=}, {glm_type=}, {add_intercept=}, {cast_to_string_fields=}") - if not self.formula and not (self.x_values and self.y_values): - print("Either formula or x_values and y_values must be provided.") - raise ValueError("Either formula or x_values and y_values must be provided.") - - if self.offset and self.family_class != sm.families.Poisson: - print("Offset is only supported for Poisson distribution family.") - raise ValueError("Offset is only supported for Poisson distribution family.") - # Load dataset datasets_path = '/input/datasets' dataset_uid = next(os.walk(datasets_path))[1][0] @@ -111,6 +103,35 @@ def __init__( if self.data_x is not None: self.data_x["Intercept"] = 1 + def _validate_input(self): + """ + Validate the following: # TODO: add + """ + # Validate either formula or explicit columns are supplied + if not self.formula and not (self.x_values and self.y_values): + print("Either formula or x_values and y_values must be provided.") + raise ValueError("Either formula or x_values and y_values must be provided.") + + # Validate formula structure + if formula: + formula = self.formula.rstrip(" ") # TODO: better function for cleaning spaces + dependent_var, independent_vars = formula.split("~") + formula_parts = independent_vars.split("+") + [dependent_var] + else: + formula_parts = self.x_values + self.y_values + missing_parts = [part for part in formula_parts if part not in self.data.columns] + if missing_parts: + raise ValueError( + f"The given {"formula" if formla else "y_values or x values"} contains variables that are missing from the data columns: {missing_parts}") + INVALID_SIGNS = ["/", ">", "<"] # TODO: add problematic signs + if any(sign in part for part in formula_parts for sign in INVALID_SIGNS): + raise ValueError(f"Column headers with the signs {INVALID_SIGNS} are invalid, please modify the dataset columns and the formula.") + + # Validate use of offset + if self.offset and self.family_class != sm.families.Poisson: + print("Offset is only supported for Poisson distribution family.") + raise ValueError("Offset is only supported for Poisson distribution family.") + def handle_event(self, event_type: str, fl_ctx: FLContext): pass From d83572feb7682b0dfad3ac4a9e1094340bdc601e Mon Sep 17 00:00:00 2001 From: Noy-Maimon Date: Sun, 12 Jan 2025 18:20:49 +0200 Subject: [PATCH 3/6] invalid signs --- .../regression-glm-coeff/custom/regression_trainer.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py b/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py index c3580be..671c83e 100644 --- a/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py +++ b/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py @@ -35,6 +35,13 @@ from coeff_optimizer import OPTIMIZERS +INVALID_SIGNS = [ + '<', '>', '=', + '+', '-', '*', '/', '//', '%', '**', + '(', ')', ':', '~', '|', '^', ',', + '.', "'", '"', '@', '#', '$', + '[', ']', '{', '}', '?', '!', ' ' +] # These are signs that will fail the smf.glm function if they appear in the column name class GLMTrainer(Executor): def __init__( @@ -123,9 +130,8 @@ def _validate_input(self): if missing_parts: raise ValueError( f"The given {"formula" if formla else "y_values or x values"} contains variables that are missing from the data columns: {missing_parts}") - INVALID_SIGNS = ["/", ">", "<"] # TODO: add problematic signs if any(sign in part for part in formula_parts for sign in INVALID_SIGNS): - raise ValueError(f"Column headers with the signs {INVALID_SIGNS} are invalid, please modify the dataset columns and the formula.") + raise ValueError(f"Column headers with the signs {INVALID_SIGNS} are invalid, please modify the dataset columns and the model's formula.") # Validate use of offset if self.offset and self.family_class != sm.families.Poisson: From 81d342dc592ed900152b2b6329adddafcd174519 Mon Sep 17 00:00:00 2001 From: Noy-Maimon Date: Sun, 12 Jan 2025 18:24:28 +0200 Subject: [PATCH 4/6] docstring --- .../regression-glm-coeff/custom/regression_trainer.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py b/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py index 671c83e..c46a77e 100644 --- a/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py +++ b/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py @@ -112,7 +112,13 @@ def __init__( def _validate_input(self): """ - Validate the following: # TODO: add + Validate the input parameters for the model, including: + - Ensure that either formula or x_values and y_values are provided. + - Ensure that the supplied variables (either from formula or x_values and y_values) are present in the dataset. + - Ensure that the supplied variables do not contain invalid signs that will cause the model to fail. + - Ensure that the offset is only used with the Poisson distribution family. + + """ # Validate either formula or explicit columns are supplied if not self.formula and not (self.x_values and self.y_values): From 37a69594389a999703730c07fa30f2e6d9779a63 Mon Sep 17 00:00:00 2001 From: Noy-Maimon Date: Sun, 12 Jan 2025 18:26:20 +0200 Subject: [PATCH 5/6] spaces --- .../regression-glm-coeff/custom/regression_trainer.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py b/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py index c46a77e..fdc7426 100644 --- a/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py +++ b/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py @@ -117,8 +117,6 @@ def _validate_input(self): - Ensure that the supplied variables (either from formula or x_values and y_values) are present in the dataset. - Ensure that the supplied variables do not contain invalid signs that will cause the model to fail. - Ensure that the offset is only used with the Poisson distribution family. - - """ # Validate either formula or explicit columns are supplied if not self.formula and not (self.x_values and self.y_values): @@ -126,8 +124,8 @@ def _validate_input(self): raise ValueError("Either formula or x_values and y_values must be provided.") # Validate formula structure - if formula: - formula = self.formula.rstrip(" ") # TODO: better function for cleaning spaces + if self.formula: + formula = self.formula.replace(" ", "") dependent_var, independent_vars = formula.split("~") formula_parts = independent_vars.split("+") + [dependent_var] else: From 07645418ae094fb55751be56eab3961187602e58 Mon Sep 17 00:00:00 2001 From: Noy-Maimon Date: Sun, 12 Jan 2025 18:29:39 +0200 Subject: [PATCH 6/6] docker-push.sh --- rhino-utils/docker-push.sh | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/rhino-utils/docker-push.sh b/rhino-utils/docker-push.sh index 525e44c..889f9af 100755 --- a/rhino-utils/docker-push.sh +++ b/rhino-utils/docker-push.sh @@ -71,10 +71,16 @@ fi # Set the container_image_uri based on rhino_domain if [[ "$rhino_domain" == "rhinohealth.com" ]]; then - container_image_uri="${image_registry:-913123821419.dkr.ecr.us-east-1.amazonaws.com}/$image_repo_name:$docker_image_tag" + if [[ "$image_registry" == "" ]]; then + image_registry="913123821419.dkr.ecr.us-east-1.amazonaws.com" + fi + container_image_uri="$image_registry/$image_repo_name:$docker_image_tag" else - # In gcp, the image registry is specific to the project (as opposed to AWS where it's all under the infra account). - container_image_uri="${image_registry:-europe-west4-docker.pkg.dev}/$gcp_project_id/$image_repo_name/images:$docker_image_tag" + if [[ "$image_registry" == "" ]]; then + # In gcp, the image registry is specific to the project (as opposed to AWS where it's all under the infra account). + image_registry="europe-west4-docker.pkg.dev/$gcp_project_id" + fi + container_image_uri="$image_registry/$image_repo_name/images:$docker_image_tag" fi docker_build_base_cmd=(docker build --platform linux/amd64)