From af94f103537f41bd593e78adb02c1bae15c69587 Mon Sep 17 00:00:00 2001
From: yuvalrhino <yuval@rhinohealth.com>
Date: Sun, 5 Jan 2025 22:21:56 +0200
Subject: [PATCH 1/6] Optimize docker-push.sh * Better default value handling
 of image_registry argument * Fix issue when image_registry isn't provided

---
 rhino-utils/docker-push.sh | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/rhino-utils/docker-push.sh b/rhino-utils/docker-push.sh
index 889f9af..525e44c 100755
--- a/rhino-utils/docker-push.sh
+++ b/rhino-utils/docker-push.sh
@@ -71,16 +71,10 @@ fi
 
 # Set the container_image_uri based on rhino_domain
 if [[ "$rhino_domain" == "rhinohealth.com" ]]; then
-  if [[ "$image_registry" == "" ]]; then
-    image_registry="913123821419.dkr.ecr.us-east-1.amazonaws.com"
-  fi
-  container_image_uri="$image_registry/$image_repo_name:$docker_image_tag"
+  container_image_uri="${image_registry:-913123821419.dkr.ecr.us-east-1.amazonaws.com}/$image_repo_name:$docker_image_tag"
 else
-  if [[ "$image_registry" == "" ]]; then
-      # In gcp, the image registry is specific to the project (as opposed to AWS where it's all under the infra account).
-      image_registry="europe-west4-docker.pkg.dev/$gcp_project_id"
-  fi
-  container_image_uri="$image_registry/$image_repo_name/images:$docker_image_tag"
+  # In gcp, the image registry is specific to the project (as opposed to AWS where it's all under the infra account).
+  container_image_uri="${image_registry:-europe-west4-docker.pkg.dev}/$gcp_project_id/$image_repo_name/images:$docker_image_tag"
 fi
 
 docker_build_base_cmd=(docker build --platform linux/amd64)

From 95a637a6dc30c304a40f0ba9dbf8753ff9f601f1 Mon Sep 17 00:00:00 2001
From: Noy-Maimon <noy@rhinohealth.com>
Date: Thu, 9 Jan 2025 23:45:52 +0200
Subject: [PATCH 2/6] better errors

---
 .../custom/coeff_optimizer.py                 |  5 ++-
 .../custom/regression_trainer.py              | 37 +++++++++++++++----
 2 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/examples/nvflare/regression-glm-coeff/custom/coeff_optimizer.py b/examples/nvflare/regression-glm-coeff/custom/coeff_optimizer.py
index d921384..28a5e52 100644
--- a/examples/nvflare/regression-glm-coeff/custom/coeff_optimizer.py
+++ b/examples/nvflare/regression-glm-coeff/custom/coeff_optimizer.py
@@ -234,6 +234,7 @@ def add(self, data, add_results, contribution_round):
         add_results["A_sum"] += data["site_ols_params"]['A']
         add_results["B_sum"] += data["site_ols_params"]['B']
         add_results["combined_hessian"] += data["site_hessian"]
+        add_results["exog_names"] = data["exog_names"]
 
     def get_result(self, add_results, contribution_round, target_accuracy, **kwargs):
         next_beta = np.linalg.inv(add_results["A_sum"]).dot(add_results["B_sum"])
@@ -246,13 +247,13 @@ def get_result(self, add_results, contribution_round, target_accuracy, **kwargs)
         # Stop if the result is already accurate enough
         if np.all(np.greater(target_accuracy, accuracy)):
             print(f"Reached accuracy threshold")
-            return None, {"beta": next_beta, "fed_stderror": fed_stderror, "signal": 'ABORT', "Reached accuracy threshold": True}
+            return None, {"beta": next_beta, "fed_stderror": fed_stderror, "variable_names_by_betas_order": data["exog_names"], "signal": 'ABORT', "Reached accuracy threshold": True}
 
         add_results["A_sum"] = 0
         add_results["B_sum"] = 0
         add_results["combined_hessian"] = 0
         print(f"next beta after contribution round {contribution_round} is {next_beta}")
-        return None, {"site_info": {"params": next_beta}, "beta": next_beta, "fed_stderror": fed_stderror, "Reached accuracy threshold": False}
+        return None, {"site_info": {"params": next_beta}, "beta": next_beta, "fed_stderror": fed_stderror, "variable_names_by_betas_order": data["exog_names"], "Reached accuracy threshold": False}
 
 
 OPTIMIZERS = {"NR": NewtonRaphson, "IRLS": IRLS}
diff --git a/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py b/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py
index e149c95..c3580be 100644
--- a/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py
+++ b/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py
@@ -81,14 +81,6 @@ def __init__(
         self.site_info = dict()
         print(f"Initialized Federated Client. {x_values=}, {y_values=}, {formula=}, {glm_type=}, {add_intercept=}, {cast_to_string_fields=}")
 
-        if not self.formula and not (self.x_values and self.y_values):
-            print("Either formula or x_values and y_values must be provided.")
-            raise ValueError("Either formula or x_values and y_values must be provided.")
-
-        if self.offset and self.family_class != sm.families.Poisson:
-            print("Offset is only supported for Poisson distribution family.")
-            raise ValueError("Offset is only supported for Poisson distribution family.")
-
         # Load dataset
         datasets_path = '/input/datasets'
         dataset_uid = next(os.walk(datasets_path))[1][0]
@@ -111,6 +103,35 @@ def __init__(
             if self.data_x is not None:
                 self.data_x["Intercept"] = 1
 
+    def _validate_input(self):
+        """
+        Validate the following: # TODO: add
+        """
+        # Validate either formula or explicit columns are supplied
+        if not self.formula and not (self.x_values and self.y_values):
+            print("Either formula or x_values and y_values must be provided.")
+            raise ValueError("Either formula or x_values and y_values must be provided.")
+
+        # Validate formula structure
+        if formula:
+            formula = self.formula.rstrip(" ") # TODO: better function for cleaning spaces
+            dependent_var, independent_vars = formula.split("~")
+            formula_parts = independent_vars.split("+") + [dependent_var]
+        else:
+            formula_parts = self.x_values + self.y_values
+        missing_parts = [part for part in formula_parts if part not in self.data.columns]
+        if missing_parts:
+            raise ValueError(
+                f"The given {"formula" if formla else "y_values or x values"} contains variables that are missing from the data columns: {missing_parts}")
+        INVALID_SIGNS = ["/", ">", "<"] # TODO: add problematic signs
+        if any(sign in part for part in formula_parts for sign in INVALID_SIGNS):
+            raise ValueError(f"Column headers with the signs {INVALID_SIGNS} are invalid, please modify the dataset columns and the formula.")
+
+        # Validate use of offset
+        if self.offset and self.family_class != sm.families.Poisson:
+            print("Offset is only supported for Poisson distribution family.")
+            raise ValueError("Offset is only supported for Poisson distribution family.")
+
     def handle_event(self, event_type: str, fl_ctx: FLContext):
         pass
 

From d83572feb7682b0dfad3ac4a9e1094340bdc601e Mon Sep 17 00:00:00 2001
From: Noy-Maimon <noy@rhinohealth.com>
Date: Sun, 12 Jan 2025 18:20:49 +0200
Subject: [PATCH 3/6] invalid signs

---
 .../regression-glm-coeff/custom/regression_trainer.py  | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py b/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py
index c3580be..671c83e 100644
--- a/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py
+++ b/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py
@@ -35,6 +35,13 @@
 
 from coeff_optimizer import OPTIMIZERS
 
+INVALID_SIGNS = [
+    '<', '>', '=',
+    '+', '-', '*', '/', '//', '%', '**',
+    '(', ')', ':', '~', '|', '^', ',',
+    '.', "'", '"', '@', '#', '$',
+    '[', ']', '{', '}', '?', '!', ' '
+]  # These are signs that will fail the smf.glm function if they appear in the column name
 
 class GLMTrainer(Executor):
     def __init__(
@@ -123,9 +130,8 @@ def _validate_input(self):
         if missing_parts:
             raise ValueError(
                 f"The given {"formula" if formla else "y_values or x values"} contains variables that are missing from the data columns: {missing_parts}")
-        INVALID_SIGNS = ["/", ">", "<"] # TODO: add problematic signs
         if any(sign in part for part in formula_parts for sign in INVALID_SIGNS):
-            raise ValueError(f"Column headers with the signs {INVALID_SIGNS} are invalid, please modify the dataset columns and the formula.")
+            raise ValueError(f"Column headers with the signs {INVALID_SIGNS} are invalid, please modify the dataset columns and the model's formula.")
 
         # Validate use of offset
         if self.offset and self.family_class != sm.families.Poisson:

From 81d342dc592ed900152b2b6329adddafcd174519 Mon Sep 17 00:00:00 2001
From: Noy-Maimon <noy@rhinohealth.com>
Date: Sun, 12 Jan 2025 18:24:28 +0200
Subject: [PATCH 4/6] docstring

---
 .../regression-glm-coeff/custom/regression_trainer.py     | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py b/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py
index 671c83e..c46a77e 100644
--- a/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py
+++ b/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py
@@ -112,7 +112,13 @@ def __init__(
 
     def _validate_input(self):
         """
-        Validate the following: # TODO: add
+        Validate the input parameters for the model, including:
+        - Ensure that either formula or x_values and y_values are provided.
+        - Ensure that the supplied variables (either from formula or x_values and y_values) are present in the dataset.
+        - Ensure that the supplied variables do not contain invalid signs that will cause the model to fail.
+        - Ensure that the offset is only used with the Poisson distribution family.
+
+
         """
         # Validate either formula or explicit columns are supplied
         if not self.formula and not (self.x_values and self.y_values):

From 37a69594389a999703730c07fa30f2e6d9779a63 Mon Sep 17 00:00:00 2001
From: Noy-Maimon <noy@rhinohealth.com>
Date: Sun, 12 Jan 2025 18:26:20 +0200
Subject: [PATCH 5/6] spaces

---
 .../regression-glm-coeff/custom/regression_trainer.py       | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py b/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py
index c46a77e..fdc7426 100644
--- a/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py
+++ b/examples/nvflare/regression-glm-coeff/custom/regression_trainer.py
@@ -117,8 +117,6 @@ def _validate_input(self):
         - Ensure that the supplied variables (either from formula or x_values and y_values) are present in the dataset.
         - Ensure that the supplied variables do not contain invalid signs that will cause the model to fail.
         - Ensure that the offset is only used with the Poisson distribution family.
-
-
         """
         # Validate either formula or explicit columns are supplied
         if not self.formula and not (self.x_values and self.y_values):
@@ -126,8 +124,8 @@ def _validate_input(self):
             raise ValueError("Either formula or x_values and y_values must be provided.")
 
         # Validate formula structure
-        if formula:
-            formula = self.formula.rstrip(" ") # TODO: better function for cleaning spaces
+        if self.formula:
+            formula = self.formula.replace(" ", "")
             dependent_var, independent_vars = formula.split("~")
             formula_parts = independent_vars.split("+") + [dependent_var]
         else:

From 07645418ae094fb55751be56eab3961187602e58 Mon Sep 17 00:00:00 2001
From: Noy-Maimon <noy@rhinohealth.com>
Date: Sun, 12 Jan 2025 18:29:39 +0200
Subject: [PATCH 6/6] docker-push.sh

---
 rhino-utils/docker-push.sh | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/rhino-utils/docker-push.sh b/rhino-utils/docker-push.sh
index 525e44c..889f9af 100755
--- a/rhino-utils/docker-push.sh
+++ b/rhino-utils/docker-push.sh
@@ -71,10 +71,16 @@ fi
 
 # Set the container_image_uri based on rhino_domain
 if [[ "$rhino_domain" == "rhinohealth.com" ]]; then
-  container_image_uri="${image_registry:-913123821419.dkr.ecr.us-east-1.amazonaws.com}/$image_repo_name:$docker_image_tag"
+  if [[ "$image_registry" == "" ]]; then
+    image_registry="913123821419.dkr.ecr.us-east-1.amazonaws.com"
+  fi
+  container_image_uri="$image_registry/$image_repo_name:$docker_image_tag"
 else
-  # In gcp, the image registry is specific to the project (as opposed to AWS where it's all under the infra account).
-  container_image_uri="${image_registry:-europe-west4-docker.pkg.dev}/$gcp_project_id/$image_repo_name/images:$docker_image_tag"
+  if [[ "$image_registry" == "" ]]; then
+      # In gcp, the image registry is specific to the project (as opposed to AWS where it's all under the infra account).
+      image_registry="europe-west4-docker.pkg.dev/$gcp_project_id"
+  fi
+  container_image_uri="$image_registry/$image_repo_name/images:$docker_image_tag"
 fi
 
 docker_build_base_cmd=(docker build --platform linux/amd64)