From f8397288be480f8912ed50f31b0230f8d27aa2ac Mon Sep 17 00:00:00 2001 From: Umar Farooq Ghumman <46414488+mail4umar@users.noreply.github.com> Date: Wed, 24 Sep 2025 13:05:30 -0400 Subject: [PATCH 1/6] fixed the case for empty class values --- .../machine_learning/vertica/ensemble.py | 30 ++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/verticapy/machine_learning/vertica/ensemble.py b/verticapy/machine_learning/vertica/ensemble.py index 4dff5409b..78ef8dfc8 100755 --- a/verticapy/machine_learning/vertica/ensemble.py +++ b/verticapy/machine_learning/vertica/ensemble.py @@ -2698,6 +2698,27 @@ def _compute_attributes(self) -> None: except MissingRelation: self.classes_ = np.array([]) + # If classes are empty, infer them from the first tree's values + if len(self.classes_) == 0: + try: + first_tree = self._compute_trees_arrays(self.get_tree(0), self.X, True) + unique_values = set() + for j in range(len(first_tree[4])): # first_tree[4] is the value array + if not isinstance(first_tree[4][j], NoneType): + unique_values.add(first_tree[4][j]) + if unique_values: + self.classes_ = np.array(sorted(unique_values)) + else: + raise ValueError( + "Unable to determine classes: no valid values found in tree structure. " + "This may indicate a problem with the model or data." + ) + except Exception as e: + raise ValueError( + f"Failed to infer classes from model structure: {str(e)}. " + "This may indicate the model is corrupted or incompatible." 
+ ) + trees = [] for i in range(self.n_estimators_): tree = self._compute_trees_arrays(self.get_tree(i), self.X, True) @@ -2720,10 +2741,11 @@ def _compute_attributes(self) -> None: if str(c) == str(tree_d["value"][j]): prob[k] = tree[6][j] break - other_proba = (1 - tree[6][j]) / (n_classes - 1) - for k, p in enumerate(prob): - if p == 0.0: - prob[k] = other_proba + if n_classes > 1: + other_proba = (1 - tree[6][j]) / (n_classes - 1) + for k, p in enumerate(prob): + if p == 0.0: + prob[k] = other_proba tree_d["value"][j] = prob model = mm.BinaryTreeClassifier(**tree_d) trees += [model] From 6b25c6c44915800e4dad4cb2ce333e62e2e45ad6 Mon Sep 17 00:00:00 2001 From: Umar Farooq Ghumman <46414488+mail4umar@users.noreply.github.com> Date: Thu, 25 Sep 2025 11:09:14 -0400 Subject: [PATCH 2/6] updated correct error --- verticapy/machine_learning/vertica/ensemble.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/verticapy/machine_learning/vertica/ensemble.py b/verticapy/machine_learning/vertica/ensemble.py index 78ef8dfc8..96a60858c 100755 --- a/verticapy/machine_learning/vertica/ensemble.py +++ b/verticapy/machine_learning/vertica/ensemble.py @@ -28,6 +28,7 @@ SQLColumns, SQLRelation, ) +from verticapy.errors import ModelError from verticapy._utils._gen import gen_name from verticapy._utils._sql._collect import save_verticapy_logs from verticapy._utils._sql._format import clean_query, format_type, quote_ident @@ -2709,12 +2710,12 @@ def _compute_attributes(self) -> None: if unique_values: self.classes_ = np.array(sorted(unique_values)) else: - raise ValueError( + raise ModelError( "Unable to determine classes: no valid values found in tree structure. " "This may indicate a problem with the model or data." ) except Exception as e: - raise ValueError( + raise ModelError( f"Failed to infer classes from model structure: {str(e)}. " "This may indicate the model is corrupted or incompatible." 
+ ) From ca9da4e8d12fba68d575004ff082ed8bdfe419a7 Mon Sep 17 00:00:00 2001 From: Umar Farooq Ghumman <46414488+mail4umar@users.noreply.github.com> Date: Thu, 25 Sep 2025 11:55:39 -0400 Subject: [PATCH 3/6] added tests --- .../vertica/test_model_management.py | 130 ++++++++++++++++++ 1 file changed, 130 insertions(+) diff --git a/verticapy/tests_new/machine_learning/vertica/test_model_management.py b/verticapy/tests_new/machine_learning/vertica/test_model_management.py index 7232fee98..aa75315c1 100644 --- a/verticapy/tests_new/machine_learning/vertica/test_model_management.py +++ b/verticapy/tests_new/machine_learning/vertica/test_model_management.py @@ -286,6 +286,136 @@ def test_load_model( py_res, rel=rel_abs_tol_map[model_class]["load_model"]["rel"] ) +class TestModelManagementFromDB: + """ + Focused test class for load_model_from_database functionality + """ + + @pytest.mark.parametrize("category", ["vertica"]) # Only test vertica category + @pytest.mark.parametrize( + "model_class", + [ + "RandomForestRegressor", + "RandomForestClassifier", + "DecisionTreeRegressor", + "DecisionTreeClassifier", + "DummyTreeRegressor", + # "DummyTreeClassifier", # fail + "XGBRegressor", + "XGBClassifier", + "Ridge", + "Lasso", + "ElasticNet", + "LinearRegression", + "LinearSVR", + "PoissonRegressor", + # "AR", "MA", "ARMA", "ARIMA", # Models were skipped above in the simple load test as well + ], + ) + def test_load_model_from_database( + self, + get_py_model, + winequality_vpy_fun, + titanic_vd_fun, + model_class, + category, + schema_loader, + ): + """ + test function - load_model directly from database storage + Tests the real user workflow: create model with name -> fit -> load by name -> predict + Focused test with reduced parameters for efficiency + """ + # No need to skip - we only have vertica category now + + py_model_obj = get_py_model(model_class) + + # Create a unique model name for this test + model_name = f"test_load_db_{model_class.lower()}" + full_model_name = 
f"{schema_loader}.{model_name}" + + # Clean up any existing model + vp.drop(name=full_model_name, method="model") + + # Step 1: Create and fit a fresh model with a specific name (this saves it to database) + model_class_obj = getattr( + __import__("verticapy.machine_learning.vertica", fromlist=[model_class]), + model_class + ) + + # Create model with name (this will save it to the database when fitted) + original_model = model_class_obj(name=full_model_name) + + # Fit the model with appropriate data and features based on model type + if model_class in [ + "RandomForestRegressor", + "DecisionTreeRegressor", + "DummyTreeRegressor", + "XGBRegressor", + "Ridge", + "Lasso", + "ElasticNet", + "LinearRegression", + "LinearSVR", + "PoissonRegressor", + ]: + # Regression models - use winequality dataset + original_model.fit(winequality_vpy_fun, ["citric_acid", "residual_sugar", "alcohol"], "quality") + elif model_class in [ + "RandomForestClassifier", + "DecisionTreeClassifier", + "DummyTreeClassifier", + "XGBClassifier", + ]: + # Classification models - use titanic dataset + original_model.fit(titanic_vd_fun, ["age", "fare", "sex"], "survived") + + # Step 2: Load the model from database using its name (this is what users do) + loaded_model = load_model(name=full_model_name) + + # Step 3: Test prediction with loaded model (this is where the bug occurs) + if model_class in [ + "RandomForestClassifier", + "DecisionTreeClassifier", + "DummyTreeClassifier", + "XGBClassifier", + ]: + # Classification models + pred_vdf = loaded_model.predict( + titanic_vd_fun, ["age", "fare", "sex"], "db_prediction" + ) + # Handle None values in prediction results + prediction_values = pred_vdf[["db_prediction"]].to_list() + valid_predictions = [] + for row in prediction_values: + if row[0] is not None: + valid_predictions.append(int(row[0])) + vpy_res = np.mean(valid_predictions) if valid_predictions else 0 + py_res = py_model_obj.pred.sum() + else: + # Regression models + pred_vdf = 
loaded_model.predict( + winequality_vpy_fun, + ["citric_acid", "residual_sugar", "alcohol"], + "db_prediction", + ) + vpy_res = np.mean( + list(chain(*np.array(pred_vdf[["db_prediction"]].to_list(), dtype=float))) + ) + py_res = py_model_obj.pred.mean() + + _rel_tol, _abs_tol = calculate_tolerance(vpy_res, py_res) + print( + f"Model_class: {model_class}, Metric_name: load_model_from_database, rel_tol(e): {'%.e' % Decimal(_rel_tol)}, abs_tol(e): {'%.e' % Decimal(_abs_tol)}" + ) + + assert vpy_res == pytest.approx( + py_res, rel=rel_abs_tol_map[model_class]["load_model"]["rel"] + ) + + # Clean up + vp.drop(name=full_model_name, method="model") + @pytest.mark.parametrize( "category", From 8d524d4e98ad69e37786721d5735807ae61e6f8d Mon Sep 17 00:00:00 2001 From: Umar Farooq Ghumman <46414488+mail4umar@users.noreply.github.com> Date: Thu, 25 Sep 2025 14:21:59 -0400 Subject: [PATCH 4/6] fixed bug for XGBclassifier as well --- verticapy/machine_learning/vertica/ensemble.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/verticapy/machine_learning/vertica/ensemble.py b/verticapy/machine_learning/vertica/ensemble.py index 96a60858c..29a9dcbc2 100755 --- a/verticapy/machine_learning/vertica/ensemble.py +++ b/verticapy/machine_learning/vertica/ensemble.py @@ -3693,11 +3693,21 @@ def _compute_attributes(self) -> None: ) # Handling NULL Values. 
null_ = False - if self.classes_[0] == "": + if len(self.classes_) > 0 and self.classes_[0] == "": self.classes_ = self.classes_[1:] null_ = True if self._is_binary_classifier(): - prior = self._compute_prior() + try: + prior = self._compute_prior() + except (MissingRelation, QueryError): + # If training data is not available, use default prior from initial_prediction + try: + prior_values = self.get_vertica_attributes("initial_prediction")["value"] + if null_: + prior_values = prior_values[1:] + prior = np.array(prior_values)[1] if len(prior_values) > 1 else 0.5 + except: + prior = 0.5 # Default fallback else: prior = np.array( self.get_vertica_attributes("initial_prediction")["value"] @@ -3741,7 +3751,6 @@ def _compute_attributes(self) -> None: model = mm.BinaryTreeClassifier(**tree_d) trees += [model] self.trees_ = trees - # I/O Methods. def to_memmodel(self) -> mm.XGBClassifier: From 9149e6cd49d1949367b055a60b1b49167f60a65d Mon Sep 17 00:00:00 2001 From: Umar Farooq Ghumman <46414488+mail4umar@users.noreply.github.com> Date: Thu, 25 Sep 2025 14:23:34 -0400 Subject: [PATCH 5/6] black --- verticapy/machine_learning/vertica/ensemble.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/verticapy/machine_learning/vertica/ensemble.py b/verticapy/machine_learning/vertica/ensemble.py index 29a9dcbc2..bf3a93dc0 100755 --- a/verticapy/machine_learning/vertica/ensemble.py +++ b/verticapy/machine_learning/vertica/ensemble.py @@ -3702,10 +3702,14 @@ def _compute_attributes(self) -> None: except (MissingRelation, QueryError): # If training data is not available, use default prior from initial_prediction try: - prior_values = self.get_vertica_attributes("initial_prediction")["value"] + prior_values = self.get_vertica_attributes( + "initial_prediction" + )["value"] if null_: prior_values = prior_values[1:] - prior = np.array(prior_values)[1] if len(prior_values) > 1 else 0.5 + prior = ( + np.array(prior_values)[1] if len(prior_values) > 1 else 0.5 + ) 
except: prior = 0.5 # Default fallback else: @@ -3751,6 +3755,7 @@ def _compute_attributes(self) -> None: model = mm.BinaryTreeClassifier(**tree_d) trees += [model] self.trees_ = trees + # I/O Methods. def to_memmodel(self) -> mm.XGBClassifier: From 9426a164c3d47d201f5b39604e1c0acefe198f12 Mon Sep 17 00:00:00 2001 From: Umar Farooq Ghumman <46414488+mail4umar@users.noreply.github.com> Date: Fri, 10 Oct 2025 10:11:15 -0400 Subject: [PATCH 6/6] Added a getter function --- verticapy/machine_learning/vertica/base.py | 12 ++++++++++++ verticapy/machine_learning/vertica/ensemble.py | 9 +++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/verticapy/machine_learning/vertica/base.py b/verticapy/machine_learning/vertica/base.py index c9eb28d10..e9871b4a6 100755 --- a/verticapy/machine_learning/vertica/base.py +++ b/verticapy/machine_learning/vertica/base.py @@ -2562,6 +2562,18 @@ def _get_features_importance(self, tree_id: Optional[int] = None) -> np.ndarray: self._compute_features_importance(tree_id=tree_id) return self._get_features_importance(tree_id=tree_id) + def _get_tree_classes(self, tree_arrays: list) -> np.ndarray: + """ + Extracts unique class labels from the output of _compute_trees_arrays. + Returns a sorted numpy array of unique class labels. 
+ """ + value_array = tree_arrays[4] + unique_values = set() + for val in value_array: + if not isinstance(val, NoneType): + unique_values.add(val) + return np.array(sorted(unique_values)) + def features_importance( self, tree_id: Optional[int] = None, diff --git a/verticapy/machine_learning/vertica/ensemble.py b/verticapy/machine_learning/vertica/ensemble.py index bf3a93dc0..50f855da4 100755 --- a/verticapy/machine_learning/vertica/ensemble.py +++ b/verticapy/machine_learning/vertica/ensemble.py @@ -2703,12 +2703,9 @@ def _compute_attributes(self) -> None: if len(self.classes_) == 0: try: first_tree = self._compute_trees_arrays(self.get_tree(0), self.X, True) - unique_values = set() - for j in range(len(first_tree[4])): # first_tree[4] is the value array - if not isinstance(first_tree[4][j], NoneType): - unique_values.add(first_tree[4][j]) - if unique_values: - self.classes_ = np.array(sorted(unique_values)) + inferred_classes = self._get_tree_classes(first_tree) + if len(inferred_classes) > 0: + self.classes_ = inferred_classes else: raise ModelError( "Unable to determine classes: no valid values found in tree structure. "