def _get_tree_classes(self, tree_arrays: list) -> np.ndarray:
    """
    Extracts the unique class labels from the output of
    ``_compute_trees_arrays``.

    Parameters
    ----------
    tree_arrays: list
        Tree arrays as produced by ``_compute_trees_arrays``. Only the
        element at index 4 (the value array) is read.

    Returns
    -------
    numpy.array
        Sorted array of the distinct values found in the value array,
        ``None`` entries excluded. The array is empty when no value is
        set.
    """
    # ``None`` entries carry no label (presumably non-leaf nodes - to be
    # confirmed against ``_compute_trees_arrays``), so they are ignored.
    # The identity test avoids depending on a ``NoneType`` name being
    # imported in the module.
    labels = {val for val in tree_arrays[4] if val is not None}
    return np.array(sorted(labels))
+ ) + except Exception as e: + raise ModelError( + f"Failed to infer classes from model structure: {str(e)}. " + "This may indicate the model is corrupted or incompatible." + ) + trees = [] for i in range(self.n_estimators_): tree = self._compute_trees_arrays(self.get_tree(i), self.X, True) @@ -2720,10 +2739,11 @@ def _compute_attributes(self) -> None: if str(c) == str(tree_d["value"][j]): prob[k] = tree[6][j] break - other_proba = (1 - tree[6][j]) / (n_classes - 1) - for k, p in enumerate(prob): - if p == 0.0: - prob[k] = other_proba + if n_classes > 1: + other_proba = (1 - tree[6][j]) / (n_classes - 1) + for k, p in enumerate(prob): + if p == 0.0: + prob[k] = other_proba tree_d["value"][j] = prob model = mm.BinaryTreeClassifier(**tree_d) trees += [model] @@ -3670,11 +3690,25 @@ def _compute_attributes(self) -> None: ) # Handling NULL Values. null_ = False - if self.classes_[0] == "": + if len(self.classes_) > 0 and self.classes_[0] == "": self.classes_ = self.classes_[1:] null_ = True if self._is_binary_classifier(): - prior = self._compute_prior() + try: + prior = self._compute_prior() + except (MissingRelation, QueryError): + # If training data is not available, use default prior from initial_prediction + try: + prior_values = self.get_vertica_attributes( + "initial_prediction" + )["value"] + if null_: + prior_values = prior_values[1:] + prior = ( + np.array(prior_values)[1] if len(prior_values) > 1 else 0.5 + ) + except: + prior = 0.5 # Default fallback else: prior = np.array( self.get_vertica_attributes("initial_prediction")["value"] diff --git a/verticapy/tests_new/machine_learning/vertica/test_model_management.py b/verticapy/tests_new/machine_learning/vertica/test_model_management.py index 7232fee98..aa75315c1 100644 --- a/verticapy/tests_new/machine_learning/vertica/test_model_management.py +++ b/verticapy/tests_new/machine_learning/vertica/test_model_management.py @@ -286,6 +286,136 @@ def test_load_model( py_res, 
class TestModelManagementFromDB:
    """
    Focused test class for load_model_from_database functionality.

    Each case creates a named model, fits it (which stores it in the
    database), reloads it by name with ``load_model`` and checks the
    predictions of the reloaded model against the reference python model.
    """

    # Model classes that take the classification path. "DummyTreeClassifier"
    # is listed although its case is currently disabled in the
    # parametrization, so that re-enabling it needs no other change.
    _CLASSIFIERS = (
        "RandomForestClassifier",
        "DecisionTreeClassifier",
        "DummyTreeClassifier",
        "XGBClassifier",
    )
    # Predictors / response used with the winequality dataset (regression).
    _REGRESSION_X = ("citric_acid", "residual_sugar", "alcohol")
    _REGRESSION_Y = "quality"
    # Predictors / response used with the titanic dataset (classification).
    _CLASSIFICATION_X = ("age", "fare", "sex")
    _CLASSIFICATION_Y = "survived"

    @pytest.mark.parametrize("category", ["vertica"])  # Only test vertica category
    @pytest.mark.parametrize(
        "model_class",
        [
            "RandomForestRegressor",
            "RandomForestClassifier",
            "DecisionTreeRegressor",
            "DecisionTreeClassifier",
            "DummyTreeRegressor",
            # "DummyTreeClassifier",  # fail
            "XGBRegressor",
            "XGBClassifier",
            "Ridge",
            "Lasso",
            "ElasticNet",
            "LinearRegression",
            "LinearSVR",
            "PoissonRegressor",
            # "AR", "MA", "ARMA", "ARIMA",  # Models were skipped above in the simple load test as well
        ],
    )
    def test_load_model_from_database(
        self,
        get_py_model,
        winequality_vpy_fun,
        titanic_vd_fun,
        model_class,
        category,
        schema_loader,
    ):
        """
        test function - load_model directly from database storage
        Tests the real user workflow: create model with name -> fit -> load by name -> predict
        Focused test with reduced parameters for efficiency
        """
        import importlib

        py_model_obj = get_py_model(model_class)
        is_classifier = model_class in self._CLASSIFIERS

        # Create a unique model name for this test
        model_name = f"test_load_db_{model_class.lower()}"
        full_model_name = f"{schema_loader}.{model_name}"

        # Clean up any existing model
        vp.drop(name=full_model_name, method="model")

        try:
            # Step 1: Create and fit a fresh model with a specific name
            # (fitting a named model saves it to the database).
            module = importlib.import_module("verticapy.machine_learning.vertica")
            model_class_obj = getattr(module, model_class)
            original_model = model_class_obj(name=full_model_name)

            if is_classifier:
                # Classification models - use titanic dataset
                original_model.fit(
                    titanic_vd_fun,
                    list(self._CLASSIFICATION_X),
                    self._CLASSIFICATION_Y,
                )
            else:
                # Regression models - use winequality dataset
                original_model.fit(
                    winequality_vpy_fun,
                    list(self._REGRESSION_X),
                    self._REGRESSION_Y,
                )

            # Step 2: Load the model from database using its name (this is what users do)
            loaded_model = load_model(name=full_model_name)

            # Step 3: Test prediction with loaded model
            if is_classifier:
                pred_vdf = loaded_model.predict(
                    titanic_vd_fun, list(self._CLASSIFICATION_X), "db_prediction"
                )
                # Rows for which no prediction was produced come back as None
                # and are left out.
                valid_predictions = [
                    int(row[0])
                    for row in pred_vdf[["db_prediction"]].to_list()
                    if row[0] is not None
                ]
                # Same statistic on both sides: the reference value is a sum,
                # so the loaded model's predictions are summed as well.
                vpy_res = np.sum(valid_predictions) if valid_predictions else 0
                py_res = py_model_obj.pred.sum()
            else:
                pred_vdf = loaded_model.predict(
                    winequality_vpy_fun,
                    list(self._REGRESSION_X),
                    "db_prediction",
                )
                vpy_res = np.mean(
                    list(
                        chain(
                            *np.array(
                                pred_vdf[["db_prediction"]].to_list(), dtype=float
                            )
                        )
                    )
                )
                py_res = py_model_obj.pred.mean()

            _rel_tol, _abs_tol = calculate_tolerance(vpy_res, py_res)
            print(
                f"Model_class: {model_class}, Metric_name: load_model_from_database, rel_tol(e): {'%.e' % Decimal(_rel_tol)}, abs_tol(e): {'%.e' % Decimal(_abs_tol)}"
            )

            assert vpy_res == pytest.approx(
                py_res, rel=rel_abs_tol_map[model_class]["load_model"]["rel"]
            )
        finally:
            # Clean up even when fitting, loading or the assertion fails, so
            # that a failed run does not leave the model in the database.
            vp.drop(name=full_model_name, method="model")