12 changes: 12 additions & 0 deletions verticapy/machine_learning/vertica/base.py
@@ -2562,6 +2562,18 @@ def _get_features_importance(self, tree_id: Optional[int] = None) -> np.ndarray:
self._compute_features_importance(tree_id=tree_id)
return self._get_features_importance(tree_id=tree_id)

def _get_tree_classes(self, tree_arrays: list) -> np.ndarray:
"""
Extracts unique class labels from the output of _compute_trees_arrays.
Returns a sorted numpy array of unique class labels.
"""
value_array = tree_arrays[4]
unique_values = set()
for val in value_array:
if not isinstance(val, NoneType):
unique_values.add(val)
return np.array(sorted(unique_values))

def features_importance(
self,
tree_id: Optional[int] = None,
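For reference, a minimal sketch of what the new helper does, assuming (as the added code implies) that index 4 of the _compute_trees_arrays output holds the per-node prediction values, with None entries for internal nodes:

import numpy as np

# Hypothetical tree-arrays output: only index 4 matters here; internal
# (non-leaf) nodes carry None, leaves carry the predicted class label.
tree_arrays = [None, None, None, None, [None, "0", None, "1", "0"]]

value_array = tree_arrays[4]
unique_values = {val for val in value_array if val is not None}
print(np.array(sorted(unique_values)))  # -> ['0' '1']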
46 changes: 40 additions & 6 deletions verticapy/machine_learning/vertica/ensemble.py
@@ -28,6 +28,7 @@
SQLColumns,
SQLRelation,
)
from verticapy.errors import ModelError, QueryError
from verticapy._utils._gen import gen_name
from verticapy._utils._sql._collect import save_verticapy_logs
from verticapy._utils._sql._format import clean_query, format_type, quote_ident
@@ -2698,6 +2699,24 @@ def _compute_attributes(self) -> None:
except MissingRelation:
self.classes_ = np.array([])

        # If classes are empty, infer them from the first tree's values.
        if len(self.classes_) == 0:
            try:
                first_tree = self._compute_trees_arrays(
                    self.get_tree(0), self.X, True
                )
                inferred_classes = self._get_tree_classes(first_tree)
            except Exception as e:
                raise ModelError(
                    f"Failed to infer classes from model structure: {e}. "
                    "This may indicate the model is corrupted or incompatible."
                ) from e
            if len(inferred_classes) == 0:
                raise ModelError(
                    "Unable to determine classes: no valid values found in the "
                    "tree structure. This may indicate a problem with the model "
                    "or data."
                )
            self.classes_ = inferred_classes

trees = []
for i in range(self.n_estimators_):
tree = self._compute_trees_arrays(self.get_tree(i), self.X, True)
@@ -2720,10 +2739,11 @@ def _compute_attributes(self) -> None:
if str(c) == str(tree_d["value"][j]):
prob[k] = tree[6][j]
break
other_proba = (1 - tree[6][j]) / (n_classes - 1)
for k, p in enumerate(prob):
if p == 0.0:
prob[k] = other_proba
if n_classes > 1:
other_proba = (1 - tree[6][j]) / (n_classes - 1)
for k, p in enumerate(prob):
if p == 0.0:
prob[k] = other_proba
tree_d["value"][j] = prob
model = mm.BinaryTreeClassifier(**tree_d)
trees += [model]
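The new n_classes > 1 guard avoids a ZeroDivisionError when a model ends up with a single class. A minimal standalone sketch of the redistribution logic, with hypothetical values:

# A leaf predicts class "b" with probability 0.7 among classes a/b/c.
classes = ["a", "b", "c"]
leaf_class, leaf_proba = "b", 0.7
prob = [0.0] * len(classes)
prob[classes.index(leaf_class)] = leaf_proba
if len(classes) > 1:  # the guard: skip the division for single-class models
    other_proba = (1 - leaf_proba) / (len(classes) - 1)
    prob = [p if p != 0.0 else other_proba for p in prob]
print(prob)  # [0.15, 0.7, 0.15]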
@@ -3670,11 +3690,25 @@ def _compute_attributes(self) -> None:
)
# Handling NULL Values.
null_ = False
if self.classes_[0] == "":
if len(self.classes_) > 0 and self.classes_[0] == "":
self.classes_ = self.classes_[1:]
null_ = True
if self._is_binary_classifier():
prior = self._compute_prior()
            try:
                prior = self._compute_prior()
            except (MissingRelation, QueryError):
                # Training data unavailable: fall back to the prior stored in
                # the model's initial_prediction attribute.
                try:
                    prior_values = self.get_vertica_attributes(
                        "initial_prediction"
                    )["value"]
                    if null_:
                        prior_values = prior_values[1:]
                    prior = (
                        float(prior_values[1]) if len(prior_values) > 1 else 0.5
                    )
                except Exception:
                    prior = 0.5  # Default fallback.
else:
prior = np.array(
self.get_vertica_attributes("initial_prediction")["value"]
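A sketch of the new prior fallback for binary classifiers, assuming initial_prediction yields one value per class so that index 1 is the positive class; the values below are hypothetical:

# Hypothetical stored priors; a NULL/empty class may have been stripped.
prior_values = [0.35, 0.65]
null_ = False
if null_:
    prior_values = prior_values[1:]
prior = float(prior_values[1]) if len(prior_values) > 1 else 0.5
print(prior)  # 0.65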
130 changes: 130 additions & 0 deletions verticapy/tests_new/machine_learning/vertica/test_model_management.py
@@ -286,6 +286,136 @@ def test_load_model(
py_res, rel=rel_abs_tol_map[model_class]["load_model"]["rel"]
)

class TestModelManagementFromDB:
"""
Focused test class for load_model_from_database functionality
"""

@pytest.mark.parametrize("category", ["vertica"]) # Only test vertica category
@pytest.mark.parametrize(
"model_class",
[
"RandomForestRegressor",
"RandomForestClassifier",
"DecisionTreeRegressor",
"DecisionTreeClassifier",
"DummyTreeRegressor",
# "DummyTreeClassifier", # fail
"XGBRegressor",
"XGBClassifier",
"Ridge",
"Lasso",
"ElasticNet",
"LinearRegression",
"LinearSVR",
"PoissonRegressor",
# "AR", "MA", "ARMA", "ARIMA", # Models were skipped above in the simple load test as well
],
)
def test_load_model_from_database(
self,
get_py_model,
winequality_vpy_fun,
titanic_vd_fun,
model_class,
category,
schema_loader,
):
"""
test function - load_model directly from database storage
Tests the real user workflow: create model with name -> fit -> load by name -> predict
Focused test with reduced parameters for efficiency
"""
# No need to skip - we only have vertica category now

py_model_obj = get_py_model(model_class)

# Create a unique model name for this test
model_name = f"test_load_db_{model_class.lower()}"
full_model_name = f"{schema_loader}.{model_name}"

# Clean up any existing model
vp.drop(name=full_model_name, method="model")

# Step 1: Create and fit a fresh model with a specific name (this saves it to database)
model_class_obj = getattr(
__import__("verticapy.machine_learning.vertica", fromlist=[model_class]),
model_class
)

# Create model with name (this will save it to the database when fitted)
original_model = model_class_obj(name=full_model_name)

# Fit the model with appropriate data and features based on model type
if model_class in [
"RandomForestRegressor",
"DecisionTreeRegressor",
"DummyTreeRegressor",
"XGBRegressor",
"Ridge",
"Lasso",
"ElasticNet",
"LinearRegression",
"LinearSVR",
"PoissonRegressor",
]:
# Regression models - use winequality dataset
            original_model.fit(
                winequality_vpy_fun,
                ["citric_acid", "residual_sugar", "alcohol"],
                "quality",
            )
elif model_class in [
"RandomForestClassifier",
"DecisionTreeClassifier",
"DummyTreeClassifier",
"XGBClassifier",
]:
# Classification models - use titanic dataset
original_model.fit(titanic_vd_fun, ["age", "fare", "sex"], "survived")

# Step 2: Load the model from database using its name (this is what users do)
loaded_model = load_model(name=full_model_name)

# Step 3: Test prediction with loaded model (this is where the bug occurs)
if model_class in [
"RandomForestClassifier",
"DecisionTreeClassifier",
"DummyTreeClassifier",
"XGBClassifier",
]:
# Classification models
pred_vdf = loaded_model.predict(
titanic_vd_fun, ["age", "fare", "sex"], "db_prediction"
)
            # Handle None values in prediction results.
            prediction_values = pred_vdf[["db_prediction"]].to_list()
            valid_predictions = [
                int(row[0]) for row in prediction_values if row[0] is not None
            ]
            vpy_res = np.mean(valid_predictions) if valid_predictions else 0
            py_res = py_model_obj.pred.mean()
else:
# Regression models
pred_vdf = loaded_model.predict(
winequality_vpy_fun,
["citric_acid", "residual_sugar", "alcohol"],
"db_prediction",
)
vpy_res = np.mean(
list(chain(*np.array(pred_vdf[["db_prediction"]].to_list(), dtype=float)))
)
py_res = py_model_obj.pred.mean()

_rel_tol, _abs_tol = calculate_tolerance(vpy_res, py_res)
print(
f"Model_class: {model_class}, Metric_name: load_model_from_database, rel_tol(e): {'%.e' % Decimal(_rel_tol)}, abs_tol(e): {'%.e' % Decimal(_abs_tol)}"
)

assert vpy_res == pytest.approx(
py_res, rel=rel_abs_tol_map[model_class]["load_model"]["rel"]
)

# Clean up
vp.drop(name=full_model_name, method="model")


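Distilled, the workflow this test exercises looks like the sketch below; the dataset, model name, and the load_model import path are assumptions based on VerticaPy's public API, not part of this diff:

import verticapy as vp
from verticapy.datasets import load_titanic
from verticapy.machine_learning.vertica import RandomForestClassifier, load_model

titanic = load_titanic()
model = RandomForestClassifier(name="public.my_rf")  # hypothetical name
model.fit(titanic, ["age", "fare", "sex"], "survived")  # fit saves it in-DB

loaded = load_model(name="public.my_rf")  # load by name, no object needed
loaded.predict(titanic, ["age", "fare", "sex"], "pred")
vp.drop(name="public.my_rf", method="model")  # clean up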
@pytest.mark.parametrize(
"category",