ViewsEstimators.py

from typing import Optional, Union

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.utils.estimator_checks import check_estimator
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from xgboost import XGBRegressor, XGBClassifier
from xgboost import XGBRFRegressor, XGBRFClassifier
from lightgbm import LGBMClassifier, LGBMRegressor


class HurdleRegression(BaseEstimator):
    """ Regression model which handles excessive zeros by fitting a two-part model and combining predictions:
            1) a binary classifier predicting whether the response is zero or positive
            2) a continuous regression model fitted on the positive instances
        Implemented as a valid sklearn estimator, so it can be used in pipelines and grid searches.

        Args:
            clf_name: name of the classifier sub-model (a key of the _resolve_estimator lookup table)
            reg_name: name of the regression sub-model (a key of the _resolve_estimator lookup table)
            clf_params: dict of parameters to pass to the classifier sub-model when initialized
            reg_params: dict of parameters to pass to the regression sub-model when initialized
    """

    def __init__(self,
                 clf_name: str = 'logistic',
                 reg_name: str = 'linear',
                 clf_params: Optional[dict] = None,
                 reg_params: Optional[dict] = None) -> None:
        self.clf_name = clf_name
        self.reg_name = reg_name
        self.clf_params = clf_params
        self.reg_params = reg_params
        self.clf_fi = []
        self.reg_fi = []

    def fit(self,
            X: Union[np.ndarray, pd.DataFrame],
            y: Union[np.ndarray, pd.Series]) -> BaseEstimator:
        """
        Fit the model using the provided features and target.

        Parameters:
        - X (Union[np.ndarray, pd.DataFrame]): Features for training the model.
        - y (Union[np.ndarray, pd.Series]): Target variable.

        Raises:
        - ValueError: If the number of features in X is less than 2.

        Returns:
        - self: The fitted model.

        The `fit` method trains both a classification (`clf_`) and a regression (`reg_`) model.
        The classification model predicts whether y is greater than 0, while the regression
        model is trained only on the instances where y is greater than 0. Feature importances
        for both models are stored in `clf_fi` and `reg_fi`, respectively.

        Note: The provided features and target are checked for validity, and the fitted status is updated.
        """
        X, y = check_X_y(X, y, dtype=None,
                         accept_sparse=False,
                         accept_large_sparse=False,
                         force_all_finite='allow-nan')

        if X.shape[1] < 2:
            raise ValueError('Cannot fit model when n_features = 1')

        # Classification stage: learn P(y > 0).
        self.clf_ = self._resolve_estimator(self.clf_name)
        if self.clf_params:
            self.clf_.set_params(**self.clf_params)
        self.clf_.fit(X, y > 0)
        # Store feature importances when the sub-model exposes them
        # (linear and logistic models expose coef_ instead).
        self.clf_fi = getattr(self.clf_, 'feature_importances_', [])

        # Regression stage: learn E[y | y > 0] on the positive instances only.
        self.reg_ = self._resolve_estimator(self.reg_name)
        if self.reg_params:
            self.reg_.set_params(**self.reg_params)
        self.reg_.fit(X[y > 0], y[y > 0])
        self.reg_fi = getattr(self.reg_, 'feature_importances_', [])

        self.is_fitted_ = True
        return self


    def predict_bck(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """ Predict combined response using binary classification outcome """
        X = check_array(X, accept_sparse=False, accept_large_sparse=False)
        check_is_fitted(self, 'is_fitted_')
        return self.clf_.predict(X) * self.reg_.predict(X)

    def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """ Predict combined response using probabilistic classification outcome """
        X = check_array(X, accept_sparse=False, accept_large_sparse=False)
        check_is_fitted(self, 'is_fitted_')
        return self.clf_.predict_proba(X)[:, 1] * self.reg_.predict(X)

    @staticmethod
    def _resolve_estimator(func_name: str) -> BaseEstimator:
        """ Lookup table for supported estimators.
        This is necessary because sklearn estimator default arguments
        must pass an equality test, and instantiated sub-estimators are not equal. """
        funcs = {'linear': LinearRegression(),
                 'logistic': LogisticRegression(solver='liblinear'),
                 'LGBMRegressor': LGBMRegressor(n_estimators=250),
                 'LGBMClassifier': LGBMClassifier(n_estimators=250),
                 'RFRegressor': XGBRFRegressor(n_estimators=250, n_jobs=-2),
                 'RFClassifier': XGBRFClassifier(n_estimators=250, n_jobs=-2),
                 'GBMRegressor': GradientBoostingRegressor(n_estimators=200),
                 'GBMClassifier': GradientBoostingClassifier(n_estimators=200),
                 'XGBRegressor': XGBRegressor(n_estimators=100, learning_rate=0.05, n_jobs=-2),
                 'XGBClassifier': XGBClassifier(n_estimators=100, learning_rate=0.05, n_jobs=-2),
                 'HGBRegressor': HistGradientBoostingRegressor(max_iter=200),
                 'HGBClassifier': HistGradientBoostingClassifier(max_iter=200),
                 }
        return funcs[func_name]


def manual_test():
    """ Validate estimator using sklearn's provided utility and ensure it can fit and predict on a fake dataset. """
    from sklearn.datasets import make_regression
    check_estimator(HurdleRegression())  # recent sklearn versions expect an instance rather than a class
    X, y = make_regression()
    reg = HurdleRegression()
    reg.fit(X, y)
    reg.predict(X)
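

# Hedged usage sketch (not part of the original module): it shows how clf_params and
# reg_params reach the sub-models. fit() forwards these dicts via set_params() to the
# estimators returned by _resolve_estimator, so any valid keyword of the underlying
# model can be supplied. The parameter values below are illustrative only.
def params_demo() -> HurdleRegression:
    model = HurdleRegression(clf_name='LGBMClassifier',
                             reg_name='LGBMRegressor',
                             clf_params={'n_estimators': 100, 'learning_rate': 0.1},
                             reg_params={'n_estimators': 400})
    return model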
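

# Hedged end-to-end sketch (not part of the original module): it builds a synthetic
# zero-inflated target and shows how predict() composes the two stages as
# clf_.predict_proba(X)[:, 1] * reg_.predict(X), i.e. P(y > 0) * E[y | y > 0].
# Sub-model names come from the _resolve_estimator lookup table; the data is synthetic.
def hurdle_demo() -> None:
    from sklearn.datasets import make_regression

    X, y = make_regression(n_samples=500, n_features=5, noise=10.0, random_state=0)
    y = np.where(y > 0, y, 0.0)  # zero-inflate: clip the negative half to exactly zero

    model = HurdleRegression(clf_name='XGBClassifier', reg_name='XGBRegressor')
    model.fit(X, y)

    expected = model.predict(X)  # P(y > 0) * E[y | y > 0]
    print('share of zeros in y:', float((y == 0).mean()))
    print('first five expected values:', expected[:5])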