diff --git a/autopeptideml/data/h_param_search/logreg_class.yml b/autopeptideml/data/h_param_search/logreg_class.yml
new file mode 100644
index 0000000..9bac92b
--- /dev/null
+++ b/autopeptideml/data/h_param_search/logreg_class.yml
@@ -0,0 +1,31 @@
+penalty:
+  type: fixed
+  value: l2
+
+solver:
+  type: categorical
+  values:
+    - liblinear
+    - lbfgs
+    - saga
+    - newton-cg
+
+C:
+  type: float
+  min: 1e-3
+  max: 1e3
+  log: True
+
+fit_intercept:
+  type: categorical
+  values:
+    - True
+    - False
+
+max_iter:
+  type: fixed
+  value: 1000
+
+tol:
+  type: fixed
+  value: 1e-4
diff --git a/autopeptideml/db/__init__.py b/autopeptideml/db/__init__.py
index cd5ff9d..e40f11c 100644
--- a/autopeptideml/db/__init__.py
+++ b/autopeptideml/db/__init__.py
@@ -1 +1 @@
-from .negative_sampling import add_negatives_from_db
+from .negative_sampling import add_negatives_from_db, setup_databases
diff --git a/autopeptideml/db/negative_sampling.py b/autopeptideml/db/negative_sampling.py
index 2dc0def..5c5e69f 100644
--- a/autopeptideml/db/negative_sampling.py
+++ b/autopeptideml/db/negative_sampling.py
@@ -43,6 +43,34 @@ def _length(input_str: List[str], n_jobs: int,
 MATCHING = {'mw': _mw, 'length': _length}
 
 
+def setup_databases():
+    try:
+        import gdown
+    except ImportError:
+        raise ImportError("This module requires gdown. Try: `pip install gdown`")
+
+    db_dir = osp.join(osp.dirname(__file__), '..', 'data', 'dbs')
+    if not osp.isdir(db_dir):
+        os.makedirs(db_dir, exist_ok=True)
+
+    verbose = True
+
+    print("Downloading canonical database...")
+    path = osp.join(db_dir, 'canonical.csv')
+    FILE_ID = "189VtkbQ2bVpQlAe2UMBSzt_O4F7EyBWl"
+    gdown.download(id=FILE_ID, output=path, quiet=not verbose)
+
+    print("Downloading non-canonical database...")
+    path = osp.join(db_dir, 'non-canonical.csv')
+    FILE_ID = "1U4RXDNx_aijVDJ1oTaRKjo78Yakd3Mg4"
+    gdown.download(id=FILE_ID, output=path, quiet=not verbose)
+
+    print("Downloading negative database...")
+    path = osp.join(db_dir, 'both.csv')
+    FILE_ID = "189VtkbQ2bVpQlAe2UMBSzt_O4F7EyBWl"  # NOTE(review): same ID as canonical.csv -- likely copy-paste error, verify
+    gdown.download(id=FILE_ID, output=path, quiet=not verbose)
+
+
 def get_neg_db(target_db: str, verbose: bool, return_path: bool = False) -> pd.DataFrame:
     """
     Retrieves a precompiled database of negative samples.
diff --git a/autopeptideml/train/architectures.py b/autopeptideml/train/architectures.py
index 0efcde5..29cad8c 100644
--- a/autopeptideml/train/architectures.py
+++ b/autopeptideml/train/architectures.py
@@ -12,7 +12,7 @@ from skl2onnx import to_onnx
 
 
 
-SKLEARN_MODELS = ['knn', 'svm', 'rf', 'gradboost']
+SKLEARN_MODELS = ['knn', 'svm', 'rf', 'gradboost', 'logreg', 'linreg']
 
 ALL_MODELS = SKLEARN_MODELS + ['lightgbm', 'xgboost']
 
@@ -26,8 +26,11 @@ class OnnxModel:
     :type path: str
     """
     def __init__(self, path: str):
+        so = rt.SessionOptions()
+        so.log_severity_level = 3  # 0 = verbose, 1 = info, 2 = warning, 3 = error, 4 = fatal
         self.session = rt.InferenceSession(
-            path, providers=['CPUExecutionProvider']
+            path, providers=['CPUExecutionProvider'],
+            sess_options=so
         )
 
     def predict(self, x: np.ndarray):
@@ -237,13 +240,14 @@ def load_sklearn_models(task: str) -> Dict[str, Callable]:
         raise ImportError("This function requires scikit-learn",
                           "Please try: `pip install scikit-learn`")
 
-    from sklearn import (svm, ensemble, neighbors)
+    from sklearn import (svm, ensemble, neighbors, linear_model)
     if 'class' in task:
         arch = {
             'knn': neighbors.KNeighborsClassifier,
             'svm': svm.SVC,
             'rf': ensemble.RandomForestClassifier,
             'gradboost': ensemble.GradientBoostingClassifier,
+            'logreg': linear_model.LogisticRegression
         }
 
     elif 'reg' in task:
@@ -252,7 +256,8 @@ def load_sklearn_models(task: str) -> Dict[str, Callable]:
             'svm': svm.SVR,
             'rf': ensemble.RandomForestRegressor,
             'adaboost': ensemble.AdaBoostRegressor,
-            'gradboost': ensemble.GradientBoostingRegressor
+            'gradboost': ensemble.GradientBoostingRegressor,
+            'linreg': linear_model.LinearRegression
         }
     else:
         raise NotImplementedError(
diff --git a/autopeptideml/train/trainer.py b/autopeptideml/train/trainer.py
index fa16321..b3b672a 100644
--- a/autopeptideml/train/trainer.py
+++ b/autopeptideml/train/trainer.py
@@ -260,11 +260,10 @@ def _get_hpspace(self, models: List[str], custom_hpspace: dict) -> dict:
         if models is None:
             models = ALL_MODELS
         for model in models:
+            config_path = osp.join(config_dir, f'{model}_{self.task}.yml')
+            hpspace = yaml.safe_load(open(config_path))
             if model in custom_hpspace:
-                hpspace = custom_hpspace[model]
-            else:
-                config_path = osp.join(config_dir, f'{model}_{self.task}.yml')
-                hpspace = yaml.safe_load(open(config_path))
+                hpspace.update(custom_hpspace[model])
             if 'n_jobs' in hpspace:
                 hpspace['n_jobs'] = {'type': 'fixed', 'value': self.n_jobs}
             if 'random_state' in hpspace:
@@ -417,7 +416,6 @@ def _hpo_step(self, trial) -> dict:
         if self.task == 'reg' and h_m['name'] == 'svm':
             if 'probability' in h_m['variables']:
                 del h_m['variables']['probability']
-        arch = arch(**h_m['variables'])
         train_x, train_y = x[h_m['representation']][train_idx], y[train_idx]
         arch.fit(train_x, train_y)