Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions autopeptideml/data/h_param_search/logreg_class.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Hyperparameter search space for LogisticRegression (classification).
# NOTE: scientific-notation numbers are written with an explicit mantissa dot
# (1.0e-3, not 1e-3) because PyYAML's yaml.safe_load — used by the trainer to
# read this file — otherwise parses them as strings, not floats.
penalty:
  type: fixed
  value: l2

solver:
  type: categorical
  values:
    - liblinear
    - lbfgs
    - saga
    - newton-cg

C:
  type: float
  min: 1.0e-3
  max: 1000.0
  log: True

fit_intercept:
  type: categorical
  values:
    - True
    - False

max_iter:
  type: fixed
  value: 1000

tol:
  type: fixed
  value: 1.0e-4
2 changes: 1 addition & 1 deletion autopeptideml/db/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from .negative_sampling import add_negatives_from_db
from .negative_sampling import add_negatives_from_db, setup_databases
28 changes: 28 additions & 0 deletions autopeptideml/db/negative_sampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,34 @@ def _length(input_str: List[str], n_jobs: int,
MATCHING = {'mw': _mw, 'length': _length}


def setup_databases(verbose: bool = True):
    """Download the precompiled negative-sampling databases from Google Drive
    into ``autopeptideml/data/dbs``.

    Fetches three CSV files (``canonical.csv``, ``non-canonical.csv`` and
    ``both.csv``) via :func:`gdown.download`, creating the target directory
    if it does not exist.

    :param verbose: If ``True`` (default), print progress messages and show
        gdown's download progress bar.
    :type verbose: bool
    :raises ImportError: If the optional ``gdown`` dependency is missing.
    """
    try:
        import gdown
    except ImportError:
        raise ImportError("This module requires gdown. Try: `pip install gdown`")

    db_dir = osp.join(osp.dirname(__file__), '..', 'data', 'dbs')
    # exist_ok=True makes a prior isdir() check unnecessary.
    os.makedirs(db_dir, exist_ok=True)

    # NOTE(review): 'canonical.csv' and 'both.csv' share the same Drive file
    # ID — this looks like a copy-paste mistake; confirm the intended ID for
    # the negative ('both.csv') database.
    downloads = [
        ("canonical", 'canonical.csv', "189VtkbQ2bVpQlAe2UMBSzt_O4F7EyBWl"),
        ("non-canonical", 'non-canonical.csv', "1U4RXDNx_aijVDJ1oTaRKjo78Yakd3Mg4"),
        ("negative", 'both.csv', "189VtkbQ2bVpQlAe2UMBSzt_O4F7EyBWl"),
    ]
    for label, filename, file_id in downloads:
        if verbose:
            print(f"Downloading {label} database...")
        # Bug fix: the original passed quiet=verbose, which *silenced* gdown
        # exactly when verbose output was requested.
        gdown.download(id=file_id,
                       output=osp.join(db_dir, filename),
                       quiet=not verbose)


def get_neg_db(target_db: str, verbose: bool, return_path: bool = False) -> pd.DataFrame:
"""
Retrieves a precompiled database of negative samples.
Expand Down
13 changes: 9 additions & 4 deletions autopeptideml/train/architectures.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from skl2onnx import to_onnx


# Registry of supported model identifiers. SKLEARN_MODELS maps to classes
# resolved in load_sklearn_models(); ALL_MODELS additionally includes the
# gradient-boosting libraries with their own bindings.
# (Diff residue removed: the pre-merge SKLEARN_MODELS line duplicated this one
# without 'logreg'/'linreg'.)
SKLEARN_MODELS = ['knn', 'svm', 'rf', 'gradboost', 'logreg', 'linreg']
ALL_MODELS = SKLEARN_MODELS + ['lightgbm', 'xgboost']


Expand All @@ -26,8 +26,11 @@ class OnnxModel:
:type path: str
"""
def __init__(self, path: str):
    """Load an ONNX model for CPU inference.

    :param path: Filesystem path to the serialized ``.onnx`` model.
    :type path: str
    """
    # Diff residue removed: the page pasted both the pre- and post-merge
    # InferenceSession call; this is the post-merge version.
    so = rt.SessionOptions()
    # Suppress warnings and below: 0=verbose, 1=info, 2=warning, 3=error, 4=fatal.
    so.log_severity_level = 3
    self.session = rt.InferenceSession(
        path, providers=['CPUExecutionProvider'],
        sess_options=so
    )

def predict(self, x: np.ndarray):
Expand Down Expand Up @@ -237,13 +240,14 @@ def load_sklearn_models(task: str) -> Dict[str, Callable]:
raise ImportError("This function requires scikit-learn",
"Please try: `pip install scikit-learn`")

from sklearn import (svm, ensemble, neighbors)
from sklearn import (svm, ensemble, neighbors, linear_model)
if 'class' in task:
arch = {
'knn': neighbors.KNeighborsClassifier,
'svm': svm.SVC,
'rf': ensemble.RandomForestClassifier,
'gradboost': ensemble.GradientBoostingClassifier,
'logreg': linear_model.LogisticRegression

}
elif 'reg' in task:
Expand All @@ -252,7 +256,8 @@ def load_sklearn_models(task: str) -> Dict[str, Callable]:
'svm': svm.SVR,
'rf': ensemble.RandomForestRegressor,
'adaboost': ensemble.AdaBoostRegressor,
'gradboost': ensemble.GradientBoostingRegressor
'gradboost': ensemble.GradientBoostingRegressor,
'linreg': linear_model.LinearRegression
}
else:
raise NotImplementedError(
Expand Down
8 changes: 3 additions & 5 deletions autopeptideml/train/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,11 +260,10 @@ def _get_hpspace(self, models: List[str], custom_hpspace: dict) -> dict:
if models is None:
models = ALL_MODELS
for model in models:
config_path = osp.join(config_dir, f'{model}_{self.task}.yml')
hpspace = yaml.safe_load(open(config_path))
if model in custom_hpspace:
hpspace = custom_hpspace[model]
else:
config_path = osp.join(config_dir, f'{model}_{self.task}.yml')
hpspace = yaml.safe_load(open(config_path))
hpspace.update(custom_hpspace[model])
if 'n_jobs' in hpspace:
hpspace['n_jobs'] = {'type': 'fixed', 'value': self.n_jobs}
if 'random_state' in hpspace:
Expand Down Expand Up @@ -417,7 +416,6 @@ def _hpo_step(self, trial) -> dict:
if self.task == 'reg' and h_m['name'] == 'svm':
if 'probability' in h_m['variables']:
del h_m['variables']['probability']

arch = arch(**h_m['variables'])
train_x, train_y = x[h_m['representation']][train_idx], y[train_idx]
arch.fit(train_x, train_y)
Expand Down