From cafa9f5cd0b8eda54def9163640c2c2304c3812d Mon Sep 17 00:00:00 2001 From: Jonas Eschle 'Mayou36 Date: Thu, 15 Nov 2018 12:53:43 +0100 Subject: [PATCH 1/5] Loosen requirements --- requirements.txt | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9b5f5b87..6058fe23 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,18 +1,18 @@ numpy >= 1.10.0 scipy >= 0.16.0 -matplotlib == 1.5.1 -pandas == 0.17.1 -scikit-learn == 0.17.1 -ipykernel == 4.3.1 -notebook == 4.2.1 -bokeh == 0.11.1 -mpld3 == 0.2 -neurolab == 0.3.5 -theano == 0.8.2 -nose == 1.3.7 -nose-parameterized == 0.5.0 -theanets == 0.7.3 -pybrain == 0.3 -xgboost == 0.4a30 -hep_ml == 0.4 -requests == 2.9.1 \ No newline at end of file +matplotlib > 1.3 +pandas > 0.17.1 +scikit-learn > 0.17.1 +ipykernel > 4.3.1 +notebook +bokeh +mpld3 +neurolab +theano +nose +nose-parameterized +theanets +pybrain +xgboost +hep_ml +requests From 37c919e4e93fedeac89780af518bdca24aa84c48 Mon Sep 17 00:00:00 2001 From: Jonas Eschle 'Mayou36 Date: Thu, 15 Nov 2018 14:05:42 +0100 Subject: [PATCH 2/5] Remove some requirements --- requirements.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6058fe23..12352a60 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,12 +7,8 @@ ipykernel > 4.3.1 notebook bokeh mpld3 -neurolab -theano nose nose-parameterized -theanets -pybrain xgboost hep_ml requests From f3129934272b5a7dd7b0a5529266aa8fd6b42e5b Mon Sep 17 00:00:00 2001 From: Jonas Eschle 'Mayou36 Date: Thu, 15 Nov 2018 14:21:01 +0100 Subject: [PATCH 3/5] Remove unnecessary tests --- rep/metaml/folding.py | 2 +- rep/metaml/gridsearch.py | 2 +- tests/test_neurolab.py | 120 -------------------------------------- tests/test_pybrain.py | 110 ----------------------------------- tests/test_theanets.py | 121 --------------------------------------- tests/test_tmva.py | 24 -------- 6 files changed, 2 insertions(+), 377 deletions(-) delete mode 100644 tests/test_neurolab.py delete mode 100644 tests/test_pybrain.py delete mode 100644 tests/test_theanets.py delete mode 100644 tests/test_tmva.py diff --git a/rep/metaml/folding.py b/rep/metaml/folding.py index c7562f65..acddb9c4 100644 --- a/rep/metaml/folding.py +++ b/rep/metaml/folding.py @@ -10,7 +10,7 @@ from six.moves import zip from sklearn import clone -from sklearn.cross_validation import KFold +from sklearn.model_selection import KFold from sklearn.utils import check_random_state from . import utils from .factory import train_estimator diff --git a/rep/metaml/gridsearch.py b/rep/metaml/gridsearch.py index b5a0bfc9..56c99aba 100644 --- a/rep/metaml/gridsearch.py +++ b/rep/metaml/gridsearch.py @@ -88,7 +88,7 @@ import numpy from sklearn.base import clone -from sklearn.cross_validation import StratifiedKFold, KFold +from sklearn.model_selection import StratifiedKFold, KFold from sklearn.ensemble.forest import RandomForestRegressor from sklearn.utils.random import check_random_state diff --git a/tests/test_neurolab.py b/tests/test_neurolab.py deleted file mode 100644 index b1e0af9f..00000000 --- a/tests/test_neurolab.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright 2014-2015 Yandex LLC and contributors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License.
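The rep/metaml hunks above swap sklearn.cross_validation for sklearn.model_selection. That rename happened in scikit-learn 0.18 and the old module was removed in 0.20, but KFold also changed its calling convention, so the import swap alone does not fix the call sites; patch 4 below touches folding.py and gridsearch.py again, presumably for that reason. A sketch of the two interfaces (fold count, seed, and the data X are illustrative):

# scikit-learn < 0.18: the splitter takes the sample count and is itself iterable
from sklearn.cross_validation import KFold
for train_idx, test_idx in KFold(len(X), n_folds=3, shuffle=True, random_state=11):
    pass

# scikit-learn >= 0.18: the splitter takes n_splits and yields folds via .split()
from sklearn.model_selection import KFold
for train_idx, test_idx in KFold(n_splits=3, shuffle=True, random_state=11).split(X):
    pass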
-# You may obtain a copy of the License at -# -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from __future__ import division, print_function, absolute_import -from rep.test.test_estimators import check_classifier, check_regression, generate_classification_data, \ - check_params, check_classification_reproducibility -from sklearn.ensemble import BaggingClassifier -from rep.estimators.sklearn import SklearnClassifier -from rep.estimators.neurolab import NeurolabClassifier, NeurolabRegressor -import rep -import neurolab as nl - -__author__ = 'Sterzhanov Vladislav' - -N_EPOCHS2 = 20 -N_EPOCHS4 = 60 -N_EPOCHS_REGR = 10 - -classifier_params = { - 'has_staged_pp': False, - 'has_importances': False, - 'supports_weight': False, -} - -regressor_params = { - 'has_staged_predictions': False, - 'has_importances': False, - 'supports_weight': False, -} - - -def test_neurolab_params(): - check_params(NeurolabClassifier, layers=[1, 2], epochs=5, trainf='blah', cn=2, omnomnom=4) - check_params(NeurolabRegressor, layers=[1, 2], epochs=5, trainf='blah', cn=2, omnomnom=4) - - -def test_neurolab_single_classification(): - check_classifier(NeurolabClassifier(layers=[], epochs=N_EPOCHS2, trainf=None), - **classifier_params) - check_classifier(NeurolabClassifier(layers=[2], epochs=N_EPOCHS2), - **classifier_params) - check_classifier(NeurolabClassifier(layers=[1, 1], epochs=N_EPOCHS2), - **classifier_params) - - -def test_partial_fit(): - clf = NeurolabClassifier(layers=[4, 5], epochs=2, trainf=nl.train.train_gd) - X, y, _ = generate_classification_data() - clf.fit(X, y) - clf.partial_fit(X[:2], y[:2]) - - -def test_neurolab_regression(): - check_regression(NeurolabRegressor(layers=[1], epochs=N_EPOCHS_REGR), **regressor_params) - - -def test_neurolab_reproducibility(): - clf = NeurolabClassifier(layers=[4, 5], epochs=2, trainf=nl.train.train_gd) - X, y, _ = generate_classification_data() - check_classification_reproducibility(clf, X, y) - - -def test_neurolab_multiclassification(): - check_classifier(NeurolabClassifier(layers=[10], epochs=N_EPOCHS4, trainf=nl.train.train_rprop), - n_classes=4, **classifier_params) - - -def test_neurolab_multi_regression(): - check_regression(NeurolabRegressor(layers=[10], epochs=N_EPOCHS_REGR), - n_targets=3, **regressor_params) - - -def test_neurolab_stacking(): - base_nlab = NeurolabClassifier(layers=[], epochs=N_EPOCHS2 * 2, trainf=nl.train.train_rprop) - base_bagging = BaggingClassifier(base_estimator=base_nlab, n_estimators=3) - check_classifier(SklearnClassifier(clf=base_bagging), **classifier_params) - - -def test_neurolab_classification_types(): - import pandas as pd - for net_type in rep.estimators.neurolab.NET_TYPES.keys(): - try: - clf = NeurolabClassifier(net_type=net_type, epochs=2) - ds = pd.DataFrame() - ds['feature1'] = [0, 1, 2, 3, 4, 5] - ds['feature2'] = [5, 7, 2, 4, 7, 9] - ds['y'] = [0, 0, 0, 1, 1, 1] - clf.fit(ds[['feature1', 'feature2']] / 10., ds['y']) - _ = clf.predict_proba(ds[['feature1', 'feature2']] / 10.) 
- print(net_type, 'is ok') - except Exception as e: - print(net_type, 'FAILED', e) - - -def test_neurolab_regression_types(): - import pandas as pd - for net_type in rep.estimators.neurolab.NET_TYPES.keys(): - try: - clf = NeurolabRegressor(net_type=net_type, epochs=2) - ds = pd.DataFrame() - ds['feature1'] = [0, 1, 2, 3, 4, 5] - ds['feature2'] = [5, 7, 2, 4, 7, 9] - ds['y'] = [0, 0, 0, 1, 1, 1] - clf.fit(ds[['feature1', 'feature2']] / 10., ds['y']) - _ = clf.predict(ds[['feature1', 'feature2']] / 10.) - print(net_type, 'is ok') - except Exception as e: - print(net_type, 'FAILED', e) diff --git a/tests/test_pybrain.py b/tests/test_pybrain.py deleted file mode 100644 index 5c049732..00000000 --- a/tests/test_pybrain.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright 2014-2015 Yandex LLC and contributors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from __future__ import division, print_function, absolute_import - -import six - -if six.PY3: - # PyBrain doesn't support python3 - import nose - - raise nose.SkipTest - -import numpy -from rep.test.test_estimators import check_classifier, check_regression, check_params, \ - generate_classification_data, check_classification_reproducibility -from rep.estimators.pybrain import PyBrainClassifier, PyBrainRegressor -from sklearn.ensemble import BaggingClassifier -from rep.estimators import SklearnClassifier -from . import known_failure - -__author__ = 'Artem Zhirokhov' - -classifier_params = { - 'has_staged_pp': False, - 'has_importances': False, - 'supports_weight': False -} - -regressor_params = { - 'has_staged_predictions': False, - 'has_importances': False, - 'supports_weight': False -} - - -def test_pybrain_params(): - check_params(PyBrainClassifier, layers=[1, 2], epochs=5, use_rprop=True, hiddenclass=['LinearLayer']) - check_params(PyBrainRegressor, layers=[1, 2], epochs=5, etaplus=1.3, hiddenclass=['LinearLayer'], learningrate=0.1) - - -def test_pybrain_classification(): - clf = PyBrainClassifier(epochs=2) - check_classifier(clf, **classifier_params) - check_classifier(PyBrainClassifier(epochs=-1, continue_epochs=1, layers=[]), **classifier_params) - check_classifier(PyBrainClassifier(epochs=2, layers=[5, 2]), **classifier_params) - - -@known_failure -def test_pybrain_reproducibility(): - # This test fails. Because PyBrain can't reproduce training. 
- X, y, _ = generate_classification_data() - clf1 = PyBrainClassifier(layers=[4], epochs=2).fit(X, y) - clf2 = PyBrainClassifier(layers=[4], epochs=2).fit(X, y) - print(clf1.predict_proba(X) - clf2.predict_proba(X)) - assert numpy.allclose(clf1.predict_proba(X), clf2.predict_proba(X)), 'different predicitons' - check_classification_reproducibility(clf1, X, y) - - -def test_pybrain_Linear_MDLSTM(): - check_classifier(PyBrainClassifier(epochs=2, layers=[10, 2], hiddenclass=['LinearLayer', 'MDLSTMLayer']), - **classifier_params) - check_regression(PyBrainRegressor(epochs=3, layers=[10, 2], hiddenclass=['LinearLayer', 'MDLSTMLayer']), - **regressor_params) - - -def test_pybrain_SoftMax_Tanh(): - check_classifier(PyBrainClassifier(epochs=10, layers=[5, 2], hiddenclass=['TanhLayer', 'SoftmaxLayer'], - use_rprop=True), - **classifier_params) - check_regression( - PyBrainRegressor(epochs=2, layers=[10, 5, 2], hiddenclass=['TanhLayer', 'SoftmaxLayer', 'TanhLayer']), - **regressor_params) - - -def pybrain_test_partial_fit(): - clf = PyBrainClassifier(layers=[4], epochs=2) - X, y, _ = generate_classification_data() - clf.partial_fit(X, y) - clf.partial_fit(X[:2], y[:2]) - - -def test_pybrain_multi_classification(): - check_classifier(PyBrainClassifier(), n_classes=4, **classifier_params) - - -def test_pybrain_regression(): - check_regression(PyBrainRegressor(), **regressor_params) - - -def test_pybrain_multi_regression(): - check_regression(PyBrainRegressor(), n_targets=4, **regressor_params) - - -def test_simple_stacking_pybrain(): - base_pybrain = PyBrainClassifier(epochs=2) - base_bagging = BaggingClassifier(base_estimator=base_pybrain, n_estimators=3) - check_classifier(SklearnClassifier(clf=base_bagging), **classifier_params) diff --git a/tests/test_theanets.py b/tests/test_theanets.py deleted file mode 100644 index 11477b13..00000000 --- a/tests/test_theanets.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright 2014-2015 Yandex LLC and contributors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
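test_simple_stacking_pybrain above exercises a composition pattern that every REP estimator supports: a REP classifier serves as base_estimator of a plain sklearn meta-estimator, and the composite is wrapped back into the REP interface with SklearnClassifier. A minimal sketch of the pattern, using a base estimator that survives this patch series (X and y stand for any training sample):

from sklearn.ensemble import BaggingClassifier
from rep.estimators import SklearnClassifier, XGBoostClassifier

base = XGBoostClassifier(n_estimators=10)                         # any REP classifier fits here
bagging = BaggingClassifier(base_estimator=base, n_estimators=3)  # plain sklearn meta-estimator
clf = SklearnClassifier(clf=bagging)                              # back to the REP interface
clf.fit(X, y)
probabilities = clf.predict_proba(X)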
- - -from __future__ import division, print_function, absolute_import -from sklearn.preprocessing.data import StandardScaler -from sklearn.metrics import roc_auc_score -from sklearn.ensemble import BaggingClassifier -from rep.test.test_estimators import check_classifier, check_regression, check_params, \ - check_classification_reproducibility -from rep.test.test_estimators import generate_classification_data -from rep.estimators.sklearn import SklearnClassifier -from rep.estimators.theanets import TheanetsClassifier, TheanetsRegressor - -from tests import retry_if_fails - -__author__ = 'Lisa Ignatyeva, Tatiana Likhomanenko, Alex Rogozhnikov' - -classifier_params = { - 'has_staged_pp': False, - 'has_importances': False, - 'supports_weight': True, -} - -regressor_params = { - 'has_staged_predictions': False, - 'has_importances': False, - 'supports_weight': True, -} - -impatient = dict(patience=0, validate_every=5, min_improvement=0.1) - - -@retry_if_fails -def test_theanets_params(): - check_params(TheanetsClassifier, layers=[1, 2], scaler=False, trainers=[{}, {'algo': 'nag', 'learning_rate': 0.1}]) - check_params(TheanetsRegressor, layers=[1, 2], scaler=False, trainers=[{}, {'algo': 'nag', 'learning_rate': 0.1}]) - - -def test_pretrain(): - trainX, trainY, _ = generate_classification_data() - trainers = [{'algo': 'pretrain', 'learning_rate': 0.5, 'patience': 1, 'validate_every': 1}] - # only checking that fitting doesn't throw errors - # this frequently gets stuck on CI - TheanetsClassifier(layers=[5], trainers=trainers).fit(trainX, trainY) - - -@retry_if_fails -def test_theanets_configurations(): - check_classifier( - TheanetsClassifier(layers=[13], scaler=False, - trainers=[dict(algo='nag', learning_rate=0.1, **impatient)]), - **classifier_params) - check_classifier( - TheanetsClassifier(layers=[5, 5], - trainers=[dict(algo='adam', learning_rate=0.01, momentum=0.9)] - ), - **classifier_params) - - -@retry_if_fails -def test_theanets_regression(): - check_regression(TheanetsRegressor(layers=[3], - trainers=[dict(algo='rmsprop', **impatient)]), - **regressor_params) - check_regression(TheanetsRegressor(scaler=StandardScaler(), - trainers=[dict(algo='rmsprop', **impatient)]), - **regressor_params) - - -def test_theanets_partial_fit(): - clf_complete = TheanetsClassifier(layers=[2], trainers=[{'algo': 'rmsprop', 'learning_rate': 0.1}, - {'algo': 'rprop', 'learning_rate': 0.1}]) - clf_partial = TheanetsClassifier(layers=[2], trainers=[{'algo': 'rmsprop', 'learning_rate': 0.1}]) - X, y, sample_weight = generate_classification_data() - clf_complete.fit(X, y) - clf_partial.fit(X, y) - clf_partial.partial_fit(X, y, algo='rprop', learning_rate=0.1) - - assert clf_complete.trainers == clf_partial.trainers, 'trainers not saved in partial fit' - - auc_complete = roc_auc_score(y, clf_complete.predict_proba(X)[:, 1]) - auc_partial = roc_auc_score(y, clf_partial.predict_proba(X)[:, 1]) - - # Known fail of theanets - assert auc_complete == auc_partial, 'same networks return different results' - - -def test_theanets_reproducibility(): - clf = TheanetsClassifier(trainers=[{'algo': 'nag', 'min_improvement': 0.1, 'max_updates': 10}]) - X, y, _ = generate_classification_data() - check_classification_reproducibility(clf, X, y) - - -@retry_if_fails -def test_theanets_simple_stacking(): - base_tnt = TheanetsClassifier(trainers=[{'min_improvement': 0.1}]) - base_bagging = BaggingClassifier(base_estimator=base_tnt, n_estimators=3) - check_classifier(SklearnClassifier(clf=base_bagging), **classifier_params) - - 
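Each dict in a TheanetsClassifier trainers list is one training stage: as the wrapper code in patch 4 below shows, the 'algo' key selects the optimizer and the remaining keys are forwarded to theanets' Experiment.train. The two-stage fit checked by test_theanets_partial_fit above corresponds roughly to this raw theanets session (n_features, n_classes and train_data are placeholders):

import theanets

exp = theanets.Experiment(theanets.Classifier, layers=(n_features, 2, n_classes))
exp.train(train_data, algo='rmsprop', learning_rate=0.1)  # first trainers entry
exp.train(train_data, algo='rprop', learning_rate=0.1)    # second entry, or a later partial_fit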
-@retry_if_fails -def test_theanets_multiclassification(): - check_classifier(TheanetsClassifier(trainers=[{'min_improvement': 0.1, 'learning_rate': 0.1}]), n_classes=4, - **classifier_params) - - -def test_theanets_multi_regression(): - check_regression(TheanetsRegressor(layers=[13], trainers=[{'algo': 'rmsprop', 'min_improvement': 0.1}]), - n_targets=3, **regressor_params) diff --git a/tests/test_tmva.py b/tests/test_tmva.py deleted file mode 100644 index 4e273c0c..00000000 --- a/tests/test_tmva.py +++ /dev/null @@ -1,24 +0,0 @@ -from __future__ import division, print_function, absolute_import -from rep.test.test_estimators import check_classifier, check_regression -from rep.estimators import TMVAClassifier, TMVARegressor - -__author__ = 'Alex Rogozhnikov' - - -def test_tmva(): - # check classifier - factory_options = "Silent=True:V=False:DrawProgressBar=False" - cl = TMVAClassifier(factory_options=factory_options, method='kBDT', NTrees=10) - check_classifier(cl, check_instance=True, has_staged_pp=False, has_importances=False) - - cl = TMVAClassifier(factory_options=factory_options, method='kSVM', Gamma=0.25, Tol=0.001, - sigmoid_function='identity') - check_classifier(cl, check_instance=True, has_staged_pp=False, has_importances=False) - - cl = TMVAClassifier(factory_options=factory_options, method='kCuts', - FitMethod='GA', EffMethod='EffSel', sigmoid_function='sig_eff=0.9') - check_classifier(cl, check_instance=True, has_staged_pp=False, has_importances=False) - # check regressor, need to run twice to check for memory leak. - for i in range(2): - check_regression(TMVARegressor(factory_options=factory_options, method='kBDT', NTrees=10), check_instance=True, - has_staged_predictions=False, has_importances=False) From 49bf5657a5aa0ae9cf24c0ad4c986bae19d5efff Mon Sep 17 00:00:00 2001 From: Jonas Eschle 'Mayou36 Date: Thu, 15 Nov 2018 15:01:37 +0100 Subject: [PATCH 4/5] Remove a lot, refactor --- rep/estimators/neurolab.py | 363 ------------------------- rep/estimators/pybrain.py | 408 ---------------------------- rep/estimators/theanets.py | 363 ------------------------- rep/estimators/tmva.py | 432 ------------------------------ rep/metaml/folding.py | 6 +- rep/metaml/gridsearch.py | 9 +- rep/utils.py | 4 +- tests/m_test_matrixnet.py | 44 --- tests/m_test_matrixnet_api.py | 222 --------------- tests/m_test_matrixnet_applier.py | 112 -------- tests/test_stacking.py | 14 +- 11 files changed, 11 insertions(+), 1966 deletions(-) delete mode 100644 rep/estimators/neurolab.py delete mode 100644 rep/estimators/pybrain.py delete mode 100644 rep/estimators/theanets.py delete mode 100644 rep/estimators/tmva.py delete mode 100644 tests/m_test_matrixnet.py delete mode 100644 tests/m_test_matrixnet_api.py delete mode 100644 tests/m_test_matrixnet_applier.py diff --git a/rep/estimators/neurolab.py b/rep/estimators/neurolab.py deleted file mode 100644 index 2ee27a08..00000000 --- a/rep/estimators/neurolab.py +++ /dev/null @@ -1,363 +0,0 @@ -""" -These classes are wrappers for the `Neurolab library `_ --- a neural network python library. - -.. warning:: To make neurolab reproducible we change global random seed - - :: - - numpy.random.seed(42) -""" -# Copyright 2014-2015 Yandex LLC and contributors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
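The reproducibility warning in the module docstring above is literal: _partial_fit below calls numpy.random.seed(42) before building a network, which makes training repeatable but also resets NumPy's global random state for everything else in the process. A toy illustration of the side effect (clf is any NeurolabClassifier, X and y any training sample):

import numpy

numpy.random.seed(7)            # seed chosen by the user
clf.fit(X, y)                   # internally re-seeds the global RNG with 42
value = numpy.random.uniform()  # drawn from the seed-42 stream, not the seed-7 one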
-# You may obtain a copy of the License at -# -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import division, print_function, absolute_import -from abc import ABCMeta -from copy import deepcopy - -import neurolab as nl -import numpy -import scipy - -from .interface import Classifier, Regressor -from .utils import check_inputs, check_scaler, one_hot_transform, remove_first_line - - -__author__ = 'Vlad Sterzhanov, Alex Rogozhnikov, Tatiana Likhomanenko' -__all__ = ['NeurolabClassifier', 'NeurolabRegressor'] - -NET_TYPES = {'feed-forward': nl.net.newff, - 'competing-layer': nl.net.newc, - 'learning-vector': nl.net.newlvq, - 'elman-recurrent': nl.net.newelm, - 'hemming-recurrent': nl.net.newhem, - 'hopfield-recurrent': nl.net.newhop - } - -NET_PARAMS = ('minmax', 'cn', 'layers', 'transf', 'target', - 'max_init', 'max_iter', 'delta', 'cn0', 'pc') - -BASIC_PARAMS = ('layers', 'net_type', 'trainf', 'initf', 'scaler', 'random_state') - -# Instead of a single layer use feed-forward. -CANT_CLASSIFY = ('hopfield-recurrent', 'competing-layer', 'hemming-recurrent') -CANT_DO_REGRESSION = ('hopfield-recurrent', ) - - -class NeurolabBase(object): - """ A base class for estimators from the Neurolab library. - - :param features: features used in training - :type features: list[str] or None - :param list[int] layers: sequence, number of units inside each **hidden** layer. - :param string net_type: type of the network; possible values are: - - * `feed-forward` - * `competing-layer` - * `learning-vector` - * `elman-recurrent` - * `hemming-recurrent` - - :param initf: layer initializers - :type initf: anything implementing call(layer), e.g. neurolab.init.* or list[neurolab.init.*] of shape [n_layers] - :param trainf: net training function; default value depends on the type of a network - :param scaler: transformer which is applied to the input samples. If it is False, scaling will not be used - :type scaler: str or sklearn-like transformer or False - :param random_state: this parameter is ignored and is added for uniformity. - :param dict kwargs: additional arguments to net `__init__`, varies with different `net_types` - - .. seealso:: `Supported training functions and their parameters `_ - """ - - __metaclass__ = ABCMeta - - def __init__(self, - features=None, - layers=(10,), - net_type='feed-forward', - initf=nl.init.init_rand, - trainf=None, - scaler='standard', - random_state=None, - **other_params): - self.features = list(features) if features is not None else features - self.layers = list(layers) - self.trainf = trainf - self.initf = initf - self.net_type = net_type - self.scaler = scaler - self.random_state = random_state - - self.net = None - self.train_params = {} - self.net_params = {} - self.set_params(**other_params) - - def _is_fitted(self): - """ - Check if the estimator is fitted or not. - - :rtype: bool - """ - return self.net is not None - - def set_params(self, **params): - """ - Set the parameters of the estimator. 
- - :param dict params: parameters to be set in the model - """ - for name, value in params.items(): - if name.startswith("scaler__"): - assert hasattr(self.scaler, 'set_params'), \ - "Trying to set {} without scaler".format(name) - self.scaler.set_params(**{name[len("scaler__"):]: value}) - elif name.startswith('layers__'): - index = int(name[len('layers__'):]) - self.layers[index] = value - elif name.startswith('initf__'): - index = int(name[len('initf__'):]) - self.initf[index] = value - elif name in NET_PARAMS: - self.net_params[name] = value - elif name in BASIC_PARAMS: - setattr(self, name, value) - else: - self.train_params[name] = value - - def get_params(self, deep=True): - """ - Get parameters of the estimator. - - :rtype: dict - """ - parameters = deepcopy(self.net_params) - parameters.update(deepcopy(self.train_params)) - for name in BASIC_PARAMS: - parameters[name] = getattr(self, name) - return parameters - - def _partial_fit(self, X, y_original, y_train): - """ - Train the estimator by training the existing estimator again. - - :param pandas.DataFrame X: data shape [n_samples, n_features] - :param y_train: array-like target, which is always 2-dimensional (one-hot for classification) - :param y_original: array-like target, which originally was passed to `fit`. - :return: self - """ - # magic reproducibilizer - numpy.random.seed(42) - - if self._is_fitted(): - x_train = self._transform_data(X, y_original, fit=False) - else: - x_train = self._transform_data(X, y_original, fit=True) - - # Prepare parameters depending on the network purpose (classification / regression) - net_params = self._prepare_params(self.net_params, x_train, y_train) - - initializer = self._get_initializer(self.net_type) - net = initializer(**net_params) - - # To allow similar initf function on all layers - initf_iterable = self.initf if hasattr(self.initf, '__iter__') else [self.initf] * len(net.layers) - for layer, init_function in zip(net.layers, initf_iterable): - layer.initf = init_function - net.init() - - if self.trainf is not None: - net.trainf = self.trainf - - self.net = net - - self.net.train(x_train, y_train, **self.train_params) - return self - - def _activate_on_dataset(self, X): - """ - Predict data. - - :param pandas.DataFrame X: data to be predicted - :return: array-like predictions [n_samples, n_targets] - """ - assert self.net is not None, 'Model is not fitted, prediction is denied' - transformed_x = self._transform_data(X, fit=False) - return self.net.sim(transformed_x) - - def _transform_data(self, X, y=None, fit=True): - """ - Transform input samples by the scaler. - - :param pandas.DataFrame X: input data - :param y: array-like target - :param bool fit: true if scaler is not trained yet - :return: array-like transformed data - """ - X = self._get_features(X) - # The following line fights the bug in sklearn < 0.16, - # most of the transformers there modify X if it is pandas.DataFrame. - X = numpy.copy(X) - if fit: - self.scaler = check_scaler(self.scaler) - self.scaler.fit(X, y) - X = self.scaler.transform(X) - - # HACK: neurolab requires all features (even those of predicted objects) to be in [min, max] - # so this dark magic appeared, seems to work ok for the most reasonable use-cases, - # while allowing arbitrary inputs. - return scipy.special.expit(X / 3) - - def _prepare_params(self, net_params, x_train, y_train): - """ - Set parameters for the neurolab net. 
- - :param dict net_params: parameters - :param x_train: array-like training data - :param y_train: array-like training target - :return: prepared parameters in the neurolab interface - """ - net_params = deepcopy(net_params) - # Network expects features to be [0, 1]-scaled - net_params['minmax'] = [[0, 1]] * (x_train.shape[1]) - - # To unify the layer-description argument with other supported networks - if 'size' not in net_params: - net_params['size'] = self.layers - else: - if self.layers != (10, ): - raise ValueError('For neurolab please use either `layers` or `sizes`, not both') - - # Set output layer size - net_params['size'] = list(net_params['size']) + [y_train.shape[1]] - - # Default parameters for the transfer functions in the networks - if self.net_type != 'learning-vector': - if 'transf' not in net_params: - net_params['transf'] = [nl.trans.TanSig()] * len(net_params['size']) - if not hasattr(net_params['transf'], '__iter__'): - net_params['transf'] = [net_params['transf']] * len(net_params['size']) - net_params['transf'] = list(net_params['transf']) - - return net_params - - @staticmethod - def _get_initializer(net_type): - """ - Return a neurolab net type object. - - :param str net_type: net type - :return: a neurolab object corresponding to the net type - """ - if net_type not in NET_TYPES: - raise AttributeError("Got unexpected network type: '{}'".format(net_type)) - return NET_TYPES.get(net_type) - - -class NeurolabClassifier(NeurolabBase, Classifier): - __doc__ = "Implements a classification model from the Neurolab library. \n" + remove_first_line(NeurolabBase.__doc__) - - def fit(self, X, y): - """ - Train a classification model on the data. - - :param pandas.DataFrame X: data of shape [n_samples, n_features] - :param y: labels of samples --- array-like of shape [n_samples] - :return: self - """ - # erasing results of the previous training - self.net = None - return self.partial_fit(X, y) - - def partial_fit(self, X , y): - """ - Additional training of the classifier. - - :param pandas.DataFrame X: data of shape [n_samples, n_features] - :param y: labels of samples, array-like of shape [n_samples] - :return: self - """ - assert self.net_type not in CANT_CLASSIFY, 'Network type does not support classification' - X, y, _ = check_inputs(X, y, None) - if not self._is_fitted(): - self._set_classes(y) - y_train = one_hot_transform(y, n_classes=len(self.classes_)) * 0.98 + 0.01 - return self._partial_fit(X, y, y_train) - - def predict_proba(self, X): - return self._activate_on_dataset(X) - - predict_proba.__doc__ = Classifier.predict_proba.__doc__ - - def staged_predict_proba(self, X): - """ - .. warning:: This is not supported in the Neurolab (**AttributeError** will be thrown) - """ - raise AttributeError("'staged_predict_proba' is not supported by the Neurolab networks") - - def _prepare_params(self, params, x_train, y_train): - net_params = super(NeurolabClassifier, self)._prepare_params(params, x_train, y_train) - # Classification networks should have SoftMax as the transfer function on output layer - net_params['transf'][-1] = nl.trans.SoftMax() - return net_params - - _prepare_params.__doc__ = NeurolabBase._prepare_params.__doc__ - - -class NeurolabRegressor(NeurolabBase, Regressor): - __doc__ = "Implements a regression model from the Neurolab library. \n" + remove_first_line(NeurolabBase.__doc__) - - def fit(self, X, y): - """ - Train a regression model on the data. 
- - :param pandas.DataFrame X: data of shape [n_samples, n_features] - :param y: values for samples --- array-like of shape [n_samples] - :return: self - """ - # erasing results of previous training - self.net = None - return self.partial_fit(X, y) - - def partial_fit(self, X , y): - """ - Additional training of the regressor. - - :param pandas.DataFrame X: data of shape [n_samples, n_features] - :param y: values for samples, array-like of shape [n_samples] - :return: self - """ - if self.net_type in CANT_DO_REGRESSION: - raise RuntimeError('Network type does not support regression') - X, y, _ = check_inputs(X, y, None, allow_multiple_targets=True) - y_train = y.reshape(len(y), 1 if len(y.shape) == 1 else y.shape[1]) - return self._partial_fit(X, y, y_train) - - def predict(self, X): - modeled = self._activate_on_dataset(X) - return modeled if modeled.shape[1] != 1 else numpy.ravel(modeled) - - predict.__doc__ = Regressor.predict.__doc__ - - def staged_predict(self, X): - """ - .. warning:: This is not supported in the Neurolab (**AttributeError** will be thrown) - """ - raise AttributeError("'staged_predict' is not supported by the Neurolab networks") - - def _prepare_params(self, params, x_train, y_train): - net_params = super(NeurolabRegressor, self)._prepare_params(params, x_train, y_train) - net_params['transf'][-1] = nl.trans.PureLin() - return net_params - - _prepare_params.__doc__ = NeurolabBase._prepare_params.__doc__ diff --git a/rep/estimators/pybrain.py b/rep/estimators/pybrain.py deleted file mode 100644 index f042fdaa..00000000 --- a/rep/estimators/pybrain.py +++ /dev/null @@ -1,408 +0,0 @@ -""" -These classes are wrappers for the `PyBrain library `_ --- a neural network python library. - -.. warning:: pybrain training isn't reproducible - (training with the same parameters produces different neural network each time) - - -""" - -# Copyright 2014-2015 Yandex LLC and contributors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from __future__ import division, print_function, absolute_import -from abc import ABCMeta - -import numpy -from pybrain.tools.shortcuts import buildNetwork -from pybrain.datasets import SupervisedDataSet -from pybrain.supervised.trainers import BackpropTrainer, RPropMinusTrainer -from pybrain import structure - -from .interface import Classifier, Regressor -from .utils import check_inputs, check_scaler, one_hot_transform, remove_first_line - -__author__ = 'Artem Zhirokhov, Alex Rogozhnikov, Tatiana Likhomanenko' -__all__ = ['PyBrainBase', 'PyBrainClassifier', 'PyBrainRegressor'] - -LAYER_CLASS = {'BiasUnit': structure.BiasUnit, - 'LinearLayer': structure.LinearLayer, - 'MDLSTMLayer': structure.MDLSTMLayer, - 'SigmoidLayer': structure.SigmoidLayer, - 'SoftmaxLayer': structure.SoftmaxLayer, - 'TanhLayer': structure.TanhLayer} - - -class PyBrainBase(object): - """A base class for the estimator from the PyBrain. - - :param features: features used in training. - :type features: list[str] or None - :param scaler: transformer which is applied to the input samples. 
If it is False, scaling will not be used - :type scaler: str or sklearn-like transformer or False - :param bool use_rprop: flag to indicate whether we should use Rprop or SGD trainer - :param bool verbose: print train/validation errors. - :param random_state: it is ignored parameter, pybrain training is not reproducible - - **Net parameters:** - - :param list[int] layers: indicate how many neurons in each hidden(!) layer; default is 1 hidden layer with 10 neurons - :param list[str] hiddenclass: classes of the hidden layers; default is `'SigmoidLayer'` - :param dict params: other net parameters: - - * `bias` and `outputbias` (boolean) flags to indicate whether the network should have the corresponding biases, - both default to True; - * `peepholes` (boolean); - * `recurrent` (boolean): if the `recurrent` flag is set, a :class:`RecurrentNetwork` will be created, - otherwise a :class:`FeedForwardNetwork` - - **Gradient descent trainer parameters:** - - :param float learningrate: gives the ratio of which parameters are changed into the direction of the gradient - :param float lrdecay: the learning rate decreases by lrdecay, which is used to multiply the learning rate after each training step - :param float momentum: the ratio by which the gradient of the last time step is used - :param boolean batchlearning: if set, the parameters are updated only at the end of each epoch. Default is False - :param float weightdecay: corresponds to the `weightdecay` rate, where 0 is no weight decay at all - - **Rprop trainer parameters:** - - :param float etaminus: factor by which a step width is decreased when overstepping (default=0.5) - :param float etaplus: factor by which a step width is increased when following gradient (default=1.2) - :param float delta: step width for each weight - :param float deltamin: minimum step width (default=1e-6) - :param float deltamax: maximum step width (default=5.0) - :param float delta0: initial step width (default=0.1) - - **Training termination parameters** - - :param int epochs: number of iterations in training; if < 0 then estimator trains until converge - :param int max_epochs: maximum number of epochs the trainer should train if it is given - :param int continue_epochs: each time validation error decreases, try for `continue_epochs` epochs to find a better one - :param float validation_proportion: the ratio of the dataset that is used for the validation dataset - - .. note:: - - Details about parameters `here `_. - """ - __metaclass__ = ABCMeta - # to be overriden in descendants. 
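Combining the parameter groups documented above, a typical configuration might look like this (layer sizes and epoch count are purely illustrative):

clf = PyBrainClassifier(layers=[10, 5],                    # two hidden layers
                        hiddenclass=['SigmoidLayer', 'TanhLayer'],
                        epochs=20,
                        use_rprop=True,                    # RPropMinusTrainer instead of plain backprop
                        etaplus=1.2, etaminus=0.5)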
- _model_type = None - - def __init__(self, - features=None, - layers=(10,), - hiddenclass=None, - epochs=10, - scaler='standard', - use_rprop=False, - learningrate=0.01, - lrdecay=1.0, - momentum=0., - verbose=False, - batchlearning=False, - weightdecay=0., - etaminus=0.5, - etaplus=1.2, - deltamin=1.0e-6, - deltamax=0.5, - delta0=0.1, - max_epochs=None, - continue_epochs=3, - validation_proportion=0.25, - random_state=None, - **params): - self.features = list(features) if features is not None else features - self.epochs = epochs - self.scaler = scaler - self.use_rprop = use_rprop - - # net options - self.layers = list(layers) - self.hiddenclass = hiddenclass - self.params = params - - # SGD trainer options - self.learningrate = learningrate - self.lrdecay = lrdecay - self.momentum = momentum - self.verbose = verbose - self.batchlearning = batchlearning - self.weightdecay = weightdecay - - # Rprop trainer - self.etaminus = etaminus - self.etaplus = etaplus - self.deltamin = deltamin - self.deltamax = deltamax - self.delta0 = delta0 - - # trainUntilConvergence options - self.max_epochs = max_epochs - self.continue_epochs = continue_epochs - self.validation_proportion = validation_proportion - - self.random_state = random_state - self.net = None - - def _check_params(self): - """ - Checks the input of __init__. - """ - if self.hiddenclass is not None: - assert len(self.layers) == len( - self.hiddenclass), 'Number of hidden layers does not match number of hidden classes' - if self.hiddenclass[0] == 'BiasUnit': - raise ValueError('BiasUnit should not be the first unit class') - - for hid_class in self.hiddenclass: - if hid_class not in LAYER_CLASS: - raise ValueError('Wrong class name ' + hid_class) - - def fit(self, X, y): - """ - Train a classification/regression model on the data. - - :param pandas.DataFrame X: data of shape [n_samples, n_features] - :param y: values for samples --- array-like of shape [n_samples] - :return: self - """ - self.net = None - return self.partial_fit(X, y) - - def partial_fit(self, X, y): - """ - Additional training of the classification/regression model. - - :param pandas.DataFrame X: data of shape [n_samples, n_features] - :param y: values for samples, array-like of shape [n_samples] - :return: self - """ - dataset = self._prepare_dataset(X, y, self._model_type) - - if not self._is_fitted(): - self._prepare_net(dataset=dataset, model_type=self._model_type) - - if self.use_rprop: - trainer = RPropMinusTrainer(self.net, - etaminus=self.etaminus, - etaplus=self.etaplus, - deltamin=self.deltamin, - deltamax=self.deltamax, - delta0=self.delta0, - dataset=dataset, - learningrate=self.learningrate, - lrdecay=self.lrdecay, - momentum=self.momentum, - verbose=self.verbose, - batchlearning=self.batchlearning, - weightdecay=self.weightdecay) - else: - trainer = BackpropTrainer(self.net, - dataset, - learningrate=self.learningrate, - lrdecay=self.lrdecay, - momentum=self.momentum, - verbose=self.verbose, - batchlearning=self.batchlearning, - weightdecay=self.weightdecay) - - if self.epochs < 0: - trainer.trainUntilConvergence(maxEpochs=self.max_epochs, - continueEpochs=self.continue_epochs, - verbose=self.verbose, - validationProportion=self.validation_proportion) - else: - trainer.trainEpochs(epochs=self.epochs, ) - return self - - def _is_fitted(self): - """ - Check if the estimator is fitted or not. - - :rtype: bool - """ - return self.net is not None - - def set_params(self, **params): - """ - Set the parameters of the estimator. 
- - Names of the parameters are the same as in the constructor. - """ - for name, value in params.items(): - if hasattr(self, name): - setattr(self, name, value) - else: - if name.startswith('layers__'): - index = int(name[len('layers__'):]) - self.layers[index] = value - elif name.startswith('hiddenclass__'): - index = int(name[len('hiddenclass__'):]) - self.hiddenclass[index] = value - elif name.startswith('scaler__'): - scaler_params = {name[len('scaler__'):]: value} - self.scaler.set_params(**scaler_params) - else: - self.params[name] = value - - def _transform_data(self, X, y=None, fit=True): - """ - Transform input samples by the scaler. - - :param pandas.DataFrame X: input data - :param y: array-like target - :param bool fit: true if scaler is not trained yet - :return: array-like transformed data - """ - X = self._get_features(X) - # The following line fights the bug in sklearn < 0.16, - # most of transformers there modify X if it is pandas.DataFrame. - data_temp = numpy.copy(X) - if fit: - self.scaler = check_scaler(self.scaler) - self.scaler.fit(data_temp, y) - return self.scaler.transform(data_temp) - - def _prepare_dataset(self, X, y, model_type): - """ - Prepare data in pybrain format. - - :param pandas.DataFrame X: data of shape [n_samples, n_features] - :param y: values for samples --- array-like of shape [n_samples] - :param str model_type: classification or regression label - :return: self - """ - X, y, sample_weight = check_inputs(X, y, sample_weight=None, allow_none_weights=True, - allow_multiple_targets=model_type == 'regression') - X = self._transform_data(X, y, fit=not self._is_fitted()) - - if model_type == 'classification': - if not self._is_fitted(): - self._set_classes(y) - target = one_hot_transform(y, n_classes=len(self.classes_)) - elif model_type == 'regression': - if len(y.shape) == 1: - target = y.reshape((len(y), 1)) - else: - # multi regression - target = y - - if not self._is_fitted(): - self.n_targets = target.shape[1] - else: - raise ValueError('Wrong model type') - - dataset = SupervisedDataSet(X.shape[1], target.shape[1]) - dataset.setField('input', X) - dataset.setField('target', target) - - return dataset - - def _prepare_net(self, dataset, model_type): - """ - Prepare net for training. - - :param pybrain.datasets.SupervisedDataSet dataset: dataset in pybrain format - :param str model_type: classification or regression label - """ - self._check_params() - - if self.hiddenclass is None: - self.hiddenclass = ['SigmoidLayer'] * len(self.layers) - - net_options = {'bias': True, - 'outputbias': True, - 'peepholes': False, - 'recurrent': False, - } - for key in self.params: - if key not in net_options.keys(): - raise ValueError('Unexpected parameter: {}'.format(key)) - net_options[key] = self.params[key] - # This flag says to use native python implementation, not arac. - net_options['fast'] = False - - if model_type == 'classification': - net_options['outclass'] = structure.SoftmaxLayer - else: - net_options['outclass'] = structure.LinearLayer - - layers_for_net = [dataset.indim] + self.layers + [dataset.outdim] - self.net = buildNetwork(*layers_for_net, **net_options) - - for layer_id in range(1, len(self.layers)): - hid_layer = LAYER_CLASS[self.hiddenclass[layer_id]](self.layers[layer_id]) - self.net.addModule(hid_layer) - self.net.sortModules() - - def _activate_on_dataset(self, X): - """ - Predict data. 
- - :param pandas.DataFrame X: data to be predicted - :return: array-like predictions [n_samples, n_targets] - """ - assert self._is_fitted(), "Net isn't fitted, please call 'fit' first" - - X = self._transform_data(X, fit=False) - y_test_dummy = numpy.zeros((len(X), 1)) - - ds = SupervisedDataSet(X.shape[1], y_test_dummy.shape[1]) - ds.setField('input', X) - ds.setField('target', y_test_dummy) - - return self.net.activateOnDataset(ds) - - def __setstate__(self, dict): - # resolve pickling issue with pyBrain http://stackoverflow.com/questions/4334941/ - self.__dict__ = dict - if self.net is not None: - self.net.sorted = False - self.net.sortModules() - - -class PyBrainClassifier(PyBrainBase, Classifier): - __doc__ = "Implements a classification model from the PyBrain library. \n" + remove_first_line(PyBrainBase.__doc__) - _model_type = 'classification' - - def predict_proba(self, X): - return self._activate_on_dataset(X=X) - - predict_proba.__doc__ = Classifier.predict_proba.__doc__ - - def staged_predict_proba(self, X): - """ - .. warning:: This function is not supported for PyBrain (**AttributeError** will be thrown). - """ - raise AttributeError("'staged_predict_proba' is not supported by the PyBrain networks") - - -class PyBrainRegressor(PyBrainBase, Regressor): - __doc__ = "Implements a regression model from the PyBrain library. \n" + remove_first_line(PyBrainBase.__doc__) - _model_type = 'regression' - - def predict(self, X): - predictions = self._activate_on_dataset(X) - if self.n_targets == 1: - predictions = predictions.flatten() - return predictions - - predict.__doc__ = Classifier.predict.__doc__ - - def staged_predict(self, X): - """ - .. warning:: This function is not supported for PyBrain (**AttributeError** will be thrown). - """ - raise AttributeError("'staged_predict' is not supported by the PyBrain networks") diff --git a/rep/estimators/theanets.py b/rep/estimators/theanets.py deleted file mode 100644 index 4f3c2813..00000000 --- a/rep/estimators/theanets.py +++ /dev/null @@ -1,363 +0,0 @@ -""" -These classes are wrappers for `theanets library `_ --- a neural network python library. - -""" - -# Copyright 2014-2015 Yandex LLC and contributors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
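The __setstate__ hook above works around the PyBrain pickling issue referenced in its comment: a restored network has to re-sort its modules before it can be activated. With the hook in place a plain pickle round-trip should preserve predictions; a sketch (X and y stand for any training sample):

import numpy
from six.moves import cPickle

clf = PyBrainClassifier(epochs=2).fit(X, y)
clf_restored = cPickle.loads(cPickle.dumps(clf))  # triggers __setstate__, which re-sorts the net
assert numpy.allclose(clf.predict_proba(X), clf_restored.predict_proba(X))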
- - -from __future__ import division, print_function, absolute_import -from abc import abstractmethod, ABCMeta - -import numpy -import theanets as tnt - -from .interface import Classifier, Regressor -from .utils import check_inputs, check_scaler, remove_first_line -from sklearn.utils import check_random_state - - -__author__ = 'Lisa Ignatyeva, Alex Rogozhnikov, Tatiana Likhomanenko' -__all__ = ['TheanetsBase', 'TheanetsClassifier', 'TheanetsRegressor'] - -UNSUPPORTED_OPTIMIZERS = {'sample', 'hf'} -# sample has too different interfaces from what we support here -# currently, hf now does not work in theanets, see https://github.com/lmjohns3/theanets/issues/62 - -# to keep climate from printing anything, uncomment following: -# import os -# import climate -# null_file = open(os.devnull, "w") -# climate.enable_default_logging(default_level='ERROR', stream=null_file) - - -class TheanetsBase(object): - """A base class for the estimators from Theanets library. - - :param features: list of features to train model - :type features: None or list(str) - :param layers: a sequence of values specifying the **hidden** layer configuration for the network. - For more information see `Specifying layers `_ - in the theanets documentation. - Note that theanets `layers` parameter includes input and output layers in the sequence as well. - :type layers: sequence of int, tuple, dict - :param int input_layer: size of the input layer. If it equals -1, the size is taken from the training dataset - :param int output_layer: size of the output layer. If it equals -1, the size is taken from the training dataset - :param str hidden_activation: the name of an activation function to use on the hidden network layers by default - :param str output_activation: the name of an activation function to use on the output layer by default - :param float input_noise: standard deviation of desired noise to inject into input - :param float hidden_noise: standard deviation of desired noise to inject into hidden unit activation output - :param float input_dropouts: proportion of the input units to randomly set to 0; it ranges [0, 1] - :param float hidden_dropouts: proportion of hidden unit activations to randomly set to 0; it ranges [0, 1] - :param int decode_from: any of the hidden layers can be tapped at the output. Just specify a value greater than - 1 to tap the last N hidden layers. The default is 1, which decodes from just the last layer. - :param scaler: transformer which is applied to the input samples. If it is False, scaling will not be used - :type scaler: str or sklearn-like transformer or False - :param trainers: parameters to specify training algorithm(s), for example:: - - trainers=[{'algo': sgd, 'momentum': 0.2}, {'algo': 'nag'}] - - :type trainers: list[dict] or None - :param random_state: state for a pseudo random generator - :type random_state: None or int or RandomState - - - For more information on the available trainers and their parameters see this `page `_. 
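Note that the constructor below spells the dropout arguments input_dropout and hidden_dropout (singular), not input_dropouts/hidden_dropouts as the parameter list above suggests. A configuration pulling these options together, in the spirit of the trainers example above (all values illustrative):

clf = TheanetsClassifier(layers=[20, 10],
                         hidden_activation='logistic',
                         hidden_dropout=0.1,
                         scaler='standard',
                         trainers=[{'algo': 'nag', 'momentum': 0.2},
                                   {'algo': 'rprop', 'learning_rate': 0.1}])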
- """ - - __metaclass__ = ABCMeta - _model_type = None - - def __init__(self, - features=None, - layers=(10,), - input_layer=-1, - output_layer=-1, - hidden_activation='logistic', - output_activation='linear', - input_noise=0, - hidden_noise=0, - input_dropout=0, - hidden_dropout=0, - decode_from=1, - weight_l1=0.01, - weight_l2=0.01, - scaler='standard', - trainers=None, - random_state=42, ): - self.features = list(features) if features is not None else features - self.layers = list(layers) - self.input_layer = input_layer - self.output_layer = output_layer - self.random_state = random_state - - self.scaler = scaler - self.trainers = trainers - self.exp = None - - self.input_noise = input_noise - self.hidden_noise = hidden_noise - self.input_dropout = input_dropout - self.hidden_dropout = hidden_dropout - self.decode_from = decode_from - self.weight_l1 = weight_l1 - self.weight_l2 = weight_l2 - - self.hidden_activation = hidden_activation - self.output_activation = output_activation - - def set_params(self, **params): - """ - Set the parameters of the estimator. Deep parameters of trainers and scaler can be accessed, - for instance:: - - trainers__0 = {'algo': 'sgd', 'learning_rate': 0.3} - trainers__0_algo = 'sgd' - layers__1 = 14 - scaler__use_std = True - - :param dict params: parameters to set in the model - """ - for key, value in params.items(): - if hasattr(self, key): - if key == 'layers': - value = list(value) - setattr(self, key, value) - else: - # accessing deep parameters - param, sep, param_of_param = key.partition('__') - if sep != '__': - raise ValueError(key + ' is an invalid parameter a Theanets estimator') - if param == 'trainers': - index, sep, param = param_of_param.partition('_') - index = int(index) - if index >= len(self.trainers): - raise ValueError('{} is an invalid parameter for a Theanets estimator: index ' - 'too big'.format(key)) - if param == '': - # e.g. trainers__0 = {'algo': 'sgd', 'learning_rate': 0.3} - self.trainers[index] = value - else: - # e.g. trainers__0_algo = 'sgd' - self.trainers[index][param] = value - elif param == 'layers': - index = int(param_of_param) - if index >= len(self.layers): - raise ValueError('{} is an invalid parameter for a Theanets estimator: index ' - 'too big'.format(key)) - self.layers[index] = value - elif param == 'scaler': - try: - self.scaler.set_params(**{param_of_param: value}) - except Exception: - raise ValueError('was unable to set parameter {}={} ' - 'to scaler {}'.format(param_of_param, value, self.scaler)) - else: - raise ValueError(key + ' is an invalid parameter for a Theanets estimator') - - def _transform_data(self, data, y=None): - """ - Transform input samples by the scaler. - - :param pandas.DataFrame X: input data - :param y: array-like target - :return: array-like transformed data - """ - data_backup = data.copy() - if not self._is_fitted(): - self.scaler = check_scaler(self.scaler) - self.scaler.fit(data_backup, y) - return self.scaler.transform(data_backup) - - def _is_fitted(self): - """ - Check if the estimator is fitted or not. - - :rtype: bool - """ - return self.exp is not None - - def fit(self, X, y, sample_weight=None): - """ - Train a classification/regression model on the data. 
- - :param pandas.DataFrame X: data of shape [n_samples, n_features] - :param y: values for samples --- array-like of shape [n_samples] - :param sample_weight: weights for samples --- array-like of shape [n_samples] - :return: self - """ - self.exp = None - if self.trainers is None: - # use default trainer with default parameters. - self.trainers = [{}] - - for trainer in self.trainers: - if 'algo' in trainer and trainer['algo'] in UNSUPPORTED_OPTIMIZERS: - raise NotImplementedError(trainer['algo'] + ' is not supported') - self.partial_fit(X, y, sample_weight=sample_weight, keep_trainer=False, **trainer) - return self - - @abstractmethod - def partial_fit(self, X, y, sample_weight=None, keep_trainer=True, **trainer): - """ - Train the estimator by training the existing estimator again. - - :param pandas.DataFrame X: data of shape [n_samples, n_features] - :param y: values for samples --- array-like of shape [n_samples] - :param sample_weight: weights for samples --- array-like of shape [n_samples] - :param bool keep_trainer: True if the trainer is not stored in self.trainers. - If True, will add it to the list of the estimators. - :param dict trainer: parameters of the training algorithm we want to use now - :return: self - """ - pass - - def _prepare_for_partial_fit(self, X, y, sample_weight=None, allow_multiple_targets=False, keep_trainer=True, - **trainer): - """ - Do preparation for fitting which is the same for a classifier and regressor. - - :param pandas.DataFrame X: data of shape [n_samples, n_features] - :param y: values for samples --- array-like of shape [n_samples] - :param sample_weight: weights for samples --- array-like of shape [n_samples] - :param bool allow_multiple_targets: True if target can contain multiple targets - :param bool keep_trainer: True if the trainer is not stored in self.trainers - :param dict trainer: parameters of the training algorithm we want to use now - :return: prepared data and target - """ - X, y, sample_weight = check_inputs(X, y, sample_weight=sample_weight, allow_none_weights=False, - allow_multiple_targets=allow_multiple_targets) - X = self._transform_data(self._get_features(X, allow_nans=True), y) - if keep_trainer: - self.trainers.append(trainer) - return X, y, sample_weight - - def _construct_layers(self, input_layer, output_layer): - """ - Build a layer list including correct input/output layers' sizes. - - :param int input_layer: input layer size taken from the data - :param int output_layer: output layer size taken from the data - :return: list of layers - """ - layers = [self.input_layer] + self.layers + [self.output_layer] - if layers[0] == -1: - layers[0] = input_layer - if layers[-1] == -1: - layers[-1] = output_layer - return layers - - def _prepare_network_params(self): - """ - Prepare simple net parameters. - - :return: prepared dict - """ - if self.random_state is None: - seed = 0 - elif isinstance(self.random_state, int): - seed = self.random_state - else: - seed = check_random_state(self.random_state).randint(0, 10000) - - return {'hidden_activation': self.hidden_activation, - 'output_activation': self.output_activation, - 'input_noise': self.input_noise, - 'hidden_noise': self.hidden_noise, - 'input_dropout': self.input_dropout, - 'hidden_dropout': self.hidden_dropout, - 'decode_from': self.decode_from, - 'rng': seed, - 'weight_l1': self.weight_l1, - 'weight_l2': self.weight_l2 - } - - -class TheanetsClassifier(TheanetsBase, Classifier): - __doc__ = 'Implements a classification model from the Theanets library. 
\n' + remove_first_line(TheanetsBase.__doc__) - - _model_type = 'classification' - - def partial_fit(self, X, y, sample_weight=None, keep_trainer=True, **trainer): - X, y, sample_weight = self._prepare_for_partial_fit(X, y, sample_weight=sample_weight, - keep_trainer=keep_trainer, **trainer) - if self.exp is None: - self._set_classes(y) - layers = self._construct_layers(X.shape[1], len(self.classes_)) - self.exp = tnt.Experiment(tnt.Classifier, layers=layers, weighted=True) - params = self._prepare_network_params() - params.update(**trainer) - if trainer.get('algo', None) == 'pretrain': - self.exp.train([X.astype(numpy.float32)], **params) - else: - self.exp.train([X.astype(numpy.float32), y.astype(numpy.int32), sample_weight.astype(numpy.float32)], - **params) - return self - partial_fit.__doc__ = TheanetsBase.partial_fit.__doc__ - - def predict_proba(self, X): - assert self._is_fitted(), 'Classifier wasn`t fitted, please, call `fit` first' - X = self._transform_data(self._get_features(X, allow_nans=True)) - return self.exp.network.predict_proba(X.astype(numpy.float32)) - - predict_proba.__doc__ = Classifier.predict_proba.__doc__ - - def staged_predict_proba(self, X): - """ - .. warning:: This function is not supported in the Theanets (**NotImplementedError** will be thrown) - """ - raise NotImplementedError("'staged_predict_proba' is not supported by the Theanets classifiers") - - -class TheanetsRegressor(TheanetsBase, Regressor): - __doc__ = 'Implements a regression model from the Theanets library. \n' + remove_first_line(TheanetsBase.__doc__) - - _model_type = 'regression' - - def partial_fit(self, X, y, sample_weight=None, keep_trainer=True, **trainer): - allow_multiple_targets = False if len(numpy.shape(y)) == 1 else True - X, y, sample_weight = self._prepare_for_partial_fit(X, y, sample_weight=sample_weight, - allow_multiple_targets=allow_multiple_targets, - keep_trainer=keep_trainer, **trainer) - if self.exp is None: - layers = self._construct_layers(X.shape[1], 1 if len(numpy.shape(y)) == 1 else numpy.shape(y)[1]) - self.exp = tnt.Experiment(tnt.Regressor, layers=layers, weighted=True) - params = self._prepare_network_params() - params.update(**trainer) - if len(numpy.shape(y)) == 1: - y = y.reshape(len(y), 1) - if len(numpy.shape(sample_weight)) == 1: - sample_weight = numpy.repeat(sample_weight, y.shape[1]) - sample_weight = sample_weight.reshape(y.shape) - if trainer.get('algo') == 'pretrain': - self.exp.train([X.astype(numpy.float32)], **params) - else: - self.exp.train([X.astype(numpy.float32), y, sample_weight.astype(numpy.float32)], **params) - return self - - partial_fit.__doc__ = TheanetsBase.partial_fit.__doc__ - - def predict(self, X): - assert self._is_fitted(), "Regressor wasn't fitted, please, call `fit` first" - X = self._transform_data(self._get_features(X, allow_nans=True)) - return self.exp.network.predict(X.astype(numpy.float32)) - - predict.__doc__ = Regressor.predict.__doc__ - - def staged_predict(self, X): - """ - .. warning:: This function is not supported in the Theanets (**NotImplementedError** will be thrown) - """ - raise NotImplementedError("'staged_predict' is not supported by the Theanets regressors") diff --git a/rep/estimators/tmva.py b/rep/estimators/tmva.py deleted file mode 100644 index b8b08753..00000000 --- a/rep/estimators/tmva.py +++ /dev/null @@ -1,432 +0,0 @@ -""" -These classes are wrappers for physics machine learning library TMVA used .root format files (c++ library). -Now you can simply use it in python. 
TMVA contains classification and regression algorithms, including neural networks. -See `TMVA guide `_ -for the list of the available algorithms and parameters. -""" -from __future__ import division, print_function, absolute_import -from abc import ABCMeta -from logging import getLogger -import os -import tempfile -import subprocess -from subprocess import PIPE -import shutil -import sys - -from .interface import Classifier, Regressor -from .utils import check_inputs, score_to_proba, proba_to_two_dimensions -from six.moves import cPickle -import signal - -__author__ = 'Tatiana Likhomanenko, Alex Rogozhnikov' - -logger = getLogger(__name__) -# those parameters that shall not be passed to the options of the TMVA estimators -_IGNORED_PARAMETERS = {'random_state'} -__all__ = ['TMVAClassifier', 'TMVARegressor'] - - -class _AdditionalInformation: - """ - Additional information for the tmva factory (used in training) - """ - - def __init__(self, directory, model_type='classification'): - self.directory = directory - self.tmva_root = 'result.root' - self.tmva_job = "TMVAEstimation" - self.model_type = model_type - - -class _AdditionalInformationPredict: - """ - Additional information for the tmva factory (used to predict new data) - """ - - def __init__(self, directory, xml_file, method_name, model_type=('classification', None)): - self.directory = directory - self.xml_file = xml_file - self.method_name = method_name - self.model_type = model_type - self.result_filename = os.path.join(directory, 'dump_predictions.pkl') - - -class TMVABase(object): - """ - TMVABase is a base class for the tmva classification and regression models. - - :param str method: algorithm method (default='kBDT') - :param features: features used in training - :type features: list[str] or None - :param str factory_options: system options, including data transformation before training - :param dict method_parameters: estimator options - - .. note:: TMVA doesn't support staged predictions and features importances :( - """ - __metaclass__ = ABCMeta - - def __init__(self, - factory_options="", - method='kBDT', - **method_parameters): - self.method = method - self._method_name = 'REP_Estimator' - self.factory_options = factory_options - self.method_parameters = method_parameters - - # contents of xml file with formula, read into memory - self.formula_xml = None - - @staticmethod - def _create_tmp_directory(): - return tempfile.mkdtemp(dir=os.getcwd()) - - @staticmethod - def _remove_tmp_directory(directory): - shutil.rmtree(directory, ignore_errors=True) - - def _fit(self, X, y, sample_weight=None, model_type='classification'): - """ - Train the estimator. - - :param pandas.DataFrame X: data shape [n_samples, n_features] - :param y: targets for samples --- array-like of shape [n_samples] - :param sample_weight: weights for samples, - array-like of shape [n_samples] or None if all weights are equal - :return: self - """ - # saving data to 2 different root files. - directory = self._create_tmp_directory() - add_info = _AdditionalInformation(directory, model_type=model_type) - try: - self._run_tmva_training(add_info, X, y, sample_weight) - finally: - self._remove_tmp_directory(directory) - - return self - - def _run_tmva_training(self, info, X, y, sample_weight): - """ - Run subprocess to train tmva factory. 
-
-        :param info: class with additional information
-        """
-        tmva_process = None
-        _platform = sys.platform
-        try:
-            if _platform == 'win32' or _platform == 'cygwin':
-                tmva_process = subprocess.Popen(
-                    '{executable} -c "import os; from rep.estimators import _tmvaFactory; _tmvaFactory.main()"'.format(
-                        executable=sys.executable),
-                    cwd=info.directory,
-                    stdin=PIPE, stdout=PIPE, stderr=subprocess.STDOUT)
-
-            else:
-                # Problem with Mac OS El Capitan, which is not guaranteed to set DYLD_LIBRARY_PATH.
-                # This DYLD_LIBRARY_PATH can be used in root_numpy for dynamic loading of ROOT libraries
-                # https://github.com/rootpy/root_numpy/issues/227#issuecomment-165981891
-                tmva_process = subprocess.Popen(
-                    'export DYLD_LIBRARY_PATH={dyld}; cd "{directory}";'
-                    '{executable} -c "import os; from rep.estimators import _tmvaFactory; _tmvaFactory.main()"'.format(
-                        dyld=os.environ.get('DYLD_LIBRARY_PATH', ""),
-                        directory=info.directory,
-                        executable=sys.executable),
-                    stdin=PIPE, stdout=PIPE, stderr=subprocess.STDOUT,
-                    shell=True, preexec_fn=os.setsid)
-
-            try:
-                cPickle.dump(self, tmva_process.stdin)
-                cPickle.dump(info, tmva_process.stdin)
-                cPickle.dump(X, tmva_process.stdin)
-                cPickle.dump(y, tmva_process.stdin)
-                cPickle.dump(sample_weight, tmva_process.stdin)
-            except:
-                # continue; the output of the process is checked below
-                pass
-            stdout, stderr = tmva_process.communicate()
-            assert tmva_process.returncode == 0, \
-                'ERROR: TMVA process finished incorrectly \n LOG: %s \n %s' % (stderr, stdout)
-            if stdout is not None:
-                print('%s' % (stdout))
-
-            xml_filename = os.path.join(info.directory, 'weights',
-                                        '{job}_{name}.weights.xml'.format(job=info.tmva_job, name=self._method_name))
-            with open(xml_filename, 'r') as xml_file:
-                self.formula_xml = xml_file.read()
-        finally:
-            if tmva_process is not None:
-                try:
-                    if _platform == 'win32' or _platform == 'cygwin':
-                        subprocess.call(['taskkill', '/F', '/T', '/PID', str(tmva_process.pid)])
-                    else:
-                        os.killpg(tmva_process.pid, signal.SIGTERM)
-                except:
-                    pass
-
-    def _check_fitted(self):
-        assert self.formula_xml is not None, "Classifier wasn't fitted, please call `fit` first"
-
-    def _predict(self, X, model_type=('classification', None)):
-        """
-        Predict data.
-
-        :param pandas.DataFrame X: data of shape [n_samples, n_features]
-        :param model_type: (classification/regression, type of output transformation)
-        :return: predicted values of shape [n_samples]
-        """
-        self._check_fitted()
-
-        directory = self._create_tmp_directory()
-        try:
-            with tempfile.NamedTemporaryFile(mode="w", suffix='.xml', dir=directory, delete=True) as file_xml:
-                file_xml.write(self.formula_xml)
-                file_xml.flush()
-                add_info = _AdditionalInformationPredict(directory, file_xml.name, self._method_name,
-                                                         model_type=model_type)
-                prediction = self._run_tmva_predict(add_info, X)
-        finally:
-            self._remove_tmp_directory(directory)
-
-        return prediction
-
-    @staticmethod
-    def _run_tmva_predict(info, data):
-        """
-        Run a subprocess to predict new data with the tmva factory.
-
-        :param info: class with additional information
-        """
-        tmva_process = None
-        _platform = sys.platform
-        try:
-            if _platform == 'win32' or _platform == 'cygwin':
-                tmva_process = subprocess.Popen(
-                    '{executable} -c "from rep.estimators import _tmvaReader; _tmvaReader.main()"'.format(
-                        executable=sys.executable),
-                    cwd=info.directory,
-                    stdin=PIPE, stdout=PIPE, stderr=subprocess.STDOUT)
-
-            else:
-                # Problem with Mac OS El Capitan (10.11), which is not guaranteed to set DYLD_LIBRARY_PATH.
-                # This DYLD_LIBRARY_PATH can be used in root_numpy for dynamic loading of ROOT libraries
-                # https://github.com/rootpy/root_numpy/issues/227#issuecomment-165981891
-                tmva_process = subprocess.Popen(
-                    'export DYLD_LIBRARY_PATH={dyld}; cd "{directory}";'
-                    '{executable} -c "from rep.estimators import _tmvaReader; _tmvaReader.main()"'.format(
-                        dyld=os.environ.get('DYLD_LIBRARY_PATH', ""),
-                        directory=info.directory,
-                        executable=sys.executable),
-                    stdin=PIPE, stdout=PIPE, stderr=subprocess.STDOUT,
-                    shell=True)
-
-            try:
-                cPickle.dump(info, tmva_process.stdin)
-                cPickle.dump(data, tmva_process.stdin)
-            except:
-                # do nothing here; there is a check later
-                pass
-            stdout, stderr = tmva_process.communicate()
-            assert tmva_process.returncode == 0, \
-                'ERROR: TMVA process finished incorrectly \n LOG: %s \n %s' % (stderr, stdout)
-            with open(info.result_filename, 'rb') as predictions_file:
-                predictions = cPickle.load(predictions_file)
-            return predictions
-        finally:
-            if tmva_process is not None:
-                try:
-                    if _platform == 'win32' or _platform == 'cygwin':
-                        subprocess.call(['taskkill', '/F', '/T', '/PID', str(tmva_process.pid)])
-                    else:
-                        os.killpg(tmva_process.pid, signal.SIGTERM)
-                except:
-                    pass
-
-
-class TMVAClassifier(TMVABase, Classifier):
-    """
-    Implements classification models from the TMVA library (CERN's machine learning package).
-
-    :param str method: algorithm method (default='kBDT')
-    :param features: features used in training
-    :type features: list[str] or None
-    :param str factory_options: system options, including data transformations before training, for example::
-
-        "!V:!Silent:Color:Transformations=I;D;P;G,D"
-
-    :param str sigmoid_function: function used to convert the TMVA output to probabilities;
-
-        * *identity* (use for svm, mlp) --- do not transform the output, use this value for methods returning class probabilities
-        * *sigmoid* --- sigmoid transformation, use it if the output varies in the range [-infinity, +infinity]
-        * *bdt* (for BDT algorithms, whose output varies in the range [-1, 1])
-        * *sig_eff=0.4* --- for the rectangular cut optimization methods;
-          for instance, here 0.4 will be used as the signal efficiency to evaluate the MVA
-          (put any float number from [0, 1])
-
-    :param dict method_parameters: classifier options, for example: `NTrees=100`, `BoostType='Grad'`.
-
-    .. warning::
-        TMVA doesn't support *staged_predict_proba()* and *feature_importances_*.
-
-        TMVA doesn't support multiclassification; only two-class classification is available.
-
-    `TMVA guide `_.
-    """
-    def __init__(self,
-                 method='kBDT',
-                 features=None,
-                 factory_options="",
-                 sigmoid_function='bdt',
-                 **method_parameters):
-        TMVABase.__init__(self, factory_options=factory_options, method=method, **method_parameters)
-        Classifier.__init__(self, features=features)
-        self.sigmoid_function = sigmoid_function
-
-    def _set_classes_special(self, y):
-        self._set_classes(y)
-        assert self.n_classes_ == 2, "Only 2 classes are supported (data contains {})".format(self.n_classes_)
-
-    def set_params(self, **params):
-        """
-        Set the parameters of this estimator.
-
-        :param dict params: parameters to set in the model
-        """
-        for k, v in params.items():
-            if hasattr(self, k):
-                setattr(self, k, v)
-            else:
-                if k in _IGNORED_PARAMETERS:
-                    continue
-                self.method_parameters[k] = v
-
-    def get_params(self, deep=True):
-        """
-        Get parameters for this estimator.
-
-        :return: dict, parameter names mapped to their values.
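The _run_tmva_training and _run_tmva_predict methods above share one IPC pattern: the parent pickles its payload into the child's stdin, then relies on the return code and the merged stdout for diagnostics. A stripped-down, runnable sketch of that handshake (no Windows branch or DYLD workaround; the inline child code stands in for _tmvaFactory/_tmvaReader):

    import pickle
    import subprocess
    import sys

    child_code = ("import pickle, sys;"
                  "payload = pickle.load(sys.stdin.buffer);"
                  "print('received %d items' % len(payload))")
    child = subprocess.Popen([sys.executable, '-c', child_code],
                             stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
    stdout, _ = child.communicate(pickle.dumps(list(range(5))))
    assert child.returncode == 0, 'child finished incorrectly:\n%s' % stdout.decode()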
- """ - parameters = self.method_parameters.copy() - parameters['method'] = self.method - parameters['factory_options'] = self.factory_options - parameters['features'] = self.features - return parameters - - def fit(self, X, y, sample_weight=None): - X, y, sample_weight = check_inputs(X, y, sample_weight=sample_weight, allow_none_weights=False) - X = self._get_features(X).copy() - self._set_classes_special(y) - if self.n_classes_ == 2: - self.factory_options = '{}:AnalysisType=Classification'.format(self.factory_options) - else: - self.factory_options = '{}:AnalysisType=Multiclass'.format(self.factory_options) - - return self._fit(X, y, sample_weight=sample_weight) - - fit.__doc__ = Classifier.fit.__doc__ - - def predict_proba(self, X): - X = self._get_features(X) - prediction = self._predict(X, model_type=('classification', self.sigmoid_function)) - return self._convert_output(prediction) - - predict_proba.__doc__ = Classifier.predict_proba.__doc__ - - def _convert_output(self, prediction): - """ - Convert the output to the probabilities for each class. - - :param array prediction: predictions which will be converted - :return: probabilities - """ - variants = {'bdt', 'sigmoid', 'identity'} - if 'sig_eff' in self.sigmoid_function: - return proba_to_two_dimensions(prediction) - assert self.sigmoid_function in variants, \ - 'sigmoid_function parameter must be one of {}, instead of {}'.format(variants, self.sigmoid_function) - if self.sigmoid_function == 'sigmoid': - return score_to_proba(prediction) - elif self.sigmoid_function == 'bdt': - return proba_to_two_dimensions((prediction + 1.) / 2.) - else: - return proba_to_two_dimensions(prediction) - - def staged_predict_proba(self, X): - """ - .. warning:: This function is not supported for the TMVA library (**AttributeError** will be thrown) - """ - raise AttributeError("'staged_predict_proba' is not supported by the TMVA library") - - -class TMVARegressor(TMVABase, Regressor): - """ - Implements regression models from TMVA library: CERN library for machine learning. - - :param str method: algorithm method (default='kBDT') - :param features: features used in training - :type features: list[str] or None - :param str factory_options: system options, including data transformations before training, for example:: - - "!V:!Silent:Color:Transformations=I;D;P;G,D" - - :param dict method_parameters: regressor options, for example: `NTrees=100`, `BoostType='Grad'` - - .. warning:: - TMVA doesn't support *staged_predict()* and *feature_importances__*. - - `TMVA guide `_ - """ - def __init__(self, - method='kBDT', - features=None, - factory_options="", - **method_parameters): - TMVABase.__init__(self, factory_options=factory_options, method=method, **method_parameters) - Regressor.__init__(self, features=features) - - def set_params(self, **params): - """ - Set the parameters of this estimator. - - :param dict params: parameters to set in the model - """ - for k, v in params.items(): - if hasattr(self, k): - setattr(self, k, v) - else: - if k in _IGNORED_PARAMETERS: - continue - self.method_parameters[k] = v - - def get_params(self, deep=True): - """ - Get parameters for this estimator. - - :return: dict, parameter names mapped to their values. 
- """ - parameters = self.method_parameters.copy() - parameters['method'] = self.method - parameters['factory_options'] = self.factory_options - parameters['features'] = self.features - return parameters - - def fit(self, X, y, sample_weight=None): - X, y, sample_weight = check_inputs(X, y, sample_weight=sample_weight, allow_none_weights=False) - X = self._get_features(X).copy() - - self.factory_options = '{}:AnalysisType=Regression'.format(self.factory_options) - return self._fit(X, y, sample_weight=sample_weight, model_type='regression') - - fit.__doc__ = Regressor.fit.__doc__ - - def predict(self, X): - X = self._get_features(X) - return self._predict(X, model_type=('regression', None)) - - predict.__doc__ = Regressor.predict.__doc__ - - def staged_predict(self, X): - """ - .. warning:: This function is not supported for the TMVA library (**AttributeError** will be thrown) - """ - raise AttributeError("'staged_predict' is not supported by the TMVA library") diff --git a/rep/metaml/folding.py b/rep/metaml/folding.py index acddb9c4..27161d26 100644 --- a/rep/metaml/folding.py +++ b/rep/metaml/folding.py @@ -20,8 +20,8 @@ __author__ = 'Tatiana Likhomanenko, Alex Rogozhnikov' __all__ = ['FoldingClassifier', 'FoldingRegressor'] -from .utils import get_classifier_probabilities, get_classifier_staged_proba, get_regressor_prediction, \ - get_regressor_staged_predict +from .utils import (get_classifier_probabilities, get_classifier_staged_proba, get_regressor_prediction, + get_regressor_staged_predict, ) class FoldingBase(object): @@ -77,7 +77,7 @@ def _get_folds_column(self, length): self._random_number = check_random_state(self.random_state).randint(0, 100000) folds_column = numpy.zeros(length) for fold_number, (_, folds_indices) in enumerate( - KFold(length, self.n_folds, shuffle=True, random_state=self._random_number)): + KFold(self.n_folds, shuffle=True, random_state=self._random_number).split(folds_column)): folds_column[folds_indices] = fold_number return folds_column diff --git a/rep/metaml/gridsearch.py b/rep/metaml/gridsearch.py index 56c99aba..30fe8d8f 100644 --- a/rep/metaml/gridsearch.py +++ b/rep/metaml/gridsearch.py @@ -242,7 +242,8 @@ def __init__(self, param_grid, n_evaluations=10, maximize=True, random_state=Non self.indices_to_parameters_ = OrderedDict() self.grid_scores_ = OrderedDict() self.queued_tasks_ = set() - from sklearn.grid_search import ParameterSampler + from sklearn.model_selection import ParameterSampler + self.param_sampler = iter(ParameterSampler(param_grid, n_iter=n_evaluations, random_state=random_state)) def generate_next_point(self): @@ -520,7 +521,7 @@ def _compute_score(self, k_folder, prediction_function, base_estimator, params, :return float: quality """ score = 0 - for ind, (train_indices, test_indices) in enumerate(islice(k_folder, 0, self.fold_checks)): + for ind, (train_indices, test_indices) in enumerate(islice(k_folder.split(X=X, y=y), 0, self.fold_checks)): estimator = clone(base_estimator) estimator.set_params(**params) @@ -571,7 +572,7 @@ def __call__(self, base_estimator, params, X, y, sample_weight=None): :return float: quality """ - k_folder = StratifiedKFold(y=y, n_folds=self.folds, shuffle=self.shuffle, random_state=self.random_state) + k_folder = StratifiedKFold(n_splits=self.folds, shuffle=self.shuffle, random_state=self.random_state) return self._compute_score(k_folder, get_classifier_probabilities, base_estimator, params, X, y, sample_weight=sample_weight) @@ -605,7 +606,7 @@ def __call__(self, base_estimator, params, X, y, 
sample_weight=None): :return float: quality """ - k_folder = KFold(len(y), n_folds=self.folds, shuffle=self.shuffle, random_state=self.random_state) + k_folder = KFold(n_splits=self.folds, shuffle=self.shuffle, random_state=self.random_state) return self._compute_score(k_folder, get_regressor_prediction, base_estimator, params, X, y, sample_weight=sample_weight) diff --git a/rep/utils.py b/rep/utils.py index 591711c8..fe6c2084 100644 --- a/rep/utils.py +++ b/rep/utils.py @@ -348,7 +348,7 @@ def train_test_split_group(group_column, *arrays, **kw_args): :param bool allow_none: default False (useful for sample_weight - after splitting train and test of `None` are again `None`) """ - from sklearn import cross_validation + from sklearn import model_selection allow_none = kw_args.pop('allow_none', None) assert len(arrays) > 0, "at least one array should be passed" @@ -360,7 +360,7 @@ def train_test_split_group(group_column, *arrays, **kw_args): assert len(initial_data) == length, "group column must have the same length" group_ids = numpy.unique(initial_data) - train_indices, test_indices = cross_validation.train_test_split(group_ids, **kw_args) + train_indices, test_indices = model_selection.train_test_split(group_ids, **kw_args) train_indices = numpy.in1d(initial_data, train_indices) test_indices = numpy.in1d(initial_data, test_indices) diff --git a/tests/m_test_matrixnet.py b/tests/m_test_matrixnet.py deleted file mode 100644 index 2b70687a..00000000 --- a/tests/m_test_matrixnet.py +++ /dev/null @@ -1,44 +0,0 @@ -from __future__ import division, print_function, absolute_import - -from sklearn.ensemble import AdaBoostClassifier -from rep.estimators import SklearnClassifier -from rep.metaml import FoldingClassifier -from rep.test.test_estimators import check_regression, generate_classification_data, check_classifier, \ - check_classification_reproducibility - -from rep.estimators import MatrixNetClassifier, MatrixNetRegressor - - -__author__ = 'Tatiana Likhomanenko, Alex Rogozhnikov' - - -def test_mn_classification(): - clf = MatrixNetClassifier(iterations=20, auto_stop=1e-3) - check_classifier(clf, n_classes=2) - assert {'effect', 'information', 'efficiency'} == set(clf.get_feature_importances().columns) - - -def test_mn_regression(): - clf = MatrixNetRegressor() - check_regression(clf) - assert {'effect', 'information', 'efficiency'} == set(clf.get_feature_importances().columns) - - -def test_simple_stacking_mn(): - base_mn = MatrixNetClassifier(iterations=10) - check_classifier(SklearnClassifier(clf=AdaBoostClassifier(base_estimator=base_mn, n_estimators=2)), - has_staged_pp=True) - - -def test_mn_reproducibility(): - clf = MatrixNetClassifier(iterations=10) - X, y, _ = generate_classification_data() - check_classification_reproducibility(clf, X, y) - - -def test_complex_stacking_mn(): - # Ada over kFold over MatrixNet - base_kfold = FoldingClassifier(base_estimator=MatrixNetClassifier(iterations=30)) - check_classifier(SklearnClassifier(clf=AdaBoostClassifier(base_estimator=base_kfold, n_estimators=3)), - has_staged_pp=False, has_importances=False) - diff --git a/tests/m_test_matrixnet_api.py b/tests/m_test_matrixnet_api.py deleted file mode 100644 index 5d908ca2..00000000 --- a/tests/m_test_matrixnet_api.py +++ /dev/null @@ -1,222 +0,0 @@ -from __future__ import division, print_function, absolute_import - -import os -from time import time, sleep -from tempfile import mkstemp -from nose.tools import raises -import unittest -import json -from rep.estimators._mnkit import MatrixNetClient -from 
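The folding.py, gridsearch.py, and rep/utils.py hunks above all stem from a single upstream change: scikit-learn 0.18 replaced the cross_validation and grid_search modules with model_selection, and splitters no longer receive the data in their constructor. The new calling convention, in brief (X and y stand for any array-likes of matching length):

    from sklearn.model_selection import KFold, StratifiedKFold

    # old: KFold(n_samples, n_folds=5, shuffle=True, random_state=0), iterated directly
    kf = KFold(n_splits=5, shuffle=True, random_state=0)
    for train_idx, test_idx in kf.split(X):        # the data now goes to split()
        ...

    # old: StratifiedKFold(y=y, n_folds=5); the labels now go to split() as well
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    for train_idx, test_idx in skf.split(X, y):
        ...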
rep.estimators import MatrixNetClassifier, MatrixNetRegressor -from rep.test.test_estimators import generate_classification_data, generate_regression_data -import hashlib - -__author__ = 'Alexander Baranov, Tatiana Likhomanenko' - -DATA_PATH = os.path.join( - os.path.dirname(os.path.realpath(__file__)), "help_files") - -CONFIG_FILE_WRONG_URL = os.path.join(DATA_PATH, 'wrong_config_url.json') -CONFIG_FILE_WRONG_TOKEN = os.path.join(DATA_PATH, 'wrong_config_token.json') - - -def test_A_md5(): - md5 = hashlib.md5() - with open(os.path.join(DATA_PATH, 'data.csv'), 'rb') as f: - for chunk in iter(lambda: f.read(8192), b''): - md5.update(chunk) - print(md5.hexdigest()) - - -# test api errors -@raises(Exception) -def test_Exception_credential(): - X, y, sample_weight = generate_classification_data() - cl = MatrixNetClassifier(api_config_file=CONFIG_FILE_WRONG_TOKEN, iterations=50) - cl.fit(X, y, sample_weight=sample_weight) - - -@raises(Exception) -def test_Exception_server(): - X, y, sample_weight = generate_classification_data() - cl = MatrixNetClassifier(api_config_file=CONFIG_FILE_WRONG_URL, iterations=50) - cl.fit(X, y, sample_weight=sample_weight) - - -@raises(AssertionError) -def test_Exception_predict_proba(): - X, _, _ = generate_classification_data() - cl = MatrixNetClassifier(api_config_file=CONFIG_FILE_WRONG_URL, iterations=50) - cl.predict_proba(X) - - -@raises(AssertionError) -def test_Exception_staged_predict_proba(): - X, _, _ = generate_classification_data() - cl = MatrixNetClassifier(api_config_file=CONFIG_FILE_WRONG_URL, iterations=50) - for _ in cl.staged_predict_proba(X): - pass - - -@raises(AssertionError) -def test_Exception_feature_importances(): - X, _, _ = generate_classification_data() - cl = MatrixNetClassifier(api_config_file=CONFIG_FILE_WRONG_URL, iterations=50) - print(cl.feature_importances_) - - -@raises(AssertionError) -def test_Exception_trained_status(): - X, _, _ = generate_classification_data() - cl = MatrixNetClassifier(api_config_file=CONFIG_FILE_WRONG_URL, iterations=50) - cl.training_status() - - -@raises(AssertionError) -def test_Exception_synchronized(): - X, _, _ = generate_classification_data() - cl = MatrixNetClassifier(api_config_file=CONFIG_FILE_WRONG_URL, iterations=50) - cl.synchronize() - - -@raises(AssertionError) -def test_Exception_reg_predict(): - X, _, _ = generate_regression_data() - cl = MatrixNetRegressor(api_config_file=CONFIG_FILE_WRONG_URL, iterations=50) - cl.predict(X) - - -@raises(AssertionError) -def test_Exception_reg_staged_predict(): - X, _, _ = generate_regression_data() - cl = MatrixNetRegressor(api_config_file=CONFIG_FILE_WRONG_URL, iterations=50) - for _ in cl.staged_predict(X): - pass - - -@raises(AssertionError) -def test_Exception_reg_feature_importances(): - X, _, _ = generate_regression_data() - cl = MatrixNetRegressor(api_config_file=CONFIG_FILE_WRONG_URL, iterations=50) - print(cl.feature_importances_) - - -@raises(AssertionError) -def test_Exception_reg_trained_status(): - X, _, _ = generate_regression_data() - cl = MatrixNetRegressor(api_config_file=CONFIG_FILE_WRONG_URL, iterations=50) - cl.training_status() - - -@raises(AssertionError) -def test_Exception_reg_synchronized(): - X, _, _ = generate_regression_data() - cl = MatrixNetRegressor(api_config_file=CONFIG_FILE_WRONG_URL, iterations=50) - cl.synchronize() - - -class MatrixNetTest(unittest.TestCase): - DEFAULT_CONFIG_PATH = "$HOME/.rep-matrixnet.config.json" - - def setUp(self): - config_file_path = os.path.expandvars(self.DEFAULT_CONFIG_PATH) - with 
open(config_file_path, 'r') as conf_file: - config = json.load(conf_file) - self.api_url = config['url'] - self.mn = MatrixNetClient(self.api_url, config['token']) - - -# test Bucket - -class TestBuckets(MatrixNetTest): - def test_create_delete(self): - b1 = self.mn.bucket() - b1.remove() - - def test_create_with_id(self): - bucket_id = "testbucket" + str(int(time())) - b1 = self.mn.bucket(bucket_id=bucket_id) - b1.remove() - - def test_bucket_id(self): - b1 = self.mn.bucket() - b2 = self.mn.bucket(bucket_id=b1.bucket_id) - b1.remove() - - def test_upload(self): - b1 = self.mn.bucket() - - datapath = os.path.join(DATA_PATH, "data.csv") - - result = b1.upload(datapath) - self.assertTrue(result) - - self.assertEqual(b1.ls(), [u'data.csv']) - - b1.remove() - - -# test Classifier - -TEST_PARAMS = { - 'mn_parameters': {'iterations': 10, - 'regularization': 0.01, - 'max_features_per_iteration': 6, - 'features_sample_rate_per_iteration': 0.5, - 'training_fraction': 0.5, - 'seed': None, - 'intervals': 8, - 'auto_stop': None, - 'train_type': 'classification'}, - 'fields': [ - 'FlightDistance', - 'FlightDistanceError', - 'IP', - 'IPSig', - 'VertexChi2', - 'weight' - ], - 'mn_version': 1, - 'extra': { - }, -} - - -# for some reason the task is pending all time. -class TestEstimator(MatrixNetTest): - def test_classifier(self): - bucket_test = self.mn.bucket() - - datapath = os.path.join(DATA_PATH, "data.csv") - - result = bucket_test.upload(datapath) - self.assertTrue(result) - - cls = self.mn.classifier( - parameters=TEST_PARAMS, - description="REP-submitted classifier", - bucket_id=bucket_test.bucket_id, - ) - cls.upload() - status = cls.get_status() - while status != "completed": - status = cls.get_status() - assert status != 'failed', 'Failed formula ' + str(cls.classifier_id) - iterations = cls.get_iterations() - print("Training: status={} iterations={}".format(status, iterations)) - sleep(2) - print('finish training') - formula_tmp_local = mkstemp(dir='/tmp')[1] - cls.save_formula(formula_tmp_local) - os.remove(formula_tmp_local) - - self.assertTrue(cls.resubmit()) - status = cls.get_status() - while status != "completed": - status = cls.get_status() - assert status != 'failed', 'Failed formula ' + str(cls.classifier_id) - iterations = cls.get_iterations() - print("Training after resubmit: status={} iterations={}".format(status, iterations)) - sleep(2) - print('finish resubmit job') - bucket_test.remove() diff --git a/tests/m_test_matrixnet_applier.py b/tests/m_test_matrixnet_applier.py deleted file mode 100644 index e4f2c591..00000000 --- a/tests/m_test_matrixnet_applier.py +++ /dev/null @@ -1,112 +0,0 @@ -""" -Here we test the correctness and speed of the formula. 
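The applier tests deleted below exercise staged predictions; the pattern behind their stage_numpy_predict helper is simple: each iteration of apply_separately yields one stage's raw contribution, and the running sum passes through a sigmoid. Schematically, with mx and data as in the test:

    import numpy
    from scipy.special import expit

    raw = numpy.zeros(len(data))
    for stage_output in mx.apply_separately(data):
        raw += stage_output    # accumulate one boosting iteration's raw score
    proba = expit(raw)         # sigmoid of the running sum gives a probability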
-""" - -from __future__ import division, print_function, absolute_import -import os -import time -import numpy -from scipy.special import expit -import pandas -from six import BytesIO -from six.moves import zip -from rep.estimators._matrixnetapplier import MatrixNetApplier as NumpyClassifier - - -__author__ = 'Alex Rogozhnikov' - -DATA_PATH = os.path.join( - os.path.dirname(os.path.realpath(__file__)), "help_files") - - -def read_files(mx_filename, test_filename): - test_file = pandas.read_csv(test_filename, sep='\t') - with open(mx_filename, 'rb') as mx: - mx_content = mx.read() - return mx_content, test_file - - -def numpy_predict(formula_mx, data): - data = data.astype(float) - data = pandas.DataFrame(data) - mx = NumpyClassifier(BytesIO(formula_mx)) - return mx.apply(data) - - -def stage_numpy_predict(formula_mx, data, step=1): - data = data.astype(float) - data = pandas.DataFrame(data) - mx = NumpyClassifier(BytesIO(formula_mx)) - - prediction = numpy.zeros(len(data)) - - for num, prediction_iteration in enumerate(mx.apply_separately(data)): - prediction += prediction_iteration - if num % step == 0: - yield expit(prediction) - - -def check_leaves(mx_filename, test_filename, n_trees=5000): - formula_mx, data = read_files(mx_filename, test_filename) - data = data.astype(float) - data = pandas.DataFrame(data) - mx = NumpyClassifier(BytesIO(formula_mx)) - leaves = mx.compute_leaf_indices(data) - assert leaves.shape[0] == data.shape[0] - assert leaves.shape[1] == n_trees - print(leaves) - - -def test_leaves(): - check_leaves( - os.path.join(DATA_PATH, 'test_formula_mx'), - os.path.join(DATA_PATH, 'data.csv')) - - -def check_staged_predictions(mx_filename, test_filename, n_iterations, stage_predict_function): - mx_content, test_file = read_files(mx_filename, test_filename) - - predictions = pandas.read_csv(os.path.join(DATA_PATH, 'predictions.csv')) - predictions = pandas.DataFrame(predictions) - - # Checking the predictions on first 100 events - for x, (key, row) in zip(stage_predict_function(mx_content, test_file[:100]), predictions.iterrows()): - assert numpy.allclose(row, x) - - # Checking the number of iterations on 10 events - assert sum(1 for _ in stage_predict_function(mx_content, test_file[:10])) == n_iterations + 1 - - print('Check was passed') - - -# How the file was obtained -# def write_staged_predictions(mx_filename, test_filename): -# mx_content, test_file = read_files(mx_filename, test_filename) -# # testing on first 100 events -# test_file = test_file[:100] -# -# predictions = numpy.zeros([100, 100], dtype=float) -# -# for i, x in enumerate(stage_cython_predict(mx_content, test_file)): -# if i == 100: -# break -# predictions[i, :] = x -# -# pandas.DataFrame(predictions).to_csv('data/predictions.csv', index=False) - - -def compute_speed(mx_filename, test_filename, function, print_name=''): - mx_content, test_file = read_files(mx_filename, test_filename) - # just iterating over sequence - start = time.time() - for x in function(mx_content, test_file): - pass - print(print_name, time.time() - start) - - -def test_applier(): - check_staged_predictions( - os.path.join(DATA_PATH, 'test_formula_mx'), - os.path.join(DATA_PATH, 'data.csv'), - stage_predict_function=stage_numpy_predict, - n_iterations=5000) \ No newline at end of file diff --git a/tests/test_stacking.py b/tests/test_stacking.py index fee4bc01..be478fa1 100644 --- a/tests/test_stacking.py +++ b/tests/test_stacking.py @@ -4,7 +4,7 @@ from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier from 
rep.test.test_estimators import check_classifier, generate_classification_data -from rep.estimators import XGBoostClassifier, TMVAClassifier +from rep.estimators import XGBoostClassifier from rep.metaml import FoldingClassifier from rep.estimators.sklearn import SklearnClassifier @@ -44,11 +44,6 @@ def test_simple_stacking_sklearn(): check_classifier(SklearnClassifier(clf=AdaBoostClassifier(base_estimator=base_sk, n_estimators=3))) -def test_simple_stacking_tmva(): - base_tmva = TMVAClassifier(factory_options="Silent=True:V=False:DrawProgressBar=False") - check_classifier(SklearnClassifier(clf=BaggingClassifier(base_estimator=base_tmva, n_estimators=3, random_state=13)), - has_staged_pp=False, has_importances=False) - def test_complex_stacking_sk(): # Ada over kFold over Ada over Trees @@ -57,13 +52,6 @@ def test_complex_stacking_sk(): has_staged_pp=False, has_importances=False) -def test_complex_stacking_tmva(): - # Ada over kFold over TMVA - base_kfold = FoldingClassifier(base_estimator=TMVAClassifier(factory_options="Silent=True:V=False:DrawProgressBar=False", - method='kBDT', NTrees=10), random_state=13) - check_classifier(SklearnClassifier(clf=AdaBoostClassifier(base_estimator=base_kfold, n_estimators=3)), - has_staged_pp=False, has_importances=False) - def test_complex_stacking_xgboost(): # Ada over kFold over xgboost From 2d6333d9175218055e30b453588c2be71c3de0f1 Mon Sep 17 00:00:00 2001 From: Jonas Eschle 'Mayou36 Date: Tue, 22 Sep 2020 13:03:33 +0200 Subject: [PATCH 5/5] Change travis --- .dockerignore | 0 .gitignore | 0 .travis.yml | 0 AUTHORS | 0 LICENSE | 0 MANIFEST.in | 0 Makefile | 0 README.md | 0 ci/Dockerfile.rep | 0 ci/README.MD | 0 ci/environment-rep-template.yaml | 0 ci/environment-rep2.yaml | 0 ci/environment-rep3.yaml | 0 ci/environments_generate.py | 0 circle.yml | 0 docs/Makefile | 0 docs/conf.py | 0 docs/data.rst | 0 docs/estimators.rst | 0 docs/images/kitematic.gif | Bin docs/index.rst | 0 docs/metaml.rst | 0 docs/metrics.rst | 0 docs/parallel.rst | 0 docs/plotting.rst | 0 docs/report.rst | 0 docs/reproducibility.rst | 0 docs/utils.rst | 0 howto/00-abc-ipython.ipynb | 0 howto/00-intro-ROOT.ipynb | 0 howto/00-intro_ipython.ipynb | 0 howto/01-howto-Classifiers.ipynb | 0 howto/02-howto-Factory.ipynb | 0 howto/03-howto-gridsearch.ipynb | 0 howto/04-howto-folding.ipynb | 0 howto/05-howto-plot.ipynb | 0 howto/06-howto-neural-nets.ipynb | 0 howto/07-howto-MatrixNet.ipynb | 0 howto/Dockerfile | 0 howto/toy_datasets/README.md | 0 rep/__init__.py | 0 rep/data/__init__.py | 0 rep/data/storage.py | 0 rep/estimators/__init__.py | 0 rep/estimators/_matrixnetapplier.py | 0 rep/estimators/_mnkit.py | 0 rep/estimators/_tmvaFactory.py | 0 rep/estimators/_tmvaReader.py | 0 rep/estimators/interface.py | 0 rep/estimators/matrixnet.py | 0 rep/estimators/sklearn.py | 0 rep/estimators/utils.py | 0 rep/estimators/xgboost.py | 0 rep/metaml/__init__.py | 0 rep/metaml/_cache.py | 0 rep/metaml/cache.py | 0 rep/metaml/factory.py | 0 rep/metaml/folding.py | 0 rep/metaml/gridsearch.py | 0 rep/metaml/stacking.py | 0 rep/metaml/utils.py | 0 rep/plotting.py | 0 rep/report/__init__.py | 0 rep/report/_base.py | 0 rep/report/classification.py | 0 rep/report/metrics.py | 0 rep/report/regression.py | 0 rep/test/__init__.py | 0 rep/test/test_estimators.py | 0 rep/test/test_notebooks.py | 0 rep/utils.py | 0 requirements.txt | 0 setup.py | 0 tests/__init__.py | 0 tests/help_files/data.csv | 0 tests/help_files/predictions.csv | 0 tests/help_files/test_formula_mx | Bin tests/help_files/wrong_config_token.json 
| 0 tests/help_files/wrong_config_url.json | 0 tests/test_factory_clf.py | 0 tests/test_factory_reg.py | 0 tests/test_folding.py | 0 tests/test_grid.py | 0 tests/test_grid_optimization.py | 0 tests/test_meta_caching.py | 0 tests/test_metrics.py | 0 tests/test_reports.py | 0 tests/test_sklearn.py | 0 tests/test_stacking.py | 0 tests/test_util.py | 0 tests/test_xgboost.py | 0 tests/z_test_notebook.py | 0 92 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 .dockerignore mode change 100644 => 100755 .gitignore mode change 100644 => 100755 .travis.yml mode change 100644 => 100755 AUTHORS mode change 100644 => 100755 LICENSE mode change 100644 => 100755 MANIFEST.in mode change 100644 => 100755 Makefile mode change 100644 => 100755 README.md mode change 100644 => 100755 ci/Dockerfile.rep mode change 100644 => 100755 ci/README.MD mode change 100644 => 100755 ci/environment-rep-template.yaml mode change 100644 => 100755 ci/environment-rep2.yaml mode change 100644 => 100755 ci/environment-rep3.yaml mode change 100644 => 100755 ci/environments_generate.py mode change 100644 => 100755 circle.yml mode change 100644 => 100755 docs/Makefile mode change 100644 => 100755 docs/conf.py mode change 100644 => 100755 docs/data.rst mode change 100644 => 100755 docs/estimators.rst mode change 100644 => 100755 docs/images/kitematic.gif mode change 100644 => 100755 docs/index.rst mode change 100644 => 100755 docs/metaml.rst mode change 100644 => 100755 docs/metrics.rst mode change 100644 => 100755 docs/parallel.rst mode change 100644 => 100755 docs/plotting.rst mode change 100644 => 100755 docs/report.rst mode change 100644 => 100755 docs/reproducibility.rst mode change 100644 => 100755 docs/utils.rst mode change 100644 => 100755 howto/00-abc-ipython.ipynb mode change 100644 => 100755 howto/00-intro-ROOT.ipynb mode change 100644 => 100755 howto/00-intro_ipython.ipynb mode change 100644 => 100755 howto/01-howto-Classifiers.ipynb mode change 100644 => 100755 howto/02-howto-Factory.ipynb mode change 100644 => 100755 howto/03-howto-gridsearch.ipynb mode change 100644 => 100755 howto/04-howto-folding.ipynb mode change 100644 => 100755 howto/05-howto-plot.ipynb mode change 100644 => 100755 howto/06-howto-neural-nets.ipynb mode change 100644 => 100755 howto/07-howto-MatrixNet.ipynb mode change 100644 => 100755 howto/Dockerfile mode change 100644 => 100755 howto/toy_datasets/README.md mode change 100644 => 100755 rep/__init__.py mode change 100644 => 100755 rep/data/__init__.py mode change 100644 => 100755 rep/data/storage.py mode change 100644 => 100755 rep/estimators/__init__.py mode change 100644 => 100755 rep/estimators/_matrixnetapplier.py mode change 100644 => 100755 rep/estimators/_mnkit.py mode change 100644 => 100755 rep/estimators/_tmvaFactory.py mode change 100644 => 100755 rep/estimators/_tmvaReader.py mode change 100644 => 100755 rep/estimators/interface.py mode change 100644 => 100755 rep/estimators/matrixnet.py mode change 100644 => 100755 rep/estimators/sklearn.py mode change 100644 => 100755 rep/estimators/utils.py mode change 100644 => 100755 rep/estimators/xgboost.py mode change 100644 => 100755 rep/metaml/__init__.py mode change 100644 => 100755 rep/metaml/_cache.py mode change 100644 => 100755 rep/metaml/cache.py mode change 100644 => 100755 rep/metaml/factory.py mode change 100644 => 100755 rep/metaml/folding.py mode change 100644 => 100755 rep/metaml/gridsearch.py mode change 100644 => 100755 rep/metaml/stacking.py mode change 100644 => 100755 rep/metaml/utils.py mode 
change 100644 => 100755 rep/plotting.py mode change 100644 => 100755 rep/report/__init__.py mode change 100644 => 100755 rep/report/_base.py mode change 100644 => 100755 rep/report/classification.py mode change 100644 => 100755 rep/report/metrics.py mode change 100644 => 100755 rep/report/regression.py mode change 100644 => 100755 rep/test/__init__.py mode change 100644 => 100755 rep/test/test_estimators.py mode change 100644 => 100755 rep/test/test_notebooks.py mode change 100644 => 100755 rep/utils.py mode change 100644 => 100755 requirements.txt mode change 100644 => 100755 setup.py mode change 100644 => 100755 tests/__init__.py mode change 100644 => 100755 tests/help_files/data.csv mode change 100644 => 100755 tests/help_files/predictions.csv mode change 100644 => 100755 tests/help_files/test_formula_mx mode change 100644 => 100755 tests/help_files/wrong_config_token.json mode change 100644 => 100755 tests/help_files/wrong_config_url.json mode change 100644 => 100755 tests/test_factory_clf.py mode change 100644 => 100755 tests/test_factory_reg.py mode change 100644 => 100755 tests/test_folding.py mode change 100644 => 100755 tests/test_grid.py mode change 100644 => 100755 tests/test_grid_optimization.py mode change 100644 => 100755 tests/test_meta_caching.py mode change 100644 => 100755 tests/test_metrics.py mode change 100644 => 100755 tests/test_reports.py mode change 100644 => 100755 tests/test_sklearn.py mode change 100644 => 100755 tests/test_stacking.py mode change 100644 => 100755 tests/test_util.py mode change 100644 => 100755 tests/test_xgboost.py mode change 100644 => 100755 tests/z_test_notebook.py diff --git a/.dockerignore b/.dockerignore old mode 100644 new mode 100755 diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/.travis.yml b/.travis.yml old mode 100644 new mode 100755 diff --git a/AUTHORS b/AUTHORS old mode 100644 new mode 100755 diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/MANIFEST.in b/MANIFEST.in old mode 100644 new mode 100755 diff --git a/Makefile b/Makefile old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/ci/Dockerfile.rep b/ci/Dockerfile.rep old mode 100644 new mode 100755 diff --git a/ci/README.MD b/ci/README.MD old mode 100644 new mode 100755 diff --git a/ci/environment-rep-template.yaml b/ci/environment-rep-template.yaml old mode 100644 new mode 100755 diff --git a/ci/environment-rep2.yaml b/ci/environment-rep2.yaml old mode 100644 new mode 100755 diff --git a/ci/environment-rep3.yaml b/ci/environment-rep3.yaml old mode 100644 new mode 100755 diff --git a/ci/environments_generate.py b/ci/environments_generate.py old mode 100644 new mode 100755 diff --git a/circle.yml b/circle.yml old mode 100644 new mode 100755 diff --git a/docs/Makefile b/docs/Makefile old mode 100644 new mode 100755 diff --git a/docs/conf.py b/docs/conf.py old mode 100644 new mode 100755 diff --git a/docs/data.rst b/docs/data.rst old mode 100644 new mode 100755 diff --git a/docs/estimators.rst b/docs/estimators.rst old mode 100644 new mode 100755 diff --git a/docs/images/kitematic.gif b/docs/images/kitematic.gif old mode 100644 new mode 100755 diff --git a/docs/index.rst b/docs/index.rst old mode 100644 new mode 100755 diff --git a/docs/metaml.rst b/docs/metaml.rst old mode 100644 new mode 100755 diff --git a/docs/metrics.rst b/docs/metrics.rst old mode 100644 new mode 100755 diff --git a/docs/parallel.rst b/docs/parallel.rst old mode 100644 new mode 100755 diff 
--git a/docs/plotting.rst b/docs/plotting.rst old mode 100644 new mode 100755 diff --git a/docs/report.rst b/docs/report.rst old mode 100644 new mode 100755 diff --git a/docs/reproducibility.rst b/docs/reproducibility.rst old mode 100644 new mode 100755 diff --git a/docs/utils.rst b/docs/utils.rst old mode 100644 new mode 100755 diff --git a/howto/00-abc-ipython.ipynb b/howto/00-abc-ipython.ipynb old mode 100644 new mode 100755 diff --git a/howto/00-intro-ROOT.ipynb b/howto/00-intro-ROOT.ipynb old mode 100644 new mode 100755 diff --git a/howto/00-intro_ipython.ipynb b/howto/00-intro_ipython.ipynb old mode 100644 new mode 100755 diff --git a/howto/01-howto-Classifiers.ipynb b/howto/01-howto-Classifiers.ipynb old mode 100644 new mode 100755 diff --git a/howto/02-howto-Factory.ipynb b/howto/02-howto-Factory.ipynb old mode 100644 new mode 100755 diff --git a/howto/03-howto-gridsearch.ipynb b/howto/03-howto-gridsearch.ipynb old mode 100644 new mode 100755 diff --git a/howto/04-howto-folding.ipynb b/howto/04-howto-folding.ipynb old mode 100644 new mode 100755 diff --git a/howto/05-howto-plot.ipynb b/howto/05-howto-plot.ipynb old mode 100644 new mode 100755 diff --git a/howto/06-howto-neural-nets.ipynb b/howto/06-howto-neural-nets.ipynb old mode 100644 new mode 100755 diff --git a/howto/07-howto-MatrixNet.ipynb b/howto/07-howto-MatrixNet.ipynb old mode 100644 new mode 100755 diff --git a/howto/Dockerfile b/howto/Dockerfile old mode 100644 new mode 100755 diff --git a/howto/toy_datasets/README.md b/howto/toy_datasets/README.md old mode 100644 new mode 100755 diff --git a/rep/__init__.py b/rep/__init__.py old mode 100644 new mode 100755 diff --git a/rep/data/__init__.py b/rep/data/__init__.py old mode 100644 new mode 100755 diff --git a/rep/data/storage.py b/rep/data/storage.py old mode 100644 new mode 100755 diff --git a/rep/estimators/__init__.py b/rep/estimators/__init__.py old mode 100644 new mode 100755 diff --git a/rep/estimators/_matrixnetapplier.py b/rep/estimators/_matrixnetapplier.py old mode 100644 new mode 100755 diff --git a/rep/estimators/_mnkit.py b/rep/estimators/_mnkit.py old mode 100644 new mode 100755 diff --git a/rep/estimators/_tmvaFactory.py b/rep/estimators/_tmvaFactory.py old mode 100644 new mode 100755 diff --git a/rep/estimators/_tmvaReader.py b/rep/estimators/_tmvaReader.py old mode 100644 new mode 100755 diff --git a/rep/estimators/interface.py b/rep/estimators/interface.py old mode 100644 new mode 100755 diff --git a/rep/estimators/matrixnet.py b/rep/estimators/matrixnet.py old mode 100644 new mode 100755 diff --git a/rep/estimators/sklearn.py b/rep/estimators/sklearn.py old mode 100644 new mode 100755 diff --git a/rep/estimators/utils.py b/rep/estimators/utils.py old mode 100644 new mode 100755 diff --git a/rep/estimators/xgboost.py b/rep/estimators/xgboost.py old mode 100644 new mode 100755 diff --git a/rep/metaml/__init__.py b/rep/metaml/__init__.py old mode 100644 new mode 100755 diff --git a/rep/metaml/_cache.py b/rep/metaml/_cache.py old mode 100644 new mode 100755 diff --git a/rep/metaml/cache.py b/rep/metaml/cache.py old mode 100644 new mode 100755 diff --git a/rep/metaml/factory.py b/rep/metaml/factory.py old mode 100644 new mode 100755 diff --git a/rep/metaml/folding.py b/rep/metaml/folding.py old mode 100644 new mode 100755 diff --git a/rep/metaml/gridsearch.py b/rep/metaml/gridsearch.py old mode 100644 new mode 100755 diff --git a/rep/metaml/stacking.py b/rep/metaml/stacking.py old mode 100644 new mode 100755 diff --git a/rep/metaml/utils.py 
b/rep/metaml/utils.py old mode 100644 new mode 100755 diff --git a/rep/plotting.py b/rep/plotting.py old mode 100644 new mode 100755 diff --git a/rep/report/__init__.py b/rep/report/__init__.py old mode 100644 new mode 100755 diff --git a/rep/report/_base.py b/rep/report/_base.py old mode 100644 new mode 100755 diff --git a/rep/report/classification.py b/rep/report/classification.py old mode 100644 new mode 100755 diff --git a/rep/report/metrics.py b/rep/report/metrics.py old mode 100644 new mode 100755 diff --git a/rep/report/regression.py b/rep/report/regression.py old mode 100644 new mode 100755 diff --git a/rep/test/__init__.py b/rep/test/__init__.py old mode 100644 new mode 100755 diff --git a/rep/test/test_estimators.py b/rep/test/test_estimators.py old mode 100644 new mode 100755 diff --git a/rep/test/test_notebooks.py b/rep/test/test_notebooks.py old mode 100644 new mode 100755 diff --git a/rep/utils.py b/rep/utils.py old mode 100644 new mode 100755 diff --git a/requirements.txt b/requirements.txt old mode 100644 new mode 100755 diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 diff --git a/tests/__init__.py b/tests/__init__.py old mode 100644 new mode 100755 diff --git a/tests/help_files/data.csv b/tests/help_files/data.csv old mode 100644 new mode 100755 diff --git a/tests/help_files/predictions.csv b/tests/help_files/predictions.csv old mode 100644 new mode 100755 diff --git a/tests/help_files/test_formula_mx b/tests/help_files/test_formula_mx old mode 100644 new mode 100755 diff --git a/tests/help_files/wrong_config_token.json b/tests/help_files/wrong_config_token.json old mode 100644 new mode 100755 diff --git a/tests/help_files/wrong_config_url.json b/tests/help_files/wrong_config_url.json old mode 100644 new mode 100755 diff --git a/tests/test_factory_clf.py b/tests/test_factory_clf.py old mode 100644 new mode 100755 diff --git a/tests/test_factory_reg.py b/tests/test_factory_reg.py old mode 100644 new mode 100755 diff --git a/tests/test_folding.py b/tests/test_folding.py old mode 100644 new mode 100755 diff --git a/tests/test_grid.py b/tests/test_grid.py old mode 100644 new mode 100755 diff --git a/tests/test_grid_optimization.py b/tests/test_grid_optimization.py old mode 100644 new mode 100755 diff --git a/tests/test_meta_caching.py b/tests/test_meta_caching.py old mode 100644 new mode 100755 diff --git a/tests/test_metrics.py b/tests/test_metrics.py old mode 100644 new mode 100755 diff --git a/tests/test_reports.py b/tests/test_reports.py old mode 100644 new mode 100755 diff --git a/tests/test_sklearn.py b/tests/test_sklearn.py old mode 100644 new mode 100755 diff --git a/tests/test_stacking.py b/tests/test_stacking.py old mode 100644 new mode 100755 diff --git a/tests/test_util.py b/tests/test_util.py old mode 100644 new mode 100755 diff --git a/tests/test_xgboost.py b/tests/test_xgboost.py old mode 100644 new mode 100755 diff --git a/tests/z_test_notebook.py b/tests/z_test_notebook.py old mode 100644 new mode 100755
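The final patch above changes no file contents at all: it only flips every tracked file from mode 100644 to 100755, i.e. sets the executable bit. A quick way to confirm the resulting modes in a checkout, as a sketch (the path is just an example):

    import os
    import stat

    mode = stat.S_IMODE(os.stat('rep/utils.py').st_mode)
    print(oct(mode))   # expect 0o755 after applying PATCH 5/5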