diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a047a94 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.idea/ +__pycache__/ \ No newline at end of file diff --git a/README.md b/README.md index 33e5d22..35badeb 100644 --- a/README.md +++ b/README.md @@ -10,3 +10,16 @@ PUAdapter: A tool that adapts any estimator that can output a probability to pos It is based on: Elkan, Charles, and Keith Noto. "Learning classifiers from only positive and unlabeled data." Proceeding of the 14th ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2008. + +## How to run scripts + +### Running puAdapter Example +```bash +python -m src.examples.puAdapterExample +``` + +### Running Breast Cancer Example +```bash +python -m src.tests.breastCancer +``` + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..6a0fc4e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +numpy +matplotlib +scikit-learn diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/examples/__init__.py b/src/examples/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/examples/puAdapterExample.py b/src/examples/puAdapterExample.py index e8dd983..ae8417a 100644 --- a/src/examples/puAdapterExample.py +++ b/src/examples/puAdapterExample.py @@ -1,11 +1,9 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- """ Created on Dec 21, 2012 @author: Alexandre """ -from puLearning.puAdapter import PUAdapter +from ..puLearning.puAdapter import PUAdapter from sklearn.svm import SVC from sklearn.datasets import make_classification import numpy as np @@ -16,7 +14,7 @@ n_informative=2, n_redundant=2, n_repeated=0, - n_classes=2, + n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01, @@ -37,8 +35,8 @@ pu_estimator.fit(X, y) - print pu_estimator - print - print "Comparison of estimator and PUAdapter(estimator):" - print "Number of disagreements: ", 
len(np.where((pu_estimator.predict(X) == estimator.predict(X)) == False)[0]) - print "Number of agreements: ", len(np.where((pu_estimator.predict(X) == estimator.predict(X)) == True)[0]) + print(pu_estimator) + print() + print("Comparison of estimator and PUAdapter(estimator):") + print("Number of disagreements: ", len(np.where((pu_estimator.predict(X) == estimator.predict(X)) == False)[0])) + print("Number of agreements: ", len(np.where((pu_estimator.predict(X) == estimator.predict(X)) == True)[0])) diff --git a/src/puLearning/puAdapter.py b/src/puLearning/puAdapter.py index 66802ed..e582d71 100644 --- a/src/puLearning/puAdapter.py +++ b/src/puLearning/puAdapter.py @@ -53,7 +53,7 @@ def __fit_precomputed_kernel(self, X, y): y -- Labels associated to each example in X (Positive label: 1.0, Negative label: -1.0) """ positives = np.where(y == 1.)[0] - hold_out_size = np.ceil(len(positives) * self.hold_out_ratio) + hold_out_size = int(np.ceil(len(positives) * self.hold_out_ratio)) if len(positives) <= hold_out_size: raise('Not enough positive examples to estimate p(s=1|y=1,x). Need at least ' + str(hold_out_size + 1) + '.') @@ -95,7 +95,7 @@ def __fit_no_precomputed_kernel(self, X, y): y -- Labels associated to each feature vector in X (Positive label: 1.0, Negative label: -1.0) """ positives = np.where(y == 1.)[0] - hold_out_size = np.ceil(len(positives) * self.hold_out_ratio) + hold_out_size = int(np.ceil(len(positives) * self.hold_out_ratio)) if len(positives) <= hold_out_size: raise('Not enough positive examples to estimate p(s=1|y=1,x). 
Need at least ' + str(hold_out_size + 1) + '.') diff --git a/src/tests/__init__.py b/src/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/tests/breastCancer.py b/src/tests/breastCancer.py index 0ba58b9..6419f65 100644 --- a/src/tests/breastCancer.py +++ b/src/tests/breastCancer.py @@ -3,12 +3,12 @@ @author: Alexandre -The goal of this test is to verifiy that the PUAdapter really allows a regular estimator to +The goal of this test is to verify that the PUAdapter really allows a regular estimator to achieve better accuracy in the case where the \"negative\" examples are contaminated with a number of positive examples. Here we use the breast cancer dataset from UCI. We purposely take a few malignant examples and -assign them the bening label and consider the bening examples as being \"unlabled\". We then compare +assign them the benign label and consider the benign examples as being \"unlabled\". We then compare the performance of the estimator while using the PUAdapter and without using the PUAdapter. To asses the performance, we use the F1 score, precision and recall. @@ -18,7 +18,7 @@ """ import numpy as np import matplotlib.pyplot as plt -from puLearning.puAdapter import PUAdapter +from ..puLearning.puAdapter import PUAdapter from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import precision_recall_fscore_support @@ -45,13 +45,12 @@ def load_breast_cancer(path): if __name__ == '__main__': np.random.seed(42) - print "Loading dataset" - print - X,y = load_breast_cancer('../datasets/breast-cancer-wisconsin.data') + print("Loading dataset") + X,y = load_breast_cancer('src/datasets/breast-cancer-wisconsin.data') #Shuffle dataset - print "Shuffling dataset" - print + print("Shuffling dataset") + print() permut = np.random.permutation(len(y)) X = X[permut] y = y[permut] @@ -60,46 +59,46 @@ def load_breast_cancer(path): y[np.where(y == 2)[0]] = -1. y[np.where(y == 4)[0]] = +1. 
- print "Loaded ", len(y), " examples" - print len(np.where(y == -1.)[0])," are bening" - print len(np.where(y == +1.)[0])," are malignant" - print - + print("Loaded ", len(y), " examples") + print(len(np.where(y == -1.)[0])," are bening") + print(len(np.where(y == +1.)[0])," are malignant") + print() + #Split test/train - print "Splitting dataset in test/train sets" - print - split = 2*len(y)/3 + print("Splitting dataset in test/train sets") + print() + split = 2*len(y)//3 X_train = X[:split] y_train = y[:split] X_test = X[split:] y_test = y[split:] - print "Training set contains ", len(y_train), " examples" - print len(np.where(y_train == -1.)[0])," are bening" - print len(np.where(y_train == +1.)[0])," are malignant" - print + print("Training set contains ", len(y_train), " examples") + print(len(np.where(y_train == -1.)[0])," are benign") + print(len(np.where(y_train == +1.)[0])," are malignant") + print() pu_f1_scores = [] reg_f1_scores = [] n_sacrifice_iter = range(0, len(np.where(y_train == +1.)[0])-21, 5) for n_sacrifice in n_sacrifice_iter: #send some positives to the negative class! :) - print "PU transformation in progress." - print "Making ", n_sacrifice, " malignant examples bening." - print + print("PU transformation in progress.") + print("Making ", n_sacrifice, " malignant examples benign.") + print() y_train_pu = np.copy(y_train) pos = np.where(y_train == +1.)[0] np.random.shuffle(pos) sacrifice = pos[:n_sacrifice] y_train_pu[sacrifice] = -1. - print "PU transformation applied. We now have:" - print len(np.where(y_train_pu == -1.)[0])," are bening" - print len(np.where(y_train_pu == +1.)[0])," are malignant" - print - + print("PU transformation applied. We now have:") + print(len(np.where(y_train_pu == -1.)[0])," are benign") + print(len(np.where(y_train_pu == +1.)[0])," are malignant") + print() + #Get f1 score with pu_learning - print "PU learning in progress..." 
+ print("PU learning in progress...") estimator = RandomForestClassifier(n_estimators=100, criterion='gini', bootstrap=True, @@ -109,13 +108,13 @@ def load_breast_cancer(path): y_pred = pu_estimator.predict(X_test) precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred) pu_f1_scores.append(f1_score[1]) - print "F1 score: ", f1_score[1] - print "Precision: ", precision[1] - print "Recall: ", recall[1] - print - + print("F1 score: ", f1_score[1]) + print("Precision: ", precision[1]) + print("Recall: ", recall[1]) + print() + #Get f1 score without pu_learning - print "Regular learning in progress..." + print("Regular learning in progress...") estimator = RandomForestClassifier(n_estimators=100, bootstrap=True, n_jobs=1) @@ -123,11 +122,11 @@ def load_breast_cancer(path): y_pred = estimator.predict(X_test) precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred) reg_f1_scores.append(f1_score[1]) - print "F1 score: ", f1_score[1] - print "Precision: ", precision[1] - print "Recall: ", recall[1] - print - print + print("F1 score: ", f1_score[1]) + print("Precision: ", precision[1]) + print("Recall: ", recall[1]) + print() + print() plt.title("Random forest with/without PU learning") plt.plot(n_sacrifice_iter, pu_f1_scores, label='PU Adapted Random Forest') plt.plot(n_sacrifice_iter, reg_f1_scores, label='Random Forest')