From cd2f0ffcf6b5793c75b5749a90c39318c694834c Mon Sep 17 00:00:00 2001 From: Mariko Wakabayashi Date: Mon, 12 Feb 2018 17:04:22 -0600 Subject: [PATCH 1/4] Can run with python 3 --- src/__init__.py | 0 src/{examples => }/puAdapterExample.py | 14 ++--- src/puLearning/puAdapter.py | 4 +- src/tests/__init__.py | 0 src/tests/breastCancer.py | 77 +++++++++++++------------- 5 files changed, 47 insertions(+), 48 deletions(-) create mode 100644 src/__init__.py rename src/{examples => }/puAdapterExample.py (74%) create mode 100644 src/tests/__init__.py diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/examples/puAdapterExample.py b/src/puAdapterExample.py similarity index 74% rename from src/examples/puAdapterExample.py rename to src/puAdapterExample.py index e8dd983..95eb947 100644 --- a/src/examples/puAdapterExample.py +++ b/src/puAdapterExample.py @@ -3,7 +3,7 @@ """ Created on Dec 21, 2012 -@author: Alexandre +@author: Alexandres """ from puLearning.puAdapter import PUAdapter from sklearn.svm import SVC @@ -16,7 +16,7 @@ n_informative=2, n_redundant=2, n_repeated=0, - n_classes=2, + n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01, @@ -37,8 +37,8 @@ pu_estimator.fit(X, y) - print pu_estimator - print - print "Comparison of estimator and PUAdapter(estimator):" - print "Number of disagreements: ", len(np.where((pu_estimator.predict(X) == estimator.predict(X)) == False)[0]) - print "Number of agreements: ", len(np.where((pu_estimator.predict(X) == estimator.predict(X)) == True)[0]) + print(pu_estimator) + print() + print("Comparison of estimator and PUAdapter(estimator):") + print("Number of disagreements: ", len(np.where((pu_estimator.predict(X) == estimator.predict(X)) == False)[0])) + print("Number of agreements: ", len(np.where((pu_estimator.predict(X) == estimator.predict(X)) == True)[0])) diff --git a/src/puLearning/puAdapter.py b/src/puLearning/puAdapter.py index 66802ed..e582d71 100644 --- 
a/src/puLearning/puAdapter.py +++ b/src/puLearning/puAdapter.py @@ -53,7 +53,7 @@ def __fit_precomputed_kernel(self, X, y): y -- Labels associated to each example in X (Positive label: 1.0, Negative label: -1.0) """ positives = np.where(y == 1.)[0] - hold_out_size = np.ceil(len(positives) * self.hold_out_ratio) + hold_out_size = int(np.ceil(len(positives) * self.hold_out_ratio)) if len(positives) <= hold_out_size: raise('Not enough positive examples to estimate p(s=1|y=1,x). Need at least ' + str(hold_out_size + 1) + '.') @@ -95,7 +95,7 @@ def __fit_no_precomputed_kernel(self, X, y): y -- Labels associated to each feature vector in X (Positive label: 1.0, Negative label: -1.0) """ positives = np.where(y == 1.)[0] - hold_out_size = np.ceil(len(positives) * self.hold_out_ratio) + hold_out_size = int(np.ceil(len(positives) * self.hold_out_ratio)) if len(positives) <= hold_out_size: raise('Not enough positive examples to estimate p(s=1|y=1,x). Need at least ' + str(hold_out_size + 1) + '.') diff --git a/src/tests/__init__.py b/src/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/tests/breastCancer.py b/src/tests/breastCancer.py index 0ba58b9..86aabd7 100644 --- a/src/tests/breastCancer.py +++ b/src/tests/breastCancer.py @@ -8,7 +8,7 @@ number of positive examples. Here we use the breast cancer dataset from UCI. We purposely take a few malignant examples and -assign them the bening label and consider the bening examples as being \"unlabled\". We then compare +assign them the benign label and consider the benign examples as being \"unlabled\". We then compare the performance of the estimator while using the PUAdapter and without using the PUAdapter. To asses the performance, we use the F1 score, precision and recall. 
@@ -18,7 +18,7 @@ """ import numpy as np import matplotlib.pyplot as plt -from puLearning.puAdapter import PUAdapter +from ..puLearning.puAdapter import PUAdapter from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import precision_recall_fscore_support @@ -45,13 +45,12 @@ def load_breast_cancer(path): if __name__ == '__main__': np.random.seed(42) - print "Loading dataset" - print - X,y = load_breast_cancer('../datasets/breast-cancer-wisconsin.data') + print("Loading dataset") + X,y = load_breast_cancer('src/datasets/breast-cancer-wisconsin.data') #Shuffle dataset - print "Shuffling dataset" - print + print("Shuffling dataset") + print() permut = np.random.permutation(len(y)) X = X[permut] y = y[permut] @@ -60,46 +59,46 @@ def load_breast_cancer(path): y[np.where(y == 2)[0]] = -1. y[np.where(y == 4)[0]] = +1. - print "Loaded ", len(y), " examples" - print len(np.where(y == -1.)[0])," are bening" - print len(np.where(y == +1.)[0])," are malignant" - print - + print("Loaded ", len(y), " examples") + print(len(np.where(y == -1.)[0])," are bening") + print(len(np.where(y == +1.)[0])," are malignant") + print() + #Split test/train - print "Splitting dataset in test/train sets" - print - split = 2*len(y)/3 + print("Splitting dataset in test/train sets") + print() + split = 2*len(y)//3 X_train = X[:split] y_train = y[:split] X_test = X[split:] y_test = y[split:] - print "Training set contains ", len(y_train), " examples" - print len(np.where(y_train == -1.)[0])," are bening" - print len(np.where(y_train == +1.)[0])," are malignant" - print + print("Training set contains ", len(y_train), " examples") + print(len(np.where(y_train == -1.)[0])," are benign") + print(len(np.where(y_train == +1.)[0])," are malignant") + print() pu_f1_scores = [] reg_f1_scores = [] n_sacrifice_iter = range(0, len(np.where(y_train == +1.)[0])-21, 5) for n_sacrifice in n_sacrifice_iter: #send some positives to the negative class! :) - print "PU transformation in progress." 
- print "Making ", n_sacrifice, " malignant examples bening." - print + print("PU transformation in progress.") + print("Making ", n_sacrifice, " malignant examples benign.") + print() y_train_pu = np.copy(y_train) pos = np.where(y_train == +1.)[0] np.random.shuffle(pos) sacrifice = pos[:n_sacrifice] y_train_pu[sacrifice] = -1. - print "PU transformation applied. We now have:" - print len(np.where(y_train_pu == -1.)[0])," are bening" - print len(np.where(y_train_pu == +1.)[0])," are malignant" - print - + print("PU transformation applied. We now have:") + print(len(np.where(y_train_pu == -1.)[0])," are benign") + print(len(np.where(y_train_pu == +1.)[0])," are malignant") + print() + #Get f1 score with pu_learning - print "PU learning in progress..." + print("PU learning in progress...") estimator = RandomForestClassifier(n_estimators=100, criterion='gini', bootstrap=True, @@ -109,13 +108,13 @@ def load_breast_cancer(path): y_pred = pu_estimator.predict(X_test) precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred) pu_f1_scores.append(f1_score[1]) - print "F1 score: ", f1_score[1] - print "Precision: ", precision[1] - print "Recall: ", recall[1] - print - + print("F1 score: ", f1_score[1]) + print("Precision: ", precision[1]) + print("Recall: ", recall[1]) + print() + #Get f1 score without pu_learning - print "Regular learning in progress..." 
+ print("Regular learning in progress...") estimator = RandomForestClassifier(n_estimators=100, bootstrap=True, n_jobs=1) @@ -123,11 +122,11 @@ def load_breast_cancer(path): y_pred = estimator.predict(X_test) precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred) reg_f1_scores.append(f1_score[1]) - print "F1 score: ", f1_score[1] - print "Precision: ", precision[1] - print "Recall: ", recall[1] - print - print + print("F1 score: ", f1_score[1]) + print("Precision: ", precision[1]) + print("Recall: ", recall[1]) + print() + print() plt.title("Random forest with/without PU learning") plt.plot(n_sacrifice_iter, pu_f1_scores, label='PU Adapted Random Forest') plt.plot(n_sacrifice_iter, reg_f1_scores, label='Random Forest') From a2e98acec44f0a364c73f834396e8514ca438b2d Mon Sep 17 00:00:00 2001 From: Mariko Wakabayashi Date: Mon, 12 Feb 2018 17:04:31 -0600 Subject: [PATCH 2/4] Add gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a047a94 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.idea/ +__pycache__/ \ No newline at end of file From d91e927db26442a7ca0dde7390e1970625eff782 Mon Sep 17 00:00:00 2001 From: Mariko Wakabayashi Date: Mon, 12 Feb 2018 17:14:17 -0600 Subject: [PATCH 3/4] Include instructions to run script on readme --- README.md | 13 +++++++++++++ src/examples/__init__.py | 0 src/{ => examples}/puAdapterExample.py | 4 +--- src/tests/breastCancer.py | 2 +- 4 files changed, 15 insertions(+), 4 deletions(-) create mode 100644 src/examples/__init__.py rename src/{ => examples}/puAdapterExample.py (94%) diff --git a/README.md b/README.md index 33e5d22..35badeb 100644 --- a/README.md +++ b/README.md @@ -10,3 +10,16 @@ PUAdapter: A tool that adapts any estimator that can output a probability to pos It is based on: Elkan, Charles, and Keith Noto. "Learning classifiers from only positive and unlabeled data." 
Proceeding of the 14th ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2008. + +## How to run scripts + +### Running puAdapter Example +```bash +python -m src.examples.puAdapterExample +``` + +### Running Breast Cancer Example +```bash +python -m src.tests.breastCancer +``` + diff --git a/src/examples/__init__.py b/src/examples/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/puAdapterExample.py b/src/examples/puAdapterExample.py similarity index 94% rename from src/puAdapterExample.py rename to src/examples/puAdapterExample.py index 95eb947..29ce162 100644 --- a/src/puAdapterExample.py +++ b/src/examples/puAdapterExample.py @@ -1,11 +1,9 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- """ Created on Dec 21, 2012 @author: Alexandres """ -from puLearning.puAdapter import PUAdapter +from ..puLearning.puAdapter import PUAdapter from sklearn.svm import SVC from sklearn.datasets import make_classification import numpy as np diff --git a/src/tests/breastCancer.py b/src/tests/breastCancer.py index 86aabd7..6419f65 100644 --- a/src/tests/breastCancer.py +++ b/src/tests/breastCancer.py @@ -3,7 +3,7 @@ @author: Alexandre -The goal of this test is to verifiy that the PUAdapter really allows a regular estimator to +The goal of this test is to verify that the PUAdapter really allows a regular estimator to achieve better accuracy in the case where the \"negative\" examples are contaminated with a number of positive examples. 
From b37650ccf5148e546268b8bee2571d8ede61e225 Mon Sep 17 00:00:00 2001 From: Mariko Wakabayashi Date: Mon, 12 Feb 2018 17:17:55 -0600 Subject: [PATCH 4/4] Add requirements txt --- requirements.txt | 3 +++ src/examples/puAdapterExample.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..6a0fc4e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +numpy +matplotlib +scikit-learn diff --git a/src/examples/puAdapterExample.py b/src/examples/puAdapterExample.py index 29ce162..ae8417a 100644 --- a/src/examples/puAdapterExample.py +++ b/src/examples/puAdapterExample.py @@ -1,7 +1,7 @@ """ Created on Dec 21, 2012 -@author: Alexandres +@author: Alexandre """ from ..puLearning.puAdapter import PUAdapter from sklearn.svm import SVC