Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.idea/
__pycache__/
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,16 @@ PUAdapter: A tool that adapts any estimator that can output a probability to pos
It is based on: Elkan, Charles, and Keith Noto. "Learning classifiers from only positive and unlabeled data."
Proceeding of the 14th ACM SIGKDD international conference on Knowledge discovery and data mining.
ACM, 2008.

## How to run scripts

### Running puAdapter Example
```bash
python -m src.examples.puAdapterExample
```

### Running Breast Cancer Example
```bash
python -m src.tests.breastCancer
```

3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
numpy
matplotlib
sklearn
Empty file added src/__init__.py
Empty file.
Empty file added src/examples/__init__.py
Empty file.
16 changes: 7 additions & 9 deletions src/examples/puAdapterExample.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
#!/usr/bin/env python
#-*- coding:utf-8 -*-
"""
Created on Dec 21, 2012

@author: Alexandre
"""
from puLearning.puAdapter import PUAdapter
from ..puLearning.puAdapter import PUAdapter
from sklearn.svm import SVC
from sklearn.datasets import make_classification
import numpy as np
Expand All @@ -16,7 +14,7 @@
n_informative=2,
n_redundant=2,
n_repeated=0,
n_classes=2,
n_classes=2,
n_clusters_per_class=2,
weights=None,
flip_y=0.01,
Expand All @@ -37,8 +35,8 @@

pu_estimator.fit(X, y)

print pu_estimator
print
print "Comparison of estimator and PUAdapter(estimator):"
print "Number of disagreements: ", len(np.where((pu_estimator.predict(X) == estimator.predict(X)) == False)[0])
print "Number of agreements: ", len(np.where((pu_estimator.predict(X) == estimator.predict(X)) == True)[0])
print(pu_estimator)
print()
print("Comparison of estimator and PUAdapter(estimator):")
print("Number of disagreements: ", len(np.where((pu_estimator.predict(X) == estimator.predict(X)) == False)[0]))
print("Number of agreements: ", len(np.where((pu_estimator.predict(X) == estimator.predict(X)) == True)[0]))
4 changes: 2 additions & 2 deletions src/puLearning/puAdapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def __fit_precomputed_kernel(self, X, y):
y -- Labels associated to each example in X (Positive label: 1.0, Negative label: -1.0)
"""
positives = np.where(y == 1.)[0]
hold_out_size = np.ceil(len(positives) * self.hold_out_ratio)
hold_out_size = int(np.ceil(len(positives) * self.hold_out_ratio))

if len(positives) <= hold_out_size:
raise('Not enough positive examples to estimate p(s=1|y=1,x). Need at least ' + str(hold_out_size + 1) + '.')
Expand Down Expand Up @@ -95,7 +95,7 @@ def __fit_no_precomputed_kernel(self, X, y):
y -- Labels associated to each feature vector in X (Positive label: 1.0, Negative label: -1.0)
"""
positives = np.where(y == 1.)[0]
hold_out_size = np.ceil(len(positives) * self.hold_out_ratio)
hold_out_size = int(np.ceil(len(positives) * self.hold_out_ratio))

if len(positives) <= hold_out_size:
raise('Not enough positive examples to estimate p(s=1|y=1,x). Need at least ' + str(hold_out_size + 1) + '.')
Expand Down
Empty file added src/tests/__init__.py
Empty file.
79 changes: 39 additions & 40 deletions src/tests/breastCancer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@

@author: Alexandre

The goal of this test is to verifiy that the PUAdapter really allows a regular estimator to
The goal of this test is to verify that the PUAdapter really allows a regular estimator to
achieve better accuracy in the case where the \"negative\" examples are contaminated with a
number of positive examples.

Here we use the breast cancer dataset from UCI. We purposely take a few malignant examples and
assign them the bening label and consider the bening examples as being \"unlabled\". We then compare
assign them the benign label and consider the benign examples as being \"unlabeled\". We then compare
the performance of the estimator while using the PUAdapter and without using the PUAdapter. To
assess the performance, we use the F1 score, precision and recall.

Expand All @@ -18,7 +18,7 @@
"""
import numpy as np
import matplotlib.pyplot as plt
from puLearning.puAdapter import PUAdapter
from ..puLearning.puAdapter import PUAdapter
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support

Expand All @@ -45,13 +45,12 @@ def load_breast_cancer(path):
if __name__ == '__main__':
np.random.seed(42)

print "Loading dataset"
print
X,y = load_breast_cancer('../datasets/breast-cancer-wisconsin.data')
print("Loading dataset")
X,y = load_breast_cancer('src/datasets/breast-cancer-wisconsin.data')

#Shuffle dataset
print "Shuffling dataset"
print
print("Shuffling dataset")
print()
permut = np.random.permutation(len(y))
X = X[permut]
y = y[permut]
Expand All @@ -60,46 +59,46 @@ def load_breast_cancer(path):
y[np.where(y == 2)[0]] = -1.
y[np.where(y == 4)[0]] = +1.

print "Loaded ", len(y), " examples"
print len(np.where(y == -1.)[0])," are bening"
print len(np.where(y == +1.)[0])," are malignant"
print
print("Loaded ", len(y), " examples")
print(len(np.where(y == -1.)[0])," are benign")
print(len(np.where(y == +1.)[0])," are malignant")
print()

#Split test/train
print "Splitting dataset in test/train sets"
print
split = 2*len(y)/3
print("Splitting dataset in test/train sets")
print()
split = 2*len(y)//3
X_train = X[:split]
y_train = y[:split]
X_test = X[split:]
y_test = y[split:]

print "Training set contains ", len(y_train), " examples"
print len(np.where(y_train == -1.)[0])," are bening"
print len(np.where(y_train == +1.)[0])," are malignant"
print
print("Training set contains ", len(y_train), " examples")
print(len(np.where(y_train == -1.)[0])," are benign")
print(len(np.where(y_train == +1.)[0])," are malignant")
print()

pu_f1_scores = []
reg_f1_scores = []
n_sacrifice_iter = range(0, len(np.where(y_train == +1.)[0])-21, 5)
for n_sacrifice in n_sacrifice_iter:
#send some positives to the negative class! :)
print "PU transformation in progress."
print "Making ", n_sacrifice, " malignant examples bening."
print
print("PU transformation in progress.")
print("Making ", n_sacrifice, " malignant examples benign.")
print()
y_train_pu = np.copy(y_train)
pos = np.where(y_train == +1.)[0]
np.random.shuffle(pos)
sacrifice = pos[:n_sacrifice]
y_train_pu[sacrifice] = -1.

print "PU transformation applied. We now have:"
print len(np.where(y_train_pu == -1.)[0])," are bening"
print len(np.where(y_train_pu == +1.)[0])," are malignant"
print
print("PU transformation applied. We now have:")
print(len(np.where(y_train_pu == -1.)[0])," are benign")
print(len(np.where(y_train_pu == +1.)[0])," are malignant")
print()

#Get f1 score with pu_learning
print "PU learning in progress..."
print("PU learning in progress...")
estimator = RandomForestClassifier(n_estimators=100,
criterion='gini',
bootstrap=True,
Expand All @@ -109,25 +108,25 @@ def load_breast_cancer(path):
y_pred = pu_estimator.predict(X_test)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)
pu_f1_scores.append(f1_score[1])
print "F1 score: ", f1_score[1]
print "Precision: ", precision[1]
print "Recall: ", recall[1]
print
print("F1 score: ", f1_score[1])
print("Precision: ", precision[1])
print("Recall: ", recall[1])
print()

#Get f1 score without pu_learning
print "Regular learning in progress..."
print("Regular learning in progress...")
estimator = RandomForestClassifier(n_estimators=100,
bootstrap=True,
n_jobs=1)
estimator.fit(X_train,y_train_pu)
y_pred = estimator.predict(X_test)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)
reg_f1_scores.append(f1_score[1])
print "F1 score: ", f1_score[1]
print "Precision: ", precision[1]
print "Recall: ", recall[1]
print
print
print("F1 score: ", f1_score[1])
print("Precision: ", precision[1])
print("Recall: ", recall[1])
print()
print()
plt.title("Random forest with/without PU learning")
plt.plot(n_sacrifice_iter, pu_f1_scores, label='PU Adapted Random Forest')
plt.plot(n_sacrifice_iter, reg_f1_scores, label='Random Forest')
Expand Down