Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.idea/
__pycache__/
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,16 @@ PUAdapter: A tool that adapts any estimator that can output a probability to pos
It is based on: Elkan, Charles, and Keith Noto. "Learning classifiers from only positive and unlabeled data."
Proceeding of the 14th ACM SIGKDD international conference on Knowledge discovery and data mining.
ACM, 2008.

## How to run scripts

### Running puAdapter Example
```bash
python -m src.examples.puAdapterExample
```

### Running Breast Cancer Example
```bash
python -m src.tests.breastCancer
```

3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
numpy
matplotlib
sklearn
Empty file added src/__init__.py
Empty file.
Empty file added src/examples/__init__.py
Empty file.
16 changes: 7 additions & 9 deletions src/examples/puAdapterExample.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
#!/usr/bin/env python
#-*- coding:utf-8 -*-
"""
Created on Dec 21, 2012

@author: Alexandre
"""
from puLearning.puAdapter import PUAdapter
from ..puLearning.puAdapter import PUAdapter
from sklearn.svm import SVC
from sklearn.datasets import make_classification
import numpy as np
Expand All @@ -16,7 +14,7 @@
n_informative=2,
n_redundant=2,
n_repeated=0,
n_classes=2,
n_classes=2,
n_clusters_per_class=2,
weights=None,
flip_y=0.01,
Expand All @@ -37,8 +35,8 @@

pu_estimator.fit(X, y)

print pu_estimator
print
print "Comparison of estimator and PUAdapter(estimator):"
print "Number of disagreements: ", len(np.where((pu_estimator.predict(X) == estimator.predict(X)) == False)[0])
print "Number of agreements: ", len(np.where((pu_estimator.predict(X) == estimator.predict(X)) == True)[0])
print(pu_estimator)
print()
print("Comparison of estimator and PUAdapter(estimator):")
print("Number of disagreements: ", len(np.where((pu_estimator.predict(X) == estimator.predict(X)) == False)[0]))
print("Number of agreements: ", len(np.where((pu_estimator.predict(X) == estimator.predict(X)) == True)[0]))
4 changes: 2 additions & 2 deletions src/puLearning/puAdapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def __fit_precomputed_kernel(self, X, y):
y -- Labels associated to each example in X (Positive label: 1.0, Negative label: -1.0)
"""
positives = np.where(y == 1.)[0]
hold_out_size = np.ceil(len(positives) * self.hold_out_ratio)
hold_out_size = int(np.ceil(len(positives) * self.hold_out_ratio))

if len(positives) <= hold_out_size:
raise('Not enough positive examples to estimate p(s=1|y=1,x). Need at least ' + str(hold_out_size + 1) + '.')
Expand Down Expand Up @@ -95,7 +95,7 @@ def __fit_no_precomputed_kernel(self, X, y):
y -- Labels associated to each feature vector in X (Positive label: 1.0, Negative label: -1.0)
"""
positives = np.where(y == 1.)[0]
hold_out_size = np.ceil(len(positives) * self.hold_out_ratio)
hold_out_size = int(np.ceil(len(positives) * self.hold_out_ratio))

if len(positives) <= hold_out_size:
raise('Not enough positive examples to estimate p(s=1|y=1,x). Need at least ' + str(hold_out_size + 1) + '.')
Expand Down
Empty file added src/tests/__init__.py
Empty file.
79 changes: 39 additions & 40 deletions src/tests/breastCancer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@

@author: Alexandre

The goal of this test is to verifiy that the PUAdapter really allows a regular estimator to
The goal of this test is to verify that the PUAdapter really allows a regular estimator to
achieve better accuracy in the case where the \"negative\" examples are contaminated with a
number of positive examples.

Here we use the breast cancer dataset from UCI. We purposely take a few malignant examples and
assign them the bening label and consider the bening examples as being \"unlabled\". We then compare
assign them the benign label and consider the benign examples as being \"unlabeled\". We then compare
the performance of the estimator while using the PUAdapter and without using the PUAdapter. To
assess the performance, we use the F1 score, precision and recall.

Expand All @@ -18,7 +18,7 @@
"""
import numpy as np
import matplotlib.pyplot as plt
from puLearning.puAdapter import PUAdapter
from ..puLearning.puAdapter import PUAdapter
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support

Expand All @@ -45,13 +45,12 @@ def load_breast_cancer(path):
if __name__ == '__main__':
np.random.seed(42)

print "Loading dataset"
print
X,y = load_breast_cancer('../datasets/breast-cancer-wisconsin.data')
print("Loading dataset")
X,y = load_breast_cancer('src/datasets/breast-cancer-wisconsin.data')

#Shuffle dataset
print "Shuffling dataset"
print
print("Shuffling dataset")
print()
permut = np.random.permutation(len(y))
X = X[permut]
y = y[permut]
Expand All @@ -60,46 +59,46 @@ def load_breast_cancer(path):
y[np.where(y == 2)[0]] = -1.
y[np.where(y == 4)[0]] = +1.

print "Loaded ", len(y), " examples"
print len(np.where(y == -1.)[0])," are bening"
print len(np.where(y == +1.)[0])," are malignant"
print
print("Loaded ", len(y), " examples")
print(len(np.where(y == -1.)[0])," are benign")
print(len(np.where(y == +1.)[0])," are malignant")
print()

#Split test/train
print "Splitting dataset in test/train sets"
print
split = 2*len(y)/3
print("Splitting dataset in test/train sets")
print()
split = 2*len(y)//3
X_train = X[:split]
y_train = y[:split]
X_test = X[split:]
y_test = y[split:]

print "Training set contains ", len(y_train), " examples"
print len(np.where(y_train == -1.)[0])," are bening"
print len(np.where(y_train == +1.)[0])," are malignant"
print
print("Training set contains ", len(y_train), " examples")
print(len(np.where(y_train == -1.)[0])," are benign")
print(len(np.where(y_train == +1.)[0])," are malignant")
print()

pu_f1_scores = []
reg_f1_scores = []
n_sacrifice_iter = range(0, len(np.where(y_train == +1.)[0])-21, 5)
for n_sacrifice in n_sacrifice_iter:
#send some positives to the negative class! :)
print "PU transformation in progress."
print "Making ", n_sacrifice, " malignant examples bening."
print
print("PU transformation in progress.")
print("Making ", n_sacrifice, " malignant examples benign.")
print()
y_train_pu = np.copy(y_train)
pos = np.where(y_train == +1.)[0]
np.random.shuffle(pos)
sacrifice = pos[:n_sacrifice]
y_train_pu[sacrifice] = -1.

print "PU transformation applied. We now have:"
print len(np.where(y_train_pu == -1.)[0])," are bening"
print len(np.where(y_train_pu == +1.)[0])," are malignant"
print
print("PU transformation applied. We now have:")
print(len(np.where(y_train_pu == -1.)[0])," are benign")
print(len(np.where(y_train_pu == +1.)[0])," are malignant")
print()

#Get f1 score with pu_learning
print "PU learning in progress..."
print("PU learning in progress...")
estimator = RandomForestClassifier(n_estimators=100,
criterion='gini',
bootstrap=True,
Expand All @@ -109,25 +108,25 @@ def load_breast_cancer(path):
y_pred = pu_estimator.predict(X_test)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)
pu_f1_scores.append(f1_score[1])
print "F1 score: ", f1_score[1]
print "Precision: ", precision[1]
print "Recall: ", recall[1]
print
print("F1 score: ", f1_score[1])
print("Precision: ", precision[1])
print("Recall: ", recall[1])
print()

#Get f1 score without pu_learning
print "Regular learning in progress..."
print("Regular learning in progress...")
estimator = RandomForestClassifier(n_estimators=100,
bootstrap=True,
n_jobs=1)
estimator.fit(X_train,y_train_pu)
y_pred = estimator.predict(X_test)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)
reg_f1_scores.append(f1_score[1])
print "F1 score: ", f1_score[1]
print "Precision: ", precision[1]
print "Recall: ", recall[1]
print
print
print("F1 score: ", f1_score[1])
print("Precision: ", precision[1])
print("Recall: ", recall[1])
print()
print()
plt.title("Random forest with/without PU learning")
plt.plot(n_sacrifice_iter, pu_f1_scores, label='PU Adapted Random Forest')
plt.plot(n_sacrifice_iter, reg_f1_scores, label='Random Forest')
Expand Down