Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 145 additions & 1 deletion milwrap/countbase.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import numpy as np
import pandas as pd
import scipy.stats

import sklearn
import sklearn.multiclass

def get_order_based_initial_bag_labels(
n_instances_for_each_bag_and_class: np.ndarray) -> List[int]:
Expand Down Expand Up @@ -136,3 +137,146 @@ def fit(
break

return self.classifier, y

class MilCountBasedBinaryClassLearner:
    """Count-based multiple-instance learner for a single binary class.

    Each bag carries a lower and an upper bound on how many of its instances
    are positive.  Training alternates between (a) fitting the wrapped
    classifier on the current instance-level labels and (b) flipping the
    labels of the most/least confidently scored instances so that every
    bag's positive count moves back inside its [lower, upper) bounds.
    """

    def __init__(self, classifier):
        # Any sklearn-style estimator exposing fit / predict / predict_proba.
        self.classifier = classifier

    def fit(
            self,
            bags,
            lower_threshold,
            upper_threshold,
            max_iter=10,
            initial_y=None,
            debug_true_y=None,
            seed=None,
            debug=True):
        """Run the iterative count-based MIL procedure.

        Args:
            bags: sequence of (n_i, d) instance arrays, one per bag.
            lower_threshold: per-bag lower bound on the positive count.
            upper_threshold: per-bag upper bound (treated as exclusive:
                a count equal to it triggers relabeling).
            max_iter: maximum number of fit/relabel iterations.
            initial_y: optional per-bag initial instance labels; when None,
                whole bags are seeded positive when their lower bound exceeds
                a central value of all lower bounds.
            debug_true_y: optional ground-truth instance labels, used only
                for printed accuracy diagnostics.
            seed: optional seed passed to ``random.seed``.
            debug: when True, print per-iteration diagnostics.

        Returns:
            (classifier, y): the fitted classifier and the final per-bag
            instance label arrays.
        """
        if seed is not None:
            random.seed(seed)

        # Normalize to arrays so per-bag indexing works and the vectorized
        # debug arithmetic below does not break on plain lists.
        lower_threshold = np.asarray(lower_threshold)
        upper_threshold = np.asarray(upper_threshold)

        # number of instances in each bag
        n_list = [len(bag) for bag in bags]

        # initialize y
        if initial_y is None:  # fixed: `not initial_y` raises on ndarray input
            # Pick a central value of the lower bounds; fall back to the mean
            # when the median is degenerate (equal to the min or the max).
            if np.median(lower_threshold) == np.max(lower_threshold) \
                    or np.median(lower_threshold) == np.min(lower_threshold):
                threshold = np.mean(lower_threshold)
            else:
                threshold = np.median(lower_threshold)
            y = [np.repeat(int(bag_lower > threshold), n_instances)
                 for bag_lower, n_instances in zip(lower_threshold, n_list)]
        else:
            y = initial_y

        for i_iter in range(max_iter):

            # fit on the flattened instance-level problem
            flatten_bags = np.vstack(bags)
            flatten_y = np.hstack(y)
            self.classifier.fit(flatten_bags, flatten_y)

            # Positive-class score per instance.  predict_proba returns an
            # (n, n_classes) matrix; column 1 is the positive class for a
            # binary classifier (assumes classes_ == [0, 1] — the previous
            # code argsorted the full 2-D matrix, which relabeled only
            # indices 0 and 1 of each bag).
            fs = [self.classifier.predict_proba(bag)[:, 1] for bag in bags]
            y = [self.classifier.predict(bag) for bag in bags]
            flatten_original_y = np.hstack(y)

            # enforce the count bounds bag by bag
            has_changed = False
            for i_bag in range(len(bags)):
                class_count = np.sum(y[i_bag])
                if class_count < lower_threshold[i_bag]:
                    # Too few positives: flip the highest-scoring instances
                    # (negated scores so argsort is descending).
                    argsorted_with_score = np.argsort(-fs[i_bag])
                    indice_should_be_positive = \
                        argsorted_with_score[:int(lower_threshold[i_bag])]
                    y[i_bag][indice_should_be_positive] = 1
                    has_changed = True
                elif upper_threshold[i_bag] <= class_count:
                    # Too many positives: flip the lowest-scoring instances,
                    # leaving at most upper_threshold positives.
                    # NOTE(review): when the count equals the upper bound and
                    # the top-scored instances stay positive, the bound may
                    # still be hit next iteration — confirm intended semantics.
                    argsorted_with_score = np.argsort(fs[i_bag])
                    indice_should_be_negative = \
                        argsorted_with_score[:(n_list[i_bag] - int(upper_threshold[i_bag]))]
                    y[i_bag][indice_should_be_negative] = 0
                    has_changed = True

            if debug:
                print("iter:", i_iter)
                predicted_count = np.array([np.sum(b) for b in y])
                print("false negative instances")
                # negative values: how far each bag falls short of its lower bound
                count_false_negative = np.minimum(predicted_count - lower_threshold, 0)
                print(np.sum(count_false_negative))
                print("false positive instances")
                count_false_positive = np.maximum(predicted_count - upper_threshold, 0)
                print(np.sum(count_false_positive))
                print("num changes instances")
                num_changes_instance = np.sum(np.hstack(y) != flatten_original_y)
                print(num_changes_instance)
                if debug_true_y is not None:
                    print("instance unit accuracy")
                    print(np.mean(np.hstack([self.classifier.predict(bag) for bag in bags]) == np.hstack(debug_true_y)))
                    print("instance unit accuracy (label adjusted)")
                    print(np.mean(np.hstack(y) == np.hstack(debug_true_y)))
                print("-----")

            # converged: no bag violated its bounds this round
            if not has_changed:
                break

        return self.classifier, y


class OneVsRestMilCountBasedMultiClassLearner:
    """One-vs-rest multi-class wrapper around the count-based binary learner.

    Holds one binary classifier per class and fits each one independently on
    its own column of the per-bag count bounds.
    """

    def __init__(self, binary_classifiers):
        # One sklearn-style binary classifier per class; must be non-empty.
        assert len(binary_classifiers) > 0
        self.binary_classifiers = binary_classifiers

    def fit(
            self,
            bags,
            lower_threshold,
            upper_threshold,
            n_classes,
            max_iter=10,
            initial_y=None,
            seed=None,
            debug=True):
        """Fit each per-class binary learner on its column of the bounds.

        Args:
            bags: sequence of (n_i, d) instance arrays.
            lower_threshold: (n_bags, n_classes) lower count bounds.
            upper_threshold: (n_bags, n_classes) upper count bounds.
            n_classes: number of classes; must equal len(binary_classifiers).
            max_iter: iteration budget forwarded to each binary learner.
            initial_y: optional per-bag multi-class instance labels;
                binarized per class before each binary fit.
            seed: optional seed passed to ``random.seed``.
            debug: forwarded to each binary learner's fit (was silently
                ignored before).

        Returns:
            The list of fitted binary classifiers.
        """
        if seed is not None:
            random.seed(seed)

        assert len(self.binary_classifiers) == n_classes, \
            (self.binary_classifiers, n_classes)

        for i_class, binary_classifier in enumerate(self.binary_classifiers):
            if initial_y is not None:
                # Binarize bag by bag: np.array on the whole ragged list of
                # per-bag label arrays would fail (or build an object array).
                binary_initial_y = [
                    (np.asarray(bag_y) == i_class).astype(int)
                    for bag_y in initial_y
                ]
            else:
                binary_initial_y = None

            learner = MilCountBasedBinaryClassLearner(binary_classifier)
            learner.fit(
                bags=bags,
                lower_threshold=lower_threshold[:, i_class],
                upper_threshold=upper_threshold[:, i_class],
                max_iter=max_iter,
                initial_y=binary_initial_y,
                debug=debug,
            )
            self.binary_classifiers[i_class] = learner.classifier

        return self.binary_classifiers



def convert_binary_classifiers_to_ovr_multiclassifier(
        n_classes, n_features, classifiers):
    """Pack fitted binary classifiers into a sklearn OneVsRestClassifier.

    sklearn offers no public way to assemble a fitted OneVsRestClassifier
    from pre-fitted estimators, so the wrapper is fitted on throwaway data
    covering every class (which sets up label binarization and the
    estimators_ list) and the real classifiers are then swapped in.

    Args:
        n_classes: number of classes; dummy labels cover 0..n_classes-1.
        n_features: feature dimension of the dummy design matrix.
        classifiers: fitted binary classifiers, one per class, all of the
            same type as classifiers[0].

    Returns:
        A OneVsRestClassifier whose estimators_ are exactly ``classifiers``.
    """
    clf = sklearn.multiclass.OneVsRestClassifier(classifiers[0])

    # Dummy data with every class present so the internal LabelBinarizer
    # sees all labels.  A local seeded generator avoids disturbing the
    # global numpy RNG state; the drawn values never influence the returned
    # model because the fitted estimators are replaced below.
    n_samples_per_class = 10
    rng = np.random.default_rng(0)
    dummy_X = rng.normal(size=(n_samples_per_class * n_classes, n_features))
    dummy_y = np.repeat(np.arange(n_classes), n_samples_per_class)

    # fit to dummy data, then install the real estimators
    clf.fit(dummy_X, dummy_y)
    for i_estimator, estimator in enumerate(clf.estimators_):
        # Sanity check: the clone sklearn fitted must be the same type as
        # the replacement we are about to install.
        assert type(estimator) is type(classifiers[i_estimator])
        clf.estimators_[i_estimator] = classifiers[i_estimator]
    return clf
71 changes: 70 additions & 1 deletion tests/test_countbase.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd
import numpy as np

from milwrap.countbase import MilCountBasedMultiClassLearner, \
from milwrap.countbase import MilCountBasedBinaryClassLearner, MilCountBasedMultiClassLearner, OneVsRestMilCountBasedMultiClassLearner, convert_binary_classifiers_to_ovr_multiclassifier, \
get_order_based_initial_bag_labels, \
get_order_based_initial_y

Expand Down Expand Up @@ -230,3 +230,72 @@ def test_fit_initialize_externally(self):
print(np.mean(clf.predict(np.vstack(bags)) == np.hstack(class_labels_of_instance_in_bags)))
print("----")


def test_fit_ovr(self):
    """Smoke test for the one-vs-rest count-based MIL pipeline.

    Generates bags of Gaussian instances, coarsens the true per-class
    counts into [lower, upper) interval supervision, fits one decision
    tree per class via OneVsRestMilCountBasedMultiClassLearner, converts
    the result into a sklearn OneVsRestClassifier, and prints its
    instance-level accuracy.  No assertions: the failure mode is an
    exception during the pipeline.
    """

    np.random.seed(123)

    # problem size
    n_classes = 15
    n_bags = 100
    n_max_instance_in_one_bag = 1000
    # random bag sizes, then ground-truth instance labels for each bag
    n_instances_of_each_bags = [np.random.randint(low=0, high=n_max_instance_in_one_bag) for _ in range(n_bags)]
    class_labels_of_instance_in_bags = generate_instance(n_classes, n_instances_of_each_bags)
    count_each_class_of_instance_in_bags = [
        pd.Series(x).value_counts().to_dict() for x in class_labels_of_instance_in_bags
    ]
    # (n_bags, n_classes) matrix of true per-class counts; classes absent
    # from a bag come out as NaN and are zeroed.
    count_each_class_of_instance_in_bags_matrix = \
        pd.DataFrame(count_each_class_of_instance_in_bags)[list(range(n_classes))].values
    count_each_class_of_instance_in_bags_matrix = np.nan_to_num(count_each_class_of_instance_in_bags_matrix)
    # Coarsen each true count into the enclosing [division_i, division_i+1)
    # interval — this is the only supervision the learner receives.
    lower_threshold = np.zeros_like(count_each_class_of_instance_in_bags_matrix)
    upper_threshold = np.zeros_like(count_each_class_of_instance_in_bags_matrix)
    divisions = [0, 50, 100, 200, 1000, n_max_instance_in_one_bag]
    for i_bag in range(n_bags):
        for i_class in range(n_classes):
            positive_count = count_each_class_of_instance_in_bags_matrix[i_bag, i_class]
            for i_division in range(len(divisions)-1):
                if divisions[i_division] <= positive_count and positive_count < divisions[i_division+1]:
                    lower_threshold[i_bag, i_class] = divisions[i_division]
                    upper_threshold[i_bag, i_class] = divisions[i_division+1]

    # Gaussian instance features: one random mean per class, shared
    # spherical covariance.
    n_features = 7
    x_min = 0
    x_max = 100
    cov_diag = 0.1*40**2

    means_of_classes = [np.random.uniform(low=x_min, high=x_max, size=n_features) for _ in range(n_classes)]
    covs_of_classes = [np.eye(n_features)*cov_diag for _ in range(n_classes)]
    bags = [
        np.vstack([
            np.random.multivariate_normal(
                means_of_classes[class_label],
                covs_of_classes[class_label],
                size=1) for class_label in class_labels_of_instance_in_bag
        ]) for class_labels_of_instance_in_bag in class_labels_of_instance_in_bags
    ]

    # from sklearn.ensemble import RandomForestClassifier
    # clf = RandomForestClassifier()

    # one binary base classifier per class
    from sklearn.tree import DecisionTreeClassifier
    classifiers = [DecisionTreeClassifier(min_samples_leaf=10) for _ in range(n_classes)]

    # from sklearn.linear_model import LogisticRegression
    # clf = LogisticRegression()

    # from sklearn.neural_network import MLPClassifier
    # clf = MLPClassifier(alpha=1, max_iter=10)

    learner = OneVsRestMilCountBasedMultiClassLearner(classifiers)
    classifiers = learner.fit(
        bags,
        lower_threshold,
        upper_threshold,
        n_classes,
        max_iter=10)

    # pack the fitted binary classifiers into one multi-class model
    clf = convert_binary_classifiers_to_ovr_multiclassifier(
        n_classes, n_features, classifiers)

    print("MIL instance unit accuracy")
    print(np.mean(clf.predict(np.vstack(bags)) == np.hstack(class_labels_of_instance_in_bags)))
    print("----")