Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Reduction/reducer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
import pandas as pd
import os
import copy

############################# this file processes all feature subsets, computes the UMAP 2-D representations and stores them
if __name__ == "__main__":
# df = pd.read_csv('./../Dataset/scaledData.csv')
base = './../Dataset'
base = './../Dataset/umap reduced'
files = [f for f in os.listdir(base) if f[0] == 'v']
for f in files:
df = pd.read_csv(os.path.join(base, f))
Expand Down
58 changes: 0 additions & 58 deletions kNNc/buildGraph/GPB.py

This file was deleted.

2 changes: 2 additions & 0 deletions kNNc/buildGraph/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import os
from sklearn.model_selection import train_test_split

############ THE PURPOSE OF THE SCRIPT IS TO BUILD THE FOLDERS FOR THE REDUCED SUBSETS CORRESPONDING
############ TO THE DIFFERENT CENTRALITY MEASURES

if __name__ == "__main__":
base = './../../Dataset'
Expand Down
16 changes: 16 additions & 0 deletions kNNc/kNNc/ZeroR.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import pandas as pd
import os

# ZeroR baseline: print the class distribution of the target column for every
# dataset in the "umap reduced" folder. The majority-class share is the
# accuracy a ZeroR (always-predict-the-majority-class) classifier achieves,
# i.e. the floor any real model must beat.

if __name__ == "__main__":
    ROOT = './../../Dataset'
    # reduced subsets folder — unused here, kept for parity with sibling scripts
    subst = os.path.join(ROOT, 'meaningfulSetsd')
    full = os.path.join(ROOT, 'umap reduced')
    # os.listdir already returns a list; the comprehension added nothing
    dsNames = os.listdir(full)
    target = 'discretized FADY'
    for n in dsNames:
        zet = pd.read_csv(os.path.join(full, n))
        # BUG FIX: use the `target` variable instead of re-hardcoding the
        # column name, so changing `target` updates the whole script.
        print(f"Value counts: {zet[target].value_counts(normalize=True)}")

Binary file added kNNc/kNNc/__pycache__/knnc.cpython-310.pyc
Binary file not shown.
Binary file modified kNNc/kNNc/__pycache__/knnc.cpython-311.pyc
Binary file not shown.
52 changes: 36 additions & 16 deletions kNNc/kNNc/knnc.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import numpy as np
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

from scipy.spatial import cKDTree

class kNNc:
def __init__(self, c, k):
Expand All @@ -11,28 +11,48 @@ def __init__(self, c, k):
self.subset = None
self.knn_classifier = KNeighborsClassifier(n_neighbors=self.k)

def fit(self, X, y, subset):
def fit(self, X_train, y_train, subset):
self.x = X_train
self.y = y_train
self.subset = subset
cols = list(self.subset.columns)
# fitting nearest neighbors
self.knn_subset = NearestNeighbors(n_neighbors=self.c).fit(self.subset)
cs = self.get_c_classes(X)
self.knn_classifier.fit(X[np.isin(y, cs)], y)
self.knn_subset = cKDTree(subset[cols[:-1]])
#NearestNeighbors(n_neighbors=self.c).fit(self.subset)
print('K-nearest neighbor classifier fits')



def get_c_classes(self, X):
distances, indices = self.knn_subset.kneighbors(X)
return np.unique(indices)
# X is a row, a single point

# getting indices of nearest neighbors
_, cindices = self.knn_subset.query(X, k=self.c)
# getting classes of neighbors on subset
classes = self.subset.loc[cindices, 'discretized FADY'].tolist()
# print(classes)
return classes

def predict(self, X_test):

#X_test = X_test[self.subset.columns]
labels = []
for index, row in X_test.iterrows():

cl = self.get_c_classes(row.values)
# fit classifier only using training examples within
# cl
self.knn_classifier.fit(self.x[np.isin(self.y, cl)], self.y[np.isin(self.y, cl)])

labels.append(self.knn_classifier.predict(np.array(row).reshape(1,-1)))

def predict(self, X):
c_classes = self.get_c_classes(X)
return self.knn_classifier.predict(X[c_classes])
# refit KNN only using samples who are instances of the c_classes
return labels

def compute_accuracy(self, X, y_true):
y_pred = self.predict(X)
return accuracy_score(y_true, y_pred)
def compute_accuracy(self, y_pred, y_true):
    """Return the fraction of predictions that match the true labels.

    Parameters
    ----------
    y_pred : array-like of predicted labels (as returned by ``predict``).
    y_true : array-like of ground-truth labels.
    """
    # BUG FIX (API convention): sklearn's accuracy_score signature is
    # (y_true, y_pred). Plain accuracy is symmetric so the numeric result
    # is unchanged, but the correct order matters as soon as keyword
    # options such as sample_weight are used, and it keeps the call
    # readable against the sklearn docs.
    return accuracy_score(y_true, y_pred)

def compute_auc(self, X, y_true):
y_scores = self.knn_classifier.predict_proba(X)[:, 1] # Assuming binary classification
return roc_auc_score(y_true, y_scores)


if __name__ == "__main__":
Expand Down
55 changes: 36 additions & 19 deletions kNNc/kNNc/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,15 @@
from knnc import kNNc
import os
from sklearn.model_selection import train_test_split
import warnings

# Set the warning filter to "ignore" to suppress all warnings
warnings.filterwarnings("ignore")


if __name__ == "__main__":
ROOT = './../../Dataset'
subst = os.path.join(ROOT, 'meaningfulSetse')
subst = os.path.join(ROOT, 'meaningfulSetsd')
full = os.path.join(ROOT, 'umap reduced')

dsNames= [f for f in os.listdir(full)]
Expand All @@ -14,29 +19,41 @@
# print(f"Reduced subset names: {redNames}")
target = 'discretized FADY'
for n in dsNames:
knnc = kNNc(c=3, k=5)
sbst = pd.read_csv(os.path.join(subst, 'r' + n))
zet = pd.read_csv(os.path.join(full, n))
features = ['Umap 1', 'Umap 2', 'discretized FADY']
print(f"File name: {n}")
#zet.drop('Unnamed: 0', axis=1, inplace=True)
# sbst.drop(['Unnamed: 0.1, Unnamed: 0'], axis=1, inplace=True)

zet = zet[features]
sbst = sbst[features]
print(zet.head())
print(sbst.head())
y = zet[target]
X = zet.drop(target, axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#print(X_train)
#print(sbst.head())
# Check if the number of features in X_train and sbst is the same
# assert X_train.shape[1] == sbst.shape[1], "Number of features in X_train and sbst must be the same"

knnc.fit(X_train.values, y_train.values, sbst.values)
print(f"Accuracy: {knnc.compute_accuracy(X_test, y_test)}")
print(f"AUC: {knnc.compute_auc(X_test, y_test)}")
# following the same train test split as for constructing the subsets
traindf = zet.iloc[:66,:]
testdf = zet.iloc[67:,:]
y_train = traindf[target]
X_train = traindf.drop(target, axis=1)
y_test = testdf[target]
X_test = testdf.drop(target, axis=1)
# print(zet.head())
# print(sbst.head())
# y = zet[target]
# X = zet.drop(target, axis=1)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
# print(X_train)

cvals = range(2,5)
for c in cvals:
for k in range(c,c+4):
print(f"Parameters (k,c): {k}, {c}")
knnc = kNNc(c=c, k=k)

#print(X_train)
#print(sbst.head())
# print(sbst.columns)
knnc.fit(X_train, y_train, sbst)
#print(X_train)
# print('\n')
y_pred = knnc.predict(X_test)
print(f"Accuracy: {knnc.compute_accuracy(y_pred, y_test)}")
# print(f"AUC: {knnc.compute_auc(X_test, y_test)}")


40 changes: 40 additions & 0 deletions kNNc/kNNc/probaBenchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import pandas as pd
import os
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Stratified-guess benchmark: for every dataset in "umap reduced", report the
# accuracy of a DummyClassifier that predicts labels at random according to
# the training-set class distribution. Any real model must beat this number.

if __name__ == "__main__":
    ROOT = './../../Dataset'
    # reduced subsets folder — unused here, kept for parity with sibling scripts
    subst = os.path.join(ROOT, 'meaningfulSetsd')
    full = os.path.join(ROOT, 'umap reduced')
    dsNames = os.listdir(full)
    target = 'discretized FADY'

    for n in dsNames:
        zet = pd.read_csv(os.path.join(full, n))
        X = zet.drop(target, axis=1)
        y = zet[target]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42)

        # BUG FIX: the previous version trained a RandomForestClassifier and
        # fed its predict_proba output to the DummyClassifier. DummyClassifier
        # ignores its X argument entirely (it only looks at y), so the forest
        # was dead computation that obscured what is being measured. Fit and
        # predict the dummy on the raw features instead.
        # Also seed the 'stratified' strategy so the reported accuracy is
        # reproducible across runs.
        dummy_classifier = DummyClassifier(strategy='stratified', random_state=42)
        dummy_classifier.fit(X_train, y_train)
        dummy_predictions = dummy_classifier.predict(X_test)

        # Evaluate the accuracy of the dummy classifier
        accuracy = accuracy_score(y_test, dummy_predictions)
        print(f"Accuracy of the dummy classifier: {accuracy:.2f}")


Loading