Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion model-inference/decisionTree/experiments/config.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"num_trees": 1600,
"num_trees": 500,
"depth": 8,
"pgsqlconfig": {
"host": "localhost",
Expand Down Expand Up @@ -106,6 +106,23 @@
"table": "epsilon",
"header": true
},
"epsilon_sparse": {
"num_features": 2000,
"rows": 500000,
"batch_size": 100000,
"query_size": 100000,
"type": "classification",
"desc": "train 400000 test 100000",
"info": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#epsilon",
"query": "SELECT * from epsilon",
"create": "",
"train": 0.8,
"test": 0.2,
"y_col": "label",
"filename": "epsilon.pkl",
"table": "epsilon",
"header": true
},
"bosch": {
"num_features": 968,
"rows": 1183747,
Expand Down
78 changes: 62 additions & 16 deletions model-inference/decisionTree/experiments/data_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import gc
import json
import pickle
from model_helper import relative2abspath, dataset_folder
from model_helper import relative2abspath, dataset_folder, fetch_epsilon_sparse, todense_fill
import numpy as np
import pandas as pd
from urllib.request import urlretrieve
Expand All @@ -14,6 +14,8 @@
import sys
import time

from scipy import sparse as sp


def parse_arguments():
parser = argparse.ArgumentParser(
Expand All @@ -26,6 +28,7 @@ def parse_arguments():
'fraud',
'year',
'epsilon',
'epsilon_sparse',
'bosch',
'covtype',
'criteo',
Expand Down Expand Up @@ -145,8 +148,20 @@ def prepare_year(dataset_folder, nrows=None):
df = df.astype({0: np.int8})
return df


def prepare_epsilon(nrows=None):
# Passing a valid dataset_folder makes this function construct a custom dense dataset from the original sparse dataset.
def prepare_epsilon(nrows=None, dataset_folder=None):
if dataset_folder:
print('Preparing Epsilon Sparse Dataset')
prepare_epsilon_sparse(dataset_folder)
train_data = pd.DataFrame([])
print('Fetching Epsilon Sparse Test Dataset [Train is empty]')
test_features, test_labels = fetch_epsilon_sparse()
print('Fetched Epsilon Sparse Test Dataset')
test_labels = np.expand_dims(test_labels, axis=1)
test_features = todense_fill(test_features)
print(f'Test Features Occupy: {sys.getsizeof(test_features)} bytes')
test_data = pd.DataFrame(np.concatenate((test_labels,test_features), axis=1))
return test_data, train_data
from catboost.datasets import epsilon
print("DOWNLOADING EPSILON")
train_data, test_data = epsilon()
Expand All @@ -167,6 +182,30 @@ def prepare_epsilon(nrows=None):

return test_data, train_data

def prepare_epsilon_sparse(dataset_folder, nrows=None):
    """Download the Epsilon test split and store it as an svmlight file with 0/1 labels.

    The bz2 archive is fetched with ``download_data`` and decompressed into
    ``epsilon_normalized_test.svm`` inside ``dataset_folder``. While the rows
    are streamed out of the archive, every row whose label is ``-1`` is
    rewritten to label ``0``; all other rows are copied unchanged.

    Args:
        dataset_folder: Directory that receives the archive and the converted file.
        nrows: Optional cap on the number of rows written. ``None`` (or 0)
            keeps every row.

    Returns:
        None. The result is the converted file on disk.
    """
    import bz2
    from itertools import islice

    data_url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/epsilon_normalized.t.bz2'
    final_dataset = 'epsilon_normalized_test.svm'
    downloaded_file = download_data(data_url, dataset_folder)
    if not os.path.isfile(downloaded_file):
        # Nothing to convert; leave any previously converted file untouched.
        print(f'Epsilon archive not found at {downloaded_file}; nothing converted')
        return

    # os.path.join yields the same path as plain concatenation when the folder
    # ends with a separator, and a correct one when it does not.
    target_path = os.path.join(dataset_folder, final_dataset)
    row_limit = nrows or None

    # Decompress and relabel in a single streaming pass: the archive expands to
    # several gigabytes, so it is never held in memory, and no shell command is
    # built from the path. The archive itself is kept on disk.
    with bz2.open(downloaded_file, 'rb') as archive, open(target_path, 'wb') as converted:
        for line in islice(archive, row_limit):
            # Convert the -1 output class to class 0; only the leading label
            # token is touched, feature values are left alone.
            if line.startswith(b'-1 '):
                line = b'0 ' + line[3:]
            converted.write(line)
    # NOTE: sparsity control is not applied here; prepare_epsilon densifies the
    # loaded matrix through todense_fill, which applies the density mask.

def prepare_covtype(dataset_folder, nrows=None):
df = datasets.fetch_covtype(data_home=dataset_folder, as_frame=True)["frame"]
Expand Down Expand Up @@ -362,14 +401,19 @@ def create_tables(
is_classification = datasetconfig["type"] == "classification"
df = prepare_airline(is_classification, dataset_folder, nrows=nrows)
elif dataset == 'epsilon':
df_test, df_train = prepare_epsilon(nrows=nrows)
df_test, df_train = prepare_epsilon(nrows=nrows) # Default Missing Value
# df_test, df_train = prepare_epsilon(nrows=nrows, dataset_folder=dataset_folder) # Custom Missing Value
######
# mod = np.nan_to_num(df_test,0)
# print("SPARSITY: ",1.0-(np.count_nonzero(mod))/float(mod.size))
# exit()
######
elif dataset == "fraud":
df = prepare_fraud(dataset_folder, nrows=nrows)
elif dataset == 'bosch':
df = prepare_bosch(dataset_folder, nrows=nrows)
elif dataset == 'covtype':
df = prepare_covtype(dataset_folder, nrows=nrows)

elif dataset=="tpcxai_fraud":
if nrows:
df = prepare_tpcxai_fraud_transactions(dataset_folder, nrows=nrows)
Expand All @@ -387,7 +431,9 @@ def create_tables(
print('-'*50)
df = pd.concat([df,prepare_tpcxai_fraud_transactions(dataset_folder, nrows=partition_size, skip_rows=range(1,partition_size*i))])
print(f'Final Shape of DataFrame: {df.shape}')

elif dataset == "epsilon_sparse":
prepare_epsilon_sparse(dataset_folder)
exit()
elif dataset == 'criteo':
prepare_criteo(dataset_folder)
exit()
Expand Down Expand Up @@ -426,16 +472,16 @@ def create_tables(
print("LOADING DATA FOR EPSILON")
columns = [i for i in range(1, 2001)]
with connection.cursor() as cur:
train.head()
rows = len(train)
for i in range(rows):
cur.execute("INSERT INTO epsilon_train(label,row) VALUES(%s, %s)", (int(
train.loc[i, 0]), list(train.loc[i, columns])))
if i % 10000 == 0:
print(i)

connection.commit()
print("LOADED "+datasetconfig["table"]+"_train"+" to DB")
# train.head()
# rows = len(train)
# for i in range(rows):
# cur.execute("INSERT INTO epsilon_train(label,row) VALUES(%s, %s)", (int(
# train.loc[i, 0]), list(train.loc[i, columns])))
# if i % 10000 == 0:
# print(i)

# connection.commit()
# print("LOADED "+datasetconfig["table"]+"_train"+" to DB")

test.head()
rows = len(test)
Expand Down
59 changes: 55 additions & 4 deletions model-inference/decisionTree/experiments/model_helper.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,55 @@
import pickle
import time
import os
import sys
import numpy as np
import math
import random
from scipy import sparse as sp
from sklearn.metrics import classification_report, mean_squared_error

# Folder that dataset file names are resolved against (see fetch_epsilon_sparse).
dataset_folder = "dataset/"

# Value passed as `missing=` to treelite_runtime.DMatrix; np.nan was the alternative tried.
fill_missing_value = 0
# Density handed to scipy.sparse.random by todense_fill; a value <= 0 skips the masking step.
density_percentage = 0.5

def calculate_time(start_time, end_time):
    """Return ``end_time - start_time`` scaled by 1000.

    When both arguments come from ``time.time()`` (seconds), the result is
    the elapsed time in milliseconds.
    """
    return (end_time - start_time) * 1000

def todense_fill(csr: sp.csr_matrix, density=None, random_state=None) -> np.ndarray:
    """Densify a sparse matrix and optionally thin it out with a random 0/1 mask.

    Explicitly stored zeros in ``csr`` become NaN in the result, while entries
    that are absent from the sparse structure stay 0. When ``density`` is
    positive, the dense array is multiplied element-wise by a random mask of
    ones with that density, which zeroes out the unselected cells (NaN cells
    stay NaN because ``NaN * 0`` is NaN).

    Args:
        csr: Sparse matrix to densify. It is not modified.
        density: Fraction of cells the random mask keeps, in ``[0, 1]``.
            ``None`` falls back to the module-level ``density_percentage``;
            a value <= 0 disables masking.
        random_state: Seed or generator forwarded to ``scipy.sparse.random``
            so the mask can be made reproducible. ``None`` keeps it random.

    Returns:
        A 2-D float ndarray with the same shape as ``csr``. Single-row and
        single-column inputs are kept 2-D so callers can concatenate a label
        column along axis 1.
    """
    if density is None:
        density = density_percentage

    # astype already returns a new matrix, so the caller's data is untouched.
    csr = csr.astype(float)
    csr.data[csr.data == 0] = np.nan
    # toarray() always yields a 2-D ndarray; squeezing would turn a one-row
    # matrix into 1-D and break the shape-based mask construction below.
    out = csr.toarray()

    # ===== Control the Sparsity of the Data
    if density > 0:
        print(f'Custom Density Percentage: {density}')
        # Varying sparsity by row and column, using the scipy random function.
        sparsity_matrix_multiplier = sp.random(
            out.shape[0], out.shape[1], density=density,
            data_rvs=np.ones, random_state=random_state).toarray()
        print(f'Multiplier Shape: {sparsity_matrix_multiplier.shape}')
        nonzeros = np.count_nonzero(out)
        print(f'Non-zero Values [BEFORE]: {nonzeros}')
        # In-place multiply avoids both a Python-level row loop and a second
        # full-size result array.
        out *= sparsity_matrix_multiplier
        remaining = np.count_nonzero(out)
        print(f'Non-zero Values [AFTER]: {remaining}')
        # An all-zero input has no density to report; avoid dividing by zero.
        print(f'DENSITY: {remaining / nonzeros if nonzeros else 0.0}')
    # =====
    print(f'Non-zero Values [AFTER CHECK]: {np.count_nonzero(out)}')
    return out

def load_data_from_pickle(dataset, config, suffix, time_consume):
start_time = time.time()
Expand All @@ -38,14 +76,27 @@ def fetch_criteo(suffix, time_consume):
y = y.astype(np.int8, copy=False)
return (x, y)

def fetch_epsilon_sparse(time_consume=None, dataset="epsilon_normalized_test.svm"):
    """Load an svmlight file from the dataset folder as (features, labels).

    Args:
        time_consume: Optional dict; when given, the load duration (as computed
            by ``calculate_time``) is stored under ``"data loading time"``.
        dataset: File name of the svmlight file, resolved against
            ``dataset_folder`` through ``relative2abspath``.

    Returns:
        Tuple ``(x, y)`` as returned by ``load_svmlight_file`` with float32
        features, where ``y`` is converted to int8.
    """
    from sklearn.datasets import load_svmlight_file

    began = time.time()
    svm_path = relative2abspath(dataset_folder, dataset)
    features, labels = load_svmlight_file(svm_path, dtype=np.float32)
    elapsed = calculate_time(began, time.time())
    if time_consume is not None:
        time_consume["data loading time"] = elapsed
    return (features, labels.astype(np.int8, copy=False))

def fetch_data(dataset, config, suffix, time_consume=None):
if dataset == "criteo":
if dataset == "criteo":
return fetch_criteo(suffix, time_consume)
elif dataset == "epsilon_sparse":
return fetch_epsilon_sparse(time_consume, dataset="epsilon_normalized_test.svm")
print("LOADING " + dataset + " " + suffix)

import psycopg2
try:
import connectorx as cx
import psycopg2
pgsqlconfig = config["pgsqlconfig"]
datasetconfig = config[dataset]
query = datasetconfig["query"]+"_"+suffix
Expand Down Expand Up @@ -118,7 +169,7 @@ def extend(output):
aggregate_func = aggregate_function()
for i in range(iterations):
query_data = treelite_runtime.DMatrix(
features[i*query_size:(i+1)*query_size])
features[i*query_size:(i+1)*query_size], missing=fill_missing_value)
output = predict(query_data)
if is_classification:
output = np.where(output > 0.5, 1, 0)
Expand Down
6 changes: 4 additions & 2 deletions model-inference/decisionTree/experiments/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,12 @@ def parse_arguments(config):
'airline_classification',
'fraud',
'year',
'epsilon',
'epsilon',
'bosch',
'covtype',
'criteo',
'tpcxai_fraud'],
'tpcxai_fraud',
'epsilon_sparse'],
help="Dataset to be tested.")
parser.add_argument(
"-m", "--model", type=str,
Expand Down Expand Up @@ -457,6 +458,7 @@ def test_postprocess(time_consume, conversion_time, total_framework_time, config
"model": MODEL,
"framework": FRAMEWORK}
features, label = load_data(config, time_consume)
# DATASET = "epsilon" if DATASET=="epsilon_sparse" else DATASET # To make sure we load the same model
sklearnmodel = load_sklearn_model(config, time_consume)
test(args, features, label, sklearnmodel, config, time_consume)
print("==============EXPERIMENT ENDING=========================\n")