diff --git a/model-inference/decisionTree/experiments/config.json b/model-inference/decisionTree/experiments/config.json index 76198d19..59d7823e 100644 --- a/model-inference/decisionTree/experiments/config.json +++ b/model-inference/decisionTree/experiments/config.json @@ -1,5 +1,5 @@ { - "num_trees": 1600, + "num_trees": 500, "depth": 8, "pgsqlconfig": { "host": "localhost", @@ -106,6 +106,23 @@ "table": "epsilon", "header": true }, + "epsilon_sparse": { + "num_features": 2000, + "rows": 500000, + "batch_size": 100000, + "query_size": 100000, + "type": "classification", + "desc": "train 400000 test 100000", + "info": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#epsilon", + "query": "SELECT * from epsilon", + "create": "", + "train": 0.8, + "test": 0.2, + "y_col": "label", + "filename": "epsilon.pkl", + "table": "epsilon", + "header": true + }, "bosch": { "num_features": 968, "rows": 1183747, diff --git a/model-inference/decisionTree/experiments/data_processing.py b/model-inference/decisionTree/experiments/data_processing.py index 32d39ae3..630be214 100644 --- a/model-inference/decisionTree/experiments/data_processing.py +++ b/model-inference/decisionTree/experiments/data_processing.py @@ -2,7 +2,7 @@ import gc import json import pickle -from model_helper import relative2abspath, dataset_folder +from model_helper import relative2abspath, dataset_folder, fetch_epsilon_sparse, todense_fill import numpy as np import pandas as pd from urllib.request import urlretrieve @@ -14,6 +14,8 @@ import sys import time +from scipy import sparse as sp + def parse_arguments(): parser = argparse.ArgumentParser( @@ -26,6 +28,7 @@ def parse_arguments(): 'fraud', 'year', 'epsilon', + 'epsilon_sparse', 'bosch', 'covtype', 'criteo', @@ -145,8 +148,20 @@ def prepare_year(dataset_folder, nrows=None): df = df.astype({0: np.int8}) return df - -def prepare_epsilon(nrows=None): +# Passing a valid dataset_folder makes this function construct a custom dense dataset from the original sparse dataset. +def prepare_epsilon(nrows=None, dataset_folder=None): + if dataset_folder: + print('Preparing Epsilon Sparse Dataset') + prepare_epsilon_sparse(dataset_folder) + train_data = pd.DataFrame([]) + print('Fetching Epsilon Sparse Test Dataset [Train is empty]') + test_features, test_labels = fetch_epsilon_sparse() + print('Fetched Epsilon Sparse Test Dataset') + test_labels = np.expand_dims(test_labels, axis=1) + test_features = todense_fill(test_features) + print(f'Test Features Occupy: {sys.getsizeof(test_features)} bytes') + test_data = pd.DataFrame(np.concatenate((test_labels,test_features), axis=1)) + return test_data, train_data from catboost.datasets import epsilon print("DOWNLOADING EPSILON") train_data, test_data = epsilon() @@ -167,6 +182,30 @@ def prepare_epsilon(nrows=None): return test_data, train_data +def prepare_epsilon_sparse(dataset_folder, nrows=None): + data_url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/epsilon_normalized.t.bz2' + final_dataset = 'epsilon_normalized_test.svm' + file_name = data_url.split('/')[-1] + downloaded_file = download_data(data_url, dataset_folder) + if (os.path.isfile(downloaded_file)): + os.system(f'bzip2 -cdk {downloaded_file} > {dataset_folder}{final_dataset}') + # Convert all -1 Output Class to 0 Class + import re + updated_content = '' + with open(f'{dataset_folder}{final_dataset}', 'r') as f: + content = f.read() + updated_content = re.sub('(^-1 )|(\n-1 )', '\n0 ', content) + with open(f'{dataset_folder}{final_dataset}', 'w') as f: + f.write(updated_content) + # START: Control the Sparsity of Data + # x, y = datasets.load_svmlight_file(downloaded_file, dtype=np.float32) + # print('DATASET LOADED FOR CUSTOM SPARSITY') + # print(f'x Occupy [BEFORE]: {sys.getsizeof(x)} bytes') + # x = sp.csr_matrix(todense_fill(x)) + # print(f'x Occupy [AFTER]: {sys.getsizeof(x)} bytes') + # datasets.dump_svmlight_file(x,y,downloaded_file) + # print('DATASET DUMPED WITH CUSTOM SPARSITY') + # END def prepare_covtype(dataset_folder, nrows=None): df = datasets.fetch_covtype(data_home=dataset_folder, as_frame=True)["frame"] @@ -362,14 +401,19 @@ def create_tables( is_classification = datasetconfig["type"] == "classification" df = prepare_airline(is_classification, dataset_folder, nrows=nrows) elif dataset == 'epsilon': - df_test, df_train = prepare_epsilon(nrows=nrows) + df_test, df_train = prepare_epsilon(nrows=nrows) # Default Missing Value + # df_test, df_train = prepare_epsilon(nrows=nrows, dataset_folder=dataset_folder) # Custom Missing Value + ###### + # mod = np.nan_to_num(df_test,0) + # print("SPARSITY: ",1.0-(np.count_nonzero(mod))/float(mod.size)) + # exit() + ###### elif dataset == "fraud": df = prepare_fraud(dataset_folder, nrows=nrows) elif dataset == 'bosch': df = prepare_bosch(dataset_folder, nrows=nrows) elif dataset == 'covtype': df = prepare_covtype(dataset_folder, nrows=nrows) - elif dataset=="tpcxai_fraud": if nrows: df = prepare_tpcxai_fraud_transactions(dataset_folder, nrows=nrows) @@ -387,7 +431,9 @@ def create_tables( print('-'*50) df = pd.concat([df,prepare_tpcxai_fraud_transactions(dataset_folder, nrows=partition_size, skip_rows=range(1,partition_size*i))]) print(f'Final Shape of DataFrame: {df.shape}') - + elif dataset == "epsilon_sparse": + prepare_epsilon_sparse(dataset_folder) + exit() elif dataset == 'criteo': prepare_criteo(dataset_folder) exit() @@ -426,16 +472,16 @@ def create_tables( print("LOADING DATA FOR EPSILON") columns = [i for i in range(1, 2001)] with connection.cursor() as cur: - train.head() - rows = len(train) - for i in range(rows): - cur.execute("INSERT INTO epsilon_train(label,row) VALUES(%s, %s)", (int( - train.loc[i, 0]), list(train.loc[i, columns]))) - if i % 10000 == 0: - print(i) - - connection.commit() - print("LOADED "+datasetconfig["table"]+"_train"+" to DB") + # train.head() + # rows = len(train) + # for i in range(rows): + # cur.execute("INSERT INTO epsilon_train(label,row) VALUES(%s, %s)", (int( + # train.loc[i, 0]), list(train.loc[i, columns]))) + # if i % 10000 == 0: + # print(i) + + # connection.commit() + # print("LOADED "+datasetconfig["table"]+"_train"+" to DB") test.head() rows = len(test) diff --git a/model-inference/decisionTree/experiments/model_helper.py b/model-inference/decisionTree/experiments/model_helper.py index e32dbc06..f85d0ad9 100644 --- a/model-inference/decisionTree/experiments/model_helper.py +++ b/model-inference/decisionTree/experiments/model_helper.py @@ -1,17 +1,55 @@ import pickle import time import os +import sys import numpy as np import math +import random +from scipy import sparse as sp from sklearn.metrics import classification_report, mean_squared_error dataset_folder = "dataset/" +fill_missing_value = 0 # np.nan # 0 +density_percentage = 0.5 def calculate_time(start_time, end_time): diff = (end_time-start_time)*1000 return diff +def todense_fill(csr: sp.csr_matrix) -> np.ndarray: + """Densify a sparse CSR matrix. Same as csr_matrix.todense() + except it fills missing entries with fill_missing_value instead of 0 + """ + # dummy_value = np.nan if not np.isnan(fill_missing_value) else np.inf + # dummy_check = np.isnan if np.isnan(dummy_value) else np.isinf + dummy_value = np.nan + dummy_check = np.isnan + csr = csr.copy().astype(float) + csr.data[csr.data == 0] = dummy_value + out = np.array(csr.todense()).squeeze() + # ===== Control the Sparsity of the Data + if density_percentage>0: + print(f'Custom Density Percentage: {density_percentage}') + # Varying Sparsity by Row and Column, using scipy random function + sparsity_matrix_multiplier = sp.random(out.shape[0],out.shape[1],density=density_percentage, data_rvs=np.ones).todense() + print(f'Multiplier Shape: {sparsity_matrix_multiplier.shape}') + print(f'Non-zero Values [BEFORE]: {np.count_nonzero(out)}') + nonzeros = np.count_nonzero(out) + for i in range(out.shape[0]): + # print(np.expand_dims(out[i],axis=0).shape, sparsity_matrix_multiplier[i].shape) + out[i] = np.multiply(np.expand_dims(out[i],axis=0),sparsity_matrix_multiplier[i]) + print(f'Non-zero Values [AFTER]: {np.count_nonzero(out)}') + print(f'DENSITY: {np.count_nonzero(out)/nonzeros}') + # out = np.multiply(out,sparsity_matrix_multiplier) + # sparse_column_indices = random.sample(range(out.shape[1]),int(out.shape[1]*density_percentage)) + # for column_index in sparse_column_indices: + # out[::,column_index] = fill_missing_value + # ===== + # out[out == 0] = fill_missing_value + # out[dummy_check(out)] = 0 + print(f'Non-zero Values [AFTER CHECK]: {np.count_nonzero(out)}') + return out def load_data_from_pickle(dataset, config, suffix, time_consume): start_time = time.time() @@ -38,14 +76,27 @@ def fetch_criteo(suffix, time_consume): y = y.astype(np.int8, copy=False) return (x, y) +def fetch_epsilon_sparse(time_consume=None, dataset="epsilon_normalized_test.svm"): + from sklearn import datasets + + start_time = time.time() + path = relative2abspath(dataset_folder, dataset) + x, y = datasets.load_svmlight_file(path, dtype=np.float32) + data_loading_time = calculate_time(start_time,time.time()) + if time_consume is not None: + time_consume["data loading time"] = data_loading_time + y = y.astype(np.int8, copy=False) + return (x, y) + def fetch_data(dataset, config, suffix, time_consume=None): - if dataset == "criteo": + if dataset == "criteo": return fetch_criteo(suffix, time_consume) + elif dataset == "epsilon_sparse": + return fetch_epsilon_sparse(time_consume, dataset="epsilon_normalized_test.svm") print("LOADING " + dataset + " " + suffix) - + import psycopg2 try: import connectorx as cx - import psycopg2 pgsqlconfig = config["pgsqlconfig"] datasetconfig = config[dataset] query = datasetconfig["query"]+"_"+suffix @@ -118,7 +169,7 @@ def extend(output): aggregate_func = aggregate_function() for i in range(iterations): query_data = treelite_runtime.DMatrix( - features[i*query_size:(i+1)*query_size]) + features[i*query_size:(i+1)*query_size], missing=fill_missing_value) output = predict(query_data) if is_classification: output = np.where(output > 0.5, 1, 0) diff --git a/model-inference/decisionTree/experiments/test_model.py b/model-inference/decisionTree/experiments/test_model.py index 6d7aa777..055fa929 100644 --- a/model-inference/decisionTree/experiments/test_model.py +++ b/model-inference/decisionTree/experiments/test_model.py @@ -34,11 +34,12 @@ def parse_arguments(config): 'airline_classification', 'fraud', 'year', - 'epsilon', + 'epsilon', 'bosch', 'covtype', 'criteo', - 'tpcxai_fraud'], + 'tpcxai_fraud', + 'epsilon_sparse'], help="Dataset to be tested.") parser.add_argument( "-m", "--model", type=str, @@ -457,6 +458,7 @@ def test_postprocess(time_consume, conversion_time, total_framework_time, config "model": MODEL, "framework": FRAMEWORK} features, label = load_data(config, time_consume) + # DATASET = "epsilon" if DATASET=="epsilon_sparse" else DATASET # To make sure we load the same model sklearnmodel = load_sklearn_model(config, time_consume) test(args, features, label, sklearnmodel, config, time_consume) print("==============EXPERIMENT ENDING=========================\n")