Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
56 commits
Select commit Hold shift + click to select a range
c8f4220
added tpcxai_fraud results
mahidhar96 Oct 21, 2022
4238a5b
removed redundant files
mahidhar96 Oct 21, 2022
53b22ba
normalized gpu results
mahidhar96 Oct 26, 2022
b9982b6
gpu profiling
mahidhar96 Oct 26, 2022
9b15053
gpu profiling code
mahidhar96 Oct 27, 2022
9a65719
airline profiling fix
mahidhar96 Oct 27, 2022
a3c60ad
gpustat added
mahidhar96 Oct 27, 2022
77a2069
0.5 second perios
mahidhar96 Oct 27, 2022
57f621a
higgs gpu profiles
mahidhar96 Oct 27, 2022
6528719
airline profiles
mahidhar96 Oct 27, 2022
2585aa5
year profiles
mahidhar96 Oct 27, 2022
f2ad4f3
tpcxai_fraud profiles
mahidhar96 Oct 27, 2022
37227d0
fraud profiles
mahidhar96 Oct 27, 2022
75a6a24
epsilon profiles
mahidhar96 Oct 27, 2022
2ea660c
bosch profiles
mahidhar96 Oct 28, 2022
73433e9
stat update
mahidhar96 Oct 28, 2022
2573382
recalc
mahidhar96 Oct 28, 2022
3be3cc3
higgs gpu results
mahidhar96 Oct 29, 2022
80a195b
airline gpu results
mahidhar96 Oct 29, 2022
31edbbf
Merge branch '41-decisiontree-gpu' of github.com:asu-cactus/netsdb in…
mahidhar96 Oct 29, 2022
27cf2cf
airline gpu results
mahidhar96 Oct 29, 2022
249621e
airline gpu results
mahidhar96 Oct 29, 2022
c6b21b6
airline gpu results
mahidhar96 Oct 29, 2022
1a4b72c
airline gpu results
mahidhar96 Oct 29, 2022
0e5164c
airline gpu results
mahidhar96 Oct 29, 2022
7c61eb9
Merge branch '41-decisiontree-gpu' of github.com:asu-cactus/netsdb in…
mahidhar96 Oct 29, 2022
7b990b4
added gpu profiles
mahidhar96 Oct 29, 2022
175ba54
added trees to profiles
mahidhar96 Nov 1, 2022
2621dea
new gpu profiles
mahidhar96 Nov 4, 2022
d796290
adding back batch size 10
mahidhar96 Nov 4, 2022
78a325e
added gpu profile tables
mahidhar96 Nov 4, 2022
af2bbbb
added higgs
mahidhar96 Nov 4, 2022
c14d326
added higgs 500
mahidhar96 Nov 6, 2022
2f35286
criteo support
mahidhar96 Nov 6, 2022
fa712a1
query correction
mahidhar96 Nov 6, 2022
9506282
criteo update
mahidhar96 Nov 6, 2022
f79ee02
added higgs 1600 higgs
mahidhar96 Nov 8, 2022
06cabc5
Merge branch '41-decisiontree-gpu' of github.com:asu-cactus/netsdb in…
mahidhar96 Nov 8, 2022
94d8549
removed time from fraud and some cleaning
mahidhar96 Nov 9, 2022
a30f8a4
remmoved amount
mahidhar96 Nov 10, 2022
7adbb89
bosch cpu
mahidhar96 Nov 12, 2022
ed92d23
tpcxai results
mahidhar96 Nov 14, 2022
a664928
criteo changes
mahidhar96 Nov 17, 2022
34d6bcb
Merge branch '41-decisiontree-gpu' of https://github.com/asu-cactus/n…
mahidhar96 Nov 17, 2022
1f7739a
criteo batching results
mahidhar96 Nov 17, 2022
7950014
criteo profiling
mahidhar96 Nov 19, 2022
cabb5b6
added criteo profiles
mahidhar96 Nov 20, 2022
5d511c4
criteo additional results
mahidhar96 Nov 21, 2022
488b044
added profiling results
mahidhar96 Nov 22, 2022
2ed1683
changed features
mahidhar96 Dec 13, 2022
7921ec9
changed features
mahidhar96 Dec 13, 2022
aa12157
changed features
mahidhar96 Dec 13, 2022
298b654
changed features
mahidhar96 Dec 13, 2022
c858f5f
changed features
mahidhar96 Dec 13, 2022
b99a817
changed features
mahidhar96 Dec 13, 2022
869610f
todo for features
mahidhar96 Dec 13, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "f3af0015",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import matplotlib\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "da3cdd47",
"metadata": {},
"outputs": [],
"source": [
"files = os.listdir('gpu_profiles')\n",
"files.sort()\n",
"for file in files:\n",
" if file.startswith(\"python\") and file.endswith(\"GPU_LOG.txt\"):\n",
" args = file.split(\"-\")\n",
"# for i in range(len(args)):\n",
"# print(i,args[i])\n",
" batch_size = args[6].split('_')[2]\n",
" dataset = args[1].split('_')[1]\n",
" algo = args[2].split('_')[1]\n",
" framework = args[3].split('_')[1]\n",
" print(batch_size,dataset,framework)\n",
" data = open(file).read().split('\\n')\n",
" gpu_usage = []\n",
" mem_usage = []\n",
" ubu_usage = []\n",
" max_usage = []\n",
" max_gpu_usage = -1\n",
" max_mem_usage = -1\n",
" max_ubu_usage = -1\n",
" \n",
" for line in data:\n",
" if line.startswith(\"[0]\"):\n",
"# print(line.split())\n",
" line_split = line.split()\n",
" gpu = int(line_split[5])\n",
" mem = int(line_split[8])\n",
" max_usg = int(line_split[10])\n",
" ubu = 0\n",
" if line_split[-1].startswith('ubuntu'):\n",
" ubu = line_split[-1].split('(')[-1]\n",
" ubu = ubu.split(\"M\")[0]\n",
" ubu = int(ubu)\n",
" max_gpu_usage = max(max_gpu_usage,gpu)\n",
" max_mem_usage = max(max_mem_usage,mem)\n",
" max_ubu_usage = max(max_ubu_usage,ubu)\n",
" gpu_usage.append(gpu)\n",
" mem_usage.append(mem)\n",
" ubu_usage.append(ubu)\n",
" max_usage.append(max_usg)\n",
" \n",
" print(mem_usage,'\\n',gpu_usage,'\\n',ubu_usage,'\\n',max_usage)\n",
" break\n",
" \n",
" file = open(dataset+\".csv\",\"a+\")\n",
" file.write(\"algo,\"+algo+\"\\n\")\n",
" file.write(\"dataset,\"+dataset+\"\\n\")\n",
" file.write(\"batch_size,\"+batch_size+\"\\n\")\n",
" file.write(\"framework,\"+framework+\"\\n\")\n",
" file.write(\"max_gpu_usage,\"+str(max_gpu_usage)+\"\\n\")\n",
" file.write(\"max_mem_usage,\"+str(max_mem_usage)+\"\\n\")\n",
" file.write(\"max_ubu_usage,\"+str(max_ubu_usage)+\"\\n\\n\")\n",
" file.write(\"gpu_usage,\"+\",\".join(map(str,gpu_usage))+\"\\n\")\n",
" file.write(\"mem_usage,\"+\",\".join(map(str,mem_usage))+\"\\n\")\n",
" file.write(\"ubu_usage,\"+\",\".join(map(str,ubu_usage))+\"\\n\")\n",
" file.write(\"max_usage,\"+\",\".join(map(str,max_usage))+\"\\n\\n\\n\\n\")\n",
" file.close()\n",
" \n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "57dd11c6",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "511a2bda",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "73cffcfd",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
5 changes: 3 additions & 2 deletions model-inference/decisionTree/experiments/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
"type": "classification",
"info": "https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud",
"query": "SELECT v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16,v17,v18,v19,v20,v21,v22,v23,v24,v25,v26,v27,v28,class FROM fraud",
"create": "CREATE TABLE ** (time DECIMAL NOT NULL, v1 DECIMAL NOT NULL, v2 DECIMAL NOT NULL, v3 DECIMAL NOT NULL, v4 DECIMAL NOT NULL, v5 DECIMAL NOT NULL, v6 DECIMAL NOT NULL, v7 DECIMAL NOT NULL, v8 DECIMAL NOT NULL, v9 DECIMAL NOT NULL, v10 DECIMAL NOT NULL, v11 DECIMAL NOT NULL, v12 DECIMAL NOT NULL, v13 DECIMAL NOT NULL, v14 DECIMAL NOT NULL, v15 DECIMAL NOT NULL, v16 DECIMAL NOT NULL, v17 DECIMAL NOT NULL, v18 DECIMAL NOT NULL, v19 DECIMAL NOT NULL, v20 DECIMAL NOT NULL, v21 DECIMAL NOT NULL, v22 DECIMAL NOT NULL, v23 DECIMAL NOT NULL, v24 DECIMAL NOT NULL, v25 DECIMAL NOT NULL, v26 DECIMAL NOT NULL, v27 DECIMAL NOT NULL, v28 DECIMAL NOT NULL, amount DECIMAL NOT NULL, class INTEGER NOT NULL)",
"create": "CREATE TABLE ** (v1 DECIMAL NOT NULL, v2 DECIMAL NOT NULL, v3 DECIMAL NOT NULL, v4 DECIMAL NOT NULL, v5 DECIMAL NOT NULL, v6 DECIMAL NOT NULL, v7 DECIMAL NOT NULL, v8 DECIMAL NOT NULL, v9 DECIMAL NOT NULL, v10 DECIMAL NOT NULL, v11 DECIMAL NOT NULL, v12 DECIMAL NOT NULL, v13 DECIMAL NOT NULL, v14 DECIMAL NOT NULL, v15 DECIMAL NOT NULL, v16 DECIMAL NOT NULL, v17 DECIMAL NOT NULL, v18 DECIMAL NOT NULL, v19 DECIMAL NOT NULL, v20 DECIMAL NOT NULL, v21 DECIMAL NOT NULL, v22 DECIMAL NOT NULL, v23 DECIMAL NOT NULL, v24 DECIMAL NOT NULL, v25 DECIMAL NOT NULL, v26 DECIMAL NOT NULL, v27 DECIMAL NOT NULL, v28 DECIMAL NOT NULL, class INTEGER NOT NULL)",
"train": 0.8,
"test": 0.2,
"y_col": "class",
Expand Down Expand Up @@ -161,10 +161,11 @@
"criteo": {
"num_features": 1000000,
"rows": 51882752,
"dataset_type": "sparse",
"batch_size": 100000,
"query_size": 100000,
"type": "classification",
"info": "https://www.kaggle.com/c/criteo-display-ad-challenge/",
"info": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#criteo https://www.kaggle.com/c/criteo-display-ad-challenge/",
"query": "",
"create": "",
"train": -1,
Expand Down
108 changes: 69 additions & 39 deletions model-inference/decisionTree/experiments/data_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,22 @@ def parse_arguments():
parser = argparse.ArgumentParser(
description='Arguments for data_processing.py')
parser.add_argument("-d", "--dataset", type=str, required=True,
choices=[
'higgs',
'airline_regression',
'airline_classification',
'fraud',
'year',
'epsilon',
'bosch',
'covtype',
'criteo',
'tpcxai_fraud'],
help="Dataset to be processed. Choose from ['higgs', 'airline_regression', 'airline_classification', 'fraud', 'year', 'epsilon', 'bosch', 'covtype','tpcxai_fraud','criteo']")
parser.add_argument("-n", "--nrows", type=int, help="Load nrows of the dataset. Warning: only use in development.")
parser.add_argument("-sf","--scalefactor", type=int, help="Relevant only for TPCxAI_Fraud. Takes one of the values in 1, 3, 10 and 30")
choices=[
'higgs',
'airline_regression',
'airline_classification',
'fraud',
'year',
'epsilon',
'bosch',
'covtype',
'criteo',
'tpcxai_fraud'],
help="Dataset to be processed. Choose from ['higgs', 'airline_regression', 'airline_classification', 'fraud', 'year', 'epsilon', 'bosch', 'covtype','tpcxai_fraud','criteo']")
parser.add_argument("-n", "--nrows", type=int,
help="Load nrows of the dataset. Warning: only use in development.")
parser.add_argument("-sf", "--scalefactor", type=int,
help="Relevant only for TPCxAI_Fraud. Takes one of the values in 1, 3, 10 and 30")

args = parser.parse_args()
return args
Expand Down Expand Up @@ -121,6 +123,8 @@ def prepare_fraud(dataset_folder, nrows=None):
filename + " -p " + dataset_folder)
df = pd.read_csv(local_url + ".zip", dtype=np.float32, nrows=nrows)
df = df.astype({"Class": np.int8})
df = df.drop("Time", axis=1)
df = df.drop("Amount", axis=1)
return df


Expand All @@ -138,6 +142,7 @@ def prepare_bosch(dataset_folder, nrows=None):
df = df.astype({"Response": np.int8})
return df


def prepare_year(dataset_folder, nrows=None):
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00203/YearPredictionMSD.txt.zip'
local_url = download_data(url, dataset_folder)
Expand Down Expand Up @@ -168,8 +173,9 @@ def prepare_epsilon(nrows=None):
return test_data, train_data


def prepare_covtype(dataset_folder, nrows=None):
df = datasets.fetch_covtype(data_home=dataset_folder, as_frame=True)["frame"]
def prepare_covtype(dataset_folder, nrows=None):
df = datasets.fetch_covtype(
data_home=dataset_folder, as_frame=True)["frame"]
if nrows is not None:
df = df[:nrows]
df = df.astype(np.float32)
Expand All @@ -178,44 +184,57 @@ def prepare_covtype(dataset_folder, nrows=None):
df["Cover_Type"] = df["Cover_Type"] - 1
return df

def prepare_tpcxai_fraud_transactions(dataset_folder,nrows=None,skip_rows=0):

def prepare_tpcxai_fraud_transactions(dataset_folder, nrows=None, skip_rows=0):
global tpcxai_fraud_dataset_headers
import re
from datetime import datetime
import time

show_progress_bar = False # To Show/Hide Progress Bar based on whether working in Interactive Mode
# To Show/Hide Progress Bar based on whether working in Interactive Mode
show_progress_bar = False

if show_progress_bar:
from tqdm import tqdm
tqdm.pandas()

SCALE_FACTOR = args.scalefactor if ("scalefactor" in args) else 1

file_name = f'dataset/financial_transactions_train_SF{SCALE_FACTOR}.csv' # Put the file in same directory
# Put the file in same directory
file_name = f'dataset/financial_transactions_train_SF{SCALE_FACTOR}.csv'
df = pd.read_csv(file_name, nrows=nrows, skiprows=skip_rows)

start_time = time.time()
print('FEATURE ENGINEERING: Conversion of Text to Numerical Features')
# Convert Text-based Columns to Numerical Values
numericalize_text_feature_fn = lambda input: re.sub(r"[^0-9]","",input).strip()

def numericalize_text_feature_fn(
input): return re.sub(r"[^0-9]", "", input).strip()
# convert_datetime_feature_fn = lambda input: pd.Series([int(x) for x in datetime.strftime(datetime.strptime(input, "%Y-%m-%dT%H:%M"),"%d%m%Y:%H%M").split(':')])
convert_datetime_feature_intermediate_fn = lambda input: datetime.strftime(datetime.strptime(input, "%Y-%m-%dT%H:%M"),"%d%m%Y:%H%M")
convert_datetime_feature_final_fn = lambda input: pd.Series([float(x) for x in input.split(':')])

def convert_datetime_feature_intermediate_fn(input): return datetime.strftime(
datetime.strptime(input, "%Y-%m-%dT%H:%M"), "%d%m%Y:%H%M")
def convert_datetime_feature_final_fn(input): return pd.Series(
[float(x) for x in input.split(':')])

print('[1] Converting IBAN to Numerical Feature [DE4875000009209924 -> 4875000009209924]')
df['IBAN'] = df['IBAN'].progress_apply(numericalize_text_feature_fn) if show_progress_bar else df['IBAN'].apply(numericalize_text_feature_fn)
df['IBAN'] = df['IBAN'].progress_apply(
numericalize_text_feature_fn) if show_progress_bar else df['IBAN'].apply(numericalize_text_feature_fn)
print(f'Time Taken until here: {(time.time()-start_time)} seconds')
print('[2] Converting receiverID to Numerical Feature [FOR55821814 -> 55821814]')
df['receiverID'] = df['receiverID'].progress_apply(numericalize_text_feature_fn) if show_progress_bar else df['receiverID'].apply(numericalize_text_feature_fn)
df['receiverID'] = df['receiverID'].progress_apply(
numericalize_text_feature_fn) if show_progress_bar else df['receiverID'].apply(numericalize_text_feature_fn)
print(f'Time Taken until here: {(time.time()-start_time)} seconds')
print('[3] Converting time to Numerical Feature [2011-01-29T15:28 -> [29012011, 1528]]')
print('\t[3.1] STAGE 1: Conversion of Date to the Required Format [2011-01-29T15:28 -> 29012011:1528]')
df['time'] = df['time'].progress_apply(convert_datetime_feature_intermediate_fn) if show_progress_bar else df['time'].apply(convert_datetime_feature_intermediate_fn)
df['time'] = df['time'].progress_apply(convert_datetime_feature_intermediate_fn) if show_progress_bar else df['time'].apply(
convert_datetime_feature_intermediate_fn)
print(f'Time Taken until here: {(time.time()-start_time)} seconds')
print('\t[3.2] STAGE 2: Conversion of Formatted Date to New Features [29012011:1528 -> [29012011, 1528]]')
print('This Stage takes long time to complete after Bar reaches 100% as it also writes to the TWO new Columns [date, time]')
df[['date','time']] = df['time'].progress_apply(convert_datetime_feature_final_fn) if show_progress_bar else df['time'].apply(convert_datetime_feature_final_fn)
print(
'This Stage takes long time to complete after Bar reaches 100% as it also writes to the TWO new Columns [date, time]')
df[['date', 'time']] = df['time'].progress_apply(
convert_datetime_feature_final_fn) if show_progress_bar else df['time'].apply(convert_datetime_feature_final_fn)
print(f'Time Taken until here: {(time.time()-start_time)} seconds')
print('[4] Change Column Name isFraud to is_fraud due to SQL Case-insensitive Nature')
df = df.rename(columns={"isFraud": "is_fraud"}, errors="raise")
Expand All @@ -227,23 +246,29 @@ def prepare_tpcxai_fraud_transactions(dataset_folder,nrows=None,skip_rows=0):
print(f'Time Taken until here: {(time.time()-start_time)} seconds')

# DataType Conversion. All features can be made Integers, hence making them int64
dtype=np.float32
df = df.astype({"IBAN": dtype, "receiverID": dtype, "date": dtype, "time": dtype, "is_fraud": np.int8})
dtype = np.float32
df = df.astype({"IBAN": dtype, "receiverID": dtype,
"date": dtype, "time": dtype, "is_fraud": np.int8})

print('-'*50)
print('Feature Engineering and Creating the New Dataset DONE')
print(f'Total Time Taken for Preparing the New Dataset: {(time.time()-start_time)} seconds')
print(
f'Total Time Taken for Preparing the New Dataset: {(time.time()-start_time)} seconds')
return df


def prepare_criteo(dataset_folder):
    """Download and extract the Criteo kaggle2014 dataset (libsvm format).

    Parameters
    ----------
    dataset_folder : str
        Directory into which the archive is downloaded and extracted.

    Side effects
    ------------
    Downloads ``criteo.kaggle2014.svm.tar.xz`` via ``download_data`` (a
    project helper defined elsewhere in this file) and extracts
    ``train.txt.svm`` / ``test.txt.svm`` into ``dataset_folder``.
    Returns ``None``.
    """
    url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/criteo.kaggle2014.svm.tar.xz"
    local_url = download_data(url, dataset_folder)
    train_path = relative2abspath(
        dataset_folder, "criteo.kaggle2014.svm", "train.txt.svm")
    test_path = relative2abspath(
        dataset_folder, "criteo.kaggle2014.svm", "test.txt.svm")

    # Only extract when the expected files are missing, so re-runs are
    # idempotent and skip the (expensive) decompression.
    if not (os.path.isfile(train_path) and os.path.isfile(test_path)):
        # NOTE(review): os.system with an f-string is fragile if
        # dataset_folder ever contains spaces or shell metacharacters;
        # consider subprocess.run(["tar", "-Jxf", local_url, "-C",
        # dataset_folder], check=True) — left as-is to preserve behavior.
        os.system(f"tar -Jxf {local_url} -C {dataset_folder}")


def get_connection(pgsqlconfig):
return psycopg2.connect(
database=pgsqlconfig["dbname"],
Expand Down Expand Up @@ -273,7 +298,8 @@ def make_query(dataset, datasetconfig, column_names):
elif dataset == "tpcxai_fraud":
if datasetconfig['y_col'] in column_names:
column_names.remove(datasetconfig['y_col'])
feature_names = ", ".join([f"{col_name} DECIMAL NOT NULL" for col_name in column_names])
feature_names = ", ".join(
[f"{col_name} DECIMAL NOT NULL" for col_name in column_names])
label_name = f"{datasetconfig['y_col']} INTEGER NOT NULL"
create_query = f"CREATE TABLE ** ({feature_names}, {label_name})"
else:
Expand Down Expand Up @@ -370,22 +396,26 @@ def create_tables(
elif dataset == 'covtype':
df = prepare_covtype(dataset_folder, nrows=nrows)

elif dataset=="tpcxai_fraud":
elif dataset == "tpcxai_fraud":
if nrows:
df = prepare_tpcxai_fraud_transactions(dataset_folder, nrows=nrows)
else:
partition_size = 1000000
num_rows = datasetconfig[f"rows_sf{args.scalefactor}"] if ("scalefactor" in args) else datasetconfig[f"rows"]
num_rows = datasetconfig[f"rows_sf{args.scalefactor}"] if (
"scalefactor" in args) else datasetconfig[f"rows"]
num_partitions = num_rows//partition_size
print('-'*50)
print(f'Processing Partition Number 1 of {num_partitions+1}')
print('-'*50)
df = prepare_tpcxai_fraud_transactions(dataset_folder, nrows=partition_size)
for i in range(1,num_partitions+1):
df = prepare_tpcxai_fraud_transactions(
dataset_folder, nrows=partition_size)
for i in range(1, num_partitions+1):
print('-'*50)
print(f'Processing Partition Number {i+1} of {num_partitions+1}')
print(
f'Processing Partition Number {i+1} of {num_partitions+1}')
print('-'*50)
df = pd.concat([df,prepare_tpcxai_fraud_transactions(dataset_folder, nrows=partition_size, skip_rows=range(1,partition_size*i))])
df = pd.concat([df, prepare_tpcxai_fraud_transactions(
dataset_folder, nrows=partition_size, skip_rows=range(1, partition_size*i))])
print(f'Final Shape of DataFrame: {df.shape}')

elif dataset == 'criteo':
Expand Down
Loading