Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
56 commits
Select commit Hold shift + click to select a range
c8f4220
added tpcxai_fraud results
mahidhar96 Oct 21, 2022
4238a5b
removed redundant files
mahidhar96 Oct 21, 2022
53b22ba
normalized gpu results
mahidhar96 Oct 26, 2022
b9982b6
gpu profiling
mahidhar96 Oct 26, 2022
9b15053
gpu profiling code
mahidhar96 Oct 27, 2022
9a65719
airline profiling fix
mahidhar96 Oct 27, 2022
a3c60ad
gpustat added
mahidhar96 Oct 27, 2022
77a2069
0.5 second perios
mahidhar96 Oct 27, 2022
57f621a
higgs gpu profiles
mahidhar96 Oct 27, 2022
6528719
airline profiles
mahidhar96 Oct 27, 2022
2585aa5
year profiles
mahidhar96 Oct 27, 2022
f2ad4f3
tpcxai_fraud profiles
mahidhar96 Oct 27, 2022
37227d0
fraud profiles
mahidhar96 Oct 27, 2022
75a6a24
epsilon profiles
mahidhar96 Oct 27, 2022
2ea660c
bosch profiles
mahidhar96 Oct 28, 2022
73433e9
stat update
mahidhar96 Oct 28, 2022
2573382
recalc
mahidhar96 Oct 28, 2022
3be3cc3
higgs gpu results
mahidhar96 Oct 29, 2022
80a195b
airline gpu results
mahidhar96 Oct 29, 2022
31edbbf
Merge branch '41-decisiontree-gpu' of github.com:asu-cactus/netsdb in…
mahidhar96 Oct 29, 2022
27cf2cf
airline gpu results
mahidhar96 Oct 29, 2022
249621e
airline gpu results
mahidhar96 Oct 29, 2022
c6b21b6
airline gpu results
mahidhar96 Oct 29, 2022
1a4b72c
airline gpu results
mahidhar96 Oct 29, 2022
0e5164c
airline gpu results
mahidhar96 Oct 29, 2022
7c61eb9
Merge branch '41-decisiontree-gpu' of github.com:asu-cactus/netsdb in…
mahidhar96 Oct 29, 2022
7b990b4
added gpu profiles
mahidhar96 Oct 29, 2022
175ba54
added trees to profiles
mahidhar96 Nov 1, 2022
2621dea
new gpu profiles
mahidhar96 Nov 4, 2022
d796290
adding back batch size 10
mahidhar96 Nov 4, 2022
78a325e
added gpu profile tables
mahidhar96 Nov 4, 2022
af2bbbb
added higgs
mahidhar96 Nov 4, 2022
c14d326
added higgs 500
mahidhar96 Nov 6, 2022
2f35286
criteo support
mahidhar96 Nov 6, 2022
fa712a1
query correction
mahidhar96 Nov 6, 2022
9506282
criteo update
mahidhar96 Nov 6, 2022
f79ee02
added higgs 1600 higgs
mahidhar96 Nov 8, 2022
06cabc5
Merge branch '41-decisiontree-gpu' of github.com:asu-cactus/netsdb in…
mahidhar96 Nov 8, 2022
94d8549
removed time from fraud and some cleaning
mahidhar96 Nov 9, 2022
a30f8a4
remmoved amount
mahidhar96 Nov 10, 2022
7adbb89
bosch cpu
mahidhar96 Nov 12, 2022
ed92d23
tpcxai results
mahidhar96 Nov 14, 2022
a664928
criteo changes
mahidhar96 Nov 17, 2022
34d6bcb
Merge branch '41-decisiontree-gpu' of https://github.com/asu-cactus/n…
mahidhar96 Nov 17, 2022
1f7739a
criteo batching results
mahidhar96 Nov 17, 2022
7950014
criteo profiling
mahidhar96 Nov 19, 2022
cabb5b6
added criteo profiles
mahidhar96 Nov 20, 2022
5d511c4
criteo additional results
mahidhar96 Nov 21, 2022
488b044
added profiling results
mahidhar96 Nov 22, 2022
2ed1683
changed features
mahidhar96 Dec 13, 2022
7921ec9
changed features
mahidhar96 Dec 13, 2022
aa12157
changed features
mahidhar96 Dec 13, 2022
298b654
changed features
mahidhar96 Dec 13, 2022
c858f5f
changed features
mahidhar96 Dec 13, 2022
b99a817
changed features
mahidhar96 Dec 13, 2022
869610f
todo for features
mahidhar96 Dec 13, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "f3af0015",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import matplotlib\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "da3cdd47",
"metadata": {},
"outputs": [],
"source": [
"files = os.listdir('gpu_profiles')\n",
"files.sort()\n",
"for file in files:\n",
" if file.startswith(\"python\") and file.endswith(\"GPU_LOG.txt\"):\n",
" args = file.split(\"-\")\n",
"# for i in range(len(args)):\n",
"# print(i,args[i])\n",
" batch_size = args[6].split('_')[2]\n",
" dataset = args[1].split('_')[1]\n",
" algo = args[2].split('_')[1]\n",
" framework = args[3].split('_')[1]\n",
" print(batch_size,dataset,framework)\n",
" data = open(file).read().split('\\n')\n",
" gpu_usage = []\n",
" mem_usage = []\n",
" ubu_usage = []\n",
" max_usage = []\n",
" max_gpu_usage = -1\n",
" max_mem_usage = -1\n",
" max_ubu_usage = -1\n",
" \n",
" for line in data:\n",
" if line.startswith(\"[0]\"):\n",
"# print(line.split())\n",
" line_split = line.split()\n",
" gpu = int(line_split[5])\n",
" mem = int(line_split[8])\n",
" max_usg = int(line_split[10])\n",
" ubu = 0\n",
" if line_split[-1].startswith('ubuntu'):\n",
" ubu = line_split[-1].split('(')[-1]\n",
" ubu = ubu.split(\"M\")[0]\n",
" ubu = int(ubu)\n",
" max_gpu_usage = max(max_gpu_usage,gpu)\n",
" max_mem_usage = max(max_mem_usage,mem)\n",
" max_ubu_usage = max(max_ubu_usage,ubu)\n",
" gpu_usage.append(gpu)\n",
" mem_usage.append(mem)\n",
" ubu_usage.append(ubu)\n",
" max_usage.append(max_usg)\n",
" \n",
" print(mem_usage,'\\n',gpu_usage,'\\n',ubu_usage,'\\n',max_usage)\n",
" break\n",
" \n",
" file = open(dataset+\".csv\",\"a+\")\n",
" file.write(\"algo,\"+algo+\"\\n\")\n",
" file.write(\"dataset,\"+dataset+\"\\n\")\n",
" file.write(\"batch_size,\"+batch_size+\"\\n\")\n",
" file.write(\"framework,\"+framework+\"\\n\")\n",
" file.write(\"max_gpu_usage,\"+str(max_gpu_usage)+\"\\n\")\n",
" file.write(\"max_mem_usage,\"+str(max_mem_usage)+\"\\n\")\n",
" file.write(\"max_ubu_usage,\"+str(max_ubu_usage)+\"\\n\\n\")\n",
" file.write(\"gpu_usage,\"+\",\".join(map(str,gpu_usage))+\"\\n\")\n",
" file.write(\"mem_usage,\"+\",\".join(map(str,mem_usage))+\"\\n\")\n",
" file.write(\"ubu_usage,\"+\",\".join(map(str,ubu_usage))+\"\\n\")\n",
" file.write(\"max_usage,\"+\",\".join(map(str,max_usage))+\"\\n\\n\\n\\n\")\n",
" file.close()\n",
" \n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "57dd11c6",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "511a2bda",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "73cffcfd",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
5 changes: 3 additions & 2 deletions model-inference/decisionTree/experiments/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
"type": "classification",
"info": "https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud",
"query": "SELECT v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16,v17,v18,v19,v20,v21,v22,v23,v24,v25,v26,v27,v28,class FROM fraud",
"create": "CREATE TABLE ** (time DECIMAL NOT NULL, v1 DECIMAL NOT NULL, v2 DECIMAL NOT NULL, v3 DECIMAL NOT NULL, v4 DECIMAL NOT NULL, v5 DECIMAL NOT NULL, v6 DECIMAL NOT NULL, v7 DECIMAL NOT NULL, v8 DECIMAL NOT NULL, v9 DECIMAL NOT NULL, v10 DECIMAL NOT NULL, v11 DECIMAL NOT NULL, v12 DECIMAL NOT NULL, v13 DECIMAL NOT NULL, v14 DECIMAL NOT NULL, v15 DECIMAL NOT NULL, v16 DECIMAL NOT NULL, v17 DECIMAL NOT NULL, v18 DECIMAL NOT NULL, v19 DECIMAL NOT NULL, v20 DECIMAL NOT NULL, v21 DECIMAL NOT NULL, v22 DECIMAL NOT NULL, v23 DECIMAL NOT NULL, v24 DECIMAL NOT NULL, v25 DECIMAL NOT NULL, v26 DECIMAL NOT NULL, v27 DECIMAL NOT NULL, v28 DECIMAL NOT NULL, amount DECIMAL NOT NULL, class INTEGER NOT NULL)",
"create": "CREATE TABLE ** (v1 DECIMAL NOT NULL, v2 DECIMAL NOT NULL, v3 DECIMAL NOT NULL, v4 DECIMAL NOT NULL, v5 DECIMAL NOT NULL, v6 DECIMAL NOT NULL, v7 DECIMAL NOT NULL, v8 DECIMAL NOT NULL, v9 DECIMAL NOT NULL, v10 DECIMAL NOT NULL, v11 DECIMAL NOT NULL, v12 DECIMAL NOT NULL, v13 DECIMAL NOT NULL, v14 DECIMAL NOT NULL, v15 DECIMAL NOT NULL, v16 DECIMAL NOT NULL, v17 DECIMAL NOT NULL, v18 DECIMAL NOT NULL, v19 DECIMAL NOT NULL, v20 DECIMAL NOT NULL, v21 DECIMAL NOT NULL, v22 DECIMAL NOT NULL, v23 DECIMAL NOT NULL, v24 DECIMAL NOT NULL, v25 DECIMAL NOT NULL, v26 DECIMAL NOT NULL, v27 DECIMAL NOT NULL, v28 DECIMAL NOT NULL, class INTEGER NOT NULL)",
"train": 0.8,
"test": 0.2,
"y_col": "class",
Expand Down Expand Up @@ -161,10 +161,11 @@
"criteo": {
"num_features": 1000000,
"rows": 51882752,
"dataset_type": "sparse",
"batch_size": 100000,
"query_size": 100000,
"type": "classification",
"info": "https://www.kaggle.com/c/criteo-display-ad-challenge/",
"info": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#criteo https://www.kaggle.com/c/criteo-display-ad-challenge/",
"query": "",
"create": "",
"train": -1,
Expand Down
108 changes: 69 additions & 39 deletions model-inference/decisionTree/experiments/data_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,22 @@ def parse_arguments():
parser = argparse.ArgumentParser(
description='Arguments for data_processing.py')
parser.add_argument("-d", "--dataset", type=str, required=True,
choices=[
'higgs',
'airline_regression',
'airline_classification',
'fraud',
'year',
'epsilon',
'bosch',
'covtype',
'criteo',
'tpcxai_fraud'],
help="Dataset to be processed. Choose from ['higgs', 'airline_regression', 'airline_classification', 'fraud', 'year', 'epsilon', 'bosch', 'covtype','tpcxai_fraud','criteo']")
parser.add_argument("-n", "--nrows", type=int, help="Load nrows of the dataset. Warning: only use in development.")
parser.add_argument("-sf","--scalefactor", type=int, help="Relevant only for TPCxAI_Fraud. Takes one of the values in 1, 3, 10 and 30")
choices=[
'higgs',
'airline_regression',
'airline_classification',
'fraud',
'year',
'epsilon',
'bosch',
'covtype',
'criteo',
'tpcxai_fraud'],
help="Dataset to be processed. Choose from ['higgs', 'airline_regression', 'airline_classification', 'fraud', 'year', 'epsilon', 'bosch', 'covtype','tpcxai_fraud','criteo']")
parser.add_argument("-n", "--nrows", type=int,
help="Load nrows of the dataset. Warning: only use in development.")
parser.add_argument("-sf", "--scalefactor", type=int,
help="Relevant only for TPCxAI_Fraud. Takes one of the values in 1, 3, 10 and 30")

args = parser.parse_args()
return args
Expand Down Expand Up @@ -121,6 +123,8 @@ def prepare_fraud(dataset_folder, nrows=None):
filename + " -p " + dataset_folder)
df = pd.read_csv(local_url + ".zip", dtype=np.float32, nrows=nrows)
df = df.astype({"Class": np.int8})
df = df.drop("Time", axis=1)
df = df.drop("Amount", axis=1)
return df


Expand All @@ -138,6 +142,7 @@ def prepare_bosch(dataset_folder, nrows=None):
df = df.astype({"Response": np.int8})
return df


def prepare_year(dataset_folder, nrows=None):
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00203/YearPredictionMSD.txt.zip'
local_url = download_data(url, dataset_folder)
Expand Down Expand Up @@ -168,8 +173,9 @@ def prepare_epsilon(nrows=None):
return test_data, train_data


def prepare_covtype(dataset_folder, nrows=None):
df = datasets.fetch_covtype(data_home=dataset_folder, as_frame=True)["frame"]
def prepare_covtype(dataset_folder, nrows=None):
df = datasets.fetch_covtype(
data_home=dataset_folder, as_frame=True)["frame"]
if nrows is not None:
df = df[:nrows]
df = df.astype(np.float32)
Expand All @@ -178,44 +184,57 @@ def prepare_covtype(dataset_folder, nrows=None):
df["Cover_Type"] = df["Cover_Type"] - 1
return df

def prepare_tpcxai_fraud_transactions(dataset_folder,nrows=None,skip_rows=0):

def prepare_tpcxai_fraud_transactions(dataset_folder, nrows=None, skip_rows=0):
global tpcxai_fraud_dataset_headers
import re
from datetime import datetime
import time

show_progress_bar = False # To Show/Hide Progress Bar based on whether working in Interactive Mode
# To Show/Hide Progress Bar based on whether working in Interactive Mode
show_progress_bar = False

if show_progress_bar:
from tqdm import tqdm
tqdm.pandas()

SCALE_FACTOR = args.scalefactor if ("scalefactor" in args) else 1

file_name = f'dataset/financial_transactions_train_SF{SCALE_FACTOR}.csv' # Put the file in same directory
# Put the file in same directory
file_name = f'dataset/financial_transactions_train_SF{SCALE_FACTOR}.csv'
df = pd.read_csv(file_name, nrows=nrows, skiprows=skip_rows)

start_time = time.time()
print('FEATURE ENGINEERING: Conversion of Text to Numerical Features')
# Convert Text-based Columns to Numerical Values
numericalize_text_feature_fn = lambda input: re.sub(r"[^0-9]","",input).strip()

def numericalize_text_feature_fn(
input): return re.sub(r"[^0-9]", "", input).strip()
# convert_datetime_feature_fn = lambda input: pd.Series([int(x) for x in datetime.strftime(datetime.strptime(input, "%Y-%m-%dT%H:%M"),"%d%m%Y:%H%M").split(':')])
convert_datetime_feature_intermediate_fn = lambda input: datetime.strftime(datetime.strptime(input, "%Y-%m-%dT%H:%M"),"%d%m%Y:%H%M")
convert_datetime_feature_final_fn = lambda input: pd.Series([float(x) for x in input.split(':')])

def convert_datetime_feature_intermediate_fn(input): return datetime.strftime(
datetime.strptime(input, "%Y-%m-%dT%H:%M"), "%d%m%Y:%H%M")
def convert_datetime_feature_final_fn(input): return pd.Series(
[float(x) for x in input.split(':')])

print('[1] Converting IBAN to Numerical Feature [DE4875000009209924 -> 4875000009209924]')
df['IBAN'] = df['IBAN'].progress_apply(numericalize_text_feature_fn) if show_progress_bar else df['IBAN'].apply(numericalize_text_feature_fn)
df['IBAN'] = df['IBAN'].progress_apply(
numericalize_text_feature_fn) if show_progress_bar else df['IBAN'].apply(numericalize_text_feature_fn)
print(f'Time Taken until here: {(time.time()-start_time)} seconds')
print('[2] Converting receiverID to Numerical Feature [FOR55821814 -> 55821814]')
df['receiverID'] = df['receiverID'].progress_apply(numericalize_text_feature_fn) if show_progress_bar else df['receiverID'].apply(numericalize_text_feature_fn)
df['receiverID'] = df['receiverID'].progress_apply(
numericalize_text_feature_fn) if show_progress_bar else df['receiverID'].apply(numericalize_text_feature_fn)
print(f'Time Taken until here: {(time.time()-start_time)} seconds')
print('[3] Converting time to Numerical Feature [2011-01-29T15:28 -> [29012011, 1528]]')
print('\t[3.1] STAGE 1: Conversion of Date to the Required Format [2011-01-29T15:28 -> 29012011:1528]')
df['time'] = df['time'].progress_apply(convert_datetime_feature_intermediate_fn) if show_progress_bar else df['time'].apply(convert_datetime_feature_intermediate_fn)
df['time'] = df['time'].progress_apply(convert_datetime_feature_intermediate_fn) if show_progress_bar else df['time'].apply(
convert_datetime_feature_intermediate_fn)
print(f'Time Taken until here: {(time.time()-start_time)} seconds')
print('\t[3.2] STAGE 2: Conversion of Formatted Date to New Features [29012011:1528 -> [29012011, 1528]]')
print('This Stage takes long time to complete after Bar reaches 100% as it also writes to the TWO new Columns [date, time]')
df[['date','time']] = df['time'].progress_apply(convert_datetime_feature_final_fn) if show_progress_bar else df['time'].apply(convert_datetime_feature_final_fn)
print(
'This Stage takes long time to complete after Bar reaches 100% as it also writes to the TWO new Columns [date, time]')
df[['date', 'time']] = df['time'].progress_apply(
convert_datetime_feature_final_fn) if show_progress_bar else df['time'].apply(convert_datetime_feature_final_fn)
print(f'Time Taken until here: {(time.time()-start_time)} seconds')
print('[4] Change Column Name isFraud to is_fraud due to SQL Case-insensitive Nature')
df = df.rename(columns={"isFraud": "is_fraud"}, errors="raise")
Expand All @@ -227,23 +246,29 @@ def prepare_tpcxai_fraud_transactions(dataset_folder,nrows=None,skip_rows=0):
print(f'Time Taken until here: {(time.time()-start_time)} seconds')

# DataType Conversion. All features can be made Integers, hence making them int64
dtype=np.float32
df = df.astype({"IBAN": dtype, "receiverID": dtype, "date": dtype, "time": dtype, "is_fraud": np.int8})
dtype = np.float32
df = df.astype({"IBAN": dtype, "receiverID": dtype,
"date": dtype, "time": dtype, "is_fraud": np.int8})

print('-'*50)
print('Feature Engineering and Creating the New Dataset DONE')
print(f'Total Time Taken for Preparing the New Dataset: {(time.time()-start_time)} seconds')
print(
f'Total Time Taken for Preparing the New Dataset: {(time.time()-start_time)} seconds')
return df


def prepare_criteo(dataset_folder):
    """Download and extract the Criteo kaggle2014 dataset (libsvm format).

    Parameters
    ----------
    dataset_folder : str
        Directory into which the archive is downloaded and extracted.

    Side effects
    ------------
    Downloads ``criteo.kaggle2014.svm.tar.xz`` via ``download_data`` (a
    project helper defined elsewhere in this file) and extracts
    ``train.txt.svm`` / ``test.txt.svm`` into ``dataset_folder``.
    Returns ``None``.
    """
    url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/criteo.kaggle2014.svm.tar.xz"
    local_url = download_data(url, dataset_folder)
    train_path = relative2abspath(
        dataset_folder, "criteo.kaggle2014.svm", "train.txt.svm")
    test_path = relative2abspath(
        dataset_folder, "criteo.kaggle2014.svm", "test.txt.svm")

    # Only extract when the expected files are missing, so re-runs are
    # idempotent and skip the (expensive) decompression.
    if not (os.path.isfile(train_path) and os.path.isfile(test_path)):
        # NOTE(review): os.system with an f-string is fragile if
        # dataset_folder ever contains spaces or shell metacharacters;
        # consider subprocess.run(["tar", "-Jxf", local_url, "-C",
        # dataset_folder], check=True) — left as-is to preserve behavior.
        os.system(f"tar -Jxf {local_url} -C {dataset_folder}")


def get_connection(pgsqlconfig):
return psycopg2.connect(
database=pgsqlconfig["dbname"],
Expand Down Expand Up @@ -273,7 +298,8 @@ def make_query(dataset, datasetconfig, column_names):
elif dataset == "tpcxai_fraud":
if datasetconfig['y_col'] in column_names:
column_names.remove(datasetconfig['y_col'])
feature_names = ", ".join([f"{col_name} DECIMAL NOT NULL" for col_name in column_names])
feature_names = ", ".join(
[f"{col_name} DECIMAL NOT NULL" for col_name in column_names])
label_name = f"{datasetconfig['y_col']} INTEGER NOT NULL"
create_query = f"CREATE TABLE ** ({feature_names}, {label_name})"
else:
Expand Down Expand Up @@ -370,22 +396,26 @@ def create_tables(
elif dataset == 'covtype':
df = prepare_covtype(dataset_folder, nrows=nrows)

elif dataset=="tpcxai_fraud":
elif dataset == "tpcxai_fraud":
if nrows:
df = prepare_tpcxai_fraud_transactions(dataset_folder, nrows=nrows)
else:
partition_size = 1000000
num_rows = datasetconfig[f"rows_sf{args.scalefactor}"] if ("scalefactor" in args) else datasetconfig[f"rows"]
num_rows = datasetconfig[f"rows_sf{args.scalefactor}"] if (
"scalefactor" in args) else datasetconfig[f"rows"]
num_partitions = num_rows//partition_size
print('-'*50)
print(f'Processing Partition Number 1 of {num_partitions+1}')
print('-'*50)
df = prepare_tpcxai_fraud_transactions(dataset_folder, nrows=partition_size)
for i in range(1,num_partitions+1):
df = prepare_tpcxai_fraud_transactions(
dataset_folder, nrows=partition_size)
for i in range(1, num_partitions+1):
print('-'*50)
print(f'Processing Partition Number {i+1} of {num_partitions+1}')
print(
f'Processing Partition Number {i+1} of {num_partitions+1}')
print('-'*50)
df = pd.concat([df,prepare_tpcxai_fraud_transactions(dataset_folder, nrows=partition_size, skip_rows=range(1,partition_size*i))])
df = pd.concat([df, prepare_tpcxai_fraud_transactions(
dataset_folder, nrows=partition_size, skip_rows=range(1, partition_size*i))])
print(f'Final Shape of DataFrame: {df.shape}')

elif dataset == 'criteo':
Expand Down
Loading