-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_processing.py
More file actions
106 lines (93 loc) · 3.88 KB
/
data_processing.py
File metadata and controls
106 lines (93 loc) · 3.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# -*- coding: utf-8 -*-
"""data_processing.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/12aPKyEUpN4fac06RXDnuyNWs_v5ArF_x
"""
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from tensorflow import data
from keras.layers import TextVectorization
def load_purchase_attributes(inputPath):
    """Read the purchase-history CSV at *inputPath* into a DataFrame.

    The file's own header row is consumed (``header=0``) and replaced by
    the canonical column names used throughout this module.
    """
    column_names = ['Location', 'Description', 'Cost', 'Category', 'Day', 'Month', 'Year']
    return pd.read_csv(inputPath, header=0, names=column_names)
def process_purchase_attributes(df, train, test):
    """Build dense feature matrices for the train/test splits.

    Continuous columns are min-max scaled to [0, 1]; the scaler is fit on
    the training split only, so no test-set statistics leak into training.
    Categorical columns are one-hot encoded with categories learned from
    the full *df*, so train and test share one consistent encoding.

    Returns a ``(trainX, testX)`` tuple of dense numpy arrays with the
    one-hot columns first and the scaled continuous columns last.
    """
    # Continuous feature columns to scale into [0, 1].
    continuous = ['Cost']
    cs = MinMaxScaler()
    trainContinuous = cs.fit_transform(train[continuous])
    testContinuous = cs.transform(test[continuous])
    # Categorical feature columns to one-hot encode.
    categorical = ['Location', 'Day', 'Month', 'Year']
    # FIX: `sparse=False` was deprecated in scikit-learn 1.2 and removed in
    # 1.4; `sparse_output=False` is the current spelling and still yields a
    # dense ndarray for np.hstack below.
    onehot = OneHotEncoder(sparse_output=False).fit(df[categorical])
    trainCategorical = onehot.transform(train[categorical])
    testCategorical = onehot.transform(test[categorical])
    # Concatenate categorical and continuous features into one matrix per split.
    trainX = np.hstack([trainCategorical, trainContinuous])
    testX = np.hstack([testCategorical, testContinuous])
    return (trainX, testX)
def process_purchase_attributes_all(df):
    """Build one dense feature matrix for the whole dataset *df*.

    Same feature pipeline as ``process_purchase_attributes`` but without a
    train/test split: 'Cost' is min-max scaled to [0, 1] and the categorical
    columns are one-hot encoded, then the two parts are concatenated
    (one-hot columns first).

    Returns the combined feature matrix as a dense numpy array.
    """
    # Continuous feature columns to scale into [0, 1].
    # NOTE: the original rebound the column-name lists to the transformed
    # arrays; distinct names are used here to avoid that shadowing.
    continuous_cols = ['Cost']
    cs = MinMaxScaler()
    continuous_features = cs.fit_transform(df[continuous_cols])
    # Categorical feature columns to one-hot encode.
    categorical_cols = ['Location', 'Day', 'Month', 'Year']
    # FIX: `sparse=False` was deprecated in scikit-learn 1.2 and removed in
    # 1.4; `sparse_output=False` is the current spelling.
    onehot = OneHotEncoder(sparse_output=False).fit(df[categorical_cols])
    categorical_features = onehot.transform(df[categorical_cols])
    # Concatenate categorical and continuous features into one matrix.
    X = np.hstack([categorical_features, continuous_features])
    return (X)
def process_purchase_labels(df, train, test):
    """One-hot encode the 'Category' label for the train/test splits.

    The encoder learns its categories from the full *df* so both splits
    share the same label encoding.

    Returns a ``(trainY, testY)`` tuple of dense numpy arrays.
    """
    categorical = ['Category']
    # FIX: the original fit/transformed the entire frames, leaving the
    # `categorical` list unused — if callers pass frames with extra columns,
    # every column got encoded. Select the label column explicitly.
    # (Behavior-identical when callers already pass Category-only frames.)
    # FIX: `sparse=False` was removed in scikit-learn 1.4; use `sparse_output`.
    onehot = OneHotEncoder(sparse_output=False).fit(df[categorical])
    trainY = onehot.transform(train[categorical])
    testY = onehot.transform(test[categorical])
    return (trainY, testY)
def create_vocab_index(train):
    """Adapt a TextVectorization layer on *train* and expose its vocabulary.

    Capped at 20000 tokens with sequences padded/truncated to length 200.

    Returns ``(voc, word_index, vectorizer)`` where *word_index* maps each
    vocabulary token to its integer index in *voc*.
    """
    vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
    # Feed the raw text through a batched tf.data pipeline for adapt().
    text_ds = data.Dataset.from_tensor_slices(train).batch(128)
    vectorizer.adapt(text_ds)
    voc = vectorizer.get_vocabulary()
    word_index = {token: index for index, token in enumerate(voc)}
    return voc, word_index, vectorizer
def create_embeddings_index(path_to_glove_file="/content/glove.6B.100d.txt"):
    """Parse a GloVe text file into a ``{word: vector}`` dict.

    Each line has the form ``<word> <f1> <f2> ...``; vectors are returned
    as float32 numpy arrays.

    FIX: the original joined ``os.path.expanduser("~")`` with this absolute
    path — ``os.path.join`` discards everything before an absolute
    component, so the result was always just the literal path. The dead
    join is dropped and the path is now an overridable parameter whose
    default preserves the original behavior.

    :param path_to_glove_file: location of the GloVe embeddings file.
    :return: dict mapping each word to its embedding vector.
    """
    embeddings_index = {}
    # GloVe files are UTF-8; be explicit so the platform default can't break parsing.
    with open(path_to_glove_file, encoding="utf-8") as f:
        for line in f:
            word, coefs = line.split(maxsplit=1)
            # np.fromstring(text, sep=...) is deprecated; split + asarray is
            # the supported equivalent.
            embeddings_index[word] = np.asarray(coefs.split(), dtype="float32")
    return embeddings_index
def create_embedding_matrix(voc, word_index, embeddings_index):
    """Assemble a dense pretrained-embedding matrix for the vocabulary.

    Each row is looked up in *embeddings_index*; words with no pretrained
    vector — including the padding and OOV slots — stay all-zero rows.

    Returns ``(num_tokens, embedding_dim, embedding_matrix)``.
    """
    embedding_dim = 100
    # Two extra rows are reserved (padding / OOV), matching the Keras recipe.
    num_tokens = len(voc) + 2
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for word, row in word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[row] = vector
    return num_tokens, embedding_dim, embedding_matrix