-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_processing.py
More file actions
106 lines (93 loc) · 3.88 KB
/
data_processing.py
File metadata and controls
106 lines (93 loc) · 3.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# -*- coding: utf-8 -*-
"""data_processing.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/12aPKyEUpN4fac06RXDnuyNWs_v5ArF_x
"""
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from tensorflow import data
from keras.layers import TextVectorization
def load_purchase_attributes(inputPath):
    """Read the purchase-history CSV at *inputPath* into a DataFrame.

    The file's own header row is consumed (``header=0``) and replaced by
    the canonical column names used throughout this module.
    """
    column_names = ['Location', 'Description', 'Cost', 'Category', 'Day', 'Month', 'Year']
    return pd.read_csv(inputPath, header=0, names=column_names)
def process_purchase_attributes(df, train, test):
    """Build dense feature matrices for the train/test splits.

    Continuous columns are min-max scaled to [0, 1]; the scaler is fit on
    the training split only, so no test-set statistics leak into training.
    Categorical columns are one-hot encoded with categories learned from
    the full *df*, so train and test share one consistent encoding.

    Returns a ``(trainX, testX)`` tuple of dense numpy arrays with the
    one-hot columns first and the scaled continuous columns last.
    """
    # Continuous feature columns to scale into [0, 1].
    continuous = ['Cost']
    cs = MinMaxScaler()
    trainContinuous = cs.fit_transform(train[continuous])
    testContinuous = cs.transform(test[continuous])
    # Categorical feature columns to one-hot encode.
    categorical = ['Location', 'Day', 'Month', 'Year']
    # FIX: `sparse=False` was deprecated in scikit-learn 1.2 and removed in
    # 1.4; `sparse_output=False` is the current spelling and still yields a
    # dense ndarray for np.hstack below.
    onehot = OneHotEncoder(sparse_output=False).fit(df[categorical])
    trainCategorical = onehot.transform(train[categorical])
    testCategorical = onehot.transform(test[categorical])
    # Concatenate categorical and continuous features into one matrix per split.
    trainX = np.hstack([trainCategorical, trainContinuous])
    testX = np.hstack([testCategorical, testContinuous])
    return (trainX, testX)
def process_purchase_attributes_all(df):
    """Build one dense feature matrix for the whole dataset *df*.

    Same feature pipeline as ``process_purchase_attributes`` but without a
    train/test split: 'Cost' is min-max scaled to [0, 1] and the categorical
    columns are one-hot encoded, then the two parts are concatenated
    (one-hot columns first).

    Returns the combined feature matrix as a dense numpy array.
    """
    # Continuous feature columns to scale into [0, 1].
    # NOTE: the original rebound the column-name lists to the transformed
    # arrays; distinct names are used here to avoid that shadowing.
    continuous_cols = ['Cost']
    cs = MinMaxScaler()
    continuous_features = cs.fit_transform(df[continuous_cols])
    # Categorical feature columns to one-hot encode.
    categorical_cols = ['Location', 'Day', 'Month', 'Year']
    # FIX: `sparse=False` was deprecated in scikit-learn 1.2 and removed in
    # 1.4; `sparse_output=False` is the current spelling.
    onehot = OneHotEncoder(sparse_output=False).fit(df[categorical_cols])
    categorical_features = onehot.transform(df[categorical_cols])
    # Concatenate categorical and continuous features into one matrix.
    X = np.hstack([categorical_features, continuous_features])
    return (X)
def process_purchase_labels(df, train, test):
    """One-hot encode the 'Category' label for the train/test splits.

    The encoder learns its categories from the full *df* so both splits
    share the same label encoding.

    Returns a ``(trainY, testY)`` tuple of dense numpy arrays.
    """
    categorical = ['Category']
    # FIX: the original fit/transformed the entire frames, leaving the
    # `categorical` list unused — if callers pass frames with extra columns,
    # every column got encoded. Select the label column explicitly.
    # (Behavior-identical when callers already pass Category-only frames.)
    # FIX: `sparse=False` was removed in scikit-learn 1.4; use `sparse_output`.
    onehot = OneHotEncoder(sparse_output=False).fit(df[categorical])
    trainY = onehot.transform(train[categorical])
    testY = onehot.transform(test[categorical])
    return (trainY, testY)
def create_vocab_index(train):
    """Adapt a TextVectorization layer on *train* and expose its vocabulary.

    Capped at 20000 tokens with sequences padded/truncated to length 200.

    Returns ``(voc, word_index, vectorizer)`` where *word_index* maps each
    vocabulary token to its integer index in *voc*.
    """
    vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
    # Feed the raw text through a batched tf.data pipeline for adapt().
    text_ds = data.Dataset.from_tensor_slices(train).batch(128)
    vectorizer.adapt(text_ds)
    voc = vectorizer.get_vocabulary()
    word_index = {token: index for index, token in enumerate(voc)}
    return voc, word_index, vectorizer
def create_embeddings_index(path_to_glove_file="/content/glove.6B.100d.txt"):
    """Parse a GloVe text file into a ``{word: vector}`` dict.

    Each line has the form ``<word> <f1> <f2> ...``; vectors are returned
    as float32 numpy arrays.

    FIX: the original joined ``os.path.expanduser("~")`` with this absolute
    path — ``os.path.join`` discards everything before an absolute
    component, so the result was always just the literal path. The dead
    join is dropped and the path is now an overridable parameter whose
    default preserves the original behavior.

    :param path_to_glove_file: location of the GloVe embeddings file.
    :return: dict mapping each word to its embedding vector.
    """
    embeddings_index = {}
    # GloVe files are UTF-8; be explicit so the platform default can't break parsing.
    with open(path_to_glove_file, encoding="utf-8") as f:
        for line in f:
            word, coefs = line.split(maxsplit=1)
            # np.fromstring(text, sep=...) is deprecated; split + asarray is
            # the supported equivalent.
            embeddings_index[word] = np.asarray(coefs.split(), dtype="float32")
    return embeddings_index
def create_embedding_matrix(voc, word_index, embeddings_index):
    """Assemble a dense pretrained-embedding matrix for the vocabulary.

    Each row is looked up in *embeddings_index*; words with no pretrained
    vector — including the padding and OOV slots — stay all-zero rows.

    Returns ``(num_tokens, embedding_dim, embedding_matrix)``.
    """
    embedding_dim = 100
    # Two extra rows are reserved (padding / OOV), matching the Keras recipe.
    num_tokens = len(voc) + 2
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for word, row in word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[row] = vector
    return num_tokens, embedding_dim, embedding_matrix