-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_cleaning.py
More file actions
291 lines (232 loc) · 10.3 KB
/
data_cleaning.py
File metadata and controls
291 lines (232 loc) · 10.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
from dateutil.parser import parse
import pandas as pd
#Disable Chained Assigment warning in Pandas
pd.options.mode.chained_assignment = None # default='warn'
class DataCleaning:
"""Class to clean data from various sources
Methods
clean_user_data(user_df):
Cleans the user data dataframe
clean_card_data(card_df):
Cleans the card data dataframe
clean_store_data(store_df):
Cleans the store data dataframe
convert_product_weights(product_df):
Converts all weights to kg in the product data dataframe
clean_products_data(products_df):
Cleans the product data dataframe
clean_orders_data(orders_df):
Cleans the orders data dataframe
clean_date_time_data(date_time_df):
Cleans the date/time data dataframe
"""
def clean_user_data(self, user_df):
"""Cleans the user data dataframe
Parameters
user_df : dataframe
Dataframe of raw user data
Returns
clean_user_df : dataframe
Dataframe of clean user data
"""
# FIlter countries
allowed_countries = ['Germany', 'United Kingdom', 'United States']
country_mask = user_df['country'].isin(allowed_countries)
user_df = user_df[country_mask]
# Filter country codes
allowed_country_code = ['DE', 'GB', 'US', 'GGB']
country_code_mask = user_df['country_code'].isin(allowed_country_code)
user_df = user_df[country_code_mask]
user_df['country_code'] = user_df['country_code'].str.replace('GGB', 'GB', regex=False)
# Remove NULLs
user_df.dropna(inplace=True)
# Format dates
user_df['date_of_birth'] = user_df['date_of_birth'].apply(parse)
user_df.date_of_birth = pd.to_datetime(user_df.date_of_birth, errors='coerce')
user_df['join_date'] = user_df['join_date'].apply(parse)
user_df.join_date = pd.to_datetime(user_df.join_date, errors='coerce')
# Correct index
user_df.reset_index(drop=True, inplace=True)
clean_user_df = user_df
return clean_user_df
def clean_card_data(self, card_df):
"""Cleans the card data dataframe
Parameters
card_df : dataframe
Dataframe of raw card data
Returns
clean_card_df : dataframe
Dataframe of clean card data
"""
# Cardnumbers should only have numbers
card_df['card_number'] = card_df['card_number'].astype(str)
card_df['card_number'] = card_df['card_number'].str.replace('?', '', regex=False)
# Expiry_date should be XX/XX
expiry_mask=card_df.expiry_date.str.contains('/')
card_df=card_df[expiry_mask]
# Remove NULLs
card_df.dropna(inplace=True)
# Format dates
card_df['date_payment_confirmed'] = card_df['date_payment_confirmed'].apply(parse)
card_df.date_payment_confirmed = pd.to_datetime(card_df.date_payment_confirmed, errors='coerce')
# Correct index
card_df.reset_index(drop=True, inplace=True)
clean_card_df = card_df
return clean_card_df
def clean_store_data(self, store_df):
"""Cleans the store data dataframe
Parameters
store_df : dataframe
Dataframe of raw store data
Returns
clean_store_df : dataframe
Dataframe of clean store data
"""
# Correct country codes
allowed_country_code = ['DE', 'GB', 'US']
country_code_mask = store_df['country_code'].isin(allowed_country_code)
store_df = store_df[country_code_mask]
# Format dates
store_df['opening_date'] = store_df['opening_date'].apply(parse)
store_df.opening_date = pd.to_datetime(store_df.opening_date, errors='coerce')
# Correct head count
store_df['staff_numbers'] = store_df['staff_numbers'].astype(str)
regex_hc = "[^0-9]"
store_df['staff_numbers'] = store_df['staff_numbers'].str.replace(regex_hc, '', regex=True)
store_df['staff_numbers'] = pd.to_numeric(store_df.staff_numbers, errors='coerce')
# Correct continent
store_df['continent']=store_df['continent'].str.replace('ee', '', regex=False)
# Correct index
store_df.reset_index(drop=True, inplace=True)
clean_store_df = store_df
return clean_store_df
def convert_product_weights(self, product_df):
"""Converts all weights to kg in the product data dataframe
Parameters
product_df : dataframe
Dataframe of raw product data
Returns
converted_product_df : dataframe
Dataframe of product data with weights all in kg
"""
def multipack_string_to_weight(string):
"""Converts weights of multipack items
Parameters
string : string
String of multipack item count and weight eg "5 x 15g"
Returns
value : float
Weight of multipack item eg "75"
"""
list=string.split(sep=' ')
value = float(list[0])*float(list[2])
return value
product_df['weight']=product_df['weight'].astype(str)
# Sort values with kg
kg_mask = product_df.weight.str.endswith('kg')
kg_df = product_df[kg_mask]
# Remove kg
kg_df['weight'] = kg_df['weight'].str.strip('kg')
kg_df['weight'] = kg_df['weight'].astype(float)
# Sort values with g
g_mask = (product_df.weight.str.endswith('g')) & (product_df['weight'].str[-2] != 'k') & (~product_df.weight.str.contains('x'))
g_df = product_df[g_mask]
# Remove g and convert to kg
g_df['weight'] = g_df['weight'].str.strip('g')
g_df['weight'] = g_df['weight'].astype(float)
g_df['weight'] = g_df['weight']/1000
# Multipack items
multi_mask = product_df.weight.str.contains('x')
multi_df = product_df[multi_mask]
# Remove g and multiply, convert to kg
multi_df['weight'] = multi_df['weight'].str.strip('g')
multi_df['weight'] = multi_df['weight'].apply(multipack_string_to_weight)
multi_df['weight'] = multi_df['weight'].astype(float)
multi_df['weight'] = multi_df['weight']/1000
# Sort values with ml
ml_mask = product_df.weight.str.endswith('ml')
ml_df = product_df[ml_mask]
# Remove 'ml' and convert to kg
ml_df['weight'] = ml_df['weight'].str.strip('ml')
ml_df['weight'] = ml_df['weight'].astype(float)
ml_df['weight'] = ml_df['weight']/1000
# Sort values with oz
oz_mask = (product_df.weight.str.endswith('oz'))
oz_df = product_df[oz_mask]
# Remove 'oz' and convert to kg
oz_df['weight'] = oz_df['weight'].str.strip('oz')
oz_df['weight'] = oz_df['weight'].astype(float)
oz_df['weight'] = oz_df['weight']*0.0283495
# Sort for incorrect g values
ig_mask = (product_df.weight.str.endswith('g .'))
ig_df = product_df[ig_mask]
# Remove 'g' and convert to kg
ig_df['weight'] = ig_df['weight'].str.strip('g .')
ig_df['weight'] = ig_df['weight'].astype(float)
ig_df['weight'] = ig_df['weight']/1000
# Concat frames
frames = [kg_df, g_df, multi_df, ml_df, oz_df, ig_df]
product_df = pd.concat(frames)
converted_product_df = product_df
return converted_product_df
def clean_products_data(self, products_df):
"""Cleans the product data dataframe
Parameters
products_df : dataframe
Dataframe of raw product data
Returns
clean_products_df : dataframe
Dataframe of clean product data
"""
# Correct removed column
products_df['removed']=products_df['removed'].str.replace('Still_avaliable', 'Still_available', regex=False)
allowed_removed = ['Still_available', 'Removed']
removed_mask = products_df['removed'].isin(allowed_removed)
products_df = products_df[removed_mask]
# Convert price to float
products_df['product_price']=products_df['product_price'].str.replace('£', '', regex=False)
products_df['product_price'] = pd.to_numeric(products_df.product_price, errors='coerce')
# Correct dates
products_df['date_added'] = products_df['date_added'].apply(parse)
products_df.date_added = pd.to_datetime(products_df.date_added, errors='coerce')
# Correct ean case
products_df.rename(columns={"EAN":"ean"}, inplace=True)
# Correct index
products_df.reset_index(drop=True, inplace=True)
clean_products_df = products_df
return clean_products_df
def clean_orders_data(self, orders_df):
"""Cleans the order data dataframe
Parameters
orders_df : dataframe
Dataframe of raw order data
Returns
clean_orders_df : dataframe
Dataframe of clean order data
"""
# Drop column "1"
orders_df.drop('1', axis=1, inplace=True)
orders_df.drop('first_name', axis=1, inplace=True)
orders_df.drop('last_name', axis=1, inplace=True)
orders_df['card_number'] = orders_df['card_number'].astype(str)
# Correct index
orders_df.reset_index(drop=True, inplace=True)
clean_orders_df = orders_df
return clean_orders_df
def clean_date_time_data(self, date_time_df):
"""Cleans the date/time data dataframe
Parameters
date_time_df : dataframe
Dataframe of raw date/time data
Returns
clean_date_time_df : dataframe
Dataframe of clean date/time data
"""
# Remove NULLs and Errors
allowed_time_period = ['Evening', 'Morning', 'Midday', 'Late_Hours']
time_period_mask = date_time_df['time_period'].isin(allowed_time_period)
date_time_df = date_time_df[time_period_mask]
# Correct index
date_time_df.reset_index(drop=True, inplace=True)
clean_date_time_df = date_time_df
return clean_date_time_df