multinational-retail-data-centralisation/data_cleaning.py at main · d-killen/multinational-retail-data-centralisation · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
from dateutil.parser import parse
import pandas as pd


#Disable Chained Assigment warning in Pandas
pd.options.mode.chained_assignment = None  # default='warn'

class DataCleaning:
    """Class to clean data from various sources

    Methods
        clean_user_data(user_df):
            Cleans the user data dataframe
        clean_card_data(card_df):
            Cleans the card data dataframe
        clean_store_data(store_df):
            Cleans the store data dataframe
        convert_product_weights(product_df):
            Converts all weights to kg in the product data dataframe
        clean_products_data(products_df):
            Cleans the product data dataframe
        clean_orders_data(orders_df):
            Cleans the orders data dataframe
        clean_date_time_data(date_time_df):
            Cleans the date/time data dataframe
    """
    def clean_user_data(self, user_df):
        """Cleans the user data dataframe

        Parameters
            user_df : dataframe
                Dataframe of raw user data

        Returns
            clean_user_df : dataframe
                Dataframe of clean user data
        """
        # FIlter countries
        allowed_countries = ['Germany', 'United Kingdom', 'United States']
        country_mask = user_df['country'].isin(allowed_countries)
        user_df = user_df[country_mask]

        # Filter country codes
        allowed_country_code = ['DE', 'GB', 'US', 'GGB']
        country_code_mask = user_df['country_code'].isin(allowed_country_code)
        user_df = user_df[country_code_mask]
        user_df['country_code'] = user_df['country_code'].str.replace('GGB', 'GB', regex=False)

        # Remove NULLs
        user_df.dropna(inplace=True)

        # Format dates
        user_df['date_of_birth'] = user_df['date_of_birth'].apply(parse)
        user_df.date_of_birth = pd.to_datetime(user_df.date_of_birth, errors='coerce')
        user_df['join_date'] = user_df['join_date'].apply(parse)
        user_df.join_date = pd.to_datetime(user_df.join_date, errors='coerce')

        # Correct index
        user_df.reset_index(drop=True, inplace=True)

        clean_user_df = user_df
        return clean_user_df

    def clean_card_data(self, card_df):
        """Cleans the card data dataframe

        Parameters
            card_df : dataframe
                Dataframe of raw card data

        Returns
            clean_card_df : dataframe
                Dataframe of clean card data
        """
        # Cardnumbers should only have numbers
        card_df['card_number'] = card_df['card_number'].astype(str)
        card_df['card_number'] = card_df['card_number'].str.replace('?', '', regex=False)

        # Expiry_date should be XX/XX
        expiry_mask=card_df.expiry_date.str.contains('/')
        card_df=card_df[expiry_mask]

        # Remove NULLs
        card_df.dropna(inplace=True)

        # Format dates
        card_df['date_payment_confirmed'] = card_df['date_payment_confirmed'].apply(parse)
        card_df.date_payment_confirmed = pd.to_datetime(card_df.date_payment_confirmed, errors='coerce')

        # Correct index
        card_df.reset_index(drop=True, inplace=True)

        clean_card_df = card_df
        return clean_card_df

    def clean_store_data(self, store_df):
        """Cleans the store data dataframe

        Parameters
            store_df : dataframe
                Dataframe of raw store data

        Returns
            clean_store_df : dataframe
                Dataframe of clean store data
        """
        # Correct country codes
        allowed_country_code = ['DE', 'GB', 'US']
        country_code_mask = store_df['country_code'].isin(allowed_country_code)
        store_df = store_df[country_code_mask]

        # Format dates
        store_df['opening_date'] = store_df['opening_date'].apply(parse)
        store_df.opening_date = pd.to_datetime(store_df.opening_date, errors='coerce')

        # Correct head count
        store_df['staff_numbers'] = store_df['staff_numbers'].astype(str)
        regex_hc = "[^0-9]"
        store_df['staff_numbers'] = store_df['staff_numbers'].str.replace(regex_hc, '', regex=True)
        store_df['staff_numbers'] = pd.to_numeric(store_df.staff_numbers, errors='coerce')

        # Correct continent
        store_df['continent']=store_df['continent'].str.replace('ee', '', regex=False)

        # Correct index
        store_df.reset_index(drop=True, inplace=True)

        clean_store_df = store_df
        return clean_store_df

    def convert_product_weights(self, product_df):
        """Converts all weights to kg in the product data dataframe

        Parameters
            product_df : dataframe
                Dataframe of raw product data

        Returns
            converted_product_df : dataframe
                Dataframe of product data with weights all in kg
        """
        def multipack_string_to_weight(string):
            """Converts weights of multipack items

            Parameters
                string : string
                    String of multipack item count and weight eg "5 x 15g"

            Returns
                value : float
                    Weight of multipack item eg "75"
            """
            list=string.split(sep=' ')
            value = float(list[0])*float(list[2])
            return value

        product_df['weight']=product_df['weight'].astype(str)

        # Sort values with kg
        kg_mask = product_df.weight.str.endswith('kg')
        kg_df = product_df[kg_mask]
        # Remove kg
        kg_df['weight'] = kg_df['weight'].str.strip('kg')
        kg_df['weight'] = kg_df['weight'].astype(float)

        # Sort values with g
        g_mask = (product_df.weight.str.endswith('g')) & (product_df['weight'].str[-2] != 'k') & (~product_df.weight.str.contains('x'))
        g_df = product_df[g_mask]
        # Remove g and convert to kg
        g_df['weight'] = g_df['weight'].str.strip('g')
        g_df['weight'] = g_df['weight'].astype(float)
        g_df['weight'] = g_df['weight']/1000

        # Multipack items
        multi_mask = product_df.weight.str.contains('x')
        multi_df = product_df[multi_mask]
        # Remove g and multiply, convert to kg
        multi_df['weight'] = multi_df['weight'].str.strip('g')
        multi_df['weight'] = multi_df['weight'].apply(multipack_string_to_weight)
        multi_df['weight'] = multi_df['weight'].astype(float)
        multi_df['weight'] = multi_df['weight']/1000

        # Sort values with ml
        ml_mask = product_df.weight.str.endswith('ml')
        ml_df = product_df[ml_mask]
        # Remove 'ml' and convert to kg
        ml_df['weight'] = ml_df['weight'].str.strip('ml')
        ml_df['weight'] = ml_df['weight'].astype(float)
        ml_df['weight'] = ml_df['weight']/1000

        # Sort values with oz
        oz_mask = (product_df.weight.str.endswith('oz'))
        oz_df = product_df[oz_mask]
        # Remove 'oz' and convert to kg
        oz_df['weight'] = oz_df['weight'].str.strip('oz')
        oz_df['weight'] = oz_df['weight'].astype(float)
        oz_df['weight'] = oz_df['weight']*0.0283495

        # Sort for incorrect g values
        ig_mask = (product_df.weight.str.endswith('g .'))
        ig_df = product_df[ig_mask]
        # Remove 'g' and convert to kg
        ig_df['weight'] = ig_df['weight'].str.strip('g .')
        ig_df['weight'] = ig_df['weight'].astype(float)
        ig_df['weight'] = ig_df['weight']/1000

        # Concat frames
        frames = [kg_df, g_df, multi_df, ml_df, oz_df, ig_df]
        product_df = pd.concat(frames)

        converted_product_df = product_df
        return converted_product_df

    def clean_products_data(self, products_df):
        """Cleans the product data dataframe

        Parameters
            products_df : dataframe
                Dataframe of raw product data

        Returns
            clean_products_df : dataframe
                Dataframe of clean product data
        """
        # Correct removed column
        products_df['removed']=products_df['removed'].str.replace('Still_avaliable', 'Still_available', regex=False)
        allowed_removed = ['Still_available', 'Removed']
        removed_mask = products_df['removed'].isin(allowed_removed)
        products_df = products_df[removed_mask]

        # Convert price to float
        products_df['product_price']=products_df['product_price'].str.replace('£', '', regex=False)
        products_df['product_price'] = pd.to_numeric(products_df.product_price, errors='coerce')

        # Correct dates
        products_df['date_added'] = products_df['date_added'].apply(parse)
        products_df.date_added = pd.to_datetime(products_df.date_added, errors='coerce')

        # Correct ean case
        products_df.rename(columns={"EAN":"ean"}, inplace=True)

        # Correct index
        products_df.reset_index(drop=True, inplace=True)

        clean_products_df = products_df
        return clean_products_df

    def clean_orders_data(self, orders_df):
        """Cleans the order data dataframe

        Parameters
            orders_df : dataframe
                Dataframe of raw order data

        Returns
            clean_orders_df : dataframe
                Dataframe of clean order data
        """
        # Drop column "1"
        orders_df.drop('1', axis=1, inplace=True)
        orders_df.drop('first_name', axis=1, inplace=True)
        orders_df.drop('last_name', axis=1, inplace=True)
        orders_df['card_number'] = orders_df['card_number'].astype(str)

        # Correct index
        orders_df.reset_index(drop=True, inplace=True)

        clean_orders_df = orders_df
        return clean_orders_df

    def clean_date_time_data(self, date_time_df):
        """Cleans the date/time data dataframe

        Parameters
            date_time_df : dataframe
                Dataframe of raw date/time data

        Returns
            clean_date_time_df : dataframe
                Dataframe of clean date/time data
        """
        # Remove NULLs and Errors
        allowed_time_period = ['Evening', 'Morning', 'Midday', 'Late_Hours']
        time_period_mask = date_time_df['time_period'].isin(allowed_time_period)
        date_time_df = date_time_df[time_period_mask]

        # Correct index
        date_time_df.reset_index(drop=True, inplace=True)

        clean_date_time_df = date_time_df
        return clean_date_time_df