-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_cleanning.py
More file actions
218 lines (133 loc) · 7.87 KB
/
data_cleanning.py
File metadata and controls
218 lines (133 loc) · 7.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
#!/usr/bin/env python
# coding: utf-8
# In[1]:
"""Part II. Code to clean the 'data_raw.csv' and get it ready for the recommender system"""
import pandas as pd
import numpy as np
#from pandas.io.json import json_normalize
from operator import itemgetter
import re
# In[10]:
# TESTING (inside the 'parsing_data' function)
#header = soup_test.find('div', id='title-and-menu-box')
# In[2]:
# to see what's going on
#df = pd.read_csv('data_raw.csv')
#df.reset_index(drop=True)
# In[3]:
def add_features(hike_df):
    """Classify each route into 12 binary characteristics.

    Every cell of the ``hike_attributes`` column is parsed as the string form
    of a list of quoted tags (for example ``"['río', 'vistas']"``). Each known
    tag switches on one of the 12 flag columns for that row; unknown tags are
    ignored. Cells that are already a list/tuple/set are used as-is, and any
    other non-string cell (such as NaN) leaves all flags at 0.

    Args:
        hike_df: DataFrame containing a ``hike_attributes`` column. It is
            modified in place.

    Returns:
        The same ``hike_df`` object, with the 12 flag columns appended and
        the ``hike_attributes`` column dropped.
    """
    # Order matters: it is the order in which the columns are appended.
    feature_columns = (
        'dog_friendly', 'kid_friendly', 'camping', 'trekking', 'near_water',
        'mountain_biking', 'great_views', 'bird_watching', 'climbing',
        'forests', 'trail_running', 'historic_place',
    )
    tag_to_column = {
        'apto para perros': 'dog_friendly',
        'perros con correa': 'dog_friendly',
        'apto para niños': 'kid_friendly',
        'acampada': 'camping',
        'senderismo': 'trekking',
        'excursiones por la naturaleza': 'trekking',
        'río': 'near_water',
        'cascada': 'near_water',
        'lago': 'near_water',
        'ciclismo de montaña': 'mountain_biking',
        'vistas': 'great_views',
        'conducción panorámica': 'great_views',
        'observación de aves': 'bird_watching',
        'fauna': 'bird_watching',
        'escalada': 'climbing',
        'rocoso': 'climbing',
        'trepar': 'climbing',
        'bosque': 'forests',
        'flores silvestres': 'forests',
        'trail running': 'trail_running',
        'lugar histórico': 'historic_place',
    }

    flags = {column: [] for column in feature_columns}
    for attribute in hike_df['hike_attributes']:
        if isinstance(attribute, str):
            # Strip the surrounding brackets, split the items, then strip
            # the quote character on each side of every item.
            tags = [item[1:-1] for item in attribute[1:-1].split(', ')]
        elif isinstance(attribute, (list, tuple, set)):
            tags = attribute
        else:
            tags = ()  # e.g. NaN: nothing to classify
        active = {tag_to_column[tag] for tag in tags if tag in tag_to_column}
        for column in feature_columns:
            flags[column].append(1 if column in active else 0)

    # Assign whole columns at once. Chained ``df[col].iloc[i] = 1`` writes to
    # a temporary object and is not guaranteed to reach the frame.
    for column in feature_columns:
        hike_df[column] = np.asarray(flags[column], dtype=np.int64)

    hike_df.drop('hike_attributes', axis=1, inplace=True)
    return hike_df
# In[4]:
def data_cleaning(df):
    """Clean the raw hikes table and derive the feature columns.

    Steps, in order: drop the saved-index column if present, turn
    ``distance`` and ``elevation`` into floats, strip the label from
    ``route_type``, encode ``difficulty_level`` as 1/2/3, rename the
    distance/elevation columns, assign a numeric ``hikeID`` per hike name,
    keep a fixed column order, one-hot encode ``route_type``, add the binary
    characteristic columns via ``add_features`` and parse ``user_ratings``
    from its string form.

    Args:
        df: Raw DataFrame as read from the CSV. Not modified; a new frame is
            built from it.

    Returns:
        The cleaned DataFrame.
    """
    # The column only exists when the CSV was written with its index.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')

    for column in ('distance', 'elevation'):
        # Keep digits and dots only. A missing value ends up as '' and is
        # coerced to NaN instead of crashing the float conversion.
        digits = df[column].apply(lambda x: re.sub(r'[^0-9.]', '', str(x)))
        df[column] = pd.to_numeric(digits, errors='coerce').astype(float)

    # Keep only the route-type value, without its label or line breaks.
    df['route_type'] = df['route_type'].apply(
        lambda x: re.sub(r'\n', '', re.sub(r'Tipo de ruta:', '', str(x)))
    )

    # Ordinal encoding; any other label becomes NaN.
    df['difficulty_level'] = df['difficulty_level'].map(
        {'fácil': 1, 'moderada': 2, 'difícil': 3}
    )

    # New names presumably record the units (km / m) -- confirm with the data.
    df = df.rename(columns={'distance': 'distance_kms', 'elevation': 'elevation_mts'})

    # One integer code per distinct hike name.
    df = df.assign(hikeID=df['hike_name'].astype('category').cat.codes)

    df = df[['hikeID', 'hike_name', 'region', 'distance_kms', 'elevation_mts',
             'difficulty_level', 'stars', 'num_reviews', 'user_ratings',
             'route_type', 'hike_attributes']]

    # drop_first removes the first category, so N route types yield N-1 columns.
    df = pd.get_dummies(df, columns=['route_type'], drop_first=True)
    df = add_features(df)

    # NOTE(security): eval executes whatever expression the cell contains.
    # Only run this on a trusted CSV; ast.literal_eval is a safer choice if
    # every cell is a plain list-of-dicts literal.
    df['user_ratings'] = df['user_ratings'].apply(eval)
    return df
# In[5]:
# To see df
#df = data_cleaning(df)
#df
# In[6]:
# Worth keeping in mind
#df[df['num_reviews']>29]
# In[7]:
#type(df['user_ratings'][0][0]) #now, its a list of dicts
# In[8]:
# To see df
#hike_user_rating_df
# In[9]:
def create_rating_df(df):
    """Expand the per-hike rating lists into one row per (hike, user) rating.

    Each cell of ``df['user_ratings']`` is iterated as a sequence of dicts.
    Every dict key is taken as the user name and the first element of its
    value as that user's rating.

    Args:
        df: DataFrame with ``hikeID``, ``hike_name`` and ``user_ratings``
            columns. Rows are read by position, so any index is accepted.

    Returns:
        A new DataFrame with the columns ``hike_name``, ``user``, ``rating``
        and ``hikeID`` (in that order), one row per rating.
    """
    # Read from the argument itself rather than a module-level frame, and
    # walk the columns positionally rather than by index label.
    rows = [
        (hike_name, user, values[0], hike_id)
        for hike_id, hike_name, ratings in zip(
            df['hikeID'], df['hike_name'], df['user_ratings']
        )
        for entry in ratings
        for user, values in entry.items()
    ]
    return pd.DataFrame(rows, columns=['hike_name', 'user', 'rating', 'hikeID'])
# In[10]:
def rating_df_cleanup(rating_df):
    """Normalise the ratings table and give every user a readable ID.

    The ``rating`` column of the frame passed in is converted to float in
    place. A ``userID`` of the form ``user_<code>`` is then derived from the
    category code of each user name, and the columns are put in a fixed order.

    Args:
        rating_df: DataFrame with ``hikeID``, ``hike_name``, ``user`` and
            ``rating`` columns.

    Returns:
        A new DataFrame with the columns ``hikeID``, ``hike_name``,
        ``userID``, ``user`` and ``rating``.
    """
    # Ratings become floats on the caller's frame as well.
    rating_df['rating'] = rating_df['rating'].astype(float)

    # One integer code per distinct user name.
    user_codes = rating_df['user'].astype('category').cat.codes
    with_ids = rating_df.assign(userID=user_codes)

    # Prefix the code so the ID reads as a label rather than a number.
    with_ids['userID'] = 'user_' + with_ids['userID'].astype(str)

    ordered_columns = ['hikeID', 'hike_name', 'userID', 'user', 'rating']
    return with_ids[ordered_columns]
# In[12]:
if __name__ == '__main__':
    # Pipeline: raw CSV -> cleaned hikes table -> per-user ratings table ->
    # merged table written to 'data_ready.csv'.
    df = pd.read_csv('data_raw.csv')  # Import the raw data
    # reset_index returns a new frame, so the result has to be kept.
    df = df.reset_index(drop=True)
    df = data_cleaning(df)  # First table: one row per hike

    # Independent copy holding just the columns the ratings step reads.
    hike_user_rating_df = df[['hikeID', 'hike_name', 'user_ratings']].copy()
    rating_df = create_rating_df(hike_user_rating_df)  # One row per user rating
    rating_df = rating_df_cleanup(rating_df)  # Second table: typed, with userID

    # Both tables carry 'hike_name', so the merge suffixes them _x / _y.
    data_ready = pd.merge(df, rating_df, on='hikeID')
    data_ready = data_ready.drop(columns=['user_ratings', 'hike_name_y'])
    data_ready = data_ready.rename(columns={'hike_name_x': 'hike_name'})

    # Final column order of the exported file.
    data_ready = data_ready[[
        'hikeID', 'hike_name', 'region', 'distance_kms', 'elevation_mts',
        'difficulty_level', 'stars', 'num_reviews', 'userID',
        'user', 'rating', 'route_type_De punto a punto', 'route_type_Ida y vuelta',
        'dog_friendly', 'kid_friendly', 'camping', 'trekking', 'near_water',
        'mountain_biking', 'great_views', 'bird_watching', 'climbing',
        'forests', 'trail_running', 'historic_place',
    ]]
    data_ready.to_csv('data_ready.csv', index=False)  # Single cleaned CSV
# In[13]:
#data_ready