-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathRecomm.Engine.py
More file actions
289 lines (189 loc) · 9.24 KB
/
Recomm.Engine.py
File metadata and controls
289 lines (189 loc) · 9.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 27 13:34:21 2020
@author: Mathew Pazhur
"""
#Importing -------------------------
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk.stem import WordNetLemmatizer
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import mysql.connector
#Importing stop words (requires a prior `nltk.download('stopwords')`;
# set membership is O(1), which matters in the per-word filter below)
stop_words = set(stopwords.words('english'))
#Mysql connection
mydb = mysql.connector.connect(
host="localhost",
user="root",
password="",
database="recom_eng_test"
)
mycursor = mydb.cursor()
#Total Products
mycursor = mydb.cursor()
mycursor.execute("SELECT * FROM product_test")
myresult = mycursor.fetchall()
products=pd.DataFrame(myresult, columns=['product_id','product_name','product_desc','p_type','temp'])
#Enter Customer history in cust_history dataframe
# Full order history for one customer, joined with product and store detail.
# Parameterized query (%s) -- safe against SQL injection from the prompt.
customer_id=int(input("Enter customer ID : "))
mycursor.execute("""SELECT o.order_id,o.customer_id,o.store_id,o.product_id,p.product_name,p.product_description,p.p_type,p.temp
from orders_test_2 o join customer c on o.customer_id=c.customer_id JOIN
product_test p on o.product_id=p.product_id JOIN
store_test_2 s on o.store_id = s.store_id and o.product_id=s.product_id
where o.customer_id=%s
order by 3,2,4""", (customer_id,))
myresult = mycursor.fetchall()
# Column list mirrors the SELECT order above (product_description is
# renamed to product_desc to match the `products` DataFrame).
cust_history=pd.DataFrame(myresult, columns=['order_id','customer_id','store_id','product_id','product_name','product_desc','p_type','temp'])
#Enter Store menu in products dataframe
# Menu of the store the customer is visiting; recommendations are later
# restricted to products this store actually stocks.
store_id=int(input("Enter Store ID where customer is visiting : "))
mycursor.execute("""SELECT s.store_id,s.store_name,s.product_id,p.product_name
from store_test_2 s join product_test p on s.product_id=p.product_id
where store_id=%s""",(store_id,))
myresult = mycursor.fetchall()
store_products=pd.DataFrame(myresult, columns=['store_id','store_name','product_id','product_name'])
# ---- Partition the customer's history into the four product categories ----
c_h_burger = cust_history[cust_history['p_type'].isin(['burger', 'roll'])]
c_h_drink = cust_history[cust_history['p_type'].isin(['beverage', 'coffee'])]
c_h_dessert = cust_history[cust_history['p_type'] == 'dessert']
c_h_sides = cust_history[cust_history['p_type'].isin(['sides', 'bfast'])]
# ---- Partition the full product catalogue the same way ----
tot_burger = products[products['p_type'].isin(['burger', 'roll'])]
tot_drink = products[products['p_type'].isin(['beverage', 'coffee'])]
tot_dessert = products[products['p_type'] == 'dessert']
tot_sides = products[products['p_type'].isin(['sides', 'bfast'])]
#Finding top products of customer
def top_prod(hist_df, n=1):
    """Return the customer's most-frequently ordered product(s).

    Parameters
    ----------
    hist_df : pd.DataFrame
        Order-history rows; must contain 'order_id' and 'product_id' columns.
    n : int, default 1
        How many top products to keep. Defaults to 1, matching the original
        hard-coded top-1 behaviour (backward compatible).

    Returns
    -------
    pd.DataFrame
        De-duplicated history rows for the top product(s). The 'order_id'
        column is dropped so repeat orders of the same product collapse to
        a single row.
    """
    # Order count per product, most frequently ordered first.
    counts = hist_df.groupby(['product_id']).size().sort_values(ascending=False)
    top_ids = list(counts[:n].index)
    # Drop order_id before de-duplicating: rows for the same product differ
    # only by order_id, so removing it lets drop_duplicates collapse them.
    result = hist_df.drop(columns=['order_id'])
    result = result[result['product_id'].isin(top_ids)].drop_duplicates()
    return result
# Favourite product per category -- these seed the per-category recommenders.
cust_top_prod_df=top_prod(cust_history) #top product overall (not used later in this file view)
cust_top_burger_df=top_prod(c_h_burger) #top burger/roll
cust_top_drink_df=top_prod(c_h_drink) #top beverage/coffee
cust_top_dessert_df=top_prod(c_h_dessert) #top dessert
cust_top_sides_df=top_prod(c_h_sides) #top sides/bfast
# Remnants of an earlier column-pruning approach, superseded by top_prod():
# cust_top_prod_df=cust_top_prod_df.drop(columns=['customer_id','store_id','product_desc'])
# cust_top_prod_df.drop_duplicates(keep='first',inplace=True)
#Preprocessing -----------------------
# Normalize every product description into a lemmatized, stopword-free
# string so the TF-IDF vectorizer below operates on clean tokens.
X = products['product_desc']
X = X.reset_index(drop=True)
documents = []
# NOTE: despite the name, this is a lemmatizer, not a stemmer; the name is
# kept for continuity with the rest of the file.
stemmer = WordNetLemmatizer()
for sen in range(0, len(X)):
    # Remove all the special characters (keep letters, digits, hyphens).
    # str() also turns NaN descriptions into the literal token 'nan'.
    document = re.sub(r'[^a-zA-Z0-9-]', ' ', str(X[sen]))
    # Remove all single characters surrounded by whitespace.
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    # Remove a single character at the START of the string.
    # (Bug fix: the original pattern r'\^[a-zA-Z]\s+' escaped the caret and
    # so matched a literal '^' -- which the special-character pass had
    # already stripped, making it dead code. '^' must be a bare anchor.)
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)
    # Substitute multiple spaces with a single space.
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    # Remove a prefixed 'b' (artifact of str() applied to bytes objects).
    document = re.sub(r'^b\s+', '', document)
    # Lowercase, then lemmatize word by word.
    document = document.lower()
    document = ' '.join(stemmer.lemmatize(word) for word in document.split())
    # Tokenize the sentence and drop stopwords.
    doc_word_tokens = word_tokenize(document)
    doc_cleaned_list = [w for w in doc_word_tokens if w not in stop_words]
    # join() replaces the original quadratic string concatenation; each kept
    # word carries a trailing space, exactly as the original loop produced.
    documents.append(''.join(w + ' ' for w in doc_cleaned_list))
# #Creating Tfidf vector object and removing stop words
tfidf = TfidfVectorizer(stop_words='english')
# #Replacing null values with empty string
# NOTE(review): this fillna runs AFTER `documents` was already built from
# this very column, so NaN descriptions were stringified to 'nan' in the
# TF-IDF corpus -- consider moving the fillna before the preprocessing loop.
products['product_desc']=products['product_desc'].fillna('')
# #Creating tfidf matrix (rows align with `products` row order)
tfidf_matrix = tfidf.fit_transform(documents)
#Use Cosine formula to get similarity score -- linear_kernel on TF-IDF rows
# (l2-normalized by TfidfVectorizer's default) equals cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
#give indices to products to identify them: product_id -> positional index
indices = pd.Series(products.index, index=products['product_id']).drop_duplicates()
# Keep the cleaned text alongside the catalogue for inspection/debugging.
products['preprocessed_desc']=documents
# Pre-declared accumulators (not referenced later in this file view).
fin_recomm=pd.DataFrame(columns=('product_id','product_name'))
intermed_list=[]
#recommendation function -----------------
def recomm(prod_id, fl, cosine_sim=cosine_sim):
    """Recommend up to 3 store-stocked products similar to `prod_id`.

    Parameters:
        prod_id: product_id of the customer's favourite in one category.
        fl: category flag -- 1=burgers/rolls, 2=drinks, 3=sides/bfast,
            4=desserts; selects which slice of the store menu to match.
        cosine_sim: TF-IDF cosine-similarity matrix (bound at definition
            time to the module-level matrix).

    Returns:
        Slice of `products` (first two columns: product_id, product_name)
        holding at most 3 similar products available at the chosen store.
    """
    # Positional row of this product in `products` / `cosine_sim`.
    idx = indices[prod_id]
    # Get the pairwise similarity scores of all products with that product
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort the products based on the similarity scores, best match first
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Restrict the visited store's menu to each product category.
    store_burger=store_products[store_products['product_id'].isin(list(tot_burger['product_id']))]
    store_dessert=store_products[store_products['product_id'].isin(list(tot_dessert['product_id']))]
    store_drink=store_products[store_products['product_id'].isin(list(tot_drink['product_id']))]
    store_sides=store_products[store_products['product_id'].isin(list(tot_sides['product_id']))]
    if(fl==1):
        # NOTE(review): fl==1 is the burger branch below, yet this dump
        # prints the store's DRINKS -- looks like leftover debug output.
        print("-------------------------")
        print(store_drink)
        print("-------------------------")
        # print(store_burger)
        # print("-------------------------")
        # print(store_dessert)
        # print("-------------------------")
        # print(store_sides)
    # Get the products indices (positions into `products`, best match first)
    product_indices = [i[0] for i in sim_scores]
    pi2=[]
    # Keep only candidates the store stocks, preserving similarity order.
    # NOTE(review): the `index + 1 == product_id` test assumes product_ids
    # are sequential and 1-based in `products` -- confirm against the DB;
    # a join on product_id would be more robust.
    if(fl==1):
        for alpha in range(len(product_indices)):
            for beta in store_burger.product_id:
                if(product_indices[alpha]+1==beta):
                    pi2.append(product_indices[alpha])
    elif(fl==2):
        for alpha in range(len(product_indices)):
            for beta in store_drink.product_id:
                if(product_indices[alpha]+1==beta):
                    pi2.append(product_indices[alpha])
    elif(fl==3):
        for alpha in range(len(product_indices)):
            for beta in store_sides.product_id:
                if(product_indices[alpha]+1==beta):
                    pi2.append(product_indices[alpha])
    elif(fl==4):
        for alpha in range(len(product_indices)):
            for beta in store_dessert.product_id:
                if(product_indices[alpha]+1==beta):
                    pi2.append(product_indices[alpha])
    # Cap candidates at 10; only the first 3 survive the slice below.
    pi2=pi2[:10]
    #Testing performance
    # simtest=[]
    # for x in sim_scores:
    #     if(int(x[0]) in pi2):
    #         simtest.append(x[0:2])
    # Return the top 3 most similar products (columns 0,1 = id and name).
    out=products.iloc[pi2, [0,1]]
    out=out[:3] #top 3 recommendations
    return(out)
#Code for recommendations of all products
# Print the top-3 recommendations for each category, seeded by the
# customer's historical favourite in that category.
_categories = (
    ("For Burgers : ", cust_top_burger_df, 1),
    ("For Drinks: ", cust_top_drink_df, 2),
    ("For sides and breakfast : ", cust_top_sides_df, 3),
    ("For desserts : ", cust_top_dessert_df, 4),
)
_results = []
for _heading, _top_df, _type_flag in _categories:
    print(_heading)
    _rec = recomm(int(_top_df.product_id), _type_flag)
    print(_rec)
    _results.append(_rec)
# Preserve the original module-level result names.
burg_recomm, drink_recomm, sides_recomm, dessert_recomm = _results