-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbing_api.py
More file actions
287 lines (252 loc) · 11 KB
/
bing_api.py
File metadata and controls
287 lines (252 loc) · 11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 2 00:38:48 2015
@authors: Rohan Kulkarni , Aishwarya Rajesh
"""
import urllib2
import urllib
import base64
import json
import collections
import math
import numpy as np
from stop_words import get_stop_words
import nltk
import re
def bing_api(bingUrl,accountKey):
    """
    Query the Bing API and return the raw response body.

    bingUrl    : fully formed request URL, including the query string.
    accountKey : Bing account key; it is sent as both the user name and the
                 password of an HTTP Basic 'Authorization' header.

    Returns the body exactly as read from the response (main() parses it as
    JSON).  Errors raised by urllib2.urlopen are not caught here.
    """
    accountKeyEnc = base64.b64encode(accountKey + ':' + accountKey)
    headers = {'Authorization': 'Basic ' + accountKeyEnc}
    req = urllib2.Request(bingUrl, headers = headers)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Always release the connection, even when reading the body fails.
        response.close()
def stripPunctuation(input_string):
    """
    Remove the common punctuation characters from 'input_string' and split
    what is left on whitespace.

    The characters are deleted, not replaced by a space, so 'a-b' becomes the
    single token 'ab'.  Returns a list of native 'str' tokens; an empty or
    all-punctuation input gives an empty list.
    """
    punctuation = frozenset('@#!.:,-\\/|()&[]"')
    stripped_str = "".join(c for c in input_string if c not in punctuation)
    if not isinstance(stripped_str, str):
        # Python 2 'unicode' text: encode it to a UTF-8 byte string as
        # before.  Text that is already a native 'str' is left alone, so
        # non-ASCII byte strings no longer fail on an implicit ASCII decode.
        stripped_str = stripped_str.encode('utf-8')
    return stripped_str.split()
def hasNumber(key):
    """
    Look for a run of digits anywhere in 'key'.

    Returns the regular-expression match object for the first run of digits,
    or None when 'key' contains no digit at all.
    """
    return re.search(r'\d+', key)
def updateDictionary(word_dictionary,qdata):
    """
    Add the words of every document in 'qdata' to 'word_dictionary' and
    return the result as a dictionary ordered by word.

    word_dictionary : mapping word -> count; it is updated in place.
    qdata           : iterable of documents, each with 'title' and
                      'description' entries.

    Words found in the English stop-word list, and words that contain a
    digit, are removed (including any such words that were already present
    in 'word_dictionary').  Returns a collections.OrderedDict built from the
    remaining (word, count) pairs in sorted order.
    """
    # A set makes the per-word stop-word test a constant-time lookup.
    stop_words = set(get_stop_words('english'))
    for each in qdata:
        data = stripPunctuation(each['description']) + stripPunctuation(each['title'])
        for word in data:
            word_dictionary[word] = word_dictionary.get(word, 0) + 1
    # Iterate over a snapshot of the keys: entries are deleted inside the loop.
    for key in list(word_dictionary):
        # Byte-string keys are decoded as UTF-8 before the stop-word test, as
        # before; keys that are already text are compared directly.
        text = key.decode('utf-8') if isinstance(key, bytes) else key
        if text in stop_words or key.isdigit() or hasNumber(key) is not None:
            del word_dictionary[key]
    return collections.OrderedDict(sorted(word_dictionary.items()))
def findDocFrequency(word,all_docs):
    """
    Document frequency of 'word': the number of documents in 'all_docs' whose
    description or title contains it as a token.
    """
    return sum(
        1
        for doc in all_docs
        if word in stripPunctuation(doc['description']) + stripPunctuation(doc['title'])
    )
def findTermFrequency(key,doc):
    """
    Count how many times 'key' occurs in 'doc'.

    'doc' is either an already tokenised sequence of words (a list or tuple,
    such as the query terms) or a document with 'title' and 'description'
    entries, which is tokenised with stripPunctuation() first.
    """
    # isinstance() also accepts list subclasses and tuples, which the former
    # exact-type check sent down the document branch.
    if isinstance(doc, (list, tuple)):
        return doc.count(key)
    tokens = stripPunctuation(doc['title']) + stripPunctuation(doc['description'])
    return tokens.count(key)
def calculateDocVector(doc,ordered_dict,all_docs):
    """
    Build the weight vector of 'doc', with one entry per word of
    'ordered_dict' in that dictionary's iteration order.

    Each entry is  tf * log10(N / df)  where tf comes from
    findTermFrequency(), df from findDocFrequency() and N is len(all_docs);
    a word whose document frequency is 0 gets weight 0.
    """
    def weight(word):
        df = findDocFrequency(word, all_docs)
        tf = findTermFrequency(word, doc)
        if not df:
            return 0
        return tf * (math.log10(float(len(all_docs)) / float(df)))

    return [weight(word) for word in ordered_dict]
def calculateQueryVector(queryList,ordered_dict,all_docs):
    """
    Build the weight vector of the query, with one entry per word of
    'ordered_dict' in that dictionary's iteration order.

    queryList is the list of query terms.  The weighting is exactly the one
    used for documents, so the work is delegated to calculateDocVector():
    findTermFrequency() counts occurrences directly when it is handed a list
    instead of a document.
    """
    return calculateDocVector(queryList, ordered_dict, all_docs)
def getUserFeedback(results,dvec):
    """
    Show every result to the user, ask whether it is relevant, and split the
    results into relevant and non-relevant collections.

    results : list of documents with 'rank', 'url', 'title' and
              'description' entries.
    dvec    : mapping from a document's 'url' to its document vector.

    Returns (rel_docs, nrel_docs, precision).  rel_docs and nrel_docs are
    dictionaries with a 'docs' list (the documents) and a parallel 'dvec'
    list (their vectors); precision is the fraction of 'results' marked
    relevant, or 0.0 when 'results' is empty.
    """
    rel_docs={'docs':[],'dvec':[]}
    nrel_docs={'docs':[],'dvec':[]}
    print ('Please provide feedback about relevance of the documents\n')
    for doc in results:
        print('-------------------------------------------')
        print('%s %s' % ('DOC RANK : ', doc['rank']))
        print('%s %s' % ('DOC URL : ', doc['url'].encode('ascii','ignore')))
        print('%s %s' % ('DOC TITLE : ', doc['title'].encode('ascii','ignore')))
        print('%s %s' % ('DOC DESCRIPTION : ', doc['description'].encode('ascii','ignore')))
        print('-------------------------------------------')
        # Keep asking until the answer is a number; a stray key press used to
        # abort the whole session with a ValueError.
        while True:
            feedback=raw_input('Please Enter [1] : Relevant [2] : Non Relevant ')
            try:
                choice=int(feedback)
            except ValueError:
                print('Please enter a number: 1 for relevant, 2 for non relevant')
                continue
            break
        print('')
        if choice==1:
            rel_docs['docs'].append(doc)
            rel_docs['dvec'].append(dvec[doc['url']])
        else:
            # As before, every numeric answer other than 1 counts as
            # non relevant.
            nrel_docs['docs'].append(doc)
            nrel_docs['dvec'].append(np.array(dvec[doc['url']]))
    if not results:
        # Nothing was shown, so there is no ratio to compute.
        return (rel_docs,nrel_docs,0.0)
    return (rel_docs,nrel_docs,float(len(rel_docs['docs']))/float(len(results)))
def roccioAlgorithm(q,r,nr,alpha=1,beta=0.70,gamma=0.15):
    """
    Re-weight the query vector with the Rocchio relevance-feedback formula

        q' = alpha*q + (beta/|R|)*sum(R) - (gamma/|NR|)*sum(NR)

    q  : current query vector (sequence of numbers).
    r  : dictionary whose 'dvec' entry lists the relevant document vectors (R).
    nr : dictionary whose 'dvec' entry lists the non-relevant ones (NR).
    alpha, beta, gamma : weights of the three terms; the defaults are the
                         values that used to be hard-coded.

    A term whose document set is empty is left out instead of dividing by
    zero.  Returns the new query vector as a list.
    """
    relevant=r['dvec']
    non_relevant=nr['dvec']
    new_query_vec=alpha*np.array(q)
    if len(relevant):
        # float() keeps the division exact even if an integer weight is passed.
        new_query_vec=new_query_vec+(float(beta)/len(relevant))*np.sum(relevant,axis=0)
    if len(non_relevant):
        new_query_vec=new_query_vec-(float(gamma)/len(non_relevant))*np.sum(non_relevant,axis=0)
    return list(new_query_vec)
def selectNewWords(new_list,query_words,tagged_words):
    """
    Select the new words to append to the query.

    new_list     : (word, weight) pairs; main() passes them sorted by
                   decreasing weight.
    query_words  : words already in the query; they are never selected.
    tagged_words : (word, part-of-speech tag) pairs; only words whose tag
                   starts with 'NN' or 'JJ' may be selected.

    Returns a list of at most two words.  The first eligible word is always
    selected.  The second one is added only when the ratio of the first
    weight to the second lies in [1, 1.5) and the second weight is not tied
    with the third.  With no eligible word the result is an empty list.
    """
    ratio_range=(1,1.5)
    # Eligible words are kept in a set for constant-time membership tests.
    # Python 2 unicode tokens are encoded to UTF-8 byte strings, as before.
    eligible=set()
    for v in tagged_words:
        if v[1].startswith(('NN','JJ')):
            eligible.add(v[0] if isinstance(v[0], str) else v[0].encode('utf-8'))
    candidates=[each for each in new_list
                if each[0] not in query_words and each[0] in eligible]
    if not candidates:
        return []
    new_selected_words=[candidates[0][0]]
    if len(candidates) > 1:
        first_wt=float(candidates[0][1])
        second_wt=float(candidates[1][1])
        # A zero weight cannot form a ratio; treat it as out of range.
        in_range=second_wt != 0 and ratio_range[0] <= first_wt/second_wt < ratio_range[1]
        # The tie check only applies when a third candidate exists.
        tied_with_third=len(candidates) > 2 and candidates[1][1] == candidates[2][1]
        if in_range and not tied_with_third:
            new_selected_words.append(candidates[1][0])
    return new_selected_words
def wordTaggerFunction(docs):
    """
    Tokenise the description of every document with NLTK, run the NLTK
    part-of-speech tagger on the tokens, and return the tagger output of all
    documents as one flat list, in document order.
    """
    tagged_list=[]
    for each in docs:
        text=nltk.word_tokenize(each['description'])
        # extend() keeps the list flat as it grows; the former
        # sum(list_of_lists, []) copied the accumulated list on every step.
        tagged_list.extend(nltk.pos_tag(text))
    return tagged_list
def main():
    """
    Interactive relevance-feedback loop.

    Reads a query, a target precision@10 and a Bing account key from the
    console, then repeats: fetch the top 10 results, collect the user's
    relevance judgements and, while the target precision is not met, expand
    the query with the word(s) picked from the Rocchio re-weighted vector.
    The loop stops when the target is reached, when fewer than 10 results
    come back, or when a round has no relevant result at all.
    """
    print('')
    query=raw_input('Enter the Query : ')
    # Parse the target once, up front, so a malformed value fails before any
    # request is sent.
    precisionValue=float(raw_input('Enter the desired Precision@10 value : ')) #Precision value to be achieved eventually
    accountKey=raw_input('Enter the BING Account key : ') #BING account key
    run_count=0
    word_dictionary={} # Collection of all words extracted from the BING Search results
    # The loop condition used to compare a number with the raw input string,
    # which is always true in Python 2, so every exit is an explicit break.
    while True:
        # Terms for the query vector.  They are lower-cased and taken from the
        # unquoted query so that they can match the dictionary words, which
        # come from lower-cased titles and descriptions.
        queryList=query.lower().split()
        query=urllib.quote(query)
        bingUrl = 'https://api.datamarket.azure.com/Bing/Search/Web?Query=%27userquery%27&$top=10&$format=json'
        bingUrl=bingUrl.replace('userquery',query)
        content=bing_api(bingUrl,accountKey)
        queryResults=json.loads(content) #Parse the json documents containing query results
        results=queryResults['d']['results']
        print('FEEDBACK SUMMARY')
        print('%s %s' % ('Query : ', query))
        print('%s %s' % ('BingUrl : ', bingUrl.encode('ascii','ignore')))
        print('%s %s' % ('Number of Results : ', len(results)))
        if len(results) < 10: # Terminating the program if search results are not sufficient
            print ('Not Enough Query search results..Terminating the program')
            break
        qResultDict={'data':[]}
        for (rank,each) in enumerate(results):
            qResultDict['data'].append({
                'rank': rank+1,
                'title': each['Title'].encode('ascii','ignore').lower(),
                'url': each['DisplayUrl'].encode('ascii','ignore'),
                'description': each['Description'].encode('ascii','ignore').lower(),
            })
        #Contains the dictionary of all words along with their counts
        word_dictionary=updateDictionary(word_dictionary,qResultDict['data'])
        #Create a list of all document vectors
        doc_vectors={}
        for doc in qResultDict['data']:
            doc_vectors[doc['url']]=calculateDocVector(doc,word_dictionary,qResultDict['data'])
        #creating query vector
        query_vector=calculateQueryVector(queryList,word_dictionary,qResultDict['data'])
        #Calculating the relevant , non relevant documents based on user feedback and calculating
        #the precision achieved by the algorithm based on the same
        (rel_docs,nrel_docs,achieved_precision)=getUserFeedback(qResultDict['data'],doc_vectors)
        print('%s %s' % ("Precision Achieved based on user feedback : ", achieved_precision))
        # Terminate the algorithm when the desired precision is achieved
        if float(achieved_precision) >= precisionValue:
            print ('Required precision value achieved..Algorithm terminates')
            break
        # Terminate the algorithm if no relevant documents are present in the search results.
        # This now applies to every round: without a relevant document the
        # re-weighting has nothing to average over.
        if len(rel_docs['docs'])==0:
            if run_count==0:
                print ('No relevant results in the initial search query..Quitting')
            else:
                print ('No relevant results in the current search results..Quitting')
            break
        run_count+=1 #Contains the number of runs required by the algorithm to achieve the desired precision value
        #Calculating the new query vector using ROCCIOs Algorithm and selecting all positive entries present in it.
        new_query_vec=roccioAlgorithm(query_vector,rel_docs,nrel_docs)
        new_query_vec=list(np.array(new_query_vec).clip(min=0))
        #Contains a list of all possible words that can be appended to the Query.
        #The vector entries are in the dictionary's iteration order, so zip pairs each word with its weight.
        all_new_words=[(key,wt) for key,wt in zip(word_dictionary,new_query_vec) if wt > 0]
        #Sorting the possible words in decreasing order of their weights calculated by ROCCIOs algorithm
        all_new_words.sort(key=lambda tup:tup[1],reverse=True)
        #Contains the WORD-TAG values
        tagged_words=wordTaggerFunction(qResultDict['data'])
        #Get the two new words to be appended to the former user query
        two_new_words=selectNewWords(all_new_words,queryList,tagged_words)
        two_new_words=" ".join(two_new_words)
        #Modify the initial user query by appending the newly selected words.
        query=urllib.unquote(query)+" "+two_new_words
        print('%s %s' % ("Modified Query : ", query))
        print('')
    #Print the number of iterations required to achieve the desired precision value
    print('%s %s' % ('RUN COUNT : ', run_count))
# Start the interactive session only when this file is run as a script.
if __name__ == "__main__":
    main()