-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrec.py
More file actions
120 lines (93 loc) · 3.66 KB
/
rec.py
File metadata and controls
120 lines (93 loc) · 3.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import anyconfig
import json
import goodreads
import goodreads.book
import goodreads.client
import re
import warnings
with warnings.catch_warnings():
warnings.filterwarnings("ignore",category=FutureWarning)
import tensorflow as tf
import tensorflow_hub as tf_hub
class GoodreadsList:
    """Iterable over the books of one Goodreads shelf, fetched page by page."""

    def __init__(self, client, shelf_id):
        """
        Store the client and build the paged API path for the shelf.

        client: object providing ``request(path, query)`` and ``query_dict``
        shelf_id: identifier interpolated into the ``review/list`` path
        """
        self._client = client
        # The doubled braces survive this format() call as a literal "{}",
        # which __iter__ later fills in with the page number.
        self._api_path = "review/list/{}/?page={{}}".format(shelf_id)

    def __iter__(self):
        """
        Lazily query the goodreads api to extract all information about books in this shelf

        Yields one ``goodreads.book.GoodreadsBook`` per entry. Pages are
        requested only as the iterator is consumed, starting at page 1 and
        continuing until the page count reported in ``@numpages`` is reached.
        """
        current_page = 0
        max_pages = 1
        while current_page < max_pages:
            current_page += 1
            shelf = self._client.request(self._api_path.format(current_page), self._client.query_dict)['books']
            # A page without books has no 'book' entry at all, and a page with
            # exactly one book may arrive as a single mapping instead of a list
            # (presumably a consequence of XML-to-dict parsing in the client --
            # confirm). Normalise both cases so the loop always sees a list of
            # book records rather than raising KeyError or iterating dict keys.
            entries = shelf.get('book', [])
            if not isinstance(entries, list):
                entries = [entries]
            for book in entries:
                yield goodreads.book.GoodreadsBook(book, self._client)
            max_pages = int(shelf['@numpages'])
class Client(goodreads.client.GoodreadsClient):
    """Goodreads client extended with access to the books of a shelf."""

    def get_list(self, shelf_id):
        """Return a lazy ``GoodreadsList`` over the shelf ``shelf_id``, bound to this client."""
        shelf_books = GoodreadsList(client=self, shelf_id=shelf_id)
        return shelf_books
# Read the Goodreads API credentials from the local conf.yaml file and build
# the client used for every Goodreads request below.
conf = anyconfig.load("conf.yaml")
GOODREADS_API_KEY = conf.get('api_key')
GOODREADS_SECRET = conf.get('api_secret')
gc = Client(GOODREADS_API_KEY, GOODREADS_SECRET)
# TODO: How to reduce logging from tensorflow?
tf.logging.set_verbosity(tf.logging.ERROR)
# Fetch the Universal Sentence Encoder (version 2) module from TF Hub.
embed_module = tf_hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")
print("downloaded module")
# A single string placeholder is wrapped in a one-element list, so each later
# session.run() call embeds the one text fed through `placeholder`.
# NOTE(review): the result presumably has a leading batch dimension of 1 -- confirm.
placeholder = tf.placeholder(dtype=tf.string)
embed = embed_module([placeholder])
print("construct module with placeholder")
# Get all the book information
# NOTE: This has to be run before the tf Session because an SSLError occurs otherwise (why? IDK)
# TODO: Should probably cache all of this information
strip_html = re.compile(r'<.*?>')
books = {}
for book in gc.get_list(81192485):
    # This one title is deliberately left out of the corpus.
    if book.title != 'Confucius And The Chinese Way':
        try:
            # Build the value before the key, matching the order in which the
            # original single assignment evaluated them.
            entry = (book.title, strip_html.sub('', book.description))
            key = book.isbn13.replace('-', '')
            books[key] = entry
        except Exception as err:
            # Best effort: report the book that could not be processed and move on.
            print("error on book: {} ({})".format(book.title, err))
# Produce the samples for testing
# Each line is stripped of surrounding whitespace instead of dropping its last
# character, so a final line without a trailing newline keeps its last digit
# and '\r\n' line endings are handled; blank lines are ignored.
with open('data/samples.csv') as f:
    samples = [line.strip().replace('-', '') for line in f if line.strip()]

# Produce data for the choices
# Maps the hyphen-free ISBN-13 of each looked-up book to a tuple of
# (title, description with HTML tags removed).
choices = {}
with open('data/choices.csv') as f:
    for line in f:
        isbn = line.strip().replace('-', '')
        if not isbn:
            # Skip blank lines rather than querying Goodreads for an empty ISBN.
            continue
        book = gc.book(isbn=isbn)
        try:
            choices[book.isbn13.replace('-', '')] = (book.title, strip_html.sub('', book.description))
        except Exception as e:
            # Best effort: report the book that could not be processed and move on.
            print("error on book: {} ({})".format(book.title, e))
print("downloaded book corpus")
# Start the TensorFlow session and run the variable and table initializers
# before any embedding is computed.
session = tf.Session()
initializers = [tf.global_variables_initializer(), tf.tables_initializer()]
session.run(initializers)
print("setup tf session")

# Produce the embeddings
# For both corpora (books first, then choices), replace the description in
# each (title, description) entry with the embedding of that description.
# Only existing keys are reassigned, so iterating the dict meanwhile is safe.
for corpus in (books, choices):
    for isbn in corpus:
        title, text = corpus[isbn][0], corpus[isbn][1]
        corpus[isbn] = (title, session.run(embed, feed_dict={placeholder: text}))
import numpy
from numpy.linalg import norm
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that can serialise numpy arrays and numpy scalar values."""

    def default(self, obj):
        """
        Convert numpy objects that the stock encoder rejects.

        Arrays become (nested) lists. Numpy scalars such as ``numpy.int64``
        or ``numpy.float32`` become the equivalent built-in Python value.
        Anything else is passed to the base class, which raises ``TypeError``.
        """
        if isinstance(obj, numpy.ndarray):
            return obj.tolist()
        if isinstance(obj, numpy.generic):
            value = obj.item()
            # Only hand back a genuine Python object; returning another numpy
            # scalar would send the encoder straight back into this method.
            if not isinstance(value, numpy.generic):
                return value
        return json.JSONEncoder.default(self, obj)
# Persist both corpora (titles plus embeddings) as JSON; NumpyEncoder turns
# the numpy arrays into plain lists.
cache_payload = {'books': books, 'choices': choices}
with open('data/cache.json', 'w') as cache_file:
    json.dump(cache_payload, cache_file, cls=NumpyEncoder)
def cosine_similarity(a, b):
    """
    Return the inner product of ``a`` and ``b`` divided by the product of their norms.

    Both arguments are converted to numpy arrays first, so lists are accepted.
    """
    vec_a, vec_b = numpy.array(a), numpy.array(b)
    dot = numpy.inner(vec_a, vec_b)
    magnitude = norm(vec_a) * norm(vec_b)
    return dot / magnitude
# Compare the first two sample books and print their titles with the
# similarity of their embeddings.
title_a, embedding_a = books[samples[0]]
title_b, embedding_b = books[samples[1]]
similarity = cosine_similarity(embedding_a, embedding_b)
print("{} v {} = {}".format(title_a, title_b, similarity))