fetchusers.py
#!/usr/bin/python2.7
"""
ORIGINAL DESCRIPTION:
One by one, fetch profile pages for OKCupid users. The input to this script
is a file with a list of usernames of profiles to pull.
Original version created on Jun 25, 2012
@author: Everett Wetchler (evee746)
"""
import csv
import datetime
import random
import sys
import time
import pickle as pkl
from BeautifulSoup import BeautifulSoup, UnicodeDammit
from absl import flags as gflags # workaround since gflags is deprecated
import selenium
from selenium import webdriver
import pandas as pd
import regex
import profileparser
# You must have the chromedriver executable available
CHROME_DRIVER_PATH = r'chromedriver_win32\chromedriver.exe'  # raw string avoids backslash-escape issues
# My cookies are stored in a .pickle as a list of dicts
cookies = pkl.load(open('cookies.pickle', 'rb'))
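# A sketch of how such a pickle might be created (an assumption, not part of
# this script; it relies on a prior manual login in a Selenium-driven browser,
# and get_cookies() is standard Selenium API returning a list of dicts):
#   with open('cookies.pickle', 'wb') as f:
#       pkl.dump(browser.get_cookies(), f)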
FLAGS = gflags.FLAGS
gflags.DEFINE_string('outfile', 'profiles.pickle', 'Filename for output')
# You will have to provide your own usernames file
gflags.DEFINE_string('usernames_file', 'usernames_20180511.csv',
                     'File with usernames to fetch')
gflags.DEFINE_string('completed_usernames_file', 'completed_usernames.csv',
                     'File with usernames we have already fetched')
SLEEP_BETWEEN_QUERIES = 5
BASE_URL = "http://www.okcupid.com"
def pull_profile_and_essays(browser, username):
    """Given a username, fetches the profile page and parses it.

    Note: 'username' here is actually a relative profile URL of the form
    '/profile/<name>?cf=...', so appending it to BASE_URL yields the page.
    """
    url = BASE_URL + username
    print "Fetching profile HTML for", username + "..."
    for attempt in range(2):
        try:
            print "Navigating to profile page, attempt %d" % (attempt + 1)
            browser.get(url)
        except selenium.common.exceptions.TimeoutException:
            if attempt < 1:
                continue  # retry once on a timeout
            raise
        else:
            break
    html = browser.page_source
    if not html:
        print "No html returned."
        return pd.DataFrame()
    print "Parsing..."
    profile = parse_profile_html(html)
    if profile.empty:
        return profile  # parse failed; nothing to attach a username to
    # Strip the bare username out of the relative profile URL.
    match = regex.findall(r'/profile/(.+)\?cf', username)
    profile['username'] = match[0] if match else username
    return profile
def parse_profile_html(html):
    """Parses a user profile page into a 1-row pandas.DataFrame.

    Inputs: html - the html from an OKCupid profile page as of 05.2018
    Outputs: df - a 1-row pandas.DataFrame() object containing profile data

    The html ends up with a bunch of Unicode characters in it, which
    Python 2.7 is ill-equipped to handle. For now, I simply code in the
    \u#### character codes when performing regexes. It doesn't print well
    in some shells, but the parsing works.
    """
    html = html.lower()
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    basics_soup = soup.find(name='div', attrs={'class': 'userinfo2015-basics'})
    sidebar_soup = soup.find(
        name='div', attrs={'class': 'profile2015-content-sidebar'})
    essays_soup = soup.findAll(name='div', attrs={'class': 'profile-essay'})
    if not (basics_soup and sidebar_soup and essays_soup):
        print 'Profile likely deleted. Missing expected html structure.'
        return pd.DataFrame()
    basics_table = sidebar_soup.find(
        name='table', attrs={'class': 'details2015-section basics'})
    if basics_table is not None:
        basics = profileparser.parse_basics(basics_table.findAll('td')[1].text)
    else:
        basics = pd.DataFrame()
    background_table = sidebar_soup.find(
        name='table', attrs={'class': 'details2015-section background'})
    if background_table is not None:
        background = profileparser.parse_background(
            background_table.findAll('td')[1].text)
    else:
        background = pd.DataFrame()
    misc_table = sidebar_soup.find(
        name='table', attrs={'class': 'details2015-section misc'})
    if misc_table is not None:
        misc = profileparser.parse_misc(misc_table.findAll('td')[1].text)
    else:
        misc = pd.DataFrame()
    lookingfor_div = sidebar_soup.find(
        name='div', attrs={'class': 'lookingfor2015-sentence'})
    if lookingfor_div is not None:
        lookingfor = profileparser.parse_lookingfor(lookingfor_div.text)
    else:
        lookingfor = pd.DataFrame()
    name = basics_soup.find(
        name='div', attrs={'class': 'userinfo2015-basics-username'}).text
    age = basics_soup.find(
        name='span', attrs={'class': 'userinfo2015-basics-asl-age'}).text
    location = basics_soup.find(
        name='span', attrs={'class': 'userinfo2015-basics-asl-location'}).text
    # Guard against locations that lack a 'city, region' comma split.
    loc_match = regex.findall(r'(.*?), (.*)', location)
    city, region = loc_match[0] if loc_match else (location, '')
    userinfo = pd.DataFrame([{'name': name,
                              'age': age,
                              'city': city,
                              'region': region}])
    print userinfo
    # The 'essays' column holds a list of {'title': ..., 'content': ...} dicts.
    essays_list = []
    for item in essays_soup:
        essays_list.append({'title': item.h2.text.encode('ascii', 'ignore'),
                            'content': item.p.text.encode('ascii', 'ignore')})
    essays = pd.DataFrame([[essays_list]], columns=['essays'])
    profile = pd.concat([basics, background, misc,
                         lookingfor, userinfo, essays], axis=1)
    return profile
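# The resulting 1-row DataFrame holds 'name', 'age', 'city', 'region', and
# 'essays' columns, plus whatever columns the profileparser helpers produce
# for the basics/background/misc/lookingfor sections when they are present.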
TIMING_MSG = """%(elapsed)ds elapsed, %(completed)d profiles fetched, \
%(skipped)d skipped, \
%(remaining)d left, %(secs_per_prof).1fs per profile, \
%(prof_per_hour).0f profiles per hour, \
%(est_hours_left).1f hours left"""
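# Example of a rendered timing line (illustrative numbers only):
#   600s elapsed, 120 profiles fetched, 3 skipped, 880 left, 5.0s per profile,
#   720 profiles per hour, 1.2 hours left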
def compute_elapsed_seconds(elapsed):
    """Given a timedelta, returns a float of total seconds elapsed."""
    return elapsed.total_seconds()  # equivalent to the manual day/sec/usec math
def read_usernames(filename):
    """Extracts usernames from the given file, returning a sorted list.

    The file should either be:
    1) A list of usernames, one per line
    2) A CSV file with a 'username' column (specified in its header line)
    """
    try:
        rows = [r[0] for r in csv.reader(open(filename)) if r]
    except IOError:
        return []  # file doesn't exist yet
    if rows and rows[0] == 'username':
        rows = rows[1:]  # drop the CSV header row
    return sorted(rows)
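# Example usernames file contents (hypothetical values; each row is a relative
# profile URL, matching the regex used in pull_profile_and_essays):
#   username
#   /profile/exampleuser1?cf=regular
#   /profile/exampleuser2?cf=regular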
def prepare_flags(argv):
    """Parses flags. Returns True if the flag settings are acceptable."""
    try:
        FLAGS(argv)  # parse command-line flags
    except gflags.Error:  # absl.flags' equivalent of gflags' FlagsError
        return False
    return bool(FLAGS.usernames_file and FLAGS.outfile)
def add_cookies(browser, cookie_list):
    """Adds cookies to the browser; Selenium accepts them one at a time."""
    for cookie in cookie_list:
        browser.add_cookie(cookie)
def main(argv):
    if not prepare_flags(argv):
        print 'Usage: %s ARGS\n%s' % (sys.argv[0], FLAGS)
        sys.exit(1)
    usernames_to_fetch = read_usernames(FLAGS.usernames_file)
    if not usernames_to_fetch:
        print 'Failed to load usernames from %s' % FLAGS.usernames_file
        sys.exit(1)
    print 'Read %d usernames to fetch' % len(usernames_to_fetch)
    completed = read_usernames(FLAGS.completed_usernames_file)
    if completed:
        usernames_to_fetch = sorted(set(usernames_to_fetch) - set(completed))
        print '%d usernames were already fetched, leaving %d to do' % (
            len(completed), len(usernames_to_fetch))
    start = datetime.datetime.now()
    skipped = 0
    completed_usernames_file = open(FLAGS.completed_usernames_file, 'ab')
    completed_usernames_writer = csv.writer(completed_usernames_file)
    N = len(usernames_to_fetch)
    # Browser setup.
    options = webdriver.ChromeOptions()
    options.add_argument('--log-level=3')
    browser = webdriver.Chrome(
        executable_path=CHROME_DRIVER_PATH, options=options)
    browser.get(BASE_URL)  # must visit the domain before setting cookies
    # Cookies have to be added individually, hence the helper.
    add_cookies(browser, cookies)
    # Fetch profiles.
    for i, username in enumerate(usernames_to_fetch):
        # ** Critical ** so OKC servers don't notice and throttle us.
        if i > 0:
            print "Sleeping..."
            time.sleep(SLEEP_BETWEEN_QUERIES)
        profile = pull_profile_and_essays(browser, username)
        if profile.empty:
            skipped += 1
        else:
            # Profiles are appended to the outfile as successive pickle dumps.
            with open(FLAGS.outfile, 'ab') as f:
                pkl.dump(profile, f)
        # Record the username as processed even when skipped, so a rerun
        # doesn't retry deleted profiles.
        completed_usernames_writer.writerow([username])
        completed_usernames_file.flush()
        if i % 10 == 0:
            elapsed = datetime.datetime.now() - start
            secs = compute_elapsed_seconds(elapsed)
            profiles_per_hour = (i + 1.0) * 3600 / secs
            print '\n' + TIMING_MSG % {
                'elapsed': secs,
                'completed': i + 1,
                'skipped': skipped,
                'remaining': N - i - 1,
                'secs_per_prof': secs / (i + 1.0),
                'prof_per_hour': profiles_per_hour,
                'est_hours_left': (N - i - 1) / profiles_per_hour,
            }
if __name__ == '__main__':
    main(sys.argv)
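# A sketch of how the outfile could be read back later, since profiles are
# appended as successive pickle dumps rather than stored as a single object
# (load_profiles is an illustrative name, not part of this script):
#   def load_profiles(path):
#       frames = []
#       with open(path, 'rb') as f:
#           while True:
#               try:
#                   frames.append(pkl.load(f))
#               except EOFError:
#                   return pd.concat(frames, ignore_index=True)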