Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .DS_Store
Binary file not shown.
147 changes: 124 additions & 23 deletions fanfiction/scraper.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
# !/usr/local/bin/python3
# -*- coding: utf-8 -*-

# Python 2/3 compatibility
# Personal version of https://github.com/michaelmilleryoder/fanfiction
try:
from urllib.parse import unquote_plus
except ImportError:
from urllib import unquote_plus
import time, re, requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pdb

class Scraper:

def __init__(self, rate_limit=1):
    """
    Configures the scraper.

    Args:
        rate_limit: value handed to time.sleep() between requests, in
            seconds. Defaults to 1.
    """
    # Single constructor: an earlier zero-argument definition that set an
    # 'http://' base URL with a trailing slash was shadowed by this one and
    # has been dropped.
    self.base_url = 'https://fanfiction.net'
    self.rate_limit = rate_limit
    # Parser name passed to BeautifulSoup for every page this class loads.
    self.parser = "html.parser"

def get_genres(self, genre_text):
Expand All @@ -26,6 +31,32 @@ def get_genres(self, genre_text):
corrected_genres.append(genre)
return corrected_genres

def story_ids_by_fandom(self, fandom_type, fandom_name, out_fpath):
    """
    Saves a list of story IDs for a fandom to a text file.

    Args:
        fandom_type: first path segment of the listing URL (presumably the
            site category the fandom belongs to -- confirm against callers).
        fandom_name: fandom name; spaces are replaced with '-' to build
            the second path segment of the listing URL.
        out_fpath: path of the output file. It is opened in append mode and
            receives one story ID per line, so existing content is kept.
    """
    fandom_slug = fandom_name.replace(' ', '-')
    listing_url = 'https://www.fanfiction.net/{0}/{1}/?&srt=1&lan=1&r=10'.format(
        fandom_type, fandom_slug)

    result = requests.get(listing_url)
    soup = BeautifulSoup(result.content, self.parser)

    # The "Last" pagination link carries the final page number as the last
    # '='-separated field of its href.
    last_link = soup.find('a', text="Last")
    if last_link is None:
        # No "Last" link on the page: fall back to scraping a single page
        # instead of crashing on None.
        # NOTE(review): confirm short listings never need more than page 1.
        last_page = 1
    else:
        last_page = int(last_link['href'].split('=')[-1])

    # Pages are 1-based and the last page is included.
    for p in tqdm(range(1, last_page + 1)):
        # Pause before every page request, like the other scraping methods.
        time.sleep(self.rate_limit)
        page_url = '{0}&p={1}'.format(listing_url, p)
        page = requests.get(page_url)
        page_soup = BeautifulSoup(page.content, self.parser)

        # Story links have class 'stitle'; the ID is the third
        # '/'-separated field of the href.
        story_ids = [s['href'].split('/')[2]
                     for s in page_soup.find_all('a', {'class': 'stitle'})]

        # Append after each page so progress survives an interrupted run.
        with open(out_fpath, 'a') as f:
            f.writelines(s + '\n' for s in story_ids)

def scrape_story_metadata(self, story_id):
"""
Returns a dictionary with the metadata for the story.
Expand All @@ -46,30 +77,63 @@ def scrape_story_metadata(self, story_id):
-num_words: total number of words in all chapters of the story
-rated: the story's rating.
"""
url = '{0}/s/{1}'.format(self.base_url, story_id)
result = requests.get(url)
url = 'https://www.fanfiction.net/s/{}'.format(story_id)
try:
result = requests.get(url)
except (requests.exceptions.ChunkedEncodingError,
requests.exceptions.ConnectionError) as e:
pdb.set_trace()
return None
html = result.content
soup = BeautifulSoup(html, self.parser)
pre_story_links = soup.find(id='pre_story_links').find_all('a')
author_id = int(re.search(r"var userid = (.*);", str(soup)).groups()[0]);
title = re.search(r"var title = (.*);", str(soup)).groups()[0];
title = unquote_plus(title)[1:-1]
pre_story_links = soup.find(id='pre_story_links')
if pre_story_links is None:
return None
else:
pre_story_links = soup.find(id='pre_story_links').find_all('a')
#author_id = re.search(r"var userid = (.*);", str(soup))
author_id = re.search(r"var userid = (.*?);", str(html))
if author_id is None:
pdb.set_trace()
else:
author_id = int(author_id.groups()[0]);
#title = re.search(r"var title = (.*);", str(soup)).groups()[0];
title = re.search(r"var title = (.*?);", str(html)).groups()[0];
title = unquote_plus(title)[2:-2]
metadata_div = soup.find(id='profile_top')
times = metadata_div.find_all(attrs={'data-xutime':True})
metadata_text = metadata_div.find(class_='xgray xcontrast_txt').text
metadata_parts = metadata_text.split('-')
genres = self.get_genres(metadata_parts[2].strip())
try:
chapters = soup.find(id='chap_select').find_all("option")
chapter_names = []
omit = 0
while len(chapters) > 0:
ch = chapters.pop()
if omit > 0:
chapter_names.insert(0,ch.text[0:-omit])
else:
chapter_names.insert(0,ch.text)
omit = len(ch.text)
except AttributeError:
chapter_names=[title]


metadata = {
'id': story_id,
'canon_type': pre_story_links[0].text,
'canon': pre_story_links[1].text,
'canon': pre_story_links[-1].text,
'author_id': author_id,
'title': title,
'updated': int(times[0]['data-xutime']),
'published': int(times[1]['data-xutime']),
'lang': metadata_parts[1].strip(),
'published': int(times[-1]['data-xutime']),
'chapter_names': chapter_names,
'genres': genres
}
if len(pre_story_links) > 1:
metadata['canon_type'] = pre_story_links[0].text,
if len(times) > 1:
metadata['updated'] = int(times[0]['data-xutime'])
for parts in metadata_parts:
parts = parts.strip()
tag_and_val = parts.split(':')
Expand All @@ -90,29 +154,50 @@ def scrape_story_metadata(self, story_id):

def scrape_story(self, story_id, keep_html=False):
    """
    Scrapes a story's metadata together with every chapter and its reviews.

    Args:
        story_id: identifier passed through to scrape_story_metadata,
            scrape_chapter and scrape_reviews_for_chapter.
        keep_html: forwarded to scrape_chapter.

    Returns:
        The metadata dictionary from scrape_story_metadata, extended with:
            -chapters: dict mapping 1-based chapter number to the value
             returned by scrape_chapter
            -reviews: dict mapping 1-based chapter number to the value
             returned by scrape_reviews_for_chapter
            -num_chapters: the chapter count actually used, as an int
        or None when no metadata could be scraped.
    """
    metadata = self.scrape_story_metadata(story_id)
    if metadata is None:
        return None  # Error--story not found

    # Work out the chapter count once. The explicit count wins; a chapter
    # collection is the fallback; otherwise assume a single chapter.
    if "num_chapters" in metadata:
        num_chapters = int(metadata['num_chapters'])
    elif "chapters" in metadata:
        num_chapters = len(metadata['chapters'])
    else:
        num_chapters = 1

    if num_chapters == 0:  # no chapter structure
        num_chapters = 1
    metadata["num_chapters"] = num_chapters

    metadata['chapters'] = {}
    metadata['reviews'] = {}

    for chapter_id in range(1, num_chapters + 1):
        # rate limit to follow fanfiction.net TOS: exactly one pause
        # before each request
        time.sleep(self.rate_limit)
        chapter = self.scrape_chapter(story_id, chapter_id,
                                      keep_html=keep_html)
        time.sleep(self.rate_limit)
        chapter_reviews = self.scrape_reviews_for_chapter(
            story_id, chapter_id)

        metadata['chapters'][chapter_id] = chapter
        metadata['reviews'][chapter_id] = chapter_reviews

    return metadata

def scrape_chapter(self, story_id, chapter_id, keep_html=False):
    """
    Fetches one chapter of a story and returns it as UTF-8 encoded bytes.

    Args:
        story_id: story identifier used in the chapter URL.
        chapter_id: chapter number used in the chapter URL.
        keep_html: when False (default) only the text of the chapter is
            returned, with text fragments joined by newlines; when True
            the chapter element's markup is returned instead.

    Returns:
        bytes with the chapter content, or b'' when the request fails with
        an SSL error or the page has no element with class 'storytext'.
    """
    # Chapters live under /s/<story>/<chapter>; /r/ is the review listing.
    url = 'https://www.fanfiction.net/s/{0}/{1}'.format(story_id, chapter_id)
    try:
        result = requests.get(url)
    except requests.exceptions.SSLError:
        return b''
    soup = BeautifulSoup(result.content, self.parser)
    chapter = soup.find(class_='storytext')
    if chapter is None:
        return b''
    if keep_html:
        # Previously this path hit an unbound local; return the raw markup.
        return str(chapter).encode('utf8')
    return chapter.get_text('\n').encode('utf8')

def scrape_reviews_for_chapter(self, story_id, chapter_id):
Expand All @@ -124,21 +209,37 @@ def scrape_reviews_for_chapter(self, story_id, chapter_id):
Each review dict contains the user id of the reviewer if it exists,
the timestamp of the review, and the text of the review.
"""
url = '{0}/r/{1}/{2}'.format(self.base_url, story_id, chapter_id)
result = requests.get(url)
url = 'https://www.fanfiction.net/r/{0}/{1}'.format(story_id, chapter_id)
try:
result = requests.get(url)
except ssl.SSLError:
return []
html = result.content
soup = BeautifulSoup(html, self.parser)
reviews_table = soup.find(class_='table-striped').tbody
reviews_tds = reviews_table.find_all('td')
reviews_table = soup.find(class_='table-striped')
reviews = []

if reviews_table is None:
return reviews
else:
reviews_table = reviews_table.tbody
reviews_tds = reviews_table.find_all('td')

if len(reviews_tds) == 1 and reviews_tds[0].string == 'No Reviews found.':
return reviews

for review_td in reviews_tds:
match = re.search(r'href="/u/(.*)/.*">.*</a>', str(review_td))
if match is not None:
user_id = int(match.groups()[0])
else:
user_id = None
time = review_td.find('span', attrs={'data-xutime':True})
time = int(time['data-xutime'])
if time is not None:
time = int(time['data-xutime'])

if review_td.div is None:
continue
review = {
'time': time,
'user_id': user_id,
Expand Down