import pickle # For pickling the final knowledge base dict
from bs4 import BeautifulSoup, Comment # For parsing page data and detecting HTML comments
import requests # For fetching the initial page
import urllib.request # For requesting page data from the crawled links
from random import shuffle # For making the link selection more diverse
from nltk.tokenize import sent_tokenize, word_tokenize # For building the knowledge base
from nltk.corpus import stopwords # For preprocessing texts for the knowledge base
# Crawls a specified domain for similar, relevant links
def crawlUrls(initDom):
    req = requests.get(initDom)
    content = req.text
    soup = BeautifulSoup(content, 'html.parser') # Parses the page data to search for links
    linkList = [initDom] # The initial domain is included in the list of links
    for links in soup.find_all('a'):
        link = str(links.get('href')) # Isolates the HTML hyperlink target
        if 'Frog' in link or 'frog' in link: # Ensures relevancy
            if link.startswith('/url?q='): # Strips the Google redirect prefix
                link = link[7:]
                if '&' in link:
                    link = link[:link.find('&')] # Drops everything from the first query separator onward
                print('modified:', link)
            if link.startswith('http') and 'google' not in link and 'archive.org' not in link: # Avoids tricky links like Google and the Wayback Machine
                if 'wikipedia' not in link: # Makes sure other Wikipedia articles are not used
                    linkList.append(link)
    shuffle(linkList) # Helps make the selection of links more diverse
    return linkList
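# An illustration of the cleanup above on a hypothetical Google result link:
#   '/url?q=https://example.com/frog-facts&sa=U&ved=2'
#   becomes 'https://example.com/frog-facts' once the '/url?q=' prefix and the
#   trailing '&...' query parameters are stripped.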
# Filters a page's content down to just the visible text components
def filterText(curr):
    if curr.parent.name in ['style', 'script', '[document]', 'head', 'title']: # Filters out text nested in non-visible HTML tags
        return False
    elif isinstance(curr, Comment): # Filters out HTML comments
        return False
    return True
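# A minimal sketch of how filterText is meant to be applied (assumes `soup` is an
# already-parsed page, as in scrapeLinks below):
#   pageData = soup.find_all(text = True)          # every text node, even inside <script>
#   pageText = list(filter(filterText, pageData))  # keeps only the visible text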
# Scrapes the text data from the links found
def scrapeLinks(linkList, numLinks):
    fileNum = 0
    for link in linkList: # Tries every link until the specified number of pages has been scraped
        if fileNum == numLinks:
            break
        try:
            linkData = urllib.request.urlopen(link)
            soup = BeautifulSoup(linkData, 'html.parser')
            pageData = soup.find_all(text = True) # Extracts every text node (including unwanted tag data)
            # Logging
            print(link)
            pageText = list(filter(filterText, pageData)) # Applies the filter function to the current page data
            dataWrite = ' '.join(pageText)
            fileNum = fileNum + 1
        except Exception as e: # If the HTTP request fails for any reason, log and skip the link
            print(e)
            continue
        with open('file%d.txt' % fileNum, 'w', encoding='UTF-8') as file: # Standardized file name ('file#.txt')
            file.write(dataWrite)
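# After a successful run, the working directory holds file1.txt through
# file<numLinks>.txt, one raw text dump per scraped page.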
# Cleans up the scraped text
def procText(filename):
    with open(filename, 'r', encoding='UTF-8') as file:
        lines = file.readlines()
    lines = [line.strip() for line in lines if line.strip()] # Removes blank lines and surrounding whitespace
    with open('%s_sents.txt' % filename.split('.')[0], 'w', encoding='UTF-8') as file: # Standardized file name ('file#_sents.txt')
        for line in lines:
            currSents = sent_tokenize(line)
            for sent in currSents:
                file.write('%s\n' % sent)
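# For example, a raw line like 'Frogs are amphibians. They lay eggs in water.' is
# written out as two lines, one sentence per line, which is the format the
# knowledge base builder below expects.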
# Finds and displays the top 40 important terms (by term frequency) from the processed data
def impTerms(numLinks):
    uniCounts = {}
    stopWords = set(stopwords.words('english')) # Precomputed once so the stopword check is a fast set lookup
    # Determines the frequency of each unique (non-stopword) term in the data
    for i in range(numLinks):
        with open('file%d_sents.txt' % (i + 1), 'r', encoding='UTF-8') as file:
            data = file.read()
        rawToks = word_tokenize(data)
        unigrams = [tok.lower() for tok in rawToks if tok.isalpha() and tok.lower() not in stopWords]
        for tok in unigrams:
            if tok not in uniCounts:
                uniCounts[tok] = 1
            else:
                uniCounts[tok] = uniCounts[tok] + 1
    impTerms = sorted(uniCounts, key = uniCounts.get, reverse=True) # List of keys, sorted by highest count first
    print(impTerms[:40]) # Displays the top 40 words to choose from
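# The printed list is only a set of candidates: the final 10 terms are picked by
# hand from this output (see tenTerms below), so nothing needs to be returned.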
# Builds and pickles the knowledge base (dict: {term: [sentences that include the term]})
def buildKB(termList, numLinks):
    sentList = []
    kBase = {k: [] for k in termList}
    for i in range(numLinks):
        with open('file%d_sents.txt' % (i + 1), 'r', encoding='UTF-8') as file:
            sentList.extend(file.readlines()) # Puts all sentences into one list
    for word in termList:
        for sent in sentList:
            if word in sent.lower().split(): # split() ensures that every match is a whole-word match;
                kBase[word].append(sent.rstrip()) # for example, it prevents a sentence with 'asking' from matching the term 'skin'
    with open('frog_knowledge_base.p', 'wb') as file: # 'with' guarantees the pickle file is closed after writing
        pickle.dump(kBase, file)
    return kBase
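# The resulting knowledge base maps each chosen term to every sentence that
# mentions it; illustratively (these sentences are made up):
#   {'frog': ['The frog is a tailless amphibian.', ...], 'skin': [...], ...}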
# Parts 1-4: separated from parts 5-6 because the 10 terms must be chosen manually once the important terms are found
def scrape(url, numLinks):
    links = crawlUrls(url)
    scrapeLinks(links, numLinks) # Creates the 'file#.txt' files
    for i in range(numLinks):
        procText('file%d.txt' % (i + 1)) # Creates the 'file#_sents.txt' files
    impTerms(numLinks) # Displays the candidate terms
###########################################################################################################################
# No main guard: parts 1-4 and parts 5-6 are run and toggled manually
url = 'https://en.wikipedia.org/wiki/Frog'
numLinks = 20
# scrape(url, numLinks) # Commented out to prevent any changes to the scraped data corpus
tenTerms = ['frog', 'species', 'tree', 'water', 'eggs', 'amphibians', 'tadpoles', 'toad', 'anura', 'skin'] # Manually chosen
kbDict = buildKB(tenTerms, numLinks)
print(kbDict['skin']) # Displays one entry of the dict for logging
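# A minimal sketch of reading the pickled knowledge base back in a later session,
# kept commented out (like scrape() above) so it is toggled manually:
# with open('frog_knowledge_base.p', 'rb') as f:
#     kb = pickle.load(f)
# print(kb['frog'][:3]) # The first few sentences collected for the term 'frog'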