-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
77 lines (53 loc) · 1.74 KB
/
scraper.py
File metadata and controls
77 lines (53 loc) · 1.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from bs4 import BeautifulSoup
import urllib3
import re
# set of article paths already visited (seeded with the start page)
# NOTE: fixed bug — set("wiki/...") built a set of the string's individual
# characters, so `currentUrl in listmap` never matched a full href and the
# duplicate check was dead. Seed with the leading "/" to match href format.
listmap = {"/wiki/Baylor_University"}
# scrape takes a url and gets a list of embedded urls from the html
# returns the number of files visited
def scrape(url, counter):
    """Recursively crawl Wikipedia article links starting from *url*.

    Writes the current URL followed by each article href found on its page
    (tab-separated) to the global file handle ``f``, recursing into links
    not yet recorded in the global ``listmap`` set until ``counter``
    reaches 1000.

    Relies on module-level globals: ``http`` (urllib3 PoolManager),
    ``f`` (open dataset file), ``listmap`` (visited-path set) and
    ``baseUrl`` (site root prepended to relative hrefs).

    Returns the updated counter (total links processed so far).
    """
    # fetch the page over HTTP
    response = http.request('GET', url)
    # parse the HTML from the response body
    soup = BeautifulSoup(response.data, features="html5lib")
    # record the current URL in the dataset file
    f.write(url + "\t")
    # every anchor whose href is an article path like /wiki/Some_Page
    # NOTE: fixed character class — the original "A-z" range accidentally
    # matched the ASCII punctuation between 'Z' and 'a' ([ \ ] ^ _ `).
    for link in soup.findAll('a', attrs={'href': re.compile(r"^/wiki/[a-zA-Z0-9_()-]+$")}):
        # parse the href to get the link path
        currentUrl = link.get('href')
        # show progress on stdout
        print(currentUrl)
        # record the outgoing link next to its parent URL
        f.write(currentUrl + "\t")
        # count every link seen (duplicates included — matches original behavior)
        counter = counter + 1
        # cap the crawl so the recursion doesn't run forever
        if counter < 1000:
            if currentUrl in listmap:
                # already visited — don't recurse into it again
                print("duplicate")
            else:
                # mark as visited before recursing to avoid cycles
                listmap.add(currentUrl)
                # terminate this page's record before descending
                f.write("\n")
                # crawl the linked page
                counter = scrape(baseUrl + currentUrl, counter)
    f.write("\n")
    return counter
# counter for number of links processed
counter = 0
# open the dataset file for writing — mode "w" truncates any previous
# contents, so the separate empty-then-append two-step was redundant
f = open("dataset", "w")
# connection pool used by scrape() for all HTTP requests
http = urllib3.PoolManager()
# site root prepended to the relative /wiki/... hrefs
baseUrl = "https://en.wikipedia.org"
# starting node of the crawl
url = 'https://en.wikipedia.org/wiki/Baylor_University'
# crawl and count the number of links processed
counter = scrape(url, counter)
print(counter)
# flush the dataset to disk (the original never closed the handle)
f.close()