-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawl_novel.py
More file actions
57 lines (51 loc) · 1.51 KB
/
crawl_novel.py
File metadata and controls
57 lines (51 loc) · 1.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/usr/bin/python
# encoding:utf-8
# __Author__ = Slwhy
'''
Purpose: crawl novel content from the Biquge (qu.la) website.
'''
import requests
from bs4 import BeautifulSoup
def getHtmlText(url):
    """Download *url* and return the decoded response body.

    The response's encoding is replaced by the one ``requests`` detects from
    the body (``apparent_encoding``) before the text is decoded.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text, or ``None`` when the request fails (connection
        problem, timeout, or an HTTP error status). On failure the message
        ``requests error`` is printed.
    """
    try:
        r = requests.get(url, timeout=20)
        # Check the status first so encoding detection is not run on the
        # body of an error response.
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request/HTTP failures are swallowed; KeyboardInterrupt and
        # programming errors propagate instead of being hidden.
        print('requests error')
        return None
if __name__ == '__main__':
    # Interactive entry point: print the chapter list of one novel, ask the
    # user for a chapter number, then print that chapter's title and text.

    # Works on both Python 2 (raw_input) and Python 3 (input).
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    url = 'https://www.qu.la/book/3137/'  # Biquge page of the novel "Yuan Zun"
    #url = 'http://www.farpop.com/0_4/'
    root_url = 'https://www.qu.la'

    # The index page is downloaded once and parsed.
    html = getHtmlText(url)
    if html is None:
        # A failed download yields None, which cannot be parsed.
        raise SystemExit('could not download the chapter index')
    soup = BeautifulSoup(html, 'html.parser')

    urlList = []
    print("***********目录*************")
    for dd in soup.find_all('dd'):
        link = dd.a
        if link is None:
            # <dd> without an <a> child.
            print('fail')
            continue
        # NOTE(review): only links whose style attribute is an empty string
        # are treated as chapter entries - confirm against the site markup.
        if link.get('style') != "":
            continue
        href = link.get('href')
        if href is None:
            # A link without a target cannot be turned into a chapter URL.
            print('fail')
            continue
        print('\t\t\t\t\t\t\t\t\t\t\t\t' + link.text)
        urlList.append(root_url + href)

    num = read_line('please input the num of chapter you want:')
    try:
        # int() instead of eval(): text typed at the prompt must never be
        # executed as code. Expressions such as "1+2" are no longer accepted.
        n = int(num)
    except ValueError:
        raise SystemExit('invalid chapter number: %r' % num)
    if n > 103:
        # NOTE(review): magic offset kept from the original; presumably the
        # numbering of this book shifts by one after entry 103 - confirm.
        n = n - 1
    if not 1 <= n <= len(urlList):
        # Without this check 0 or a negative number would silently select a
        # chapter counted from the end of the list.
        raise SystemExit(
            'chapter number out of range; %d chapters available' % len(urlList))

    html = getHtmlText(urlList[n - 1])
    if html is None:
        raise SystemExit('could not download the chapter page')
    soup = BeautifulSoup(html, 'html.parser')

    title = soup.h1
    if title is not None:
        print('\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t' + title.text)

    content = soup.find(id='content')
    if content is None:
        raise SystemExit('chapter page has no element with id "content"')
    # Print only the children that are not tags themselves, i.e. the text
    # nodes; nested elements are skipped.
    for node in content.children:
        if not isinstance(node, type(content)):
            print(node)