-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawl_pict.py
More file actions
65 lines (50 loc) · 1.43 KB
/
crawl_pict.py
File metadata and controls
65 lines (50 loc) · 1.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/usr/bin/python
# encoding:utf-8
# __Author__ = Slwhy
import requests
from bs4 import BeautifulSoup
import os
def getHtmlText(url):
    """Fetch *url* and return the decoded page body, or "" on any request failure.

    A 20-second timeout guards against hung connections, and
    raise_for_status() turns HTTP 4xx/5xx responses into failures.
    The encoding is re-guessed from the body (apparent_encoding) because
    this site's Content-Type header is unreliable.
    """
    try:
        r = requests.get(url, timeout=20)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort: callers treat "" as "page unavailable" and move on.
        return ""
def getPictUrl(picUrlList, html):
    """Append the src of the first <img> tag in *html* to picUrlList (in place).

    Prints 'error' when no <img> tag is found — e.g. when *html* is ""
    because the fetch failed upstream.
    """
    soup = BeautifulSoup(html, "html.parser")
    img = soup.find('img')
    # Explicit None check instead of the original bare except, which was
    # only there to mask the AttributeError from None.get('src').
    if img is not None:
        picUrlList.append(img.get('src'))
    else:
        print('error')
def downPict(picUrlList):
    """Download every image URL in picUrlList into ./pic/ under the cwd.

    Creates the pic/ directory on first use and skips files that already
    exist on disk. A failure on one URL is reported and does not stop the
    remaining downloads.
    """
    root = os.getcwd() + '/pic/'
    for url in picUrlList:
        try:
            # Inside the try: url may be None if an <img> had no src,
            # and the original crashed the whole run on that case.
            path = root + url.split('/')[-1]
            if not os.path.exists(root):
                os.mkdir(root)
            if not os.path.exists(path):
                # timeout prevents a single slow host from hanging the crawl;
                # raise_for_status avoids saving a 404 error page as an image.
                r = requests.get(url, timeout=20)
                r.raise_for_status()
                with open(path, 'wb') as f:
                    f.write(r.content)
                print('文件保存成功')
            else:
                print('文件已经存在')
        except Exception:
            # Best-effort per file: report and continue with the next URL.
            print('爬取失败')
def main():
    """Crawl zol.com.cn wallpaper pages 89100–89199 and download each image.

    For every page id, fetches the show-pic page, extracts the first image
    URL, then downloads the whole collected list at the end.
    """
    # Example page: http://desk.zol.com.cn/showpic/2880x1800_89000_112.html
    rootUrl = 'http://desk.zol.com.cn/showpic/2880x1800_89'
    picUrlList = []
    for i in range(100, 200):
        url = rootUrl + str(i) + '_112.html'
        print(url)
        html = getHtmlText(url)
        getPictUrl(picUrlList, html)
    downPict(picUrlList)


if __name__ == '__main__':
    # Guarded so importing this module no longer kicks off a full crawl.
    main()