-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawl_pict.py
More file actions
65 lines (50 loc) · 1.43 KB
/
crawl_pict.py
File metadata and controls
65 lines (50 loc) · 1.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/usr/bin/python
# encoding:utf-8
# __Author__ = Slwhy
import requests
from bs4 import BeautifulSoup
import os
def getHtmlText(url):
    """Fetch *url* and return the decoded page body, or "" on any request failure.

    A 20-second timeout guards against hung connections, and
    raise_for_status() turns HTTP 4xx/5xx responses into failures.
    The encoding is re-guessed from the body (apparent_encoding) because
    this site's Content-Type header is unreliable.
    """
    try:
        r = requests.get(url, timeout=20)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort: callers treat "" as "page unavailable" and move on.
        return ""
def getPictUrl(picUrlList, html):
    """Append the src of the first <img> tag in *html* to picUrlList (in place).

    Prints 'error' when no <img> tag is found — e.g. when *html* is ""
    because the fetch failed upstream.
    """
    soup = BeautifulSoup(html, "html.parser")
    img = soup.find('img')
    # Explicit None check instead of the original bare except, which was
    # only there to mask the AttributeError from None.get('src').
    if img is not None:
        picUrlList.append(img.get('src'))
    else:
        print('error')
def downPict(picUrlList):
    """Download every image URL in picUrlList into ./pic/ under the cwd.

    Creates the pic/ directory on first use and skips files that already
    exist on disk. A failure on one URL is reported and does not stop the
    remaining downloads.
    """
    root = os.getcwd() + '/pic/'
    for url in picUrlList:
        try:
            # Inside the try: url may be None if an <img> had no src,
            # and the original crashed the whole run on that case.
            path = root + url.split('/')[-1]
            if not os.path.exists(root):
                os.mkdir(root)
            if not os.path.exists(path):
                # timeout prevents a single slow host from hanging the crawl;
                # raise_for_status avoids saving a 404 error page as an image.
                r = requests.get(url, timeout=20)
                r.raise_for_status()
                with open(path, 'wb') as f:
                    f.write(r.content)
                print('文件保存成功')
            else:
                print('文件已经存在')
        except Exception:
            # Best-effort per file: report and continue with the next URL.
            print('爬取失败')
def main():
    """Crawl zol.com.cn wallpaper pages 89100–89199 and download each image.

    For every page id, fetches the show-pic page, extracts the first image
    URL, then downloads the whole collected list at the end.
    """
    # Example page: http://desk.zol.com.cn/showpic/2880x1800_89000_112.html
    rootUrl = 'http://desk.zol.com.cn/showpic/2880x1800_89'
    picUrlList = []
    for i in range(100, 200):
        url = rootUrl + str(i) + '_112.html'
        print(url)
        html = getHtmlText(url)
        getPictUrl(picUrlList, html)
    downPict(picUrlList)


if __name__ == '__main__':
    # Guarded so importing this module no longer kicks off a full crawl.
    main()