crawls/crawlDyTtD.py at master · Slwhy/crawls · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/python
# encoding:utf-8
# __Author__ = Slwhy


import requests
from bs4 import BeautifulSoup
import os
import json
import codecs
import cPickle as pickle        #可将内存中的数据序列化为字符串，写入磁盘


def getHtmlText(url):
    """Download *url* and return the response body as text.

    The body is decoded with the encoding that requests detects from the
    content (``apparent_encoding``) instead of the one taken from the
    response headers.

    Args:
        url: address of the page to fetch.

    Returns:
        The decoded page text, or None when the request fails (connection
        problem, timeout after 20 seconds, or an HTTP error status). In the
        failure case 'requests error' is printed.
    """
    try:
        r = requests.get(url, timeout=20)
        # Check the status first so the encoding detection, which scans the
        # whole body, is not run on error pages.
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request failures are handled here; KeyboardInterrupt and
        # programming errors are allowed to propagate.
        print('requests error')
        return None

def getMovieUrlList(html, dict1):
    """Follow every link inside a <ul> of *html* and print each page's detail element.

    For each <a> found inside a <ul>, the href is resolved against
    'http://www.dytt8.net/', the link is recorded in *dict1*, the target page
    is downloaded with getHtmlText, and the first element whose style
    attribute is "WORD-WRAP: break-word" is printed (None is printed when the
    page has no such element). The number of recorded links is printed last.

    Args:
        html: markup of the listing page, or None if it could not be
            downloaded (nothing is crawled in that case).
        dict1: dict updated in place, mapping link text to absolute URL.
    """
    # Function-scope import so the block works on the Python 2 interpreter
    # this script targets as well as on Python 3.
    try:
        from urlparse import urljoin
    except ImportError:
        from urllib.parse import urljoin

    count = 0
    if html is None:
        print(count)
        return

    base_url = 'http://www.dytt8.net/'
    listing = BeautifulSoup(html, 'html.parser')
    for ul in listing.find_all('ul'):
        for a in ul.find_all('a'):
            href = a.get('href')
            if not href:
                # Anchor without a target: nothing to follow.
                continue
            # urljoin avoids the doubled slash that plain concatenation
            # produces for site-relative hrefs such as '/html/...'.
            url = urljoin(base_url, href)
            dict1[a.get_text()] = url
            count += 1

            page_html = getHtmlText(url)
            if page_html is None:
                # Download failed (already reported); keep crawling.
                continue
            page = BeautifulSoup(page_html, 'html.parser')
            print(page.find(style="WORD-WRAP: break-word"))
    print(count)


def writeMovieHtmlUrl(dict1, path=r'C:\Users\zjp\Desktop\shaoli\scarpy\movie.txt'):
    """Write the entries of *dict1* to a UTF-8 text file, one numbered line each.

    Every line has the form ``<n>\\t<key>\\t\\t\\t\\t\\t<value>\\r\\n`` where
    ``n`` starts at 1 and follows the dict's iteration order. An existing file
    is overwritten; an empty dict produces an empty file.

    Args:
        dict1: mapping of text keys to text values (both must be unicode
            text, as they are written through a UTF-8 encoder).
        path: destination file. Defaults to the location the script has
            always used.
    """
    # 'with' guarantees the file is closed even if a write fails part-way.
    with codecs.open(path, 'wb', 'utf-8') as f:
        for count, key in enumerate(dict1, 1):
            f.write(u'%d\t%s\t\t\t\t\t%s\r\n' % (count, key, dict1[key]))

if __name__ == '__main__':
    # Example of an absolute page URL and the matching site-relative path:
    #   'http://www.dytt8.net/html/gndy/jddy/20171113/55535.html'
    #   '/html/gndy/jddy/20171113/55535.html'
    url = 'http://www.dytt8.net/'
    dict1 = {}
    html = getHtmlText(url)
    # getHtmlText yields None when the download fails; skip parsing then
    # instead of handing None to the HTML parser.
    if html is not None:
        getMovieUrlList(html, dict1)
    # Writing the collected links to disk is currently disabled.
    #writeMovieHtmlUrl(dict1)