-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathGetYYLinks.py
More file actions
65 lines (54 loc) · 2.08 KB
/
GetYYLinks.py
File metadata and controls
65 lines (54 loc) · 2.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import sgmllib
import urllib2
from pprint import pprint
class PageParser(sgmllib.SGMLParser):
def __init__(self, aFile):
# inherit from the SGMLParser class
sgmllib.SGMLParser.__init__(self)
# create a list this will store all the links found
self.links = []
self.isInBox = False;
self.isThunder = False;
self.thunderCount = 0;
self.file = aFile;
def handle_data(self, data):
if(self.isInBox and self.isThunder):
self.thunderCount = self.thunderCount + 1;
if(self.thunderCount == 3) :
print data;
self.isThunder = False; #eat it
self.thunderCount = 0;
def unknown_starttag(self, tag, attrs):
for key, value in attrs:
#print "key and value", key, value
#if key == "thunderrestitle" and value.endswith('rmvb'):
if key == "thunderrestitle":
self.isThunder = True;
if(self.isThunder):
# print unicode(attrs[2][1], 'utf-8'), ",", attrs[5][1]
print attrs[5][1].strip();
self.file.write(unicode(attrs[2][1], 'utf-8'));
self.file.write(",");
self.file.write(attrs[5][1].strip());
self.file.write("\n");
self.isThunder = False; #eat it
def getBaiduHDList(vedioUrl):
    """Fetch a page and export its marked entries to ``thunder.csv``.

    ``thunder.csv`` in the current directory is always (re)created.  The page
    body is fed to PageParser only when the response's Content-Type header is
    exactly ``text/html; charset=utf-8``; otherwise the file is left empty.

    vedioUrl -- URL of the page to download.

    Returns an empty list; the parsed links are written to the file and are
    not collected into the return value.
    Propagates any error raised by ``urllib2.urlopen`` or by the parser.
    """
    bdhdList = []
    sock = urllib2.urlopen(vedioUrl)
    try:
        # getheader() with a default avoids a KeyError when the server sends
        # no Content-Type header at all.
        contentType = sock.info().getheader('content-type', '')
        with open('thunder.csv', 'w') as f:
            # Only parse pages the server declares as UTF-8 encoded HTML.
            if contentType == 'text/html; charset=utf-8':
                parser = PageParser(f)
                parser.feed(sock.read())
                parser.close()
    finally:
        # Release the HTTP connection even if parsing or writing failed.
        sock.close()
    return bdhdList
def main():
    """Export the entries of one hard-coded yyets resource page."""
    # Another resource page used previously:
    #   http://www.yyets.com/php/resource/26351
    resourcePage = "http://www.yyets.com/php/resource/26263"
    getBaiduHDList(resourcePage)
# Run the export only when executed as a script, not when imported.
if __name__ == '__main__':
    main()