-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathspider.py
More file actions
306 lines (264 loc) · 9.71 KB
/
spider.py
File metadata and controls
306 lines (264 loc) · 9.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
import requests
import time
import json
import pymongo
from requests.exceptions import ConnectionError
# Cookie header value sent with the authenticated requests below; the
# placeholder text must be replaced with your own cookie before running.
COOKIE = '这里填写自己的Cookie'
# Connect to MongoDB
client = pymongo.MongoClient(host='localhost', port=27017)
db = client.bilibili_user
# MIN: 'id' of the db.list entry that rep_run() looks up (incremented per call).
MIN = 0
# MID: running counter save_followers_mongodb() uses to assign the 'id' field.
MID = 0
def get_space(mid):
    """Open a user's bilibili space page, then fetch that user's profile.

    Requests ``https://space.bilibili.com/<mid>`` and, on HTTP 200, calls
    ``get_GetINnfo(mid)``. Network failures and non-200 responses are
    reported with ``print`` and otherwise ignored (best effort).

    :param mid: user ID
    """
    headers = {
        'Host': 'space.bilibili.com',
        'Referer': 'https://www.bilibili.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Cookie': COOKIE,
    }
    url = 'https://space.bilibili.com/' + str(mid)
    try:
        req = requests.get(url, headers=headers, timeout=60)
    except requests.exceptions.RequestException as e:
        # RequestException also covers read timeouts (timeout=60 above),
        # which are not ConnectionError subclasses and used to escape.
        print('{}网络异常'.format(type(e).__name__), e.args)
        return

    if req.status_code != 200:
        print('进入bili用户主页失败,code {}'.format(req.status_code))
        return

    print('bili用户主页url:{}'.format(url))
    print('成功进入用户主页')
    # Fetch the user's profile information.
    get_GetINnfo(mid)
def get_GetINnfo(mid):
    """Fetch a user's profile and store it via ``save_GetINnfo_mongodb``.

    POSTs ``mid`` to the ``GetInfo`` endpoint. When the JSON response has a
    non-empty ``data`` object, a record with ``mid``, ``name``, ``sex``,
    ``regtime``, ``birthday`` and ``sign`` is printed and saved. Failures are
    reported with ``print`` and otherwise ignored.

    :param mid: user ID
    :return: None (the profile is saved, not returned)
    """
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Host': 'space.bilibili.com',
        'Origin': 'https://space.bilibili.com',
        'Referer': 'https://space.bilibili.com/' + str(mid),
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    payload = {'mid': str(mid)}
    url = 'https://space.bilibili.com/ajax/member/GetInfo'
    try:
        req = requests.post(url, headers=headers, data=payload, timeout=60)
    except requests.exceptions.RequestException as e:
        # Also covers read timeouts, which ConnectionError alone misses.
        print('{}网络异常'.format(type(e).__name__), e.args)
        return

    if req.status_code != 200:
        print('获取用户个人信息失败,code {}'.format(req.status_code))
        return

    try:
        body = req.json()
    except ValueError:
        # A 200 response with a non-JSON body (e.g. an HTML error page).
        print('获取用户个人信息失败,响应不是JSON')
        return
    print('获取用户个人信息成功')

    info = body.get('data')
    if not info:
        return

    # time.localtime(None) means "now", so a missing regtime must not be
    # formatted: it would be stored as a bogus registration date.
    raw_regtime = info.get('regtime')
    if raw_regtime is None:
        regtime = None
    else:
        regtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(raw_regtime))

    result = {
        'mid': info.get('mid'),
        'name': info.get('name'),
        'sex': info.get('sex'),
        'regtime': regtime,
        'birthday': info.get('birthday'),
        'sign': info.get('sign'),
    }
    print('用户个人信息:{}'.format(result))
    # Persist the profile to the database.
    save_GetINnfo_mongodb(result)
def get_myinfo(mid):
    """Fetch the follower and following counts.

    NOTE(review): the request URL carries no ``mid``; ``mid`` is only sent in
    the ``Referer`` header, so the counts presumably belong to the account
    identified by ``COOKIE`` -- confirm against the API.

    :param mid: user ID (used for the Referer header)
    :return: ``(follower, following)`` when the response contains ``data``;
        ``None`` on a network error, a non-200 status, an unparsable body or
        a response without ``data``
    """
    headers = {
        'Connection': 'keep-alive',
        'Cookie': COOKIE,
        'Host': 'api.bilibili.com',
        'Referer': 'https://space.bilibili.com/' + str(mid),
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    url = 'https://api.bilibili.com/x/space/myinfo?jsonp=jsonp'
    print('获取用户关注数量和粉丝数量url:{}'.format(url))
    try:
        req = requests.get(url, headers=headers, timeout=60)
    except requests.exceptions.RequestException as e:
        # Also covers read timeouts, which ConnectionError alone misses.
        print('{}网络异常'.format(type(e).__name__), e.args)
        return None

    if req.status_code != 200:
        print('get_myinfo url失败 code:{}'.format(req.status_code))
        return None

    try:
        body = req.json()
    except ValueError:
        print('get_myinfo 响应不是JSON')
        return None

    info = body.get('data')
    if not info:
        return None

    follower = info.get('follower')    # number of fans
    following = info.get('following')  # number of followed users
    print('关注数量:{}, 粉丝数量:{}'.format(following, follower))
    return follower, following
def get_followings(mid, pn, ps):
    """Fetch one page of the users that ``mid`` follows and process each one.

    For every entry in the page, the followed user's space page is visited
    with ``get_space`` and the ``uname``/``mid`` pair is stored with
    ``save_followers_mongodb``. Failures are reported with ``print`` and
    otherwise ignored.

    :param mid: user ID
    :param pn: page number
    :param ps: page size (entries per page)
    """
    headers = {
        'Connection': 'keep-alive',
        'Cookie': COOKIE,
        'Host': 'api.bilibili.com',
        'Referer': 'https://space.bilibili.com/' + str(mid),
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    url = ('https://api.bilibili.com/x/relation/followings?vmid={}&pn={}&ps={}'
           '&order=desc&jsonp=jsonp').format(mid, pn, ps)
    print('获取关注用户信息url:{}'.format(url))
    try:
        req = requests.get(url, headers=headers, timeout=60)
    except requests.exceptions.RequestException as e:
        # Also covers read timeouts, which ConnectionError alone misses.
        print('{}网络异常'.format(type(e).__name__), e.args)
        return

    if req.status_code != 200:
        print('获取关注用户信息失败 code:{}'.format(req.status_code))
        return

    try:
        body = req.json()
    except ValueError:
        print('获取关注用户信息失败 响应不是JSON')
        return

    page = body.get('data')
    if not page:
        # The original author attributes a missing payload to the API only
        # serving the first 5 pages.
        print('限制只访问前5页')
        return

    # 'list' may be absent or null; treat that as an empty page.
    for entry in page.get('list') or []:
        result = {
            'uname': entry.get('uname'),
            'mid': entry.get('mid'),
        }
        print(result)
        # Visit the followed user's space page.
        get_space(result.get('mid'))
        # Queue the followed user's mid in the database.
        save_followers_mongodb(result)
def get_followers(mid, pn, ps):
    """Fetch one page of the fans of ``mid`` and process each one.

    For every entry in the page, the fan's space page is visited with
    ``get_space`` and the ``uname``/``mid`` pair is stored with
    ``save_followers_mongodb``. Failures are reported with ``print`` and
    otherwise ignored.

    :param mid: user ID
    :param pn: page number
    :param ps: page size (entries per page)
    """
    headers = {
        'Connection': 'keep-alive',
        'Cookie': COOKIE,
        'Host': 'api.bilibili.com',
        'Referer': 'https://space.bilibili.com/' + str(mid),
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    url = ('https://api.bilibili.com/x/relation/followers?vmid={}&pn={}&ps={}'
           '&order=desc&jsonp=jsonp').format(mid, pn, ps)
    print('获取粉丝用户信息url:{}'.format(url))
    try:
        req = requests.get(url, headers=headers, timeout=60)
    except requests.exceptions.RequestException as e:
        # Also covers read timeouts, which ConnectionError alone misses.
        print('{}网络异常'.format(type(e).__name__), e.args)
        return

    if req.status_code != 200:
        print('获取所有粉丝用户信息失败 code:{}'.format(req.status_code))
        return

    try:
        body = req.json()
    except ValueError:
        print('获取所有粉丝用户信息失败 响应不是JSON')
        return

    page = body.get('data')
    if not page:
        # The original author attributes a missing payload to the API only
        # serving the first 5 pages.
        print('限制只访问前5页')
        return

    # 'list' may be absent or null; treat that as an empty page.
    for entry in page.get('list') or []:
        result = {
            'uname': entry.get('uname'),
            'mid': entry.get('mid'),
        }
        print(result)
        # Visit the fan's space page.
        get_space(result.get('mid'))
        # Queue the fan's mid in the database.
        save_followers_mongodb(result)
def save_followers_mongodb(result):
    """Store a followed/fan user in ``db.list`` under the next free ``id``.

    Users whose ``mid`` is already stored are skipped without consuming an
    id, so the ``id`` sequence stays free of gaps. Otherwise the global
    ``MID`` counter is advanced to the first id not yet present in the
    collection, written into ``result['id']``, and the record is inserted.

    :param result: dict with at least ``uname`` and ``mid``; it is mutated
        (an ``id`` key is added before the insert)
    """
    global MID
    collection = db.list
    if collection.find_one({'mid': result.get('mid')}):
        print('{} 在数据库已存在'.format(result.get('uname')))
        return

    # Advance to the first unused id. A loop replaces the former recursion,
    # which could exhaust the stack when many ids were already taken
    # (e.g. when restarting against a populated database).
    MID += 1
    while collection.find_one({'id': MID}):
        print('数据库已存在该id {}'.format(MID))
        MID += 1

    result['id'] = MID
    # insert_one replaces Collection.insert, which PyMongo 4 removed.
    collection.insert_one(result)
    print('{} 保存到数据库成功'.format(result.get('uname')))
def save_GetINnfo_mongodb(result):
    """Store a user's profile in ``db.myinfo`` unless its ``mid`` is already there.

    :param result: profile dict with at least ``mid`` and ``name``
    """
    collection = db.myinfo
    if collection.find_one({'mid': result.get('mid')}):
        print('{} 用户在数据库已存在'.format(result.get('name')))
        return

    # insert_one replaces Collection.insert, which PyMongo 4 removed.
    collection.insert_one(result)
    print('{} 用户保存到数据库成功'.format(result.get('name')))
def _page_count(total, page_size):
    """Return how many pages hold ``total`` items; never less than one."""
    total = int(total or 0)
    return max(1, (total + page_size - 1) // page_size)


def _crawl_user(mid):
    """Crawl one user: space page, counts, then all following/fan pages."""
    page_size = 50
    # Visit the user's space page (this also stores the profile).
    get_space(mid)
    # get_myinfo returns None on any failure; skip the listings in that case
    # instead of crashing on tuple unpacking.
    counts = get_myinfo(mid)
    if counts is None:
        print('获取关注数量和粉丝数量失败,跳过用户 {}'.format(mid))
        return
    follower, following = counts

    # Followed users: pages 1..N inclusive, so the last partial page is kept.
    for pn in range(1, _page_count(following, page_size) + 1):
        get_followings(mid, pn, page_size)

    # Fans: same paging scheme.
    follower_pages = _page_count(follower, page_size)
    print(follower_pages)
    for pn in range(1, follower_pages + 1):
        get_followers(mid, pn, page_size)


def run(mid):
    """Crawl ``mid``, then keep crawling the users queued in ``db.list``.

    :param mid: user ID to start from
    """
    _crawl_user(mid)
    # Continue with the queued users.
    rep_run()


def rep_run():
    """Crawl every queued user in ``db.list`` in ascending ``id`` order.

    The global ``MIN`` holds the ``id`` of the last entry crawled. Each
    iteration picks the entry with the smallest ``id`` greater than ``MIN``,
    so gaps in the id sequence do not stop the crawl. The function returns
    once no such entry exists. It iterates instead of recursing through
    ``run``, so the call stack stays flat however many users are crawled.
    """
    global MIN
    collection = db.list
    while True:
        entry = collection.find_one(
            {'id': {'$gt': MIN}}, sort=[('id', pymongo.ASCENDING)]
        )
        if not entry:
            print('程序即将停止运行,所有信息爬取完成')
            return
        MIN = entry.get('id')
        _crawl_user(entry.get('mid'))
if __name__ == '__main__':
    # Seed user for the crawl; preferably replace it with your own mid.
    seed_mid = 10047741
    run(seed_mid)