Fetchman is a simple spider (web-crawling) system.
The quick way to install it:
pip install fetchman
zhu_processor.py:
from fetchman.spider.spider_core import SpiderCore
from fetchman.processor.base_processor import BaseProcessor
from fetchman.downloader.http.spider_request import Request
from fetchman.utils.decorator import check
from fetchman.pipeline.console_pipeline import ConsolePipeline
from fetchman.pipeline.pic_pipeline import PicPipeline
from fetchman.pipeline.pipe_item import pipeItem
from bs4 import BeautifulSoup as bs
import hashlib
import time
import random
import sys
# Python 2 only: re-expose sys.setdefaultencoding (hidden by site.py at
# startup) and force UTF-8 so implicit str<->unicode conversions of the
# Chinese page content don't raise UnicodeDecodeError. Python 3 already
# uses UTF-8 and has neither `reload` as a builtin nor
# `setdefaultencoding`, so guard by version to keep the script portable.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- `reload` is a builtin on Python 2
    sys.setdefaultencoding('utf-8')
class Zhu_Processor(BaseProcessor):
    """Spider processor for the pig-industry news list on zhuwang.cc.

    Flow: ``process`` reads the pager on the first listing page and enqueues
    every listing page; ``process_page`` extracts each article's detail URL,
    thumbnail URL, title and summary, enqueuing one image request and one
    detail request per article; ``process_pic`` forwards raw image bytes to
    the 'save' pipeline; ``process_detail`` extracts the publish date/time
    and news source and forwards them to the 'console' pipeline.
    """

    spider_id = 'zhu_spider'
    spider_name = 'zhu_spider'
    allowed_domains = ['zhuwang.cc']
    start_requests = [Request(url='http://www.zhuwang.cc/list-58-1.html', priority=0)]

    @check
    def process(self, response):
        """Discover the total page count, then yield a Request per listing page."""
        soup = bs(response.m_response.content, 'lxml')
        page_list = soup.select('div.zxpage a')
        # The next-to-last pager link carries the total number of pages.
        total_page = int(page_list[-2].text)
        for page in range(1, total_page + 1):
            # duplicate_remove=False: page 1 was already fetched as the
            # start request, so let it through the dedup filter again.
            yield Request(url='http://www.zhuwang.cc/list-58-%d.html' % page,
                          callback=self.process_page, priority=0,
                          duplicate_remove=False)

    @check
    def process_page(self, response):
        """Extract every article entry from one listing page."""
        soup = bs(response.m_response.content, 'lxml')
        zhu_div_list = soup.select('div.zxleft ul li')
        for zhu_div in zhu_div_list:
            detail_url = zhu_div.select('a')[0]['href']
            img_url = zhu_div.select('a img')[0]['src']
            title = zhu_div.select('a img')[0]['alt'].strip()
            shortDes = zhu_div.select('p.zxleft32 a')[0].text
            # Random local file name for the image. Encode before hashing:
            # hashlib requires bytes on Python 3 (no-op harm on Python 2).
            md5 = hashlib.md5()
            rand_name = str(time.time()) + str(random.random())
            md5.update(rand_name.encode('utf-8'))
            img_name = md5.hexdigest() + '.jpg'
            # One request downloads the thumbnail ...
            request = Request(url=img_url, priority=1, callback=self.process_pic)
            request.meta['img_name'] = img_name
            yield request
            # ... and one fetches the article body; both share img_name so
            # the detail record can reference the saved image.
            request = Request(url=detail_url, priority=1, callback=self.process_detail)
            request.meta['title'] = title
            request.meta['shortDes'] = shortDes
            request.meta['img_name'] = img_name
            yield request

    @check
    def process_pic(self, response):
        """Hand the raw image bytes to the 'save' (PicPipeline) pipeline."""
        result = response.m_response.content
        yield pipeItem(['save'], result)

    @check
    def process_detail(self, response):
        """Extract publish date/time and news source from an article page."""
        soup = bs(response.m_response.content, 'lxml')
        # Header line looks like '来源: <source> <date> <time>|...'; strip the
        # '来源:' prefix and split on spaces.
        dd_tail = soup.select('div.zxxwleft p.zxxw2')[0].text.replace('来源: ', '').replace('来源:', '').split(' ')
        date_time = dd_tail[1].strip() + ' ' + dd_tail[2].strip().replace('|', '')
        newsFrom = dd_tail[0].strip()
        result = dict()
        result['date_time'] = date_time
        result['newsFrom'] = newsFrom
        yield pipeItem(['console'], result)
if __name__ == '__main__':
    # Wire up the pipelines and start crawling: 'console' prints the parsed
    # detail records, 'save' writes the downloaded images to disk.
    (SpiderCore(Zhu_Processor())
        .set_pipeline('console', ConsolePipeline())
        .set_pipeline('save', PicPipeline())
        .start())
Then start your Redis server and run the script:
python zhu_processor.py