diff --git a/frontera/contrib/scrapy/schedulers/frontier.py b/frontera/contrib/scrapy/schedulers/frontier.py
index f83f08cfa..782a64c23 100644
--- a/frontera/contrib/scrapy/schedulers/frontier.py
+++ b/frontera/contrib/scrapy/schedulers/frontier.py
@@ -110,8 +110,7 @@ def process_spider_output(self, response, result, spider):
         for element in result:
             if isinstance(element, Request):
                 links.append(element)
-            else:
-                yield element
+            yield element
         frontier_request = response.meta[b'frontier_request']
         self.frontier.page_crawled(response)  # removed frontier part from .meta
         # putting it back, to persist .meta from original request
diff --git a/tests/test_frontera_scheduler.py b/tests/test_frontera_scheduler.py
index fe1b7a50a..3b6e1ed04 100644
--- a/tests/test_frontera_scheduler.py
+++ b/tests/test_frontera_scheduler.py
@@ -113,12 +113,18 @@ def test_next_request_overused_keys_info(self):
     def test_process_spider_output(self):
         i1 = {'name': 'item', 'item': 'i1'}
         i2 = {'name': 'item', 'item': 'i2'}
+        n_requests = 3
         result = [r1, r2, r3, i1, i2]
         resp = Response(fr1.url, request=Request(fr1.url, meta={b'frontier_request': fr1}))
         crawler = FakeCrawler()
         fs = FronteraScheduler(crawler, manager=FakeFrontierManager)
         fs.open(Spider)
-        assert sorted(list(fs.process_spider_output(resp, result, Spider)), key=lambda i: sorted(i['item'])) == \
+        out = list(fs.process_spider_output(resp, result, Spider))
+        assert len(out) == len(result)
+        out_requests = out[:n_requests]
+        assert set(r.url for r in out_requests) == set(r.url for r in result[:n_requests])
+        out_items = out[n_requests:]
+        assert sorted(out_items, key=lambda i: sorted(i['item'])) == \
             sorted([i1, i2], key=lambda i: sorted(i['item']))
         assert isinstance(fs.frontier.manager.responses[0], FResponse)
         assert fs.frontier.manager.responses[0].url == resp.url
diff --git a/tests/test_scrapy.py b/tests/test_scrapy.py
index e29608001..a08d35e05 100644
--- a/tests/test_scrapy.py
+++ b/tests/test_scrapy.py
@@ -1,12 +1,23 @@
 # -*- coding: utf-8 -*-
-
 from __future__ import absolute_import
-from frontera.contrib.scrapy.converters import RequestConverter, ResponseConverter
+
+import sys
+
+from scrapy.core.spidermw import SpiderMiddlewareManager
+from scrapy.http import Request, Response
 from scrapy.http.request import Request as ScrapyRequest
 from scrapy.http.response import Response as ScrapyResponse
-from frontera.core.models import Request as FrontierRequest
+from scrapy.spiders import Spider
+from scrapy.utils.test import get_crawler
+from twisted.internet.defer import Deferred
+from twisted.trial import unittest
 from w3lib.util import to_bytes
 
+from frontera.contrib.scrapy.converters import (RequestConverter,
+                                                ResponseConverter)
+from frontera.core.models import Request as FrontierRequest
+from frontera.contrib.scrapy.schedulers.frontier import FronteraScheduler
+
 
 class TestSpider(object):
     def callback(self):
@@ -75,3 +86,77 @@ def test_request_response_converters():
     frontier_request = FrontierRequest(url)
     request_converted = rc.from_frontier(frontier_request)
     assert frontier_request.url == url
+
+
+class TestFronteraMiddlewaresWithScrapy(unittest.TestCase):
+
+    def setUp(self):
+        class TestSpider(Spider):
+            name = 'test'
+
+        self.spider = TestSpider
+        scrapy_default_middlewares = {
+            'scrapy.spidermiddlewares.referer.RefererMiddleware': 700
+        }
+
+        # monkey-patch SPIDER_MIDDLEWARES_BASE so that only RefererMiddleware is enabled
+        sys.modules['scrapy.settings.default_settings'].SPIDER_MIDDLEWARES_BASE = scrapy_default_middlewares
+
+        custom_settings = {
+            'SPIDER_MIDDLEWARES': {'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 1000}
+        }
+        crawler = get_crawler(self.spider, custom_settings)
+        self.add_frontera_scheduler(crawler)
+        self.smw = SpiderMiddlewareManager.from_crawler(crawler)
+
+    @staticmethod
+    def add_frontera_scheduler(crawler):
+        scheduler = FronteraScheduler(crawler)
+
+        # stub out frontier and stats calls; the test only checks middleware output
+        scheduler.frontier.page_crawled = lambda x: x
+        scheduler.frontier.links_extracted = lambda x, y: x
+        scheduler.stats_manager.add_crawled_page = lambda x, y: x
+
+        class Engine(object):
+            def __init__(self, scheduler):
+                self.slot = type('slot', (object,), {})
+                self.slot.scheduler = scheduler
+
+        crawler.engine = Engine(scheduler)
+
+    def test_frontera_scheduler_spider_middleware_with_referer_middleware(self):
+
+        def request_callback(response):
+            yield Request('http://frontera.org')
+
+        req = Request(
+            url='http://www.scrapy.org',
+            callback=request_callback,
+            meta={b'frontier_request': FrontierRequest('http://www.scrapy.org')}
+        )
+
+        res = Response(url='http://www.scrapy.org', request=req)
+
+        def call_request_callback(result, request, spider):
+            dfd = Deferred()
+            dfd.addCallback(request.callback)
+            return dfd
+
+        def test_middleware_output(result):
+            out = list(result)
+            self.assertEqual(len(out), 1)
+            self.assertIsInstance(out[0], Request)
+            self.assertIn('Referer', out[0].headers)
+            self.assertEqual(out[0].headers['Referer'], to_bytes(res.url))
+
+        def test_failure(failure):
+            # workaround: record the failure so the test fails with a full traceback
+            self._observer._errors.append(failure)
+
+        dfd = self.smw.scrape_response(call_request_callback, res, req, self.spider)
+
+        dfd.addCallback(test_middleware_output)
+        dfd.addErrback(test_failure)
+
+        dfd.callback(res)
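
The net effect of the scheduler change above: extracted Request objects are now
both recorded for the frontier and yielded onward, so outer spider middlewares
such as RefererMiddleware still see them. A minimal standalone sketch of that
generator contract (not frontera's actual classes; the `links` list stands in
for the scheduler's collected links):

    from scrapy.http import Request

    def process_spider_output(result, links):
        for element in result:
            if isinstance(element, Request):
                links.append(element)  # still collected for the frontier
            yield element              # and passed down the middleware chain

    links = []
    result = [Request('http://frontera.org'), {'item': 'i1'}]
    out = list(process_spider_output(result, links))
    assert len(out) == 2    # the request is no longer swallowed
    assert len(links) == 1  # the frontier still receives it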