diff --git a/docs/source/topics/frontera-settings.rst b/docs/source/topics/frontera-settings.rst
index 805fc52f7..1c4fea305 100644
--- a/docs/source/topics/frontera-settings.rst
+++ b/docs/source/topics/frontera-settings.rst
@@ -270,7 +270,7 @@ OVERUSED_SLOT_FACTOR
 Default: ``5.0``

 (in progress + queued requests in that slot) / max allowed concurrent downloads per slot before slot is considered
-overused. This affects only Scrapy scheduler."
+overused. This affects only the custom Frontera Scrapy scheduler.

 .. setting:: REQUEST_MODEL

diff --git a/docs/source/topics/scrapy-integration.rst b/docs/source/topics/scrapy-integration.rst
index 56422ea70..5ec19198c 100644
--- a/docs/source/topics/scrapy-integration.rst
+++ b/docs/source/topics/scrapy-integration.rst
@@ -1,6 +1,6 @@
-==============================
-Using the Frontier with Scrapy
-==============================
+==========================
+Using Frontera with Scrapy
+==========================

 Using Frontera is quite easy, it includes a set of `Scrapy middlewares`_ and Scrapy scheduler that encapsulates
 Frontera usage and can be easily configured using `Scrapy settings`_.
@@ -9,7 +9,7 @@ Frontera usage and can be easily configured using `Scrapy settings`_.
 Activating the frontier
 =======================

-The Frontera uses 2 different middlewares: ``SchedulerSpiderMiddleware`` and ``SchedulerDownloaderMiddleware``, and it's
+Frontera uses two different middlewares, ``SchedulerSpiderMiddleware`` and ``SchedulerDownloaderMiddleware``, and its
 own scheduler ``FronteraScheduler``.

 To activate the Frontera in your Scrapy project, just add them to the `SPIDER_MIDDLEWARES`_,
@@ -31,6 +31,8 @@
 Create a Frontera ``settings.py`` file and add it to your Scrapy settings::

 Another option is to put these settings right into Scrapy settings module.
+The custom Frontera scheduler filters out all requests generated by middlewares and the Scrapy engine, except
+redirected ones and those explicitly marked as seeds (see `seed_loaders`_).


 Organizing files
@@ -161,6 +163,7 @@ Check also `Scrapy broad crawling`_ recommendations.
 .. _`Quick start single process`: http://frontera.readthedocs.org/en/latest/topics/quick-start-single.html
 .. _`Scrapy broad crawling`: http://doc.scrapy.org/en/master/topics/broad-crawls.html

+.. _seed_loaders:

 Scrapy Seed Loaders
 ===================
@@ -170,6 +173,22 @@ Frontera has some built-in Scrapy middlewares for seed loading.

 Seed loaders use the ``process_start_requests`` method to generate requests from a source that are added later to the
 :class:`FrontierManager <frontera.core.manager.FrontierManager>`.
+It's possible to inject seeds from any other spider middleware by setting the ``seed`` key in a request's ``meta``
+dictionary to ``True``.
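+
+For example, a hypothetical spider middleware (the class name and URL below are illustrative) could inject an
+additional seed like this::
+
+    from scrapy.http import Request
+
+    class ExtraSeedsMiddleware(object):
+        # illustrative middleware; any spider middleware can mark requests this way
+        def process_start_requests(self, start_requests, spider):
+            # pass through requests produced by earlier middlewares untouched
+            for request in start_requests:
+                yield request
+            # inject one extra request, explicitly marked as a seed
+            yield Request('http://example.com/extra-seed', meta={'seed': True})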

 Activating a Seed loader
 ------------------------

diff --git a/frontera/contrib/scrapy/middlewares/seeds/__init__.py b/frontera/contrib/scrapy/middlewares/seeds/__init__.py
index 09cd0b7cd..99fd62752 100644
--- a/frontera/contrib/scrapy/middlewares/seeds/__init__.py
+++ b/frontera/contrib/scrapy/middlewares/seeds/__init__.py
@@ -15,7 +15,12 @@ def from_crawler(cls, crawler):

     def process_start_requests(self, start_requests, spider):
         urls = [url for url in self.load_seeds() if not url.startswith('#')]
-        return [spider.make_requests_from_url(url) for url in urls]
+        for url in urls:
+            r = spider.make_requests_from_url(url)
+            r.meta['seed'] = True
+            yield r
+        for r in start_requests:
+            yield r

     def load_seeds(self):
         raise NotImplementedError
diff --git a/frontera/contrib/scrapy/schedulers/frontier.py b/frontera/contrib/scrapy/schedulers/frontier.py
index 782a64c23..694b6fbed 100644
--- a/frontera/contrib/scrapy/schedulers/frontier.py
+++ b/frontera/contrib/scrapy/schedulers/frontier.py
@@ -72,31 +72,43 @@ def _set_value(self, variable, value):


 class FronteraScheduler(Scheduler):
+    """
+    Custom scheduler for Scrapy.
+    Adapts the Frontera manager interface to Scrapy.
+
+    Important remarks:
+    - it doesn't enqueue the majority of requests produced by middlewares or by direct calls to the engine; see the
+      enqueue_request method, and override it if needed,
+    - it requires SchedulerSpiderMiddleware and SchedulerDownloaderMiddleware.
+    """

     def __init__(self, crawler, manager=None):
         self.crawler = crawler
         self.stats_manager = StatsManager(crawler.stats)
         self._pending_requests = deque()
-        self.redirect_enabled = crawler.settings.get('REDIRECT_ENABLED')
         settings = ScrapySettingsAdapter(crawler.settings)
         self.frontier = ScrapyFrontierManager(settings, manager)
         self._delay_on_empty = self.frontier.manager.settings.get('DELAY_ON_EMPTY')
+        self._redirect_enabled = crawler.settings.get('REDIRECT_ENABLED')
         self._delay_next_call = 0.0
-        self.logger = getLogger('frontera.contrib.scrapy.schedulers.FronteraScheduler')
+        self.logger = getLogger('frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler')

     @classmethod
     def from_crawler(cls, crawler):
         return cls(crawler)

     def enqueue_request(self, request):
-        if not self._request_is_redirected(request):
-            self.frontier.add_seeds([request])
-            self.stats_manager.add_seeds()
-            return True
-        elif self.redirect_enabled:
+        # add directly to the in-memory queue if the request comes as part of a redirect chain from RedirectMiddleware
+        if self._redirect_enabled and self._request_is_redirected(request):
             self._add_pending_request(request)
             self.stats_manager.add_redirected_requests()
             return True
+        # add as a seed if the request is explicitly marked as one
+        if self._request_is_seed(request):
+            self.frontier.add_seeds([request])
+            self.stats_manager.add_seeds()
+            return True
+        self.logger.warning("Request to URL %s is skipped.", request.url)
         return False

     def next_request(self):
@@ -166,7 +178,10 @@ def _get_exception_code(self, exception):
         return '?'

     def _request_is_redirected(self, request):
-        return request.meta.get(b'redirect_times', 0) > 0
+        return request.meta.get('redirect_times', 0) > 0
+
+    def _request_is_seed(self, request):
+        return bool(request.meta.get('seed', False))

     def _get_downloader_info(self):
         downloader = self.crawler.engine.downloader
diff --git a/tests/scrapy_spider/spiders/example.py b/tests/scrapy_spider/spiders/example.py
index 000c7c3b8..341de48e5 100644
--- a/tests/scrapy_spider/spiders/example.py
+++ b/tests/scrapy_spider/spiders/example.py
@@ -1,7 +1,7 @@
 from __future__ import absolute_import
 from scrapy.linkextractors import LinkExtractor
 from scrapy.spiders import CrawlSpider, Rule
-
+from scrapy.http import Request

 class MySpider(CrawlSpider):
     name = 'example'
@@ -18,4 +18,7 @@ def parse_page(self, response):

     def parse_nothing(self, response):
         pass
+    def make_requests_from_url(self, url):
+        return Request(url, dont_filter=True, meta={'seed': True})
+
     parse_start_url = parse_nothing
diff --git a/tests/test_frontera_scheduler.py b/tests/test_frontera_scheduler.py
index 3b6e1ed04..ac68340bc 100644
--- a/tests/test_frontera_scheduler.py
+++ b/tests/test_frontera_scheduler.py
@@ -11,15 +11,15 @@


 # test requests
-r1 = Request('http://www.example.com')
-r2 = Request('https://www.example.com/some/page')
-r3 = Request('http://example1.com')
+r1 = Request('http://www.example.com', meta={'seed': True})
+r2 = Request('https://www.example.com/some/page', meta={'seed': True})
+r3 = Request('http://example1.com', meta={'seed': True})


 # test requests with redirects
-rr1 = Request('http://www.example.com', meta={b'redirect_times': 1})
-rr2 = Request('https://www.example.com/some/page', meta={b'redirect_times': 4})
-rr3 = Request('http://example1.com', meta={b'redirect_times': 0})
+rr1 = Request('http://www.example.com', meta={'redirect_times': 1})
+rr2 = Request('https://www.example.com/some/page', meta={'redirect_times': 4})
+rr3 = Request('http://example1.com', meta={'redirect_times': 0})


 # test frontier requests
@@ -49,11 +49,10 @@ def test_redirect_disabled_enqueue_requests(self):
         fs.open(Spider)
         assert fs.enqueue_request(rr1) is False
         assert fs.enqueue_request(rr2) is False
-        assert fs.enqueue_request(rr3) is True
-        assert isinstance(fs.frontier.manager.seeds[0], FRequest)
-        assert len(fs.frontier.manager.seeds) == 1
-        assert fs.frontier.manager.seeds[0].url == rr3.url
-        assert fs.stats_manager.stats.get_value('frontera/seeds_count') == 1
+        assert fs.enqueue_request(rr3) is False
+        assert len(fs.frontier.manager.seeds) == 0
+        assert fs.stats_manager.stats.get_value('frontera/seeds_count') is None
+

     def test_redirect_enabled_enqueue_requests(self):
         settings = Settings()
@@ -63,13 +62,10 @@ def test_redirect_enabled_enqueue_requests(self):
         fs.open(Spider)
         assert fs.enqueue_request(rr1) is True
         assert fs.enqueue_request(rr2) is True
-        assert fs.enqueue_request(rr3) is True
-        assert len(fs.frontier.manager.seeds) == 1
-        assert isinstance(fs.frontier.manager.seeds[0], FRequest)
-        assert fs.frontier.manager.seeds[0].url == rr3.url
+        assert fs.enqueue_request(rr3) is False
         assert set([request.url for request in fs._pending_requests]) == set([rr1.url, rr2.url])
         assert all([isinstance(request, Request) for request in fs._pending_requests])
-        assert fs.stats_manager.stats.get_value('frontera/seeds_count') == 1
+        assert fs.stats_manager.stats.get_value('frontera/seeds_count') is None
         assert fs.stats_manager.stats.get_value('frontera/redirected_requests_count') == 2

     def test_next_request(self):
diff --git a/tests/test_seed_loader.py b/tests/test_seed_loader.py
index bc512e2a9..7aea5a082 100644
--- a/tests/test_seed_loader.py
+++ b/tests/test_seed_loader.py
@@ -49,7 +49,7 @@ def test_load_seeds(self):

     def test_process_start_requests(self):
         seed_loader = self.seed_loader_setup()
-        requests = seed_loader.process_start_requests(None, Spider(name='spider'))
+        requests = seed_loader.process_start_requests((), Spider(name='spider'))
         self.assertEqual([r.url for r in requests], ['https://www.example.com', 'https://www.scrapy.org'])

     def test_process_start_requests_ignore_comments(self):
@@ -60,7 +60,7 @@ def test_process_start_requests_ignore_comments(self):
             # https://www.test.com
         """
         seed_loader = self.seed_loader_setup(seeds_content)
-        requests = seed_loader.process_start_requests(None, Spider(name='spider'))
+        requests = seed_loader.process_start_requests((), Spider(name='spider'))
         self.assertEqual([r.url for r in requests], ['https://www.example.com', 'https://www.scrapy.org'])


@@ -123,5 +123,5 @@ def mocked_connect_s3(*args, **kwargs):

         with mock.patch('frontera.contrib.scrapy.middlewares.seeds.s3.connect_s3',
                         side_effect=mocked_connect_s3):
-            requests = self.seed_loader.process_start_requests(None, Spider(name='spider'))
+            requests = self.seed_loader.process_start_requests((), Spider(name='spider'))
             self.assertEqual(set([r.url for r in requests]), set(urls))