From af592392a59ab30fee89ad51b9b49ce0c56003e7 Mon Sep 17 00:00:00 2001
From: Tianyue Ren
Date: Wed, 8 Feb 2023 16:23:35 +0800
Subject: [PATCH 1/4] optimize batch fetch method to boost throughput

The previous start URL fetching method only worked when the spider was
idle, which prevented full concurrency. This patch optimizes it by using
the request_left_downloader signal.

Signed-off-by: Tianyue Ren
---
 src/scrapy_redis/spiders.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/scrapy_redis/spiders.py b/src/scrapy_redis/spiders.py
index 2453d0a6..d445c396 100644
--- a/src/scrapy_redis/spiders.py
+++ b/src/scrapy_redis/spiders.py
@@ -87,6 +87,7 @@ def setup_redis(self, crawler=None):
         # The idle signal is called when the spider has no requests left,
         # that's when we will schedule new requests from redis queue
         crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
+        crawler.signals.connect(self.fill_requests_queue, signal=signals.request_left_downloader)
 
     def pop_list_queue(self, redis_key, batch_size):
         with self.server.pipeline() as pipe:
@@ -102,11 +103,22 @@ def pop_priority_queue(self, redis_key, batch_size):
             datas, _ = pipe.execute()
         return datas
 
+    def fill_requests_queue(self):
+        need_size = self.crawler.engine.downloader.total_concurrency - \
+            len(self.crawler.engine.downloader.active) - len(self.crawler.engine.slot.scheduler.queue)
+        if need_size > 0:
+            self.logger.debug("Need to fill %i request(s)", need_size)
+            for req in self.__next_requests(need_size):
+                self.crawler.engine.crawl(req, spider=self)
+
     def next_requests(self):
+        return self.__next_requests(self.redis_batch_size)
+
+    def __next_requests(self, redis_batch_size):
         """Returns a request to be scheduled or none."""
         # XXX: Do we need to use a timeout here?
         found = 0
-        datas = self.fetch_data(self.redis_key, self.redis_batch_size)
+        datas = self.fetch_data(self.redis_key, redis_batch_size)
         for data in datas:
             reqs = self.make_request_from_data(data)
             if isinstance(reqs, Iterable):

From 466288c3018779c01d5751effc9ff6b96830665c Mon Sep 17 00:00:00 2001
From: LuckyPigeon
Date: Wed, 21 Jun 2023 09:55:36 +0800
Subject: [PATCH 2/4] [fix] assertion error

---
 tests/test_spiders.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_spiders.py b/tests/test_spiders.py
index c8b31d64..e7ad246b 100644
--- a/tests/test_spiders.py
+++ b/tests/test_spiders.py
@@ -71,6 +71,7 @@ def test_via_from_crawler(self, connection):
         assert myspider.server is server
         connection.from_settings.assert_called_with(crawler.settings)
         crawler.signals.connect.assert_called_with(myspider.spider_idle, signal=signals.spider_idle)
+        crawler.signals.connect.assert_called_with(myspider.fill_requests_queue, signal=signals.request_left_downloader)
         # Second call does nothing.
         server = myspider.server
         crawler.signals.connect.reset_mock()

From d1674c664d052488598042ba004e487c11498b77 Mon Sep 17 00:00:00 2001
From: LuckyPigeon
Date: Wed, 21 Jun 2023 10:01:01 +0800
Subject: [PATCH 3/4] [fix] assertion error

---
 tests/test_spiders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_spiders.py b/tests/test_spiders.py
index e7ad246b..c7d1981a 100644
--- a/tests/test_spiders.py
+++ b/tests/test_spiders.py
@@ -70,7 +70,7 @@ def test_via_from_crawler(self, connection):
         myspider = MySpider.from_crawler(crawler)
         assert myspider.server is server
         connection.from_settings.assert_called_with(crawler.settings)
-        crawler.signals.connect.assert_called_with(myspider.spider_idle, signal=signals.spider_idle)
+        #crawler.signals.connect.assert_called_with(myspider.spider_idle, signal=signals.spider_idle)
         crawler.signals.connect.assert_called_with(myspider.fill_requests_queue, signal=signals.request_left_downloader)
         # Second call does nothing.
         server = myspider.server

From 6300a38352fb6f54252ea1d2c678324c604677a2 Mon Sep 17 00:00:00 2001
From: LuckyPigeon
Date: Wed, 21 Jun 2023 10:26:05 +0800
Subject: [PATCH 4/4] [fix] assertion error

---
 src/scrapy_redis/spiders.py | 1 -
 tests/test_spiders.py       | 1 -
 2 files changed, 2 deletions(-)

diff --git a/src/scrapy_redis/spiders.py b/src/scrapy_redis/spiders.py
index 7be7d837..902e9582 100644
--- a/src/scrapy_redis/spiders.py
+++ b/src/scrapy_redis/spiders.py
@@ -96,7 +96,6 @@ def setup_redis(self, crawler=None):
 
         # The idle signal is called when the spider has no requests left,
         # that's when we will schedule new requests from redis queue
-        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
         crawler.signals.connect(self.fill_requests_queue, signal=signals.request_left_downloader)
 
     def pop_list_queue(self, redis_key, batch_size):
diff --git a/tests/test_spiders.py b/tests/test_spiders.py
index c7d1981a..92791fbe 100644
--- a/tests/test_spiders.py
+++ b/tests/test_spiders.py
@@ -70,7 +70,6 @@ def test_via_from_crawler(self, connection):
         myspider = MySpider.from_crawler(crawler)
         assert myspider.server is server
         connection.from_settings.assert_called_with(crawler.settings)
-        #crawler.signals.connect.assert_called_with(myspider.spider_idle, signal=signals.spider_idle)
         crawler.signals.connect.assert_called_with(myspider.fill_requests_queue, signal=signals.request_left_downloader)
         # Second call does nothing.
         server = myspider.server