
Added: A DeltaFetchPseudoItem for storing requests with no items yielded #19

Open · wants to merge 1 commit into base: master
scrapy_deltafetch/middleware.py (14 additions, 1 deletion)
```diff
@@ -14,6 +14,15 @@
 logger = logging.getLogger(__name__)
 
 
+class DeltaFetchPseudoItem(dict):
+    """
+    A pseudo item class to be used when:
+    - No actual item shall be generated from a page, and
+    - The page shall be skipped in future runs
+    """
+    pass
+
+
 class DeltaFetch(object):
     """
     This is a spider middleware to ignore requests to pages containing items
```
```diff
@@ -86,7 +95,11 @@ def process_spider_output(self, response, result, spider):
                 self.db[key] = str(time.time())
                 if self.stats:
                     self.stats.inc_value('deltafetch/stored', spider=spider)
-            yield r
+                    if isinstance(r, DeltaFetchPseudoItem):
+                        reason = r.get('reason', 'pseudo_item')
+                        self.stats.inc_value('deltafetch/stored/%s' % reason, spider=spider)
+            if not isinstance(r, DeltaFetchPseudoItem):
+                yield r
 
     def _get_key(self, request):
         key = request.meta.get('deltafetch_key') or request_fingerprint(request)
```
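For context (not part of the diff): a minimal sketch of how a spider might yield the new pseudo item, assuming scrapy-deltafetch is enabled via `SPIDER_MIDDLEWARES`. The spider name, URL, selectors, and the `'empty_page'` reason below are illustrative, not from this PR.

```python
# Hypothetical spider; names, URL, and selectors are made up for illustration.
import scrapy

from scrapy_deltafetch.middleware import DeltaFetchPseudoItem


class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['http://example.com/listing']

    def parse(self, response):
        products = response.css('.product')
        if not products:
            # No real item on this page: yielding the pseudo item makes
            # DeltaFetch store the request key, so the page is skipped on
            # future runs. 'reason' feeds the 'deltafetch/stored/<reason>'
            # stat and defaults to 'pseudo_item' when omitted.
            yield DeltaFetchPseudoItem(reason='empty_page')
            return
        for product in products:
            yield {'name': product.css('::text').get()}
```

Because `DeltaFetchPseudoItem` subclasses `dict`, the middleware treats it like a regular item when storing the request key, but with this change it is swallowed rather than yielded downstream.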
tests/test_deltafetch.py (17 additions, 1 deletion)
```diff
@@ -13,7 +13,7 @@
 from scrapy.statscollectors import StatsCollector
 from scrapy.utils.test import get_crawler
 
-from scrapy_deltafetch.middleware import DeltaFetch
+from scrapy_deltafetch.middleware import DeltaFetch, DeltaFetchPseudoItem
 
 
 dbmodule = None
```
```diff
@@ -201,6 +201,22 @@ def test_process_spider_output(self):
                               b'test_key_2']))
         assert mw.db[b'key']
 
+    def test_process_spider_output_pseudo_item(self):
+        self._create_test_db()
+        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
+        mw.spider_opened(self.spider)
+        response = mock.Mock()
+        response.request = Request('http://url',
+                                   meta={'deltafetch_key': 'key'})
+        result = [DeltaFetchPseudoItem(reason='skip')]
+        self.assertEqual(list(mw.process_spider_output(
+            response, result, self.spider)), [])
+        self.assertEqual(set(mw.db.keys()),
+                         set([b'key',
+                              b'test_key_1',
+                              b'test_key_2']))
+        assert mw.db[b'key']
+
     def test_process_spider_output_dict(self):
         self._create_test_db()
         mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
```
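A possible follow-up test, sketched here rather than taken from the PR, asserting the per-reason stat that this change introduces. It reuses the fixtures visible in this file (`self.temp_dir`, `self.stats`, `self.spider`, `self.mwcls`) and Scrapy's standard `StatsCollector.get_value` accessor, and assumes each test starts with a fresh stats collector.

```python
# Sketch only: not part of this PR's diff.
def test_process_spider_output_pseudo_item_stats(self):
    self._create_test_db()
    mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
    mw.spider_opened(self.spider)
    response = mock.Mock()
    response.request = Request('http://url', meta={'deltafetch_key': 'key'})
    # 'skip' becomes the suffix of the per-reason stat key.
    result = [DeltaFetchPseudoItem(reason='skip')]
    # The pseudo item is stored but never passed downstream.
    self.assertEqual(
        list(mw.process_spider_output(response, result, self.spider)), [])
    # Both the generic and the per-reason counters are incremented once.
    self.assertEqual(
        self.stats.get_value('deltafetch/stored', spider=self.spider), 1)
    self.assertEqual(
        self.stats.get_value('deltafetch/stored/skip', spider=self.spider), 1)
```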