From e1dc20ef0f7316e0a72b50b65d8f8a4637c7eb30 Mon Sep 17 00:00:00 2001 From: MustafaGamalMuhammed <45317758+MustafaGamalMuhammed@users.noreply.github.com> Date: Sat, 20 Jul 2019 05:25:24 +0200 Subject: [PATCH] fixing a bug in the rules: the allow argument of LinkExtractor is supposed to be an absolute link, not a relative one. --- Chapter05_Scrapy/wikiSpider/wikiSpider/articlesMoreRules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Chapter05_Scrapy/wikiSpider/wikiSpider/articlesMoreRules.py b/Chapter05_Scrapy/wikiSpider/wikiSpider/articlesMoreRules.py index efe28b3..9944b0b 100644 --- a/Chapter05_Scrapy/wikiSpider/wikiSpider/articlesMoreRules.py +++ b/Chapter05_Scrapy/wikiSpider/wikiSpider/articlesMoreRules.py @@ -6,7 +6,7 @@ class ArticleSpider(CrawlSpider): allowed_domains = ['wikipedia.org'] start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life'] rules = [ - Rule(LinkExtractor(allow='^(/wiki/)((?!:).)*$'), callback='parse_items', follow=True, cb_kwargs={'is_article': True}), + Rule(LinkExtractor(allow='^(https://en.wikipedia.org/wiki/)((?!:).)*$'), callback='parse_items', follow=True, cb_kwargs={'is_article': True}), Rule(LinkExtractor(allow='.*'), callback='parse_items', cb_kwargs={'is_article': False}) ]