Commit b1910e5
committed
adds: argument allowing status checks to be skipped for specified URLs using a regex
1 parent: 30e5910

File tree: 2 files changed, +19 -7 lines

setup.py
Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@
 from setuptools import setup, find_packages
 
 
-version = '0.6.1'
+version = '0.7.0'
 
 
 def read(f):

validator/checks/url.py
Lines changed: 18 additions & 6 deletions

@@ -5,6 +5,7 @@
 import string
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, urljoin
+from typing import List, Optional
 
 from ..errors import UrlDiff, UrlOccurencyDiff
 
@@ -98,7 +99,10 @@ def extract_urls(self, content, keep_placeholders=False):
 class UrlStatusChecker(object):
     retry_max_count = 3
 
-    def __init__(self, headers=None):
+    def __init__(self, headers=None, exclude_urls_regexs: Optional[List[str]] = None):
+        self._exclude_urls_regex = exclude_urls_regexs or []
+        if self._exclude_urls_regex:
+            logging.warning('Excluded urls regexps: {}'.format(self._exclude_urls_regex))
         self._headers = headers or {}
         if 'User-Agent' not in self._headers:
             self._headers['User-Agent'] = DEFAULT_USER_AGENT
@@ -133,9 +137,16 @@ def _is_valid(self, status_code, has_disallowed_chars):
         return (200 <= status_code < 300) and not has_disallowed_chars
 
     async def _check_urls_coro(self, urls, future):
-        tasks = [self._request_status_code(url.url) for url in urls]
+        urls_without_excluded = []
+        for url in urls:
+            is_exluded = any(re.match(regex, url.url) for regex in self._exclude_urls_regex)
+            if not is_exluded:
+                urls_without_excluded.append(url)
+            else:
+                logging.warning('url {} excluded from status check'.format(url.url))
+        tasks = [self._request_status_code(url.url) for url in urls_without_excluded]
         results = await asyncio.gather(*tasks)
-        for index, url in enumerate(urls):
+        for index, url in enumerate(urls_without_excluded):
             url.status_code = results[index]
             url.has_disallowed_chars = self._has_disallowed_chars(url.url)
         invalid_urls = filter(lambda u: not u.is_valid(), urls)
@@ -159,8 +170,9 @@ async def async_check(self, urls):
 class UrlValidator(object):
     _extractors = {'txt': TextUrlExtractor, 'html': HtmlUrlExtractor}
 
-    def __init__(self, filetype, headers=None, **kwargs):
+    def __init__(self, filetype, headers=None, exclude_status_check_regexs: Optional[List[str]] = None, **kwargs):
         self.client_headers = headers or {}
+        self._excluded_status_check_regexs = exclude_status_check_regexs or []
         extractor_class = self._extractors.get(filetype)
         if extractor_class is None:
             raise MissingUrlExtractorError('no extractor for filetype %s', filetype)
@@ -181,13 +193,13 @@ def _get_urls(self, data, parser, reader):
 
     def check(self, data, parser, reader):
         urls = self._get_urls(data, parser, reader)
-        checker = UrlStatusChecker(headers=self.client_headers)
+        checker = UrlStatusChecker(headers=self.client_headers, exclude_urls_regexs=self._excluded_status_check_regexs)
         invalid_urls = checker.check(urls.values())
         return invalid_urls
 
     async def async_check(self, data, parser, reader):
         urls = self._get_urls(data, parser, reader)
-        checker = UrlStatusChecker(headers=self.client_headers)
+        checker = UrlStatusChecker(headers=self.client_headers, exclude_urls_regexs=self._excluded_status_check_regexs)
         invalid_urls = await checker.async_check(urls.values())
         return invalid_urls
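Usage note (not part of the commit): a minimal sketch of how the new argument might be passed, assuming the import path validator.checks.url (taken from the file location) and the example header and patterns shown below; re and logging are expected to already be imported elsewhere in url.py. Because the checker filters with re.match(), each pattern is anchored at the start of the URL.

# Sketch only: import path, header value, and patterns are illustrative assumptions.
from validator.checks.url import UrlStatusChecker

checker = UrlStatusChecker(
    headers={'User-Agent': 'my-validator/1.0'},       # optional custom headers
    exclude_urls_regexs=[
        r'https?://localhost',                        # skip locally served URLs
        r'https://intranet\.example\.com/',           # skip a host unreachable from CI (example)
    ],
)
# URLs matching any pattern are logged via logging.warning() and skipped;
# all other URLs are requested and validated as before.

The same patterns can also be supplied to UrlValidator through its new exclude_status_check_regexs argument, which forwards them to the UrlStatusChecker it constructs.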
