@@ -5,6 +5,7 @@
 import string
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, urljoin
+from typing import List, Optional
 
 from ..errors import UrlDiff, UrlOccurencyDiff
 
@@ -98,7 +99,10 @@ def extract_urls(self, content, keep_placeholders=False):
 class UrlStatusChecker(object):
     retry_max_count = 3
 
-    def __init__(self, headers=None):
+    def __init__(self, headers=None, exclude_urls_regexs: Optional[List[str]] = None):
+        self._exclude_urls_regex = exclude_urls_regexs or []
+        if self._exclude_urls_regex:
+            logging.warning('Excluded urls regexps: {}'.format(self._exclude_urls_regex))
         self._headers = headers or {}
         if 'User-Agent' not in self._headers:
             self._headers['User-Agent'] = DEFAULT_USER_AGENT
@@ -133,9 +137,16 @@ def _is_valid(self, status_code, has_disallowed_chars):
         return (200 <= status_code < 300) and not has_disallowed_chars
 
     async def _check_urls_coro(self, urls, future):
-        tasks = [self._request_status_code(url.url) for url in urls]
+        urls_without_excluded = []
+        for url in urls:
+            is_excluded = any(re.match(regex, url.url) for regex in self._exclude_urls_regex)
+            if not is_excluded:
+                urls_without_excluded.append(url)
+            else:
+                logging.warning('url {} excluded from status check'.format(url.url))
+        tasks = [self._request_status_code(url.url) for url in urls_without_excluded]
         results = await asyncio.gather(*tasks)
-        for index, url in enumerate(urls):
+        for index, url in enumerate(urls_without_excluded):
             url.status_code = results[index]
             url.has_disallowed_chars = self._has_disallowed_chars(url.url)
         invalid_urls = filter(lambda u: not u.is_valid(), urls)
@@ -159,8 +170,9 @@ async def async_check(self, urls):
 class UrlValidator(object):
     _extractors = {'txt': TextUrlExtractor, 'html': HtmlUrlExtractor}
 
-    def __init__(self, filetype, headers=None, **kwargs):
+    def __init__(self, filetype, headers=None, exclude_status_check_regexs: Optional[List[str]] = None, **kwargs):
         self.client_headers = headers or {}
+        self._excluded_status_check_regexs = exclude_status_check_regexs or []
         extractor_class = self._extractors.get(filetype)
         if extractor_class is None:
             raise MissingUrlExtractorError('no extractor for filetype %s', filetype)
@@ -181,13 +193,13 @@ def _get_urls(self, data, parser, reader):
 
     def check(self, data, parser, reader):
         urls = self._get_urls(data, parser, reader)
-        checker = UrlStatusChecker(headers=self.client_headers)
+        checker = UrlStatusChecker(headers=self.client_headers, exclude_urls_regexs=self._excluded_status_check_regexs)
         invalid_urls = checker.check(urls.values())
         return invalid_urls
 
     async def async_check(self, data, parser, reader):
         urls = self._get_urls(data, parser, reader)
-        checker = UrlStatusChecker(headers=self.client_headers)
+        checker = UrlStatusChecker(headers=self.client_headers, exclude_urls_regexs=self._excluded_status_check_regexs)
         invalid_urls = await checker.async_check(urls.values())
         return invalid_urls
 
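Note on the exclusion check introduced above: it uses re.match, which anchors each pattern at the beginning of the URL, so a pattern excludes only URLs it matches from the first character onward; matching something in the middle of a URL needs a leading wildcard. Below is a minimal, self-contained sketch of the same filtering logic; the sample patterns and URLs are hypothetical and serve only to illustrate the matching semantics.

import re

# Hypothetical exclusion patterns; re.match anchors each one at the start of the URL.
exclude_urls_regexs = [
    r'https?://localhost',            # prefix match: any URL starting with http(s)://localhost
    r'.*\.internal\.example\.com',    # leading .* needed to match a host later in the string
]

urls = [
    'http://localhost:8000/docs',
    'https://api.internal.example.com/v1',
    'https://example.com/public',
]

for url in urls:
    is_excluded = any(re.match(regex, url) for regex in exclude_urls_regexs)
    print(url, '-> excluded from status check' if is_excluded else '-> checked')

With this change, callers pass such a list to UrlValidator as exclude_status_check_regexs; it is forwarded to UrlStatusChecker as exclude_urls_regexs, and excluded URLs are logged and skipped before the status requests are gathered.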