From 284e05dd03856b0351c398b8bc32c70ce20b16fb Mon Sep 17 00:00:00 2001 From: alsrua7222 <59680587+alsrua7222@users.noreply.github.com> Date: Fri, 7 Jan 2022 01:58:24 +0900 Subject: [PATCH 1/2] solution work for FreeProxy --- .../requests/parsers/FreeProxyParser.py | 2 +- .../requests/parsers/PremProxyParser.py | 16 ++++++++++++---- .../requests/parsers/js/UnPacker.py | 4 ++-- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/http_request_randomizer/requests/parsers/FreeProxyParser.py b/http_request_randomizer/requests/parsers/FreeProxyParser.py index 112a26d..5ae167b 100644 --- a/http_request_randomizer/requests/parsers/FreeProxyParser.py +++ b/http_request_randomizer/requests/parsers/FreeProxyParser.py @@ -24,7 +24,7 @@ def parse_proxyList(self): content = response.content soup = BeautifulSoup(content, "html.parser") - table = soup.find("table", attrs={"id": "proxylisttable"}) + table = soup.find("table", attrs={"class": "table table-striped table-bordered"}) # The first tr contains the field names. headings = [th.get_text() for th in table.find("tr").find_all("th")] diff --git a/http_request_randomizer/requests/parsers/PremProxyParser.py b/http_request_randomizer/requests/parsers/PremProxyParser.py index 0fed99f..5a3379c 100644 --- a/http_request_randomizer/requests/parsers/PremProxyParser.py +++ b/http_request_randomizer/requests/parsers/PremProxyParser.py @@ -7,6 +7,8 @@ from http_request_randomizer.requests.parsers.UrlParser import UrlParser from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel, Protocol +from http_request_randomizer.requests.useragent.userAgent import UserAgentManager + logger = logging.getLogger(__name__) __author__ = 'pgaref' @@ -18,6 +20,12 @@ def __init__(self, id, web_url, timeout=None): web_url += "/list/" # Ports decoded by the JS unpacker self.js_unpacker = None + self.useragent = UserAgentManager() + self.headers = { + "User-Agent": self.useragent.get_random_user_agent(), + "Origin": self.base_url, + "Referer": self.base_url + } UrlParser.__init__(self, id=id, web_url=web_url, timeout=timeout) def parse_proxyList(self): @@ -31,7 +39,7 @@ def parse_proxyList(self): self.js_unpacker = self.init_js_unpacker() for page in page_set: - response = requests.get("{0}{1}".format(self.get_url(), page), timeout=self.timeout) + response = requests.get("{0}{1}".format(self.get_url(), page), timeout=self.timeout, headers=self.headers) if not response.ok: # Could not parse ANY page - Let user know if not curr_proxy_list: @@ -65,7 +73,7 @@ def parse_proxyList(self): return curr_proxy_list def get_pagination_set(self): - response = requests.get(self.get_url(), timeout=self.timeout) + response = requests.get(self.get_url(), timeout=self.timeout, headers=self.headers) page_set = set() # Could not parse pagination page - Let user know if not response.ok: @@ -84,7 +92,7 @@ def get_pagination_set(self): return page_set def init_js_unpacker(self): - response = requests.get(self.get_url(), timeout=self.timeout) + response = requests.get(self.get_url(), timeout=self.timeout, headers=self.headers) # Could not parse provider page - Let user know if not response.ok: logger.warning("Proxy Provider url failed: {}".format(self.get_url())) @@ -96,7 +104,7 @@ def init_js_unpacker(self): for script in soup.findAll('script'): if '/js/' in script.get('src'): jsUrl = self.base_url + script.get('src') - return JsUnPacker(jsUrl) + return JsUnPacker(jsUrl, headers=self.headers) return None def create_proxy_object(self, row, port): diff --git a/http_request_randomizer/requests/parsers/js/UnPacker.py b/http_request_randomizer/requests/parsers/js/UnPacker.py index 2383362..947b920 100644 --- a/http_request_randomizer/requests/parsers/js/UnPacker.py +++ b/http_request_randomizer/requests/parsers/js/UnPacker.py @@ -14,9 +14,9 @@ class JsUnPacker(object): """ # TODO: it might not be necessary to unpack the js code - def __init__(self, js_file_url): + def __init__(self, js_file_url, headers=None): logger.info("JS UnPacker init path: {}".format(js_file_url)) - r = requests.get(js_file_url) + r = requests.get(js_file_url, headers=headers) encrypted = r.text.strip() encrypted = '(' + encrypted.split('}(')[1][:-1] unpacked = eval('self.unpack' +encrypted) # string of the js code in unpacked form From 9ab14148becf58e39292e479629ef08a265bd6a3 Mon Sep 17 00:00:00 2001 From: alsrua7222 <59680587+alsrua7222@users.noreply.github.com> Date: Wed, 12 Jan 2022 23:38:45 +0900 Subject: [PATCH 2/2] test_freeProxy html update --- tests/mocks.py | 134 +++++++++++++++++++++++-------------------------- 1 file changed, 63 insertions(+), 71 deletions(-) diff --git a/tests/mocks.py b/tests/mocks.py index 78320bd..bff3d5f 100644 --- a/tests/mocks.py +++ b/tests/mocks.py @@ -70,77 +70,69 @@ def sslproxy_mock(url, request): @urlmatch(netloc=r'(.*\.)?free-proxy-list\.net$') def free_proxy_mock(url, request): - return """\n -\n -\n - - \n - - \n - - \n - - \n - - \n - - \n - - \n - - \n - -\n - -\n - - - - - - - - - - - -\n - - - - - - - - - - -\n - -\n -\n -\n - - \n - - \n - - \n - - \n - - \n - - \n - - \n - - \n - -\n - -\n + return """
IP AddressPortCodeCountryAnonymityGoogleHttpsLast Checked
138.197.136.463128CACanadaanonymousnono7 seconds ago
177.207.75.2278080BRBraziltransparentnono2 hours 21 minutes ago
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
IP AddressPortCodeCountryAnonymityGoogleHttpsLast Checked
58.234.116.1978193KRKoreaanonymousyesno1 min ago
20.122.24.22580USUnited Statesanonymousyesno1 min ago
154.236.177.1001981EGEgyptelite proxyyesyes1 min ago
54.37.160.921080FRFranceelite proxynoyes1 min ago
110.232.78.5555667IDIndonesiaanonymousnoyes1 min ago
"""