From 284e05dd03856b0351c398b8bc32c70ce20b16fb Mon Sep 17 00:00:00 2001
From: alsrua7222 <59680587+alsrua7222@users.noreply.github.com>
Date: Fri, 7 Jan 2022 01:58:24 +0900
Subject: [PATCH 1/2] Fix FreeProxy and PremProxy parsers for updated provider pages
---
.../requests/parsers/FreeProxyParser.py | 2 +-
.../requests/parsers/PremProxyParser.py | 16 ++++++++++++----
.../requests/parsers/js/UnPacker.py | 4 ++--
3 files changed, 15 insertions(+), 7 deletions(-)
diff --git a/http_request_randomizer/requests/parsers/FreeProxyParser.py b/http_request_randomizer/requests/parsers/FreeProxyParser.py
index 112a26d..5ae167b 100644
--- a/http_request_randomizer/requests/parsers/FreeProxyParser.py
+++ b/http_request_randomizer/requests/parsers/FreeProxyParser.py
@@ -24,7 +24,7 @@ def parse_proxyList(self):
content = response.content
soup = BeautifulSoup(content, "html.parser")
- table = soup.find("table", attrs={"id": "proxylisttable"})
+ table = soup.find("table", attrs={"class": "table table-striped table-bordered"})
# The first tr contains the field names.
headings = [th.get_text() for th in table.find("tr").find_all("th")]
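
The FreeProxyParser change swaps the table lookup from the old `proxylisttable` id to the table's Bootstrap classes, evidently because the provider's markup changed. A minimal sketch of the new lookup, assuming the page still serves a single Bootstrap-styled proxy table (the URL is taken from the mocked netloc in patch 2):

    import requests
    from bs4 import BeautifulSoup

    # Minimal sketch: locate the proxy table by class instead of id;
    # assumes the page serves one Bootstrap-styled table.
    response = requests.get("https://free-proxy-list.net", timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find("table", attrs={"class": "table table-striped table-bordered"})
    # The first tr carries the field names, exactly as the parser expects.
    headings = [th.get_text() for th in table.find("tr").find_all("th")]
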
diff --git a/http_request_randomizer/requests/parsers/PremProxyParser.py b/http_request_randomizer/requests/parsers/PremProxyParser.py
index 0fed99f..5a3379c 100644
--- a/http_request_randomizer/requests/parsers/PremProxyParser.py
+++ b/http_request_randomizer/requests/parsers/PremProxyParser.py
@@ -7,6 +7,8 @@
from http_request_randomizer.requests.parsers.UrlParser import UrlParser
from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel, Protocol
+from http_request_randomizer.requests.useragent.userAgent import UserAgentManager
+
logger = logging.getLogger(__name__)
__author__ = 'pgaref'
@@ -18,6 +20,12 @@ def __init__(self, id, web_url, timeout=None):
web_url += "/list/"
# Ports decoded by the JS unpacker
self.js_unpacker = None
+ self.useragent = UserAgentManager()
+ self.headers = {
+ "User-Agent": self.useragent.get_random_user_agent(),
+ "Origin": self.base_url,
+ "Referer": self.base_url
+ }
UrlParser.__init__(self, id=id, web_url=web_url, timeout=timeout)
def parse_proxyList(self):
@@ -31,7 +39,7 @@ def parse_proxyList(self):
self.js_unpacker = self.init_js_unpacker()
for page in page_set:
- response = requests.get("{0}{1}".format(self.get_url(), page), timeout=self.timeout)
+ response = requests.get("{0}{1}".format(self.get_url(), page), timeout=self.timeout, headers=self.headers)
if not response.ok:
# Could not parse ANY page - Let user know
if not curr_proxy_list:
@@ -65,7 +73,7 @@ def parse_proxyList(self):
return curr_proxy_list
def get_pagination_set(self):
- response = requests.get(self.get_url(), timeout=self.timeout)
+ response = requests.get(self.get_url(), timeout=self.timeout, headers=self.headers)
page_set = set()
# Could not parse pagination page - Let user know
if not response.ok:
@@ -84,7 +92,7 @@ def get_pagination_set(self):
return page_set
def init_js_unpacker(self):
- response = requests.get(self.get_url(), timeout=self.timeout)
+ response = requests.get(self.get_url(), timeout=self.timeout, headers=self.headers)
# Could not parse provider page - Let user know
if not response.ok:
logger.warning("Proxy Provider url failed: {}".format(self.get_url()))
@@ -96,7 +104,7 @@ def init_js_unpacker(self):
for script in soup.findAll('script'):
if '/js/' in script.get('src'):
jsUrl = self.base_url + script.get('src')
- return JsUnPacker(jsUrl)
+ return JsUnPacker(jsUrl, headers=self.headers)
return None
def create_proxy_object(self, row, port):
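
The PremProxyParser hunks thread the same browser-like headers through every request the parser makes (pagination, list pages, and the JS unpacker), presumably so the provider does not reject bare python-requests traffic. A sketch of the header construction, assuming UserAgentManager exposes get_random_user_agent() as the __init__ hunk uses it; the base_url value here is a hypothetical example:

    import requests
    from http_request_randomizer.requests.useragent.userAgent import UserAgentManager

    base_url = "https://premproxy.com"  # hypothetical provider root
    ua = UserAgentManager()
    # Origin and Referer mirror the provider root, as in the patch above.
    headers = {
        "User-Agent": ua.get_random_user_agent(),
        "Origin": base_url,
        "Referer": base_url,
    }
    response = requests.get(base_url + "/list/", timeout=5, headers=headers)
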
diff --git a/http_request_randomizer/requests/parsers/js/UnPacker.py b/http_request_randomizer/requests/parsers/js/UnPacker.py
index 2383362..947b920 100644
--- a/http_request_randomizer/requests/parsers/js/UnPacker.py
+++ b/http_request_randomizer/requests/parsers/js/UnPacker.py
@@ -14,9 +14,9 @@ class JsUnPacker(object):
"""
# TODO: it might not be necessary to unpack the js code
- def __init__(self, js_file_url):
+ def __init__(self, js_file_url, headers=None):
logger.info("JS UnPacker init path: {}".format(js_file_url))
- r = requests.get(js_file_url)
+ r = requests.get(js_file_url, headers=headers)
encrypted = r.text.strip()
encrypted = '(' + encrypted.split('}(')[1][:-1]
unpacked = eval('self.unpack' +encrypted) # string of the js code in unpacked form
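
Widening JsUnPacker's constructor with `headers=None` keeps existing call sites working: requests.get treats `headers=None` the same as omitting the argument, so only callers that opt in (like PremProxyParser above) change behaviour. A short sketch of the two code paths, with a placeholder URL:

    import requests

    url = "https://example.com/js/packed.js"  # placeholder URL
    r_old = requests.get(url)  # pre-patch behaviour: requests' default headers
    r_new = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})  # post-patch
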
From 9ab14148becf58e39292e479629ef08a265bd6a3 Mon Sep 17 00:00:00 2001
From: alsrua7222 <59680587+alsrua7222@users.noreply.github.com>
Date: Wed, 12 Jan 2022 23:38:45 +0900
Subject: [PATCH 2/2] Update test_freeProxy mock HTML to match the new page markup
---
tests/mocks.py | 134 +++++++++++++++++++++++--------------------------
1 file changed, 63 insertions(+), 71 deletions(-)
diff --git a/tests/mocks.py b/tests/mocks.py
index 78320bd..bff3d5f 100644
--- a/tests/mocks.py
+++ b/tests/mocks.py
@@ -70,77 +70,69 @@ def sslproxy_mock(url, request):
@urlmatch(netloc=r'(.*\.)?free-proxy-list\.net$')
def free_proxy_mock(url, request):
- return """