Commit 04eb964 (parent: dd130fd)

Author: pgaref

Extended the Rebro Weebly parser to retrieve anonymity and country information. Wrapping up #30 and bumping up the project version.
10 files changed: +79 −41 lines
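The net effect, as a minimal usage sketch. The parser id, URL, timeout, and the ProxyObject attribute names below are illustrative assumptions; the constructor signature and module path come from the diffs that follow.

from http_request_randomizer.requests.parsers.RebroWeeblyParser import RebroWeeblyParser

# Hypothetical driver script, not part of this commit.
parser = RebroWeeblyParser("rebro-weebly", "http://rebro.weebly.com", timeout=5)
for proxy in parser.parse_proxyList():
    # After this commit each ProxyObject should carry a real country and
    # anonymity level instead of the previous hard-coded "Unknown" defaults.
    print(proxy.get_address(), proxy.country, proxy.anonymity_level)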

http_request_randomizer/requests/parsers/FreeProxyParser.py

Lines changed: 2 additions & 2 deletions

@@ -16,10 +16,10 @@ def __init__(self, id, web_url, timeout=None):
 
     def parse_proxyList(self):
         curr_proxy_list = []
-        response = requests.get(self.get_URl(), timeout=self.timeout)
+        response = requests.get(self.get_url(), timeout=self.timeout)
 
         if not response.ok:
-            logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
+            logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
             return []
 
         content = response.content

http_request_randomizer/requests/parsers/ProxyForEuParser.py

Lines changed: 2 additions & 2 deletions

@@ -16,10 +16,10 @@ def __init__(self, id, web_url, bandwithdh=None, timeout=None):
 
     def parse_proxyList(self):
         curr_proxy_list = []
-        response = requests.get(self.get_URl(), timeout=self.timeout)
+        response = requests.get(self.get_url(), timeout=self.timeout)
 
         if not response.ok:
-            logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
+            logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
             return []
 
         content = response.content

http_request_randomizer/requests/parsers/RebroWeeblyParser.py

Lines changed: 34 additions & 17 deletions

@@ -18,53 +18,70 @@ def __init__(self, id, web_url, timeout=None):
 
     def parse_proxyList(self, use_top15k=False):
         curr_proxy_list = []
-        response = requests.get(self.get_URl() + "/" + self.top_proxy_path, timeout=self.timeout)
+        response = requests.get(self.get_url() + "/" + self.top_proxy_path, timeout=self.timeout)
 
         if not response.ok:
-            logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
+            logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
             return []
 
         content = response.content
         soup = BeautifulSoup(content, "html.parser")
-        table = soup.find("div", attrs={"class": "paragraph", 'style': "text-align:left;"}).find('font', attrs={
-            'color': '#33a27f'})
+        all_divs = soup.findAll("div", attrs={"class": "paragraph", 'style': "text-align:left;"})
+        # address_table = soup.find("div", attrs={"class": "paragraph", 'style': "text-align:left;"})
+        # .find('font', attrs={'color': '#33a27f'})
         # Parse Top Proxy List page
-        for row in [x for x in table.contents if getattr(x, 'name', None) != 'br']:
+        address_list = []
+        country_list = []
+        anonymity_list = []
+        for div in all_divs:
+            address_div = div.find('font', attrs={'color': '#33a27f'})
+            if address_div is not None:
+                for row in [x for x in address_div.contents if getattr(x, 'name', None) != 'br']:
+                    address_list.append(str(row))
+            curr_div = div.findAll('font', attrs={'size': '2'})
+            if curr_div[0] is not None:
+                row_data = []
+                # font -> strong -> font
+                title = curr_div[0].contents[0].contents[0].contents[0]
+                for row in [x for x in curr_div[-1].contents if getattr(x, 'name', None) != 'br']:
+                    row_data.append(str(row))
+                if 'Country' in str(title):
+                    country_list.extend(row_data)
+                if 'Status' in str(title):
+                    anonymity_list.extend(row_data)
+        for address, country, anonymity in zip(address_list, country_list, anonymity_list):
             # Make sure it is a Valid Proxy Address
-            proxy_obj = self.create_proxy_object(row)
-            if proxy_obj is not None and UrlParser.valid_ip_port(row):
+            proxy_obj = self.create_proxy_object(address, country, anonymity)
+            if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
                 curr_proxy_list.append(proxy_obj)
             else:
                 logger.debug("Proxy Invalid: {}".format(row))
         # Usually these proxies are stale
         if use_top15k:
             # Parse 15k Nodes Text file (named *-all-*.txt)
-            content = requests.get(self.get_URl() + "/" + self.txt_proxy_path).content
+            content = requests.get(self.get_url() + "/" + self.txt_proxy_path).content
             soup = BeautifulSoup(content, "html.parser")
             table = soup.find("div", attrs={"class": "wsite-multicol-table-wrap"})
             for link in table.findAll('a'):
                 current_link = link.get('href')
                 if current_link is not None and "all" in current_link:
                     self.txt_proxy_path = current_link
-            more_content = requests.get(self.get_URl() + self.txt_proxy_path).text
+            more_content = requests.get(self.get_url() + self.txt_proxy_path).text
             for proxy_address in more_content.split():
                 if UrlParser.valid_ip_port(proxy_address):
                     proxy_obj = self.create_proxy_object(row)
                     curr_proxy_list.append(proxy_obj)
         return curr_proxy_list
 
-    def create_proxy_object(self, dataset):
-        # Provider specific code
-        dataset = dataset.strip()  # String strip()
-        ip = dataset.split(":")[0]
+    def create_proxy_object(self, address, country, anonymity):
         # Make sure it is a Valid IP
+        ip = address.strip().split(":")[0]
        if not UrlParser.valid_ip(ip):
             logger.debug("IP with Invalid format: {}".format(ip))
             return None
-        port = dataset.split(":")[1]
-        # TODO: Parse extra tables and combine data - Provider seems to be out-of-date
-        country = "Unknown"
-        anonymity = AnonymityLevel.get("unknown")
+        port = address.strip().split(":")[1]
+        country = country.strip()
+        anonymity = AnonymityLevel.get(anonymity.strip())
 
         return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country)
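The core idea of the new parse_proxyList is column reassembly: the provider page lists addresses, countries, and statuses in three separate paragraph divs, and the parser zips the columns back into rows. A standalone sketch of that step, with sample values taken from the mock page in tests/mocks.py:

# Column-to-row reassembly, as in the diff above; the literals below come
# from the rebro.weebly mock page in tests/mocks.py.
address_list = ['213.149.105.12:8080', '119.188.46.42:8080']  # IP:Port div
country_list = ['Montenegro', 'China']                        # Country div
anonymity_list = ['Elite & Anonymous', 'Elite & Anonymous']   # Status div

for address, country, anonymity in zip(address_list, country_list, anonymity_list):
    ip, port = address.strip().split(":")
    print(ip, port, country.strip(), anonymity.strip())

Note that zip() stops at the shortest list, so a page with mismatched columns silently yields fewer proxies rather than raising an IndexError.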

http_request_randomizer/requests/parsers/SamairProxyParser.py

Lines changed: 4 additions & 4 deletions

@@ -23,11 +23,11 @@ def parse_proxyList(self):
         page_set = self.get_pagination_set()
         logger.debug("Pages: {}".format(page_set))
         for page in page_set:
-            response = requests.get("{0}{1}".format(self.get_URl(), page), timeout=self.timeout)
+            response = requests.get("{0}{1}".format(self.get_url(), page), timeout=self.timeout)
             if not response.ok:
                 # Could not parse ANY page - Let user know
                 if not curr_proxy_list:
-                    logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
+                    logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
                 # Return proxies parsed so far
                 return curr_proxy_list
             content = response.content

@@ -61,11 +61,11 @@ def parse_proxyList(self):
         return curr_proxy_list
 
     def get_pagination_set(self):
-        response = requests.get(self.get_URl(), timeout=self.timeout)
+        response = requests.get(self.get_url(), timeout=self.timeout)
         page_set = set()
         # Could not parse pagination page - Let user know
         if not response.ok:
-            logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
+            logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
             return page_set
         content = response.content
         soup = BeautifulSoup(content, "html.parser")

http_request_randomizer/requests/parsers/UrlParser.py

Lines changed: 5 additions & 5 deletions

@@ -16,19 +16,19 @@ class UrlParser(object):
     :param bandwidthKBs: minimum bandwidth in KBs (to avoid straggling proxies when having the extra info from proxy provider)
     """
 
-    def __init__(self, id, web_url, bandwidthKBs=None, timeout=None):
+    def __init__(self, id, web_url, bandwidth_KBs=None, timeout=None):
         self.id = id
         self.url = web_url
         self.timeout = timeout
-        if bandwidthKBs is not None:
-            self.minimum_bandwidth_in_KBs = bandwidthKBs
+        if bandwidth_KBs is not None:
+            self.minimum_bandwidth_in_KBs = bandwidth_KBs
         else:
             self.minimum_bandwidth_in_KBs = 150
 
-    def get_ID(self):
+    def get_id(self):
         return self.id
 
-    def get_URl(self):
+    def get_url(self):
         if self.url is None:
             raise ParserException("webURL is NONE")
         return self.url
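The get_URl/get_ID renames are API-breaking for any caller or subclass. A quick sketch of the new spelling, with constructor arguments and accessors as exercised in tests/test_parsers.py below:

from http_request_randomizer.requests.parsers.UrlParser import UrlParser

# bandwidth_KBs replaces the old bandwidthKBs keyword; get_id()/get_url()
# replace get_ID()/get_URl().
p = UrlParser("proxy-test", "http://proxy-test.com", bandwidth_KBs=50)
print(p.get_id(), p.get_url(), p.get_min_bandwidth())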

http_request_randomizer/requests/proxy/ProxyObject.py

Lines changed: 1 addition & 1 deletion

@@ -67,7 +67,7 @@ class AnonymityLevel(Enum):
     UNKNOWN = 0  # default
     TRANSPARENT = 1, 'transparent', 'transparent proxy', 'LOW'
     ANONYMOUS = 2, 'anonymous', 'anonymous proxy', 'high-anonymous'
-    ELITE = 3, 'elite', 'elite proxy', 'HIGH'
+    ELITE = 3, 'elite', 'elite proxy', 'HIGH', 'Elite & Anonymous'
 
     def __new__(cls, int_value, *value_aliases):
         obj = object.__new__(cls)
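The provider string passed to AnonymityLevel.get() is matched against the alias tuples above, which is why adding 'Elite & Anonymous' to ELITE is enough to classify the new Status column. A self-contained approximation of that alias pattern; the library's actual __new__/get may differ in detail:

from enum import Enum

class Level(Enum):
    # Same alias idea as AnonymityLevel: the first tuple element is the
    # member value, the rest are string aliases registered in the value map.
    UNKNOWN = 0
    ELITE = 3, 'elite', 'elite proxy', 'HIGH', 'Elite & Anonymous'

    def __new__(cls, int_value, *value_aliases):
        obj = object.__new__(cls)
        obj._value_ = int_value
        for alias in value_aliases:
            cls._value2member_map_[alias] = obj  # register string aliases
        return obj

    @classmethod
    def get(cls, name):
        # Fall back to UNKNOWN for provider strings we have never seen.
        return cls._value2member_map_.get(name, cls.UNKNOWN)

print(Level.get('Elite & Anonymous'))  # Level.ELITE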

setup.py

Lines changed: 1 addition & 1 deletion

@@ -47,7 +47,7 @@ def run_tests(self):
 
 setup(
     name='http_request_randomizer',
-    version='1.0.7',
+    version='1.1.0',
     url='http://pgaref.com/blog/python-proxy',
     license='MIT',
     author='Panagiotis Garefalakis',

tests/mocks.py

Lines changed: 13 additions & 3 deletions

@@ -117,11 +117,21 @@ def proxy_for_eu_mock(url, request):
 
 @urlmatch(netloc=r'(.*\.)?rebro\.weebly\.com$')
 def rebro_weebly_mock(url, request):
-    return """<div class="paragraph" style="text-align:left;"><strong><font color="#3ab890"
-    size="3"><font
+    return """<div class="paragraph" style="text-align:left;"><strong><font color="#3ab890" size="3"><font
     color="#d5d5d5">IP:Port</font></font></strong><br/><font
     size="2"><strong><font color="#33a27f">213.149.105.12:8080<br/>119.188.46.42:8080</font></strong></font><br/><span></span>
-    </div>"""
+    </div>
+
+
+    <div class="paragraph" style="text-align:left;"><font size="2"><strong><font size="3"><font color="#3ab890">Country</font></font></strong></font><font size="2">
+    <br />Montenegro<br />China<br /></font><br /><span></span>
+    </div>
+
+    <div class="paragraph" style="text-align:left;"><font size="2"><strong><font color="#3ab890" size="3">Status</font></strong></font><br /><font size="2">
+    Elite &amp; Anonymous<br />Elite &amp; Anonymous<br /></font><br /><span></span>
+    </div>
+
+    """
 
 
 @urlmatch(netloc=r'(.*\.)?www\.premproxy\.com')
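For context, httmock mocks like the one above are activated with an HTTMock context manager, so the parser's requests.get() calls never leave the process. A minimal sketch; the request path is illustrative:

import requests
from httmock import HTTMock, urlmatch

@urlmatch(netloc=r'(.*\.)?rebro\.weebly\.com$')
def rebro_weebly_mock(url, request):
    # Stand-in for the full three-div page defined in tests/mocks.py.
    return '<div class="paragraph" style="text-align:left;">...</div>'

with HTTMock(rebro_weebly_mock):
    # Any requests call whose host matches the netloc regex is intercepted.
    response = requests.get('http://rebro.weebly.com/proxy-list.html')
    print(response.status_code, response.text[:40])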

tests/test_parsers.py

Lines changed: 3 additions & 3 deletions

@@ -13,15 +13,15 @@
 
 class TestBaseProxyParsers(unittest.TestCase):
     def setUp(self):
-        self.normal_parser = UrlParser("proxy-test", "http://proxy-test.com", bandwidthKBs=50)
+        self.normal_parser = UrlParser("proxy-test", "http://proxy-test.com", bandwidth_KBs=50)
         self.no_bdwidthParser = UrlParser("slow-proxy", "http://slow-proxy.com")
 
     def test_normal_parser(self):
-        self.assertEqual(self.normal_parser.get_URl(), "http://proxy-test.com", "incorrect parser URL")
+        self.assertEqual(self.normal_parser.get_url(), "http://proxy-test.com", "incorrect parser URL")
         self.assertEqual(self.normal_parser.get_min_bandwidth(), 50, "incorrect parser bandwidth")
 
     def test_no_bandwidth_parser(self):
-        self.assertEqual(self.no_bdwidthParser.get_URl(), "http://slow-proxy.com", "incorrect parser URL")
+        self.assertEqual(self.no_bdwidthParser.get_url(), "http://slow-proxy.com", "incorrect parser URL")
         self.assertEqual(self.no_bdwidthParser.get_min_bandwidth(), 150, "incorrect parser bandwidth")

tests/test_proxyObject.py

Lines changed: 14 additions & 3 deletions

@@ -1,16 +1,26 @@
 from __future__ import absolute_import
 
-import unittest
-import sys
 import os
+import sys
+import unittest
 
 sys.path.insert(0, os.path.abspath('.'))
 
-from http_request_randomizer.requests.proxy.ProxyObject import AnonymityLevel
+from http_request_randomizer.requests.proxy.ProxyObject import AnonymityLevel, ProxyObject
 
 
 class TestProxyObject(unittest.TestCase):
 
+    def test_ProxyObjectSimple(self):
+        src = 'Test'
+        ip = '127.0.0.1'
+        port = '8080'
+        po = ProxyObject(src, ip, port, AnonymityLevel.UNKNOWN)
+        self.assertEqual(po.source, src)
+        self.assertEqual(po.ip, ip)
+        self.assertEqual(po.port, port)
+        self.assertEqual(po.get_address(), "{0}:{1}".format(ip, port))
+
     def test_AnonymityLevels(self):
         self.assertTrue(AnonymityLevel.UNKNOWN.value == 0)
         self.assertTrue(AnonymityLevel.TRANSPARENT.value == 1)

@@ -40,6 +50,7 @@ def test_EliteEnumLevel(self):
         self.assertEqual(AnonymityLevel.ELITE, AnonymityLevel.get('elite'))
         self.assertEqual(AnonymityLevel.ELITE, AnonymityLevel.get('elite proxy'))
         self.assertEqual(AnonymityLevel.ELITE, AnonymityLevel.get('HIGH'))
+        self.assertEqual(AnonymityLevel.ELITE, AnonymityLevel.get('Elite & Anonymous'))
 
 
 if __name__ == '__main__':
