Commit 78b305a

Feature/sslproxy (#56)
* added SSLProxy parser
* added tests for SslProxies
* Updating README and contributors list
1 parent dd2cf26 commit 78b305a

8 files changed, +165 -6 lines

CONTRIBUTORS.md

Lines changed: 1 addition & 0 deletions
@@ -47,6 +47,7 @@ Contributors (as ordered by Github)

 * JS unpacker
 * PremProxy migration
+* SslProxyParser


 **[More details](https://github.com/pgaref/HTTP_Request_Randomizer/contributors).**

README-vi.md

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ The source code in this repository fetches proxies from **four**
 * http://free-proxy-list.net
 * http://rebro.weebly.com/proxy-list.html
 * http://www.samair.ru/proxy/time-01.htm
+* https://www.sslproxies.org

 After collecting the list of proxies and removing the slow ones, it randomly picks one proxy to send the request to the specified url.
 The timeout is set to 30 seconds, and if a proxy fails to return a response it is removed from the proxy list.

README.md

Lines changed: 2 additions & 1 deletion
@@ -23,11 +23,12 @@ Surprisingly, the only thing that tells a server the application triggered the r

 ## The source code

-The project code in this repository is crawling **four** different public proxy websites:
+The project code in this repository is crawling **five** different public proxy websites:
 * http://proxyfor.eu/geo.php
 * http://free-proxy-list.net
 * http://rebro.weebly.com/proxy-list.html
 * http://www.samair.ru/proxy/time-01.htm
+* https://www.sslproxies.org

 After collecting the proxy data and filtering the slowest ones it is randomly selecting one of them to query the target url.
 The request timeout is configured at 30 seconds and if the proxy fails to return a response it is deleted from the application proxy list.
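
The behaviour described in the README maps onto a small API surface. A minimal usage sketch (not part of this commit), assuming the RequestProxy class exported by http_request_randomizer/requests/proxy/requestProxy.py and that generate_proxied_request returns None when no proxy succeeds:

from http_request_randomizer.requests.proxy.requestProxy import RequestProxy

# Crawls the five provider sites listed above at construction time
# (live network access assumed).
req_proxy = RequestProxy()

# Picks a random proxy for each request; proxies that fail to answer
# within the 30-second timeout are dropped from the list.
response = req_proxy.generate_proxied_request('http://example.com')
if response is not None:
    print(response.status_code)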

README.rst

Lines changed: 3 additions & 2 deletions
@@ -37,10 +37,11 @@ called a "user agent" which is included in the HTTP request.
 The source code
 ---------------

-The project code in this repository is crawling **four** different
+The project code in this repository is crawling **five** different
 public proxy websites: \* http://proxyfor.eu/geo.php \*
 http://free-proxy-list.net \* http://rebro.weebly.com/proxy-list.html \*
-http://www.samair.ru/proxy/time-01.htm
+http://www.samair.ru/proxy/time-01.htm \*
+https://www.sslproxies.org

 After collecting the proxy data and filtering the slowest ones it is
 randomly selecting one of them to query the target url. The request
http_request_randomizer/requests/parsers/SslProxyParser.py

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
import logging

import requests
from bs4 import BeautifulSoup

from http_request_randomizer.requests.parsers.UrlParser import UrlParser
from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel, Protocol

logger = logging.getLogger(__name__)
__author__ = 'pgaref'


class SslProxyParser(UrlParser):
    def __init__(self, id, web_url, timeout=None):
        UrlParser.__init__(self, id=id, web_url=web_url, timeout=timeout)

    def parse_proxyList(self):
        curr_proxy_list = []
        try:
            response = requests.get(self.get_url(), timeout=self.timeout)
            if not response.ok:
                logger.warning("Proxy Provider url failed: {}".format(self.get_url()))
                return []

            content = response.content
            soup = BeautifulSoup(content, "html.parser")
            table = soup.find("table", attrs={"id": "proxylisttable"})

            # The first tr contains the field names.
            headings = [th.get_text() for th in table.find("tr").find_all("th")]

            datasets = []
            for row in table.find_all("tr")[1:-1]:
                dataset = zip(headings, (td.get_text() for td in row.find_all("td")))
                if dataset:
                    datasets.append(dataset)

            for dataset in datasets:
                proxy_obj = self.create_proxy_object(dataset)
                # Make sure it is a Valid Proxy Address
                if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
                    curr_proxy_list.append(proxy_obj)
                else:
                    logger.debug("Proxy Invalid: {}".format(dataset))
        except AttributeError as e:
            logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
        except KeyError as e:
            logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
        except Exception as e:
            logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
        finally:
            return curr_proxy_list

    def create_proxy_object(self, dataset):
        # Check Field[0] for tags and field[1] for values!
        ip = ""
        port = None
        anonymity = AnonymityLevel.UNKNOWN
        country = None
        protocols = []
        for field in dataset:
            if field[0] == 'IP Address':
                ip = field[1].strip()  # String strip()
                # Make sure it is a Valid IP
                if not UrlParser.valid_ip(ip):
                    logger.debug("IP with Invalid format: {}".format(ip))
                    return None
            elif field[0] == 'Port':
                port = field[1].strip()  # String strip()
            elif field[0] == 'Anonymity':
                anonymity = AnonymityLevel.get(field[1].strip())  # String strip()
            elif field[0] == 'Country':
                country = field[1].strip()  # String strip()
            elif field[0] == 'Https':
                if field[1].strip().lower() == 'yes': protocols.extend([Protocol.HTTP, Protocol.HTTPS])
                elif field[1].strip().lower() == 'no': protocols.append(Protocol.HTTP)
        return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country, protocols=protocols)

    def __str__(self):
        return "{0} parser of '{1}' with required bandwidth: '{2}' KBs" \
            .format(self.id, self.url, self.minimum_bandwidth_in_KBs)
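
A quick way to exercise the new parser in isolation (not part of this commit; it uses only the constructor and methods defined above, and assumes live access to sslproxies.org):

from http_request_randomizer.requests.parsers.SslProxyParser import SslProxyParser

parser = SslProxyParser('SslProxy', 'https://www.sslproxies.org', timeout=10)
for proxy in parser.parse_proxyList():
    # get_address() returns 'ip:port', e.g. '24.211.89.146:8080'
    print(proxy.get_address())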

http_request_randomizer/requests/proxy/requestProxy.py

Lines changed: 3 additions & 1 deletion
@@ -16,6 +16,7 @@
 from http_request_randomizer.requests.parsers.ProxyForEuParser import ProxyForEuParser
 from http_request_randomizer.requests.parsers.RebroWeeblyParser import RebroWeeblyParser
 from http_request_randomizer.requests.parsers.PremProxyParser import PremProxyParser
+from http_request_randomizer.requests.parsers.SslProxyParser import SslProxyParser
 from http_request_randomizer.requests.useragent.userAgent import UserAgentManager

 __author__ = 'pgaref'

@@ -44,6 +45,7 @@ def __init__(self, web_proxy_list=[], sustain=False, timeout=5, protocol=Protoco
         #parsers.append(ProxyForEuParser('ProxyForEU', 'http://proxyfor.eu/geo.php', 1.0, timeout=timeout)) <--doesn't work anymore
         #parsers.append(RebroWeeblyParser('ReBro', 'http://rebro.weebly.com', timeout=timeout)) <--doesn't work anymore
         parsers.append(PremProxyParser('PremProxy', 'https://premproxy.com', timeout=timeout))
+        parsers.append(SslProxyParser('SslProxy', 'https://www.sslproxies.org', timeout=timeout))

         self.logger.debug("=== Initialized Proxy Parsers ===")
         for i in range(len(parsers)):

@@ -117,7 +119,7 @@ def generate_proxied_request(self, url, method="GET", params={}, data={}, header
             raise ConnectionError("HTTP Response [403] - Permission denied error")
         elif request.status_code == 503:
             raise ConnectionError("HTTP Response [503] - Service unavailable error")
-        print('RR Status {}'.format(request.status_code))
+        self.logger.info('RR Status {}'.format(request.status_code))
         return request
     except ConnectionError:
         try:
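
The one-line registration above is all a new provider needs because every parser shares the UrlParser interface. A sketch (not part of this commit) of the aggregation pattern the constructor relies on:

# Every registered parser exposes parse_proxyList(), so collecting
# candidates from all providers is a simple flat-map over the list.
all_proxies = []
for parser in parsers:
    all_proxies.extend(parser.parse_proxyList())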

tests/mocks.py

Lines changed: 62 additions & 0 deletions
@@ -5,6 +5,68 @@
 proxy_for_eu_expected = ['107.151.136.222:80', '37.187.253.39:8115']
 rebro_weebly_expected = ['213.149.105.12:8080', '119.188.46.42:8080']
 prem_expected = ['191.252.61.28:80', '167.114.203.141:8080', '152.251.141.93:8080']
+sslproxy_expected = ['24.211.89.146:8080', '187.84.222.153:80', '41.193.238.249:8080']
+
+@urlmatch(netloc=r'(.*\.)?sslproxies\.org$')
+def sslproxy_mock(url, request):
+    return """<table class="table table-striped table-bordered" cellspacing="0" width="100%" id="proxylisttable">
+    <thead>
+        <tr>
+            <th>IP Address</th>
+            <th>Port</th>
+            <th>Code</th>
+            <th class='hm'>Country</th>
+            <th>Anonymity</th>
+            <th class='hm'>Google</th>
+            <th class='hx'>Https</th>
+            <th class='hm'>Last Checked</th>
+        </tr>
+    </thead>
+    <tbody>
+        <tr>
+            <td>24.211.89.146</td>
+            <td>8080</td>
+            <td>US</td>
+            <td class='hm'>United States</td>
+            <td>elite proxy</td>
+            <td class='hm'>no</td>
+            <td class='hx'>yes</td>
+            <td class='hm'>8 seconds ago</td>
+        </tr>
+        <tr>
+            <td>187.84.222.153</td>
+            <td>80</td>
+            <td>BR</td>
+            <td class='hm'>Brazil</td>
+            <td>anonymous</td>
+            <td class='hm'>no</td>
+            <td class='hx'>yes</td>
+            <td class='hm'>1 minute ago</td>
+        </tr>
+        <tr>
+            <td>41.193.238.249</td>
+            <td>8080</td>
+            <td>ZA</td>
+            <td class='hm'>South Africa</td>
+            <td>elite proxy</td>
+            <td class='hm'>no</td>
+            <td class='hx'>yes</td>
+            <td class='hm'>1 minute ago</td>
+        </tr>
+    </tbody>
+    <tfoot>
+        <tr>
+            <th class="input"><input type="text" /></th>
+            <th></th><th></th>
+            <th class='hm'></th>
+            <th></th>
+            <th class='hm'></th>
+            <th class='hx'></th>
+            <th class='hm'></th>
+        </tr>
+    </tfoot>
+    </table>
+    """

 @urlmatch(netloc=r'(.*\.)?free-proxy-list\.net$')
 def free_proxy_mock(url, request):
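
For readers unfamiliar with httmock: while an HTTMock context is active, any requests call whose host matches the urlmatch pattern is routed to the decorated function, and a returned string becomes the body of a synthetic 200 response. A minimal self-contained example (not part of this commit; example.com is a placeholder host):

import requests
from httmock import HTTMock, urlmatch

@urlmatch(netloc=r'(.*\.)?example\.com$')
def hello_mock(url, request):
    return 'hello'  # a plain string becomes the 200 response body

with HTTMock(hello_mock):
    r = requests.get('http://example.com')
    assert r.status_code == 200 and r.text == 'hello'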

tests/test_providers.py

Lines changed: 11 additions & 2 deletions
@@ -7,12 +7,13 @@

 sys.path.insert(0, os.path.abspath('.'))

-from tests.mocks import free_proxy_mock, proxy_for_eu_mock, rebro_weebly_mock, prem_mock
-from tests.mocks import free_proxy_expected, proxy_for_eu_expected, rebro_weebly_expected, prem_expected, prem_js_mock
+from tests.mocks import free_proxy_mock, proxy_for_eu_mock, rebro_weebly_mock, prem_mock, sslproxy_mock
+from tests.mocks import free_proxy_expected, proxy_for_eu_expected, rebro_weebly_expected, prem_expected, prem_js_mock, sslproxy_expected
 from http_request_randomizer.requests.parsers.FreeProxyParser import FreeProxyParser
 from http_request_randomizer.requests.parsers.ProxyForEuParser import ProxyForEuParser
 from http_request_randomizer.requests.parsers.RebroWeeblyParser import RebroWeeblyParser
 from http_request_randomizer.requests.parsers.PremProxyParser import PremProxyParser
+from http_request_randomizer.requests.parsers.SslProxyParser import SslProxyParser

 __author__ = 'pgaref'

@@ -56,6 +57,14 @@ def test_PremProxyParser(self):
         for item in prem_expected:
             self.assertTrue(item in proxy_list_addr)

+    def test_SslProxyParser(self):
+        with HTTMock(sslproxy_mock):
+            proxy_provider = SslProxyParser('SslProxy', 'https://www.sslproxies.org/')
+            proxy_list = proxy_provider.parse_proxyList()
+            proxy_list_addr = []
+            for proxy in proxy_list:
+                proxy_list_addr.append(proxy.get_address())
+            self.assertEqual(proxy_list_addr, sslproxy_expected)

 if __name__ == '__main__':
     unittest.main()
