
Commit 8c64000

Author: pgaref
Merge branch 'hotfix/1.0.8' into develop

2 parents: 3a14218 + 5dbaa52

8 files changed: 111 additions, 66 deletions

http_request_randomizer/requests/parsers/SamairProxyParser.py

Lines changed: 40 additions & 34 deletions
@@ -2,53 +2,59 @@
 
 import requests
 from bs4 import BeautifulSoup
+from requests import ConnectionError
 
 from http_request_randomizer.requests.parsers.UrlParser import UrlParser
 
 logger = logging.getLogger(__name__)
 __author__ = 'pgaref'
 
 
+# Samair Proxy now renamed to: premproxy.com
 class SamairProxyParser(UrlParser):
     def __init__(self, web_url, timeout=None):
+        web_url += "/list/"
         UrlParser.__init__(self, web_url, timeout)
 
     def parse_proxyList(self):
         curr_proxy_list = []
-        response = requests.get(self.get_URl(), timeout=self.timeout)
-
-        if not response.ok:
-            logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
-            return []
-
-        content = response.content
-        soup = BeautifulSoup(content, "html.parser")
-        # css provides the port number so we reverse it
-        # for href in soup.findAll('link'):
-        # if '/styles/' in href.get('href'):
-        # style = "http://www.samair.ru" + href.get('href')
-        # break
-        # css = requests.get(style).content.split('\n')
-        # css.pop()
-        # ports = {}
-        # for l in css:
-        # p = l.split(' ')
-        # key = p[0].split(':')[0][1:]
-        # value = p[1].split('\"')[1]
-        # ports[key] = value
-
-        table = soup.find("div", attrs={"id": "proxylist"})
-        # The first tr contains the field names.
-        headings = [th.get_text() for th in table.find("tr").find_all("th")]
-        for row in table.find_all("tr")[1:]:
-            td_row = row.find("td")
-            # curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
-            # Make sure it is a Valid Proxy Address
-            if UrlParser.valid_ip_port(td_row.text):
-                curr_proxy_list.append('http://' +td_row.text)
-            else:
-                logger.debug("Address with Invalid format: {}".format(td_row.text))
-
+        # Parse all proxy pages -> format: /list/{num}.htm
+        # TODO: get the pageRange from the 'pagination' table
+        for page in range(1, 21):
+            response = requests.get("{0}{num:02d}.htm".format(self.get_URl(), num=page), timeout=self.timeout)
+            if not response.ok:
+                # Could not parse ANY page - Let user know
+                if not curr_proxy_list:
+                    logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
+                # Return proxies parsed so far
+                return curr_proxy_list
+            content = response.content
+            soup = BeautifulSoup(content, "html.parser")
+            # css provides the port number so we reverse it
+            # for href in soup.findAll('link'):
+            # if '/styles/' in href.get('href'):
+            # style = "http://www.samair.ru" + href.get('href')
+            # break
+            # css = requests.get(style).content.split('\n')
+            # css.pop()
+            # ports = {}
+            # for l in css:
+            # p = l.split(' ')
+            # key = p[0].split(':')[0][1:]
+            # value = p[1].split('\"')[1]
+            # ports[key] = value
+
+            table = soup.find("div", attrs={"id": "proxylist"})
+            # The first tr contains the field names.
+            headings = [th.get_text() for th in table.find("tr").find_all("th")]
+            for row in table.find_all("tr")[1:]:
+                td_row = row.find("td")
+                # curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
+                # Make sure it is a Valid Proxy Address
+                if UrlParser.valid_ip_port(td_row.text):
+                    curr_proxy_list.append('http://' + td_row.text)
+                else:
+                    logger.debug("Address with Invalid format: {}".format(td_row.text))
         return curr_proxy_list
 
     def __str__(self):

http_request_randomizer/requests/proxy/requestProxy.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ def __init__(self, web_proxy_list=[], sustain=False, timeout=5):
         parsers.append(FreeProxyParser('http://free-proxy-list.net', timeout=timeout))
         parsers.append(ProxyForEuParser('http://proxyfor.eu/geo.php', 1.0, timeout=timeout))
         parsers.append(RebroWeeblyParser('http://rebro.weebly.com', timeout=timeout))
-        parsers.append(SamairProxyParser('http://samair.ru/proxy/time-01.htm', timeout=timeout))
+        parsers.append(SamairProxyParser('https://premproxy.com', timeout=timeout))
 
         self.logger.debug("=== Initialized Proxy Parsers ===")
         for i in range(len(parsers)):
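Note that the trailing /proxy/time-01.htm path is gone from the registration: `SamairProxyParser.__init__` now appends `/list/` itself, so callers pass only the bare premproxy.com origin and the parser derives each page URL from there.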

http_request_randomizer/requests/useragent/userAgent.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ def get_len_user_agent(self):
 
 if __name__ == '__main__':
     ua = UserAgentManager()
-    print("Number of User Agent headers: {0}".format(str(ua.get_len_user_agent)))
+    print("Number of User Agent headers: {0}".format(ua.get_len_user_agent()))
     print("First User Agent in file: {0}".format(ua.get_first_user_agent()))
     print("Last User Agent in file: {0}".format(ua.get_last_user_agent()))
     print("If you want one random header for a request, you may use the following header:\n")

requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ httmock == 1.2.6
 psutil == 5.2.2
 pytest == 3.1.3
 pytest-cov == 2.5.1
-python-dateutil == 2.6.0
+python-dateutil == 2.6.1
 requests == 2.18.1
 schedule == 0.4.3
 pyOpenSSL == 17.1.0

setup.py

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@ def run_tests(self):
      install_requires=['beautifulsoup4 >= 4.6.0',
                        'httmock >= 1.2.6',
                        'psutil >= 5.2.2',
-                       'python-dateutil >= 2.6.0',
+                       'python-dateutil >= 2.6.1',
                        'requests >= 2.18.1',
                        'schedule >= 0.4.3',
                        ],
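The python-dateutil bump is applied in both places that declare the dependency: the exact pin in requirements.txt above and the minimum-version floor here in setup.py, keeping the two in agreement at 2.6.1.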

tests/mocks.py

Lines changed: 30 additions & 26 deletions
@@ -4,7 +4,7 @@
 free_proxy_expected = ['http://138.197.136.46:3128', 'http://177.207.75.227:8080']
 proxy_for_eu_expected = ['http://107.151.136.222:80', 'http://37.187.253.39:8115']
 rebro_weebly_expected = ['http://213.149.105.12:8080', 'http://119.188.46.42:8080']
-samair_expected = ['http://191.252.61.28:80', 'http://167.114.203.141:8080']
+samair_expected = ['http://191.252.61.28:80', 'http://167.114.203.141:8080', 'http://152.251.141.93:8080']
 
 @urlmatch(netloc=r'(.*\.)?free-proxy-list\.net$')
 def free_proxy_mock(url, request):
@@ -124,46 +124,50 @@ def rebro_weebly_mock(url, request):
     </div>"""
 
 
-@urlmatch(netloc=r'(.*\.)?www.samair.ru')
+@urlmatch(netloc=r'(.*\.)?www\.premproxy\.com')
 def samair_mock(url, request):
     return """<div id="proxylist">\n
-    <tr class="list_sorted">\n
-    <th><a href="http://samair.ru/proxy/ip-address-01.htm"
-    title="Proxy List sorted by ip address">IP address</a></th>
+    <tr class="anon">\n
+    <th><a href="/list/ip-address-01.htm" title="Proxy List sorted by ip address">IP address</a></th>
     \n
-    <th><a href="http://samair.ru/proxy/proxy-01.htm"
-    title="Proxy List sorted by anonymity level">Anonymity level</a>
-    </th>
+    <th><a href="/list/" title="Proxy List sorted by anonymity level">Anonymity</a></th>
     \n
-    <th><a href="http://samair.ru/proxy/time-01.htm"
-    title="Proxy List sorted by updated time">Checked time</a></th>
+    <th><a href="/list/time-01.htm" title="Proxy List sorted by updated time">Checked</a></th>
     \n
-    <th><a href="http://samair.ru/proxy/type-01.htm"
-    title="Proxy list sorted by country">Country</a></th>
+    <th><a href="/list/type-01.htm" title="Proxy list sorted by country">Country</a></th>
     \n
     <th><dfn title="City or State\\Region ">City</dfn></th>
     \n
     <th><dfn title="Internet Service Provider">ISP</dfn></th>
     \n
     </tr>
     \n
-    <tr class="elite">
-    <td>191.252.61.28:80</td>
-    <td>high-anonymous</td>
-    <td>Apr-18, 17:18</td>
-    <td>Brazil</td>
-    <td>S\xe3o Jos\xe9 Dos Campos</td>
-    <td><dfn title="Locaweb Servi\xe7os de Internet S/A">Locaweb
+    <tr class="anon">
+    <td data-label="IP:port ">191.252.61.28:80</td>
+    <td data-label="Anonymity Type: "high-anonymous</td>
+    <td data-label="Checked: ">Apr-18, 17:18</td>
+    <td data-label="Country: ">Brazil</td>
+    <td data-label="City: ">S\xe3o Jos\xe9 Dos Campos</td>
+    <td data-label="ISP: "><dfn title="Locaweb Servi\xe7os de Internet S/A">Locaweb
     Servi\xe7o...</dfn></td>
     </tr>
     \n
-    <tr class="transp">
-    <td>167.114.203.141:8080</td>
-    <td>transparent</td>
-    <td>Apr-18, 13:22</td>
-    <td>Canada</td>
-    <td>Montr\xe9al (QC)</td>
-    <td>OVH Hosting</td>
+    <tr class="anon">
+    <td data-label="IP:port ">167.114.203.141:8080</td>
+    <td data-label="Anonymity Type: "transparent</td>
+    <td data-label="Checked: ">Apr-18, 13:22</td>
+    <td data-label="Country: ">Canada</td>
+    <td data-label="City: ">Montr\xe9al (QC)</td>
+    <td data-label="ISP: ">OVH Hosting</td>
+    </tr>
+    \n
+    <tr class="anon">
+    <td data-label="IP:port ">152.251.141.93:8080</td>
+    <td data-label="Anonymity Type: ">elite </td>
+    <td data-label="Checked: ">Jul-16, 04:39</td>
+    <td data-label="Country: ">Brazil</td>
+    <td data-label="City: ">&nbsp;</td>
+    <td data-label="ISP: ">Vivo</td>
     </tr>
     \n
     </div>"""
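The mock now mirrors premproxy.com's markup (data-label attributes, relative /list/ links) and adds a third row so the expected list has three entries. Only the first <td> of each row needs to carry a valid ip:port for the parser to accept it; a small sketch of that traversal against one mocked row, using a regex as a stand-in for `UrlParser.valid_ip_port`, whose implementation is not shown in this diff:

    import re
    from bs4 import BeautifulSoup

    html = ('<div id="proxylist"><tr><th>IP address</th></tr>'
            '<tr class="anon"><td data-label="IP:port ">152.251.141.93:8080</td></tr></div>')
    table = BeautifulSoup(html, "html.parser").find("div", attrs={"id": "proxylist"})
    for row in table.find_all("tr")[1:]:      # skip the heading row, as parse_proxyList does
        td_row = row.find("td")
        if re.match(r"^\d{1,3}(\.\d{1,3}){3}:\d+$", td_row.text):  # stand-in for valid_ip_port
            print('http://' + td_row.text)    # -> http://152.251.141.93:8080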

tests/test_providers.py

Lines changed: 3 additions & 2 deletions
@@ -39,9 +39,10 @@ def test_RebroWeeblyParser(self):
 
     def test_SemairProxyParser(self):
         with HTTMock(samair_mock):
-            proxy_provider = SamairProxyParser('http://www.samair.ru/proxy/time-01.htm')
+            proxy_provider = SamairProxyParser('https://www.premproxy.com')
             proxy_list = proxy_provider.parse_proxyList()
-            self.assertEqual(proxy_list, samair_expected)
+            for item in samair_expected:
+                self.assertTrue(item in proxy_list)
 
 
 if __name__ == '__main__':
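Switching from `assertEqual` on the whole list to per-item membership makes the test insensitive to ordering and to any extra proxies the paginated parser returns beyond the three in `samair_expected`; `self.assertIn(item, proxy_list)` would be the more idiomatic unittest spelling of the same check.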

tests/test_useragent.py

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+from __future__ import absolute_import
+
+import unittest
+import sys
+import os
+
+from http_request_randomizer.requests.useragent.userAgent import UserAgentManager
+
+sys.path.insert(0, os.path.abspath('.'))
+
+__author__ = 'pgaref'
+
+
+class TestBaseProxyParsers(unittest.TestCase):
+    def setUp(self):
+        self.ua = UserAgentManager()
+
+    def test_agent_size(self):
+        self.assertTrue(self.ua.get_len_user_agent() >= 899)
+
+    def test_fist_user_agent(self):
+        expected = "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0"
+        self.assertEquals(self.ua.get_first_user_agent(), expected)
+
+    def test_last_user_agent(self):
+        expected = "Opera/9.80 (Windows NT 5.1; U; ru) Presto/2.2.15 Version/10.0"
+        self.assertEquals(self.ua.get_last_user_agent(), expected)
+
+    def test_random_user_agent(self):
+        self.assertNotEqual(self.ua.get_random_user_agent(), self.ua.get_random_user_agent())
+
+
+if __name__ == '__main__':
+    unittest.main()
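Two small caveats on the new suite: `assertEquals` is a deprecated alias of `assertEqual`, and `test_random_user_agent` is probabilistic, since two independent random draws can legitimately return the same agent, so the assertion can fail spuriously on rare occasions.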
