Commit 3d3e37a

Author: pgaref
Merge branch 'release/v1.1.0'
2 parents ca36467 + 7bb0405

File tree: 13 files changed, +402 / -107 lines changed


.gitignore

Lines changed: 21 additions & 0 deletions
@@ -1 +1,22 @@
 *.pyc
+### JetBrains ###
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff:
+.idea/workspace.xml
+.idea/tasks.xml
+.idea/dictionaries
+.idea/vcs.xml
+.idea/jsLibraryMappings.xml
+
+# Sensitive or high-churn files:
+.idea/dataSources.ids
+.idea/dataSources.xml
+.idea/dataSources.local.xml
+.idea/sqlDataSources.xml
+.idea/dynamic.xml
+.idea/uiDesigner.xml
+
+## File-based project format:
+*.iws

http_request_randomizer/requests/parsers/FreeProxyParser.py

Lines changed: 31 additions & 22 deletions
@@ -4,21 +4,22 @@
 from bs4 import BeautifulSoup
 
 from http_request_randomizer.requests.parsers.UrlParser import UrlParser
+from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel
 
 logger = logging.getLogger(__name__)
 __author__ = 'pgaref'
 
 
 class FreeProxyParser(UrlParser):
-    def __init__(self, web_url, timeout=None):
-        UrlParser.__init__(self, web_url, timeout)
+    def __init__(self, id, web_url, timeout=None):
+        UrlParser.__init__(self, id, web_url, timeout)
 
     def parse_proxyList(self):
         curr_proxy_list = []
-        response = requests.get(self.get_URl(), timeout=self.timeout)
+        response = requests.get(self.get_url(), timeout=self.timeout)
 
         if not response.ok:
-            logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
+            logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
             return []
 
         content = response.content
@@ -35,28 +36,36 @@ def parse_proxyList(self):
             datasets.append(dataset)
 
         for dataset in datasets:
-            # Check Field[0] for tags and field[1] for values!
-            address = ""
-            for field in dataset:
-                if field[0] == 'IP Address':
-                    # Make sure it is a Valid IP
-                    if not UrlParser.valid_ip(field[1]):
-                        logger.debug("IP with Invalid format: {}".format(field[1]))
-                        break
-                    else:
-                        address += field[1] + ':'
-                elif field[0] == 'Port':
-                    address += field[1]
+            proxy_obj = self.create_proxy_object(dataset)
             # Make sure it is a Valid Proxy Address
-            if UrlParser.valid_ip_port(address):
-                proxy = "http://" + address
-                curr_proxy_list.append(proxy.__str__())
+            if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
+                curr_proxy_list.append(proxy_obj)
             else:
-                logger.debug("Address with Invalid format: {}".format(address))
-                # print "{0:<10}: {1}".format(field[0], field[1])
-        # print "ALL: ", curr_proxy_list
+                logger.debug("Proxy Invalid: {}".format(dataset))
         return curr_proxy_list
 
+    def create_proxy_object(self, dataset):
+        # Check Field[0] for tags and field[1] for values!
+        ip = ""
+        port = None
+        anonymity = AnonymityLevel.UNKNOWN
+        country = None
+        for field in dataset:
+            if field[0] == 'IP Address':
+                # Make sure it is a Valid IP
+                ip = field[1].strip()  # String strip()
+                # Make sure it is a Valid IP
+                if not UrlParser.valid_ip(ip):
+                    logger.debug("IP with Invalid format: {}".format(ip))
+                    return None
+            elif field[0] == 'Port':
+                port = field[1].strip()  # String strip()
+            elif field[0] == 'Anonymity':
+                anonymity = AnonymityLevel.get(field[1].strip())  # String strip()
+            elif field[0] == 'Country':
+                country = field[1].strip()  # String strip()
+        return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country)
+
     def __str__(self):
         return "FreeProxy Parser of '{0}' with required bandwidth: '{1}' KBs" \
             .format(self.url, self.minimum_bandwidth_in_KBs)
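A minimal usage sketch (not part of this commit): after this change parse_proxyList() returns ProxyObject instances exposing get_address() instead of plain "http://ip:port" strings, and the constructor takes a provider id as its first argument. The module path follows the file path above; the id value, provider URL and timeout are illustrative placeholders, not values taken from the project.

    from http_request_randomizer.requests.parsers.FreeProxyParser import FreeProxyParser

    # Hypothetical provider id and URL, for illustration only
    parser = FreeProxyParser('FreeProxy', 'http://free-proxy-list.net', timeout=5)
    for proxy_obj in parser.parse_proxyList():
        # Each entry is now a ProxyObject rather than an "http://ip:port" string
        print(proxy_obj.get_address())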

http_request_randomizer/requests/parsers/ProxyForEuParser.py

Lines changed: 36 additions & 26 deletions
@@ -4,21 +4,22 @@
 from bs4 import BeautifulSoup
 
 from http_request_randomizer.requests.parsers.UrlParser import UrlParser
+from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel
 
 logger = logging.getLogger(__name__)
 __author__ = 'pgaref'
 
 
 class ProxyForEuParser(UrlParser):
-    def __init__(self, web_url, bandwithdh=None, timeout=None):
-        UrlParser.__init__(self, web_url, bandwithdh, timeout)
+    def __init__(self, id, web_url, bandwithdh=None, timeout=None):
+        UrlParser.__init__(self, id, web_url, bandwithdh, timeout)
 
     def parse_proxyList(self):
         curr_proxy_list = []
-        response = requests.get(self.get_URl(), timeout=self.timeout)
+        response = requests.get(self.get_url(), timeout=self.timeout)
 
         if not response.ok:
-            logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
+            logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
             return []
 
         content = response.content
@@ -34,31 +35,40 @@ def parse_proxyList(self):
             datasets.append(dataset)
 
         for dataset in datasets:
-            # Check Field[0] for tags and field[1] for values!
-            address = ""
-            proxy_straggler = False
-            for field in dataset:
-                # Discard slow proxies! Speed is in KB/s
-                if field[0] == 'Speed':
-                    if float(field[1]) < self.get_min_bandwidth():
-                        proxy_straggler = True
-                if field[0] == 'IP':
-                    # Make sure it is a Valid IP
-                    if not UrlParser.valid_ip(field[1]):
-                        logger.debug("IP with Invalid format: {}".format(field[1]))
-                        break
-                    else:
-                        address += field[1] + ':'
-                elif field[0] == 'Port':
-                    address += field[1]
             # Avoid Straggler proxies and make sure it is a Valid Proxy Address
-            if not proxy_straggler and UrlParser.valid_ip_port(address):
-                proxy = "http://" + address
-                curr_proxy_list.append(proxy.__str__())
-                # print "{0:<10}: {1}".format(field[0], field[1])
-        # print "ALL: ", curr_proxy_list
+            proxy_obj = self.create_proxy_object(dataset)
+            if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
+                curr_proxy_list.append(proxy_obj)
+            else:
+                logger.debug("Proxy Invalid: {}".format(dataset))
         return curr_proxy_list
 
+    def create_proxy_object(self, dataset):
+        ip = ""
+        port = None
+        anonymity = AnonymityLevel.UNKNOWN
+        country = None
+        # Check Field[0] for tags and field[1] for values!
+        for field in dataset:
+            # Discard slow proxies! Speed is in KB/s
+            if field[0] == 'Speed':
+                if float(field[1]) < self.get_min_bandwidth():
+                    logger.debug("Proxy with low bandwidth: {}".format(float(field[1])))
+                    return None
+            if field[0] == 'IP':
+                ip = field[1].strip()  # String strip()
+                # Make sure it is a Valid IP
+                if not UrlParser.valid_ip(ip):
+                    logger.debug("IP with Invalid format: {}".format(ip))
+                    return None
+            elif field[0] == 'Port':
+                port = field[1].strip()  # String strip()
+            elif field[0] == 'Anon':
+                anonymity = AnonymityLevel.get(field[1].strip())  # String strip()
+            elif field[0] == 'Country':
+                country = field[1].strip()  # String strip()
+        return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country)
+
     def __str__(self):
         return "ProxyForEU Parser of '{0}' with required bandwidth: '{1}' KBs" \
             .format(self.url, self.minimum_bandwidth_in_KBs)
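The parsers in this diff funnel provider-specific anonymity labels ('Anonymity' above, 'Anon' here, 'Status' below) through AnonymityLevel.get(). ProxyObject.py is not included in this diff, so the following is only a hedged sketch of what such a lookup could look like; the enum members and label strings are assumptions for illustration and may not match the project's actual implementation.

    from enum import Enum

    # Assumed members and labels; the real AnonymityLevel lives in
    # http_request_randomizer/requests/proxy/ProxyObject.py, which is not shown here.
    class AnonymityLevel(Enum):
        UNKNOWN = 0
        TRANSPARENT = 1
        ANONYMOUS = 2
        ELITE = 3

        @classmethod
        def get(cls, label):
            # Normalise a provider label such as 'elite proxy' or 'anonymous';
            # unrecognised labels fall back to UNKNOWN, matching the parsers' default.
            mapping = {
                'transparent': cls.TRANSPARENT,
                'anonymous': cls.ANONYMOUS,
                'elite': cls.ELITE,
                'elite proxy': cls.ELITE,
            }
            return mapping.get(str(label).strip().lower(), cls.UNKNOWN)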

http_request_randomizer/requests/parsers/RebroWeeblyParser.py

Lines changed: 48 additions & 15 deletions
@@ -4,54 +4,87 @@
 from bs4 import BeautifulSoup
 
 from http_request_randomizer.requests.parsers.UrlParser import UrlParser
+from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel
 
 logger = logging.getLogger(__name__)
 __author__ = 'pgaref'
 
 
 class RebroWeeblyParser(UrlParser):
-    def __init__(self, web_url, timeout=None):
+    def __init__(self, id, web_url, timeout=None):
         self.top_proxy_path = "proxy-list.html"
         self.txt_proxy_path = "txt-lists.html"
-        UrlParser.__init__(self, web_url, timeout)
+        UrlParser.__init__(self, id, web_url, timeout)
 
     def parse_proxyList(self, use_top15k=False):
         curr_proxy_list = []
-        response = requests.get(self.get_URl()+"/"+self.top_proxy_path, timeout=self.timeout)
+        response = requests.get(self.get_url() + "/" + self.top_proxy_path, timeout=self.timeout)
 
         if not response.ok:
-            logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
+            logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
             return []
 
         content = response.content
         soup = BeautifulSoup(content, "html.parser")
-        table = soup.find("div", attrs={"class": "paragraph", 'style': "text-align:left;"}).find('font', attrs={
-            'color': '#33a27f'})
+        all_divs = soup.findAll("div", attrs={"class": "paragraph", 'style': "text-align:left;"})
+        # address_table = soup.find("div", attrs={"class": "paragraph", 'style': "text-align:left;"})
+        # .find('font', attrs={'color': '#33a27f'})
         # Parse Top Proxy List page
-        for row in [x for x in table.contents if getattr(x, 'name', None) != 'br']:
+        address_list = []
+        country_list = []
+        anonymity_list = []
+        for div in all_divs:
+            address_div = div.find('font', attrs={'color': '#33a27f'})
+            if address_div is not None:
+                for row in [x for x in address_div.contents if getattr(x, 'name', None) != 'br']:
+                    address_list.append(str(row))
+            curr_div = div.findAll('font', attrs={'size': '2'})
+            if curr_div[0] is not None:
+                row_data = []
+                # font -> strong -> font
+                title = curr_div[0].contents[0].contents[0].contents[0]
+                for row in [x for x in curr_div[-1].contents if getattr(x, 'name', None) != 'br']:
+                    row_data.append(str(row))
+                if 'Country' in str(title):
+                    country_list.extend(row_data)
+                if 'Status' in str(title):
+                    anonymity_list.extend(row_data)
+        for address, country, anonymity in zip(address_list, country_list, anonymity_list):
             # Make sure it is a Valid Proxy Address
-            if UrlParser.valid_ip_port(row):
-                proxy = "http://" + row
-                curr_proxy_list.append(proxy.__str__())
+            proxy_obj = self.create_proxy_object(address, country, anonymity)
+            if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
+                curr_proxy_list.append(proxy_obj)
             else:
-                logger.debug("Address with Invalid format: {}".format(row))
+                logger.debug("Proxy Invalid: {}".format(row))
         # Usually these proxies are stale
         if use_top15k:
             # Parse 15k Nodes Text file (named *-all-*.txt)
-            content = requests.get(self.get_URl() + "/" + self.txt_proxy_path).content
+            content = requests.get(self.get_url() + "/" + self.txt_proxy_path).content
             soup = BeautifulSoup(content, "html.parser")
             table = soup.find("div", attrs={"class": "wsite-multicol-table-wrap"})
             for link in table.findAll('a'):
                 current_link = link.get('href')
                 if current_link is not None and "all" in current_link:
                     self.txt_proxy_path = current_link
-            more_content = requests.get(self.get_URl()+self.txt_proxy_path).text
+            more_content = requests.get(self.get_url() + self.txt_proxy_path).text
             for proxy_address in more_content.split():
                 if UrlParser.valid_ip_port(proxy_address):
-                    curr_proxy_list.append(proxy_address)
-
+                    proxy_obj = self.create_proxy_object(row)
+                    curr_proxy_list.append(proxy_obj)
         return curr_proxy_list
 
+    def create_proxy_object(self, address, country, anonymity):
+        # Make sure it is a Valid IP
+        ip = address.strip().split(":")[0]
+        if not UrlParser.valid_ip(ip):
+            logger.debug("IP with Invalid format: {}".format(ip))
+            return None
+        port = address.strip().split(":")[1]
+        country = country.strip()
+        anonymity = AnonymityLevel.get(anonymity.strip())
+
+        return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country)
+
     def __str__(self):
         return "RebroWeebly Parser of '{0}' with required bandwidth: '{1}' KBs" \
             .format(self.url, self.minimum_bandwidth_in_KBs)
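The parsers build ProxyObject(source=..., ip=..., port=..., anonymity_level=..., country=...) and validate it through get_address(), but ProxyObject.py itself is not part of this diff. A hedged sketch of the shape those call sites imply, for orientation only; the real class may carry additional fields and behaviour.

    # Sketch inferred from the call sites above; everything beyond the constructor
    # arguments and the "ip:port" format of get_address() is an assumption.
    class ProxyObject(object):
        def __init__(self, source, ip, port, anonymity_level, country=None):
            self.source = source
            self.ip = ip
            self.port = port
            self.anonymity_level = anonymity_level
            self.country = country

        def get_address(self):
            # The parsers pass this to UrlParser.valid_ip_port(), so "ip:port" is assumed
            return "{0}:{1}".format(self.ip, self.port)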

http_request_randomizer/requests/parsers/SamairProxyParser.py

Lines changed: 49 additions & 10 deletions
@@ -4,27 +4,30 @@
 from bs4 import BeautifulSoup
 
 from http_request_randomizer.requests.parsers.UrlParser import UrlParser
+from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel
 
 logger = logging.getLogger(__name__)
 __author__ = 'pgaref'
 
 
 # Samair Proxy now renamed to: premproxy.com
 class SamairProxyParser(UrlParser):
-    def __init__(self, web_url, timeout=None):
+    def __init__(self, id, web_url, timeout=None):
         web_url += "/list/"
-        UrlParser.__init__(self, web_url, timeout)
+        UrlParser.__init__(self, id, web_url, timeout)
 
     def parse_proxyList(self):
         curr_proxy_list = []
         # Parse all proxy pages -> format: /list/{num}.htm
-        # TODO: get the pageRange from the 'pagination' table
-        for page in range(1, 21):
-            response = requests.get("{0}{num:02d}.htm".format(self.get_URl(), num=page), timeout=self.timeout)
+        # Get the pageRange from the 'pagination' table
+        page_set = self.get_pagination_set()
+        logger.debug("Pages: {}".format(page_set))
+        for page in page_set:
+            response = requests.get("{0}{1}".format(self.get_url(), page), timeout=self.timeout)
             if not response.ok:
                 # Could not parse ANY page - Let user know
                 if not curr_proxy_list:
-                    logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
+                    logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
                     # Return proxies parsed so far
                     return curr_proxy_list
             content = response.content
@@ -49,13 +52,49 @@ def parse_proxyList(self):
             for row in table.find_all("tr")[1:]:
                 td_row = row.find("td")
                 # curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
+                proxy_obj = self.create_proxy_object(row)
                 # Make sure it is a Valid Proxy Address
-                if UrlParser.valid_ip_port(td_row.text):
-                    curr_proxy_list.append('http://' + td_row.text)
+                if proxy_obj is not None and UrlParser.valid_ip_port(td_row.text):
+                    curr_proxy_list.append(proxy_obj)
                 else:
-                    logger.debug("Address with Invalid format: {}".format(td_row.text))
+                    logger.debug("Proxy Invalid: {}".format(td_row.text))
         return curr_proxy_list
 
+    def get_pagination_set(self):
+        response = requests.get(self.get_url(), timeout=self.timeout)
+        page_set = set()
+        # Could not parse pagination page - Let user know
+        if not response.ok:
+            logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
+            return page_set
+        content = response.content
+        soup = BeautifulSoup(content, "html.parser")
+        for ultag in soup.find_all('ul', {'class': 'pagination'}):
+            for litag in ultag.find_all('li'):
+                page_ref = litag.a.get('href')
+                # Skip current page '/list'
+                if page_ref.endswith(('htm', 'html')):
+                    page_set.add(page_ref)
+                else:
+                    page_set.add("")
+        return page_set
+
+    def create_proxy_object(self, row):
+        for td_row in row.findAll("td"):
+            if td_row.attrs['data-label'] == 'IP:port ':
+                text = td_row.text.strip()
+                ip = text.split(":")[0]
+                # Make sure it is a Valid IP
+                if not UrlParser.valid_ip(ip):
+                    logger.debug("IP with Invalid format: {}".format(ip))
+                    return None
+                port = text.split(":")[1]
+            elif td_row.attrs['data-label'] == 'Anonymity Type: ':
+                anonymity = AnonymityLevel.get(td_row.text.strip())
+            elif td_row.attrs['data-label'] == 'Country: ':
+                country = td_row.text.strip()
+        return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country)
+
     def __str__(self):
         return "SemairProxy Parser of '{0}' with required bandwidth: '{1}' KBs" \
-            .format(self.url, self.minimum_bandwidth_in_KBs)
+            .format(self.url, self.minimum_bandwidth_in_KBs)
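get_pagination_set() above collects the relative page links from the provider's 'pagination' list and maps the current '/list' entry to an empty string, so the base listing page is fetched alongside the numbered pages. A small illustration of that extraction on a hand-written HTML fragment (not fetched from premproxy.com; the real markup may differ):

    from bs4 import BeautifulSoup

    # Hand-written fragment for illustration only
    html = """
    <ul class="pagination">
      <li><a href="/list">1</a></li>
      <li><a href="02.htm">2</a></li>
      <li><a href="03.htm">3</a></li>
    </ul>
    """
    page_set = set()
    soup = BeautifulSoup(html, "html.parser")
    for ultag in soup.find_all('ul', {'class': 'pagination'}):
        for litag in ultag.find_all('li'):
            page_ref = litag.a.get('href')
            # Links ending in htm/html are extra pages; the current '/list' page maps to ""
            if page_ref.endswith(('htm', 'html')):
                page_set.add(page_ref)
            else:
                page_set.add("")
    print(page_set)  # e.g. {'', '02.htm', '03.htm'} (set order may vary)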
