
Commit 8358574

Some work on #3 and #4. Created a parser abstraction: each web URL is now handled by a separate class extending UrlParser, which should make adding new proxy providers much simpler. Also created a custom ParserException as a first step towards custom exception handling.
1 parent 9cc1278 commit 8358574

10 files changed (+221 additions, −110 deletions)

README.md

Lines changed: 8 additions & 1 deletion
@@ -21,7 +21,7 @@ Surprisingly, the only thing that tells a server the application triggered the r
 
 ## The source code
 
-The project code in this repository is crawling three different public proxy websites:
+The project code in this repository is crawling **four** different public proxy websites:
 * http://proxyfor.eu/geo.php
 * http://free-proxy-list.net
 * http://rebro.weebly.com/proxy-list.html
@@ -31,3 +31,10 @@ After collecting the proxy data and filtering the slowest ones it is randomly se
 The request timeout is configured at 30 seconds and if the proxy fails to return a response it is deleted from the application proxy list.
 I have to mention that for each request a different agent header is used. The different headers are stored in the **/data/user_agents.txt** file which contains around 900 different agents.
 
+## Contributing
+
+Contributions are always welcome! Feel free to send a pull request.
+
+## Faced an issue?
+
+Open an issue [here](https://github.com/pgaref/HTTP_Request_Randomizer/issues), and be as detailed as possible :)
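
The two behaviours described in the README above (a rotated User-Agent per request and a 30-second proxy timeout) can be pictured with the following minimal sketch. It is illustrative only, not the repository's implementation; the helper names and the relative path to the agents file are assumptions.

import random
import requests

def load_user_agents(path='data/user_agents.txt'):
    # One User-Agent string per line, as described in the README (illustrative helper).
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]

def fetch_via_proxy(url, proxy, agents):
    # Pick a different agent header for each request and enforce the 30-second timeout.
    headers = {'User-Agent': random.choice(agents)}
    return requests.get(url, proxies={'http': proxy}, headers=headers, timeout=30)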
project/http/requests/errors/ParserExceptions.py

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+class ParserException(Exception):
+    def __init__(self, dErrorArguments):
+        Exception.__init__(self, " was raised with arguments {0}".format(dErrorArguments))
+        self.dErrorArguments = dErrorArguments
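
With ParserException in place as the first step towards custom exception handling, callers can start separating parser failures from other errors. A small sketch; the parser variable stands for any UrlParser subclass instance and is assumed, not part of this file:

from project.http.requests.errors.ParserExceptions import ParserException

try:
    proxies = parser.parse_proxyList()  # 'parser' is any UrlParser subclass instance (assumed)
except ParserException as e:
    print "Parser failed: {0}".format(e)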

project/http/requests/errors/__init__.py

Whitespace-only changes.
project/http/requests/parsers/UrlParser.py

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+from project.http.requests.errors.ParserExceptions import ParserException
+
+class UrlParser(object):
+    """
+    An abstract class representing any URL containing proxy information.
+    To add an extra proxy URL just implement this class and provide a URL-specific parse_proxyList method.
+
+    Attributes:
+        url: the site URL (http)
+        bandwidth_limit_in_KBs: used to remove straggling proxies when speed data is provided by the URL
+    """
+
+    def __init__(self, web_url, limitinKBs=None):
+        self.url = web_url
+        if limitinKBs is not None:
+            self.bandwidth_limit_in_KBs = limitinKBs
+        else:
+            self.bandwidth_limit_in_KBs = 150
+
+    def get_URl(self):
+        if self.url is None:
+            raise ParserException("webURL is NONE")
+        return self.url
+
+    def get_bandwidthLimit(self):
+        if self.bandwidth_limit_in_KBs <= 0:
+            raise ParserException("invalid bandwidth limit {0} ".format(self.bandwidth_limit_in_KBs))
+        return self.bandwidth_limit_in_KBs
+
+    def parse_proxyList(self):
+        raise ParserException("abstract method should be implemented by each subclass")
+
+    def __str__(self):
+        return "URL Parser of '{0}' with bandwidth limit at '{1}' KBs"\
+            .format(self.url, self.bandwidth_limit_in_KBs)
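
As the docstring above says, adding a new proxy provider now comes down to subclassing UrlParser and implementing a URL-specific parse_proxyList. A minimal sketch of such a subclass; the provider name and the markup it assumes are hypothetical and only illustrate the extension point:

from project.http.requests.parsers.UrlParser import UrlParser
import requests
from bs4 import BeautifulSoup


class exampleproxyParser(UrlParser):
    # Hypothetical provider, shown only to illustrate the UrlParser extension point.

    def __init__(self, web_url):
        UrlParser.__init__(self, web_url)

    def parse_proxyList(self):
        curr_proxy_list = []
        content = requests.get(self.get_URl()).content
        soup = BeautifulSoup(content, "html.parser")
        # Assumed markup: one "<ip>:<port>" entry per <li> element.
        for row in soup.find_all("li"):
            curr_proxy_list.append("http://" + row.get_text().strip())
        return curr_proxy_list

RequestProxy (see requestProxy.py below) would then only need to append an instance of the new class to its parsers list.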

project/http/requests/parsers/__init__.py

Whitespace-only changes.
project/http/requests/parsers/freeproxyParser.py

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+from project.http.requests.parsers.UrlParser import UrlParser
+import requests
+from bs4 import BeautifulSoup
+
+
+class freeproxyParser(UrlParser):
+
+    def __init__(self, web_url):
+        UrlParser.__init__(self, web_url)
+
+    def parse_proxyList(self):
+        curr_proxy_list = []
+        content = requests.get(self.get_URl()).content
+        soup = BeautifulSoup(content, "html.parser")
+        table = soup.find("table", attrs={"class": "display fpltable"})
+
+        # The first tr contains the field names.
+        headings = [th.get_text() for th in table.find("tr").find_all("th")]
+
+        datasets = []
+        for row in table.find_all("tr")[1:]:
+            dataset = zip(headings, (td.get_text() for td in row.find_all("td")))
+            datasets.append(dataset)
+
+        for dataset in datasets:
+            # Check Field[0] for tags and field[1] for values!
+            proxy = "http://"
+            for field in dataset:
+                if field[0] == 'IP Address':
+                    proxy = proxy + field[1] + ':'
+                elif field[0] == 'Port':
+                    proxy = proxy + field[1]
+            curr_proxy_list.append(proxy.__str__())
+            # print "{0:<10}: {1}".format(field[0], field[1])
+        # print "ALL: ", curr_proxy_list
+        return curr_proxy_list
+
+    def __str__(self):
+        return "FreeProxy Parser of '{0}' with bandwidth limit at '{1}' KBs" \
+            .format(self.url, self.bandwidth_limit_in_KBs)
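
A quick usage sketch, matching how requestProxy.py below drives this parser; the example result is illustrative and assumes the site still serves its table with the "display fpltable" class:

parser = freeproxyParser('http://free-proxy-list.net')
print parser                           # FreeProxy Parser of 'http://free-proxy-list.net' with bandwidth limit at '150' KBs
proxy_list = parser.parse_proxyList()  # e.g. ['http://1.2.3.4:8080', ...]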
project/http/requests/parsers/proxyforeuParser.py

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
+from project.http.requests.parsers.UrlParser import UrlParser
+import requests
+from bs4 import BeautifulSoup
+
+class proxyforeuParser(UrlParser):
+
+    def __init__(self, web_url, bandwithdh=None):
+        UrlParser.__init__(self, web_url, bandwithdh)
+
+    def parse_proxyList(self):
+        curr_proxy_list = []
+        content = requests.get(self.get_URl()).content
+        soup = BeautifulSoup(content, "html.parser")
+        table = soup.find("table", attrs={"class": "proxy_list"})
+
+        # The first tr contains the field names.
+        headings = [th.get_text() for th in table.find("tr").find_all("th")]
+
+        datasets = []
+        for row in table.find_all("tr")[1:]:
+            dataset = zip(headings, (td.get_text() for td in row.find_all("td")))
+            datasets.append(dataset)
+
+        for dataset in datasets:
+            # Check Field[0] for tags and field[1] for values!
+            proxy = "http://"
+            proxy_straggler = False
+            for field in dataset:
+                # Discard slow proxies! Speed is in KB/s
+                if field[0] == 'Speed':
+                    if float(field[1]) < self.get_bandwidthLimit():
+                        proxy_straggler = True
+                if field[0] == 'IP':
+                    proxy = proxy + field[1] + ':'
+                elif field[0] == 'Port':
+                    proxy = proxy + field[1]
+            # Avoid Straggler proxies
+            if not proxy_straggler:
+                curr_proxy_list.append(proxy.__str__())
+            # print "{0:<10}: {1}".format(field[0], field[1])
+        # print "ALL: ", curr_proxy_list
+        return curr_proxy_list
+
+    def __str__(self):
+        return "ProxyForEU Parser of '{0}' with bandwidth limit at '{1}' KBs" \
+            .format(self.url, self.bandwidth_limit_in_KBs)
project/http/requests/parsers/rebroweeblyParser.py

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+from project.http.requests.parsers.UrlParser import UrlParser
+import requests
+from bs4 import BeautifulSoup
+
+class rebroweeblyParser(UrlParser):
+
+    def __init__(self, web_url):
+        UrlParser.__init__(self, web_url)
+
+    def parse_proxyList(self):
+        curr_proxy_list = []
+        content = requests.get(self.get_URl()).content
+        soup = BeautifulSoup(content, "html.parser")
+        table = soup.find("div", attrs={"class": "paragraph", 'style': "text-align:left;"}).find('font', attrs={'color': '#33a27f'})
+
+        for row in [x for x in table.contents if getattr(x, 'name', None) != 'br']:
+            proxy = "http://" + row
+            curr_proxy_list.append(proxy.__str__())
+        return curr_proxy_list
+
+    def __str__(self):
+        return "RebroWeebly Parser of '{0}' with bandwidth limit at '{1}' KBs" \
+            .format(self.url, self.bandwidth_limit_in_KBs)
project/http/requests/parsers/samairproxyParser.py

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+from project.http.requests.parsers.UrlParser import UrlParser
+import requests
+from bs4 import BeautifulSoup
+
+class semairproxyParser(UrlParser):
+
+    def __init__(self, web_url):
+        UrlParser.__init__(self, web_url)
+
+    def parse_proxyList(self):
+        curr_proxy_list = []
+        content = requests.get(self.get_URl()).content
+        soup = BeautifulSoup(content, "html.parser")
+        # The CSS provides the port numbers, so we reverse-map them from the stylesheet
+        for href in soup.findAll('link'):
+            if '/styles/' in href.get('href'):
+                style = "http://www.samair.ru" + href.get('href')
+                break
+        css = requests.get(style).content.split('\n')
+        css.pop()
+        ports = {}
+        for l in css:
+            p = l.split(' ')
+            key = p[0].split(':')[0][1:]
+            value = p[1].split('\"')[1]
+            ports[key] = value
+
+        table = soup.find("table", attrs={"id": "proxylist"})
+        # The first tr contains the field names.
+        headings = [th.get_text() for th in table.find("tr").find_all("th")]
+        for row in table.find_all("span")[1:]:
+            curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
+
+        return curr_proxy_list
+
+    def __str__(self):
+        return "SemairProxy Parser of '{0}' with bandwidth limit at '{1}' KBs" \
+            .format(self.url, self.bandwidth_limit_in_KBs)
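
To make the CSS reverse-mapping above concrete: each stylesheet line is expected to map a span class to a port, and the loop splits out the class name and the quoted port text. A standalone sketch of just that step, using made-up CSS lines:

# Made-up CSS lines; the parsing mirrors the loop in parse_proxyList above.
css = ['.c77:after {content:"8080"}', '.x9f:after {content:"3128"}']
ports = {}
for l in css:
    p = l.split(' ')
    key = p[0].split(':')[0][1:]  # class name without the leading dot, e.g. 'c77'
    value = p[1].split('"')[1]    # the quoted port text, e.g. '8080'
    ports[key] = value
print ports  # e.g. {'c77': '8080', 'x9f': '3128'}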

project/http/requests/proxy/requestProxy.py

Lines changed: 25 additions & 109 deletions
@@ -1,3 +1,8 @@
+from project.http.requests.parsers.freeproxyParser import freeproxyParser
+from project.http.requests.parsers.proxyforeuParser import proxyforeuParser
+from project.http.requests.parsers.rebroweeblyParser import rebroweeblyParser
+from project.http.requests.parsers.samairproxyParser import semairproxyParser
+
 __author__ = 'pgaref'
 
 import requests
@@ -15,14 +20,24 @@ class RequestProxy:
     def __init__(self, web_proxy_list=[]):
         self.useragents = self.load_user_agents(RequestProxy.agent_file)
         #####
-        # Proxy format:
+        # Each of the classes below implements a specific URL Parser
         # http://<USERNAME>:<PASSWORD>@<IP-ADDR>:<PORT>
         #####
+        parsers = []
+        parsers.append(freeproxyParser('http://free-proxy-list.net'))
+        parsers.append(proxyforeuParser('http://proxyfor.eu/geo.php', 100.0))
+        parsers.append(rebroweeblyParser('http://rebro.weebly.com/proxy-list.html'))
+        parsers.append(semairproxyParser('http://www.samair.ru/proxy/time-01.htm'))
+
+        print "=== Initialized Proxy Parsers ==="
+        for i in range(len(parsers)):
+            print "\t {0}".format(parsers[i].__str__())
+        print "================================="
+
+        self.parsers = parsers
         self.proxy_list = web_proxy_list
-        self.proxy_list += self.proxyForEU_url_parser('http://proxyfor.eu/geo.php', 100.0)
-        self.proxy_list += self.freeProxy_url_parser('http://free-proxy-list.net')
-        self.proxy_list += self.weebly_url_parser('http://rebro.weebly.com/proxy-list.html')
-        self.proxy_list += self.samair_url_parser('http://www.samair.ru/proxy/time-01.htm')
+        for i in range(len(parsers)):
+            self.proxy_list += parsers[i].parse_proxyList()
 
 
     def get_proxy_list(self):
@@ -57,117 +72,18 @@ def generate_random_request_headers(self):
         } # select a random user agent
         return headers
 
-    def proxyForEU_url_parser(self, web_url, speed_in_KBs=100.0):
-        curr_proxy_list = []
-        content = requests.get(web_url).content
-        soup = BeautifulSoup(content, "html.parser")
-        table = soup.find("table", attrs={"class": "proxy_list"})
-
-        # The first tr contains the field names.
-        headings = [th.get_text() for th in table.find("tr").find_all("th")]
-
-        datasets = []
-        for row in table.find_all("tr")[1:]:
-            dataset = zip(headings, (td.get_text() for td in row.find_all("td")))
-            datasets.append(dataset)
-
-        for dataset in datasets:
-            # Check Field[0] for tags and field[1] for values!
-            proxy = "http://"
-            proxy_straggler = False
-            for field in dataset:
-                # Discard slow proxies! Speed is in KB/s
-                if field[0] == 'Speed':
-                    if float(field[1]) < speed_in_KBs:
-                        proxy_straggler = True
-                if field[0] == 'IP':
-                    proxy = proxy+field[1]+':'
-                elif field[0] == 'Port':
-                    proxy = proxy+field[1]
-            # Avoid Straggler proxies
-            if not proxy_straggler:
-                curr_proxy_list.append(proxy.__str__())
-            #print "{0:<10}: {1}".format(field[0], field[1])
-        #print "ALL: ", curr_proxy_list
-        return curr_proxy_list
-
-    def freeProxy_url_parser(self, web_url):
-        curr_proxy_list = []
-        content = requests.get(web_url).content
-        soup = BeautifulSoup(content, "html.parser")
-        table = soup.find("table", attrs={"class": "display fpltable"})
-
-        # The first tr contains the field names.
-        headings = [th.get_text() for th in table.find("tr").find_all("th")]
-
-        datasets = []
-        for row in table.find_all("tr")[1:]:
-            dataset = zip(headings, (td.get_text() for td in row.find_all("td")))
-            datasets.append(dataset)
-
-        for dataset in datasets:
-            # Check Field[0] for tags and field[1] for values!
-            proxy = "http://"
-            for field in dataset:
-                if field[0] == 'IP Address':
-                    proxy = proxy+field[1]+':'
-                elif field[0] == 'Port':
-                    proxy = proxy+field[1]
-            curr_proxy_list.append(proxy.__str__())
-            #print "{0:<10}: {1}".format(field[0], field[1])
-        #print "ALL: ", curr_proxy_list
-        return curr_proxy_list
-
-    def weebly_url_parser(self, web_url):
-        curr_proxy_list = []
-        content = requests.get(web_url).content
-        soup = BeautifulSoup(content, "html.parser")
-        table = soup.find("div", attrs={"class": "paragraph", 'style': "text-align:left;"}).find('font', attrs={'color': '#33a27f'})
-
-        for row in [x for x in table.contents if getattr(x, 'name', None) != 'br']:
-            proxy = "http://" + row
-            curr_proxy_list.append(proxy.__str__())
-        return curr_proxy_list
-
-    def samair_url_parser(self, web_url, speed_in_KBs=100.0):
-        curr_proxy_list = []
-        content = requests.get(web_url).content
-        soup = BeautifulSoup(content, "html.parser")
-        # css provides the port number so we reverse it
-        for href in soup.findAll('link'):
-            if '/styles/' in href.get('href'):
-                style = "http://www.samair.ru" + href.get('href')
-                break
-        css = requests.get(style).content.split('\n')
-        css.pop()
-        ports = {}
-        for l in css:
-            p = l.split(' ')
-            key = p[0].split(':')[0][1:]
-            value = p[1].split('\"')[1]
-            ports[key] = value
-
-        table = soup.find("table", attrs={"id": "proxylist"})
-
-        # The first tr contains the field names.
-        headings = [th.get_text() for th in table.find("tr").find_all("th")]
-
-        for row in table.find_all("span")[1:]:
-            curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
-
-        return curr_proxy_list
-
+    #####
+    # Proxy format:
+    # http://<USERNAME>:<PASSWORD>@<IP-ADDR>:<PORT>
+    #####
     def generate_proxied_request(self, url, params={}, req_timeout=30):
-        #if len(self.proxy_list) < 2:
-        #    self.proxy_list += self.proxyForEU_url_parser('http://proxyfor.eu/geo.php')
-
         random.shuffle(self.proxy_list)
         req_headers = dict(params.items() + self.generate_random_request_headers().items())
 
         request = None
         try:
             rand_proxy = random.choice(self.proxy_list)
-            print "Next proxy: " + str(rand_proxy)
+            print "Using proxy: " + str(rand_proxy)
             request = requests.get(url, proxies={"http": rand_proxy},
                                     headers=req_headers, timeout=req_timeout)
         except ConnectionError:
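
For completeness, a minimal usage sketch of the refactored class; the import path matches the file above, while the target URL is a placeholder:

from project.http.requests.proxy.requestProxy import RequestProxy

req_proxy = RequestProxy()  # builds the four parsers and aggregates their proxy lists
response = req_proxy.generate_proxied_request('http://example.com')
if response is not None:
    print response.status_code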

0 commit comments