Skip to content

Commit 30fd436

Browse files
committed
Finalising url abstraction - fixing printout format and adding author doc
1 parent 8358574 commit 30fd436

File tree

6 files changed

+36
-31
lines changed

6 files changed

+36
-31
lines changed
Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,37 @@
11
from project.http.requests.errors.ParserExceptions import ParserException
22

3+
__author__ = 'pgaref'
4+
35
class UrlParser(object):
    """Abstract base class describing a URL that serves proxy information.

    Each concrete proxy provider subclasses this and supplies a
    site-specific ``parse_proxyList`` implementation.

    Attributes:
        url -- the provider's site URL (http)
        minimum_bandwidth_in_KBs -- threshold used to drop straggling
            proxies when the provider exposes bandwidth info (default 150)
    """

    def __init__(self, web_url, bandwidthKBs=None):
        self.url = web_url
        # Default to a 150 KB/s minimum when the caller supplies no limit.
        if bandwidthKBs is None:
            self.minimum_bandwidth_in_KBs = 150
        else:
            self.minimum_bandwidth_in_KBs = bandwidthKBs

    def get_URl(self):
        # Guard against a parser constructed without a target site.
        if self.url is None:
            raise ParserException("webURL is NONE")
        return self.url

    def get_min_bandwidth(self):
        # A negative threshold is meaningless; surface it loudly.
        if self.minimum_bandwidth_in_KBs < 0:
            raise ParserException("invalid minimum bandwidth limit {0} ".format(self.minimum_bandwidth_in_KBs))
        return self.minimum_bandwidth_in_KBs

    def parse_proxyList(self):
        # Abstract: concrete providers must override this.
        raise ParserException(" abstract method should be implemented by each subclass")

    def __str__(self):
        return "URL Parser of '{0}' with required bandwidth: '{1}' KBs"\
            .format(self.url, self.minimum_bandwidth_in_KBs)

project/http/requests/parsers/freeproxyParser.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import requests
33
from bs4 import BeautifulSoup
44

5+
__author__ = 'pgaref'
56

67
class freeproxyParser(UrlParser):
78

@@ -36,6 +37,6 @@ def parse_proxyList(self):
3637
return curr_proxy_list
3738

3839
def __str__(self):
39-
return "FreeProxy Parser of '{0}' with bandwidth limit at '{1}' KBs" \
40-
.format(self.url, self.bandwidth_limit_in_KBs)
40+
return "FreeProxy Parser of '{0}' with required bandwidth: '{1}' KBs" \
41+
.format(self.url, self.minimum_bandwidth_in_KBs)
4142

project/http/requests/parsers/proxyforeuParser.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
import requests
33
from bs4 import BeautifulSoup
44

5+
__author__ = 'pgaref'
6+
57
class proxyforeuParser(UrlParser):
68

79
def __init__(self, web_url, bandwithdh=None):
@@ -28,7 +30,7 @@ def parse_proxyList(self):
2830
for field in dataset:
2931
# Discard slow proxies! Speed is in KB/s
3032
if field[0] == 'Speed':
31-
if float(field[1]) < self.get_bandwidthLimit():
33+
if float(field[1]) < self.get_min_bandwidth():
3234
proxy_straggler = True
3335
if field[0] == 'IP':
3436
proxy = proxy + field[1] + ':'
@@ -42,5 +44,5 @@ def parse_proxyList(self):
4244
return curr_proxy_list
4345

4446
def __str__(self):
45-
return "ProxyForEU Parser of '{0}' with bandwidth limit at '{1}' KBs" \
46-
.format(self.url, self.bandwidth_limit_in_KBs)
47+
return "ProxyForEU Parser of '{0}' with required bandwidth: '{1}' KBs" \
48+
.format(self.url, self.minimum_bandwidth_in_KBs)

project/http/requests/parsers/rebroweeblyParser.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
import requests
33
from bs4 import BeautifulSoup
44

5+
__author__ = 'pgaref'
6+
57
class rebroweeblyParser(UrlParser):
68

79
def __init__(self, web_url):
@@ -20,5 +22,5 @@ def parse_proxyList(self):
2022
return curr_proxy_list
2123

2224
def __str__(self):
23-
return "RebroWeebly Parser of '{0}' with bandwidth limit at '{1}' KBs" \
24-
.format(self.url, self.bandwidth_limit_in_KBs)
25+
return "RebroWeebly Parser of '{0}' with required bandwidth: '{1}' KBs" \
26+
.format(self.url, self.minimum_bandwidth_in_KBs)

project/http/requests/parsers/samairproxyParser.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
import requests
33
from bs4 import BeautifulSoup
44

5+
__author__ = 'pgaref'
6+
57
class semairproxyParser(UrlParser):
68

79
def __init__(self, web_url):
@@ -34,5 +36,5 @@ def parse_proxyList(self):
3436
return curr_proxy_list
3537

3638
def __str__(self):
37-
return "SemairProxy Parser of '{0}' with bandwidth limit at '{1}' KBs" \
38-
.format(self.url, self.bandwidth_limit_in_KBs)
39+
return "SemairProxy Parser of '{0}' with required bandwidth: '{1}' KBs" \
40+
.format(self.url, self.minimum_bandwidth_in_KBs)

project/http/requests/proxy/requestProxy.py

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,26 +2,22 @@
22
from project.http.requests.parsers.proxyforeuParser import proxyforeuParser
33
from project.http.requests.parsers.rebroweeblyParser import rebroweeblyParser
44
from project.http.requests.parsers.samairproxyParser import semairproxyParser
5-
6-
__author__ = 'pgaref'
7-
85
import requests
96
from requests.exceptions import ConnectionError
107
import random
11-
import os
128
import time
13-
from bs4 import BeautifulSoup
149
from requests.exceptions import ReadTimeout
1510

11+
__author__ = 'pgaref'
1612

1713
class RequestProxy:
1814
agent_file = '../data/user_agents.txt'
1915

2016
def __init__(self, web_proxy_list=[]):
2117
self.useragents = self.load_user_agents(RequestProxy.agent_file)
18+
2219
#####
2320
# Each of the classes below implements a specific URL Parser
24-
# http://<USERNAME>:<PASSWORD>@<IP-ADDR>:<PORT>
2521
#####
2622
parsers = []
2723
parsers.append(freeproxyParser('http://free-proxy-list.net'))
@@ -83,24 +79,24 @@ def generate_proxied_request(self, url, params={}, req_timeout=30):
8379
request = None
8480
try:
8581
rand_proxy = random.choice(self.proxy_list)
86-
print "Using proxy: " + str(rand_proxy)
82+
print "Using proxy: {0}".format(str(rand_proxy))
8783
request = requests.get(test_url, proxies={"http": rand_proxy},
8884
headers=req_headers, timeout=req_timeout)
8985
except ConnectionError:
9086
self.proxy_list.remove(rand_proxy)
91-
print "Proxy unreachable - Removed Straggling proxy :", rand_proxy, " PL Size = ",len(self.proxy_list)
87+
print "Proxy unreachable - Removed Straggling proxy: {0} PL Size = {1}".format(rand_proxy, len(self.proxy_list))
9288
pass
9389
except ReadTimeout:
9490
self.proxy_list.remove(rand_proxy)
95-
print "Read timed out - Removed Straggling proxy :", rand_proxy, " PL Size = ", len(self.proxy_list)
91+
print "Read timed out - Removed Straggling proxy: {0} PL Size = {1}".format(rand_proxy, len(self.proxy_list))
9692
pass
9793
return request
9894

9995
if __name__ == '__main__':
10096

10197
start = time.time()
10298
req_proxy = RequestProxy()
103-
print "Initialization took: ", (time.time()-start)
99+
print "Initialization took: {0} sec".format((time.time()-start))
104100
print "Size : ", len(req_proxy.get_proxy_list())
105101
print " ALL = ", req_proxy.get_proxy_list()
106102

@@ -109,7 +105,7 @@ def generate_proxied_request(self, url, params={}, req_timeout=30):
109105
while True:
110106
start = time.time()
111107
request = req_proxy.generate_proxied_request(test_url)
112-
print "Proxied Request Took: ", (time.time()-start), " => Status: ", request.__str__()
108+
print "Proxied Request Took: {0} sec => Status: {1}".format((time.time()-start), request.__str__())
113109
print "Proxy List Size: ", len(req_proxy.get_proxy_list())
114110

115111
print"-> Going to sleep.."

0 commit comments

Comments (0)