 import requests
 from bs4 import BeautifulSoup
+from requests import ConnectionError

 from http_request_randomizer.requests.parsers.UrlParser import UrlParser

 logger = logging.getLogger(__name__)
 __author__ = 'pgaref'


+# Samair Proxy now renamed to: premproxy.com
 class SamairProxyParser(UrlParser):
     def __init__(self, web_url, timeout=None):
+        web_url += "/list/"
         UrlParser.__init__(self, web_url, timeout)

     def parse_proxyList(self):
         curr_proxy_list = []
-        response = requests.get(self.get_URl(), timeout=self.timeout)
-
-        if not response.ok:
-            logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
-            return []
-
-        content = response.content
-        soup = BeautifulSoup(content, "html.parser")
-        # css provides the port number so we reverse it
-        # for href in soup.findAll('link'):
-        #     if '/styles/' in href.get('href'):
-        #         style = "http://www.samair.ru" + href.get('href')
-        #         break
-        # css = requests.get(style).content.split('\n')
-        # css.pop()
-        # ports = {}
-        # for l in css:
-        #     p = l.split(' ')
-        #     key = p[0].split(':')[0][1:]
-        #     value = p[1].split('\"')[1]
-        #     ports[key] = value
-
-        table = soup.find("div", attrs={"id": "proxylist"})
-        # The first tr contains the field names.
-        headings = [th.get_text() for th in table.find("tr").find_all("th")]
-        for row in table.find_all("tr")[1:]:
-            td_row = row.find("td")
-            # curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
-            # Make sure it is a Valid Proxy Address
-            if UrlParser.valid_ip_port(td_row.text):
-                curr_proxy_list.append('http://' + td_row.text)
-            else:
-                logger.debug("Address with Invalid format: {}".format(td_row.text))
-
+        # Parse all proxy pages -> format: /list/{num}.htm
+        # TODO: get the pageRange from the 'pagination' table
+        for page in range(1, 21):
+            response = requests.get("{0}{num:02d}.htm".format(self.get_URl(), num=page), timeout=self.timeout)
+            if not response.ok:
+                # Could not parse ANY page - Let user know
+                if not curr_proxy_list:
+                    logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
+                # Return proxies parsed so far
+                return curr_proxy_list
+            content = response.content
+            soup = BeautifulSoup(content, "html.parser")
+            # css provides the port number so we reverse it
+            # for href in soup.findAll('link'):
+            #     if '/styles/' in href.get('href'):
+            #         style = "http://www.samair.ru" + href.get('href')
+            #         break
+            # css = requests.get(style).content.split('\n')
+            # css.pop()
+            # ports = {}
+            # for l in css:
+            #     p = l.split(' ')
+            #     key = p[0].split(':')[0][1:]
+            #     value = p[1].split('\"')[1]
+            #     ports[key] = value
+
+            table = soup.find("div", attrs={"id": "proxylist"})
+            # The first tr contains the field names.
+            headings = [th.get_text() for th in table.find("tr").find_all("th")]
+            for row in table.find_all("tr")[1:]:
+                td_row = row.find("td")
+                # curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
+                # Make sure it is a Valid Proxy Address
+                if UrlParser.valid_ip_port(td_row.text):
+                    curr_proxy_list.append('http://' + td_row.text)
+                else:
+                    logger.debug("Address with Invalid format: {}".format(td_row.text))
         return curr_proxy_list

     def __str__(self):
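For context, a minimal usage sketch of the patched parser follows. It assumes the module path matches the class name, as with the other parsers in http_request_randomizer, and uses an illustrative provider URL and timeout; none of these values appear verbatim in this diff.

    from http_request_randomizer.requests.parsers.SamairProxyParser import SamairProxyParser

    # Illustrative base URL (assumption): __init__ appends "/list/", so
    # parse_proxyList() requests pages such as .../list/01.htm up to .../list/20.htm
    parser = SamairProxyParser("https://premproxy.com", timeout=5)

    proxies = parser.parse_proxyList()  # e.g. ['http://1.2.3.4:8080', ...]
    for proxy in proxies:
        print(proxy)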