
import requests
from bs4 import BeautifulSoup
+from requests import ConnectionError

from http_request_randomizer.requests.parsers.UrlParser import UrlParser

logger = logging.getLogger(__name__)
__author__ = 'pgaref'


+# Samair Proxy now renamed to: premproxy.com
class SamairProxyParser(UrlParser):
    def __init__(self, web_url, timeout=None):
+        web_url += "/list/"
        UrlParser.__init__(self, web_url, timeout)

    def parse_proxyList(self):
        curr_proxy_list = []
-        response = requests.get(self.get_URl(), timeout=self.timeout)
-
-        if not response.ok:
-            logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
-            return []
-
-        content = response.content
-        soup = BeautifulSoup(content, "html.parser")
-        # css provides the port number so we reverse it
-        # for href in soup.findAll('link'):
-        #     if '/styles/' in href.get('href'):
-        #         style = "http://www.samair.ru" + href.get('href')
-        #         break
-        # css = requests.get(style).content.split('\n')
-        # css.pop()
-        # ports = {}
-        # for l in css:
-        #     p = l.split(' ')
-        #     key = p[0].split(':')[0][1:]
-        #     value = p[1].split('\"')[1]
-        #     ports[key] = value
-
-        table = soup.find("div", attrs={"id": "proxylist"})
-        # The first tr contains the field names.
-        headings = [th.get_text() for th in table.find("tr").find_all("th")]
-        for row in table.find_all("tr")[1:]:
-            td_row = row.find("td")
-            # curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
-            # Make sure it is a Valid Proxy Address
-            if UrlParser.valid_ip_port(td_row.text):
-                curr_proxy_list.append('http://' +td_row.text)
-            else:
-                logger.debug("Address with Invalid format: {}".format(td_row.text))
-
+        # Parse all proxy pages -> format: /list/{num}.htm
+        # TODO: get the pageRange from the 'pagination' table
+        for page in range(1, 21):
+            response = requests.get("{0}{num:02d}.htm".format(self.get_URl(), num=page), timeout=self.timeout)
+            if not response.ok:
+                # Could not parse ANY page - Let user know
+                if not curr_proxy_list:
+                    logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
+                # Return proxies parsed so far
+                return curr_proxy_list
+            content = response.content
+            soup = BeautifulSoup(content, "html.parser")
+            # css provides the port number so we reverse it
+            # for href in soup.findAll('link'):
+            #     if '/styles/' in href.get('href'):
+            #         style = "http://www.samair.ru" + href.get('href')
+            #         break
+            # css = requests.get(style).content.split('\n')
+            # css.pop()
+            # ports = {}
+            # for l in css:
+            #     p = l.split(' ')
+            #     key = p[0].split(':')[0][1:]
+            #     value = p[1].split('\"')[1]
+            #     ports[key] = value
+
+            table = soup.find("div", attrs={"id": "proxylist"})
+            # The first tr contains the field names.
+            headings = [th.get_text() for th in table.find("tr").find_all("th")]
+            for row in table.find_all("tr")[1:]:
+                td_row = row.find("td")
+                # curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
+                # Make sure it is a Valid Proxy Address
+                if UrlParser.valid_ip_port(td_row.text):
+                    curr_proxy_list.append('http://' + td_row.text)
+                else:
+                    logger.debug("Address with Invalid format: {}".format(td_row.text))
        return curr_proxy_list

    def __str__(self):
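The TODO in the new code hard-codes the page range to 1-20. As a rough illustration only, here is a hedged sketch of deriving the range from the page itself; the div with class "pagination" and its numeric links are assumptions about premproxy's markup, not something the diff confirms.

# Hypothetical helper, not part of this commit: infer how many /list/NN.htm
# pages exist. Assumes a <div class="pagination"> holding numeric page links
# (an unverified assumption about the site's markup); falls back to `default`.
def guess_page_count(soup, default=1):
    pager = soup.find("div", attrs={"class": "pagination"})
    if pager is None:
        return default
    numbers = [int(a.get_text()) for a in pager.find_all("a") if a.get_text().strip().isdigit()]
    return max(numbers) if numbers else default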
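For context, a minimal usage sketch of the class this commit modifies. The import path mirrors the UrlParser import above, and the base URL comes from the "renamed to: premproxy.com" comment; both are assumptions rather than something the diff states outright.

from http_request_randomizer.requests.parsers.SamairProxyParser import SamairProxyParser

# Base URL assumed from the comment in the commit; __init__ appends "/list/" itself.
parser = SamairProxyParser("https://premproxy.com", timeout=5)
proxies = parser.parse_proxyList()  # e.g. ["http://1.2.3.4:8080", ...]; may be partial if a page fails
print(len(proxies))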