@@ -18,53 +18,70 @@ def __init__(self, id, web_url, timeout=None):
1818
def parse_proxyList(self, use_top15k=False):
    """Scrape the provider's proxy pages and return the proxies found.

    The "top proxy" page is parsed first: every ``div.paragraph`` block is
    inspected for an address column (green ``font`` tag) and for
    ``font size=2`` columns whose title contains 'Country' or 'Status'.
    The three columns are then combined row by row.

    :param use_top15k: when true, additionally follow the link to the
        "*-all-*.txt" file and add every valid ``ip:port`` token found in
        it. Those entries carry no country/anonymity information.
    :return: list of proxy objects built by ``create_proxy_object``;
        an empty list if the first request fails.
    """
    curr_proxy_list = []
    response = requests.get(self.get_url() + "/" + self.top_proxy_path, timeout=self.timeout)

    if not response.ok:
        logger.warning("Proxy Provider url failed: {}".format(self.get_url()))
        return []

    content = response.content
    soup = BeautifulSoup(content, "html.parser")
    all_divs = soup.findAll("div", attrs={"class": "paragraph", 'style': "text-align:left;"})

    # Parse Top Proxy List page
    address_list = []
    country_list = []
    anonymity_list = []
    for div in all_divs:
        address_div = div.find('font', attrs={'color': '#33a27f'})
        if address_div is not None:
            address_list.extend(
                str(x) for x in address_div.contents if getattr(x, 'name', None) != 'br')

        curr_div = div.findAll('font', attrs={'size': '2'})
        # findAll returns a (possibly empty) list, so test for emptiness
        # instead of indexing blindly.
        if curr_div:
            # font -> strong -> font
            title = str(curr_div[0].contents[0].contents[0].contents[0])
            row_data = [
                str(x) for x in curr_div[-1].contents if getattr(x, 'name', None) != 'br']
            if 'Country' in title:
                country_list.extend(row_data)
            if 'Status' in title:
                anonymity_list.extend(row_data)

    # zip() stops at the shortest column, so rows lacking any of the three
    # values are dropped.
    for address, country, anonymity in zip(address_list, country_list, anonymity_list):
        # Make sure it is a Valid Proxy Address
        proxy_obj = self.create_proxy_object(address, country, anonymity)
        if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
            curr_proxy_list.append(proxy_obj)
        else:
            logger.debug("Proxy Invalid: {}".format(address))

    # Usually these proxies are stale
    if use_top15k:
        # Parse 15k Nodes Text file (named *-all-*.txt)
        content = requests.get(
            self.get_url() + "/" + self.txt_proxy_path, timeout=self.timeout).content
        soup = BeautifulSoup(content, "html.parser")
        table = soup.find("div", attrs={"class": "wsite-multicol-table-wrap"})
        if table is None:
            # Page layout changed or request failed: keep what we already have.
            logger.debug("Proxy list table not found at: {}".format(self.get_url()))
            return curr_proxy_list

        for link in table.findAll('a'):
            current_link = link.get('href')
            if current_link is not None and "all" in current_link:
                self.txt_proxy_path = current_link

        more_content = requests.get(
            self.get_url() + self.txt_proxy_path, timeout=self.timeout).text
        for proxy_address in more_content.split():
            if UrlParser.valid_ip_port(proxy_address):
                # The text file lists bare addresses only, so fall back to
                # the "unknown" country/anonymity values.
                proxy_obj = self.create_proxy_object(proxy_address, "Unknown", "unknown")
                if proxy_obj is not None:
                    curr_proxy_list.append(proxy_obj)
    return curr_proxy_list
5575
def create_proxy_object(self, address, country="Unknown", anonymity="unknown"):
    """Build a ProxyObject from scraped text fields.

    :param address: text of the form ``ip:port``; surrounding whitespace
        is ignored.
    :param country: country text; stripped before use. Defaults to
        "Unknown" so callers that only know the address still work.
    :param anonymity: anonymity label; stripped and passed to
        ``AnonymityLevel.get``. Defaults to "unknown".
    :return: a ProxyObject, or None when the IP part is not valid or the
        address has no port component.
    """
    parts = address.strip().split(":")
    # Make sure it is a Valid IP
    ip = parts[0]
    if not UrlParser.valid_ip(ip):
        logger.debug("IP with Invalid format: {}".format(ip))
        return None
    if len(parts) < 2:
        # No ":" separator, so there is no port to read.
        logger.debug("Proxy address without port: {}".format(address))
        return None
    port = parts[1]
    country = country.strip()
    anonymity = AnonymityLevel.get(anonymity.strip())

    return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country)
7087
0 commit comments