一、功能說明:
1. 多線程方式抓取代理服務器,并多線程驗證代理服務器
ps 代理服務器是從http://www.cnproxy.com/ (測試只選擇了8個頁面)抓取
2. 抓取一個網站的圖片地址,多線程隨機取一個代理服務器下載圖片
二、實現代碼
代碼如下:
#!/usr/bin/env python
#coding:utf-8
import urllib2
import re
import threading
import time
import random
rawProxyList = []
checkedProxyList = []
imgurl_list = []
#抓取代理網站
portdicts ={'v':"3",'m':"4",'a':"2",'l':"9",'q':"0",'b':"5",'i':"7",'w':"6",'r':"8",'c':"1"}
targets = []
for i in xrange(1,9):
target = r"http://www.cnproxy.com/proxy%d.html" % i
targets.append(target)
#print targets
#抓取代理服務器正則
p = re.compile(r'''<tr><td>(.+?)<SCRIPT type=text/javascript>document.write/(":"/+(.+?)/)</SCRIPT></td><td>(.+?)</td><td>.+?</td><td>(.+?)</td></tr>''')
#獲取代理的類
class ProxyGet(threading.Thread):
def __init__(self,target):
threading.Thread.__init__(self)
self.target = target
def getProxy(self):
print "代理服務器目標網站: " + self.target
req = urllib2.urlopen(self.target)
result = req.read()
#print chardet.detect(result)
matchs = p.findall(result)
for row in matchs:
ip=row[0]
port =row[1]
port = map(lambda x:portdicts[x],port.split('+'))
port = ''.join(port)
agent = row[2]
addr = row[3].decode("cp936").encode("utf-8")
proxy = [ip,port,addr]
#print proxy
rawProxyList.append(proxy)
def run(self):
self.getProxy()
#檢驗代理的類
class ProxyCheck(threading.Thread):
def __init__(self,proxyList):
threading.Thread.__init__(self)
self.proxyList = proxyList
新聞熱點
疑難解答