本文介紹 Selenium 設置 proxy(代理)與 headers(請求頭)的方法,並把 PhantomJS、Chrome、Firefox 幾個瀏覽器的設置方法總結一下,分享給大家,也給自己留個筆記。
phantomjs
設置代理 IP
方法1:
# Method 1: pass the proxy to PhantomJS through its command-line switches.
# NOTE: webdriver.PhantomJS only exists in Selenium 3.x; it was removed in
# Selenium 4.
from selenium import webdriver

# `ip_html` must already hold the proxy address as a "host:port" string,
# e.g. "192.168.0.28:808".
service_args = [
    '--proxy=%s' % ip_html,       # proxy address, IP:port
    '--proxy-type=http',          # proxy type (PhantomJS accepts http, socks5 or none)
    '--load-images=no',           # do not load images (optional)
    '--disk-cache=yes',           # enable the disk cache (optional)
    '--ignore-ssl-errors=true',   # ignore HTTPS/SSL errors (optional)
]
driver = webdriver.PhantomJS(service_args=service_args)
方法2:
# Method 2: start PhantomJS first, then open a new session that carries the
# proxy inside its desired capabilities.
# NOTE: webdriver.PhantomJS only exists in Selenium 3.x; it was removed in
# Selenium 4.
from selenium import webdriver
from selenium.webdriver.common.proxy import ProxyType

# PATH_PHANTOMJS must already hold the path to the phantomjs executable.
browser = webdriver.PhantomJS(PATH_PHANTOMJS)

# Describe the proxy to use for the new session.
proxy = webdriver.Proxy()
proxy.proxy_type = ProxyType.MANUAL
proxy.http_proxy = '1.9.171.51:800'

# Write the proxy settings into webdriver.DesiredCapabilities.PHANTOMJS.
# This changes that shared dict in place, so any later code that reads it
# sees the proxy entry as well.
proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)

# Open a new session (new session id) with the proxy applied, then load the
# page through it.
browser.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
browser.get('http://1212.ip138.com/ic.asp')

print('1: ', browser.session_id)
print('2: ', browser.page_source)
print('3: ', browser.get_cookies())
還原為直接連線(不使用代理;若要改為跟隨系統代理設定,請將 ProxyType.DIRECT 換成 ProxyType.SYSTEM)
# Drop the manual proxy again and reconnect without one.
# `browser` must be an already-created webdriver.PhantomJS instance
# (Selenium 3.x).
from selenium import webdriver
from selenium.webdriver.common.proxy import ProxyType

proxy = webdriver.Proxy()
# ProxyType.DIRECT means "connect directly, no proxy". To follow the
# operating system's proxy settings instead, use ProxyType.SYSTEM.
proxy.proxy_type = ProxyType.DIRECT

# Overwrite the proxy entry in the shared PHANTOMJS capabilities dict, then
# open a new session with it.
proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
browser.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
browser.get('http://1212.ip138.com/ic.asp')
設置請求頭
完整示例(同時設置請求頭與代理,包含方法一與方法二)
# Complete example: set request headers and a proxy for PhantomJS, shown in
# two different ways.
# NOTE: webdriver.PhantomJS only exists in Selenium 3.x; it was removed in
# Selenium 4.
import json
import random

import requests
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.proxy import ProxyType


def proxies():
    """Fetch one proxy IP from the proxy service and return it as "ip:8907".

    The service response is parsed as JSON and its ``ip`` field is combined
    with the fixed port 8907. The resulting address is also printed.

    Raises:
        requests.RequestException: the request failed or timed out.
        ValueError: the response body is not valid JSON.
        KeyError: the JSON payload has no ``ip`` field.
    """
    # A timeout keeps the script from hanging forever if the service is down.
    r = requests.get(
        "http://120.26.166.214:9840/JProxy/update/proxy/scoreproxy",
        timeout=10,
    )
    rr = json.loads(r.text)
    hh = rr['ip'] + ":" + "8907"
    print(hh)
    return hh


# Pick the proxy once; both methods below reuse it.
ips = proxies()

# Method one: proxy via command-line switches, user agent via capabilities.
# -------------------------------------------------------------------------
# Proxy and other PhantomJS switches.
service_args = [
    '--proxy=%s' % ips,           # proxy address, IP:port (e.g. 192.168.0.28:808)
    '--ssl-protocol=any',         # accept any SSL/TLS protocol version
    '--load-images=no',           # do not load images (optional)
    '--disk-cache=yes',           # enable the disk cache (optional)
    '--ignore-ssl-errors=true',   # ignore HTTPS/SSL errors (optional)
]

# Request header: the User-Agent string PhantomJS should send.
user_agent = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) "
    + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36"
)
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = user_agent

driver = webdriver.PhantomJS(
    executable_path=r"C:/soft/phantomjs-2.1.1-windows/bin/phantomjs.exe",
    desired_capabilities=dcap,
    service_args=service_args,
)
driver.get(url='http://www.baidu.com')
page = driver.page_source
print(page)
# Shut this browser down before `driver` is rebound below; otherwise the
# first PhantomJS process is left running.
driver.quit()

# Method two: user agent and proxy both via desired capabilities.
# -------------------------------------------------------------------------
# Pools to pick from at random. Replace these one-element lists with your
# own lists of user-agent strings and "ip:port" proxy addresses.
USER_AGENTS = [user_agent]
PROXY_POOL = [ips]

desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
# Pick a random User-Agent from USER_AGENTS to disguise the browser.
desired_capabilities["phantomjs.page.settings.userAgent"] = random.choice(USER_AGENTS)
# Skip images; pages load much faster when crawling.
desired_capabilities["phantomjs.page.settings.loadImages"] = False

# Put the proxy settings into the capabilities.
proxy = webdriver.Proxy()
proxy.proxy_type = ProxyType.MANUAL
proxy.http_proxy = random.choice(PROXY_POOL)
proxy.add_to_capabilities(desired_capabilities)

phantomjs_driver = r'C:/phantomjs-2.1.1-windows/bin/phantomjs.exe'
# Start PhantomJS with the configured capabilities. The constructor already
# opens a session with them, so no extra start_session() call is needed.
driver = webdriver.PhantomJS(
    executable_path=phantomjs_driver,
    desired_capabilities=desired_capabilities,
)

# Configure waits and timeouts BEFORE loading the page so they apply to it.
# Implicit wait of 5 seconds; adjust as needed.
driver.implicitly_wait(5)
# Page-load timeout of 20 seconds, similar to the timeout option of
# requests.get(); driver.get() has no timeout argument of its own. Without
# it, driver.get(url) can block forever without raising, hanging the program.
driver.set_page_load_timeout(20)
# Script timeout of 20 seconds.
driver.set_script_timeout(20)

driver.get(url='http://www.baidu.com')
page = driver.page_source
print(page)

# Scroll to the bottom of the page.
driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
# Call driver.quit() once you are finished with the browser.
新聞熱點
疑難解答