Most Python beginners have probably heard of Liao Xuefeng's tutorial site. Being a beginner myself, I wondered whether I could crawl the whole tutorial for offline reading, so I just went ahead and did it. Without further ado, here is the code:
#coding:utf-8
#author: myndtt
import urllib2
import requests
import os
import multiprocessing
import sys
from bs4 import BeautifulSoup
from lxml import etree
import pdfkit

reload(sys)
sys.setdefaultencoding('utf-8')

url = 'http://www.liaoxuefeng.com/wiki/001374738125095c955c1e6d8bb493182103fac9270762a000'

# collect the absolute URL of every chapter from the index page
def geturl(url):
    article = []
    try:
        re = urllib2.urlopen(url).read()
        selector = etree.HTML(re)
        content = selector.xpath('//*[@id="main"]/div/div/div/div/div/ul/li/a/@href')
        for each in content:
            article.append('http://www.liaoxuefeng.com' + each.strip())
    except urllib2.HTTPError:
        pass
    return article

# build the table of contents
def gethtml():
    text = u'<h1>廖雪峰Python教程</h1>' + u'<br>'
    a = 1
    re = urllib2.urlopen(url).read()
    selector = etree.HTML(re)
    conten = selector.xpath('//*[@id="main"]/div/div/div/div/div/ul/li/a/text()')
    for con in conten:
        text = text + u'<h2>' + unicode(a) + u':' + unicode(con) + u'</h2>' + u'<br>'
        a = a + 1
    return text

# fetch every chapter, stitch the HTML together and hand it to pdfcreate()
def getothers(urllist):
    n = 0
    text = gethtml()
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    for ur in urllist:
        n = n + 1
        # .get() blocks until the result is ready, so pages are in fact fetched one by one
        m = pool.apply_async(getpage, (ur, n,)).get()
        text = text + unicode(m)
        # show which page has just been processed
        print n
    pool.close()
    pool.join()
    f = open("pdf.html", "a")
    f.write(u'<html><head><meta charset="UTF-8"></head><body>' + unicode(text) + u'</body></html>')
    f.close()
    pdfcreate()
    print "ok!!!"

# extract the content we need from a single chapter page
def getpage(ur, n):
    page = u''
    rep = urllib2.urlopen(ur).read()
    soup = BeautifulSoup(rep, "lxml", from_encoding='utf8')
    sou = soup.find("div", {"class": "x-wiki-content"})
    smu = soup.find("h4").get_text()
    page = page + u'<h2>' + unicode(n) + u':' + unicode(smu) + u'</h2>'
    # find the img tags and turn their src attributes into absolute URLs
    so = sou.find_all("img")
    for s in so:
        if str(s).find("http:") == -1:
            sou = unicode(sou).replace(s.get('src'), 'http://www.liaoxuefeng.com' + s.get('src'))
    page = page + unicode(sou)
    return unicode(page)

# render the assembled HTML to PDF (optional, really)
def pdfcreate():
    path_wkthmltopdf = r'C:/Windows/System32/wkhtmltopdf.exe'
    config = pdfkit.configuration(wkhtmltopdf=path_wkthmltopdf)
    pdfkit.from_file("pdf.html", "hello.pdf", configuration=config)

if __name__ == '__main__':
    urllis = geturl(url)
    getothers(urllis)

Actually I wrote this code more than half a year ago, and it is a bit rough: it mixes XPath and BeautifulSoup in the same script, which is a little embarrassing, but I'm too lazy to clean it up now (ha).
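For anyone on Python 3 (where urllib2, reload(sys) and the unicode built-in no longer exist), here is a minimal sketch of the same pipeline using only requests + BeautifulSoup + pdfkit. The base URL and the selectors are lifted from the script above and assume the page structure has not changed; the function names are my own, for illustration only.

# Minimal Python 3 sketch of the same idea: index page -> chapter pages -> one HTML file -> PDF.
import requests
from bs4 import BeautifulSoup
import pdfkit

BASE = 'http://www.liaoxuefeng.com'
INDEX = BASE + '/wiki/001374738125095c955c1e6d8bb493182103fac9270762a000'

def chapter_links(index_url):
    """Collect absolute URLs of every chapter listed on the index page."""
    soup = BeautifulSoup(requests.get(index_url).content, 'lxml')
    return [BASE + a['href'] for a in soup.select('#main ul li a[href]')]

def chapter_html(url):
    """Return the article body of one chapter, with image paths made absolute."""
    soup = BeautifulSoup(requests.get(url).content, 'lxml')
    body = soup.find('div', class_='x-wiki-content')
    for img in body.find_all('img'):
        src = img.get('src', '')
        if src.startswith('/'):
            img['src'] = BASE + src
    title = soup.find('h4').get_text()
    return '<h2>{}</h2>{}'.format(title, body)

def build_pdf(out_html='pdf.html', out_pdf='tutorial.pdf'):
    parts = [chapter_html(u) for u in chapter_links(INDEX)]
    html = '<html><head><meta charset="UTF-8"></head><body>{}</body></html>'.format(''.join(parts))
    with open(out_html, 'w', encoding='utf-8') as f:
        f.write(html)
    pdfkit.from_file(out_html, out_pdf)  # requires wkhtmltopdf to be installed and on PATH

if __name__ == '__main__':
    build_pdf()

The CSS selector '#main ul li a[href]' is only a loose equivalent of the original XPath; if the site has been redesigned since this was written, both will need updating.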
This blog is just a record of my own learning; if it happens to be of some use to anyone else, so much the better.