任務:爬取租房網站的300個房源信息(詳情頁中的價格、位置、戶主名字、性別等)
注意:超鏈接的獲取、性別獲取
# Task: scrape a short-term-rental site for 300 listings — price, location,
# host name and host gender are pulled from each listing's detail page.
from bs4 import BeautifulSoup
import requests
import time

# Detail-page URLs collected from the index pages; the driver loop at the
# bottom walks this list and scrapes each page.
page_link = []


def get_page_link(page_number):
    """Collect detail-page links from index pages 1..page_number (inclusive).

    Each index page carries 24 listing links (anchors with class
    'resule_img_a'); every href is appended to the module-level
    ``page_link`` list.
    """
    # BUG FIX: the original used range(1, page_number), crawling only
    # page_number-1 pages (12 pages * 24 = 288 links), which is fewer than
    # the 300 listings the driver loop expects.
    for each_number in range(1, page_number + 1):
        full_url = 'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(each_number)
        wb_data = requests.get(full_url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        for link in soup.select('a.resule_img_a'):
            page_link.append(link.get('href'))


def print_gender(class_name):
    """Map the host avatar's CSS class to a gender label.

    'member_ico1' marks a female host, 'member_ico' a male host;
    any other class falls through and returns None.
    """
    # BUG FIX: originally defined as PRint_gender but called as
    # print_gender, which raised NameError at runtime.
    if class_name == 'member_ico1':
        return '女'
    if class_name == 'member_ico':
        return '男'


# Sample detail-page URL kept from the original for manual testing:
# http://bj.xiaozhu.com/fangzi/1508951935.html


def get_attractions(url, count):
    """Scrape one detail page and print its listing record.

    ``count`` is a running index added to the record for progress tracking.
    """
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    title = soup.select('div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
    address = soup.select(' div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span')
    price = soup.select(' div.day_l > span')
    picture = soup.select('#curBigImage')
    host_name = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
    host_gender = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > div')
    # Each selector yields a single-element list on a well-formed page, so
    # the zip pairs up the fields of exactly one listing (and silently
    # yields nothing if any selector misses — the page layout changed).
    for ti, add, pri, pic, name, gender in zip(title, address, price, picture, host_name, host_gender):
        data = {
            'ti': ti.get_text(),
            # strip=True trims surrounding whitespace (.stripped_strings
            # would work as well)
            'add': add.get_text(strip=True),
            'pri': pri.get_text(),
            'pic': pic.get('src'),
            'name': name.get_text(),
            'gender': print_gender(gender.get('class')[0]),
        }
        data['count'] = count  # running counter for progress tracking
        print(data)


if __name__ == '__main__':
    get_page_link(13)  # crawl 13 index pages of links (13 * 24 = 312 >= 300)
    # BUG FIX: the original did `for i in range(1, 301): get_attractions(page_link[i], i)`,
    # which skipped page_link[0] and raised IndexError once the list ran out.
    # Iterate at most 300 collected links instead.
    for i, link in enumerate(page_link[:300], start=1):
        # time.sleep(2)  # optional politeness delay between requests
        get_attractions(link, i)
新聞熱點
疑難解答