資源簡介
用Python爬取貓途鷹旅游網站的爬蟲代碼;
爬取到的數據有:酒店和景點信息,酒店評論信息,景點評論信息;
內附爬蟲過程的思路和難點介紹

代碼片段和文件信息
# -*- coding: utf-8 -*-
# Scrape hotel information from the hotel list pages.
import requests
import re
import tool
import os
import time
import urllib3

# Turn off urllib3's warnings for the whole process.
urllib3.disable_warnings()
# requests.packages.urllib3.disable_warnings()

# Left and right halves of the hotel list URL.
l_siteURL = 'https://www.tripadvisor.cn/Hotels-g294212-oa'
r_siteURL = '-Beijing-Hotels.html#BODYCON'
#?抓取酒店
class?Hotel:
????#?頁面初始化
????def?__init__(self):
????????#?url的左邊和右邊,左+30+右
????????self.l_siteURL?=?‘https://www.tripadvisor.cn/Hotels-g294212-oa‘
????????self.r_siteURL?=?‘-Beijing-Hotels.html#BODYCON‘
????????self.frontUrl?=?‘https://www.tripadvisor.cn‘??#?酒店詳情url要加的前綴
????????self.tool?=?tool.Tool()
????#?獲取頁面源碼內容
????def?getPage(self?infoURL):
????????time.sleep(0.2)
????????headers?=?{‘content-type‘:?‘application/json‘
???????????‘User-Agent‘:?‘Mozilla/5.0?(X11;?Ubuntu;?Linux?x86_64;?rv:22.0)?Gecko/20100101?Firefox/22.0‘}
????????r?=?requests.get(url=infoURL?verify=False?headers=headers)
????????r.encoding?=?‘utf-8‘
????????return?r.text
????#?傳入圖片地址,文件名,保存單張圖片
????def?saveImg(self?imageURL?fileName):
????????time.sleep(0.2)
????????headers?=?{‘content-type‘:?‘application/json‘
???????????????????‘User-Agent‘:?‘Mozilla/5.0?(X11;?Ubuntu;?Linux?x86_64;?rv:22.0)?Gecko/20100101?Firefox/22.0‘}
????????r?=?requests.get(url=imageURL?verify=False?headers=headers)
????????data?=?r.content??#?二進制內容返回
????????f?=?open(fileName?‘wb‘)
????????f.write(data)
????????print?u“正在悄悄保存一張圖片為%s“?%?(fileName)
????????f.close()
????#?保存一張酒店圖片并返回圖片路徑
????def?saveIcon(self?iconURL?path):
????????splitPath?=?iconURL.split(‘.‘)
????????fTail?=?splitPath.pop()??#?移除最后一個元素并返回
????????fileName?=?path?+?“.“?+?fTail
????????self.saveImg(iconURL?fileName)
????????return?fileName
????#?獲取酒店詳細地址
????def?getHotelAddress(self?page):
????????pattern?=?re.compile(“(.{010})(.{010})(.{030})“?re.S)
????????result?=?re.search(pattern?page)
????????return?self.tool.replace(result.group(1)?+?result.group(2)?+?result.group(3))
????
????#?獲取酒店評分
????def?getHotelGrade(self?page):
????????pattern?=?re.compile(“(.{04})?“?re.S)
????????result?=?re.search(pattern?page)
????????return?self.tool.replace(result.group(1))
????
????#?獲取酒店點評數量
????def?getHotelCommentNumber(self?detailPage):
????????pattern?=?re.compile(u“(\d{04})<\/span>條點評“?re.S)
????????result?=?re.search(pattern?detailPage)
????????print?type(result)
????????return?int(self.tool.replace(result.group(1)))
????#?解析酒店評論并保存
????def?parseHotelComment(self?hotelName?url?pageNum):
????????splitURL?=?url.split(‘.‘)?
????????print?‘共有‘?pageNum?‘頁評論‘
????????if?pageNum?>?80?:??#?最多爬取80頁評論
????????????pageNum?=?80
????????firstUserName?=?u‘‘??#?記錄上一頁的第一個評論人
????????flag?=?False
????????for?index?in?range(1?pageNum?+?1):
????????????if?flag?==?True:
????????????????print?‘跳出了
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2018-01-30 21:10  maotuying\
     文件         191  2018-01-15 11:24  maotuying\hotel.txt
     文件        1815  2018-01-15 11:26  maotuying\hotelComment.txt
     目錄           0  2018-01-23 09:27  maotuying\img\
     目錄           0  2018-01-23 09:27  maotuying\img\hotel\
     文件       64751  2018-01-15 11:24  maotuying\img\hotel\北京新云南皇冠假日酒店.jpg
     目錄           0  2018-01-15 11:24  maotuying\img\scenic\
     文件       19622  2018-01-30 21:04  maotuying\main.py
     文件         479  2018-01-30 21:17  maotuying\READEME.txt
     文件        1060  2018-01-02 20:34  maotuying\tool.py
評論
共有 條評論