91av视频/亚洲h视频/操亚洲美女/外国一级黄色毛片 - 国产三级三级三级三级

  • 大小: 72KB
    文件類型: .zip
    金幣: 2
    下載: 0 次
    發布日期: 2021-05-09
  • 語言: Python
  • 標簽: Python爬蟲

資源簡介

用 Python 爬取貓途鷹(TripAdvisor)旅遊網站的爬蟲代碼。爬取到的數據有:酒店和景點信息、酒店評論信息、景點評論信息。內附爬蟲過程的思路和難點介紹。

資源截圖

代碼片段和文件信息

#?-*-?coding:utf-8?-*-

#?從酒店列表中爬取酒店信息

import os
import re
import time

import requests
import urllib3

import tool
urllib3.disable_warnings()
#?requests.packages.urllib3.disable_warnings()

# Left and right halves of the Beijing hotel listing URL. Per the note in
# Hotel.__init__, a page URL is built as left + offset (e.g. 30) + right.
l_siteURL = 'https://www.tripadvisor.cn/Hotels-g294212-oa'
r_siteURL = '-Beijing-Hotels.html#BODYCON'


#?抓取酒店
class?Hotel:

????#?頁面初始化
????def?__init__(self):
????????#?url的左邊和右邊,左+30+右
????????self.l_siteURL?=?‘https://www.tripadvisor.cn/Hotels-g294212-oa‘
????????self.r_siteURL?=?‘-Beijing-Hotels.html#BODYCON‘
????????self.frontUrl?=?‘https://www.tripadvisor.cn‘??#?酒店詳情url要加的前綴
????????self.tool?=?tool.Tool()

????#?獲取頁面源碼內容
????def?getPage(self?infoURL):
????????time.sleep(0.2)
????????headers?=?{‘content-type‘:?‘application/json‘
???????????‘User-Agent‘:?‘Mozilla/5.0?(X11;?Ubuntu;?Linux?x86_64;?rv:22.0)?Gecko/20100101?Firefox/22.0‘}
????????r?=?requests.get(url=infoURL?verify=False?headers=headers)
????????r.encoding?=?‘utf-8‘
????????return?r.text

????#?傳入圖片地址,文件名,保存單張圖片
????def?saveImg(self?imageURL?fileName):
????????time.sleep(0.2)
????????headers?=?{‘content-type‘:?‘application/json‘
???????????????????‘User-Agent‘:?‘Mozilla/5.0?(X11;?Ubuntu;?Linux?x86_64;?rv:22.0)?Gecko/20100101?Firefox/22.0‘}
????????r?=?requests.get(url=imageURL?verify=False?headers=headers)
????????data?=?r.content??#?二進制內容返回
????????f?=?open(fileName?‘wb‘)
????????f.write(data)
????????print?u“正在悄悄保存一張圖片為%s“?%?(fileName)
????????f.close()

????#?保存一張酒店圖片并返回圖片路徑
????def?saveIcon(self?iconURL?path):
????????splitPath?=?iconURL.split(‘.‘)
????????fTail?=?splitPath.pop()??#?移除最后一個元素并返回
????????fileName?=?path?+?“.“?+?fTail
????????self.saveImg(iconURL?fileName)
????????return?fileName

????#?獲取酒店詳細地址
????def?getHotelAddress(self?page):
        # Extract the hotel's address from the page source `page`: the three
        # captured groups are concatenated and passed through
        # self.tool.replace(). re.search() returns None when nothing matches,
        # in which case result.group() raises AttributeError.
        # NOTE(review): as shown here the pattern has no literal text between
        # its groups and the quantifiers read `{010}` / `{030}`. Presumably
        # the HTML tags that anchored each group, and the commas (`{0,10}`,
        # `{0,30}`), were lost when this snippet was extracted -- restore the
        # pattern from the original main.py before relying on it.
????????pattern?=?re.compile(“(.{010})(.{010})(.{030})“?re.S)
????????result?=?re.search(pattern?page)
????????return?self.tool.replace(result.group(1)?+?result.group(2)?+?result.group(3))
????
????#?獲取酒店評分
????def?getHotelGrade(self?page):
        # Extract the hotel's rating from the page source `page`: the single
        # captured group is passed through self.tool.replace(). re.search()
        # returns None when nothing matches, in which case result.group()
        # raises AttributeError.
        # NOTE(review): the pattern shown has no literal anchor text and reads
        # `{04}`. Presumably the surrounding HTML tags and a comma (`{0,4}`)
        # were lost when this snippet was extracted, and the `?` after the
        # group may be a mangled space rather than a quantifier -- restore the
        # pattern from the original main.py before relying on it.
????????pattern?=?re.compile(“(.{04})?“?re.S)
????????result?=?re.search(pattern?page)
????????return?self.tool.replace(result.group(1))
????
????#?獲取酒店點評數量
????def?getHotelCommentNumber(self?detailPage):
????????pattern?=?re.compile(u“(\d{04})<\/span>條點評“?re.S)
????????result?=?re.search(pattern?detailPage)
????????print?type(result)
????????return?int(self.tool.replace(result.group(1)))

????#?解析酒店評論并保存
????def?parseHotelComment(self?hotelName?url?pageNum):
????????splitURL?=?url.split(‘.‘)?
????????print?‘共有‘?pageNum?‘頁評論‘
????????if?pageNum?>?80?:??#?最多爬取80頁評論
????????????pageNum?=?80
????????firstUserName?=?u‘‘??#?記錄上一頁的第一個評論人
????????flag?=?False
????????for?index?in?range(1?pageNum?+?1):
????????????if?flag?==?True:
????????????????print?‘跳出了

 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2018-01-30 21:10  maotuying\
     文件         191  2018-01-15 11:24  maotuying\hotel.txt
     文件        1815  2018-01-15 11:26  maotuying\hotelComment.txt
     目錄           0  2018-01-23 09:27  maotuying\img\
     目錄           0  2018-01-23 09:27  maotuying\img\hotel\
     文件       64751  2018-01-15 11:24  maotuying\img\hotel\北京新云南皇冠假日酒店.jpg
     目錄           0  2018-01-15 11:24  maotuying\img\scenic\
     文件       19622  2018-01-30 21:04  maotuying\main.py
     文件         479  2018-01-30 21:17  maotuying\READEME.txt
     文件        1060  2018-01-02 20:34  maotuying\tool.py

評論

共有 條評論