資源簡介
包括爬蟲源碼,以及國內 10 萬個景點的概覽信息和部分景點的詳細信息。
- 爬取熱門目的地信息 `MafengwoCrawler()._get_mdd()`
- 爬取目的地內景點信息 `MafengwoCrawler().crawler_mdd()`
- 爬取景點詳細信息 `MafengwoCrawler().crawler_detail()`

代碼片段和文件信息
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time         : 2019/04/24
# @Author       : AIsland
# @Email        : yuchunyu97@gmail.com
# @File         : crawler.py
# @Description  : 爬取馬蜂窩各省市景點數據
import?requests
import?re
import?time
import?json
import?hashlib
import?logging
import?threading
import?pymysql
from?bs4?import?BeautifulSoup
class MafengwoCrawler:
    """Crawler for Mafengwo scenic-spot data (class-level configuration).

    The attributes below hold the endpoint URLs, the request headers sent
    with every call, the MySQL connection settings and two pieces of
    crawl state (`encrypted_string` and `success_pages`).
    """

    # URL used to look up destinations.
    # A destination contains scenic spots.
    URL_MDD = 'http://www.mafengwo.cn/mdd/'
    # URL used to query scenic spots.
    # The response includes the link to the spot details, the spot image
    # and the spot name.
    URL_ROUTE = 'http://www.mafengwo.cn/ajax/router.php'
    # URL used to query a scenic spot's coordinates.
    # Longitude: lng
    # Latitude:  lat
    URL_POI = 'http://pagelet.mafengwo.cn/poi/pagelet/poiLocationApi'
    # Headers sent with every request.
    HEADERS = {
        'Referer': 'http://www.mafengwo.cn/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/73.0.3683.103 Safari/537.36'
        ),
    }
    # MySQL connection settings.
    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from the environment or a config file instead.
    DB_HOST = 'localhost'
    DB_USER = 'root'
    DB_PASSWORD = 'yuchunyu97@gmail.com'
    DB_NAME = 'mafengwo'
    # String needed to encrypt request data; obtained by the
    # _get_md5_encrypted_string() method.
    encrypted_string = ''
    # Page numbers that do not need to be crawled again, i.e. pages that
    # were already crawled successfully.
    success_pages = []
????def?__init__(self?log_file=None):
????????#?使用說明?https://www.cnblogs.com/nancyzhu/p/8551506.html
????????logging.basicConfig(level=logging.DEBUG
????????????????????????????filename=‘mafengwo.‘+str(int(time.time()))+‘.log‘
????????????????????????????format=‘%(asctime)s?-?%(pathname)s[line:%(lineno)d]?-?%(levelname)s:?%(message)s‘
????????????????????????????)
????????#?初始化請求對象
????????self.REQ?=?requests.session()
????????#?設置通用?Headers
????????self.REQ.headers.update(self.HEADERS)
????????#?獲取請求數據加密需要的字符串
????????self._get_md5_encrypted_string()
????????#?如果傳入日志文件,則過濾已爬取成功的頁碼
????????if?log_file?is?not?None:
????????????self.success_pages?=?self._read_log_file_get_success_page(log_file)
????????????print(‘當前已經成功爬取的頁數:‘?+?str(len(self.success_pages)))
????????????print(‘5秒后繼續運行‘)
????????????time.sleep(5)
????def?crawler_mdd(self?mdd_id=21536):
????????‘‘‘
????????爬取單個目的地的景點信息
????????默認:21536,中國
????????‘‘‘
????????#?mdd_id?=?12522??#?鼓浪嶼,16頁,測試數據
????????#?開始爬數據
????????start?=?int(time.time())
????????#?先獲取數據總頁數
????????res?=?self._get_route(mdd_id)
????????page_total?=?res[‘pagecount‘]
????????#?計算每個線程爬取多少頁
????????page_range?=?round(page_total/20)
????????if?page_range?==?0:
????????????page_range?=?1
????????logging.info(‘總共‘+str(page_total)+‘頁,每個線程爬取‘+str(page_range)+‘頁‘)
????????print(‘總共‘+str(page_total)+‘頁,每個線程爬取‘+str(page_range)+‘頁‘)
????????#?開啟多線程模式
????????thread?=?[]
????????for?i?in?range(1?page_total+1?page_range):
????????????page_start?=?i
????????????page_end?=?i?+?page_range
????????????if?page_end?>?page_total?+?1:
????????????????page_end?=?page_total?+?1
????????????t?=?threading.Thread(target=self.crawler
?????????????????????????????????args=(mdd_id?page_start?page_end))
????????????thread.append(t)
????????f
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2019-04-30 11:16  mafengwo\
     文件        6148  2019-04-30 11:15  mafengwo\.DS_Store
     目錄           0  2019-04-30 11:19  __MACOSX\
     目錄           0  2019-04-30 11:19  __MACOSX\mafengwo\
     文件         120  2019-04-30 11:15  __MACOSX\mafengwo\._.DS_Store
     文件    20128702  2019-04-30 11:13  mafengwo\mafengwo.20190430.sql
     文件         261  2019-04-30 11:13  __MACOSX\mafengwo\._mafengwo.20190430.sql
     文件           0  2019-04-24 08:37  mafengwo\README.md
     文件         176  2019-04-24 08:37  __MACOSX\mafengwo\._README.md
     文件        1666  2019-04-30 11:16  mafengwo\mafengwo.structure.sql
     文件         261  2019-04-30 11:16  __MACOSX\mafengwo\._mafengwo.structure.sql
     目錄           0  2019-04-24 14:37  mafengwo\.vscode\
     文件       21902  2019-04-30 11:15  mafengwo\crawler.py
     文件         176  2019-04-30 11:15  __MACOSX\mafengwo\._crawler.py
- 上一篇:virtools素材
- 下一篇:kubernetes 進階實戰 epub 版本
評論
共有 條評論