資源簡介
使用 Python 爬取貓眼影評並進行可視化處理分析,具體分析可見 https://blog.csdn.net/qq_32392597/article/details/96891236

代碼片段和文件信息
# coding=utf-8
import json
import time
from datetime import datetime
from datetime import timedelta
from urllib import request
# Fetch data from the given URL.
def get_data(url, timeout=10):
    """Fetch the body of ``url`` while presenting a desktop-browser User-Agent.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the socket before giving up, so a stalled
            server cannot block the caller forever.

    Returns:
        The raw response body as ``bytes`` when the status code is 200,
        otherwise ``None``.

    Raises:
        urllib.error.URLError: If the request cannot be completed
            (``urlopen`` raises its ``HTTPError`` subclass for error statuses).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
    }
    req = request.Request(url, headers=headers)
    # The context manager closes the connection even when read() fails.
    with request.urlopen(req, timeout=timeout) as response:
        if response.getcode() == 200:
            return response.read()
    return None
# Parse one page of the comments response.
def parse_data(html):
    """Extract the comment records from one page of the comments JSON.

    Args:
        html: JSON text (``str`` or ``bytes``) whose top-level ``cmts`` entry
            is a list of comment objects. ``None`` or an empty value is
            accepted and yields no records.

    Returns:
        A list of dicts with the keys ``id``, ``nickName``, ``cityName``,
        ``content``, ``score`` and ``startTime``. The list is empty when
        ``html`` is empty or carries no ``cmts`` entry.
    """
    if not html:
        return []
    data = json.loads(html).get('cmts') or []
    return [
        {
            'id': item['id'],
            'nickName': item['nickName'],
            # Some comments carry no city; fall back to an empty string.
            'cityName': item.get('cityName', ''),
            # Flatten every line break so a record always fits on one line.
            'content': item['content'].replace('\n', ' '),
            'score': item['score'],
            'startTime': item['startTime'],
        }
        for item in data
    ]
# Store the crawled comments in a text file.
def save_to_txt(path='C:/Users/Administrator/Desktop/使用python爬取貓眼影評并進行可視化處理分析/comments.txt',
                end_time='2019-07-18 00:00:00'):
    """Crawl comments backwards in time and append them to a text file.

    Starting from the current time, each request asks for the page of
    comments at ``startTime``; the next request starts one second before the
    last comment of the page just received. Crawling stops once the cursor is
    no later than ``end_time`` or a page comes back empty.

    Args:
        path: File that receives one line per comment. It is opened in append
            mode, so it is created if missing and existing content is kept.
        end_time: Cutoff as ``'YYYY-MM-DD HH:MM:SS'``; crawling stops when the
            cursor reaches it.

    Raises:
        Exception: Whatever ``get_data`` raises when the single retry of a
            failed request fails as well.
    """
    time_format = '%Y-%m-%d %H:%M:%S'
    # Timestamps in this fixed-width format compare correctly as strings.
    start_time = datetime.now().strftime(time_format)
    while start_time > end_time:
        url = 'http://m.maoyan.com/mmdb/comments/movie/1229534.json?_v_=yes&offset=0&startTime=' + start_time.replace(' ', '%20')
        # The server refuses connections when requests arrive too quickly
        # (anti-crawler policy): pause 0.1 s after every successful request,
        # and retry once after 0.5 s when a request is refused.
        try:
            html = get_data(url)
        except Exception:
            time.sleep(0.5)
            html = get_data(url)
        else:
            time.sleep(0.1)
        comments = parse_data(html) if html else []
        if not comments:
            # Nothing left to read; without this the cursor could never move.
            break
        print(comments)
        # Open the file once per page instead of once per comment.
        with open(path, 'a+', encoding='utf-8') as f:
            # NOTE(review): the ',' separator is reconstructed; the extracted
            # source had lost its commas. Confirm against the file's consumer.
            f.writelines(
                ','.join([
                    str(item['id']),
                    item['nickName'],
                    item['cityName'],
                    item['content'],
                    str(item['score']),
                    item['startTime'],
                ]) + '\n'
                for item in comments
            )
        # Use the last comment of the page, however many it holds, and step
        # back one second so the next page does not repeat it.
        cursor = datetime.strptime(comments[-1]['startTime'], time_format) - timedelta(seconds=1)
        start_time = cursor.strftime(time_format)
if __name__ == '__main__':
    # Crawl all pages and append the comments to the output file. The former
    # one-off fetch before this call discarded its result, so it only added
    # an extra request against a rate-limited server.
    save_to_txt()
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件       3192  2019-07-22 21:10  使用python爬取貓眼影評并進行可視化處理分析\1.py
     文件       4769  2019-07-22 21:34  使用python爬取貓眼影評并進行可視化處理分析\2.py
     文件    8200650  2019-07-22 21:09  使用python爬取貓眼影評并進行可視化處理分析\comments.txt
     文件      30804  2019-07-22 21:31  使用python爬取貓眼影評并進行可視化處理分析\粉絲來源.html
     目錄          0  2019-07-22 19:40  使用python爬取貓眼影評并進行可視化處理分析
----------- ---------  ---------- -----  ----
              8239415                    5
評論
共有 條評論