資源簡介
內含三個文件,分別是:爬取微博、數據預處理、爬取并處理。基于python3,實現了高效爬取微博數據,并結合正則表達式對數據進一步處理。其中亦包含對微博評論和點贊等其他信息的爬取,小小修改一下代碼即可。

代碼片段和文件信息
#?-*-?coding:utf-8?-*-
from?lxml?import?html
import?requests
import?json
import?os
import?time
import?re
from?time?import?sleep
### 爬取微博內容
class?CrawlWeibo:
    # 獲取指定博主的所有微博cards的list
????def?getCards(self?id?page):
????????#?id(字符串類型):博主的用戶id;page(整型):微博翻頁(yè)參數(shù)
????????ii?=?0
????????list_cards?=?[]
????????while?ii?????????????ii?=?ii?+?1
????????????url?=?‘https://m.weibo.cn/api/container/getIndex?type=uid&value=‘?+?id?\
??????????????????+?‘&containerid=107603‘?+?id?+?‘&page=‘?+?str(ii)
????????????response?=?requests.get(url?headers=headers)
????????????ob_json?=?json.loads(response.text)??#?ob_json為dict類型
????????????list_card?=?ob_json[‘data‘][‘cards‘]
????????????if?len(list_card)==0?and?ii==1:
???????????? break
????????????else:
???????????? list_cards.append(ob_json[‘data‘][‘cards‘])?#?ob_json[‘data‘][‘cards‘]為list類型
????????return?list_cards??#?返回所有頁(yè)的cards
    # 獲取某條微博的熱門評論或評論的list
????def?getComments(self?id?page):??#?id(字符串類型):某條微博的id;page(整型):評(píng)論翻頁(yè)參數(shù)
????????url?=?‘https://m.weibo.cn/api/comments/show?id=‘?+?id?+?‘&page=‘?+?str(page)
????????response?=?requests.get(url?headers=headers)
????????ob_json?=?json.loads(response.text)
????????list_comments?=?[]
????????if?‘data‘?in?ob_json:
????????????if?‘hot_data‘?in?ob_json[‘data‘]:
????????????????list_comments?=?ob_json[‘data‘][‘hot_data‘]
????????????else:
????????????????list_comments?=?ob_json[‘data‘][‘data‘]
????????return?list_comments??#?返回某條微博下評(píng)論
????def?getAll(self?id?page?path):??#?id為博主uid,page為爬取頁(yè)數(shù),path為保存路徑
????????list_cards?=?self.getCards(id?page)
????????if?len(list_cards)!=0:
????????count_weibo?=?1
????????page_weibo?=?1
????????#?遍歷當(dāng)頁(yè)所有微博,保存內(nèi)容,并根據(jù)id查找輸出熱門評(píng)論
????????ff?=?open(path?+?‘%s.txt‘%id?‘w‘?encoding=‘utf-8‘)
????????for?cards?in?list_cards:
????????????for?card?in?cards:
????????????????if?card[‘card_type‘]?==?9:??#?過(guò)濾出微博
????????????????????#?if?card[‘card_type‘]?==?9?and?‘raw_text‘?not?in?card[‘mblog‘]:??#?過(guò)濾出原創(chuàng)微博
????????????????????#?print(‘正在爬取第‘?+?str(page_weibo)?+?‘頁(yè)?第‘?+?str(count_weibo)?+?‘條card‘)
????????????????????mid?=?card[‘mblog‘][‘id‘]
????????????????????created_at?=?card[‘mblog‘][‘created_at‘]
????????????????????#?獲取保存文本信息
????????????????????if?not?card[‘mblog‘][‘isLongText‘]:?#?card[‘mblog‘][‘isLongText‘]?==?‘false‘
????????????????????????text?=?card[‘mblog‘][‘text‘]
????????????????????else:
????????????????????????url?=?‘https://m.weibo.cn/statuses/extend?id=‘?+?mid
????????????????????????response?=?requests.get(url?headers=headers)
????????????????????????ob_json?=?json.loads(response.text)??#?ob_json為dict類型
????????????????????????text?=?ob_json[‘data‘][‘longTextContent‘]
????????????????????????tree?=?html.fromstring(text)
????????????????????????text?=?tree.xpath(‘string(.)‘)??#?用string函數(shù)過(guò)濾掉多余標(biāo)簽
????????????????????ff.write(text?+?‘\n‘)
????????????????????#?print(text)
????????????????count_weibo?=?count_weibo?+?1
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件        5392  2019-01-29 15:31  weibo_crawl\CrawlAndDeal.py
     文件         942  2018-12-22 20:08  weibo_crawl\Deal.py
     文件        5665  2019-01-29 15:30  weibo_crawl\WeiboCrawl.py
     目錄           0  2019-01-29 15:33  weibo_crawl\weibo\
     目錄           0  2019-01-29 15:33  weibo_crawl\
評論
共有 條評論