資源簡介
此資料是用來爬取新浪微博評論的，修改最後的 uid 值即可
代碼片段和文件信息
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 18 13:11:58 2018
@author: qizhiliu
"""
import json
import math
import re  # used to clean up irregular text
import time

import jieba
import requests  # HTTP client library
from lxml import html

# from lastLine import get_last_line
# import os

# Append-mode, UTF-8 output file that the scraper writes segmented text to.
# It is a module-level handle because the class methods reference `f` directly.
f = open(r'D:\test\test7.txt', 'a+', encoding='utf-8')

# uid=2803301701

# time.clock() was removed in Python 3.8; time.perf_counter() is the
# documented replacement for measuring elapsed time.
# NOTE(review): presumably read later in the file to report run time - confirm.
start = time.perf_counter()
class?weibo(object):
????
????def?get_weibo(selfidpage_idpage):
????????url=‘https://m.weibo.cn/api/container/getIndex?uid={}&type=uid&value={}&containerid={}&page={}‘.format(ididpage_idpage)
????????response=requests.get(url)
????????ob_json?=json.loads(response.text)
????????#print?(response.text)
????????#print?(ob_json)
????????list_cards=ob_json.get(‘data‘).get(‘cards‘)
????????#list_text=ob_json.get(‘text‘)
????????#print?(list_text)
????????#print(list_cards)
????????return?list_cards
????def?get_comments(selfidpage):
????????url=‘https://m.weibo.cn/api/comments/show?id={}&page={}‘.format(idpage)
????????response=requests.get(url)
????????ob_json?=json.loads(response.text)
????????if?len(ob_json)<3:
????????????list_comments=‘‘
????????else:
????????????list_comments=ob_json.get(‘data‘).get(‘data‘)
???????#?print?(list_comments)
????????
????????return?list_comments
????def?main(selfidpagepage_id):
????????list_cards??=?self.get_weibo(idpage_idpage)
????????#print?(list_cards)
????????for?card?in?list_cards:
????????????if?card.get(‘card_type‘)==9:??#等于9的微博才不是廣告
????????????????id?=?card.get(‘mblog‘).get(‘id‘)
????????????????text=?card.get(‘mblog‘).get(‘text‘)
????????????????if?text!=‘‘:
????????????????????tree=html.fromstring(text)
????????????????????text=tree.xpath(‘string(.)‘)??????????????????
????????????????????text=re.sub(r‘回復.*?:‘‘‘text)
????????????????????text=re.sub(r‘?‘‘?‘text)
????????????????????text=re.sub(r“@.*?“‘‘text)
????????????????????text?=?jieba.cut(text)
????????????????????text=“?“.join(text)
????????????????????#f.write(“***“)
????????????????????#f.write(‘@@@微博‘)
????????????????????f.write(text)
????????????????????f.write(‘\n‘)
????????????????else:
????????????????????pass
????????????????b=1
????????????????#tree=html.fromstring(text)
????????????????#text=tree.xpath(‘string(.)‘)
????????????????while?True:
????????????????????list_comments=weibo.get_comments(idb)#獲取博文對應的評論界面
????????????????????b+=1
????????????????????if?b+1%10==0:
????????????????????????print(‘成功爬取100頁評論‘)
????????????????????if?len(list_comments)<1:
????????????????????????break
????????????????????else:
????????????????????????
????????????????????????count_hotcomments?=?1
????????????????????????for?comment?in?list_comments:
?????????????????#???????????user_id?=?comment.get(‘user_id‘)
??????????????????#??????????created_at?=?comment.get(‘created_at‘)
????????????????????????????#link_counts?=?comment.get(‘like_counts‘)
????????????????????????????text?=?comment.get(‘text‘)
???????????????
- 上一篇:shuake.py
- 下一篇:Python淘寶評論爬取
評論
共有 條評論