資源簡介
根據關鍵字、起始時間和天數,自動抓取該時間段內的微博,包括微博 id、用戶 id、時間、vip、微博內容、轉發信息、轉發信息的轉發數和評論數,並寫入 Excel 表格。
代碼片段和文件信息
import datetime
import re
from http import cookiejar
from urllib import parse
from urllib import request
from urllib.request import urlopen

import xlwt
from bs4 import BeautifulSoup
from lxml import etree

# from myfunction import ntoc
# from myfunction import dict_freq_sort
# Excel workbook and its single worksheet, created with xlwt.
# The page description says the scraped results are written to an Excel sheet.
wbk = xlwt.Workbook()
sheet = wbk.add_sheet('sheet 1')
# Crawl parameters.
keyword = '食品安全'  # search term; URL-encoded later with parse.quote
# Anchor date of the crawl. The crawl loop appears to call
# getday(y, m, d, -i), i.e. it walks backwards from this date.
y = 2019
m = 3
d = 1
days = 10  # number of consecutive days the crawl loop iterates over
def getday(y, m, d, n):
    """Return the date ``n`` days away from ``y``-``m``-``d`` as ``'YYYY-MM-DD'``.

    Args:
        y: Calendar year of the anchor date.
        m: Month of the anchor date (1-12).
        d: Day of month of the anchor date.
        n: Offset in days; negative values move back in time.

    Returns:
        The shifted date formatted with ``'%Y-%m-%d'``.

    Raises:
        ValueError: If ``y``, ``m``, ``d`` do not form a valid calendar date.
        OverflowError: If the shifted date falls outside the range
            supported by ``datetime``.
    """
    the_date = datetime.datetime(y, m, d)
    result_date = the_date + datetime.timedelta(days=n)
    return result_date.strftime('%Y-%m-%d')
# Percent-encode the search keyword so it can be embedded in the query string.
url_keyword = parse.quote(keyword)

# Request preparation: headers sent with every search request.
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763'
headers = {'User-Agent': user_agent, 'Referer': ''}

# Build an opener that keeps cookies between requests.
# cookiejar.MozillaCookieJar(filename) is the alternative constructor that can
# save and reload cookies; cookie.load(filename, ignore_discard=True,
# ignore_expires=True) would read a previously saved jar.
cookie = cookiejar.CookieJar()
handler = request.HTTPCookieProcessor(cookie)
opener = request.build_opener(handler)
# Disabled patterns kept from the original script (pattern text left as found):
# pattern_newline = re.compile(r',|?|。|!|……|:|;')  # break the text after every sentence
# pattern_del_blank = re.compile(r'?||的秒拍視頻|“|”|、|網頁鏈接|《|》|收起全文d|@|【|】|“|“')  # strip blanks and Weibo markup
# pattern_del = re.compile(r'#.*#')  # strip the #topic# tag from a post

# NOTE(review): the Chinese literals below appear in Traditional characters in
# this copy of the script. They are matched against text fetched from
# s.weibo.com, which presumably uses Simplified characters — confirm they
# match the live page before relying on them.

# One or more consecutive CJK ideographs in the U+4E00..U+9FA5 range.
pattern_chinese = re.compile(u"[\u4e00-\u9fa5]+")

# Message shown when a search has no (more) results; the crawl loop stops
# paging once this pattern is found in the response.
# NOTE(review): the end of this literal was damaged in this copy; the trailing
# full stop is a reconstruction — verify against the page text.
end_str = '抱歉,未找到“' + keyword + '”相關結果。'
# re.escape keeps a keyword containing regex metacharacters (e.g. "C++") from
# breaking or changing the pattern; for plain keywords the match is unchanged.
pattern_endweibo = re.compile(re.escape(end_str))

pattern1 = re.compile('展開全文c')  # "expand full text" link label
pattern_del_sentence = re.compile('的微博視頻')  # "...'s Weibo video" caption
pattern_save = re.compile(r'【.*?】|《.*?》')  # shortest 【...】 or 《...》 span

# A CJK ideograph followed by the shortest run ending in a sentence terminator.
# NOTE(review): the middle terminator was damaged in this copy; the full-width
# question mark is a reconstruction that parallels the full-width 。 and !.
patterb_firstsen = re.compile(r'[\u4e00-\u9fa5](.*?)。|[\u4e00-\u9fa5](.*?)?|[\u4e00-\u9fa5](.*?)!')

pattern_time = re.compile(r'..:..')  # any two characters, a colon, any two characters

# Running counter read by the crawl loop as the current row number
# (``rowNum = sum``). It shadows the builtin ``sum``; the name is kept because
# the loop refers to it.
sum = 0
for?i?in?range(days):
????data=getday(ymd-i)
????print(data)
????for?j?in?range(24):
????????if?j==23:
????????????data_add_hour?=?data?+?‘-‘?+?str(j)?+?‘:‘?+getday(ymd-(i-1))?+?‘-‘?+?str(0)
????????else:
????????????data_add_hour?=?data?+?‘-‘?+?str(j)?+?‘:‘?+?data?+?‘-‘?+?str(j?+?1)
????????print(data_add_hour+‘:‘)
????????for?k?in?range(50):
????????????url?=?‘https://s.weibo.com/weibo?q=‘+url_keyword+‘&typeall=1&suball=1×cope=custom:‘+data_add_hour+‘&Refer=g&page=‘+str(k+1)
????????????requ?=?request.Request(url=urlheaders=headers)??#?data#headers
????????????try:
????????????????respones?=?opener.open(requtimeout=60)??#?timeout=10?使用自己建的opener處理requests
????????????????#?cookie.save(ignore_discard=True?ignore_expires=True)??保存cookie
????????????????web_data?=?respones.read().decode(“utf-8“?“ignore“)
????????????????if?pattern_endweibo.findall(web_data)!=[]:
????????????????????#print(‘該時段沒有更多結果‘)
????????????????????break
????????????????page?=?etree.HTML(web_data)
????????????????weibo_list?=?page.xpath(“//div[@mid]“)
????????????????for?p?in?weibo_list:
????????????????????rowNum?=?sum
????????????????????#print(‘=============‘)
????????????????????mid?=
- 上一篇:PCA結合馬氏距離 py代碼
- 下一篇:month2day.py
評論
共有 條評論