資源簡介
python檢索新浪微博.zip

代碼片段和文件信息
#?coding:?utf-8
import?urllib.request
import?time
import?random
from?lxml?import?etree
import?logging
import?xlrd
from?xlutils.copy?import?copy
#?導(dǎo)入所需模塊
import?urllib.error
import?urllib.request
import?urllib.parse
import?re
import?rsa
import?http.cookiejar??#?從前的cookielib
import?base64
import?json
import?urllib
import?binascii
class?CollectData():
????“““數(shù)據(jù)收集類(lèi)
???????利用微博高級(jí)搜索功能,按關(guān)鍵字搜集一定時(shí)間范圍內(nèi)的微博。
????“““
????def?__init__(self?keyword?area?startTime?interval=‘50‘fileS=“weibo.csv“flag=True?begin_url_per=“http://s.weibo.com/weibo/“):
????????self.begin_url_per?=?begin_url_per??#?設(shè)置固定地址部分
????????self.setKeyword(keyword)??#?設(shè)置關(guān)鍵字
????????self.setArea(area)??#?設(shè)置關(guān)鍵字
????????self.setStartTimescope(startTime)??#?設(shè)置搜索的開(kāi)始時(shí)間
????????#?self.setRegion(region)??#設(shè)置搜索區(qū)域
????????self.setInterval(interval)??#?設(shè)置鄰近網(wǎng)頁(yè)請(qǐng)求之間的基礎(chǔ)時(shí)間間隔(注意:過(guò)于頻繁會(huì)被認(rèn)為是機(jī)器人)
????????self.setFileS(fileS)??#?設(shè)置鄰近網(wǎng)頁(yè)請(qǐng)求之間的基礎(chǔ)時(shí)間間隔(注意:過(guò)于頻繁會(huì)被認(rèn)為是機(jī)器人)
????????self.setFlag(flag)
????#設(shè)置關(guān)鍵字
????#關(guān)鍵字需解碼后編碼為utf-8
????def?setKeyword(self?keyword):
????????self.keyword?=?keyword.encode(“utf-8“)
????def?setArea(self?area):
????????self.area?=?area
????def?setFileS(self?fileS):
????????self.fileS?=?fileS
????def?getKeyWord(self):
????????return?urllib.parse.quote(self.keyword)
????def?getArea(self):
????????return?self.area
????????##設(shè)置起始范圍,間隔為1天
????#格式為:yyyy-mm-dd
????def?setStartTimescope(self?startTime):
????????if?not?(startTime?==?‘-‘):
????????????self.timescope?=?startTime
????????else:
????????????self.timescope?=?‘-‘
????##設(shè)置鄰近網(wǎng)頁(yè)請(qǐng)求之間的基礎(chǔ)時(shí)間間隔
????def?setInterval(self?interval):
????????self.interval?=?int(interval)
????????def?setInterval(self?interval):
????????????self.interval?=?int(interval)
????#設(shè)置是否被認(rèn)為機(jī)器人的標(biāo)志。
????def?setFlag(self?flag):
????????self.flag?=?flag
????#構(gòu)建URL
????def?getURL(self):
????????return?self.begin_url_per?+?“?q=“?+?self.getKeyWord()?+?“®ion=custom:“?+?self.getArea()?+?“&scope=ori&suball=1×cope=custom:“?+?self.timescope?+?“&Refer=g&page=“
????????##爬取一次請(qǐng)求中的所有網(wǎng)頁(yè),最多返回50頁(yè)
????def?download(self?url?maxTryNum=4):
????????hasMore?=?True??#?某次請(qǐng)求可能少于50頁(yè),設(shè)置標(biāo)記,判斷是否還有下一頁(yè)
????????isCaught?=?False??#?某次請(qǐng)求被認(rèn)為是機(jī)器人,設(shè)置標(biāo)記,判斷是否被抓住。抓住后,需要,進(jìn)入頁(yè)面,輸入驗(yàn)證碼
????????i?=?1??#?記錄本次請(qǐng)求所返回的頁(yè)數(shù)
????????while?hasMore?and?i?99?and?(not?isCaught):??#?最多返回98頁(yè),對(duì)每頁(yè)進(jìn)行解析,并寫(xiě)入結(jié)果文件
????????????source_url?=?url?+?str(i)??#?構(gòu)建某頁(yè)的URL
????????????data?=?‘‘??#?存儲(chǔ)該頁(yè)的網(wǎng)頁(yè)數(shù)據(jù)
????????????goon?=?True??#?網(wǎng)絡(luò)中斷標(biāo)記
????????????##網(wǎng)絡(luò)不好的情況,試著嘗試請(qǐng)求三次
????????????for?tryNum?in?range(maxTryNum):
????????????????try:
????????????????????html?=?urllib.request.urlopen(source_url?timeout=12)
????????????????????data?=?html.read().decode()
????????????????????break
????????????????except:
????????????????????if?tryNum?(maxTryNum?-?1):
????????????????????????time.sleep(10)
????????????????????else:
????????????????????????print(‘Internet?Connect?Error!‘)
????????????????????????self.flag?=?False
???????????
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2019-03-16 13:06  python檢索新浪微博\
     文件      267829  2019-03-15 13:26  python檢索新浪微博\readme.docx
     文件       59392  2019-03-15 13:15  python檢索新浪微博\weiboData.xls
     文件        8287  2019-03-16 13:02  python檢索新浪微博\xinLang.py
- 上一篇:魔塔
- 下一篇:keras .whl文件 用于python3
評(píng)論
共有 條評(píng)論