資源簡介
通過搜狗搜索中的微信搜索入口來爬取微信公眾號上的文章 時間,文章標題,文章地址,文章簡介、圖片
代碼片段和文件信息
#!/usr/bin/python
# coding: utf-8
'''
總的來說就是通過搜狗搜索中的微信搜索入口來爬取
2017-04-13 by Jimy_fengqi
'''
import os
import re
import time
import urllib
import urllib.request
from urllib import parse

import requests
import xlwt as xlwt
from pyquery import PyQuery as pq
from selenium import webdriver
def dirsIsExists(path):
    """Ensure the directory ``path`` exists and print what was found.

    Prints an "OK" message naming ``path`` when it already exists.
    Otherwise prints a "Sorry" message naming ``path`` and creates the
    directory, including any missing parent directories.

    Args:
        path: Directory path to check and, if absent, create.
    """
    if os.path.exists(path):
        message = 'OK the "%s" file exists.'
    else:
        message = 'Sorry I cannot find the "%s" file.'
        # exist_ok keeps this from failing if the directory appears
        # between the existence check above and this call.
        os.makedirs(path, exist_ok=True)
    # Both templates carry a %s placeholder; fill it with the path so the
    # message names the directory instead of printing a literal "%s".
    print(message % (path,))
    #爬蟲主函數
def saveImgTwo(path, nameList):
    """Download the picture of every entry in ``nameList`` into ``path``.

    The target directory is created first via ``dirsIsExists``. Each entry's
    ``"pic"`` value is printed; entries whose value is empty or does not
    contain ``'http'`` are skipped. Downloaded files are named ``0.jpg``,
    ``1.jpg``, ... in download order, so skipped entries leave no gaps.

    Args:
        path: Directory that receives the downloaded images.
        nameList: Sequence of mappings, each providing a ``"pic"`` key that
            holds an image URL.
    """
    dirsIsExists(path)
    saved = 0
    for position, item in enumerate(nameList, start=1):
        imgurl = item["pic"]
        print("圖片%d:%s\n" % (position, imgurl))
        # Guard against None/empty values before the substring test, which
        # would otherwise raise TypeError.
        if not imgurl or 'http' not in imgurl:
            continue
        print("第 %s" % saved + "張圖片")
        # os.path.join avoids the doubled separator that plain string
        # concatenation of path + '/' + '/N.jpg' would produce.
        urllib.request.urlretrieve(imgurl, os.path.join(path, '%s.jpg' % saved))
        saved += 1
            #判斷目錄是否存在不存在就生成
class?weixin_spider:
????def?__init__(self?keywords):
????????‘?構(gòu)造函數(shù)?‘
????????self.keywords?=?keywords
????????#?搜狐微信搜索鏈接入口
????????#self.sogou_search_url?=?‘http://weixin.sogou.com/weixin?type=1&query=%s&ie=utf8&_sug_=n&_sug_type_=‘?%?quote(self.keywords)
????????self.sogou_search_url?=?‘http://weixin.sogou.com/weixin?type=1&query=%s&ie=utf8&s_from=input&_sug_=n&_sug_type_=‘?%?parse.quote(self.keywords)
????????#?爬蟲(chóng)偽裝頭部設(shè)置
????????self.headers?=?{‘User-Agent‘:?‘Mozilla/5.0?(Windows?NT?6.3;?WOW64;?rv:51.0)?Gecko/20100101?Firefox/51.0‘}
????????#?設(shè)置操作超時(shí)時(shí)長(zhǎng)
????????self.timeout?=?5
????????#?爬蟲(chóng)模擬在一個(gè)request.session中完成
????????self.s?=?requests.Session()
????????#excel?第一行數(shù)據(jù)
????????self.excel_data=[u‘編號(hào)‘u‘時(shí)間‘u‘文章標(biāo)題‘u‘文章地址‘u‘文章簡(jiǎn)介‘]
????????#定義excel操作句柄
????????self.excle_w=xlwt.Workbook()
    #搜索入口地址,以公眾為關鍵字搜索該公眾號
????def?get_search_result_by_keywords(self):
????????self.log(u‘搜索地址為:%s‘?%?self.sogou_search_url)
????????return?self.s.get(self.sogou_search_url?headers=self.headers?timeout=self.timeout).content
    #獲得公眾號主頁地址
????def?get_wx_url_by_sougou_search_html(self?sougou_search_html):
????????doc?=?pq(sougou_search_html)
????????#print?doc(‘p[class=“tit“]‘)(‘a(chǎn)‘).attr(‘href‘)
????????#print?doc(‘div[class=img-box]‘)(‘a(chǎn)‘).attr(‘href‘)
????????#通過(guò)pyquery的方式處理網(wǎng)頁(yè)內(nèi)容,類(lèi)似用beautifulsoup,但是pyquery和jQuery的方法類(lèi)似,找到公眾號(hào)主頁(yè)地址
????????return?doc(‘div[class=txt-box]‘)(‘p[class=tit]‘)(‘a(chǎn)‘).attr(‘href‘)
    #使用webdriver 加載公眾號主頁內容,主要是js渲染的部分
????def?get_selenium_js_html(self?url):
????????browser?=?webdriver.PhantomJS(executable_path=r‘D:\mytoolssoft\idea_Tool\phantomjs-2.1.1-windows\bin\phantomjs.exe‘)
????????browser.get(url)
????????time.sleep(3)
????????#?執(zhí)行js得到整個(gè)頁(yè)面內(nèi)容
????????html?=?browser.execute_script(“return?document.documentElement.outerHTML“)
????????browser.close()
????????return?html
    #獲取公眾號文章內容
????def?parse_wx_articles_by_html(self?selenium_html):
????????doc?=?pq(selenium_html)
????????print?(u‘開(kāi)始查找內(nèi)容msg‘)
????????return?doc(‘div[class=“weui_media_box?appmsg“]‘)
????#有的公眾號(hào)僅僅有
評論
共有 條評論