資源簡介
抓取了網易新聞裡["國內","國際","航空"]和["軍事","科技","體育","教育"]板塊的新聞內容和評論，並存到數據庫中。
代碼片段和文件信息
import?json
import?requests
import?re
from?selenium?import?webdriver
import?time
import?os
from?bs4?import?BeautifulSoup
import?pymysql
def getHTMLText(url):
    """Fetch *url* over HTTP and return the response body as text.

    The request is sent with a desktop Firefox User-Agent header and a
    30-second timeout. The body is decoded as GBK.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or an empty string if the request fails
        or the server answers with an HTTP error status.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    try:
        r = requests.get(url, headers=headers, timeout=30)
        # Reject 4xx/5xx responses before spending any effort on decoding.
        r.raise_for_status()
        r.encoding = 'GBK'
        return r.text
    except requests.RequestException:
        # Best-effort fetch: callers receive "" for any network/HTTP failure.
        # Only request-related errors are caught so that real bugs and
        # KeyboardInterrupt are not silently swallowed.
        return ""
def?getContent(urlntype):
????#===============================獲取新聞頁面==============================
????tbnews_id?=?0
????commentids=[]
????print(url)
????html?=?getHTMLText(url)
????soup?=?BeautifulSoup(html“html.parser“)
????[s.extract()?for?s?in?soup([‘script‘‘style‘])]
????title?=?soup.find_all(“h1“)???????????#爬取標題
????if?title==[]:
????????return?
????else:tit=title[0].get_text()
????path?=?soup.select(“.post_crumb“)
????time=soup.select(“.post_time_source“)?????????????#爬取時間和來源
????if?time==[]:
????????tim?=?
- 上一篇:HFSS雙極化喇叭天線設計
- 下一篇:學校簽到小程序
評論
共有 條評論