資源簡介
代碼片段和文件信息
import json
import re

import jieba
import requests
from bs4 import BeautifulSoup
#獲取html頁面信息
def getKeywordResult(keyword, pagenum):
    """Fetch one Baidu search results page for *keyword* and return its HTML.

    Args:
        keyword: Search term, sent as the ``wd`` query parameter.
        pagenum: Page number. The digit ``0`` is appended to it to form the
            ``pn`` query parameter (e.g. ``"2"`` becomes ``pn=20``). Both
            strings and integers are accepted.

    Returns:
        The response body decoded as UTF-8, or an empty string when the
        request fails (connection error, timeout, or non-2xx status).
    """
    # Let requests build the query string so that characters such as '&',
    # '#' or spaces inside the keyword are percent-encoded instead of
    # corrupting the URL.
    params = {'wd': keyword, 'pn': str(pagenum) + '0'}
    try:
        r = requests.get('http://www.baidu.com/s', params=params, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: callers receive an empty page on any HTTP or
        # network failure.
        return ""
    r.encoding = 'utf-8'
    return r.text
#解析并抽取數據
def parserlinks(html):
    """Extract result titles from a search results page.

    Every ``<div>`` whose ``data-tools`` attribute contains the text
    ``title`` is inspected; the attribute is parsed as JSON and its
    ``"title"`` field is collected.

    Side effect: each title is also appended to the module-level list
    ``words_all``.
    NOTE(review): ``words_all`` is defined outside the code visible here;
    confirm it is initialised before this function is first called.

    Args:
        html: HTML text of the results page.

    Returns:
        A tuple ``(links, words_all)`` where ``links`` holds the titles
        found in this page and ``words_all`` is the shared accumulator.
    """
    soup = BeautifulSoup(html, "html.parser")
    links = []
    for div in soup.find_all('div', {'data-tools': re.compile('title')}):
        try:
            title = json.loads(div.attrs['data-tools'])['title']
        except (ValueError, KeyError, TypeError):
            # The regex only guarantees that "title" occurs somewhere in the
            # attribute text; skip entries that are not valid JSON or that
            # lack a top-level "title" key instead of aborting the page.
            continue
        links.append(title)
        words_all.append(title)
    return links, words_all
#詞頻統計
def?words_ratio(words_all):
????words?=?[]
????for?i?in?words_all:
????????tmp?=?jieba.lcut(i)
?
評論
共有 條評論