資源簡介
包括Python分詞,去停用詞,使用gensim包進行LDA主題分析,并輸出每條文檔屬于每個主題的概率的代碼,以及停用詞表

代碼片段和文件信息
import jieba
# jieba.load_userdict('userdict.txt')
# 創建停用詞list
def stopwordslist(filepath):
    """Load a stopword list from a UTF-8 text file, one word per line.

    Parameters:
        filepath: path to the stopword file.

    Returns:
        A list of stopwords with surrounding whitespace stripped
        (blank lines become empty strings, matching the original behavior).
    """
    # The original called open(...).readlines() without closing the handle;
    # a context manager closes the file deterministically, and iterating the
    # file object directly avoids materializing readlines() first.
    with open(filepath, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]
#?對句子進行分詞
def seg_sentence(sentence):
    """Segment a Chinese sentence with jieba and remove stopwords.

    Parameters:
        sentence: raw input line; leading/trailing whitespace is stripped
            before segmentation.

    Returns:
        The kept tokens joined by single spaces (empty string if every
        token was filtered out).
    """
    # NOTE(review): the stopword file is re-read on every call; hoist the
    # load to module level if this is called per-line on a large corpus.
    # A set gives O(1) membership tests instead of the original O(n) list scan.
    stopwords = set(stopwordslist('D:/LDA/stopwords.txt'))  # 這里加載停用詞的路徑
    tokens = jieba.cut(sentence.strip())
    # '\t' tokens are layout artifacts of the raw text, not real words.
    kept = [word for word in tokens if word != '\t' and word not in stopwords]
    # join is linear; the original quadratic string += also left a
    # trailing space, which served no purpose and is dropped here.
    return " ".join(kept)
#讀入需要分詞的文件
inputs?=?open(‘D:/LDA/dp.txt‘?‘r‘?encoding=‘utf-8‘)
#輸出分詞結果
outputs?=?open(‘D:/LDA/dp_fenci.txt‘?‘w‘encoding=‘utf-8‘)
for?line?in?inputs:
????line_seg?=?seg_sentence(line)??#?這里的返回值是字符串
????outputs.write(line_seg?+?‘\n‘)
outputs.close()
inputs.close()
?屬性????????????大小?????日期????時間???名稱
-----------?---------??----------?-----??----
?????文件???????15267??2018-04-10?17:44??stopwords.txt
?????文件????????1150??2018-05-16?22:04??topic_model1.py
?????文件?????????946??2018-05-16?14:45??quting.py
- 上一篇:DNN判斷句子的通順程度.py
- 下一篇:基于python的網絡爬蟲設計
評論
共有 條評論