-
大小: 888KB | 文件類型: .zip | 金幣: 2 | 下載: 1 次 | 發布日期: 2021-05-28
- 語言: Python
- 標簽:
資源簡介
利用Python實現中文文本關鍵詞抽取,分別采用TF-IDF、TextRank、Word2Vec詞聚類三種方法。

代碼片段和文件信息
#!/usr/bin/python
# coding=utf-8
# Extract text keywords using the TextRank method
import sys

import jieba.analyse
import pandas as pd
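"""
TextRank weighting:
    1. Segment the text to extract keywords from, remove stop words, and
       filter by part of speech.
    2. Build a graph from word co-occurrence within a fixed-size window
       (default 5, adjustable via the span attribute).
    3. Compute PageRank for the graph's nodes; note that the graph is
       undirected and weighted.
"""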
# Process titles and abstracts to extract keywords
def getKeywords_textrank(data, topK):
    """Extract TextRank keywords from the title and abstract of every row.

    Args:
        data: Table indexable by the column names ``'id'``, ``'title'`` and
            ``'abstract'`` (``main`` passes the result of ``pd.read_csv``).
        topK: Maximum number of keywords requested from jieba for each row.

    Returns:
        A ``pandas.DataFrame`` with the columns ``id``, ``title`` and ``key``;
        ``key`` holds the row's keywords joined by single spaces.
    """
    idList, titleList, abstractList = data['id'], data['title'], data['abstract']

    # Load the custom stop-word list once: it is the same for every row.
    jieba.analyse.set_stop_words("data/stopWord.txt")
    # Only words tagged with one of these parts of speech are kept.
    allowed_pos = ('n', 'nz', 'v', 'vd', 'vn', 'l', 'a', 'd')

    ids, titles, keys = [], [], []
    for doc_id, title, abstract in zip(idList, titleList, abstractList):
        text = '%s。%s' % (title, abstract)  # join title and abstract
        # Report the requested keyword count instead of a hard-coded 10.
        print('" %s "  %s Keywords - TextRank :' % (title, topK))
        keywords = jieba.analyse.textrank(text, topK=topK, allowPOS=allowed_pos)
        word_split = " ".join(keywords)
        print(word_split)
        # Python 2 yields ``unicode`` here and the original stored it as UTF-8
        # bytes; on Python 3 the value is already ``str`` and encoding it
        # would put a ``bytes`` object (written as b'...') into the table.
        if not isinstance(word_split, str):
            word_split = word_split.encode("utf-8")
        keys.append(word_split)
        ids.append(doc_id)
        titles.append(title)

    result = pd.DataFrame(
        {"id": ids, "title": titles, "key": keys},
        columns=['id', 'title', 'key'],
    )
    return result


def main():
    """Run TextRank keyword extraction on the sample data and save the result.

    Reads ``data/sample_data.csv``, asks ``getKeywords_textrank`` for up to 10
    keywords per row, and writes the returned table to
    ``result/keys_TextRank.csv`` without the index column.
    """
    dataFile = 'data/sample_data.csv'
    data = pd.read_csv(dataFile)
    result = getKeywords_textrank(data, 10)
    result.to_csv("result/keys_TextRank.csv", index=False)


# Run the extraction only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2018-01-16 01:10  keyword_extraction-master\
     文件       22135  2018-01-16 01:10  keyword_extraction-master\README.md
     目錄           0  2018-01-16 01:10  keyword_extraction-master\data\
     文件        6782  2018-01-16 01:10  keyword_extraction-master\data\sample_data.csv
     文件        9373  2018-01-16 01:10  keyword_extraction-master\data\stopWord.txt
     文件        1638  2018-01-16 01:10  keyword_extraction-master\keyextract_textrank.py
     文件        3722  2018-01-16 01:10  keyword_extraction-master\keyextract_tfidf.py
     文件        2686  2018-01-16 01:10  keyword_extraction-master\keyextract_word2vec_1.py
     文件        4045  2018-01-16 01:10  keyword_extraction-master\keyextract_word2vec_2.py
     目錄           0  2018-01-16 01:10  keyword_extraction-master\result\
     文件        1130  2018-01-16 01:10  keyword_extraction-master\result\keys_TFIDF.csv
     文件        1130  2018-01-16 01:10  keyword_extraction-master\result\keys_TextRank.csv
     文件        1133  2018-01-16 01:10  keyword_extraction-master\result\keys_word2vec.csv
     目錄           0  2018-01-16 01:10  keyword_extraction-master\result\vecs\
     文件      375478  2018-01-16 01:10  keyword_extraction-master\result\vecs\wordvecs_1.csv
     文件      141841  2018-01-16 01:10  keyword_extraction-master\result\vecs\wordvecs_10.csv
     文件      184382  2018-01-16 01:10  keyword_extraction-master\result\vecs\wordvecs_2.csv
     文件      154035  2018-01-16 01:10  keyword_extraction-master\result\vecs\wordvecs_3.csv
     文件      154759  2018-01-16 01:10  keyword_extraction-master\result\vecs\wordvecs_4.csv
     文件      263297  2018-01-16 01:10  keyword_extraction-master\result\vecs\wordvecs_5.csv
     文件      160204  2018-01-16 01:10  keyword_extraction-master\result\vecs\wordvecs_6.csv
     文件      136095  2018-01-16 01:10  keyword_extraction-master\result\vecs\wordvecs_7.csv
     文件      185739  2018-01-16 01:10  keyword_extraction-master\result\vecs\wordvecs_8.csv
     文件      196662  2018-01-16 01:10  keyword_extraction-master\result\vecs\wordvecs_9.csv
     文件        2136  2018-01-16 01:10  keyword_extraction-master\詞性標注參考.txt
評論
共有 條評論