-
大小: 51.25MB | 文件類型: .zip | 金幣: 1 | 下載: 0 次 | 發布日期: 2023-07-19
- 語言: Python
- 標簽:
資源簡介
對四種句子/文本相似度計算方法進行實驗與比較
代碼片段和文件信息
#encoding:utf-8
from?__future__?import?absolute_import
import?jieba
import?time
from?scipy?import?spatial
import?numpy?as?np
from?Utils.load_data?import?*
file_voc=‘./data/voc.txt‘
file_idf=‘./data/idf.txt‘
file_userdict=‘./data/medfw.txt‘
class SSIM(object):
    """Sentence similarity scorer with four interchangeable measures.

    The measures are cosine over summed word vectors (``M_cosine``),
    IDF-weighted cosine (``M_idf``), a BM25 score (``M_bm25``) and
    character-level Jaccard overlap (``M_jaccard``).  ``ssim`` dispatches to
    one of them by name.

    Attributes:
        voc: result of ``load_voc(file_voc)``; used as a token -> vector
            mapping (presumably numpy arrays -- verify against load_voc).
        idf: result of ``load_idf(file_idf)``; queried with
            ``idf.get(token, 1)``, so unknown tokens get weight 1.
    """

    def __init__(self):
        """Load the vocabulary vectors, the IDF table and jieba's user dictionary."""
        t1 = time.time()
        self.voc = load_voc(file_voc)
        print("Loading  word2vec vector cost %.3f seconds...\n" % (time.time() - t1))
        t1 = time.time()
        self.idf = load_idf(file_idf)
        print("Loading  idf data cost %.3f seconds...\n" % (time.time() - t1))
        jieba.load_userdict(file_userdict)

    @staticmethod
    def _summed_cosine(vectors1, vectors2):
        """Return the cosine similarity of the element-wise sums of two vector lists.

        Returns 0.0 when either list is empty.  Without this guard the sum of
        an empty array is the scalar 0.0, which is not a valid input for
        ``spatial.distance.cosine``.
        """
        if not vectors1 or not vectors2:
            return 0.0
        v1 = np.array(vectors1).sum(axis=0)
        v2 = np.array(vectors2).sum(axis=0)
        return 1 - spatial.distance.cosine(v1, v2)

    def M_cosine(self, s1, s2):
        """Cosine similarity between the summed word vectors of ``s1`` and ``s2``.

        Both sentences are segmented with jieba; tokens missing from
        ``self.voc`` are ignored.  Returns 0.0 if either sentence has no
        in-vocabulary token.
        """
        v1 = [self.voc[s] for s in jieba.lcut(s1) if s in self.voc]
        v2 = [self.voc[s] for s in jieba.lcut(s2) if s in self.voc]
        return self._summed_cosine(v1, v2)

    def M_idf(self, s1, s2):
        """Cosine similarity between IDF-weighted summed word vectors.

        Each in-vocabulary token's vector is scaled by ``self.idf.get(token, 1)``
        before summation.  Returns 0.0 if either sentence has no
        in-vocabulary token.
        """
        v1, v2 = [], []
        for s in jieba.lcut(s1):
            if s in self.voc:
                v1.append(1.0 * self.idf.get(s, 1) * self.voc[s])
        for s in jieba.lcut(s2):
            if s in self.voc:
                v2.append(1.0 * self.idf.get(s, 1) * self.voc[s])
        return self._summed_cosine(v1, v2)

    def M_bm25(self, s1, s2, s_avg=10, k1=2.0, b=0.75):
        """BM25 score of ``s2`` against the jieba tokens of ``s1``.

        Args:
            s1: query sentence; segmented with jieba.
            s2: document sentence; used unsegmented, so term frequency is
                ``s2.count(token)`` and the length is ``len(s2)`` in characters.
            s_avg: average document length used for length normalisation.
            k1: term-frequency saturation parameter.
            b: length-normalisation strength.

        Returns:
            The accumulated score; the integer 0 when ``s1`` yields no tokens.
        """
        bm25 = 0
        # The length normalisation depends only on s2, so compute it once.
        length_norm = k1 * (1 - b + b * len(s2) / s_avg)
        for w in jieba.lcut(s1):
            idf_s = self.idf.get(w, 1)
            freq = s2.count(w)
            bm25_ra = freq * (k1 + 1)
            bm25_rb = freq + length_norm
            bm25 += idf_s * (bm25_ra / bm25_rb)
        return bm25

    def M_jaccard(self, s1, s2):
        """Jaccard similarity between the character sets of ``s1`` and ``s2``.

        Returns 0.0 when both inputs are empty, instead of dividing by zero.
        """
        chars1 = set(s1)
        chars2 = set(s2)
        union = chars1.union(chars2)
        if not union:
            return 0.0
        return 1.0 * len(chars1.intersection(chars2)) / len(union)

    def ssim(self, s1, s2, model='cosine'):
        """Score ``s1`` against ``s2`` with the measure named by ``model``.

        ``model`` may be ``'idf'``, ``'bm25'`` or ``'jaccard'``; any other
        value (including the default ``'cosine'``) selects ``M_cosine``.
        """
        scorers = {
            'idf': self.M_idf,
            'bm25': self.M_bm25,
            'jaccard': self.M_jaccard,
        }
        f_ssim = scorers.get(model, self.M_cosine)
        return f_ssim(s1, s2)
# Build one shared SSIM instance at import time; its constructor loads the
# vocabulary, the IDF table and the jieba user dictionary.
sm=SSIM()
# Expose the bound dispatcher as a module-level function:
# ssim(s1, s2, model='cosine').
ssim=sm.ssim
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2018-07-13 09:01  sentence-similarity-master\
     文件        1203  2018-07-13 09:01  sentence-similarity-master\.gitignore
     文件        1063  2018-07-13 09:01  sentence-similarity-master\LICENSE
     文件        2127  2018-07-13 09:01  sentence-similarity-master\README.md
     目錄           0  2018-07-13 09:01  sentence-similarity-master\Utils\
     文件        1682  2018-07-13 09:01  sentence-similarity-master\Utils\compute_idf.py
     文件         741  2018-07-13 09:01  sentence-similarity-master\Utils\get_sentence.py
     文件         629  2018-07-13 09:01  sentence-similarity-master\Utils\load_data.py
     文件        1346  2018-07-13 09:01  sentence-similarity-master\Utils\train_word2vec.py
     目錄           0  2018-07-13 09:01  sentence-similarity-master\data\
     文件    49367332  2018-07-13 09:01  sentence-similarity-master\data\file_corpus.txt
     文件      453237  2018-07-13 09:01  sentence-similarity-master\data\file_sentence.txt
     文件     5802916  2018-07-13 09:01  sentence-similarity-master\data\idf.txt
     文件       56840  2018-07-13 09:01  sentence-similarity-master\data\medfw.txt
     文件        2978  2018-07-13 09:01  sentence-similarity-master\data\test_result.txt
     文件    86874190  2018-07-13 09:01  sentence-similarity-master\data\voc.txt
     目錄           0  2018-07-13 09:01  sentence-similarity-master\images\
     文件       88419  2018-07-13 09:01  sentence-similarity-master\images\bm25.png
     文件      143263  2018-07-13 09:01  sentence-similarity-master\images\cosine.png
     文件      162673  2018-07-13 09:01  sentence-similarity-master\images\idf.png
     文件      161561  2018-07-13 09:01  sentence-similarity-master\images\jaccard.png
     文件      164037  2018-07-13 09:01  sentence-similarity-master\images\result.png
     文件        2437  2018-07-13 09:01  sentence-similarity-master\similarity.py
     文件        1454  2018-07-13 09:01  sentence-similarity-master\test.py
評論
共有 條評論