資源簡介
基于深度學習的文本相似度計算模型和代碼,親自跑過可以直接使用,對nlp領域的學習非常有借鑒意義,在智能問答系統上經常會用到。
代碼片段和文件信息
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Author:yanqiang
@File: build_input.py
@Time: 2018/11/30 17:41
@Software: PyCharm
@Description: 構建模型的輸入
"""
import os
from collections import Counter

import numpy as np
from gensim.models import Word2Vec
from keras.preprocessing.sequence import pad_sequences
# train = load_atec()
# train, dev, test = load_ccks()
def select_best_length(train, limit_ratio=0.95):
    """Choose a padding length that covers most sentences in the dataset.

    The lengths of every item in ``train['q1']`` and ``train['q2']`` are
    pooled, and the smallest length ``L`` is returned such that at least
    ``limit_ratio`` of all pooled sentences have length ``<= L``.

    :param train: object indexable by ``'q1'`` and ``'q2'``, each yielding
        an iterable of sized items (e.g. strings); the two are walked in
        lockstep with ``zip``.
    :param limit_ratio: fraction of sentences that must fit within the
        returned length; defaults to 0.95.
    :return: the selected max length; 0 when there are no sentences or
        when ``limit_ratio`` can never be reached (e.g. a value above 1).
    """
    length_counts = Counter()
    for q1, q2 in zip(train['q1'], train['q2']):
        length_counts[len(q1)] += 1
        length_counts[len(q2)] += 1

    all_sent = sum(length_counts.values())
    if all_sent == 0:
        # Nothing to measure; avoid dividing by zero below.
        return 0

    total_chars = sum(length * count for length, count in length_counts.items())
    average_length = total_chars / all_sent

    max_length = 0
    covered = 0
    # Walk lengths in ascending order so the running count really is the
    # number of sentences no longer than the current length. Iterating in
    # frequency order (Counter.most_common) would not give a coverage rate.
    for length, count in sorted(length_counts.items()):
        # Integer accumulation avoids float drift when comparing against 1.0.
        covered += count
        if covered / all_sent >= limit_ratio:
            max_length = length
            break

    print('average_length:', average_length)
    print('max_length:', max_length)
    return max_length
# select_best_length()
#返回train_xy
def build_data(train):
    """Build character-level samples and a vocabulary from a dataset.

    Each entry of ``train.q1`` / ``train.q2`` is split into its individual
    (truthy) elements, i.e. characters when the entries are strings. The
    vocabulary is the set of all such elements plus ``'UNK'``; it is sorted
    so that the index assigned to each entry is the same on every run, and
    it is written one entry per line to ``model/vocab.txt`` in that order.

    :param train: object exposing ``q1``, ``q2`` and ``label`` attributes;
        ``q1``/``q2`` are iterated and ``label`` must provide ``tolist()``
        (e.g. a pandas DataFrame).
    :return: ``(datas, word_dict)`` where ``datas`` is
        ``[[left_samples, right_samples], labels]`` and ``word_dict`` maps
        each vocabulary entry to its integer index (its line number in the
        vocabulary file, starting at 0).
    """
    # Split every question into a list of its elements.
    sample_x_left = [[char for char in q if char] for q in train.q1]
    sample_x_right = [[char for char in q if char] for q in train.q2]

    # Collect the vocabulary; 'UNK' is always present.
    vocabs = {'UNK'}
    for x_left, x_right in zip(sample_x_left, sample_x_right):
        vocabs.update(x_left)
        vocabs.update(x_right)

    sample_x = [sample_x_left, sample_x_right]
    sample_y = train.label.tolist()
    print(len(sample_x_left), len(sample_x_right))
    datas = [sample_x, sample_y]

    # Set iteration order differs between interpreter runs; sorting keeps
    # the index mapping and the vocabulary file reproducible and in sync.
    vocab_list = sorted(vocabs)
    word_dict = {wd: index for index, wd in enumerate(vocab_list)}

    vocab_path = 'model/vocab.txt'
    # Make sure the target directory exists before writing.
    os.makedirs(os.path.dirname(vocab_path), exist_ok=True)
    with open(vocab_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(vocab_list))
    return datas, word_dict
def convert_data(datas, word_dict, MAX_LENGTH):
    """Turn tokenised samples into padded index arrays and a label column.

    :param datas: ``[[left_samples, right_samples], labels]``; each sample
        is a sequence of vocabulary entries.
    :param word_dict: mapping from vocabulary entry to integer index. When
        it contains ``'UNK'``, entries missing from the mapping are encoded
        with that index; otherwise a missing entry raises ``KeyError``.
    :param MAX_LENGTH: value passed to ``pad_sequences`` as ``maxlen``.
    :return: ``(left_x_train, right_x_train, y_train)``; the first two are
        the outputs of ``pad_sequences`` with ``padding='pre'``, and
        ``y_train`` is the integer labels with a trailing axis of size 1.
    :raises KeyError: if an entry is not in ``word_dict`` and there is no
        ``'UNK'`` fallback.
    """
    sample_x = datas[0]
    sample_y = datas[1]
    sample_x_left = sample_x[0]
    sample_x_right = sample_x[1]

    unk_index = word_dict.get('UNK')

    def to_index(char):
        # Fall back to the 'UNK' index for unseen entries when available.
        try:
            return word_dict[char]
        except KeyError:
            if unk_index is None:
                raise
            return unk_index

    left_x_train = [[to_index(char) for char in data] for data in sample_x_left]
    right_x_train = [[to_index(char) for char in data] for data in sample_x_right]
    y_train = [int(i) for i in sample_y]

    left_x_train = pad_sequences(left_x_train, maxlen=MAX_LENGTH, padding='pre')
    right_x_train = pad_sequences(right_x_train, maxlen=MAX_LENGTH, padding='pre')
    # The labels are 1-D, so the only valid new trailing axis is -1 (shape
    # (n,) -> (n, 1)); axis=2 is out of range and raises on current NumPy.
    y_train = np.expand_dims(y_train, axis=-1)
    return left_x_train, right_x_train, y_train
def?train_w2v(datas):
????“““
????訓練詞向量
????:return:
????“““
????sents?=?datas[0][0]?+?datas[0][1]
????#print(sents)
????model?=?Word2Vec(sentences=sents?size=300?min_
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件       6148  2020-02-26 18:01  sentence-similarity-project\.DS_Store
     文件        141  2018-12-05 17:08  sentence-similarity-project\.gitignore
     文件        128  2020-03-28 21:48  sentence-similarity-project\.idea\libraries\R_User_Library.xml
     文件        315  2020-03-28 21:46  sentence-similarity-project\.idea\misc.xml
     文件        313  2020-03-28 21:46  sentence-similarity-project\.idea\modules.xml
     文件        611  2020-03-28 21:48  sentence-similarity-project\.idea\sentence-similarity-project.iml
     文件      16934  2020-03-29 15:07  sentence-similarity-project\.idea\workspace.xml
     文件       4702  2020-03-28 23:25  sentence-similarity-project\build_input.py
     文件       1753  2020-03-05 21:25  sentence-similarity-project\data_loader.py
     文件       1780  2020-03-05 21:40  sentence-similarity-project\evalute.py
     文件    3485318  2020-03-05 21:23  sentence-similarity-project\input\atec\atec_nlp_sim_train.csv
     文件    5625804  2018-12-05 17:08  sentence-similarity-project\input\atec\atec_nlp_sim_train_add.csv
     文件        946  2018-12-05 17:08  sentence-similarity-project\input\atec\readme.txt
     文件        609  2018-12-05 17:08  sentence-similarity-project\input\ccks\Readme
     文件     760958  2018-12-05 17:08  sentence-similarity-project\input\ccks\task3_dev.txt
     文件    7355965  2018-12-05 17:08  sentence-similarity-project\input\ccks\task3_train.txt
     文件    8555401  2018-12-05 17:08  sentence-similarity-project\input\ccks\test_with_id.txt
     文件      23854  2020-03-29 12:06  sentence-similarity-project\model\model.png
     文件      29593  2020-03-29 12:27  sentence-similarity-project\model\result_atec.png
     文件      25260  2018-12-05 17:08  sentence-similarity-project\model\result_ccks.png
     文件    8809848  2020-03-29 12:27  sentence-similarity-project\model\tokenvec_bilstm2_siamese_model.h5
     文件    7847540  2020-03-29 12:06  sentence-similarity-project\model\token_vec_300.bin
     文件      10735  2020-03-29 12:06  sentence-similarity-project\model\vocab.txt
     文件       4329  2020-03-29 12:06  sentence-similarity-project\train_siamese_network.py
     文件       5003  2020-03-28 23:25  sentence-similarity-project\__pycache__\build_input.cpython-36.pyc
     文件       1481  2020-03-28 21:48  sentence-similarity-project\__pycache__\data_loader.cpython-36.pyc
     文件     175767  2020-02-26 18:00  sentence-similarity-project\文本相似度建模.pdf
     目錄          0  2020-03-28 21:48  sentence-similarity-project\.idea\inspectionProfiles
     目錄          0  2020-03-28 21:48  sentence-similarity-project\.idea\libraries
     目錄          0  2020-03-05 21:23  sentence-similarity-project\input\atec
............此處省略9個文件信息
評論
共有 條評論