資源簡介
基于雙向LSTM/keras/tensorflow的中文分詞,語料為人民日報,分詞準確率高達97%
代碼片段和文件信息
# import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = ""
import json
import os

import numpy as np
from keras import backend as K
from keras.models import model_from_json

from DataProcessing.embedding import Data
# Allowed BMES tag bigrams (b=begin, m=middle, e=end, s=single) with
# pseudo-probability weights.  Any bigram absent from this table is an
# illegal tag transition and is rejected by the Viterbi decoder.
transpose_matrix = {'be': 0.5,
                    'bm': 0.5,
                    'eb': 0.5,
                    'es': 0.5,
                    'me': 0.5,
                    'mm': 0.5,
                    'sb': 0.5,
                    'ss': 0.3}
# Log-space transition table: the decoder sums these with the model's
# log emission scores, so both must live in log space.
zy = {bigram: np.log(prob) for bigram, prob in transpose_matrix.items()}
def viterbi(nodes, transitions=None):
    """Decode the most probable BMES tag sequence for one sentence.

    Args:
        nodes: list of dicts, one per character, mapping candidate tags
            ('s', 'b', 'm', 'e') to their log emission scores.
        transitions: optional dict mapping a two-character tag bigram
            (e.g. 'be') to its log transition score; bigrams missing from
            the dict are treated as forbidden.  Defaults to the
            module-level ``zy`` table.

    Returns:
        The highest-scoring tag string, one tag character per node.
    """
    if transitions is None:
        # BUGFIX: the original added raw probabilities from
        # `transpose_matrix` to log emission scores, mixing linear and
        # log space; `zy` (the log-space table) is the correct one.
        transitions = zy
    # A sentence can only start with 'b' (word begin) or 's' (single).
    path = {'b': nodes[0]['b'], 's': nodes[0]['s']}
    for layer_num in range(1, len(nodes)):
        old_path = path
        path = {}
        for new_tag in nodes[layer_num]:
            # The final character cannot open a word: forbid 'b' and 'm'.
            if layer_num == len(nodes) - 1 and new_tag in ('m', 'b'):
                continue
            candidates = {}
            for prefix in old_path:
                bigram = prefix[-1] + new_tag
                if bigram in transitions:
                    candidates[prefix + new_tag] = (old_path[prefix]
                                                    + nodes[layer_num][new_tag]
                                                    + transitions[bigram])
            # Guard: the original crashed in np.argmax when no legal
            # transition led to `new_tag`; just skip that tag instead.
            if candidates:
                best = max(candidates, key=candidates.get)
                path[best] = candidates[best]
    return max(path, key=path.get)
def cut(asentence, tagstr):
    """Insert spaces into *asentence* according to its BMES tag string.

    A space is appended after every character tagged 's' (single-character
    word) or 'e' (word end), so the returned string is the sentence with
    words separated by spaces.
    """
    pieces = []
    for character, tag in zip(asentence, tagstr):
        pieces.append(character)
        if str(tag) in ('s', 'e'):
            pieces.append(' ')
    return ''.join(pieces)
if __name__ == '__main__':
    # Segment `test_sentence` with the trained BiLSTM model + Viterbi decoding.
    filedir = "D:\\codes\\python\\keras_splitwords\\datas"
    model_name = "model.json"
    weights_name = "model_weights.h5"
    test_sentence = ""  # sentence(s) to segment; fill in before running

    # Load the character vocabulary and embed the input sentence.
    data = Data(filedir)
    charsets = data.load_charsets_from_file("charsets.pkl")

    # Rebuild the trained model from its JSON architecture and HDF5 weights.
    # `with` closes the file handle the original code leaked.
    with open(os.path.join(filedir, model_name), "r") as model_file:
        model = model_from_json(json.load(model_file))
    model.load_weights(os.path.join(filedir, weights_name))

    sentence_embeddings, all_sentences, sentence_len = data.get_sent_embeddings(
        charsets, test_sentence)
    result = model.predict(sentence_embeddings, verbose=False)
    print(test_sentence)
    for aresult, asentence, length in zip(result, all_sentences, sentence_len):
        # Drop padded positions; keep scores for the real characters only.
        aresult = aresult[:length]
        # The model emits per-character probabilities in the order
        # s, b, m, e (presumably fixed by training — confirm against train.py);
        # take logs so the Viterbi decoder can sum scores.
        one_sentence_nodes = [
            {tag: score
             for tag, score in zip(['s', 'b', 'm', 'e'], np.log(probs[:4]))}
            for probs in aresult
        ]
        print(cut(asentence, viterbi(one_sentence_nodes)))
    K.clear_session()
?屬性????????????大小?????日期????時間???名稱
-----------?---------??----------?-----??----
?????目錄???????????0??2018-05-16?04:53??Chinese-Word-Split-master\
?????文件??????????12??2018-05-16?04:53??Chinese-Word-Split-master\.gitignore
?????目錄???????????0??2018-05-16?04:53??Chinese-Word-Split-master\DataProcessing\
?????文件???????????0??2018-05-16?04:53??Chinese-Word-Split-master\DataProcessing\__init__.py
?????文件????????3143??2018-05-16?04:53??Chinese-Word-Split-master\DataProcessing\em
?????目錄???????????0??2018-05-16?04:53??Chinese-Word-Split-master\datas\
?????文件??????531361??2018-05-16?04:53??Chinese-Word-Split-master\datas\charsets.pkl
?????文件????????2880??2018-05-16?04:53??Chinese-Word-Split-master\datas\model.json
?????文件????????2879??2018-05-16?04:53??Chinese-Word-Split-master\datas\model_usebias.json
?????文件?????3061352??2018-05-16?04:53??Chinese-Word-Split-master\datas\model_weights.h5
?????文件?????3061352??2018-05-16?04:53??Chinese-Word-Split-master\datas\model_weights_usebias.h5
?????文件????24389693??2018-05-16?04:53??Chinese-Word-Split-master\datas\msr_train.txt
?????文件????????2546??2018-05-16?04:53??Chinese-Word-Split-master\gen.py
?????文件????????1391??2018-05-16?04:53??Chinese-Word-Split-master\train.py
評論
共有 條評論