資源簡介
基於字的中文分詞系統,用感知機實現。完全訓練後,在微軟的測試集上精度可以達到96%以上。我上傳的版本是完整的代碼(訓練和分詞),代碼只有一個文件,大家自己用附帶的微軟訓練數據訓練就可以了。
代碼總的來說寫得還是很清楚的,方便自己也方便別人閱讀。歡迎大家共同討論,xiatian@ict.ac.cn。

代碼片段和文件信息
#?-*-?coding:?cp936?-*-
import?os
import?time
import?random
import?cPickle
# Module metadata (restored from a paste in which spaces and quotes were garbled).
__author__ = "summer rain"
__email__ = "xiatian@ict.ac.cn"
class?CPTTrain:
????def?__init__(self?segment?train):
????????self.__char_type?=?{}
????????data_path?=?“PTData“
????????for?ind?name?in?enumerate([“punc“?“alph“?“date“?“num“]):
????????????fn?=?data_path?+?“/“?+?name
????????????if?os.path.isfile(fn):
????????????????for?line?in?file(fn?“rU“):
????????????????????self.__char_type[line.strip().decode(“cp936“)]?=?ind
????????????else:
????????????????print?“can‘t?open“?fn
????????????????exit()
????????self.__train_insts?=?None???????????#?all?instances?for?training.
????????self.__feats_weight?=?None??????????#?[“b“?“m“?“e“?“s“][all?the?features]?-->?weight.
????????self.__words_num?=?None?????????????#?total?words?num?in?all?the?instances.
????????self.__insts_num?=?None?????????????#?namley?the?sentences‘?num.
????????self.__cur_ite_ID?=?None????????????#?current?iteration?index.
????????self.__cur_inst_ID?=?None???????????#?current?index_th?instance.
????????self.__real_inst_ID?=?None??????????#?the?accurate?index?in?training?instances?after?randimizing.
????????self.__last_update?=?None???????????#?[“b“..“s“][feature]?-->?[last_update_ite_ID?last_update_inst_ID]
????????self.__feats_weight_sum?=?None??????#?sum?of?[“b“..“s“][feature]?from?begin?to?end.
????????if?segment?and?train?or?not?segment?and?not?train:
????????????print?“there?is?only?a?True?and?False?in?segment?and?train“
????????????exit()
????????elif?train:
????????????self.Train?=?self.__Train
????????else:
????????????self.__LoadModel()
????????????self.Segment?=?self.__Segment
????def?__LoadModel(self):
????????model?=?“PTData/avgmodel“
????????print?“load“?model?“...“
????????self.__feats_weight?=?{}
????????if?os.path.isfile(model):
????????????start?=?time.clock()
????????????self.__feats_weight?=?cPickle.load(file(model?“rb“))
????????????end?=?time.clock()
????????????print?“It?takes?%d?seconds“?%(end?-?start)
????????else:
????????????print?“can‘t?open“?model
????def?__Train(self?corp_file_name?max_train_num?max_ite_num):
????????if?not?self.__LoadCorp(corp_file_name?max_train_num):
????????????return?False
????????starttime?=?time.clock()
????????????????
????????self.__feats_weight?=?{}
????????self.__last_update?=?{}
????????self.__feats_weight_sum?=?{}
????????
????????for?self.__cur_ite_ID?in?xrange(max_ite_num):
????????????if?self.__Iterate():
????????????????break
????????self.__SaveModel()
????????endtime?=?time.clock()????????
????????print?“total?iteration?times?is?%d?seconds“?%(endtime?-?starttime)
????????return?True
????def?__GenerateFeats(self?inst):
????????inst_feat?=?[]
????????for?ind?[c?tag?t]?in?enumerate(inst):
????????????inst_feat.append([])
????????????if?t?==?-1:
????????????????continue
????????????#?Cn
????????????for?n?in?xrange(-2?3):
??????????
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件      11158  2008-05-26 19:11  PTTrain.py
     文件        260  2007-12-04 13:00  PTData\alph
     文件     976127  2008-05-26 10:58  PTData\avgmodel
     文件         17  2007-12-04 13:00  PTData\date
     文件        110  2007-12-04 13:00  PTData\num
     文件        270  2007-12-04 13:00  PTData\punc
     目錄          0  2008-05-23 14:28  PTData
     文件   24476617  2007-12-17 10:53  msr_train.txt
----------- ---------  ---------- -----  ----
             25464559                    8
評論
共有 條評論