資源簡介
基于MP最大概率的Ngram漢語切分(北郵計算機語言學基礎)
有簡潔的說明文檔和python源代碼

代碼片段和文件信息
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
# Convert segmentation points into a sequence of character offsets.
def line2order(line, truth=1):
    """Return the cumulative end offset of every word in a segmented line.

    The line is split on whitespace into tokens.  For each token only the
    text before the first ``/`` is counted as the word (anything after the
    slash, e.g. a tag, is ignored).  The running total of word lengths is
    recorded after each word.

    Args:
        line: One line of whitespace-separated tokens, optionally of the
            form ``word/tag``.
        truth: Number of leading tokens to skip before counting.
            (presumably a line identifier in the reference corpus --
            TODO confirm against the data files)

    Returns:
        A list of ints, one per counted token, each being the total number
        of word characters seen up to and including that token.  Empty when
        the line has no tokens left after skipping.
    """
    orderlist = []
    order = 0
    for token in line.split()[truth:]:
        # Everything up to the first '/' is the word itself.
        word = token.partition('/')[0]
        order += len(word)
        orderlist.append(order)
    return orderlist
# Convert a sequence of character offsets into per-word (start, end) keys.
def cutlist(orderlist):
    """Turn consecutive offsets into string keys identifying word spans.

    Each pair of neighbouring offsets ``(orderlist[i], orderlist[i + 1])``
    becomes one key, built by concatenating the two numbers' decimal
    representations.  The span ending at ``orderlist[0]`` is not emitted,
    so a list of ``n`` offsets yields ``n - 1`` keys.

    Args:
        orderlist: List of cumulative character offsets, as produced by
            ``line2order``.

    Returns:
        A list of strings, one per adjacent offset pair.  An empty or
        single-element ``orderlist`` yields an empty list (an empty list
        previously raised ``IndexError``, e.g. for a blank input line).
    """
    # NOTE(review): plain concatenation is ambiguous in theory
    # (1, 213) and (12, 13) both give "1213"; kept as-is so the key
    # format stays unchanged for existing callers.
    return [
        str(first) + str(second)
        for first, second in zip(orderlist, orderlist[1:])
    ]
# Count the correctly segmented words of a single sentence.
def cal_acc(truecut, mycut):
    """Return how many entries of ``truecut`` also appear in ``mycut``.

    Only the number of matches is returned; the totals needed for
    precision and recall are the lengths of the two lists and are left to
    the caller.

    Args:
        truecut: Word-span keys of the reference segmentation.
        mycut: Word-span keys of the segmentation being evaluated.

    Returns:
        The number of items in ``truecut`` that are present in ``mycut``
        (each occurrence in ``truecut`` is counted).
    """
    # Build the lookup once so each membership test is O(1).
    predicted = set(mycut)
    return sum(1 for span in truecut if span in predicted)
# Score the segmenter output (ans.txt) against the reference
# (final_ans.txt) line by line, print precision and recall, and write the
# F1 score to accuracy.txt.  The context manager closes all three files
# even if scoring raises part-way through.
with open('final_ans.txt', 'r', encoding='utf-8') as fin_true, \
        open('ans.txt', 'r', encoding='utf-8') as my_ans, \
        open('accuracy.txt', 'w', encoding='utf-8') as f:
    correctnum = 0  # word spans found in both segmentations
    allnump = 0     # word spans predicted (precision denominator)
    allnumr = 0     # word spans in the reference (recall denominator)
    for line in my_ans:
        # The reference file is consumed in lockstep with the answer file.
        truecut = cutlist(line2order(fin_true.readline(), truth=0))
        mycut = cutlist(line2order(line, truth=0))
        correctnum += cal_acc(truecut, mycut)
        allnump += len(mycut)
        allnumr += len(truecut)
    # Guard the divisions: empty inputs or zero matches must not crash.
    p = correctnum / allnump if allnump else 0.0
    r = correctnum / allnumr if allnumr else 0.0
    print(p, r)
    f1 = 2 * p * r / (p + r) if (p + r) else 0.0
    f.write(str(f1))
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件      698616  2018-01-11 15:04  切分算法說明文檔.docx
     文件       28462  2017-11-30 12:41  result.txt
     文件        6541  2017-11-30 12:23  MP.py
     文件        1587  2017-11-29 19:01  accuracy.py
- 上一篇:動態規劃代碼
- 下一篇:HTMLTestRunnerNew.py
評論
共有 條評論