資源簡介
基於 python-crfsuite 的醫療命名實體抽取的 Python 實現
醫療電子病例命名實體識別評測任務的一個可執行demo,採用的方法是條件隨機場(CRF),實現CRF的第三方庫為[python-crfsuite]。目前該demo準確率為68%,召回率為62%,F1值為64.8%。

代碼片段和文件信息
#!/usr/bin/python
# -*- coding:utf-8 -*-
# **************************
# * Author      :  baiyyang
# * Email       :  baiyyang@163.com
# * Description :  
# * create time :  2018/1/10上午10:29
# * file name   :  crf_unit.py
import codecs
import string
import sys

import pycrfsuite
import zhon.hanzi as zh
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer

# NOTE(review): presumably a project-local module; it is not referenced in the
# visible part of this file -- confirm before removing.
import reader

# Python 2 only: make UTF-8 the codec used for implicit str <-> unicode
# conversion. Python 3 has neither a builtin reload() nor
# sys.setdefaultencoding(), so the hack is skipped there.
if sys.version_info[0] < 3:
    # reload() brings back sys.setdefaultencoding, which site.py removes
    # during interpreter start-up.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')
# Load the data
def readData(filename):
    """Read a tab-separated, UTF-8 encoded file into a list of rows.

    Every line is stripped of leading/trailing whitespace and split on tabs.
    Only lines that yield exactly three fields are kept; anything else (blank
    lines, rows with fewer or more columns) is silently skipped. The rest of
    this file unpacks each row as ``(word, tag, label)``.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of three-element lists of text strings, in file order.
    """
    data = []
    # The context manager closes the handle even if decoding fails midway.
    with codecs.open(filename, 'r', 'utf-8') as fr:
        for line in fr:
            fields = line.strip().split('\t')
            if len(fields) == 3:
                data.append(fields)
    return data
# Load the training and test splits (paths are relative to the working directory)
train = readData('train.txt')
test = readData('test.txt')
# Check whether a token is a punctuation mark
def ispunctuation(word):
    """Return whether *word* is found in the combined punctuation string.

    The lookup is a substring test against ``string.punctuation`` followed by
    ``zhon.hanzi.punctuation`` (imported as ``zh``), so any single character
    from either set matches.

    Args:
        word: The token to test.

    Returns:
        True if ``word`` is non-empty and occurs in the combined punctuation
        string, otherwise False.
    """
    if not word:
        # A substring search for '' always succeeds at index 0, which used to
        # classify the empty string as punctuation.
        return False
    punctuation = string.punctuation + zh.punctuation
    return word in punctuation
# Feature definition
def word2features(sent, i):
    """Return the list of feature strings for the token at position *i*.

    The features are built from the token itself and a window of up to two
    tokens on each side; neighbours that fall outside the sequence simply
    contribute no features.

    Args:
        sent: Sequence of rows; ``row[0]`` is read as the word and ``row[1]``
            as its tag. Both must be strings.
        i: Index of the current token in ``sent``.

    Returns:
        A list of strings: ``'bias'``, the current word and tag, then the
        word/tag of the previous one and two tokens (plus the concatenated
        previous-two words), then the same for the following tokens.
    """
    word = sent[i][0]
    postag = sent[i][1]
    features = [
        'bias',
        'word=' + word,
        'word_tag=' + postag,
    ]

    # Left context: one token back, then two tokens back.
    if i > 0:
        features.append('word[-1]=' + sent[i - 1][0])
        features.append('word[-1]_tag=' + sent[i - 1][1])
        if i > 1:
            features.append('word[-2]=' + sent[i - 2][0])
            features.append('word[-2, -1]=' + sent[i - 2][0] + sent[i - 1][0])
            features.append('word[-2]_tag=' + sent[i - 2][1])

    # Right context: one token ahead, then two tokens ahead. The bounds keep
    # sent[i + 1] / sent[i + 2] inside the sequence.
    if i < len(sent) - 1:
        features.append('word[1]=' + sent[i + 1][0])
        features.append('word[1]_tag=' + sent[i + 1][1])
        if i < len(sent) - 2:
            features.append('word[2]=' + sent[i + 2][0])
            features.append('word[1, 2]=' + sent[i + 1][0] + sent[i + 2][0])
            features.append('word[2]_tag=' + sent[i + 2][1])
    return features
def sent2feature(sent):
    """Return one feature list per token of *sent*, in order.

    Args:
        sent: Sequence of rows accepted by ``word2features``.

    Returns:
        A list with ``len(sent)`` entries; entry ``i`` is
        ``word2features(sent, i)``.
    """
    return [word2features(sent, i) for i in range(len(sent))]
def sent2label(sent):
    """Return the third field (the label) of every row in *sent*.

    Args:
        sent: Iterable of rows that each unpack into exactly three items
            ``(word, tag, label)``.

    Returns:
        A list of the labels, in order.
    """
    return [label for _word, _tag, label in sent]
def sent2word(sent):
    """Return the first field (the word) of every row in *sent*.

    Args:
        sent: Iterable of rows that each unpack into exactly three items
            ``(word, tag, label)``.

    Returns:
        A list of the words, in order.
    """
    return [word for word, _tag, _label in sent]
# Build the feature and label sequences for both splits.
# NOTE(review): readData returns one flat list of rows, so each split is
# handled as a single long sequence here -- confirm that this is intended.
X_train = sent2feature(train)
y_train = sent2label(train)
X_test = sent2feature(test)
y_test = sent2label(test)
# Train the model
model = pycrfsuite.Trainer(verbose=True)
model.append(X_train, y_train)
model.set_params({
    'c1': 1.0,  # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 100,  # stop earlier
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True,
    'feature.minfreq': 3,
})
# Writes the trained model to this path; the tagging step reopens it from there.
model.train('./medical.crfsuite')
# Predict
tagger = pycrfsuite.Tagger()
tagger.open('./medical.crfsuite')
# A single test document; parse it once and reuse the rows below.
sample = readData('test1.txt')
print(' '.join(sent2word(sample)))
predicted = tagger.tag(sent2feature(sample))
correct = sent2label(sample)
# Compare the predicted labels with the reference labels.
# The two spaces after the colon reproduce the output of the Python 2
# statement form `print 'Predicted: ', ...`, which inserts its own separator.
print('Predicted:  ' + ' '.join(predicted))
print('Correct:  ' + ' '.join(correct))
#?預測準確率
num?=?0
for?i?tag?in?enumerate(predicted):
????if?tag?==?correct[i]:
????????num?
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2018-05-03 01:54  medical_ner_crfsuite-master\
     文件        1239  2018-05-03 01:54  medical_ner_crfsuite-master\README.md
     文件        4103  2018-05-03 01:54  medical_ner_crfsuite-master\crf_unit.py
     目錄           0  2018-05-03 01:54  medical_ner_crfsuite-master\data\
     文件        2350  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-1.txt
     文件        2505  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-10.txt
     文件        4555  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-100.txt
     文件        2898  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-11.txt
     文件        5652  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-12.txt
     文件        2127  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-13.txt
     文件        2832  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-14.txt
     文件        2314  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-15.txt
     文件        2533  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-16.txt
     文件        3044  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-17.txt
     文件        2310  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-18.txt
     文件        5179  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-19.txt
     文件        2368  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-2.txt
     文件        2010  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-20.txt
     文件         233  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-21.txt
     文件        5934  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-22.txt
     文件        2808  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-23.txt
     文件        5115  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-24.txt
     文件        5503  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-25.txt
     文件        3506  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-26.txt
     文件        2210  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-27.txt
     文件        2457  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-28.txt
     文件        3772  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-29.txt
     文件        3023  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-3.txt
     文件        3063  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-30.txt
     文件        2256  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-31.txt
     文件        1810  2018-05-03 01:54  medical_ner_crfsuite-master\data\病史特點-32.txt
............此處省略79個文件信息
- 上一篇:爬取知網的社科基金項目文章信息
- 下一篇:Python Requests包
評論
共有 條評論