資源簡介
CCKS2017病例標注,CCKS2017 Task2
數據格式說明:
每個病例分為4個域,分別存儲在4個文件夾
一般項目
病史特征
診療過程
出院情況
每一個目錄下存儲兩類文件
代碼片段和文件信息
# coding:utf-8
import fio
import codecs
import sys
import os
import jieba.posseg as pseg

# Root directory of the training data set; per-section folders live under it.
datadir = "../data2/training dataset v4"
# Names of the per-section sub-folders (also used as file-name prefixes).
area = ["病史特點", "出院情況", "一般項目", "診療經過"]
class CRF_unit:
    """Build per-character feature rows for CRF training and test files.

    ``self.features`` holds the rows produced by the most recent call to
    ``test_into_aline`` (one character per row) or ``get_token`` (one
    ``[character, POS flag, tag]`` list per row).  ``get_type`` overwrites
    the tag column of those rows in place.
    """

    # Entity-type label as written in the annotation files -> tag suffix
    # appended to the "B-" / "I-" prefixes.
    _TYPE_TAGS = {
        u"癥狀和體征": "SIGNS",
        u"檢查和檢驗": "CHECK",
        u"疾病和診斷": "DISEASE",
        u"治療": "TREATMENT",
        u"身體部位": "BODY",
    }

    def __init__(self):
        self.features = []

    def test_into_aline(self, filename):
        """Load ``filename`` and store every character as one feature row.

        ``self.features`` is reset first, then filled with each item obtained
        by iterating over every sentence returned by ``fio.ReadFileUTF8``.
        """
        self.features = []
        sentences = fio.ReadFileUTF8(filename)
        for sentence in sentences:
            self.features.extend(sentence)

    def get_posTag(self, sentence):
        """Return the jieba part-of-speech segmentation of ``sentence``."""
        return pseg.cut(sentence)

    def get_token(self, filename):
        """Load ``filename`` and build one ``[char, flag, "N"]`` row per character.

        Each sentence is segmented with ``get_posTag``; every character of a
        segmented word gets that word's POS flag and the default tag ``"N"``.
        """
        self.features = []
        sentences = fio.ReadFileUTF8(filename)
        for sentence in sentences:
            for w in self.get_posTag(sentence):
                for token in w.word:
                    self.features.append([token, w.flag, "N"])

    def read_type(self, itype):
        """Map an entity-type label to its tag suffix.

        ``itype`` may be text or UTF-8 encoded bytes.  Returns one of
        ``"SIGNS"``, ``"CHECK"``, ``"DISEASE"``, ``"TREATMENT"``, ``"BODY"``,
        or ``None`` when the label is not recognised.
        """
        # Compare as text: comparing encoded bytes against string literals
        # never matches on Python 3.
        if isinstance(itype, bytes):
            itype = itype.decode('utf-8')
        return self._TYPE_TAGS.get(itype)

    def get_type(self, filename):
        """Apply the entity annotations in ``filename`` to ``self.features``.

        For every non-blank line, the last three whitespace-separated fields
        are read as start index, end index and type label.  Row ``start`` is
        tagged ``"B-<TYPE>"`` and rows ``start + 1 .. end`` (inclusive) are
        tagged ``"I-<TYPE>"``.

        Raises ``IndexError`` for a line with fewer than three fields or an
        index outside ``self.features``, ``ValueError`` for a non-integer
        index, and ``TypeError`` when the label is unknown (``read_type``
        returned ``None``).
        """
        sentences = fio.ReadFileUTF8(filename)
        for sentence in sentences:
            words = sentence.split()
            if not words:
                # Blank line: nothing to annotate.
                continue
            # Debug trace of the raw start/end fields (kept from the original).
            print(words[-3] + words[-2])
            x = int(words[-3])
            y = int(words[-2])
            itype = self.read_type(words[-1])
            self.features[x][2] = "B-" + itype
            for j in range(x + 1, y + 1):
                self.features[j][2] = "I-" + itype
if __name__ == '__main__':
    # Script entry point: write CRF test files for cases 241-300 of one section.
    extractor = CRF_unit()
    # Index into `area` selecting which section folder to process.
    x = 0

    # Disabled training-set step (was held in a bare string literal).
    # NOTE(review): range bounds reconstructed as (1, 241) from the
    # '1-240_train.txt' output name -- confirm before re-enabling.
    #
    # for i in range(1, 241):
    #     filename = datadir + '/' + area[x] + '/' + area[x] + '-' + str(i) + '.txtoriginal.txt'
    #     extractor.get_token(filename)
    #     filename = datadir + '/' + area[x] + '/' + area[x] + '-' + str(i) + '.txt'
    #     extractor.get_type(filename)
    #     filename = datadir + '/result/' + area[x] + "/" + '1-240_train.txt'
    #     fio.AddTrain(extractor.features, filename)

    for i in range(241, 301):
        # Source text of case i, one character per feature row.
        filename = datadir + '/' + area[x] + '/' + area[x] + '-' + str(i) + '.txtoriginal.txt'
        extractor.test_into_aline(filename)
        # Output file name is kept exactly as in the original script.
        filename = datadir + '/result/' + area[x] + '.testt-' + str(i) + '.txt'
        fio.AddTest(extractor.features, filename)
????
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2017-11-22 09:15  CCKS2017\
     目錄           0  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\
     目錄           0  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\
     文件          23  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\HEAD
     目錄           0  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\branches\
     文件         268  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\config
     文件          73  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\desc
     目錄           0  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\hooks\
     文件         478  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\hooks\applypatch-msg.sample
     文件         896  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\hooks\commit-msg.sample
     文件         189  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\hooks\post-update.sample
     文件         424  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\hooks\pre-applypatch.sample
     文件        1642  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\hooks\pre-commit.sample
     文件        1348  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\hooks\pre-push.sample
     文件        4898  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\hooks\pre-reba
     文件        1239  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\hooks\prepare-commit-msg.sample
     文件        3610  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\hooks\update.sample
     文件     1960281  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\index
     目錄           0  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\info\
     文件         240  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\info\exclude
     目錄           0  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\logs\
     文件         187  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\logs\HEAD
     目錄           0  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\logs\refs\
     目錄           0  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\logs\refs\heads\
     文件         187  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\logs\refs\heads\master
     目錄           0  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\logs\refs\remotes\
     目錄           0  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\logs\refs\remotes\origin\
     文件         187  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\logs\refs\remotes\origin\HEAD
     目錄           0  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\ob
     目錄           0  2017-08-09 10:14  CCKS2017\CCKS2017_dataset\.git\ob
     目錄           0  2017-08-09 10:18  CCKS2017\CCKS2017_dataset\.git\ob
............此處省略13886個文件信息
評論
共有 條評論