資源簡介
中文分詞工具的效果評測
代碼片段和文件信息
# -*- coding: utf-8 -*-
"""
Evaluate Chinese word-segmentation tools against a labelled corpus.

Created on Tue Oct 29 17:57:11 2019
@author: jyt
Email:904771477@qq.com
"""
import datetime
import os

import pandas as pd

import models.public_modles as models
import models.tool_api as tool_api

# Single entry point for the segmenters exercised below (jieba_api, nlpir_api, ...).
api = tool_api.tool_apis()
# NOTE(review): NLS_LANG is normally an Oracle client setting; nothing in the
# visible part of this script reads it -- confirm whether it is still needed.
os.environ["NLS_LANG"] = '.AL32UTF8'
# Accumulates the score rows of every evaluated tool.
endresult = pd.DataFrame([], columns=['算法', '字的位置', 'R', 'P', 'F1'])

# Disabled alternative loader (tab-separated text corpus), kept for reference:
#line=[]
#file=open(r'E:/python-work/中文分詞數據集/好數據/2015/trainSeg.txt', 'r', encoding='utf-8')
#for i in file.readlines():
#    line.append(i.split('\n'))
#datajj=pd.DataFrame(line, columns=['character', 'train']).fillna('')
#data = datajj['character'].str.split('\t', 1, True).fillna('')

# Labelled corpus: one row per character, two columns (character, label).
datajj = pd.read_excel(r'E:/python-work/中文分詞數據集/人工標注數據/可以用的數據/xxl.xlsx')
data = datajj.copy().fillna('')
data.columns = ['character', 'train']

# Group the per-character rows into sentences.  A row whose 'character' cell
# is empty closes the sentence collected so far; each sentence is stored as
# its plain text plus a (character, train) frame holding its labelled rows.
# NOTE: characters after the last empty row are never flushed into each_text.
sentence_frames = []
str1 = ''
train = []
for character, label in zip(data['character'], data['train']):
    if character != '':
        str1 = str1 + str(character)
        train.append([character, label])
    else:
        train = pd.DataFrame(train, columns=['character', 'train'])
        dat1 = pd.DataFrame([[str1, train]], columns=['text', 'train'])
        sentence_frames.append(dat1)
        str1 = ''
        train = []

# Concatenate once instead of calling DataFrame.append per sentence: append
# was removed in pandas 2.0 and copied the whole frame on every call.
if sentence_frames:
    each_text = pd.concat(sentence_frames)
else:
    each_text = pd.DataFrame([], columns=['text', 'train'])
each_text = each_text.reset_index(drop=True)
#%%
# jieba: segment every sentence, convert the output to per-character rows,
# score it against the labels in `data`, then time a second, segmentation-only
# pass over the same sentences.
res_jieba = pd.DataFrame([], columns=['character', 'test'])
for sentence, gold in zip(each_text['text'], each_text['train']):
    result = api.jieba_api(sentence, 'cutword')
    res_jieba = models.turn_resulttype(result, gold, sentence, res_jieba)
    # Add an empty row after each sentence, mirroring the empty separator rows
    # of `data` (presumably so both frames stay row-aligned -- confirm against
    # models.turn_resulttype).  pd.concat replaces DataFrame.append, which was
    # removed in pandas 2.0.
    res_jieba = pd.concat(
        [res_jieba, pd.DataFrame([['', '']], columns=['character', 'test'])]
    )
res_jieba = res_jieba.reset_index(drop=True)
data['test_jieba'] = res_jieba['test'].copy()
# NOTE(review): DataFrame.append no longer exists in pandas >= 2.0.  It is
# kept here because the return type of models.all_score is not visible;
# switch to pd.concat([endresult, ...]) once it is confirmed to be a DataFrame.
endresult = endresult.append(models.all_score(data, 'test_jieba', 'jieba'))

# Timing pass: segmentation only, results discarded.
time1 = datetime.datetime.now()
for sentence in each_text['text']:
    result = api.jieba_api(sentence, 'cutword')
time2 = datetime.datetime.now()
jieba_cost = time2 - time1
print(jieba_cost)
#%%nlpir
# nlpir: segment every sentence, convert the output to per-character rows,
# score it against the labels in `data`, then time a second, segmentation-only
# pass over the same sentences.
res_nlpir = pd.DataFrame([], columns=['character', 'test'])
for sentence, gold in zip(each_text['text'], each_text['train']):
    result = api.nlpir_api(sentence, 'cutword')
    res_nlpir = models.turn_resulttype(result, gold, sentence, res_nlpir)
    # Add an empty row after each sentence, mirroring the empty separator rows
    # of `data` (presumably so both frames stay row-aligned -- confirm against
    # models.turn_resulttype).  pd.concat replaces DataFrame.append, which was
    # removed in pandas 2.0.
    res_nlpir = pd.concat(
        [res_nlpir, pd.DataFrame([['', '']], columns=['character', 'test'])]
    )
res_nlpir = res_nlpir.reset_index(drop=True)
data['test_nlpir'] = res_nlpir['test'].copy()
# NOTE(review): DataFrame.append no longer exists in pandas >= 2.0.  It is
# kept here because the return type of models.all_score is not visible;
# switch to pd.concat([endresult, ...]) once it is confirmed to be a DataFrame.
endresult = endresult.append(models.all_score(data, 'test_nlpir', 'nlpir'))

# Timing pass: segmentation only, results discarded.
time1 = datetime.datetime.now()
for sentence in each_text['text']:
    result = api.nlpir_api(sentence, 'cutword')
time2 = datetime.datetime.now()
nlpir_cost = time2 - time1
print(nlpir_cost)
#%%ltp
res_ltp?=?pd.Dataframe([]?columns=[‘character‘‘test‘])
for?x?in??range(0len(each_text)
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件       6995  2019-10-31 20:39  中文分詞工具評測\index.py
     文件      10240  2019-10-13 09:59  中文分詞工具評測\jieba_nlpir_ltp_在trainseg上的表現.spydata
     文件       4514  2019-10-31 20:33  中文分詞工具評測\models\public_modles.py
     文件       5902  2019-10-29 21:27  中文分詞工具評測\models\tool_api.py
     文件       3394  2019-10-31 20:33  中文分詞工具評測\models\__pycache__\public_modles.cpython-36.pyc
     文件       3773  2019-10-29 21:29  中文分詞工具評測\models\__pycache__\tool_api.cpython-36.pyc
     文件       5217  2019-10-29 18:34  中文分詞工具評測\Tools_of_cutword_Score.py
     文件    3552350  2019-10-29 21:31  中文分詞工具評測\人工標注結果.xlsx
     目錄          0  2019-10-31 20:33  中文分詞工具評測\models\__pycache__
     目錄          0  2019-10-29 18:08  中文分詞工具評測\models
     目錄          0  2020-07-18 10:15  中文分詞工具評測
----------- ---------  ---------- -----  ----
              3592385                    11
評論
共有 條評論