資源簡介
整合使用 ICTCLAS2013(NLPIR)x64 的中文分詞,分詞的速度和效果都不錯。然後就是對文本進行的去停用詞以及特殊符號(哈工大停用詞庫)的去除,這些特殊符號還有標點符號,以及文本中夾雜的數字和字母會對分詞的效果產生影響。eclipse GBK 編碼

代碼片段和文件信息
import java.io.UnsupportedEncodingException;

import kevin.zhang.NLPIR;
public?class?TestNLPIR?{
????
????public?static?void?main(String[]?args)?throws?Exception?{
????????try?{
????????????String?sInput?=?“張華平推出的NLPIR分詞系統(tǒng),又名ICTCLAS2013,新增新詞識(shí)別、關(guān)鍵詞提取、微博分詞功能。“;
????????????
????????????//?自適應(yīng)分詞
????????????test(sInput);
????????????
????????}?catch?(Exception?ex)?{
????????}
????????
????}
????
????public?static?void?test(String?sInput)?{
????????try?{
????????????NLPIR?testNLPIR?=?new?NLPIR();
????????????
????????????String?argu?=?““;
????????????System.out.println(“NLPIR_Init“);
????????????if?(testNLPIR.NLPIR_Init(argu.getBytes(“GBK“)?0)?==?false)?{
????????????????System.out.println(“Init?Fail!“);
????????????????return;
????????????}
????????????
????????????//?導(dǎo)入用戶詞典前
????????????byte?nativeBytes[]?=?testNLPIR.NLPIR_ParagraphProcess(sInput.getBytes(“GBK“)?1);
????????????String?nativeStr?=?new?String(nativeBytes?0?nativeBytes.length?“GBK“);
????????????
????????????System.out.println(“分詞結(jié)果為:?“?+?nativeStr);
????????????
????????????//?初始化分詞組件
????????????String?argu1?=?“test/test.TXT“;
????????????String?argu2?=?“test/test_result1.TXT“;
????????????
????????????nativeBytes?=?testNLPIR.NLPIR_GetFileNewWords(argu1.getBytes(“GBK“)?50?true);
????????????//?如果是處理內(nèi)存,可以調(diào)用testNLPIR.NLPIR_GetNewWords
????????????nativeStr?=?new?String(nativeBytes?0?nativeBytes.length?“GBK“);
????????????System.out.println(“新詞識(shí)別結(jié)果為:?“?+?nativeStr);
????????????
????????????nativeBytes?=?testNLPIR.NLPIR_GetFileKeyWords(argu1.getBytes(“GBK“)?50?true);
????????????//?如果是處理內(nèi)存,可以調(diào)用testNLPIR.NLPIR_GetKeyWords
????????????nativeStr?=?new?String(nativeBytes?0?nativeBytes.length?“GBK“);
????????????System.out.println(“關(guān)鍵詞識(shí)別結(jié)果為:?“?+?nativeStr);
????????????
????????????testNLPIR.NLPIR_FileProcess(argu1.getBytes(“GBK“)?argu2.getBytes(“GBK“)?0);
????????????
????????????testNLPIR.NLPIR_NWI_Start();
????????????testNLPIR.NLPIR_NWI_AddFile(argu1.getBytes(“GBK“));
????????????
????????????testNLPIR.NLPIR_NWI_Complete();
????????????
????????????nativeBytes?=?testNLPIR.NLPIR_NWI_GetResult(true);
????????????nativeStr?=?new?String(nativeBytes?0?nativeBytes.length?“GBK“);
????????????
????????????System.out.println(“新詞識(shí)別結(jié)果?“?+?nativeStr);
????????????
????????????testNLPIR.NLPIR_NWI_Result2UserDict();//?新詞識(shí)別結(jié)果
????????????argu2?=?“test/test_result2.TXT“;
????????????testNLPIR.NLPIR_FileProcess(argu1.getBytes(“GBK“)?argu2.getBytes(“GBK“)?1);
????????????
????????????testNLPIR.NLPIR_Exit();
????????}?catch?(Exception?ex)?{
????????}
????}
}
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件        301  2013-12-23 22:46  NlPIR\.classpath
     文件        381  2013-12-23 22:46  NlPIR\.project
     文件        107  2013-12-23 23:05  NlPIR\.settings\org.eclipse.core.resources.prefs
     文件        598  2013-12-23 22:46  NlPIR\.settings\org.eclipse.jdt.core.prefs
     文件        388  2013-12-23 23:11  NlPIR\20131223.log
     文件       1164  2013-12-23 23:05  NlPIR\bin\kevin\zhang\NLPIR.class
     文件    2292736  2013-04-15 14:22  NlPIR\bin\NLPIR_JNI.dll
     文件       3415  2013-12-25 15:39  NlPIR\bin\SimilarityCompution\NewFileExcludeStopWord.class
     文件       2399  2013-12-25 10:42  NlPIR\bin\TestNLPIR.class
     文件     286196  2012-05-18 21:36  NlPIR\Data\BIG2GBK.map
     文件     468456  2012-05-18 21:33  NlPIR\Data\BIG5.pdat
     文件     158695  2012-05-18 21:33  NlPIR\Data\BIG5.wordlist
     文件    3520144  2009-01-16 13:48  NlPIR\Data\BiWord.big
     文件      65540  2012-11-08 20:45  NlPIR\Data\charset.type
     文件        856  2012-11-14 20:17  NlPIR\Data\Configure.xm
     文件    1696620  2009-01-16 13:48  NlPIR\Data\CoreDict.pdat
     文件    1786424  2009-01-16 13:48  NlPIR\Data\CoreDict.pos
     文件     478168  2009-01-16 13:48  NlPIR\Data\CoreDict.unig
     文件     262236  2009-01-16 13:48  NlPIR\Data\FieldDict.pdat
     文件         72  2009-01-16 13:48  NlPIR\Data\FieldDict.pos
     文件     549204  2012-05-18 21:34  NlPIR\Data\GBK.pdat
     文件     166985  2012-05-18 21:34  NlPIR\Data\GBK.wordlist
     文件     286196  2012-05-18 21:36  NlPIR\Data\GBK2BIG.map
     文件     286196  2012-05-18 21:37  NlPIR\Data\GBK2GBKC.map
     文件     286196  2012-05-18 21:37  NlPIR\Data\GBK2UTF.map
     文件     550848  2012-12-22 11:50  NlPIR\Data\GBKA.pdat
     文件     166985  2012-12-22 11:50  NlPIR\Data\GBKA.wordlist
     文件     286196  2012-12-22 11:50  NlPIR\Data\GBKA2UTF.map
     文件     550848  2012-05-18 21:34  NlPIR\Data\GBKC.pdat
     文件     166985  2012-05-18 21:34  NlPIR\Data\GBKC.wordlist
............此處省略54個文件信息
評論
共有 條評論