資源簡介
Java 版結巴分詞工具,一個很好的中文分詞工具。直接用 Eclipse 打開即可;輸入是一篇文章,輸出是每個詞的詞頻,並且按出現次數從高到低排序。只需要在 test 包裡修改部分代碼就可以使用了。
不下載你會後悔的。

代碼片段和文件信息
package com.huaban.analysis.jieba;
import java.util.regex.Pattern;
/**
 * Static helpers for classifying and normalizing single characters.
 */
public class CharacterUtil {
    /** Matches either a decimal number such as {@code 3.14} or a run of ASCII letters and digits. */
    public static Pattern reSkip = Pattern.compile("(\\d+\\.\\d+|[a-zA-Z0-9]+)");

    /** Punctuation characters recognized by {@link #isConnector(char)}. */
    private static final char[] connectors = new char[] { '+', '#', '&', '.', '_', '-' };

    /**
     * Tells whether {@code ch} lies in the range U+4E00..U+9FA5 (inclusive).
     *
     * @param ch character to test
     * @return {@code true} if {@code ch} is inside that range
     */
    public static boolean isChineseLetter(char ch) {
        return ch >= 0x4E00 && ch <= 0x9FA5;
    }

    /**
     * Tells whether {@code ch} is an ASCII letter, {@code A-Z} or {@code a-z}.
     *
     * @param ch character to test
     * @return {@code true} if {@code ch} is an ASCII letter
     */
    public static boolean isEnglishLetter(char ch) {
        return (ch >= 0x0041 && ch <= 0x005A) || (ch >= 0x0061 && ch <= 0x007A);
    }

    /**
     * Tells whether {@code ch} is an ASCII digit, {@code 0-9}.
     *
     * @param ch character to test
     * @return {@code true} if {@code ch} is an ASCII digit
     */
    public static boolean isDigit(char ch) {
        return ch >= 0x0030 && ch <= 0x0039;
    }

    /**
     * Tells whether {@code ch} is one of the connector characters
     * {@code + # & . _ -}.
     *
     * @param ch character to test
     * @return {@code true} if {@code ch} is a connector
     */
    public static boolean isConnector(char ch) {
        for (char connector : connectors) {
            if (ch == connector) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether {@code ch} satisfies any of {@link #isChineseLetter(char)},
     * {@link #isEnglishLetter(char)}, {@link #isDigit(char)} or
     * {@link #isConnector(char)}.
     *
     * @param ch character to test
     * @return {@code true} if at least one of the four checks accepts {@code ch}
     */
    public static boolean ccFind(char ch) {
        return isChineseLetter(ch) || isEnglishLetter(ch) || isDigit(ch) || isConnector(ch);
    }

    /**
     * Normalizes a character: full-width forms become half-width, and ASCII
     * upper-case letters become lower-case.
     *
     * <p>The three rules are mutually exclusive, so a full-width upper-case
     * letter is converted to its half-width upper-case form and is NOT
     * lower-cased by the same call.
     *
     * @param input
     *            input character
     * @return the converted character, or {@code input} unchanged if no rule applies
     */
    public static char regularize(char input) {
        if (input == 12288) {
            // U+3000 (ideographic space) -> ASCII space.
            return 32;
        }
        else if (input > 65280 && input < 65375) {
            // U+FF01..U+FF5E sit exactly 0xFEE0 (65248) above their ASCII counterparts.
            return (char) (input - 65248);
        }
        else if (input >= 'A' && input <= 'Z') {
            // ASCII upper-case -> lower-case.
            return (char) (input + 32);
        }
        return input;
    }
}
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2017-11-29 20:25  JBFC\
     文件         301  2017-11-29 20:25  JBFC\.classpath
     文件         380  2017-11-29 20:25  JBFC\.project
     目錄           0  2017-11-29 20:25  JBFC\.settings\
     文件         598  2017-11-29 20:25  JBFC\.settings\org.eclipse.jdt.core.prefs
     目錄           0  2017-11-29 20:25  JBFC\bin\
     目錄           0  2017-11-29 20:25  JBFC\bin\com\
     目錄           0  2017-11-29 20:25  JBFC\bin\com\huaban\
     目錄           0  2017-11-29 20:25  JBFC\bin\com\huaban\analysis\
     目錄           0  2017-11-29 20:25  JBFC\bin\com\huaban\analysis\jieba\
     文件        1546  2017-11-29 20:25  JBFC\bin\com\huaban\analysis\jieba\CharacterUtil.class
     文件        5503  2017-11-29 20:25  JBFC\bin\com\huaban\analysis\jieba\DictSegment.class
     文件        1703  2017-11-29 20:25  JBFC\bin\com\huaban\analysis\jieba\Hit.class
     文件        1226  2017-11-29 20:25  JBFC\bin\com\huaban\analysis\jieba\Jieba
     文件        8447  2017-11-29 20:25  JBFC\bin\com\huaban\analysis\jieba\Jieba
     文件         467  2017-11-29 20:25  JBFC\bin\com\huaban\analysis\jieba\Node.class
     文件        1104  2017-11-29 20:25  JBFC\bin\com\huaban\analysis\jieba\Pair.class
     文件         842  2017-11-29 20:25  JBFC\bin\com\huaban\analysis\jieba\SegToken.class
     文件        7621  2017-11-29 20:25  JBFC\bin\com\huaban\analysis\jieba\WordDictionary.class
     目錄           0  2017-11-29 20:25  JBFC\bin\com\huaban\analysis\jieba\viterbi\
     文件        9018  2017-11-29 20:25  JBFC\bin\com\huaban\analysis\jieba\viterbi\FinalSeg.class
     目錄           0  2017-11-29 20:25  JBFC\bin\com\huaban\ananlysis\
     目錄           0  2017-11-29 20:25  JBFC\bin\com\huaban\ananlysis\utils\
     文件        1602  2017-11-29 20:25  JBFC\bin\com\huaban\ananlysis\utils\ReadTxt.class
     文件     5071839  2017-11-29 20:25  JBFC\bin\dict.txt
     文件       35530  2017-11-29 20:25  JBFC\bin\jieba.java.code.st
     文件      665440  2017-11-29 20:25  JBFC\bin\prob_emit.txt
     目錄           0  2017-11-29 20:25  JBFC\bin\test\
     文件        1244  2017-11-29 20:25  JBFC\bin\test\Test$1.class
     文件        3680  2017-11-29 20:25  JBFC\bin\test\Test.class
     目錄           0  2017-11-29 20:25  JBFC\src\
............此處省略22個文件信息
評論
共有 條評論