-
大小: 63KB | 文件類型: .zip | 金幣: 2 | 下載: 0 次 | 發布日期: 2024-02-05
- 語言: Java
- 標簽: mmseg4j, lucene5.X
資源簡介
中文分詞器 mmseg4j（Lucene 5.X）源碼與 jar 包
代碼片段和文件信息
package com.chenlb.mmseg4j;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
/**
 * All words are recorded under the node of their first character.
 *
 * @author chenlb 2009-2-20 23:30:14
 */
public?class?CharNode?{
private?int?freq?=?-1; //Degree?of?Morphemic?Freedom?of?One-Character?單字才需要
private?int?maxLen?=?0; //wordTail的最長
private?KeyTree?ktWordTails?=?new?KeyTree();
private?int?wordNum?=?0;
/** Creates a node whose fields keep their declared initial values. */
public CharNode() {
}
public?void?addWordTail(char[]?wordTail)?{
ktWordTails.add(wordTail);
wordNum++;
if(wordTail.length?>?maxLen)?{
maxLen?=?wordTail.length;
}
}
public?int?getFreq()?{
return?freq;
}
public?void?setFreq(int?freq)?{
this.freq?=?freq;
}
public?int?wordNum()?{
return?wordNum;
}
/**
?*?@param?sen?句子?一串文本.
?*?@param?offset?詞在句子中的位置
?*?@param?tailLen?詞尾的長度?實際是去掉詞的長度.
?*?@author?chenlb?2009-4-8?下午11:10:30
?*/
public?int?indexOf(char[]?sen?int?offset?int?tailLen)?{
//return?binarySearch(wordTails?sen?offset+1?tailLen?casc);
return?ktWordTails.match(sen?offset+1?tailLen)???1?:?-1;
}
/**
?*?@param?sen?句子?一串文本.
?*?@param?wordTailOffset?詞在句子中的位置?實際是?offset?后面的開始找.
?*?@return?返回詞尾長?沒有就是?0
?*?@author?chenlb?2009-4-10?下午10:45:51
?*/
public?int?maxMatch(char[]?sen?int?wordTailOffset)?{
return?ktWordTails.maxMatch(sen?wordTailOffset);
}
/**
?*?
?*?@return?至少返回一個包括?0的int
?*?@author?chenlb?2009-4-12?上午10:01:35
?*/
public?ArrayList?maxMatch(ArrayList?tailLens?char[]?sen?int?wordTailOffset)?{
return?ktWordTails.maxMatch(tailLens?sen?wordTailOffset);
}
public?int?getMaxLen()?{
return?maxLen;
}
public?void?setMaxLen(int?maxLen)?{
this.maxLen?=?maxLen;
}
public?static?class?KeyTree?{
TreeNode?head?=?new?TreeNode(‘?‘);
public?void?add(char[]?w)?{
if(w.length?1)?{
return;
}
TreeNode?p?=?head;
for(int?i=0;?i TreeNode?n?=?p.subNode(w[i]);
if(n?==?null)?{
n?=?new?TreeNode(w[i]);
p.born(w[i]?n);
}
p?=?n;
}
p.alsoLeaf?=?true;
}
/**
?*?@return?返回匹配最長詞的長度?沒有找到返回?0.
?*/
public?int?maxMatch(char[]?sen?int?offset)?{
int?idx?=?offset?-?1;
TreeNode?node?=?head;
for(int?i=offset;?i node?=?node.subNode(sen[i]);
if(node?!=?null)?{
if(node.isAlsoLeaf())?{
idx?=?i;?
}
}?else?{
break;
}
}
return?idx?-?offset?+?1;
}
public?ArrayList?maxMatch(ArrayList?tailLens?char[]?sen?int?offset)?{
TreeNode?node?=?head;
for(int?i=offset;?i node?=?node.subNode(sen[i]);
if(node?!=?null)?{
if(node.isAlsoLeaf())?{
tailLens.add(i-offset+1);?
}
}?else?{
break;
}
}
return?tailLens;
}
public?boolean?match(char[]?sen?int?offset?int?len)?{
TreeNode?node?=?head;
for(int?i=0;?i
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2015-11-10 17:44  com\
     目錄           0  2015-11-10 17:44  com\chenlb\
     目錄           0  2015-11-10 17:44  com\chenlb\mmseg4j\
     目錄           0  2015-11-10 17:44  com\chenlb\mmseg4j\analysis\
     文件         566  2015-11-10 17:26  com\chenlb\mmseg4j\analysis\ComplexAnalyzer.java
     文件        4526  2015-11-10 17:26  com\chenlb\mmseg4j\analysis\CutLetterDigitFilter.java
     文件         571  2015-11-10 17:26  com\chenlb\mmseg4j\analysis\MaxWordAnalyzer.java
     文件        1381  2015-11-10 17:26  com\chenlb\mmseg4j\analysis\MMSegAnalyzer.java
     文件        2617  2015-11-10 17:26  com\chenlb\mmseg4j\analysis\MMSegTokenizer.java
     文件         572  2015-11-10 17:26  com\chenlb\mmseg4j\analysis\SimpleAnalyzer.java
     文件        1731  2015-11-10 17:26  com\chenlb\mmseg4j\analysis\TokenUtils.java
     文件        3802  2015-11-10 17:28  com\chenlb\mmseg4j\CharNode.java
     文件        2681  2015-11-10 17:28  com\chenlb\mmseg4j\Chunk.java
     文件        4097  2015-11-10 17:28  com\chenlb\mmseg4j\ComplexSeg.java
     文件       12847  2015-11-10 17:28  com\chenlb\mmseg4j\Dictionary.java
     文件        1439  2015-11-10 17:28  com\chenlb\mmseg4j\MaxWordSeg.java
     文件       10355  2015-11-10 17:28  com\chenlb\mmseg4j\MMSeg.java
     目錄           0  2015-11-10 17:44  com\chenlb\mmseg4j\rule\
     文件         696  2015-11-10 17:28  com\chenlb\mmseg4j\rule\LargestAvgLenRule.java
     文件         813  2015-11-10 17:28  com\chenlb\mmseg4j\rule\LargestSumDegreeFreedomRule.java
     文件         622  2015-11-10 17:28  com\chenlb\mmseg4j\rule\MaxMatchRule.java
     文件        1147  2015-11-10 17:28  com\chenlb\mmseg4j\rule\Rule.java
     文件         761  2015-11-10 17:28  com\chenlb\mmseg4j\rule\SmallestVarianceRule.java
     文件        2345  2015-11-10 17:28  com\chenlb\mmseg4j\Seg.java
     文件        1173  2015-11-10 17:28  com\chenlb\mmseg4j\Sentence.java
     文件         695  2015-11-10 17:28  com\chenlb\mmseg4j\SimpleSeg.java
     文件        2465  2015-11-10 17:28  com\chenlb\mmseg4j\Word.java
     文件       43234  2015-11-10 17:43  mmseg4j-1.9.2.jar
評論
共有 條評論