資源簡介
使用Java調用HanLP分詞器實現兩個文本相似度對比,可以很快對比出百分率(1=100%)

代碼片段和文件信息
package?com.etoak.simHash;
import?com.hankcs.hanlp.seg.common.Term;
import?com.hankcs.hanlp.tokenizer.StandardTokenizer;
import?org.apache.commons.lang3.StringUtils;
import?org.jsoup.Jsoup;
import?org.jsoup.safety.Whitelist;
import?java.math.BigInteger;
import?java.util.HashMap;
import?java.util.List;
import?java.util.Map;
public?class?MySimHash?{
????private?String?tokens;?//字符串
????private?BigInteger?strSimHash;//字符產(chǎn)的hash值
????private?int?hashbits?=?64;?//?分詞后的hash數(shù);
????public?MySimHash(String?tokens)?{
????????this.tokens?=?tokens;
????????this.strSimHash?=?this.simHash();
????}
????private?MySimHash(String?tokens?int?hashbits)?{
????????this.tokens?=?tokens;
????????this.hashbits?=?hashbits;
????????this.strSimHash?=?this.simHash();
????}
????/**
?????*?清除html標簽
?????*?@param?content
?????*?@return
?????*/
????private?String?cleanResume(String?content)?{
????????//?若輸入為HTML下面會過濾掉所有的HTML的tag
????????content?=?Jsoup.clean(content?Whitelist.none());
????????content?=?StringUtils.lowerCase(content);
????????String[]?strings?=?{“?“?“\n“?“\r“?“\t“?“\\r“?“\\n“?“\\t“?“ “};
????????for?(String?s?:?strings)?{
????????????content?=?content.replaceAll(s?““);
????????}
????????return?content;
????}
????/**
?????*?這個是對整個字符串進行hash計算
?????*?@return
?????*/
????private?BigInteger?simHash()?{
????????tokens?=?cleanResume(tokens);?//?cleanResume?刪除一些特殊字符
????????int[]?v?=?new?int[this.hashbits];
????????List?termList?=?StandardTokenizer.segment(this.tokens);?//?對字符串進行分詞
????????//對分詞的一些特殊處理?:?比如:?根據(jù)詞性添加權(quán)重??過濾掉標點符號??過濾超頻詞匯等;
????????Map?weightOfNature?=?new?HashMap();?//?詞性的權(quán)重
????????weightOfNature.put(“n“?2);?//給名詞的權(quán)重是2;
????????Map?stopNatures?=?new?HashMap();//停用的詞性?如一些標點符號之類的;
????????stopNatures.put(“w“?““);?//
????????int?overCount?=?5;?//設(shè)定超頻詞匯的界限?;
????????Map?wordCount?=?new?HashMap();
????????for?(Term?term?:?termList)?{
????????????String?word?=?term.word;?//分詞字符串
????????????String?nature?=?term.nature.toString();?//?分詞屬性;
????????????//??過濾超頻詞
????????????if?(wordCount.containsKey(word))?{
????????????????int?count?=?wordCount.get(word);
????????????????if?(count?>?overCount)?{
????????????????????continue;
????????????????}
????????????????wordCount.put(word?count?+?1);
????????????}?else?{
????????????????wordCount.put(word?1);
????????????}
????????????//?過濾停用詞性
????????????if?(stopNatures.containsKey(nature))?{
????????????????continue;
????????????}
????????????//?2、將每一個分詞hash為一組固定長度的數(shù)列.比如?64bit?的一個整數(shù).
????????????BigInteger?t?=?this.hash(word);
????????????for?(int?i?=?0;?i?????????????????BigInteger?bitmask?=?new?BigInteger(“1“).shiftLeft(i);
????????????????//?3、建立一個長度為64的整數(shù)數(shù)組(假設(shè)要生成64位的數(shù)字指紋也可以是其它數(shù)字)
????????????????//?對每一個分詞hash后的數(shù)列進行判斷如果是1000...1那么數(shù)組的第一位和末尾一位加1
?????????????
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2017-12-03 14:59  SimHash\
     目錄           0  2017-12-03 14:58  SimHash\.idea\
     文件         626  2017-12-03 14:30  SimHash\.idea\compiler.xml
     文件         172  2017-12-03 14:30  SimHash\.idea\encodings.xml
     目錄           0  2017-12-03 14:30  SimHash\.idea\inspectionProfiles\
     目錄           0  2017-12-03 14:56  SimHash\.idea\libraries\
     文件         545  2017-12-03 14:30  SimHash\.idea\libraries\Maven__com_hankcs_hanlp_portable_1_3_4.xml
     文件         462  2017-12-03 14:30  SimHash\.idea\libraries\Maven__junit_junit_3_8_1.xml
     文件         556  2017-12-03 14:30  SimHash\.idea\libraries\Maven__org_apache_commons_commons_lang3_3_4.xml
     文件         485  2017-12-03 14:30  SimHash\.idea\libraries\Maven__org_jsoup_jsoup_1_10_3.xml
     文件         439  2017-12-03 14:30  SimHash\.idea\misc.xml
     文件         254  2017-12-03 14:29  SimHash\.idea\modules.xml
     文件       22159  2017-12-03 14:58  SimHash\.idea\workspace.xml
     文件        1549  2017-12-03 14:56  SimHash\pom.xml
     文件        1209  2017-12-03 14:56  SimHash\SimHash.iml
     目錄           0  2017-12-03 14:29  SimHash\src\
     目錄           0  2017-12-03 14:29  SimHash\src\main\
     目錄           0  2017-12-03 14:31  SimHash\src\main\java\
     目錄           0  2017-12-03 14:31  SimHash\src\main\java\com\
     目錄           0  2017-12-03 14:57  SimHash\src\main\java\com\etoak\
     目錄           0  2017-12-03 14:53  SimHash\src\main\java\com\etoak\simHash\
     文件       15305  2017-12-03 14:53  SimHash\src\main\java\com\etoak\simHash\MySimHash.java
     目錄           0  2017-12-03 14:29  SimHash\src\main\resources\
     目錄           0  2017-12-03 14:29  SimHash\src\test\
     目錄           0  2017-12-03 14:29  SimHash\src\test\java\
評論
共有 條評論