資源簡介
中文文本相似度匹配算法 simHash 海明距離 IK分詞
完整的可運行的示例代碼,包含 simHash 算法,使用 IK 對中文文本進行分詞處理

代碼片段和文件信息
package?SimHash;?/**
?*?Created?by?danjunwei_sx?on?2017/11/15.
?*/
import?org.wltea.analyzer.core.IKSegmenter;
import?org.wltea.analyzer.core.Lexeme;
import?java.io.IOException;
import?java.io.StringReader;
import?java.math.BigInteger;
import?java.util.ArrayList;
import?java.util.HashMap;
import?java.util.List;
/**
?*?Function:?simHash?判斷文本相似度,該示例程支持中文
?*?date:?2013-8-6?上午1:11:48?
?*?@author?june
?*?@version?0.1
?*/
public?class?SimHash
{
???/*?private?String?tokens;
????private?BigInteger?intSimHash;
????private?String?strSimHash;
????private?int?hashbits?=?64;*/
????public?String?tokens;
????public?BigInteger?intSimHash;
????public?String?strSimHash;
????public?int?hashbits?=?64;
????public?SimHash(String?tokens)?throws?IOException?{
????????this.tokens?=?tokens;
????????this.intSimHash?=?this.simHash();
????}
????public?SimHash(String?tokens?int?hashbits)?throws?IOException?{
????????this.tokens?=?tokens;
????????this.hashbits?=?hashbits;
????????this.intSimHash?=?this.simHash();
????}
????HashMap?wordMap?=?new?HashMap();
????public?BigInteger?simHash()?throws?IOException?{
????????//?定義特征向量/數(shù)組
????????int[]?v?=?new?int[this.hashbits];
????????//?英文分詞
????????//?StringTokenizer?stringTokens?=?new?StringTokenizer(this.tokens);
????????//?while?(stringTokens.hasMoreTokens())?{
????????//?String?temp?=?stringTokens.nextToken();
????????//?}
????????//?1、中文分詞,分詞器采用?IKAnalyzer3.2.8?,僅供演示使用,新版?API?已變化。
????????StringReader?reader?=?new?StringReader(this.tokens);
????????//?當(dāng)為true時(shí),分詞器進(jìn)行最大詞長(zhǎng)切分
???????//?IKSegmentation?ik?=?new?IKSegmentation(reader?true);
????????IKSegmenter?ik??=?new?IKSegmenter(readertrue);
????????Lexeme?lexeme?=?null;
????????String?word?=?null;
????????StringBuffer?temp?=?new?StringBuffer();????//String?temp?=?null
????????while?((lexeme?=?ik.next())?!=?null)?{
????????????word?=?lexeme.getLexemeText();
????????????temp.append(word+“?“);//temp?=?temp+“?“+word;
????????????//?注意停用詞會(huì)被干掉
????????????//?System.out.println(word);
????????????//?2、將每一個(gè)分詞hash為一組固定長(zhǎng)度的數(shù)列.比如?64bit?的一個(gè)整數(shù).
????????????BigInteger?t?=?this.hash(word);
????????????for?(int?i?=?0;?i?????????????????BigInteger?bitmask?=?new?BigInteger(“1“).shiftLeft(i);
????????????????//?3、建立一個(gè)長(zhǎng)度為64的整數(shù)數(shù)組(假設(shè)要生成64位的數(shù)字指紋也可以是其它數(shù)字)
????????????????//?對(duì)每一個(gè)分詞hash后的數(shù)列進(jìn)行判斷如果是1000...1那么數(shù)組的第一位和末尾一位加1
????????????????//?中間的62位減一也就是說(shuō)逢1加1逢0減1.一直到把所有的分詞hash數(shù)列全部判斷完畢.
????????????????if?(t.and(bitmask).signum()?!=?0)?{
????????????????????//?這里是計(jì)算整個(gè)文檔的所有特征的向量和
????????????????????//?這里實(shí)際使用中需要?+-?權(quán)重,比如詞頻,而不是簡(jiǎn)單的?+1/-1,
????????????????????v[i]?+=?1;
????????????????}?else?{
????????????????????v[i]?-=?1;
????????????????}
????????????}
????????}
????????System.out.println(temp);?//
????????BigInteger?fingerprint?=?new?BigInteger(“0“);
????????StringBuffer?simHashBuffer?=?new?StringBuffer();
????????for?(int?i?=?0;?i
?屬性????????????大小?????日期????時間???名稱
-----------?---------??----------?-----??----
?????文件?????????11??2017-11-15?12:00??simHash\mvn_project\.idea\.name
?????文件???????1151??2017-11-15?12:01??simHash\mvn_project\.idea\compiler.xm
?????文件????????941??2017-11-16?17:48??simHash\mvn_project\.idea\encodings.xm
?????文件????????224??2017-11-15?15:21??simHash\mvn_project\.idea\libraries\ikanalyzer_2012_u6.xm
?????文件????????337??2017-11-15?15:27??simHash\mvn_project\.idea\libraries\IKAnalyzer_all_jar.xm
?????文件????????462??2017-11-15?12:01??simHash\mvn_project\.idea\libraries\Maven__junit_junit_3_8_1.xm
?????文件????????584??2017-11-15?15:09??simHash\mvn_project\.idea\misc.xm
?????文件????????273??2017-11-15?12:00??simHash\mvn_project\.idea\modules.xm
?????文件????????143??2017-11-15?12:00??simHash\mvn_project\.idea\scopes\scope_settings.xm
?????文件????????173??2017-11-15?12:00??simHash\mvn_project\.idea\vcs.xm
?????文件??????34268??2017-11-17?09:52??simHash\mvn_project\.idea\workspace.xm
?????文件????4939784??2017-11-15?15:26??simHash\mvn_project\jar\IKAnalyzer_all_jar.zip
?????文件?????????23??2017-11-17?10:06??simHash\mvn_project\jar\jar包引入.txt
?????文件???????1021??2017-11-17?09:52??simHash\mvn_project\mvn_project.iml
?????文件???????1383??2017-11-17?09:52??simHash\mvn_project\pom.xm
?????文件???????7044??2017-11-15?17:45??simHash\mvn_project\src\main\java\SimHash\SimHash.java
?????文件???????5800??2017-11-17?09:50??simHash\mvn_project\src\main\java\Test\Test.java
?????文件?????????57??2017-11-15?12:01??simHash\mvn_project\src\main\webapp\index.jsp
?????文件????????222??2017-11-15?12:01??simHash\mvn_project\src\main\webapp\WEB-INF\web.xm
?????文件???????5080??2017-11-17?09:52??simHash\mvn_project\target\classes\SimHash\SimHash.class
?????文件???????4612??2017-11-17?09:52??simHash\mvn_project\target\classes\Test\Test.class
?????目錄??????????0??2017-11-15?17:45??simHash\mvn_project\src\main\java\SimHash
?????目錄??????????0??2017-11-17?09:50??simHash\mvn_project\src\main\java\Test
?????目錄??????????0??2017-11-15?12:01??simHash\mvn_project\src\main\webapp\WEB-INF
?????目錄??????????0??2017-11-17?09:50??simHash\mvn_project\src\main\java
?????目錄??????????0??2017-11-15?12:01??simHash\mvn_project\src\main\resources
?????目錄??????????0??2017-11-15?12:01??simHash\mvn_project\src\main\webapp
?????目錄??????????0??2017-11-17?09:52??simHash\mvn_project\target\classes\SimHash
?????目錄??????????0??2017-11-17?09:52??simHash\mvn_project\target\classes\Test
?????目錄??????????0??2017-11-15?15:29??simHash\mvn_project\target\generated-sources\annotations
............此處省略14個文件信息
- 上一篇:肌電信號采集
- 下一篇:《數據通信原理》歷年試題
評論
共有 條評論