91av视频/亚洲h视频/操亚洲美女/外国一级黄色毛片 - 国产三级三级三级三级

資源簡介

中文文本相似度匹配算法 simHash 海明距離 IK分詞 完整的可運行的示例代碼 包含simHash 算法,使用IK 對中文文本進行分詞處理

資源截圖

代碼片段和文件信息

package?SimHash;?/**
?*?Created?by?danjunwei_sx?on?2017/11/15.
?*/

import?org.wltea.analyzer.core.IKSegmenter;
import?org.wltea.analyzer.core.Lexeme;

import?java.io.IOException;
import?java.io.StringReader;
import?java.math.BigInteger;
import?java.util.ArrayList;
import?java.util.HashMap;
import?java.util.List;

/**
 * Function: simHash 判斷文本相似度,該示例程序支持中文
 *
 * date: 2013-8-6 上午1:11:48
 *
 * @author june
 * @version 0.1
 */

public?class?SimHash
{
???/*?private?String?tokens;

????private?BigInteger?intSimHash;

????private?String?strSimHash;

????private?int?hashbits?=?64;*/
????public?String?tokens;

????public?BigInteger?intSimHash;

????public?String?strSimHash;

????public?int?hashbits?=?64;

????public?SimHash(String?tokens)?throws?IOException?{
????????this.tokens?=?tokens;
????????this.intSimHash?=?this.simHash();
????}

????public?SimHash(String?tokens?int?hashbits)?throws?IOException?{
????????this.tokens?=?tokens;
????????this.hashbits?=?hashbits;
????????this.intSimHash?=?this.simHash();
????}

????HashMap?wordMap?=?new?HashMap();

????public?BigInteger?simHash()?throws?IOException?{
????????//?定義特征向量/數(shù)組
????????int[]?v?=?new?int[this.hashbits];
????????//?英文分詞
????????//?StringTokenizer?stringTokens?=?new?StringTokenizer(this.tokens);
????????//?while?(stringTokens.hasMoreTokens())?{
????????//?String?temp?=?stringTokens.nextToken();
????????//?}
????????//?1、中文分詞,分詞器采用?IKAnalyzer3.2.8?,僅供演示使用,新版?API?已變化。
????????StringReader?reader?=?new?StringReader(this.tokens);
????????//?當(dāng)為true時(shí),分詞器進(jìn)行最大詞長(zhǎng)切分
???????//?IKSegmentation?ik?=?new?IKSegmentation(reader?true);
????????IKSegmenter?ik??=?new?IKSegmenter(readertrue);
????????Lexeme?lexeme?=?null;
????????String?word?=?null;
????????StringBuffer?temp?=?new?StringBuffer();????//String?temp?=?null
????????while?((lexeme?=?ik.next())?!=?null)?{
????????????word?=?lexeme.getLexemeText();
????????????temp.append(word+“?“);//temp?=?temp+“?“+word;
????????????//?注意停用詞會(huì)被干掉
????????????//?System.out.println(word);
????????????//?2、將每一個(gè)分詞hash為一組固定長(zhǎng)度的數(shù)列.比如?64bit?的一個(gè)整數(shù).
????????????BigInteger?t?=?this.hash(word);
????????????for?(int?i?=?0;?i?????????????????BigInteger?bitmask?=?new?BigInteger(“1“).shiftLeft(i);
????????????????//?3、建立一個(gè)長(zhǎng)度為64的整數(shù)數(shù)組(假設(shè)要生成64位的數(shù)字指紋也可以是其它數(shù)字)
????????????????//?對(duì)每一個(gè)分詞hash后的數(shù)列進(jìn)行判斷如果是1000...1那么數(shù)組的第一位和末尾一位加1
????????????????//?中間的62位減一也就是說(shuō)逢1加1逢0減1.一直到把所有的分詞hash數(shù)列全部判斷完畢.
????????????????if?(t.and(bitmask).signum()?!=?0)?{
????????????????????//?這里是計(jì)算整個(gè)文檔的所有特征的向量和
????????????????????//?這里實(shí)際使用中需要?+-?權(quán)重,比如詞頻,而不是簡(jiǎn)單的?+1/-1,
????????????????????v[i]?+=?1;
????????????????}?else?{
????????????????????v[i]?-=?1;
????????????????}
????????????}
????????}
????????System.out.println(temp);?//
????????BigInteger?fingerprint?=?new?BigInteger(“0“);
????????StringBuffer?simHashBuffer?=?new?StringBuffer();
????????for?(int?i?=?0;?i

?屬性????????????大小?????日期????時(shí)間???名稱
-----------?---------??----------?-----??----

?????文件?????????11??2017-11-15?12:00??simHash\mvn_project\.idea\.name

?????文件???????1151??2017-11-15?12:01??simHash\mvn_project\.idea\compiler.xml

?????文件????????941??2017-11-16?17:48??simHash\mvn_project\.idea\encodings.xml

?????文件????????224??2017-11-15?15:21??simHash\mvn_project\.idea\libraries\ikanalyzer_2012_u6.xml

?????文件????????337??2017-11-15?15:27??simHash\mvn_project\.idea\libraries\IKAnalyzer_all_jar.xml

?????文件????????462??2017-11-15?12:01??simHash\mvn_project\.idea\libraries\Maven__junit_junit_3_8_1.xml

?????文件????????584??2017-11-15?15:09??simHash\mvn_project\.idea\misc.xml

?????文件????????273??2017-11-15?12:00??simHash\mvn_project\.idea\modules.xml

?????文件????????143??2017-11-15?12:00??simHash\mvn_project\.idea\scopes\scope_settings.xml

?????文件????????173??2017-11-15?12:00??simHash\mvn_project\.idea\vcs.xml

?????文件??????34268??2017-11-17?09:52??simHash\mvn_project\.idea\workspace.xml

?????文件????4939784??2017-11-15?15:26??simHash\mvn_project\jar\IKAnalyzer_all_jar.zip

?????文件?????????23??2017-11-17?10:06??simHash\mvn_project\jar\jar包引入.txt

?????文件???????1021??2017-11-17?09:52??simHash\mvn_project\mvn_project.iml

?????文件???????1383??2017-11-17?09:52??simHash\mvn_project\pom.xml

?????文件???????7044??2017-11-15?17:45??simHash\mvn_project\src\main\java\SimHash\SimHash.java

?????文件???????5800??2017-11-17?09:50??simHash\mvn_project\src\main\java\Test\Test.java

?????文件?????????57??2017-11-15?12:01??simHash\mvn_project\src\main\webapp\index.jsp

?????文件????????222??2017-11-15?12:01??simHash\mvn_project\src\main\webapp\WEB-INF\web.xml

?????文件???????5080??2017-11-17?09:52??simHash\mvn_project\target\classes\SimHash\SimHash.class

?????文件???????4612??2017-11-17?09:52??simHash\mvn_project\target\classes\Test\Test.class

?????目錄??????????0??2017-11-15?17:45??simHash\mvn_project\src\main\java\SimHash

?????目錄??????????0??2017-11-17?09:50??simHash\mvn_project\src\main\java\Test

?????目錄??????????0??2017-11-15?12:01??simHash\mvn_project\src\main\webapp\WEB-INF

?????目錄??????????0??2017-11-17?09:50??simHash\mvn_project\src\main\java

?????目錄??????????0??2017-11-15?12:01??simHash\mvn_project\src\main\resources

?????目錄??????????0??2017-11-15?12:01??simHash\mvn_project\src\main\webapp

?????目錄??????????0??2017-11-17?09:52??simHash\mvn_project\target\classes\SimHash

?????目錄??????????0??2017-11-17?09:52??simHash\mvn_project\target\classes\Test

?????目錄??????????0??2017-11-15?15:29??simHash\mvn_project\target\generated-sources\annotations

............此處省略14個文件信息

評論

共有 條評論

相關資源