-
大小: 46.37MB | 文件類型: .rar | 金幣: 1 | 下載: 0 次 | 發布日期: 2023-07-17
- 語言: Java
- 標簽:
資源簡介
Java實現的文本相似度系統，使用向量空間模型以及余弦相似度距離公式，實測可以實現2篇文本的相似度計算且有一定的效果。
代碼片段和文件信息
package?algorithm;
import?java.io.IOException;
import?java.util.HashMap;
import?java.util.Iterator;
import?java.util.List;
import?java.util.Map;
import?script.FileHandler;
import?script.StringHandler;
import?util.Conf;
import?util.Tool;
import?word2vec.main.java.com.ansj.vec.Word2VEC;
public?class?Model?{
/**
?*?input:docList1docList2?主方法入口及控制器
?*/
public?void?run(List?docList1?List?docList2?Word2VEC?w)?{
int?i?j?n1?n2;
double[]?similarArr;
int[]?locArr;
double?max?temp;
int?loc;
n1?=?docList1.size();
n2?=?docList2.size();
similarArr?=?new?double[n1];
locArr?=?new?int[n1];
for?(i?=?0;?i? max?=?0.0;
temp?=?0.0;
loc?=?0;
for?(j?=?0;?j? try?{
temp?=?getSimilar(docList1.get(i)?docList2.get(j)?w);
}?catch?(IOException?e)?{
temp?=?0.0;
e.printStackTrace();
}
if?(temp?>?max)?{
max?=?temp;
loc?=?j;
}
}
similarArr[i]?=?max;
locArr[i]?=?loc;
}
Tool.output(docList1?docList2?locArr?similarArr);
}
/**
?*?input:str1str2?計算2個字符串之間的相似度
?*/
public?double?getSimilar(String?str1?String?str2?Word2VEC?w)?throws?IOException?{
double?ret?=?0.0;
//?創建向量空間模型,使用map實現,主鍵為詞項,值為長度為2的數組,存放著對應詞項在字符串中的出現次數
Map?vectorSpace?=?new?HashMap();
int[]?itemCountArray?=?null;//?為了避免頻繁產(chǎn)生局部變量,所以將itemCountArray聲明在此
Iterator?iter;
double?vector1Modulo?=?0.00;//?向量1的模
double?vector2Modulo?=?0.00;//?向量2的模
double?vectorProduct?=?0.00;?//?向量積
List?list1list1_templist2list2_temptemp1temp2;
StringHandler?sh?=?new?StringHandler();
list1_temp?=?sh.stringToArray(str1);
list2_temp?=?sh.stringToArray(str2);
/*
//使用word2vec擴充語義
temp1?=?sh.stringToArray(str1);
temp2?=?sh.stringToArray(str2);
list1?=?sh.extendWord(w?temp1);
list2?=?sh.extendWord(w?temp2);
*/
list1?=?sh.deleteStopWords(list1_temp?Conf.stopWordsPath);
list2?=?sh.deleteStopWords(list2_temp?Conf.stopWordsPath);
int?in;
n?=?list1.size();
for?(i?=?0;?i? if?(vectorSpace.containsKey(list1.get(i)))
++(vectorSpace.get(list1.get(i))[0]);
else?{
itemCountArray?=?new?int[2];
itemCountArray[0]?=?1;
itemCountArray[1]?=?0;
vectorSpace.put(list1.get(i)?itemCountArray);
}
}
//?對str2處理
n?=?list2.size();
for?(i?=?0;?i? if?(vectorSpace.containsKey(list2.get(i)))
++(vectorSpace.get(list2.get(i))[1]);
else?{
itemCountArray?=?new?int[2];
itemCountArray[0]?=?0;
itemCountArray[1]?=?1;
vectorSpace.put(list2.get(i)?itemCountArray);
}
}
//?計算相似度
iter?=?vectorSpace.entrySet().iterator();
while?(iter.hasNext())?{
Map.Entry?entry?=?(Map.Entry)?iter.next();
itemCountArray?=?(int[])?entry.getValue();
vector1Modulo?+=?itemCountArray[0]?*?itemCountArray[0];
vector2Modulo?+=?itemCountAr
?屬性????????????大小?????日期????時間???名稱
-----------?---------??----------?-----??----
?????文件????????395??2017-01-20?11:37??DocDistance\.classpath
?????文件????????387??2017-01-20?11:37??DocDistance\.project
?????文件????????138??2017-01-20?11:37??DocDistance\.settings\org.eclipse.core.resources.prefs
?????文件????????598??2017-01-20?11:37??DocDistance\.settings\org.eclipse.jdt.core.prefs
?????文件???????3807??2017-01-20?11:37??DocDistance\bin\algorithm\Model.class
?????文件???????2387??2017-01-20?11:37??DocDistance\bin\sc
?????文件???????2497??2017-01-20?11:37??DocDistance\bin\sc
?????文件???????3661??2017-01-20?11:37??DocDistance\bin\sc
?????文件???????1173??2017-01-20?11:37??DocDistance\bin\sc
?????文件????????431??2017-01-20?11:37??DocDistance\bin\util\Conf.class
?????文件???????3267??2017-01-20?11:37??DocDistance\bin\util\Tool.class
?????文件????????446??2017-01-20?11:37??DocDistance\bin\word2vec\main\java\com\ansj\vec\domain\HiddenNeuron.class
?????文件????????890??2017-01-20?11:37??DocDistance\bin\word2vec\main\java\com\ansj\vec\domain\Neuron.class
?????文件???????1283??2017-01-20?11:37??DocDistance\bin\word2vec\main\java\com\ansj\vec\domain\WordEntry.class
?????文件???????2063??2017-01-20?11:37??DocDistance\bin\word2vec\main\java\com\ansj\vec\domain\WordNeuron.class
?????文件??????13008??2017-01-20?11:37??DocDistance\bin\word2vec\main\java\com\ansj\vec\Learn.class
?????文件???????1688??2017-01-20?11:37??DocDistance\bin\word2vec\main\java\com\ansj\vec\util\Haffman.class
?????文件???????3036??2017-01-20?11:37??DocDistance\bin\word2vec\main\java\com\ansj\vec\util\MapCount.class
?????文件???????1630??2017-01-20?11:37??DocDistance\bin\word2vec\main\java\com\ansj\vec\util\WordKmeans$Classes$1.class
?????文件???????3392??2017-01-20?11:37??DocDistance\bin\word2vec\main\java\com\ansj\vec\util\WordKmeans$Classes.class
?????文件???????3680??2017-01-20?11:37??DocDistance\bin\word2vec\main\java\com\ansj\vec\util\WordKmeans.class
?????文件???????9617??2017-01-20?11:37??DocDistance\bin\word2vec\main\java\com\ansj\vec\Word2VEC.class
?????文件??????10721??2017-01-20?11:37??DocDistance\brand_ext.dic
?????文件???????1172??2017-01-20?11:37??DocDistance\color_ext.dic
?????文件??????19239??2017-01-20?11:04??DocDistance\data\result\out1_1.txt
?????文件??????17407??2017-01-20?11:05??DocDistance\data\result\out1_2.txt
?????文件??????15407??2017-01-20?11:09??DocDistance\data\result\out1_3.txt
?????文件??????10116??2017-01-20?11:13??DocDistance\data\result\out2_3.txt
?????文件??????80648??2017-01-20?11:29??DocDistance\data\result\out3_1.txt
?????文件??????73147??2017-01-20?11:26??DocDistance\data\result\out3_2.txt
............此處省略62個文件信息
評論
共有 條評論