資源簡介
Doc2vec 是一群用來產生詞向量的相關模型。這些模型為淺層的雙層神經網絡,經過訓練後用來重新建構詞的語言學上下文。
代碼片段和文件信息
package?com.ansj.vec;
import?java.io.BufferedOutputStream;
import?java.io.BufferedReader;
import?java.io.DataOutputStream;
import?java.io.File;
import?java.io.FileInputStream;
import?java.io.FileOutputStream;
import?java.io.IOException;
import?java.io.InputStreamReader;
import?java.util.ArrayList;
import?java.util.HashMap;
import?java.util.List;
import?java.util.Map;
import?java.util.Map.Entry;
import?love.cq.util.MapCount;
import?com.ansj.vec.domain.HiddenNeuron;
import?com.ansj.vec.domain.Neuron;
import?com.ansj.vec.domain.WordNeuron;
import?com.ansj.vec.util.Haffman;
public?class?Learn?{
private?Map?wordMap?=?new?HashMap<>();
/**
?*?訓練多少個特征
?*/
private?int?layerSize?=?200;
/**
?*?上下文窗口大小
?*/
private?int?window?=?5;
private?double?sample?=?1e-3;
private?double?alpha?=?0.025;
private?double?startingAlpha?=?alpha;
public?int?EXP_TABLE_SIZE?=?1000;
private?Boolean?isCbow?=?false;
private?double[]?expTable?=?new?double[EXP_TABLE_SIZE];
private?int?trainWordsCount?=?0;
private?int?MAX_EXP?=?6;
private?int?freqThresold?=?5;
/**
 * Creates a trainer, replacing the built-in default of every setting whose
 * argument is non-null. A {@code null} argument keeps the default.
 *
 * @param isCbow    training mode flag, or {@code null} for the default
 * @param layerSize number of features to train, or {@code null}
 * @param window    context window size, or {@code null}
 * @param alpha     initial learning rate, or {@code null}
 * @param sample    subsampling parameter, or {@code null}
 */
public Learn(Boolean isCbow, Integer layerSize, Integer window,
        Double alpha, Double sample) {
    createExpTable();
    if (isCbow != null) {
        this.isCbow = isCbow;
    }
    if (layerSize != null) {
        this.layerSize = layerSize;
    }
    if (window != null) {
        this.window = window;
    }
    if (alpha != null) {
        this.alpha = alpha;
        // trainModel recomputes alpha from startingAlpha on every progress
        // update, so the baseline has to follow a caller-supplied rate;
        // otherwise the custom value is discarded after the first update.
        this.startingAlpha = alpha;
    }
    if (sample != null) {
        this.sample = sample;
    }
}
/**
 * Creates a trainer that uses the built-in default settings.
 *
 * @throws IOException declared by the signature; the only visible statement
 *                     is the call to createExpTable()
 */
public Learn() throws IOException {
    this.createExpTable();
}
/**
?*?trainModel
?*?
?*?@throws?IOException
?*/
private?void?trainModel(File?file)?throws?IOException?{
try?(BufferedReader?br?=?new?BufferedReader(new?InputStreamReader(
new?FileInputStream(file))))?{
String?temp?=?null;
long?nextRandom?=?5;
int?wordCount?=?0;
int?lastWordCount?=?0;
int?wordCountActual?=?0;
while?((temp?=?br.readLine())?!=?null)?{
if?(wordCount?-?lastWordCount?>?10000)?{
System.out.println(“alpha:“
+?alpha
+?“\tProgress:?“
+?(int)?(wordCountActual
/?(double)?(trainWordsCount?+?1)?*?100)
+?“%“);
wordCountActual?+=?wordCount?-?lastWordCount;
lastWordCount?=?wordCount;
alpha?=?startingAlpha
*?(1?-?wordCountActual
/?(double)?(trainWordsCount?+?1));
if?(alpha? alpha?=?startingAlpha?*?0.0001;
}
}
String[]?strs?=?temp.split(“?“);
wordCount?+=?strs.length;
List?sentence?=?new?ArrayList();
for?(int?i?=?0;?i? Neuron?entry?=?wordMap.get(strs[i]);
if?(entry?==?null)?{
continue;
}
//?The?subsampling?randomly?discards?frequent?words?while
//?keeping?the?ranking?same
if?(sample?>?0)?{
double?ran?=?(Math.sqrt(entry.freq
/?(sample?*?trainWordsCount))?+?1)
*?(sample?*?trainWordsCount)?/?entry.freq;
nextRandom?=?nextRandom?*?25214903917L?+?11;
if?(ran?(nextRandom?&?0xFFFF)?/?(d
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2015-07-23 07:26  doc2vec_java-master\
     文件         471  2015-07-23 07:26  doc2vec_java-master\.classpath
     文件         371  2015-07-23 07:26  doc2vec_java-master\.project
     目錄           0  2015-07-23 07:26  doc2vec_java-master\.settings\
     文件         658  2015-07-23 07:26  doc2vec_java-master\.settings\org.eclipse.jdt.core.prefs
     文件         647  2015-07-23 07:26  doc2vec_java-master\README.md
     目錄           0  2015-07-23 07:26  doc2vec_java-master\bin\
     目錄           0  2015-07-23 07:26  doc2vec_java-master\bin\com\
     目錄           0  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\
     目錄           0  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\
     文件       11613  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\Learn.class
     文件       11686  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\LearnDocVec.class
     文件        9784  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\Word2VEC.class
     目錄           0  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\domain\
     文件         389  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\domain\HiddenNeuron.class
     文件         718  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\domain\Neuron.class
     文件        1207  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\domain\WordEntry.class
     文件        1610  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\domain\WordNeuron.class
     目錄           0  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\util\
     文件        1475  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\util\Haffman.class
     文件        2818  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\util\ModelFile.class
     文件        1050  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\util\ReadWriteFile.class
     文件        1516  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\util\WordKmeans$Classes$1.class
     文件        3258  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\util\WordKmeans$Classes.class
     文件        4510  2015-07-23 07:26  doc2vec_java-master\bin\com\ansj\vec\util\WordKmeans.class
     目錄           0  2015-07-23 07:26  doc2vec_java-master\bin\test\
     文件        2937  2015-07-23 07:26  doc2vec_java-master\bin\test\Doc2VecTest.class
     文件        1484  2015-07-23 07:26  doc2vec_java-master\bin\test\Word2VecTest.class
     目錄           0  2015-07-23 07:26  doc2vec_java-master\file\
     文件     7680759  2015-07-23 07:26  doc2vec_java-master\file\amazon_docs.txt
     文件    16492176  2015-07-23 07:26  doc2vec_java-master\file\clinical_doc_200_java.vec
............此處省略27個文件信息
- 上一篇:簡單的即時便簽
- 下一篇:javaweb課程大作業——教務管理系統
評論
共有 條評論