資源簡介
基於 TF-IDF 的文檔集關鍵詞提取
已經含有測試文檔集
可以替換成任意需要的文檔集
可以自己提供字典

代碼片段和文件信息
///////////////////////////////////////////////////////////////////
// File          :Dir txt Input
// Author        :ShuanHolmes
// Date          :2015.4.10
// Modifier      :...
// Modify Date   :...
// Description   :statics_Dir.cpp
///////////////////////////////////////////////////////////////////
#include?“Statics.h“?
// NOTE(review): the container template arguments (and the spaces, shown here
// as '?') appear to have been stripped from these declarations; restore them
// from Statics.h before compiling.
// Dic: cleared by DatabaseConstruction(); contents not visible in this file section.
extern?map?Dic;
// SinStatics: de-duplicated and drained into SumStatics by WordFrequency().
extern?list?SinStatics;
// SumStatics: its per-key count() is the divisor used by DatabaseConstruction().
extern?multiset?SumStatics;
// Fileidf: not referenced in the visible part of this file.
extern?multiset?Fileidf;
// Database: receives one Word entry per distinct key of SumStatics.
extern?set?Database;
// DataOut: defined here; not referenced in the visible part of this file.
list?DataOut;
void?getJustCurrentFile(?string?path?vector&?files)??
{????//?return?file?iter????
long??hFile??=??0;????//?file?info???
struct?_finddata_t?fileinfo;????
string?p;????
if((hFile?=?_findfirst(p.assign(path).append(“\\*“).c_str()&fileinfo))?!=?-1)????
{??????
do?????
{?????????
if((fileinfo.attrib?&?_A_SUBDIR));???????????????
else???????????????
files.push_back(fileinfo.name);?????????????
}while(_findnext(hFile?&fileinfo)?==?0);??????
_findclose(hFile);????
}??
}
void?WordFrequency(?void?)
{
SinStatics.unique();
while(!SinStatics.empty())
{
SumStatics.insert(SinStatics.back(?));
SinStatics.pop_back(?);
}
SinStatics.clear();?//?register?clear
}
void?DatabaseConstruction(?float?N?)?//?the?file?group?
{
multiset::iterator?it;
Word?temp;
for(it?=?SumStatics.begin();?it?!=?SumStatics.end();?it++?)
{
temp.wordfrequency?=?fabs(log(N/(float)SumStatics.count(*it))/log(2));
temp.word?=?*it;
if(Database.find(temp)==Database.end())
{
Database.insert(temp);
}
}
SinStatics.clear();
SumStatics.clear();
Dic.clear();
}
void?TfidfFileInput(char?*filename)?//?segment?the?sentence??store?the?real?words
{
ifstream?testfile(filename);
string?testsentence;
string?testword;
if?(!testfile)
cerr?<“Fail?to?open?“?< else
cout?<“Succeed?to?open?“?< cout?<“Please?wait?“< while(!testfile.eof())
{
getline(testfiletestsentence‘\n‘);
string?result_temp=““;
int?result_len?=?0;
string?sentence_temp=testsentence;
int?cur_sen_length=testsentence.length();
int?len1len2;
while(sentence_temp!=““)
{
len1?=?sentence_temp.length();
len2?=?sentence_temp.length();
if(len2?>?MaxWordLength)?//?MaxLength?
len2?=?MaxWordLength;
testword?=?sentence_temp.substr(len1-len2);
bool?isw?=?TFidfWordCheck(?testword?);
while(len2?>?2?&&?isw?==?false)
{
len2?=?len2-2;?//?2?Byte?1?word
testword?=?sentence_temp.substr(len1-len2);
isw?=?TFidfWordCheck(?testword?);
}
if(result_temp?==?““)
result_temp=testword+result_temp;?//?continue
else
result_temp=testword+“??“+result_temp;?//?cut
sentence_temp=sentence_temp.substr(0len1-len2);?//?next?sentence
}
}
testfile.close();
}
bool?TFidfWordCheck(string?test_word)?//?whether?t
?屬性????????????大小?????日期????時間???名稱
-----------?---------??----------?-----??----
?????目錄???????????0??2015-04-10?20:25??Tfidf_Calculate\
?????文件????????4456??2015-05-28?00:17??Tfidf_Calculate\DirInput.cpp
?????文件?????????940??2015-05-28?00:18??Tfidf_Calculate\main.cpp
?????目錄???????????0??2015-04-11?11:46??Tfidf_Calculate\mingw5\
?????文件????????3268??2003-07-21?19:40??Tfidf_Calculate\mingw5\(1).txt
?????文件????????5626??2015-04-15?22:41??Tfidf_Calculate\mingw5\(1)Out.txt
?????文件?????????998??2003-07-21?19:40??Tfidf_Calculate\mingw5\(10).txt
?????文件????????1698??2015-04-15?22:41??Tfidf_Calculate\mingw5\(10)Out.txt
?????文件????????1341??2003-07-21?19:40??Tfidf_Calculate\mingw5\(100).txt
?????文件????????2283??2015-04-15?22:41??Tfidf_Calculate\mingw5\(100)Out.txt
?????文件?????????699??2003-07-21?19:40??Tfidf_Calculate\mingw5\(101).txt
?????文件????????1241??2015-04-15?22:41??Tfidf_Calculate\mingw5\(101)Out.txt
?????文件?????????963??2003-07-21?19:40??Tfidf_Calculate\mingw5\(102).txt
?????文件????????1651??2015-04-15?22:41??Tfidf_Calculate\mingw5\(102)Out.txt
?????文件????????3045??2003-07-21?19:40??Tfidf_Calculate\mingw5\(103).txt
?????文件????????5183??2015-04-15?22:41??Tfidf_Calculate\mingw5\(103)Out.txt
?????文件?????????785??2003-07-21?19:40??Tfidf_Calculate\mingw5\(104).txt
?????文件????????1339??2015-04-15?22:41??Tfidf_Calculate\mingw5\(104)Out.txt
?????文件?????????814??2003-07-21?19:40??Tfidf_Calculate\mingw5\(105).txt
?????文件????????1442??2015-04-15?22:41??Tfidf_Calculate\mingw5\(105)Out.txt
?????文件????????1190??2003-07-21?19:40??Tfidf_Calculate\mingw5\(106).txt
?????文件????????2168??2015-04-15?22:41??Tfidf_Calculate\mingw5\(106)Out.txt
?????文件????????1265??2003-07-21?19:40??Tfidf_Calculate\mingw5\(107).txt
?????文件????????2209??2015-04-15?22:41??Tfidf_Calculate\mingw5\(107)Out.txt
?????文件????????1157??2003-07-21?19:40??Tfidf_Calculate\mingw5\(108).txt
?????文件????????2001??2015-04-15?22:41??Tfidf_Calculate\mingw5\(108)Out.txt
?????文件????????1195??2003-07-21?19:40??Tfidf_Calculate\mingw5\(109).txt
?????文件????????2011??2015-04-15?22:41??Tfidf_Calculate\mingw5\(109)Out.txt
?????文件?????????788??2003-07-21?19:40??Tfidf_Calculate\mingw5\(11).txt
?????文件????????1400??2015-04-15?22:41??Tfidf_Calculate\mingw5\(11)Out.txt
?????文件????????1000??2003-07-21?19:40??Tfidf_Calculate\mingw5\(110).txt
............此處省略308個文件信息
評論
共有 條評論