資源簡介
TFIDF是經典的算法,可以進行文本相似度計算和文檔聚類,值得研究

代碼片段和文件信息
/************************************************
* 《WEB數據挖掘與知識發現》試驗報告實現程序     *
* 功能:采用TFIDF自動對文本進行形式化(題目6)  *
*             *
* 時間:2008.2.28                               *
************************************************/
// NOTE(review): the header names were lost when this listing was extracted.
// <stdio.h>, <stdlib.h> and <string.h> are required by the visible code
// (FILE/fgetc/printf, malloc, strcmp); <math.h> is presumably the fourth
// header (log() for the IDF term) - confirm against the original source.
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define FNUM 20   // total number of input files used
// Node of the binary search tree that stores the words of one document.
struct Ttree
{
    char data[20];          // the word itself, NUL-terminated (at most 19 characters)
    double weight;          // NOTE(review): presumably the TF-IDF weight of the word - computed outside this excerpt, confirm
    double num;             // number of times this index term occurs in one document
    double max;             // total number of words in the document
    double n;               // number of documents in which the index term occurs
    struct Ttree *lchild;   // left child
    struct Ttree *rchild;   // right child
};

// Lets the bare name `Ttree` be used as a type in C as well as in C++.
typedef struct Ttree Ttree;
// NOTE(review): presumably the root of the tree ordered by weight - its users
// are outside this excerpt, confirm.
struct Ttree *rootW = NULL;

// Intermediate pointers for the weight-sorting routine (added on March 2).
struct Ttree *mtree = NULL, *ntree = NULL, *rtree = NULL;

// Output stream for "mm.txt", opened for writing during static initialisation.
// A function call in a file-scope initialiser is only valid when this file is
// compiled as C++. The result is not checked here, so every user must be ready
// for fp == NULL. Functions that take a parameter named `fp` shadow this global.
FILE *fp = fopen("mm.txt", "w");
//創建二叉樹用來存放單詞,以及該詞在文檔中出現的次數
Ttree?*createTtree(Ttree?*rootFILE?*fp){
int?i=0t=0;
struct?Ttree?*p*q;??????????????//定義中間指針變量
char?ch;
p=(Ttree*)malloc(sizeof(Ttree));?//申請新的存儲空間
p->data[0]=‘\0‘;
p->max=0;????????????????????????//**************3月1日增加
if(fp==NULL)
{
printf(“\nCannot?open?file?strike?any?key?exit!“);
return?NULL;
}
ch=fgetc(fp);
while((ch!=EOF)&&(t==0))
{?
if((ch>=‘a‘&&ch<=‘z‘)||(ch>=‘A‘&&ch<=‘Z‘)){
if(ch<=‘Z‘)?ch=ch+32;
p->data[i]=ch;
i++;
}
else
{
if(p->data[0]==‘\0‘){
ch=fgetc(fp);
continue;
}
p->data[i]=‘\0‘;
p->max++;
p->n=1;
p->num=1;
i=0;
t=1;
p->lchild=NULL;
p->rchild=NULL;???????????//初始化頭節點的左右兒子為空指針
root=p;
}
ch=fgetc(fp);
}
????q=(Ttree*)malloc(sizeof(Ttree));
????q->data[0]=‘\0‘?;
while(ch!=EOF){
if(?(ch>=‘a‘&&ch<=‘z‘)?||?(ch>=‘A‘&&ch<=‘Z‘)?)?{
if(ch<=‘Z‘)?ch=ch+32;
q->data[i]=ch;
i++;
ch=fgetc(fp);
}
????????else{
if(q->data[0]==‘\0‘)
{
ch=fgetc(fp);
continue;
}
q->data[i]=‘\0‘;
root->max++;
q->n=1;
q->num=1;
i=0;
q->lchild=NULL;
q->rchild=NULL;????????????????????//初始化頭節點的左右兒子為空指針
if(p==NULL)p=root;
ch=fgetc(fp);
while(p!=NULL)?????????????????????//尋找待插入節點的位置
{
if(strcmp(q->datap->data)<0){?//如果待插入的節點的值小于當前節點的值,
if(p->lchild==NULL)????????//且其左子樹為空
{
p->lchild=q;???????????//??則插入
p=NULL;
}??????????????????????????//并置當前節點為空,退出當前的while循環
else
p=p->lchild;
}?//?否則繼續訪問其左子樹
else?if(strcmp(q->datap->data)>0){?//如果待插入的節點的值大于當前節點的值
if(p->rchild==NULL)?????????????//?且其右子樹為空
{
p->rchild=q;????????????????//??則插入
p=NULL;
}?//并置當前節點為空,退出當前的while循環
else
p=p->rchild;
}?//?否則繼續訪問其右子樹
else{
p->num++;
p=NULL;
}
}//while
????????????q=(Ttree*)malloc(sizeof(Ttree));
????????????q->data[0]=‘\0‘;
}//else
}//while
return?root;
}
/*
二叉樹查找
計算某個詞在幾篇文檔中出現
*/
Ttree?*SearchBinTtree(Ttree?*rootxTtree?*rooty){
if(rootx==NULL)?return?NULL;
if(strcmp(rootx->datarooty->data)==0){
rooty->n++;
return?rootx;
}
if(strcmp(rootx->datarooty->data)>0)?return?S
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件        712  2015-06-01 19:48  tfidfsrc\Debug\cl.command.1.tlog
     文件       1876  2015-06-01 19:48  tfidfsrc\Debug\CL.read.1.tlog
     文件        340  2015-06-01 19:48  tfidfsrc\Debug\CL.write.1.tlog
     文件     500224  2015-06-01 19:48  tfidfsrc\Debug\InformationRetrieval.exe
     文件        406  2015-06-01 19:48  tfidfsrc\Debug\InformationRetrieval.exe.em
     文件        472  2015-06-01 19:48  tfidfsrc\Debug\InformationRetrieval.exe.em
     文件        381  2015-06-01 19:48  tfidfsrc\Debug\InformationRetrieval.exe.intermediate.manifest
     文件    1074516  2015-06-01 19:48  tfidfsrc\Debug\InformationRetrieval.ilk
     文件         64  2015-06-01 19:48  tfidfsrc\Debug\InformationRetrieval.lastbuildstate
     文件       5090  2015-06-01 19:48  tfidfsrc\Debug\InformationRetrieval.log
     文件      20703  2015-06-01 19:48  tfidfsrc\Debug\InformationRetrieval.obj
     文件    1993728  2015-06-01 19:48  tfidfsrc\Debug\InformationRetrieval.pdb
     文件        707  2015-06-01 19:48  tfidfsrc\Debug\InformationRetrieval.vcxprojResolveAssemblyReference.cache
     文件          0  2015-06-01 19:48  tfidfsrc\Debug\InformationRetrieval.write.1.tlog
     文件        236  2015-06-01 19:48  tfidfsrc\Debug\InformationRetrieval_manifest.rc
     文件          2  2015-06-01 19:48  tfidfsrc\Debug\li
     文件          2  2015-06-01 19:48  tfidfsrc\Debug\li
     文件          2  2015-06-01 19:48  tfidfsrc\Debug\li
     文件          2  2015-06-01 19:48  tfidfsrc\Debug\li
     文件          2  2015-06-01 19:48  tfidfsrc\Debug\li
     文件          2  2015-06-01 19:48  tfidfsrc\Debug\li
     文件       1586  2015-06-01 19:48  tfidfsrc\Debug\li
     文件       3214  2015-06-01 19:48  tfidfsrc\Debug\li
     文件        840  2015-06-01 19:48  tfidfsrc\Debug\li
     文件        450  2015-06-01 19:48  tfidfsrc\Debug\mt.command.1.tlog
     文件        330  2015-06-01 19:48  tfidfsrc\Debug\mt.read.1.tlog
     文件        330  2015-06-01 19:48  tfidfsrc\Debug\mt.write.1.tlog
     文件        630  2015-06-01 19:48  tfidfsrc\Debug\rc.command.1.tlog
     文件        302  2015-06-01 19:48  tfidfsrc\Debug\rc.read.1.tlog
     文件        310  2015-06-01 19:48  tfidfsrc\Debug\rc.write.1.tlog
............此處省略43個文件信息
評論
共有 條評論