-
大小: 0.06M | 文件類型: .rar | 金幣: 1 | 下載: 0 次 | 發布日期: 2021-02-01
- 標簽:
資源簡介
《WEB數據挖掘與知識發現》試驗報告實現程序
【核心代碼】
/************************************************
 * "WEB Data Mining and Knowledge Discovery" lab report program
 * Purpose: automatically formalize text documents with TF-IDF (topic 6)
 * Date:    2008.2.28
 ************************************************/
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <malloc.h>
#include <math.h>

#define FNUM 20       /* total number of input documents */
#define WORD_LEN 20   /* capacity of Ttree.data, terminating NUL included */

/*
 * One node of a binary search tree of words.
 * The same node type is used for the per-document word tree (ordered by
 * `data`) and for the per-document output tree (ordered by `weight`).
 */
struct Ttree
{
    char data[WORD_LEN];    /* the word, lower-cased ASCII letters only */
    double weight;          /* TF-IDF weight, filled in by InThread() */
    double num;             /* occurrences of this word in its document */
    double max;             /* total word count of the document (root node only) */
    double n;               /* number of documents that contain this word */
    struct Ttree *lchild;   /* left child */
    struct Ttree *rchild;   /* right child */
};
typedef struct Ttree Ttree; /* lets the bare name `Ttree` be used from C as well as C++ */

struct Ttree *rootW = NULL;                               /* root of the weight-ordered tree built by weight() */
struct Ttree *mtree = NULL, *ntree = NULL, *rtree = NULL; /* working pointers of weight() (added March 2) */
FILE *fp = NULL;                                          /* result file "mm.txt"; opened by main() */

/* True when ch is an ASCII letter (locale independent, like the original range tests). */
static int isAsciiLetter(int ch)
{
    return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
}

/* Maps an ASCII upper-case letter to lower case; any other value is returned unchanged. */
static int toAsciiLower(int ch)
{
    return (ch >= 'A' && ch <= 'Z') ? ch + ('a' - 'A') : ch;
}

/*
 * Reads the next word (maximal run of ASCII letters) from `in` into `word`,
 * lower-cased and NUL-terminated.  Letters beyond WORD_LEN-1 are consumed but
 * dropped so the buffer can never overflow.
 * Returns 1 when a word was read, 0 when end of file was reached first.
 */
static int readWord(FILE *in, char word[WORD_LEN])
{
    int len = 0;
    int ch; /* int, not char: fgetc() must be able to report EOF distinctly */

    while ((ch = fgetc(in)) != EOF) {
        if (isAsciiLetter(ch)) {
            if (len < WORD_LEN - 1)
                word[len++] = (char)toAsciiLower(ch);
        } else if (len > 0) {
            break; /* delimiter after at least one letter ends the word */
        }
    }
    word[len] = '\0';
    return len > 0;
}

/* Allocates a leaf node for `word` with num = n = 1; returns NULL when out of memory. */
static Ttree *newTtreeNode(const char *word)
{
    /* the cast keeps this translation unit valid as C++ too */
    Ttree *node = (Ttree *)malloc(sizeof *node);
    if (node == NULL)
        return NULL;

    /* strncpy zero-pads, so the whole array is initialised */
    strncpy(node->data, word, WORD_LEN - 1);
    node->data[WORD_LEN - 1] = '\0';
    node->weight = 0;
    node->num = 1;
    node->max = 0;
    node->n = 1;
    node->lchild = NULL;
    node->rchild = NULL;
    return node;
}

/* Releases every node of the tree rooted at `node` (NULL is accepted). */
static void freeTtree(Ttree *node)
{
    if (node == NULL)
        return;
    freeTtree(node->lchild);
    freeTtree(node->rchild);
    free(node);
}

/*
 * Builds the word tree of one document: a binary search tree keyed by word
 * that records how often each word occurs; the root's `max` holds the total
 * number of words read.
 *
 * root: ignored on entry (callers historically passed an uninitialised
 *       pointer); kept only for signature compatibility.
 * fp:   open input stream, or NULL.
 *
 * Returns the root of the new tree, or NULL when `fp` is NULL, the stream
 * contains no word, or memory runs out.
 */
Ttree *createTtree(Ttree *root, FILE *fp)
{
    char word[WORD_LEN];

    if (fp == NULL) {
        printf("\nCannot open file strike any key exit!");
        return NULL;
    }

    root = NULL;
    while (readWord(fp, word)) {
        Ttree *cur;

        if (root == NULL) {
            root = newTtreeNode(word);
            if (root == NULL) {
                fprintf(stderr, "createTtree: out of memory\n");
                return NULL;
            }
            root->max = 1;
            continue;
        }

        root->max++;
        cur = root;
        for (;;) {
            int cmp = strcmp(word, cur->data);
            Ttree **slot;

            if (cmp == 0) { /* word already present: just count it */
                cur->num++;
                break;
            }
            slot = (cmp < 0) ? &cur->lchild : &cur->rchild;
            if (*slot == NULL) {
                *slot = newTtreeNode(word);
                if (*slot == NULL) {
                    fprintf(stderr, "createTtree: out of memory\n");
                    freeTtree(root);
                    return NULL;
                }
                break;
            }
            cur = *slot;
        }
    }
    return root;
}

/*
 * Binary-tree lookup used to count in how many documents a word occurs.
 * Searches the word tree `rootx` for the word stored in node `rooty`; when it
 * is found, `rooty->n` is incremented.
 * Returns the matching node of `rootx`, or NULL when the word is absent.
 */
Ttree *SearchBinTtree(Ttree *rootx, Ttree *rooty)
{
    while (rootx != NULL) {
        int cmp = strcmp(rootx->data, rooty->data);
        if (cmp == 0) {
            rooty->n++;
            return rootx;
        }
        rootx = (cmp > 0) ? rootx->lchild : rootx->rchild;
    }
    return NULL;
}

/*
 * Document-frequency pass: visits every node of tree `rooty` in order and
 * looks its word up in tree `rootx`, bumping the node's `n` on a hit.
 */
void InMidThread(Ttree *rooty, Ttree *rootx)
{
    if (rooty == NULL)
        return;
    InMidThread(rooty->lchild, rootx);
    SearchBinTtree(rootx, rooty);
    InMidThread(rooty->rchild, rootx);
}

/*
 * Weight pass: for every node of `root` stores
 *     weight = (num / Mroot->max) * log(FNUM / n)
 * where `Mroot` is the document's root node (it carries the total word count).
 */
void InThread(Ttree *root, Ttree *Mroot)
{
    if (root == NULL)
        return;
    InThread(root->lchild, Mroot);
    root->weight = (root->num / Mroot->max) * log(FNUM / root->n);
    InThread(root->rchild, Mroot);
}

/*
 * Sorts by weight: walks `root` in order and inserts a copy of every node
 * into the global tree `rootW`, with heavier nodes to the left so that an
 * in-order walk of `rootW` lists words by descending weight.
 * The caller must set `rtree` and `rootW` to NULL before starting a new
 * document.  (Revised March 2 to fix branches that were lost while building
 * the tree.)
 */
void weight(Ttree *root)
{
    if (root == NULL)
        return;
    weight(root->lchild);

    ntree = (Ttree *)malloc(sizeof *ntree);
    if (ntree == NULL) {
        fprintf(stderr, "weight: out of memory, \"%s\" skipped\n", root->data);
    } else {
        *ntree = *root;
        ntree->lchild = NULL;
        ntree->rchild = NULL;

        if (rtree == NULL) {
            /* first node of this document becomes the root */
            rootW = rtree = ntree;
            mtree = NULL;
        } else {
            /* every insertion restarts from the root */
            mtree = rootW;
            while (mtree != NULL) {
                Ttree **slot = (ntree->weight > mtree->weight) ? &mtree->lchild
                                                               : &mtree->rchild;
                if (*slot == NULL) {
                    *slot = ntree;
                    mtree = NULL; /* inserted: leave the loop */
                } else {
                    mtree = *slot;
                }
            }
        }
    }

    weight(root->rchild);
}

/*
 * Tells whether `data` is listed in the term file "text\vo.txt" (one term per
 * line; only ASCII letters of a line are significant and they are compared
 * lower-cased).  (Added March 2.)
 * Returns true when the term is listed, and also when the term file cannot be
 * opened -- in that case every word is to be reported.
 */
bool in(char data[20])
{
    char term[WORD_LEN];
    int len = 0;
    bool found = false;
    FILE *vocab = fopen("text\\vo.txt", "r");

    if (vocab == NULL)
        return true;

    for (;;) {
        int ch = fgetc(vocab);

        if (ch == '\n' || ch == EOF) {
            term[len] = '\0';
            if (len > 0 && strcmp(term, data) == 0) {
                found = true;
                break;
            }
            if (ch == EOF)
                break;
            len = 0;
        } else if (isAsciiLetter(ch)) {
            if (len < WORD_LEN - 1)
                term[len++] = (char)toAsciiLower(ch);
        }
        /* any other character (space, digit, '\r', ...) is skipped; the
           original never advanced past it and spun forever */
    }

    fclose(vocab);
    return found;
}

/*
 * Output pass: walks `root` in order and writes one line per word that passes
 * in() to the global result stream `fp`: word, weight, occurrence count and
 * document frequency.
 */
void ThreadWeight(Ttree *root)
{
    if (root == NULL)
        return;
    ThreadWeight(root->lchild);
    if (in(root->data))
        fprintf(fp, "%30s\t%10.6f\t%6d\t%6d\n", root->data, root->weight,
                (int)(root->num), (int)(root->n));
    ThreadWeight(root->rchild);
}

/*
 * Reads text\P00.txt .. text\P19.txt, computes the TF-IDF weight of every
 * word per document and writes the result to mm.txt.
 * Any command-line argument prints the help text instead.
 *
 * Exit status (unchanged from the original program): 0 after printing help,
 * 1 on success, -1 on failure.
 */
int main(int argc, char *argv[])
{
    static const char failureMsg[] =
        "Error occurring in the course of geting mail!\nProgram exit exceptionally!\n";
    Ttree *root[FNUM];
    int i, j;
    int loaded = 1;

    if (argc > 1 && argv[1] != NULL) {
        printf("Help!\n\nNote:Please put txts into the folder \"text\",\nand edit terms in text\\vo.txt.\nRun InformationRetrieval.exe without parameter,you will get the result in mm.txt\nIf you deltet vo.txt,the result will be all terms.\n");
        return 0;
    }

    /* Opened here rather than in a global initializer so that failure can be
       reported and asking for help does not truncate an earlier result. */
    fp = fopen("mm.txt", "w");
    if (fp == NULL) {
        perror("mm.txt");
        printf("%s", failureMsg);
        return -1;
    }

    for (i = 0; i < FNUM; i++)
        root[i] = NULL;

    /* read the documents and count word frequencies */
    for (i = 0; i < FNUM && loaded; i++) {
        FILE *fr;
        char rFileName[64]; /* ample: "text\Pnn.txt" needs 13 bytes */

        sprintf(rFileName, "text\\P%02d.txt", i);
        printf("%s\n", rFileName);
        fr = fopen(rFileName, "r");
        root[i] = createTtree(NULL, fr);
        if (fr != NULL)
            fclose(fr);
        if (root[i] == NULL)
            loaded = 0; /* missing file, no words, or out of memory */
    }

    if (!loaded) {
        for (i = 0; i < FNUM; i++)
            freeTtree(root[i]);
        fclose(fp);
        fp = NULL;
        printf("%s", failureMsg);
        return -1;
    }

    /* compare every document with every other one to obtain, for each word,
       the number of documents it occurs in */
    for (i = 0; i < FNUM; i++) {
        for (j = 0; j < FNUM; j++) {
            if (j != i)
                InMidThread(root[i], root[j]);
        }
    }

    fprintf(fp, "%s\n\n", "注:詞出現次數--詞在該文檔中出現次數, 文檔頻率--詞在幾篇文檔中出現。");
    fprintf(fp, "%17s\t%6s\t%10s\t%6s%9s\n", "總詞數", "詞匯", "權值", "詞出現次數", "文檔頻率");

    /* compute the weights, sort by weight and write them out */
    for (i = 0; i < FNUM; i++) {
        rtree = rootW = NULL;
        InThread(root[i], root[i]);
        fprintf(fp, "第%d篇文檔%6d\n", i, (int)(root[i]->max));
        weight(root[i]);
        ThreadWeight(rootW);
        freeTtree(rootW); /* the sorted copy is only needed for this document */
        rtree = rootW = mtree = ntree = NULL;
    }

    for (i = 0; i < FNUM; i++)
        freeTtree(root[i]);

    if (fclose(fp) != 0) {
        fp = NULL;
        perror("mm.txt");
        printf("%s", failureMsg);
        return -1;
    }
    fp = NULL;

    return 1; /* historical success code of this program; kept for compatibility */
}
代碼片段和文件信息
/************************************************
* 《WEB數據挖掘與知識發現》試驗報告實現程序     *
* 功能:采用TFIDF自動對文本進行形式化(題目6)  *
*                                              *
* 時間:2008.2.28                               *
************************************************/
#include <stdio.h>
#include <string.h>
#include <malloc.h>
#include <math.h>
#define FNUM  20   //使用的文件總個數
/* One node of a binary tree of index terms. */
struct Ttree
{
    char data[20];          // the index term text
    double weight;          // weight of the term
    double num;             // number of times the index term occurs in one document
    double max;             // total word count of one document
    double n;               // number of documents the index term occurs in
    struct Ttree *lchild;   // left child
    struct Ttree *rchild;   // right child
};
struct Ttree *rootW = NULL;
struct Ttree *mtree = NULL, *ntree = NULL, *rtree = NULL;  // intermediate variables of the weight-sorting function (added March 2)
FILE *fp = fopen("mm.txt", "w");  // stream opened for writing "mm.txt"
//創建二叉樹用來存放單詞,以及該詞在文檔中出現的次數
Ttree?*createTtree(Ttree?*rootFILE?*fp){
in
?屬性????????????大小?????日期????時間???名稱
-----------?---------??----------?-----??----
?????文件???????8225??2008-06-25?11:29??tfidfsrc\InformationRetrieval.cpp
?????文件???????3569??2008-03-02?15:46??tfidfsrc\InformationRetrieval.dsp
?????文件????????563??2008-03-02?15:48??tfidfsrc\InformationRetrieval.dsw
?????文件??????53248??2008-03-04?15:42??tfidfsrc\InformationRetrieval.exe
?????文件???????2531??2008-06-25?11:30??tfidfsrc\mm.txt
?????文件???????4221??2008-02-27?23:39??tfidfsrc\text\P00.txt
?????文件???????4546??2008-02-27?23:40??tfidfsrc\text\P01.txt
?????文件???????3685??2008-02-28?14:49??tfidfsrc\text\P02.txt
?????文件???????2465??2008-02-28?14:50??tfidfsrc\text\P03.txt
?????文件???????3479??2008-02-28?14:51??tfidfsrc\text\P04.txt
?????文件???????3973??2008-02-28?14:55??tfidfsrc\text\P05.txt
?????文件???????1898??2008-02-28?14:56??tfidfsrc\text\P06.txt
?????文件???????4461??2008-02-28?14:56??tfidfsrc\text\P07.txt
?????文件???????3412??2008-02-28?14:57??tfidfsrc\text\P08.txt
?????文件???????3942??2008-02-28?14:58??tfidfsrc\text\P09.txt
?????文件???????3487??2008-02-28?14:59??tfidfsrc\text\P10.txt
?????文件???????4055??2008-02-28?15:00??tfidfsrc\text\P11.txt
?????文件???????4666??2008-02-28?15:01??tfidfsrc\text\P12.txt
?????文件???????4986??2008-02-28?15:01??tfidfsrc\text\P13.txt
?????文件???????4903??2008-02-28?15:02??tfidfsrc\text\P14.txt
?????文件???????4020??2008-02-28?15:03??tfidfsrc\text\P15.txt
?????文件???????3599??2008-02-28?15:04??tfidfsrc\text\P16.txt
?????文件???????4565??2008-02-28?15:04??tfidfsrc\text\P17.txt
?????文件???????4915??2008-02-28?15:05??tfidfsrc\text\P18.txt
?????文件???????3785??2008-02-28?15:07??tfidfsrc\text\P19.txt
?????文件???????3300??2008-02-28?15:09??tfidfsrc\text\P20.txt
?????文件?????????24??2008-03-03?20:04??tfidfsrc\text\vo.txt
?????目錄??????????0??2008-03-04?14:49??tfidfsrc\text
?????目錄??????????0??2008-06-25?11:30??tfidfsrc
-----------?---------??----------?-----??----
............此處省略2個文件信息
- 上一篇:管道使用demo RunDosCommand
- 下一篇:智商測試(C++坑人版)
評論
共有 條評論