資源簡介
爬網頁內容,記錄爬過的網址,記錄爬的時間,提取URL

代碼片段和文件信息
/* NOTE(review): the names of the five angle-bracket headers were lost when
 * this listing was extracted. <stdio.h> and <string.h> are confirmed by the
 * calls visible in this file; the remaining three must be restored from the
 * original main.c. */
#include <stdio.h>   /* FILE, fopen, fgets, fgetpos, printf, scanf, perror */
#include <string.h>  /* strcpy, strcat, strncat, strcmp, strlen, memset */
/* #include <...>       header name lost in extraction */
/* #include <...>       header name lost in extraction */
/* #include <...>       header name lost in extraction */
#include "windows.h"
/* Single-bit masks, one per bit of a byte: 00000001, 00000010, ... 10000000. */
unsigned char mark[8] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};
/* State of the Bloom filter used for URL de-duplication. */
struct bloom_filter {
    unsigned n;           /* NOTE(review): presumably an element count; confirm against Bloom_init */
    unsigned size;        /* NOTE(review): presumably the capacity of `bits`; confirm against Bloom_init */
    unsigned char *bits;  /* array that stores the mark bits */
};
/* Functions needed for URL de-duplication. */
int Bloom_init(int num, struct bloom_filter *b);
int insert(char *s, struct bloom_filter *b);
int check(char *s, struct bloom_filter *b);
void Del(struct bloom_filter *b);
unsigned int Hash1(char *s);
unsigned int Hash2(char *s);
unsigned int Hash3(char *s);
//抓取網頁分析程序所需變量
char?Host[1024];
char?Dir[1024];
char?host[1024];
char?cop[1024]=““;
int?sock;
struct?hostent?*hp;
struct?sockaddr_in?server;
unsigned?short?port=80;
FILE*?file=NULL;//暫時的網頁內容存儲
FILE*?file1=NULL;//Url隊列文件
FILE*?page=NULL;//所有crawled的網頁內容
FILE*?result=NULL;//crawled結果
char??match[100];
char??url[1024];
int???flag;
fpos_t?filepos;
//socket和網頁抓取函數
int??Connecthost();
void?analyze();
void?sendrequest();
void?receive();
void?getlink();
struct?bloom_filter*?b;//bloom?filter?存儲變量
int?main(int?argcchar**?argv)
{
int?num=666666;
int?a;
int?number;
int?p;
WORD???wVersionRequested=MAKEWORD(20);???????
????WSADATA???wsaData;?????
????int??err=WSAStartup(wVersionRequested&wsaData);?????
????if(err!=0)?????
??WSACleanup();?
if((page=fopen(“pages.txt““w“))==NULL)?return;
if((result=fopen(“result.txt““w“))==NULL)?return;
if((file1=fopen(“Url.txt““a+“))==NULL)?return;
Bloom_init(num&b);
printf(“輸入要爬取的網頁數目:“);
scanf(“%d“&number);
p=1;
while(p<=number)
{
?fgets(host1023file1);
?
?fgetpos(file1?&filepos);?
?analyze();strcpy(copHost);
?memset(url01024);
?strcpy(urlHost);
?strcat(urlDir);
?
?insert(url&b);//如果添加成功
??
?if((flag=Connecthost())==1)//如果連接成功
?{
??printf(“正在爬取第%d個網頁%s\n“purl);
??????sendrequest();
? ??????receive();
??????getlink();
??p++;
?}
?
}
????fclose(file1);
fclose(page);
fclose(result);
Del(&b);
????WSACleanup();
}
int??Connecthost()
{
int?m=0;
if((sock=socket(AF_INETSOCK_STREAM0))<0)?
??return?m;
if((hp=gethostbyname(Host))==NULL)
??return?m;
????server.sin_family?=?AF_INET;??
????server.sin_port?=?htons(port);??
server.sin_addr=*((struct?in_addr?*)(hp->h_addr));
????memset(server.sin_zero08);
????
????if(connect(sock?(struct?sockaddr*)&serversizeof(server))<0)?
???return?m;
m=1;
return?m;
}
void?analyze()//URL分析函數
{???
int?j=0;
????int?i=0;
int?k=0;
????if(host[0]==‘/‘)
{???memset(Dir01024);
while(host[j]!=‘\0‘)
{Dir[j]=host[j];
?j++;}
Dir[j]=‘\0‘;
strcpy(Hostcop);
}
else
{???
memset(Host01024);
memset(Dir01024);
????????for(j=0;j<7;j++)
??match[j]=host[j];
match[j]=‘\0‘;
if((strcmp(match“http://“))!=0)
{
?for(j=0;j<8;j++)
??match[j]=host[j];
?match[j]=‘\0‘;
?if((strcmp(match“https://“))!=0)
?屬性????????????大小?????日期????時間???名稱
-----------?---------??----------?-----??----
?????文件???????4331??2010-10-13?15:08??Crawler\Crawler.dsp
?????文件????????522??2010-10-13?10:06??Crawler\Crawler.dsw
?????文件??????58368??2010-10-29?22:18??Crawler\Crawler.ncb
?????文件??????48640??2010-10-29?22:18??Crawler\Crawler.opt
?????文件???????4313??2010-10-29?22:00??Crawler\Crawler.plg
?????文件?????213052??2010-10-29?22:00??Crawler\Debug\Crawler.exe
?????文件?????286472??2010-10-29?22:00??Crawler\Debug\Crawler.ilk
?????文件????2850332??2010-10-29?18:36??Crawler\Debug\Crawler.pch
?????文件?????566272??2010-10-29?22:00??Crawler\Debug\Crawler.pdb
?????文件??????30467??2010-10-29?22:00??Crawler\Debug\main.obj
?????文件??????33792??2010-10-29?22:10??Crawler\Debug\vc60.idb
?????文件??????69632??2010-10-29?22:00??Crawler\Debug\vc60.pdb
?????文件????????996??2010-10-29?22:12??Crawler\file.txt
?????文件???????7913??2010-10-29?21:02??Crawler\main.c
?????文件????2878026??2010-10-29?22:12??Crawler\pages.txt
?????文件?????499719??2010-10-29?22:12??Crawler\result.txt
?????文件?????????46??2010-10-29?22:16??Crawler\Url.txt
?????目錄??????????0??2010-10-29?22:18??Crawler\Debug
?????目錄??????????0??2010-10-29?22:18??Crawler
-----------?---------??----------?-----??----
??????????????7552893????????????????????19
評論
共有 條評論