資源簡介
在 Linux 環境下運行的開源網絡爬蟲系統，有具體使用步驟

代碼片段和文件信息
/* http.cpp
 * This file supports TCP socket operations, such as gethostbyname(),
 * creating sockets, etc.
 */
#include?“http.h“
extern?setSet;
extern?URL?url;?
extern?queueque;
extern?struct?epoll_event?events[31];
extern?struct?timeval?t_st?t_ed;
extern?int?epfd;
extern?int?cnt;
extern?int?sum_byte;
extern?int?pending;
extern?int?MAX_URL;
extern?bool?is_first_url;
extern?double?time_used;
extern?pthread_mutex_t?quelock;
extern?pthread_mutex_t?connlock;
struct?hostent?*Host;
/*?GetHostByName()
?*?by?call?gethostbyname()?we?can?get?host?infomation
?*?through?domain
?*/
int?GetHostByName(const?string&?hname)
{
if((Host?=?gethostbyname(hname.c_str()))?==?NULL)
{
return?-1;
}
return?1;
}
/* SetNoblocking()
 * Switch a descriptor to non-blocking I/O by adding O_NONBLOCK to its
 * file status flags.
 *
 * sockfd: descriptor to modify.
 * Returns 0 on success, -1 if either fcntl() call fails.
 */
int SetNoblocking(const int& sockfd)
{
    // Read the current flags first so that only O_NONBLOCK is added and
    // every other status flag is preserved.
    int opts = fcntl(sockfd, F_GETFL);
    if (opts < 0)
    {
        return -1;
    }
    // set nonblocking
    opts |= O_NONBLOCK;
    if (fcntl(sockfd, F_SETFL, opts) < 0)
    {
        return -1;
    }
    // Explicit success value: flowing off the end of a non-void function
    // is undefined behaviour.
    return 0;
}
/*?ConnectWeb()
?*?this?function?used?to?create?a?new?socket?fd?and
?*?then?connect?to?host
?*/
int?ConnectWeb(int&?sockfd)
{
struct?sockaddr_in?server_addr;
//?create?socket
if((sockfd?=?socket(PF_INET?SOCK_STREAM?0))?==?-1)?
{
return?-1;
}
#ifdef?DEBUG
puts(“create?socket?ok“);
#endif
//?initialize?server_addr
bzero(&server_addr?sizeof(server_addr));
server_addr.sin_family?=?AF_INET;
server_addr.sin_port?=?htons(80);
server_addr.sin_addr?=?*((struct?in_addr?*)Host->h_addr);
//?connect?to?host
if(connect(sockfd?(struct?sockaddr?*)(&server_addr)?sizeof(struct?sockaddr))?==?-1)
{
perror(“connect?error“);
return?-1;
}
#ifdef?DEBUG
puts(“connect?ok“);
#endif
pthread_mutex_lock(&connlock);
pending++;
pthread_mutex_unlock(&connlock);
}
/*?SendRequest()
?*?this?function?used?to?send?request?to?host.?tell?host
?*?what?you?want?to?do.
?*/
int?SendRequest(int?sockfd?URL&?url_t)
{
//?initialize?request
string?request;
string?Uagent?=?UAGENT?Conn?=?CONN?Accept?=?ACCEPT;
request?=?“GET?/“?+?url_t.GetFile()?+?“?HTTP/1.1\r\nHost:?“?+?url_t.GetHost()?+?“\r\nUser-Agent:?“?+?
??Uagent?+?“\r\nAccept:?“?+?Accept?+?“\r\nConnection:?“?+?Conn?+?“\r\n\r\n“;
??
//?write(send?request)
int?d?total?=?request.length()?send?=?0;
while(send? {
if((d?=?write(sockfd?request.c_str()+send?total-send))?0)?
{
return?-1;
}
send?+=?d;
}
#ifdef?DEBUG
puts(“write?in?socket?ok“);
#endif
}
/* Calc_Time_Sec()
 * Compute the difference between two times, in seconds. Both times are
 * given as struct timeval:
 *     struct timeval
 *     {
 *         __time_t tv_sec;         // Seconds.
 *         __suseconds_t tv_usec;   // Microseconds.
 *     };
 *
 * st: start time.
 * ed: end time.
 * Returns ed - st in seconds as a double; the result is negative when ed
 * is earlier than st.
 */
double Calc_Time_Sec(struct timeval st, struct timeval ed)
{
    // The two fields are differenced separately; the microsecond part may
    // be negative and is then subtracted from the whole seconds.
    double sec = ed.tv_sec - st.tv_sec;
    double usec = ed.tv_usec - st.tv_usec;
    return sec + usec / 1000000;
}
/*?GetResponse()
?*?receive?the?data?from?host.?which?will?return?page?information.
?*?if?ok(get?the?Web?page?successfull
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2012-08-27 22:06  linux環境下的網絡爬蟲\
     目錄           0  2012-08-27 22:08  linux環境下的網絡爬蟲\Net-Spider\
     文件         610  2012-05-02 22:22  linux環境下的網絡爬蟲\Net-Spider\config.h
     文件        7170  2012-04-27 23:13  linux環境下的網絡爬蟲\Net-Spider\http.cpp
     文件         960  2012-04-22 23:06  linux環境下的網絡爬蟲\Net-Spider\http.h
     文件       22288  2012-05-03 09:04  linux環境下的網絡爬蟲\Net-Spider\http.o
     文件         229  2012-04-25 21:35  linux環境下的網絡爬蟲\Net-Spider\Makefile
     目錄           0  2012-05-03 09:04  linux環境下的網絡爬蟲\Net-Spider\Pages\
     文件        1493  2012-05-02 22:25  linux環境下的網絡爬蟲\Net-Spider\README
     文件       63443  2012-05-03 09:04  linux環境下的網絡爬蟲\Net-Spider\spider
     文件        7649  2012-05-02 22:48  linux環境下的網絡爬蟲\Net-Spider\spider.cpp
     文件       79164  2012-05-03 09:04  linux環境下的網絡爬蟲\Net-Spider\spider.o
     文件        8845  2012-04-20 22:30  linux環境下的網絡爬蟲\Net-Spider\tags
     文件        4096  2012-04-23 23:32  linux環境下的網絡爬蟲\Net-Spider\web.cpp
     文件        1096  2012-04-18 22:09  linux環境下的網絡爬蟲\Net-Spider\web.h
     文件       45684  2012-05-03 09:04  linux環境下的網絡爬蟲\Net-Spider\web.o
     文件     7703627  2012-08-27 22:08  linux環境下的網絡爬蟲\Net-Spider\基于Linux C、C++平臺的網絡爬蟲的研究與實現.doc
評論
共有 條評論