資源簡介
用 Java 語言實現的網絡爬蟲,並使用正則表達式提取 HTML 網頁的正文。

代碼片段和文件信息
import?java.io.File;
import?java.io.BufferedReader;
import?java.io.FileOutputStream;
import?java.io.InputStream;
import?java.io.InputStreamReader;
import?java.io.OutputStreamWriter;
import?java.io.PrintWriter;
import?java.net.URL;
import?java.net.URLConnection;
import?java.util.ArrayList;
import?java.util.regex.Matcher;
import?java.util.regex.Pattern;
import?java.util.Hashtable;
public?class?GetWeb?{
private?int?webDepth?=?2;//爬蟲深度
private?int?intThreadNum?=?10;//線程數
private?String?strHomePage?=?““;//主頁地址
private?String?myDomain;//域名
private?String?fPath?=?“web“;//儲存網頁文件的目錄名
private?String?dPath=“txt“;?
private?ArrayList?arrUrls?=?new?ArrayList();//存儲未處理URL
private?ArrayList?arrUrl?=?new?ArrayList();//存儲所有URL供建立索引
private?Hashtable?allUrls?=?new?Hashtable();//存儲所有URL的網頁號
private?Hashtable?deepUrls?=?new?Hashtable();//存儲所有URL深度
private?int?intWebIndex?=?0;//網頁對應文件下標,從0開始
private?String?charset?=?“GB2312“;
private?String?report?=?““;
private?long?startTime;
private?int?webSuccessed?=?0;
private?int?webFailed?=?0;
private?static?ParserHtmlList?phl;
/**
 * Creates a crawler for the given homepage, keeping the default crawl depth.
 *
 * @param s homepage URL; stored as-is without validation
 */
public GetWeb(String s)
{
    this.strHomePage = s;
}
/**
 * Creates a crawler for the given homepage with an explicit crawl depth.
 *
 * @param s homepage URL; stored as-is without validation
 * @param i crawl depth, replacing the field's default value
 */
public GetWeb(String s, int i)
{
    this.strHomePage = s;
    this.webDepth = i;
}
public?synchronized?void?addWebSuccessed()
{
???webSuccessed++;
}
public?synchronized?void?addWebFailed()
{
???webFailed++;
}
public?synchronized?void?addReport(String?s)
{
???try
???{
????report?+=?s;
????PrintWriter?pwReport?=?new?PrintWriter(new?FileOutputStream(“report.txt“));
????pwReport.println(report);
????pwReport.close();
???}
???catch(Exception?e)
???{
????System.out.println(“生成報告文件失敗!“);
???}
}
public?synchronized?String?getAUrl()
{
???String?tmpAUrl?=?arrUrls.get(0);//得到首頁面
???arrUrls.remove(0);//并將它刪除
???return?tmpAUrl;
}
public?synchronized?String?getUrl()
{
???String?tmpUrl?=?arrUrl.get(0);
???arrUrl.remove(0);
???return?tmpUrl;
}
public?synchronized?Integer?getIntWebIndex()
{
???intWebIndex++;
???return?intWebIndex;
}
/**
*?@param?args
*/
public?static?void?main(String[]?args)
{
???if?(args.length?==?0?||?args[0].equals(““))
???{
????System.out.println(“No?input!“);
????System.exit(1);
???}
???else?if(args.length?==?1)
???{
????GetWeb?gw?=?new?GetWeb(args[0]);
????phl=new?ParserHtmlList();
????gw.getWebByHomePage();
????
???
????
???}
???else
????{
????GetWeb?gw?=?new?GetWeb(args[0]Integer.parseInt(args[1]));
????phl=new?ParserHtmlList();
????gw.getWebByHomePage();
???}
}
public?void?getWebByHomePage()
{
???startTime?=?System.currentTimeMillis();
???this.myDomain?=?getDomain();
???if?(myDomain?==?null)
???{
????System.out.println(“Wrong?input!“);
????//System.exit(1);
????return;
???}
???System.out.println(“Homepage?=?“?+?strHomePage);
???addReport(“Homepage?=?“?+?strHomePage?+?“!\n“);
???System.out.println(“Domain?=?“?+?myDomain);
?
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件        386  2010-12-04 20:33  mainSpider\.project
     文件        226  2010-11-17 08:59  mainSpider\.classpath
     文件       9351  2010-12-13 22:20  mainSpider\GetWeb.java
     文件       5395  2010-12-07 17:18  mainSpider\ParserHtmlList.java
     文件       4973  2010-12-07 17:18  mainSpider\ParserHtmlList.class
     文件       9692  2010-12-13 22:20  mainSpider\GetWeb.class
     文件       1208  2010-12-13 22:20  mainSpider\GetWeb$Processer.class
     文件      18329  2010-12-13 22:16  mainSpider\report.txt
     文件      77590  2010-12-13 22:15  mainSpider\web\web0.htm
     文件      10610  2010-12-13 22:15  mainSpider\web\web3.htm
     文件       9242  2010-12-13 22:15  mainSpider\web\web4.htm
     文件      11796  2010-12-13 22:15  mainSpider\web\web9.htm
     文件      10465  2010-12-13 22:15  mainSpider\web\web7.htm
     文件      10470  2010-12-13 22:15  mainSpider\web\web6.htm
     文件      22696  2010-12-13 22:15  mainSpider\web\web12.htm
     文件      68916  2010-12-13 22:15  mainSpider\web\web8.htm
     文件      10247  2010-12-13 22:15  mainSpider\web\web14.htm
     文件     129728  2010-12-13 22:15  mainSpider\web\web17.htm
     文件       6247  2010-12-13 22:15  mainSpider\web\web13.htm
     文件      42967  2010-12-13 22:15  mainSpider\web\web11.htm
     文件       9908  2010-12-13 22:15  mainSpider\web\web15.htm
     文件     140187  2010-12-13 22:15  mainSpider\web\web18.htm
     文件      10605  2010-12-13 22:15  mainSpider\web\web19.htm
     文件      10360  2010-12-13 22:15  mainSpider\web\web5.htm
     文件     144060  2010-12-13 22:15  mainSpider\web\web16.htm
     文件        187  2010-12-13 22:15  mainSpider\web\web20.htm
     文件      10083  2010-12-13 22:15  mainSpider\web\web2.htm
     文件      50773  2010-12-13 22:15  mainSpider\web\web1.htm
     文件       2462  2010-12-13 22:15  mainSpider\web\web21.htm
     文件      69090  2010-12-13 22:15  mainSpider\web\web23.htm
............此處省略81個文件信息
- 上一篇:韓順平jsp+servlet源碼
- 下一篇:網絡爬蟲 Java實現原理
評論
共有 條評論