資源簡介
自己動手寫網絡爬蟲完整版 源碼
代碼片段和文件信息
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
public?class?DownLoadFile?{
/**
?*?根據?url?和網頁類型生成需要保存的網頁的文件名?去除掉?url?中非文件名字符
?*/
public??String?getFileNameByUrl(String?urlString?contentType)
{
//remove?http://
url=url.substring(7);
//text/html類型
if(contentType.indexOf(“html“)!=-1)
{
url=?url.replaceAll(“[\\?/:*|<>\“]“?“_“)+“.html“;
return?url;
}
//如application/pdf類型
else
{
??????????return?url.replaceAll(“[\\?/:*|<>\“]“?“_“)+“.“+
??????????contentType.substring(contentType.lastIndexOf(“/“)+1);
}
}
/**
?*?保存網頁字節數組到本地文件?filePath?為要保存的文件的相對地址
?*/
private?void?saveToLocal(byte[]?data?String?filePath)?{
try?{
DataOutputStream?out?=?new?DataOutputStream(new?FileOutputStream(
new?File(filePath)));
for?(int?i?=?0;?i? out.write(data[i]);
out.flush();
out.close();
}?catch?(IOException?e)?{
e.printStackTrace();
}
}
/*?下載?url?指向的網頁?*/
public?String?downloadFile(String?url)?{
String?filePath?=?null;
/*?1.生成?HttpClinet?對象并設置參數?*/
HttpClient?httpClient?=?new?HttpClient();
//?設置?Http?連接超時?5s
httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(
5000);
/*?2.生成?GetMethod?對象并設置參數?*/
GetMethod?getMethod?=?new?GetMethod(url);
//?設置?get?請求超時?5s
getMethod.getParams().setParameter(HttpMethodParams.SO_TIMEOUT?5000);
//?設置請求重試處理
getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER
new?DefaultHttpMethodRetryHandler());
/*?3.執行?HTTP?GET?請求?*/
try?{
int?statusCode?=?httpClient.executeMethod(getMethod);
//?判斷訪問的狀態碼
if?(statusCode?!=?HttpStatus.SC_OK)?{
System.err.println(“Method?failed:?“
+?getMethod.getStatusLine());
filePath?=?null;
}
/*?4.處理?HTTP?響應內容?*/
byte[]?responseBody?=?getMethod.getResponseBody();//?讀取為字節數組
//?根據網頁?url?生成保存時的文件名
filePath?=?“temp\\“
+?getFileNameByUrl(url?getMethod.getResponseHeader(
“Content-Type“).getValue());
saveToLocal(responseBody?filePath);
}?catch?(HttpException?e)?{
//?發生致命的異常,可能是協議不對或者返回的內容有問題
System.out.println(“Please?check?your?provided?http?address!“);
e.printStackTrace();
}?catch?(IOException?e)?{
//?發生網絡異常
e.printStackTrace();
}?finally?{
//?釋放連接
getMethod.releaseConnection();
}
return?filePath;
}
}
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件       4478  2010-06-28 13:46  Chap04\Channelli
     文件      17812  2010-06-28 13:46  Chap04\ExtractContext.java
     文件        825  2010-04-24 11:08  Chap04\HtmlParser\.classpath
     文件        386  2010-05-04 08:52  Chap04\HtmlParser\.project
     文件        629  2010-04-19 16:37  Chap04\HtmlParser\.settings\org.eclipse.jdt.core.prefs
     文件      11334  2010-04-26 15:54  Chap04\HtmlParser\src\com\lietu\htmlParser\HtmlParser.java
     文件       8312  2006-09-23 14:25  Chap04\HtmlParser\src\doc-files\building.html
     文件       5074  2006-09-17 12:47  Chap04\HtmlParser\src\doc-files\overview.html
     文件       4896  2006-09-17 13:00  Chap04\HtmlParser\src\doc-files\using.html
     文件      26096  2006-09-17 07:24  Chap04\HtmlParser\src\org\htmlparser\Attribute.java
     文件      10617  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\beans\BeanyBaby.form
     文件      13209  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\beans\BeanyBaby.java
     文件      13762  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\beans\FilterBean.java
     文件       6547  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\beans\HTMLli
     文件       9020  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\beans\HTMLTextBean.java
     文件        213  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\beans\images\Chain16.gif
     文件        278  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\beans\images\Chain32.gif
     文件        140  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\beans\images\Knot16.gif
     文件        167  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\beans\images\Knot32.gif
     文件       8602  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\beans\li
     文件       2188  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\beans\package.html
     文件      23110  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\beans\StringBean.java
     文件       3485  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\filters\AndFilter.java
     文件      13731  2006-09-22 20:26  Chap04\HtmlParser\src\org\htmlparser\filters\CssSelectorNodeFilter.java
     文件       4224  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\filters\HasAttributeFilter.java
     文件       5213  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\filters\HasChildFilter.java
     文件       4821  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\filters\HasParentFilter.java
     文件       3556  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\filters\HasSiblingFilter.java
     文件       1825  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\filters\IsEqualFilter.java
     文件       3184  2006-09-17 07:25  Chap04\HtmlParser\src\org\htmlparser\filters\li
............此處省略2237個文件信息
評論
共有 條評論