資源簡介
網絡爬蟲,輕松獲取網絡資源!網絡爬蟲為搜索引擎從萬維網下載網頁。一般分為傳統爬蟲和聚焦爬蟲。
代碼片段和文件信息
import?java.io.IOException;
import?java.io.InputStream;
import?java.io.InputStreamReader;
import?java.io.Reader;
import?java.net.MalformedURLException;
import?java.net.URL;
import?java.net.URLConnection;
import?java.util.ArrayList;
import?java.util.Date;
import?java.util.List;
import?javax.swing.text.MutableAttributeSet;
import?javax.swing.text.html.HTML;
import?javax.swing.text.html.HTMLEditorKit;
public?class?Crawler?{
private?List?urlWaiting?=?new?ArrayList(); //A?list?of?URLs?that?are?waiting?to?be?processed
private?List?urlProcessed?=?new?ArrayList(); //A?list?of?URLs?that?were?processed
private?List?urlError?=?new?ArrayList(); //A?list?of?URLs?that?resulted?in?an?error
private?int?numFindUrl?=?0; //find?the?number?of?url
/**
 * Creates a crawler. The constructor itself performs no work; all state
 * comes from the field initializers.
 */
public Crawler() {
}
/**
?*?start?crawling
?*/
public?void?begin()?{
while?(!urlWaiting.isEmpty())?{
processURL(urlWaiting.remove(0));
}
log(“finish?crawling“);
log(“the?number?of?urls?that?were?found:“?+?numFindUrl);
log(“the?number?of?urls?that?were?processed:“?+?urlProcessed.size());
log(“the?number?of?urls?that?resulted?in?an?error:“?+?urlError.size());
}
/**
?*?Called?internally?to?process?a?URL
?*?
?*?@param?strUrl
?*????????????The?URL?to?be?processed.
?*/
public?void?processURL(String?strUrl)?{
URL?url?=?null;
try?{
url?=?new?URL(strUrl);
log(“Processing:?“?+?url);
//?get?the?URL‘s?contents
URLConnection?connection?=?url.openConnection();
connection.setRequestProperty(“User-Agent“?“Test?Crawler?for?Course?NIR“);
if?((connection.getContentType()?!=?null)
&&?!connection.getContentType().toLowerCase()
.startsWith(“text/“))?{
log(“Not?processing?because?content?type?is:?“
+?connection.getContentType());
return;
}
//?read?the?URL
InputStream?is?=?connection.getInputStream();
Reader?r?=?new?InputStreamReader(is);
//?parse?the?URL
HTMLEditorKit.Parser?parse?=?new?HTMLParse().getParser();
parse.parse(r?new?Parser(url)?true);
}?catch?(IOException?e)?{
urlError
評論
共有 條評論