-
大小: 35KB  文件類型: .zip  金幣: 2  下載: 0 次  發布日期: 2021-06-02
- 語言: Java
- 標簽: 主題爬蟲  畢業設計
資源簡介
網絡爬蟲是一種能夠自動采集互聯網信息的程序。網絡爬蟲不但能夠作為搜索引擎的采集器,而且可以用于特定信息的采集,根據某些特定的要求采集網站中的信息,如就業、租房信息等。本文設計并實現了一種基于主題的網絡爬蟲程序。網絡爬蟲采用何種搜索策略和如何評價當前頁面的主題相關度是基于主題的網絡爬蟲需要解決的關鍵問題。本文設計的網絡爬蟲采用廣度優先搜索,對url進行解析、去重等。并應用Java多線程,使爬蟲在抓取網頁的過程中更有效率。通常評價頁面相關度是采用基于內容評價的搜索策略,本文實現了三個常用的相關度評價算法,分別是基于網頁內容的相關度算法、基于網頁內容和標題的相關度算法、基于網頁內容和鏈接結構的相關度算法。

代碼片段和文件信息
package?theme;
import?java.io.*;
import?java.net.*;
import?java.util.*;
import?java.util.concurrent.ExecutorService;
import?java.util.concurrent.Executors;
import?java.util.concurrent.PriorityBlockingQueue;
import?java.util.regex.*;
import?javax.swing.JButton;
import?javax.swing.JLabel;
import?javax.swing.JOptionPane;
import?javax.swing.JTextPane;
import?javax.swing.text.BadLocationException;
import?javax.swing.text.SimpleAttributeSet;
import?javax.swing.text.styleConstants;
import?org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import?org.apache.commons.httpclient.HttpClient;
import?org.apache.commons.httpclient.HttpException;
import?org.apache.commons.httpclient.HttpStatus;
import?org.apache.commons.httpclient.methods.GetMethod;
import?org.apache.commons.httpclient.params.HttpMethodParams;
public?class?Crawler
{
private?String?title;
private?volatile?static?int?threadNum?=?0;
private?int?urlCount?=?1000;
private?volatile?int?visitedURL?=?0;
private?int?threadCount?=?5;
private?double?threshold?=?0.7;
private?String?startURL;
private?HashMap?keywords?=?new?HashMap();
private?PriorityBlockingQueue?waitforHandling?=?new?PriorityBlockingQueue();
private?HashSet?visited?=?new?HashSet();
private?HashMap?wanted?=?new?HashMap();
private?HashSet?noneRelevant?=?new?HashSet();
private?boolean?stop?=?false;
?
private?JTextPane?textpane;
private?JLabel?label;
private?JButton?button;
ExecutorService?threadPool?=?Executors.newCachedThreadPool();??
public?String?gettitle()
{
return?title;
}
public?void?settitle(String?title)
{
this.title?=?title;
}
public?int?getUrlCount()
{
return?urlCount;
}
public?void?setUrlCount(int?urlCount)
{
this.urlCount?=?urlCount;
}
public?int?getThreadCount()
{
return?threadCount;
}
public?Iterator?getKeyWords()?
{
return?keywords.keySet().iterator();
}
public?void?setThreshold(double?threshold)
{
this.threshold?=?threshold;
}
public?String?getStartURL()
{
return?startURL;
}
public?void?setStartURL(String?startURL)
{
this.startURL?=?startURL;
}
public?double?getThreshold()
{
return?threshold;
}
public?void?setThreadCount(int?threadCount)
{
this.threadCount?=?threadCount;
}
public?void?addKeyWord(String?word?int?count)?
{
keywords.put(word?count);
}
public?void?removeKeyWord(String?word)
{
if?(word?!=?null)
{
if?(keywords.containsKey(word))
{
keywords.remove(word);
}
}
}
public?void?removeAllKeyWords()
{
keywords.clear();
}
public?Crawler(String?title?String?start?JTextPane?textpane?JLabel?labelJButton?button)
{
this.title?=?title;
this.startURL?=?start;
this.textpane?=?textpane;
this.label?=?label;
this.butt
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2015-05-29 12:51  theme\
     文件         858  2015-04-30 10:18  theme\.classpath
     文件         381  2015-04-20 13:24  theme\.project
     目錄           0  2015-04-20 13:24  theme\.settings\
     文件         598  2015-04-20 13:24  theme\.settings\org.eclipse.jdt.core.prefs
     目錄           0  2015-05-26 22:43  theme\bin\
     目錄           0  2015-05-26 22:43  theme\bin\theme\
     文件         738  2015-05-26 22:43  theme\bin\theme\Crawler$1.class
     文件         676  2015-05-26 22:43  theme\bin\theme\Crawler$2.class
     文件         906  2015-05-26 22:43  theme\bin\theme\Crawler$3.class
     文件         619  2015-05-26 22:43  theme\bin\theme\Crawler$Task.class
     文件       13408  2015-05-26 22:43  theme\bin\theme\Crawler.class
     文件       10776  2015-05-26 22:43  theme\bin\theme\Crawlerfr
     文件        8304  2015-05-26 22:43  theme\bin\theme\Download.class
     文件         817  2015-05-26 22:43  theme\bin\theme\HtmlParserTool$1.class
     文件        2703  2015-05-26 22:43  theme\bin\theme\HtmlParserTool.class
     文件         645  2015-05-26 22:43  theme\bin\theme\HttpConstants.class
     文件         148  2015-05-26 22:43  theme\bin\theme\li
     文件         873  2015-05-26 22:43  theme\bin\theme\PriorityURL.class
     文件           0  2015-04-29 15:12  theme\result
     目錄           0  2015-04-20 13:24  theme\src\
     目錄           0  2015-04-25 18:24  theme\src\theme\
     文件       11856  2015-05-06 12:21  theme\src\theme\Crawler.java
     文件       10473  2015-04-30 10:20  theme\src\theme\Crawlerfr
     文件        7113  2015-05-06 12:21  theme\src\theme\Download.java
     文件        1780  2015-05-06 12:21  theme\src\theme\HtmlParserTool.java
     文件         606  2015-04-20 15:27  theme\src\theme\HttpConstants.java
     文件         160  2015-05-03 15:33  theme\src\theme\li
     文件         505  2015-05-26 22:43  theme\src\theme\PriorityURL.java
評論
共有 條評論