-
大小: 29.21MB 文件類型: .rar 金幣: 1 下載: 0 次 發布日期: 2023-07-08
- 語言: Java
- 標簽: Jsoup+Java??
資源簡(jiǎn)介
經過測試,可以抓取一個完整的網站,包括網站的圖片、css、js等。同時根據網站目錄,在本地生成相同目錄。使用Jsoup+Java。下載之後,可以直接運行。
代碼片段和文件信息
package?cn.yuping.haha;
import?cn.yuping.haha.util.FileUtils;
import?org.jsoup.Jsoup;
import?org.jsoup.nodes.Document;
import?org.jsoup.nodes.Element;
import?org.jsoup.select.Elements;
import?javax.servlet.ServletException;
import?javax.servlet.annotation.WebServlet;
import?javax.servlet.http.HttpServlet;
import?javax.servlet.http.HttpServletRequest;
import?javax.servlet.http.HttpServletResponse;
import?javax.servlet.http.HttpSession;
import?java.io.File;
import?java.io.IOException;
import?java.util.*;
@WebServlet(name?=?“crawlWebServlet“?urlPatterns?=?“/crawlWeb“)
public?class?CrawlWebSite?extends?HttpServlet?{
?
private?final?static?String?charSet?=?“utf-8“;
private??static?String?rootDir;
private??static?String?rootUrl; //“http://www.17sucai.com/preview/216556/2016-02-25/%E6%A9%99%E8%89%B2%E5%95%86%E5%9C%BAwap/“
private?final?static?int?timeOut?=?30000;
/**網(wǎng)站上相對(duì)地址與絕對(duì)地址的映射*/
private?static?Map?absRelativeUrlMap?=?new?HashMap();
/**網(wǎng)站上的url與最終本地使用的url映射*/
private?static?Map?urlmapMap?=?new?HashMap();
/**網(wǎng)站上的cssjs*/
private?static?Map?cssjsmapMap?=?new?HashMap();
private?static?List?allFiles?=?new?ArrayList();
private?static?Set?imgList?=?new?HashSet();
/**過(guò)濾掉不爬取的內(nèi)容格式*/
// public?static?final?String?filterExtArray?[]??=?{“rar““zip““bmp““dib““gif““jfif““jpe““jpeg““jpg““png““tif““tiff““ico““pdf““doc““docx““xls““xlsx“};
public?static?final?String?filterExtArray?[]??=?{“rar““zip““bmp““dib““jfif““jpe““jpeg““tif““tiff““ico““pdf““doc““docx““xls““xlsx“};
@Override
public?void?doPost(HttpServletRequest?request?HttpServletResponse?response)
throws?ServletException?IOException?{
?System.out.println(“start....“);
?request.setCharacterEncoding(“UTF-8“);
?rootUrl?=?request.getParameter(“rootUrl“);
?rootDir?=?request.getParameter(“rootDir“);
HttpSession?session?=?request.getSession();
if(rootUrl?!=?null?&&?!rootUrl.equals(““)?&&?rootDir?!=?null?&&?!rootDir.equals(““)){
if(rootUrl.contains(“.html“)?||?rootUrl.contains(“.jsp“)?||?rootUrl.contains(“.htm“)){
session.setAttribute(“msg““網(wǎng)站抓取失敗網(wǎng)址輸入有誤,不能有‘.html、.jsp’等結(jié)尾“);
response.sendRedirect(“index.jsp“);
return;
}
if(!rootUrl.endsWith(“/“)){
rootUrl?=?rootUrl.concat(“/“);
}
//獲取所有urls
getSubUrls(rootUrlrootUrl);
//保存文件
for(String?absUrl?:?absRelativeUrlMap.keySet()){
String?content;
try?{
content?=?readContent(absUrl);
}?catch?(IOException?e)?{
System.err.println(“url3=“+absUrl+“?頁(yè)面無(wú)效!“);
continue;
}
if(!absUrl.startsWith(rootUrl)){
continue;
}
String?filePath?=?absUrl.substring(rootUrl.length());
filePath?=?FileUtils.parseFilePath(filePath);
//urlmapMap.put(absRelativeUrlMap.get(absUrl)?filePath);//脫機(jī)運(yùn)行和在服務(wù)器運(yùn)行有所不同。。。
urlmapMap.put(absRela
?屬性????????????大小?????日期????時間???名稱
-----------?---------??----------?-----??----
?????文件?????????36??2017-12-13?10:09??haha\.git\COMMIT_EDITMSG
?????文件????????321??2017-12-05?10:07??haha\.git\config
?????文件?????????73??2017-12-05?10:07??haha\.git\desc
?????文件?????????89??2017-12-13?10:16??haha\.git\FETCH_HEAD
?????文件?????????23??2017-12-05?10:07??haha\.git\HEAD
?????文件????????478??2017-12-05?10:07??haha\.git\hooks\applypatch-msg.sample
?????文件????????896??2017-12-05?10:07??haha\.git\hooks\commit-msg.sample
?????文件????????189??2017-12-05?10:07??haha\.git\hooks\post-update.sample
?????文件????????424??2017-12-05?10:07??haha\.git\hooks\pre-applypatch.sample
?????文件???????1642??2017-12-05?10:07??haha\.git\hooks\pre-commit.sample
?????文件???????1348??2017-12-05?10:07??haha\.git\hooks\pre-push.sample
?????文件???????4951??2017-12-05?10:07??haha\.git\hooks\pre-reba
?????文件???????1239??2017-12-05?10:07??haha\.git\hooks\prepare-commit-msg.sample
?????文件???????3610??2017-12-05?10:07??haha\.git\hooks\update.sample
?????文件???????1888??2017-12-26?10:25??haha\.git\index
?????文件????????240??2017-12-05?10:07??haha\.git\info\exclude
?????文件???????1013??2017-12-13?10:09??haha\.git\logs\HEAD
?????文件???????1013??2017-12-13?10:09??haha\.git\logs\refs\heads\master
?????文件????????176??2017-12-05?10:07??haha\.git\logs\refs\remotes\origin\HEAD
?????文件????????784??2017-12-13?10:09??haha\.git\logs\refs\remotes\origin\master
?????文件????????119??2017-12-05?10:07??haha\.git\ob
?????文件???????2149??2017-12-20?09:44??haha\.git\ob
?????文件?????????65??2017-12-13?10:09??haha\.git\ob
?????文件????????521??2017-12-05?10:07??haha\.git\ob
?????文件????????266??2017-12-13?09:26??haha\.git\ob
?????文件????????987??2017-12-13?09:26??haha\.git\ob
?????文件??????10484??2017-12-13?09:26??haha\.git\ob
?????文件????????182??2017-12-13?09:27??haha\.git\ob
?????文件???????1478??2017-12-13?09:26??haha\.git\ob
?????文件??????15794??2017-12-13?09:26??haha\.git\ob
............此處省略402個文件信息
評論
共有 條評論