資源簡介
使用jsoup,抓取整個網站,保存在本地。包括網站的“css、js、圖片、頁面”,都會保存在本地,並且保存的路徑與網站上的路徑相同。經過測試,一般的基礎網站都可以抓取。
代碼片段和文件信息
package?cn.yuping.haha;
import?cn.yuping.haha.util.FileUtils;
import?org.jsoup.Jsoup;
import?org.jsoup.nodes.Document;
import?org.jsoup.nodes.Element;
import?org.jsoup.select.Elements;
import?javax.servlet.ServletException;
import?javax.servlet.annotation.WebServlet;
import?javax.servlet.http.HttpServlet;
import?javax.servlet.http.HttpServletRequest;
import?javax.servlet.http.HttpServletResponse;
import?javax.servlet.http.HttpSession;
import?java.io.File;
import?java.io.IOException;
import?java.util.*;
@WebServlet(name?=?“crawlWebServlet“?urlPatterns?=?“/crawlWeb“)
public?class?CrawlWebSite?extends?HttpServlet?{
?
private?final?static?String?charSet?=?“utf-8“;
private??static?String?rootDir;
private??static?String?rootUrl; //“http://www.17sucai.com/preview/216556/2016-02-25/%E6%A9%99%E8%89%B2%E5%95%86%E5%9C%BAwap/“
private?final?static?int?timeOut?=?30000;
/**網站上相對地址與絕對地址的映射*/
private?static?Map?absRelativeUrlMap?=?new?HashMap();
/**網站上的url與最終本地使用的url映射*/
private?static?Map?urlmapMap?=?new?HashMap();
/**網站上的cssjs*/
private?static?Map?cssjsmapMap?=?new?HashMap();
private?static?List?allFiles?=?new?ArrayList();
private?static?Set?imgList?=?new?HashSet();
/**過濾掉不爬取的內容格式*/
// public?static?final?String?filterExtArray?[]??=?{“rar““zip““bmp““dib““gif““jfif““jpe““jpeg““jpg““png““tif““tiff““ico““pdf““doc““docx““xls““xlsx“};
public?static?final?String?filterExtArray?[]??=?{“rar““zip““bmp““dib““jfif““jpe““jpeg““tif““tiff““ico““pdf““doc““docx““xls““xlsx“};
@Override
public?void?doPost(HttpServletRequest?request?HttpServletResponse?response)
throws?ServletException?IOException?{
?System.out.println(“start....“);
?request.setCharacterEncoding(“UTF-8“);
?rootUrl?=?request.getParameter(“rootUrl“);
?rootDir?=?request.getParameter(“rootDir“);
HttpSession?session?=?request.getSession();
if(rootUrl?!=?null?&&?!rootUrl.equals(““)?&&?rootDir?!=?null?&&?!rootDir.equals(““)){
if(rootUrl.contains(“.html“)?||?rootUrl.contains(“.jsp“)?||?rootUrl.contains(“.htm“)){
session.setAttribute(“msg““網站抓取失敗網址輸入有誤,不能有‘.html、.jsp’等結尾“);
response.sendRedirect(“index.jsp“);
return;
}
if(!rootUrl.endsWith(“/“)){
rootUrl?=?rootUrl.concat(“/“);
}
//獲取所有urls
getSubUrls(rootUrlrootUrl);
//保存文件
for(String?absUrl?:?absRelativeUrlMap.keySet()){
String?content;
try?{
content?=?readContent(absUrl);
}?catch?(IOException?e)?{
System.err.println(“url3=“+absUrl+“?頁面無效!“);
continue;
}
if(!absUrl.startsWith(rootUrl)){
continue;
}
String?filePath?=?absUrl.substring(rootUrl.length());
filePath?=?FileUtils.parseFilePath(filePath);
//urlmapMap.put(absRelativeUrlMap.get(absUrl)?filePath);//脫機運行和在服務器運行有所不同。。。
urlmapMap.put(absRela
?屬性????????????大小?????日期????時間???名稱
-----------?---------??----------?-----??----
?????文件?????????36??2017-12-13?10:09??haha\.git\COMMIT_EDITMSG
?????文件????????321??2017-12-05?10:07??haha\.git\config
?????文件?????????73??2017-12-05?10:07??haha\.git\description
?????文件?????????89??2017-12-13?10:16??haha\.git\FETCH_HEAD
?????文件?????????23??2017-12-05?10:07??haha\.git\HEAD
?????文件????????478??2017-12-05?10:07??haha\.git\hooks\applypatch-msg.sample
?????文件????????896??2017-12-05?10:07??haha\.git\hooks\commit-msg.sample
?????文件????????189??2017-12-05?10:07??haha\.git\hooks\post-update.sample
?????文件????????424??2017-12-05?10:07??haha\.git\hooks\pre-applypatch.sample
?????文件???????1642??2017-12-05?10:07??haha\.git\hooks\pre-commit.sample
?????文件???????1348??2017-12-05?10:07??haha\.git\hooks\pre-push.sample
?????文件???????4951??2017-12-05?10:07??haha\.git\hooks\pre-rebase.sample
?????文件???????1239??2017-12-05?10:07??haha\.git\hooks\prepare-commit-msg.sample
?????文件???????3610??2017-12-05?10:07??haha\.git\hooks\update.sample
?????文件???????1888??2017-12-26?10:25??haha\.git\index
?????文件????????240??2017-12-05?10:07??haha\.git\info\exclude
?????文件???????1013??2017-12-13?10:09??haha\.git\logs\HEAD
?????文件???????1013??2017-12-13?10:09??haha\.git\logs\refs\heads\master
?????文件????????176??2017-12-05?10:07??haha\.git\logs\refs\remotes\origin\HEAD
?????文件????????784??2017-12-13?10:09??haha\.git\logs\refs\remotes\origin\master
?????文件????????119??2017-12-05?10:07??haha\.git\ob
?????文件???????2149??2017-12-20?09:44??haha\.git\ob
?????文件?????????65??2017-12-13?10:09??haha\.git\ob
?????文件????????521??2017-12-05?10:07??haha\.git\ob
?????文件????????266??2017-12-13?09:26??haha\.git\ob
?????文件????????987??2017-12-13?09:26??haha\.git\ob
?????文件??????10484??2017-12-13?09:26??haha\.git\ob
?????文件????????182??2017-12-13?09:27??haha\.git\ob
?????文件???????1478??2017-12-13?09:26??haha\.git\ob
?????文件??????15794??2017-12-13?09:26??haha\.git\ob
............此處省略402個文件信息
- 上一篇:10份伺服電機技術PPT
- 下一篇:安卓藍牙app源碼
評論
共有 條評論