資源簡介
HtmlUnit 網絡爬蟲及其依賴的 jar 包,並附帶使用實例。
沒有積分,賺點積分,:-D

代碼片段和文件信息
package?com.ustcsoft.jt.util;
import?java.sql.DriverManager;
import?java.sql.PreparedStatement;
import?java.text.DateFormat;
import?java.text.ParseException;
import?java.text.SimpleDateFormat;
import?java.util.Date;
import?java.util.List;
import?java.util.UUID;
import?org.apache.commons.lang3.StringUtils;
import?com.gargoylesoftware.htmlunit.BrowserVersion;
import?com.gargoylesoftware.htmlunit.WebClient;
import?com.gargoylesoftware.htmlunit.html.DomElement;
import?com.gargoylesoftware.htmlunit.html.DomNode;
import?com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import?com.gargoylesoftware.htmlunit.html.HtmlPage;
/**
*?
*?@Author:?QGuo
*?@Time:?2017年8月16日?上午9:25:25
*
*/
public?class?HtmlunitTest?{
private?static?int?maxPage?=?16;
private?static?int?currentPage?=?1;
public?static?void?main(String[]?args)?{
while(currentPage<=maxPage){
//?TODO?獲取當前頁面的頁面內容
String?URL?=?“http://xxgk.ah.gov.cn/tmp/Nav_gongkailanmu.shtml?SS_ID=49&tm=34332.39&Page=“+currentPage;
try?{
produceList(URL);
}?catch?(Exception?e)?{
//?TODO?Auto-generated?catch?block
e.printStackTrace();
}finally{
currentPage++;
}
}
}
public?static?void?produceList(String?URL)?throws?Exception{
/**?HtmlUnit請求web頁面?*/??
????WebClient?wc?=?new?WebClient(BrowserVersion.CHROME);??
????wc.getOptions().setUseInsecureSSL(true);??
????wc.getOptions().setjavascriptEnabled(true);?//?啟用JS解釋器,默認為true??
????wc.getOptions().setCssEnabled(false);?//?禁用css支持??
????wc.getOptions().setThrowExceptionOnscriptError(false);?//?js運行錯誤時,是否拋出異常??
????wc.getOptions().setTimeout(100000);?//?設置連接超時時間?,這里是10S。如果為0,則無限期等待??
????wc.getOptions().setDoNotTrackEnabled(false);??
????wc.waitForBackgroundjavascript(10000);//讓js先加載完,獲取頁面異步生成的代碼,
????HtmlPage?page?=?wc.getPage(URL);?
????//獲得需要的鏈接
????List?achList=page.getAnchors();
????
????//?開始插入數據
? Class.forName(“oracle.jdbc.driver.OracleDriver“).newInstance();
? String?jdbcUrl?=?“jdbc:oracle:thin:@192.168.254.190:1521:orcl“;?//?orcl為數據庫的SID
? String?Username?=?“dss“;?//?用戶名
? String?Password?=?“dss“;?//?密碼
? java.sql.Connection?con?=?DriverManager.getConnection(jdbcUrl?Username
? Password);
????
????for(HtmlAnchor?ach:achList){?
????
???? if?(ach.getHrefAttribute().contains(“/UserData/DocHtml“)?&&?!ach.getHrefAttribute().startsWith(“http://“)){
???? String?newURL?=?“http://xxgk.ah.gov.cn“?+?ach.getHrefAttribute().trim();
???? System.out.println(newURL);
???? HtmlPage?newPage?=?wc.getPage(newURL);
???? //selectors選擇器
???? DomNode?source?=?newPage.querySelector(“.wz_source“);
???? //信息來源
???? String?sourcetext?=?source.asText().replace(“信息來源:“?““);
????
???? DomNode?wztit?=?newPage.querySelector(“.wztit“);
????
???? DomNode?titles?=?newPage.querySelector(“.nr_topcon“);
???? String?title?=?titles.asText();
????
???? int?c?=?title.indexOf(“生成日期:“);
???? int?d?=?ti
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件    2003882  2017-08-15 14:37  htmlunit-2.27.jar
     文件     736658  2017-08-16 09:45  httpclient-4.5.2.jar
     文件    1367760  2017-08-15 14:37  xercesImpl-2.11.0.jar
     文件     220536  2017-08-16 09:39  xm
     文件       5315  2017-08-17 18:18  HtmlunitTest.java
----------- ---------  ---------- -----  ----
              4334151                    5
評論
共有 條評論