資源簡介
本人公司的項目需要汽車圖片作為人工智能圖片識別的訓練數據。應項目組的需要,本人使用Java爬蟲,從汽車之家爬取了12萬張汽車圖片。

代碼片段和文件信息
package?carImage;
import?java.io.PrintWriter;
import?java.io.BufferedReader;
import?java.io.ByteArrayOutputStream;
import?java.io.File;
import?java.io.FileInputStream;
import?java.io.FileNotFoundException;
import?java.io.FileOutputStream;
import?java.io.IOException;
import?java.io.InputStream;
import?java.io.InputStreamReader;
import?java.net.HttpURLConnection;
import?java.net.MalformedURLException;
import?java.net.URL;
import?java.util.ArrayList;
import?java.util.HashMap;
import?java.util.Iterator;
import?java.util.List;
import?java.util.regex.Matcher;
import?java.util.regex.Pattern;
import?org.jsoup.Jsoup;
import?org.jsoup.nodes.Document;
import?org.jsoup.nodes.Element;
import?org.jsoup.select.Elements;
public?class?Car?{
public?static?int?imgCount?=?0;
public?static?void?main(String[]?args){
//getStr(); //獲取下載圖片所需路徑s
long?startTime=System.currentTimeMillis();
List?urls=readUrls(); //讀取txt中路徑
HashMap?hm?=?new?HashMap();
Car?wc?=?new?Car();
for(String?url:urls){
List?imgUrls=new?ArrayList();
imgCount=0;
try?{
do{
String?urlImage=“http://car.autohome.com.cn/“+url+“#pvareaid=2042264“;
Document?doc?=?null;
try?{
doc?=?Jsoup.connect(urlImage).timeout(100000).get();
}?catch?(IOException?e1)?{
e1.printStackTrace();
}
????????hm?=?wc.getFromYahoo(urlImage);
????????String?str=hm.get(“title“);
????????String[]?strs?=?str.split(“?“);
????????String?strz=strs[0];
????????File?carFolder=new?File(“D:/img/carphoto/“+strz); //新建一個文件夾
carFolder.mkdirs();
Element?body?=?doc.body();
Elements?es=body.select(“img“);
Pattern?p?=?Pattern.compile(“^(http)?(://)?(\\w+(-\\w+)*(\\d+))(\\.(\\w+(-\\w+)*))(\\.(\\w+(-\\w+)*))“
+?“(/(\\w+(-\\w+)*))*(/(\\d+)*)*(/(\\w+(-\\w+)*(\\d+)*))*(/(\\w+(\\_+)(\\d+)*(-\\w+)*))*“
+?“(/(\\w+(\\_+)(\\d+)*))*(\\.(\\w+(-\\w+)*))$“Pattern.CASE_INSENSITIVE?);?
for?(Iterator?it?=?es.iterator();?it.hasNext();)?{
Element?e?=?(Element)?it.next();
String?objUrl=?e.attr(“src“);?
Matcher?m?=?p.matcher(objUrl);???
if(m.find()?&&?!imgUrls.contains(objUrl)){ //是否找到匹配的路徑并且該路徑在不存在集合中
downPic(objUrlurlurlImage); //下載圖片
imgCount++;
imgUrls.add(objUrl); //把路徑加入到集合中
}else{
System.out.println(“匹配不到合適的路徑“);
}
}
}while(false);
}?catch?(Exception?e1)?{
e1.printStackTrace();
}
if(imgCount>=50){
System.out.println(“獲取圖片“+imgCount+“等待3秒“);
break;
}
try?{
Thread.sleep(3000);
}?catch?(InterruptedException?e)?{
e.printStackTrace();
}
}
System.out.println(“總耗時:“?+?((System.currentTimeMillis()?-?startTime)/60000));
}
????/*讀取一個網頁全部內容*/
????public?String?getOneHtml(?String?htmlurl)?throws?IOException?{
????????URL?url;
????????String?temp;
???
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2016-11-25 17:06  CarCarCar\
     文件      250231  2016-11-25 16:51  CarCarCar\7.1allCarImageUrl.txt
     文件      249915  2016-11-25 16:52  CarCarCar\7.2allCarImageUrl.txt
     文件      249953  2016-11-25 16:53  CarCarCar\7.3allCarImageUrl.txt
     文件      251291  2016-11-25 16:53  CarCarCar\7.4allCarImageUrl.txt
     文件      224178  2016-11-25 16:54  CarCarCar\7.5allCarImageUrl.txt
     目錄           0  2016-11-25 16:37  CarCarCar\MyCarImage\
     文件         943  2016-11-22 10:34  CarCarCar\MyCarImage\.classpath
     文件         386  2016-11-22 10:31  CarCarCar\MyCarImage\.project
     目錄           0  2016-11-25 16:37  CarCarCar\MyCarImage\.settings\
     文件         598  2016-11-22 10:31  CarCarCar\MyCarImage\.settings\org.eclipse.jdt.core.prefs
     目錄           0  2016-11-25 16:37  CarCarCar\MyCarImage\bin\
     目錄           0  2016-11-25 16:37  CarCarCar\MyCarImage\bin\carImage\
     文件       11957  2016-11-25 16:35  CarCarCar\MyCarImage\bin\carImage\Car.class
     目錄           0  2016-11-25 16:37  CarCarCar\MyCarImage\lib\
     文件      232019  2016-11-22 10:34  CarCarCar\MyCarImage\lib\commons-beanutils-1.8.3.jar
     文件      263965  2016-11-22 10:34  CarCarCar\MyCarImage\lib\commons-codec-1.9.jar
     文件      575389  2016-11-22 10:34  CarCarCar\MyCarImage\lib\commons-collections-3.2.1.jar
     文件      284220  2016-11-22 10:34  CarCarCar\MyCarImage\lib\commons-lang-2.6.jar
     文件       61829  2016-11-22 10:34  CarCarCar\MyCarImage\lib\commons-logging-1.2.jar
     文件       86487  2016-11-22 10:34  CarCarCar\MyCarImage\lib\ezmorph-1.0.6.jar
     文件      732765  2016-11-22 10:34  CarCarCar\MyCarImage\lib\httpclient-4.5.1.jar
     文件      326594  2016-11-22 10:34  CarCarCar\MyCarImage\lib\httpcore-4.4.3.jar
     文件      159123  2016-11-22 10:34  CarCarCar\MyCarImage\lib\json-lib-2.4-jdk15.jar
     文件      300844  2016-11-22 10:33  CarCarCar\MyCarImage\lib\jsoup-1.8.1.jar
     目錄           0  2016-11-25 16:37  CarCarCar\MyCarImage\src\
     目錄           0  2016-11-25 16:37  CarCarCar\MyCarImage\src\carImage\
     文件       11409  2016-11-25 16:35  CarCarCar\MyCarImage\src\carImage\Car.java
     文件       13107  2016-11-25 17:06  CarCarCar\car.docx
- 上一篇:JavaWeb購物車
- 下一篇:JSP聊天室源碼+文檔
評論
共有 條評論