資源簡介
微博數據爬取 demo,解析微博評論數、點贊數、圖片鏈接等

代碼片段和文件信息
package?top.kittygirl.wechat;
import?cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import?cn.edu.hfut.dmic.webcollector.model.Page;
import?cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
public?class?cpwsDataCrawler?extends?BreadthCrawler?{
????/**
?????*?@param?crawlPath?crawlPath?is?the?path?of?the?directory?which?maintains
?????*??????????????????information?of?this?crawler
?????*?@param?autoParse?if?autoParse?is?trueBreadthCrawler?will?auto?extract
?????*??????????????????links?which?match?regex?rules?from?pag
?????*/
????public?cpwsDataCrawler(String?crawlPath?boolean?autoParse)?{
????????super(crawlPath?autoParse);
????????/*start?page*/
????????//this.addSeed(“http://news.xidian.edu.cn/“);
???????//?http://wenshu.court.gov.cn/List/List?sorttype=1&conditions=searchWord+2+AJLX++%E6%A1%88%E4%BB%B6%E7%B1%BB%E5%9E%8B:%E6%B0%91%E4%BA%8B%E6%A1%88%E4%BB%B6
????????/*fetch?url?like?http://news.hfut.edu.cn/show-xxxxxxhtml*/
????????//this.addRegex(“http://news.xidian.edu.cn/info/.*htm“);
????????/*do?not?fetch?jpg|png|gif*/
????????//this.addRegex(“-.*\\.(jpg|png|gif).*“);
????????/*do?not?fetch?url?contains?#*/
???????//?this.addRegex(“-.*#.*“);
????????this.addSeed(“http://wenshu.court.gov.cn/List/List?sorttype=1&conditions=searchWord+2+AJLX++%E6%A1%88%E4%BB%B6%E7%B1%BB%E5%9E%8B:%E6%B0%91%E4%BA%8B%E6%A1%88%E4%BB%B6“);
????????setThreads(50);
????????getConf().setTopN(1000);
????????getConf().setExecuteInterval(100000);
????}
????public?void?visit(Page?page?CrawlDatums?next)?{
????????String??a??=?page.select(“#list“).select(“#resultList“).select(“#dataItem1“).text();
?????//???String?url?=?page.url();
???//?????System.out.println(url);
????????/*if?page?is?news?page*/
/*????????if?(page.matchUrl(“http://news.xidian.edu.cn/info/.*htm“))?{
????????????*//*extract?title?and?content?of?news?by?css?selector*//*
????????????String?title?=?page.select(“div.neirong-bt“).text();
????????????String?date?=?page.select(“span#date“).text();
????????????String?clickNum?=?page.select(“div#wz_info.b_b“).first().child(3).select(“span“).first().child(0).select(“span“).val();
????????????String?content?=?page.selectText(“div#artibody“);
????????????System.out.println(“URL:\n“?+?url);
????????????System.out.println(“title:\n“?+?title);
????????????System.out.println(“date:\n“?+?date);
????????????System.out.println(“clickNum:\n“?+?clickNum);
????????????System.out.println(“content:\n“?+?content);
????????}*/
????}
????public?static?void?main(String[]?args)?throws?Exception?{
????????cpwsDataCrawler?crawler?=?new?cpwsDataCrawler(“crawlllesZ“?true);
????????/*start?crawl?with?depth?of?4*/
????????crawler.start(1);
????}
}
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2019-03-31 09:14  weChatCrawler-master\
     目錄           0  2019-03-31 09:14  weChatCrawler-master\.idea\
     文件         624  2019-03-01 14:40  weChatCrawler-master\.idea\compiler.xml
     文件         138  2019-03-01 14:40  weChatCrawler-master\.idea\encodings.xml
     目錄           0  2019-03-31 09:14  weChatCrawler-master\.idea\fileTemplates\
     目錄           0  2019-03-01 14:59  weChatCrawler-master\.idea\fileTemplates\code\
     目錄           0  2019-03-01 14:59  weChatCrawler-master\.idea\fileTemplates\includes\
     目錄           0  2019-03-01 14:59  weChatCrawler-master\.idea\fileTemplates\internal\
     目錄           0  2019-03-01 14:59  weChatCrawler-master\.idea\fileTemplates\j2ee\
     目錄           0  2019-03-31 09:14  weChatCrawler-master\.idea\libraries\
     文件         504  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__cglib_cglib_nodep_3_2_4.xml
     文件         642  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__cn_edu_hfut_dmic_webcollector_WebCollector_2_73_alpha.xml
     文件         543  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__commons_codec_commons_codec_1_10.xml
     文件         503  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__commons_io_commons_io_2_5.xml
     文件         558  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__commons_logging_commons_logging_1_2.xml
     文件         514  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__com_alibaba_fastjson_1_2_41.xml
     文件         564  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__com_codeborne_phantomjsdriver_1_4_0.xml
     文件         654  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__com_googlecode_juniversalchardet_juniversalchardet_1_0_3.xml
     文件         515  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__com_google_code_gson_gson_2_8_0.xml
     文件         499  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__com_google_guava_guava_21_0.xml
     文件         480  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__com_sleepycat_je_5_0_73.xml
     文件         536  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__com_squareup_okhttp3_okhttp_3_11_0.xml
     文件         510  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__com_squareup_okio_okio_1_14_0.xml
     文件         578  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__javax_servlet_javax_servlet_api_3_1_0.xml
     文件         455  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__junit_junit_4_12.xml
     文件         469  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__log4j_log4j_1_2_17.xml
     文件         574  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__mysql_mysql_connector_java_5_1_31.xml
     文件         492  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__net_java_dev_jna_jna_4_1_0.xml
     文件         555  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__net_java_dev_jna_jna_platform_4_1_0.xml
     文件         498  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__net_sf_opencsv_opencsv_2_3.xml
     文件         577  2019-03-16 14:13  weChatCrawler-master\.idea\libraries\Maven__net_sourceforge_cssparser_cssparser_0_9_22.xml
............此處省略74個文件信息
評論
共有 條評論