資源簡介
爬取17k小說網站的小說,根據start_url提供的開始頁爬取小說的整頁內容,可以在run方法中傳入數值確定從當前頁繼續向下爬取多少頁;

代碼片段和文件信息
# coding=utf-8
'''
作者:Jguobao
QQ:779188083
email:jgb2010start@163.com
'''
import json
import os
import re

import requests
from lxml import etree
class NovelSpider:
    """Scraper for novels hosted on 17k.com.

    Holds the listing page to start from, a book's chapter-list URL, and
    the HTTP headers sent with every request.
    """

    def __init__(self):
        """Set the default start URL, book URL and request headers."""
        # Library listing page the crawl starts from. The trailing "?" is
        # kept exactly as it appears in the source listing.
        self.start_url = "http://all.17k.com/lib/book/2_0_0_0_0_0_1_0_1.html?"
        # Chapter-list page of one book; a single chapter page looks like
        # "http://www.17k.com/chapter/2938105/36788407.html".
        self.url = "http://www.17k.com/list/1038316.html"
        # Browser-like User-Agent sent with every request.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/67.0.3396.99 Safari/537.36"}
????def?parse_url(self?url):
????????response?=?requests.get(url?headers=self.headers)
????????if?response.status_code?!=?200:
????????????return?None
????????return?response.content.decode()
????def?get_detail_page(self?html_str?index=““):
????????if?index?!=?““:
????????????index?=?str(index)?+?“.“
????????html_str_etree?=?etree.HTML(html_str)
????????title?=?html_str_etree.xpath(“//div[@class=‘readAreaBox?content‘]/h1/text()“)[0]
????????word?=?html_str_etree.xpath(“//div[@class=‘readAreaBox?content‘]/div[@class=‘p‘]/text()“)
????????word?=?[“??“?+?w.strip()?+?“\n“?if?len(w.strip())?>?0?else?None?for?w?in?word]
????????title?=?“\t\t\t“?+?index?+?title.strip()?+?“\n“
????????for?w?in?word[:]:
????????????if?w?is?None:
????????????????word.remove(w)
????????return?title?word[:-1]
????def?get_txt_name(self?html_str):
????????html_str_etree?=?etree.HTML(html_str)
????????filename?=?html_str_etree.xpath(“//div[@class=‘Main?List‘]//h1/text()“)[0]?+?‘.txt‘?if?len(
????????????html_str_etree.xpath(“//div[@class=‘Main?List‘]//h1“))?>?0?else?“無名“
????????return?filename
????def?get_url_list(self?html_str):
????????item_list?=?[]
????????html_str_etree?=?etree.HTML(html_str)
????????url_list?=?html_str_etree.xpath(“//dl[@class=‘Volume‘]//dd/a/@href“)
????????volume_list?=?html_str_etree.xpath(“//dl[@class=‘Volume‘]“)??#?獲取所有的卷?并且從卷下獲取所有的卷標?章節
????????for?volume?in?volume_list:
????????????item?=?{}
????????????item[‘卷標‘]?=?volume.xpath(“.//span[@class=‘tit‘]/text()“)[0]
????????????item[‘info‘]?=?volume.xpath(“.//span[@class=‘info‘]/text()“)[0]
????????????#?獲取下面的所有的a標簽
????????????a_list_etree?=?volume.xpath(“./dd/a“)
????????????a_list?=?[]
????????????for?a?in?a_list_etree:
????????????????item2?=?{}
????????????????title2?=?a.xpath(“.//span[@class=‘ellipsis‘]/text()“)[0].strip()
????????????????item2[title2]?=?“http://www.17k.com“?+?a.xpath(“./@href“)[0]
????????????????a_list.append(item2)
????????????item[‘章節‘]?=?a_list
????????????item_list.append(item)
????????????#?根據a標簽列表獲取所有章節與對應的鏈接
????????url_list?=?[“http://www.17k.com“?+?url?for?url?in?url_list]
????????return?item_list
????def?save_txt(self?txt_list?filename):
????????with?open(‘./txt/‘?+?filename?“a“?encoding=‘utf8‘)?as?f:
????????????f.write(txt_list[0])
????????????for?wd?in?txt_list[1][:-2]:
????????????????f.write(wd)
????def?process_item_list(self?item_list?fi
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件        6555  2019-01-20 14:29  xiaoshuo_spider.py
- 上一篇:socket編程常用API匯總
- 下一篇:UI設計視頻資源
評論
共有 條評論