資源簡介
本例子0積分下載,爬取單本書的所有章節內容。
重點:121行的time.sleep(6)不能刪,網站有反爬蟲!
2021-11-01重新上傳代碼,頂點換域名了,代碼用不了,我隨便改了下,勉強能用,有能力的自己改代碼完善。30
代碼片段和文件信息
# -*- coding: utf-8 -*-
# @File: DDXS.py
import random
import time

import requests
from lxml import etree
def get_chapter_html(url, headers):
    """Fetch the HTML of a book's table-of-contents page.

    Args:
        url: Address of the table-of-contents page.
        headers: HTTP headers passed through to ``requests.get``.

    Returns:
        The page text decoded as GBK when it contains the site marker
        string, otherwise the sentinel string '獲取html失敗'.
    """
    resp = requests.get(url, headers=headers)
    # The response is decoded as GBK rather than the encoding requests guesses.
    resp.encoding = 'gbk'
    html = resp.text
    # NOTE(review): the marker literal is kept exactly as it appears in this
    # copy of the file; confirm it matches the text the site actually serves.
    if '頂點小說' in html:
        print('----獲取目錄html成功----')
        return html
    print('----獲取目錄html失敗----')
    return '獲取html失敗'
def get_chapter_url_list(html):
    """Extract every chapter link and the book title from the contents page.

    Args:
        html: HTML text of the table-of-contents page.

    Returns:
        A ``(chapter_url_list, book_name)`` tuple. ``chapter_url_list`` is
        empty when 12 or fewer links are found; ``book_name`` is an empty
        string when the title node is missing.
    """
    tree = etree.HTML(html)
    # Indexing [0] directly raised IndexError on a page without the title
    # node (for example the failure sentinel returned by get_chapter_html),
    # so the failure branch below could never run; fall back to ''.
    titles = tree.xpath('//*[@id="info"]/h1/text()')
    book_name = titles[0] if titles else ''
    chapter_url_x = tree.xpath('//*[@id="list"]/dl/dd/a/@href')
    # The threshold of 12 is kept from the original; its rationale is not
    # visible in this file.
    if len(chapter_url_x) > 12:
        print('----獲取所有章節url成功----')
        chapter_url_list = list(chapter_url_x)
    else:
        print('----獲取所有章節url失敗----')
        chapter_url_list = []
    return chapter_url_list, book_name
def get_save_txt(chapter_url, headers, f):
    """Download one chapter and append its title and text to an open file.

    Args:
        chapter_url: Address of the chapter page.
        headers: HTTP headers passed through to ``requests.get``.
        f: Writable text file object; it is written to and flushed on success.

    Returns:
        '獲取失敗' when the request raises or the chapter title cannot be
        found; ``None`` otherwise (including when the chapter body is empty,
        in which case nothing is written).
    """
    try:
        resp = requests.get(chapter_url, headers=headers, timeout=10)
        resp.encoding = "gbk"
        html = resp.text
    except requests.exceptions.RequestException as e:
        print(e)
        return '獲取失敗'

    tree = etree.HTML(html)

    # Chapter title.
    chapter_name = tree.xpath('//*[@class="bookname"]/h1/text()')
    if len(chapter_name) > 0:
        chapter_name = chapter_name[0]
    else:
        chapter_name = '獲取章節名失敗|url:' + chapter_url
        print('----獲取章節名失敗----')
        print(chapter_name)
        print('--------------------')
        return '獲取失敗'

    # Chapter body: each text node becomes a paragraph followed by a blank line.
    content_list = tree.xpath('//div[@id="content"]/text()')
    if len(content_list) > 0:
        text = chapter_name + "\n" + "".join(
            content + '\n\n' for content in content_list
        )
        f.write(text)
        f.flush()  # push the chapter to the file right away
        print('寫入成功:' + chapter_name + '--URL:' + chapter_url + '\n')
    else:
        print('----獲取內容失敗----')
        print('章節名:' + chapter_name)
        print('URL:' + chapter_url)
        print('--------------------')
        return
def?main():
????#?程序開始時的時間
????time_start?=?time.time()
????main_url?=?‘https://www.ddxs.cc/‘
????book_url?=?‘https://www.ddxs.cc/ddxs/182824/‘
????book_name?=?‘等待獲取書名‘
????user_agent_list?=?[
????????“Mozilla/5.0?(Windows?NT?10.0;?WOW64)?AppleWebKit/537.36?(KHTML?like?Gecko)?Chrome/68.0.3440.106?Safari/537.36“
????????“Mozilla/5.0?(Windows?NT?10.0;?WOW64)?AppleWebKit/537.36?(KHTML?like?Gecko)?Chrome/67.0.3396.99?Safari/537.36“
????????“Mozilla/5.0?(Windows?NT?10.0;?WOW64)?AppleWebKit/537.36?(KHTML?like?Gecko)?Chrome/64.0.3282.186?Safari/537.36“
????????“Mozilla/5.0?(X11;?Linux?x86_64)?AppleWebKit/537.36?(KHTML?like?Gecko)?Chrome/62.0.3202.62?Safari/537.36“
????????“Mozilla/5.0?(Windows?NT?6.1;?WOW64)?AppleWebKit/537.36?(KHTML?like?Gecko)?Chrome/45.0.2454.101?Safari/537.36“
????????“Mozilla/4.0?(compatible;?MSIE?7.0;?Windows?NT?6.0)“
????????“Mozilla/5.0?(Macintosh;?U;?PPC?Mac?OS?X?10.5;?en-US;?rv:1.9.2.15)?Gecko/20
- 上一篇:Python開發的個人博客
- 下一篇:python函數編程和講解
評論
共有 條評論