資源簡介
Python爬取網站上的小說保存到本地txt
代碼片段和文件信息
#!/usr/bin/python
# -*- coding:utf-8 -*-
?
import?requests?#抓取網頁的html源碼
import?random???#取隨機數
from?bs4?import?BeautifulSoup?#用于代替正則式?取源碼中相應標簽中的內容
import?sys
import?time?#時間操作
?
?
class downloader(object):
    """Download chapters of a novel from biqukan.com and append them to a text file.

    Typical use: call ``get_download_catalogue`` once to fill ``names``,
    ``urls`` and ``nums``, then for each chapter call
    ``get_download_content`` and pass the result to ``writer``.
    """

    def __init__(self):
        self.server = 'http://www.biqukan.com'  # site root, prefixed to each chapter href
        self.target = 'http://www.biqukan.com/0_790/'  # catalogue page of the novel
        self.names = []  # chapter titles
        self.urls = []   # chapter links
        self.nums = 0    # number of chapters

    def get_content(self, url):
        """Fetch ``url`` and return the response body as text.

        The request is retried until it succeeds: any exception is printed
        and followed by a random 8-14 second sleep before the next attempt.
        NOTE(review): there is no retry limit, so a permanently failing URL
        keeps this method looping forever.
        """
        # Browser-like headers; without them the site may reject the request.
        # The values can be copied from a real browser session.
        header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-cn',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15',
        }

        # Randomised timeout (80-179 s), chosen once and reused for every
        # retry; the randomness is meant to make the client look less like
        # a crawler.
        timeout = random.choice(range(80, 180))

        while True:
            try:
                req = requests.get(url=url, headers=header, timeout=timeout)
                break
            except Exception as e:
                print('3', e)
                time.sleep(random.choice(range(8, 15)))
        return req.text

    def get_download_catalogue(self, url):
        """Read the chapter catalogue at ``url`` into ``names``, ``urls`` and ``nums``.

        Only the anchors at positions 12-16 of the first ``div.listmain`` are
        used: the leading entries repeat the "latest chapters" list, and the
        slice keeps the first five non-duplicated chapters. Titles and links
        are appended to the existing lists; ``nums`` is set to the number of
        chapters found by this call.

        Raises:
            IndexError: if the page contains no ``div`` with class ``listmain``.
        """
        html = self.get_content(url)
        bf = BeautifulSoup(html, 'html.parser')
        texts = bf.find_all('div', {'class': 'listmain'})
        div = texts[0]
        a_s = div.find_all('a')
        chapters = a_s[12:17]  # skip the duplicated latest-chapter entries
        self.nums = len(chapters)
        for each in chapters:
            self.names.append(each.string)
            self.urls.append(self.server + each.get('href'))

    def get_download_content(self, url):
        """Return the text of the chapter page at ``url``.

        The text is taken from the first ``div`` with class ``showtxt`` and
        id ``content``; each run of seven non-breaking spaces (``\\xa0``) is
        replaced by a blank line.

        Raises:
            IndexError: if the page contains no matching ``div``.
        """
        html = self.get_content(url)
        bf = BeautifulSoup(html, 'html.parser')
        texts = bf.find_all('div', {'class': 'showtxt', 'id': 'content'})
        text = texts[0].text.replace('\xa0' * 7, '\n\n')  # \xa0 runs are the page's paragraph indents
        return text

    def writer(self, name, path, text):
        """Append one chapter to the UTF-8 file at ``path``.

        Writes ``name`` on its own line, then ``text``, then a blank line.
        ``text`` may be a string or any iterable of strings.
        """
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            # writelines accepts a plain string as well as a list of lines.
            f.writelines(text)
            f.write('\n\n')
?
if __name__ == '__main__':
    # Read the chapter catalogue, then download each chapter in order and
    # append it to the output file, printing the percentage done after each.
    dl = downloader()
    dl.get_download_catalogue(dl.target)
    for i in range(dl.nums):
        dl.writer(dl.names[i], '天尊.txt', dl.get_download_content(dl.urls[i]))
        # True division already yields a float, so no float() wrapper is needed.
        print("已下載:%.2f%%" % ((i + 1) / dl.nums * 100) + '\r')
    print('下載完成!')
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
    ..A..H.     17920  2020-11-17 11:57  PythonApplication1\.vs\PythonApplication1\v16\.suo
     文件       3191  2020-11-17 11:57  PythonApplication1\PythonApplication1.py
     文件       1568  2020-11-17 10:23  PythonApplication1\PythonApplication1.pyproj
     文件        977  2020-11-17 10:23  PythonApplication1\PythonApplication1.sln
     文件      73590  2020-11-17 11:28  PythonApplication1\天尊.txt
     目錄          0  2021-01-25 09:58  PythonApplication1\.vs\PythonApplication1\v16
     目錄          0  2021-01-25 09:58  PythonApplication1\.vs\PythonApplication1
    ...D.H.         0  2021-01-25 09:58  PythonApplication1\.vs
     目錄          0  2021-01-25 09:58  PythonApplication1
----------- ---------  ---------- -----  ----
                97246                    9
- 上一篇:NumPy Cookbook
- 下一篇:動物圖片識別.py(基于百度api)
評論
共有 條評論