91av视频/亚洲h视频/操亚洲美女/外国一级黄色毛片 - 国产三级三级三级三级

  • 大小: 0.03M
    文件類型: .rar
    金幣: 1
    下載: 0 次
    發布日期: 2024-05-05
  • 語言: Python
  • 標簽: python, py, 小說, 爬取

資源簡介

Python爬取網站上的小說保存到本地txt

資源截圖

代碼片段和文件信息



#!/usr/bin/python
# -*- coding:utf-8 -*-

import random  # random numbers
import sys
import time  # time operations

import requests  # fetch the HTML source of a web page
from bs4 import BeautifulSoup  # used instead of regular expressions to pull content out of tags


class downloader(object):
    """Download chapters of a novel from www.biqukan.com and append them to a text file."""

    def __init__(self):
        # Site root; chapter hrefs found in the catalogue are appended to it.
        self.server = 'http://www.biqukan.com'
        # Catalogue page of the novel to download.
        self.target = 'http://www.biqukan.com/0_790/'
        self.names = []  # chapter titles
        self.urls = []   # chapter links
        self.nums = 0    # number of chapters

    def get_content(self, url):
        """Fetch ``url`` and return the response body as text.

        The request is retried indefinitely: on any exception the error is
        printed and the method sleeps for a random 8-14 seconds before trying
        again, so this call only returns once a request has succeeded.

        :param url: address of the page to fetch.
        :return: ``req.text`` of the successful response.
        """
        # Headers imitate a browser visit; without them the request may be
        # refused. The values can be copied from a real browser.
        header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-cn',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15',
        }

        # Random timeout of 80-179 seconds; per the original author the
        # randomness is meant to make the client look less like a crawler.
        timeout = random.randrange(80, 180)

        while True:
            try:
                req = requests.get(url=url, headers=header, timeout=timeout)
                break
            except Exception as e:
                # Deliberately broad best-effort retry: report and try again.
                print('3', e)
                time.sleep(random.randrange(8, 15))
        return req.text

    def get_download_catalogue(self, url):
        """Read the chapter catalogue at ``url`` into ``names``, ``urls`` and ``nums``.

        Takes the first ``<div class="listmain">`` of the page and selects the
        links at positions 12-16 inside it. Titles and absolute links are
        appended to ``self.names`` / ``self.urls``; ``self.nums`` is set to the
        number of links selected by this call.

        :param url: address of the catalogue page.
        :raises IndexError: if the page has no ``div`` with class ``listmain``.
        """
        html = self.get_content(url)
        bf = BeautifulSoup(html, 'html.parser')
        texts = bf.find_all('div', {'class': 'listmain'})
        div = texts[0]
        a_s = div.find_all('a')
        # Skip the duplicated "latest chapters" list at the top and keep the
        # first 5 non-duplicated chapters.
        selected = a_s[12:17]
        self.nums = len(selected)
        for each in selected:
            self.names.append(each.string)
            self.urls.append(self.server + each.get('href'))

    def get_download_content(self, url):
        """Return the text of the chapter page at ``url``.

        Takes the first ``<div class="showtxt" id="content">`` of the page and
        replaces every run of seven ``\\xa0`` characters in its text with a
        blank line.

        :param url: address of the chapter page.
        :return: the chapter text.
        :raises IndexError: if the page has no matching ``div``.
        """
        html = self.get_content(url)
        bf = BeautifulSoup(html, 'html.parser')
        texts = bf.find_all('div', {'class': 'showtxt', 'id': 'content'})
        # \xa0 is a non-breaking space; the page uses runs of them as indentation.
        text = texts[0].text.replace('\xa0' * 7, '\n\n')
        return text

    def writer(self, name, path, text):
        """Append one chapter to the file at ``path`` (UTF-8).

        Writes ``name``, a newline, ``text`` and then a blank line.

        :param name: chapter title.
        :param path: file to append to; created if it does not exist.
        :param text: chapter body, a string or an iterable of strings.
        """
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            # writelines accepts both a single string and a list of strings.
            f.writelines(text)
            f.write('\n\n')


# Script entry point: read the catalogue of the target novel, then download
# each listed chapter in order and append it to '天尊.txt', printing the
# percentage completed after every chapter.
if __name__ == '__main__':
    dl = downloader()
    dl.get_download_catalogue(dl.target)
    for done, (name, url) in enumerate(zip(dl.names, dl.urls), start=1):
        dl.writer(name, '天尊.txt', dl.get_download_content(url))
        print("已下載:%.2f%%" % float(done / dl.nums * 100) + '\r')
    print('下載完成!')

 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----

    ..A..H.     17920  2020-11-17 11:57  PythonApplication1\.vs\PythonApplication1\v16\.suo

     文件       3191  2020-11-17 11:57  PythonApplication1\PythonApplication1.py

     文件       1568  2020-11-17 10:23  PythonApplication1\PythonApplication1.pyproj

     文件        977  2020-11-17 10:23  PythonApplication1\PythonApplication1.sln

     文件      73590  2020-11-17 11:28  PythonApplication1\天尊.txt

     目錄          0  2021-01-25 09:58  PythonApplication1\.vs\PythonApplication1\v16

     目錄          0  2021-01-25 09:58  PythonApplication1\.vs\PythonApplication1

    ...D.H.         0  2021-01-25 09:58  PythonApplication1\.vs

     目錄          0  2021-01-25 09:58  PythonApplication1

----------- ---------  ---------- -----  ----

                97246                    9


評論

共有 條評論