資源簡介
通過改變URL來爬取百度貼吧不同的貼吧內容,新手友好度很好,值得學習

代碼片段和文件信息
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on 2019年6月30日
'''
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
from mylog import MyLog as mylog
import codecs
class Item(object):
    """Plain record holding the fields scraped for one Tieba thread.

    Every field defaults to ``None`` at class level; callers assign
    per-instance values after creating an ``Item()``.
    """

    title = None  # thread title
    firstAuthor = None  # thread creator
    firstTime = None  # thread creation time
    reNum = None  # total number of replies
    content = None  # content of the last reply
    lastAuthor = None  # author of the last reply
    lastTime = None  # time of the last reply
class?GetTiebaInfo(object):
????def?__init__(self?url):
????????self.url?=?url
????????self.log?=?mylog()
????????self.pageSum?=?5
????????self.urls?=?self.getUrls(self.pageSum)
????????self.items?=?self.spider(self.urls)
????????self.pipelines(self.items)
????def?getUrls(self?pageSum):
????????urls?=?[]
????????pns?=?[str(i?*?50)?for?i?in?range(pageSum)]
????????ul?=?self.url.split(‘=‘)
????????for?pn?in?pns:
????????????ul[-1]?=?pn
????????????url?=?‘=‘.join(ul)
????????????urls.append(url)
????????self.log.info(‘獲取URLS成功‘)
????????return?urls
????def?spider(self?urls):
????????items?=?[]
????????for?url?in?urls:
????????????htmlContent?=?self.getResponseContent(url)
????????????soup?=?BeautifulSoup(htmlContent?‘lxml‘)
????????????tagsli?=?soup.find_all(‘li‘?attrs?=?{‘class‘:‘j_thread_list?clearfix‘})
????????????for?tag?in?tagsli:
????????????????item?=?Item()
????????????????item.title?=?tag.find(‘a‘?attrs?=?{‘class‘:‘j_th_tit‘}).get_text().strip()
????????????????item.firstAuthor?=?tag.find(‘span‘?attrs?=?{‘class‘:‘frs-author-name-wrap‘}).a.get_text().strip()
????????????????item.firstTime?=?tag.find(‘span‘?attrs?=?{‘title‘:‘創建時間‘}).get_text().strip()
????????????????item.reNum?=?tag.find(‘span‘?attrs?=?{‘title‘:‘回復‘}).get_text().strip()
????????????????item.content?=?tag.find(‘div‘?attrs?=?{‘class‘:‘threadlist_abs?threadlist_abs_onlyline‘}).get_text().strip()
????????????????item.lastAuthor?=?tag.find(‘span‘?attrs?=?{‘class‘:‘tb_icon_author_rely?j_replyer‘}).a.get_text().strip()
????????????????item.lastTime?=?tag.find(‘span‘?attrs?=?{‘title‘:‘最后回復時間‘}).get_text().strip()
????????????????items.append(item)
????????????????self.log.info(‘獲取標題為<<%s>>的項成功...‘?%item.title)
????????return?items
????def?pipelines(self?items):
????????fileName?=?‘百度貼吧_java.txt‘#.encode(‘utf-8‘)
????????with?codecs.open(fileName?‘w‘?‘utf-8‘)as?fp:
????????????for?item?in?items:
????????????????try:
????????????????????fp.write(‘title:%s?\t?author:%s?\t?firstTime:%s?\r\n?content:%s?\r\n?return:%s?\r\n‘
????????????????????????????‘lastAuthor:%s?\t?lastTime:%s?\r\n\r\n\r\n\r\n‘%(item.title?item.firstAuthor
????????????????????????????item.firstTime?item.content?item.reNum?item.lastAuthor?item.lastTime))
????????????????except?Exception?as?e:
????????????????????self.log.error(‘寫入文件失敗‘)
????????????????else:
????????????????????self.log.info(‘標題為<<%s>>的項輸入到“%s“成功‘?%(item.title?fileName))
????def?getResponseContent(self?url):
????????‘‘‘
????????單獨使用一個函數返回頁面返回值,是為了后期方便的
?屬性????????????大小?????日期????時間???名稱
-----------?---------??----------?-----??----
?????目錄???????????0??2019-07-22?11:20??baidupostbar?-?副本\
?????目錄???????????0??2019-07-22?11:20??baidupostbar?-?副本\.idea\
?????文件?????????763??2019-07-01?01:13??baidupostbar?-?副本\.idea\baidupostbar.iml
?????目錄???????????0??2019-06-30?20:53??baidupostbar?-?副本\.idea\inspectionProfiles\
?????文件?????????313??2019-07-01?01:13??baidupostbar?-?副本\.idea\misc.xm
?????文件?????????283??2019-06-30?20:41??baidupostbar?-?副本\.idea\modules.xm
?????文件???????14798??2019-07-22?11:19??baidupostbar?-?副本\.idea\workspace.xm
?????文件??????316009??2019-07-01?19:40??baidupostbar?-?副本\getCommentInfo.log
?????文件????????3770??2019-07-01?19:24??baidupostbar?-?副本\getCommentInfo.py
?????文件????????1574??2019-07-01?11:50??baidupostbar?-?副本\mylog.py
?????目錄???????????0??2019-07-22?11:20??baidupostbar?-?副本\venv\
?????目錄???????????0??2019-06-30?20:41??baidupostbar?-?副本\venv\Include\
?????目錄???????????0??2019-07-22?11:20??baidupostbar?-?副本\venv\Lib\
?????目錄???????????0??2019-07-22?11:20??baidupostbar?-?副本\venv\Lib\site-packages\
?????文件??????????55??2019-06-30?20:41??baidupostbar?-?副本\venv\Lib\site-packages\easy-install.pth
?????目錄???????????0??2019-07-22?11:20??baidupostbar?-?副本\venv\Lib\site-packages\pip-10.0.1-py3.6.egg\
?????目錄???????????0??2019-07-22?11:20??baidupostbar?-?副本\venv\Lib\site-packages\pip-10.0.1-py3.6.egg\EGG-INFO\
?????文件???????????1??2019-06-30?20:41??baidupostbar?-?副本\venv\Lib\site-packages\pip-10.0.1-py3.6.egg\EGG-INFO\dependency_li
?????文件??????????98??2019-06-30?20:41??baidupostbar?-?副本\venv\Lib\site-packages\pip-10.0.1-py3.6.egg\EGG-INFO\entry_points.txt
?????文件???????????2??2019-06-30?20:41??baidupostbar?-?副本\venv\Lib\site-packages\pip-10.0.1-py3.6.egg\EGG-INFO\not-zip-safe
?????文件????????2972??2019-06-30?20:41??baidupostbar?-?副本\venv\Lib\site-packages\pip-10.0.1-py3.6.egg\EGG-INFO\PKG-INFO
?????文件??????????74??2019-06-30?20:41??baidupostbar?-?副本\venv\Lib\site-packages\pip-10.0.1-py3.6.egg\EGG-INFO\requires.txt
?????文件???????12502??2019-06-30?20:41??baidupostbar?-?副本\venv\Lib\site-packages\pip-10.0.1-py3.6.egg\EGG-INFO\SOURCES.txt
?????文件???????????4??2019-06-30?20:41??baidupostbar?-?副本\venv\Lib\site-packages\pip-10.0.1-py3.6.egg\EGG-INFO\top_level.txt
?????目錄???????????0??2019-07-22?11:20??baidupostbar?-?副本\venv\Lib\site-packages\pip-10.0.1-py3.6.egg\pip\
?????目錄???????????0??2019-07-22?11:20??baidupostbar?-?副本\venv\Lib\site-packages\pip-10.0.1-py3.6.egg\pip\_internal\
?????文件???????14014??2019-06-30?20:41??baidupostbar?-?副本\venv\Lib\site-packages\pip-10.0.1-py3.6.egg\pip\_internal\ba
?????文件????????8764??2019-06-30?20:41??baidupostbar?-?副本\venv\Lib\site-packages\pip-10.0.1-py3.6.egg\pip\_internal\ba
?????文件????????2773??2019-06-30?20:41??baidupostbar?-?副本\venv\Lib\site-packages\pip-10.0.1-py3.6.egg\pip\_internal\build_env.py
?????文件????????7023??2019-06-30?20:41??baidupostbar?-?副本\venv\Lib\site-packages\pip-10.0.1-py3.6.egg\pip\_internal\cache.py
?????文件???????16679??2019-06-30?20:41??baidupostbar?-?副本\venv\Lib\site-packages\pip-10.0.1-py3.6.egg\pip\_internal\cmdoptions.py
............此處省略378個文件信息
- 上一篇:豆瓣電影大數據分析-
- 下一篇:《 流暢的python 》源代碼 .zip
評論
共有 條評論