91av视频/亚洲h视频/操亚洲美女/外国一级黄色毛片 - 国产三级三级三级三级

  • 大小: 7KB
    文件類型: .py
    金幣: 1
    下載: 0 次
    發布日期: 2021-06-01
  • 語言: Python
  • 標簽: python爬蟲

資源簡介

本爬蟲實現的功能: 隨便在豆瓣網站中選擇一部電影,獲取影片詳細信息,并自動獲取該影片的短評鏈接,再跳轉到短評頁面,獲取各位觀眾的影評,最后將爬取的數據存儲到數據庫中。 開發環境: python3 + pycharm + WIN + mysql

資源截圖

代碼片段和文件信息

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re

import pymysql
import requests
from bs4 import BeautifulSoup
# from itertools import islice

# Fetch the web page and build a BeautifulSoup object
def get_soup(url, timeout=10):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error. Without it a stalled connection would block
            the crawl indefinitely.

    Returns:
        A ``(soup, res)`` tuple: the parsed document and the underlying
        ``requests`` response, so callers can also run regular expressions
        over ``res.text``.
    """
    res = requests.get(url, timeout=timeout)
    # Decode as UTF-8 explicitly instead of relying on the guessed charset.
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    return (soup, res)

# Fetch part of the movie's details
def get_movie_comment(movie_url):
    """Scrape one movie page, record its details, then crawl its short reviews.

    For every ``#content`` element the title is printed and appended to the
    movie text file. For every ``#info`` element the director, screenwriter,
    first two cast entries, genres, release date and short-review link are
    printed and appended as well. The review link is then passed to
    ``get_eval`` and the paginated review URLs (``start=20`` up to
    ``start=180`` in steps of 20) are passed to ``get_comment``.

    Args:
        movie_url: URL of the movie's detail page.

    Raises:
        IndexError: If an expected element is missing from the page.
        KeyError: If the short-review anchor has no ``href`` attribute.
    """
    movie_file = "F:\\SpiderDB\\movie.txt"
    # NOTE(review): every separator literal is an empty string in the listing
    # this block was recovered from; a delimiter may have been lost in
    # extraction -- confirm the intended value before relying on the output.
    separator = ""

    soup, _ = get_soup(movie_url)
    for items in soup.select('#content'):
        movie_name = items.select('span')[0].text
        print(movie_name)
        # Open and close around each write so the handle can never leak or
        # be reused after it has been closed.
        with open(movie_file, "a", encoding='utf-8') as f:
            f.write(movie_name + separator)

        for info in soup.select('#info'):
            attrs = info.select('.attrs')
            director = attrs[0].text
            print(director)
            editor = attrs[1].text
            cast = attrs[2].text.strip()
            # Keep only the first two "/"-separated cast entries.
            actor = ''.join(cast.split("/")[0:2])
            print(actor)
            style = ' '.join(
                genre.text for genre in info.select('span[property="v:genre"]')
            )
            print(style)
            release = info.select(
                'span[property="v:initialReleaseDate"]'
            )[0].text
            # Drop everything from the first "(" onwards.
            release = release.split('(')[0]
            print(release)
            comment_url = items.select('.mod-hd span.pl a')[0]['href']
            print(comment_url)

            fields = (director, editor, actor, style, release, comment_url)
            with open(movie_file, "a", encoding='utf-8') as f:
                f.write(''.join(field + separator for field in fields))

            # The file is closed at this point, so get_eval can append to the
            # same path without interleaving buffered output.
            get_eval(comment_url)
            for start in range(20, 200, 20):  # start=20, 40, ..., 180
                # Format only the replacement text so literal braces
                # elsewhere in the URL cannot break str.format.
                new_url = comment_url.replace(
                    "status=P", "start={}".format(start)
                )
                get_comment(new_url)

# Fetch the overall review breakdown
def?get_eval(comment_url):
????(soup1?res1)?=?get_soup(comment_url)
????movie_comment?=?soup1.select(‘#content?h1‘)[0].text.rsplit(‘?短評‘)[0]
????f?=?open(“F:\\SpiderDB\\movie.txt“?“a“?encoding=‘utf-8‘)
????f.write(movie_comment?+?““)
????for?it?in?soup1.select(‘.comment-filter‘):
????????#print(it)
????????good_eval?=?it.select(‘.filter-name‘)[1].text???#評價
????????good_cp?=?it.select(‘span.comment-percent‘)[1].text????#好評率
????????common_eval?=?it.select(‘.filter-name‘)[2].text
????????common_cp?=it.select(‘span.comment-percent‘)[2].text
????????bad_eval?=?it.select(‘.filter-name‘)[3].text
????????bad_cp?=?it.select(‘span.comment-percent‘)[3].text
????????#?print(common_eval)
????????#?print(bad_eval)
????????f.write(good_eval?+?“:“)
????????f.write(good_cp?+?““)
????????f.write(common_eval?+?“:“)
????????f.write(common_cp?+?““)
????????f.write(bad_eval?+?“:“)
????????f.write(bad_cp)
????????f

評論

共有 條評論