資源簡介
Python 爬蟲初級學習,爬取豆瓣 Top250 電影信息
代碼片段和文件信息
import re

import requests
from bs4 import BeautifulSoup
def get_content(url, timeout=10):
    """Download a web page and return its raw body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The undecoded response body as ``bytes`` on success, or ``None`` when
        the request fails or the server answers with an error status.
    """
    # Browser identification string sent as the User-Agent header.
    user_agent = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36"
    )
    try:
        response = requests.get(
            url,
            headers={"User-Agent": user_agent},
            timeout=timeout,
        )
        # Turn 4xx/5xx answers into an exception so they take the error path.
        response.raise_for_status()
    except requests.RequestException:
        print("爬取錯誤")
        return None

    # No charset detection here: the caller receives the raw bytes, so setting
    # response.encoding would have no effect on the returned value.
    print(response.url)
    print("爬取成功!")
    return response.content
def?parser_content(htmlContent):
????#?實例化soup對象,?便于處理;
????soup?=?BeautifulSoup(htmlContent?‘html.parser‘)
????#??1).?電影信息存儲在ol標簽里面的li標簽:
????#??
????olObj?=?soup.find_all(‘ol‘?class_=‘grid_view‘)[0]
????#??2).?獲取每個電影的詳細信息?存儲在li標簽;
????details?=?olObj.find_all(‘li‘)
????for?detail?in?details:
????????#?3).?獲取電影名稱;
????????movi
- 上一篇:FastAPI入門級
- 下一篇:從圖像數據集讀取圖片并拼接成大圖
評論
共有 條評論