資源簡介
實戰抓取瓜子二手車詳情頁文本,用etree解析,xpath定位抓取,保存為csv文件。加入了反爬內容。僅用于學習測試,請勿用于非法用途。
代碼片段和文件信息
# !/usr/bin/python
# ctrl+alt+L自動加空格格式化
# --king--
import?requests
from?lxml?import?html
import?time
# 獲取詳情頁面的函數
# HTTP request headers sent with every page fetch.
#
# 'User-Agent' presents the client as a desktop Chrome browser and 'Cookie'
# replays a captured browser session; together they are the anti-scraping
# countermeasure mentioned in the page description.
#
# NOTE(review): the cookie is a hard-coded session capture (see the
# timestamps in `preTime`); it has presumably expired and must be refreshed
# from a live browser session before use.
headers = {
    # The comma in "(KHTML, like Gecko)" is part of the standard Chrome
    # User-Agent string and is restored here.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
    # Cookie pairs are separated by "; " as in a browser-sent Cookie header.
    'Cookie': 'track_id=78612551481065472; uuid=078b88fa-951c-43c4-b101-6aa943f02548; user_city_id=103; ganji_uuid=7491503126491343275062; lg=1; cityDomain=zz; antipas=I93s6b4649487681198n608P1u; clueSourceCode=%2A%2300; sessionid=e88583b7-7f25-44bf-a7c7-325da0041169; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22pz_baidu%22%2C%22ca_n%22%3A%22pcbiaoti%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22track_id%22%3A%2278612551481065472%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22078b88fa-951c-43c4-b101-6aa943f02548%22%2C%22ca_city%22%3A%22zz%22%2C%22sessionid%22%3A%22e88583b7-7f25-44bf-a7c7-325da0041169%22%7D; preTime=%7B%22last%22%3A1589953591%2C%22this%22%3A1589897893%2C%22pre%22%3A1589897893%7D',
}
def?get_detail_urls(url):
????resp?=?requests.get(url?headers=headers)
????text?=?resp.content.decode(‘utf-8?‘)
????#?由于etree要通過html引出所以變量名盡量不要使用html否則會報錯
????htmls?=?html.etree.HTML(text)
????#?輸出是一個l.etree._Element‘>一個class類
????#?確認(rèn)上級ul標(biāo)簽的carlist?clearfix?js-top屬性是否唯一
????#?確認(rèn)后直接抓取這個ul標(biāo)簽下所有內(nèi)容
????#?利用[0]切片從列表里取出第一個l.etree._Element‘>一個class類
????ul?=?htmls.xpath(‘//ul[@class=“carlist?clearfix?js-top“]‘)[0]
????lis?=?ul.xpath(‘./li‘)
????#?輸出是由多個組成的列表所以命名為li+s=lis表示多個li
????#?[...]
????#?說明里邊包含的是多個Element元素
????#?用for循環(huán)取出里邊所有的元素
????#?建立空列表把完整的detail_url添加進(jìn)去
????detail_urls?=?[]
????for?li?in?lis:
????????detail_url?=?li.xpath(“./a/@href“)
????????detail_url?=?‘https://www.guazi.com‘?+?detail_url[0]
????????#?href[0]代表把當(dāng)前列表里的數(shù)據(jù)以字符串的形式切片出來
????????#?由于href是從li中取的
評論
共有 條評論