資源簡介
這是京東商品圖片獲取的Python爬蟲。當檢索一個商品名稱之後,會返回100頁的商品介紹,每頁有60個商品,每個商品都有一張圖片,這個爬蟲就是爬取這些圖片的。

代碼片段和文件信息
'''March 22, 2018 Author: Zhiying Zhou'''
import os
import time
from multiprocessing import Pool
from urllib.request import urlretrieve

from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from selenium import webdriver
def get_html_page(url):
    """Return the rendered HTML source of ``url`` using a Firefox WebDriver.

    A new Firefox session is started, the page is loaded, the window is
    scrolled to the bottom, and the page source is read after a fixed wait.

    Args:
        url: Address of the page to load.

    Returns:
        The page source reported by the driver, or ``None`` if a
        ``RequestException`` was raised while fetching.
    """
    try:
        driver = webdriver.Firefox()
        try:
            driver.get(url)
            time.sleep(5)
            # Scroll down to the bottom of the page.
            driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
            # Pause 5 seconds so the page can finish loading; without this
            # wait the captured source may be incomplete.
            time.sleep(5)
            return driver.page_source
        finally:
            # Always end the browser session, even when loading fails, so
            # that no Firefox instance is left running per failed page.
            driver.quit()
    except RequestException as exc:
        # Report the actual error instance rather than the exception class.
        print(exc)
        return None
def parse_html_page(html):
    """Extract product image URLs from a search-result page.

    The markup is parsed with the ``html5lib`` parser and every ``<li>``
    element with CSS class ``gl-item`` is inspected for its first ``<img>``.

    Selection rules per item:
      * if the image has a ``src`` attribute, it is used when it is at least
        10 characters long and the item is skipped otherwise;
      * if there is no ``src``, ``data-lazy-img`` is used when it is at least
        10 characters long;
      * items with no ``<img>`` or no usable attribute are skipped instead of
        raising.

    Args:
        html: HTML source of the page.

    Returns:
        A list of image URL strings in document order.
    """
    soup = BeautifulSoup(html, 'html5lib')
    # Collected image addresses.
    url_items = []
    for li_tag in soup.find_all('li', 'gl-item'):
        img = li_tag.img
        if img is None:
            # No image inside this product entry: nothing to collect.
            continue
        src = img.get('src')
        if src is not None:
            # Very short values are treated as placeholders, not real URLs.
            if len(src) >= 10:
                url_items.append(src)
            continue
        lazy_src = img.get('data-lazy-img')
        if lazy_src is not None and len(lazy_src) >= 10:
            url_items.append(lazy_src)
    return url_items
def download(items, index, save_dir="/home/zhiying/圖片/jd/"):
    """Download one page worth of images and save them locally.

    Each image is stored as ``第<page>頁<n>.jpg`` inside ``save_dir``, where
    ``<page>`` is ``index + 1`` and ``<n>`` is the 1-based position of the
    image in ``items``.

    Args:
        items: Iterable of image addresses. Values that do not already start
            with ``http://`` or ``https://`` get ``https:`` prepended.
        index: Zero-based page index, used only for the file name.
        save_dir: Target directory; created if it does not exist.

    Returns:
        None. Failures are reported on stdout and do not stop the remaining
        downloads.
    """
    try:
        os.makedirs(save_dir, exist_ok=True)
    except OSError as exc:
        # Without a target directory every download would fail; report once.
        print("無法創建目錄 %s: %s" % (save_dir, exc))
        return
    for i, item in enumerate(items):
        src = str(item)
        # Avoid producing "https:https://..." for already-absolute addresses.
        uri = src if src.startswith(("http://", "https://")) else "https:" + src
        path = os.path.join(save_dir, "第" + str(index + 1) + "頁" + str(i + 1) + ".jpg")
        try:
            urlretrieve(uri, filename=path)
        except Exception as exc:
            # Best effort: one failed image must not abort the page, but the
            # failure is reported instead of being silently discarded.
            print("圖片下載失敗: %s (%s)" % (uri, exc))
def main(index):
    """Fetch, parse and download the images of one search-result page.

    Args:
        index: Zero-based page index. The ``page`` query parameter sent to
            the site is ``index * 2 + 1``.

    Returns:
        None. Progress is printed to stdout.
    """
    # Build the search URL.
    url = "https://search.jd.com/Search?keyword=男裝&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=男裝&cid2=1342&page=" + \
          str(index * 2 + 1)
    print("正在獲取第%s頁》》》" % (index + 1))
    html = get_html_page(url)
    if html is None:
        # get_html_page returns None when fetching failed; parsing None
        # would raise inside the worker, so report the page and skip it.
        print("第%s頁獲取失敗!" % (index + 1))
        return
    download(parse_html_page(html), index)
    print("第%s頁獲取成功!" % (index + 1))
if __name__ == '__main__':
    # Measure elapsed wall-clock time. time.clock() was removed in
    # Python 3.8, so perf_counter() is used instead.
    start = time.perf_counter()
    # The context manager releases the worker processes once map() returns.
    with Pool() as pool:
        pool.map(main, range(100))
    print("獲取圖片成功!\n")
    print("程序運行時間為{}".format(time.perf_counter() - start))
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2018-03-23 01:36  Spider-of-jingdong-products-images-master\
     文件         138  2018-03-23 01:36  Spider-of-jingdong-products-images-master\README.md
     文件        2479  2018-03-23 01:36  Spider-of-jingdong-products-images-master\jingdongspider(beta2).py
評論
共有 條評論