91av视频/亚洲h视频/操亚洲美女/外国一级黄色毛片 - 国产三级三级三级三级

  • 大小: 2KB
    文件類型: .zip
    金幣: 2
    下載: 2 次
    發布日期: 2021-10-07
  • 語言: Python
  • 標簽: python爬蟲

資源簡介

這是一個用於獲取京東商品圖片的 Python 爬蟲。檢索一個商品名稱之後,京東會返回100頁的商品介紹,每頁有60個商品,每個商品都有一張圖片,這個爬蟲爬取的就是這些圖片。

資源截圖

代碼片段和文件信息

'''March 22, 2018 Author: Zhiying Zhou'''
import os
import time
from multiprocessing import Pool
from urllib.request import urlretrieve

from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from selenium import webdriver
from selenium.common.exceptions import WebDriverException


def get_html_page(url):
    """Open ``url`` in Firefox and return the rendered page source.

    The page is scrolled to the bottom and given fixed pauses to load before
    the source is read.

    Args:
        url: Address of the page to open.

    Returns:
        The page source as a string, or ``None`` if the browser or the
        request failed.
    """
    driver = None
    try:
        driver = webdriver.Firefox()
        driver.get(url)
        time.sleep(5)
        # Scroll down to the bottom of the page.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Pause 5 seconds so the page can finish loading; without this pause
        # the captured source is incomplete.
        time.sleep(5)
        return driver.page_source
    except (RequestException, WebDriverException) as exc:
        # Report the actual error instance, not the exception class.
        print(exc)
        return None
    finally:
        # Always shut the browser down, including on the error path, so a
        # failed page does not leave a Firefox process behind.
        if driver is not None:
            driver.quit()


def parse_html_page(html):
    """Extract product image addresses from a page's HTML.

    Every ``<li class="gl-item">`` element is inspected. The ``src``
    attribute of its first ``<img>`` is used when present; otherwise the
    ``data-lazy-img`` attribute is used. Values shorter than 10 characters
    are not treated as image addresses and are skipped.

    Args:
        html: Page source as a string. ``None`` or an empty string yields an
            empty list.

    Returns:
        A list of image address strings in document order.
    """
    if not html:
        return []

    min_url_length = 10
    soup = BeautifulSoup(html, 'html5lib')

    url_items = []
    for li_tag in soup.find_all('li', class_='gl-item'):
        img = li_tag.img
        if img is None:
            # Item without an image: nothing to collect.
            continue

        src = img.get('src')
        if src is not None:
            if len(src) >= min_url_length:
                url_items.append(src)
            continue

        # No "src" attribute: fall back to "data-lazy-img".
        lazy_src = img.get('data-lazy-img')
        if lazy_src is not None and len(lazy_src) >= min_url_length:
            url_items.append(lazy_src)

    return url_items


def download(items, index, directory="/home/zhiying/圖片/jd/"):
    """Download one page's images and save them to ``directory``.

    Each entry of ``items`` is prefixed with ``"https:"`` to form the
    download address. Files are named from the 1-based page number and the
    1-based position of the image on that page. A failed download is
    reported and skipped so the remaining images are still fetched.

    Args:
        items: Image addresses without the ``https:`` scheme prefix.
        index: Zero-based page index, used in the file names.
        directory: Folder the images are written to; created if missing.
    """
    try:
        os.makedirs(directory, exist_ok=True)
    except OSError as exc:
        print("Cannot create directory {}: {}".format(directory, exc))
        return

    for position, item in enumerate(items, start=1):
        uri = "https:" + str(item)
        filename = "第" + str(index + 1) + "頁" + str(position) + ".jpg"
        path = os.path.join(directory, filename)
        try:
            urlretrieve(uri, filename=path)
        except Exception as exc:
            # Best-effort per image: report the failure and keep going.
            # Unlike a bare ``except``, this lets KeyboardInterrupt through.
            print("Failed to download {}: {}".format(uri, exc))


def main(index):
    """Fetch one search-result page and download its product images.

    Args:
        index: Zero-based page index. The ``page`` query value sent to the
            site is ``index * 2 + 1``.
    """
    # Build the search URL for this page.
    url = "https://search.jd.com/Search?keyword=男裝&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=男裝&cid2=1342&page=" + \
          str(index * 2 + 1)
    print("正在獲取第%s頁》》》" % (index + 1))
    html = get_html_page(url)
    if html is None:
        # The fetch failed; do not parse nothing and do not claim success.
        print("Failed to fetch page %s" % (index + 1))
        return
    download(parse_html_page(html), index)
    print("第%s頁獲取成功!" % (index + 1))


if __name__ == '__main__':
    # Measure elapsed wall-clock time. time.clock() was removed in
    # Python 3.8, so time.perf_counter() is used instead.
    start = time.perf_counter()
    # One task per page index; the pool is shut down when the block exits.
    with Pool() as pool:
        pool.map(main, range(100))
    print("獲取圖片成功!\n")
    print("程序運行時間為{}".format(time.perf_counter() - start))













 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2018-03-23 01:36  Spider-of-jingdong-products-images-master\
     文件         138  2018-03-23 01:36  Spider-of-jingdong-products-images-master\README.md
     文件        2479  2018-03-23 01:36  Spider-of-jingdong-products-images-master\jingdongspider(beta2).py

評論

共有 條評論