資源簡介
使用 Python + Selenium + PyQuery 編寫的爬蟲:爬取 1688(阿里巴巴)商品詳情頁的圖片與標題,下載圖片並進行壓縮。僅供學習交流使用。
代碼片段和文件信息
#?-*-?coding:?utf-8?-*
import?datetime
import?os
import?random
import?re
import?time
import?Image
import?requests
from?selenium?import?webdriver
from?pyquery?import?PyQuery?as?pq
from?selenium.webdriver?import?ActionChains
from?selenium.webdriver.common?import?keys
# Path template for one scraped product.  The "%s" placeholder is left
# unfilled here (presumably substituted per product by the caller -- TODO
# confirm against the rest of the script).
parentPath = "/Users/niubilea/Documents/ag/ali_goods/%s/"
# Templates derived from parentPath: raw downloads and compressed copies.
downloadPath = parentPath + "download"
compressPath = parentPath + "compress"
def bluePrint(str):
    """Print ``str`` wrapped in the ANSI codes for bold blue text.

    The parameter keeps its original name (which shadows the builtin inside
    this function) so existing keyword callers keep working.  The value is
    formatted with ``%s`` rather than concatenated, so non-string values are
    printed instead of raising ``TypeError``.
    """
    # "\033[1;34m" switches to bold blue; "\033[0m" resets the terminal.
    print("\033[1;34m%s\033[0m" % (str,))
def redPrint(str):
    """Print ``str`` in bold red on a black background using ANSI codes.

    The parameter keeps its original name (which shadows the builtin inside
    this function) so existing keyword callers keep working.  The value is
    formatted with ``%s`` rather than concatenated, so non-string values are
    printed instead of raising ``TypeError``.
    """
    # The literal starts with a single space, as in the original output;
    # "\033[1;31;40m" selects bold red on black and "\033[0m" resets.
    print(" \033[1;31;40m%s\033[0m" % (str,))
def openUrl(url):
    """Open ``url`` in Chrome, scroll down the page, and return its HTML.

    The page is scrolled in 130 steps with a random pause (under one second)
    between steps, followed by a fixed 3 second wait, before the page source
    is read.

    Returns:
        A ``(browser, page_source)`` tuple.  The browser is handed back still
        open, so the caller is responsible for closing it.

    Raises:
        Whatever Selenium raises while loading or scrolling; in that case the
        browser is quit first so no Chrome process is left behind.
    """
    # NOTE(review): the driver path is passed positionally, as in the
    # original; confirm this matches the installed Selenium version.
    browser = webdriver.Chrome("./chromedriver_mac_64")
    try:
        browser.get(url)
        top = 1000
        distance = 100
        for i in range(130):
            print(i)
            # The offset grows by i * distance each step, so the jumps get
            # progressively larger as the loop advances.
            top = top + i * distance
            js = "var q=document.documentElement.scrollTop=" + str(top)
            browser.execute_script(js)
            time.sleep(random.random())
        time.sleep(3)
        page_source = browser.page_source
    except Exception:
        # Do not leak the browser when navigation or scrolling fails.
        browser.quit()
        raise
    return browser, page_source
def getPageHtml(pageUrl, encoding="gbk"):
    """Fetch ``pageUrl`` with ``requests`` and return the decoded body text.

    Args:
        pageUrl: URL of the page to download.
        encoding: Encoding used to decode the response body.  Defaults to
            ``"gbk"``, the value the original code hard-coded; pass
            ``"utf-8"`` for pages served in UTF-8.

    Returns:
        The response body as text.

    Raises:
        Whatever ``requests.get`` raises (the request uses a 30 second
        timeout).  The HTTP status code is not checked.
    """
    print("開始獲取html內容")
    headers = {
        'Content-type': 'text/html',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100201 Firefox/22.0',
    }
    content = requests.get(pageUrl, timeout=30, headers=headers)
    # Force the decoding instead of relying on what requests guesses from
    # the response headers.
    content.encoding = encoding
    htmlsub = content.text
    print("獲取內容完成")
    return htmlsub
def create__file(file_path, msg):
    """Create (or overwrite) the file at ``file_path`` and write ``msg`` to it.

    Args:
        file_path: Path of the file to create.
        msg: Content to write into the file.
    """
    # The original wrote ``f.close`` without parentheses, so the file was
    # never explicitly closed; the context manager closes it even on error.
    with open(file_path, "w") as f:
        f.write(msg)
def _load_urlretrieve():
    """Return ``urlretrieve`` from its Python 3 or Python 2 location."""
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve
    return urlretrieve


def _absolute_image_url(url):
    """Return ``url`` unchanged if it has a scheme, else prefix ``https:``."""
    # The original tested for "https" anywhere in the string, which turned
    # "http://..." addresses into the invalid "https:http://...".
    if url.startswith(("http://", "https://")):
        return url
    return "https:" + url


def _download_images(urls, title, downloadFolder, delay):
    """Save each URL as ``<downloadFolder>/<title><n>.jpg`` (n from 1)."""
    urlretrieve = _load_urlretrieve()
    for index, url in enumerate(urls, 1):
        time.sleep(delay)
        targetImgPath = downloadFolder + '/%s.jpg' % (title + str(index))
        print(url + "\r\n")
        try:
            urlretrieve(_absolute_image_url(url), targetImgPath)
        except Exception as e:
            # Best effort: report the failure and continue with the next image.
            print(e)


def download_banner(title, content, downloadFolder, compressFolder):
    """Download the ``https://cbu01...jpg`` images found in ``content``.

    Images are saved as ``<downloadFolder>/<title><n>.jpg`` with a 0.3 second
    pause before each download, then ``tinypng`` is called on the two folders.

    Args:
        title: Base name for the saved files; a running number is appended.
        content: HTML text to search for image URLs.
        downloadFolder: Folder the images are written to.
        compressFolder: Folder passed to ``tinypng`` as the destination.
    """
    # The original pattern captured the closing quote as part of the URL and
    # used a greedy ".*" that could run across several tags on one line.
    pic_url = re.findall(r'src="(https://cbu01[^"]*jpg)"', content)
    _download_images(pic_url, title, downloadFolder, 0.3)
    tinypng(downloadFolder, compressFolder)


def download_content(title, content, downloadFolder, compressFolder):
    """Download every ``img src="..."`` image found in ``content``.

    Images are saved as ``<downloadFolder>/<title><n>.jpg`` with a 0.03 second
    pause before each download, then ``tinypng`` is called on the two folders.

    Args:
        title: Base name for the saved files; a running number is appended.
        content: HTML text to search for image URLs.
        downloadFolder: Folder the images are written to.
        compressFolder: Folder passed to ``tinypng`` as the destination.
    """
    pic_url = re.findall('img src="(.*?)"', content)
    _download_images(pic_url, title, downloadFolder, 0.03)
    tinypng(downloadFolder, compressFolder)
def?tinypng(downloadcompress):
????#?指定要壓縮的文件夾
????srcPath?=download
????#?壓縮后文件夾
????dstPath?=?compress
????for?filename?in?os.listdir(srcPath):
????????#?如果不存在目的目錄則創建一個,保持層級結構
????????if?not?os.path.exists(dstPath):
????????????os.makedirs(dstPath)
????????#?拼接完整的文件或文
?屬性????????????大小?????日期????時間???名稱
-----------?---------??----------?-----??----
?????文件????15275600??2019-07-15?19:49??chromedriver_mac_64
?????文件?????8543232??2019-07-15?20:29??chromedriver.exe
?????文件????11061936??2019-07-16?10:09??chromedriver_linux64
?????文件????????5546??2019-08-17?23:12??seleniumDemo_back.py
評論
共有 條評論