資源簡介
Python程序,根據關鍵字爬取谷歌、百度、必應上的相關圖片

代碼片段和文件信息
import itertools
import json
import os
import re
import sys
import urllib
import urllib.parse

import requests
# Root directory under which the per-keyword download folders are created.
download_path = "dataset/"
# Multi-character tokens that stand in for URL punctuation in an encoded URL.
str_table = {
    '_z2C$q': ':',
    '_z&e3B': '.',
    'AzdH3F': '/',
}

# Single-character substitution table (encoded character -> plain character).
char_table = {
    'w': 'a',
    'k': 'b',
    'v': 'c',
    '1': 'd',
    'j': 'e',
    'u': 'f',
    '2': 'g',
    'i': 'h',
    't': 'i',
    '3': 'j',
    'h': 'k',
    's': 'l',
    '4': 'm',
    'g': 'n',
    '5': 'o',
    'r': 'p',
    'q': 'q',
    '6': 'r',
    'f': 's',
    'p': 't',
    '7': 'u',
    'e': 'v',
    'o': 'w',
    '8': '1',
    'd': '2',
    'n': '3',
    '9': '4',
    'c': '5',
    'm': '6',
    '0': '7',
    'b': '8',
    'l': '9',
    'a': '0',
}

# str.translate() needs the integer Unicode code point of a single character
# as the key. Integer values are likewise interpreted as code points and turned
# back into characters (plain strings would also be accepted as values).
char_table = {ord(key): ord(value) for key, value in char_table.items()}


# Decode an image URL
def decode(url):
    """Return the plain-text form of an encoded image URL.

    Args:
        url: The encoded URL string.

    Returns:
        The decoded string. Characters that appear in neither table are
        left unchanged.
    """
    # Replace the multi-character tokens first: their own characters
    # (e.g. '2', 'd', '3') are also keys of char_table and would be
    # rewritten if the per-character pass ran before this one.
    for key, value in str_table.items():
        url = url.replace(key, value)
    # Then map the remaining characters one by one.
    return url.translate(char_table)
# Build the sequence of result-page URLs
def buildUrls(word):
    """Return an endless generator of search-result page URLs for ``word``.

    Args:
        word: The search keyword; it is percent-encoded before being
            placed in the query string.

    Returns:
        A generator yielding one URL per result page. The ``pn`` offset
        starts at 0 and advances by 60, matching the ``rn=60`` page size
        in the query string. The generator never ends on its own, so the
        caller decides when to stop consuming it.
    """
    word = urllib.parse.quote(word)
    # ``istype=2`` and ``nc=1`` are separate parameters; the ``&`` between
    # them was missing, which fused them into one malformed value.
    url = (
        "http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj"
        "&ct=201326592&fp=result&queryWord={word}&cl=2&lm=-1&ie=utf-8"
        "&oe=utf-8&st=-1&ic=0&word={word}&face=0&istype=2&nc=1"
        "&pn={pn}&rn=60"
    )
    return (url.format(word=word, pn=x) for x in itertools.count(start=0, step=60))
# Pull the image URLs out of the JSON response text
re_url = re.compile(r'"objURL":"(.*?)"')


def resolveImgUrl(html):
    """Return the decoded image URLs found in a result page.

    Args:
        html: The response body as text.

    Returns:
        A list with one decoded URL for every ``"objURL":"..."`` entry,
        in the order they occur; empty when there is no such entry.
    """
    # The text is scanned with a regular expression rather than parsed,
    # so only the objURL values are looked at.
    return [decode(x) for x in re_url.findall(html)]
def downImg(imgUrl, dirpath, imgName):
    """Download one image and save it as ``dirpath/imgName``.

    Args:
        imgUrl: URL of the image to fetch.
        dirpath: Directory the file is written into.
        imgName: File name to use inside ``dirpath``.

    Returns:
        True when the image was fetched and written; False when the
        request failed or the server answered with an error status.
    """
    filename = os.path.join(dirpath, imgName)
    try:
        res = requests.get(imgUrl, timeout=15)
    except requests.RequestException as e:
        print(" This is Exception:", imgUrl)
        print(e)
        return False
    # Reject every error status, not only 4xx: a 5xx body is an error page
    # and must not be stored as if it were the image.
    if res.status_code >= 400:
        print(str(res.status_code), ":", imgUrl)
        return False
    with open(filename, "wb") as f:
        f.write(res.content)
    return True
def mkDir(dirName):
    """Create the download directory for ``dirName`` if needed and return it.

    Args:
        dirName: Name of the sub-directory to create under ``download_path``.

    Returns:
        The path ``download_path/dirName``.
    """
    dirpath = os.path.join(download_path, dirName)
    # makedirs also creates download_path itself when it does not exist yet,
    # and exist_ok removes the gap between checking and creating.
    os.makedirs(dirpath, exist_ok=True)
    return dirpath
def crawl_data(urls, max_images=10):
    """Fetch result pages in turn and download the images they reference.

    Crawling stops when a page yields no image URLs or once
    ``max_images`` images have been saved, whichever comes first.

    Args:
        urls: Iterable of result-page URLs to request.
        max_images: Number of successfully saved images after which the
            crawl stops. Defaults to 10.
    """
    # NOTE(review): ``dirpath`` is not a parameter; it is read from module
    # scope and is not assigned anywhere in the visible code. Presumably the
    # entry point sets it before calling this function. Confirm.
    index = 0
    for url in urls:
        print("requesting:", url)
        # Undecodable bytes are replaced instead of aborting the whole crawl.
        html = requests.get(url, timeout=10).content.decode('utf-8', errors='replace')
        imgUrls = resolveImgUrl(html)
        if not imgUrls:  # no images on this page: stop
            break
        # A separate loop name keeps the page URL from being overwritten.
        for imgUrl in imgUrls:
            if downImg(imgUrl, dirpath, str(index) + ".jpg"):
                index += 1
                print("Downloaded %s picture" % index)
                if index >= max_images:  # download limit reached
                    return
    return
if __name__ == '__main__':
    # Interactive entry point: show a banner and read the search keyword.
    print("Welcome !!! \n Now it only one word")
    print("Download in results")
    print("=" * 50)
    word = input("Please input your word:\n")
    # NOTE(review): the snippet is cut off here; whatever the script does
    # with ``word`` after this point is not visible in this excerpt.
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件        404  2018-11-19 17:42  spider_images\.idea\inspectionProfiles\Project_Default.xm
     文件        204  2018-11-15 15:43  spider_images\.idea\misc.xm
     文件        273  2018-11-14 18:29  spider_images\.idea\modules.xm
     文件      24723  2018-11-19 18:26  spider_images\.idea\workspace.xm
     文件      16883  2018-11-19 12:49  spider_images\download.log
     文件       3576  2018-11-19 18:14  spider_images\down_baidu_img.py
     文件       2691  2018-11-19 18:14  spider_images\down_bing_img.py
     文件       3054  2018-11-19 18:22  spider_images\down_google_img.py
     文件     399336  2018-11-19 14:24  spider_images\Firefox-latest.exe
     文件   16976488  2018-10-04 13:18  spider_images\geckodriver.exe
     文件       8497  2018-11-19 18:23  spider_images\geckodriver.log
     文件        410  2018-11-15 15:43  spider_images\spider_images.iml
     文件         55  2018-11-15 15:39  spider_images\venv\Lib\site-packages\easy-install.pth
     文件          1  2018-11-15 15:39  spider_images\venv\Lib\site-packages\pip-10.0.1-py3.7.egg\EGG-INFO\dependency_li
     文件         98  2018-11-15 15:39  spider_images\venv\Lib\site-packages\pip-10.0.1-py3.7.egg\EGG-INFO\entry_points.txt
     文件          2  2018-11-15 15:39  spider_images\venv\Lib\site-packages\pip-10.0.1-py3.7.egg\EGG-INFO\not-zip-safe
     文件       2972  2018-11-15 15:39  spider_images\venv\Lib\site-packages\pip-10.0.1-py3.7.egg\EGG-INFO\PKG-INFO
     文件         74  2018-11-15 15:39  spider_images\venv\Lib\site-packages\pip-10.0.1-py3.7.egg\EGG-INFO\requires.txt
     文件      12502  2018-11-15 15:39  spider_images\venv\Lib\site-packages\pip-10.0.1-py3.7.egg\EGG-INFO\SOURCES.txt
     文件          4  2018-11-15 15:39  spider_images\venv\Lib\site-packages\pip-10.0.1-py3.7.egg\EGG-INFO\top_level.txt
     文件      14014  2018-11-15 15:39  spider_images\venv\Lib\site-packages\pip-10.0.1-py3.7.egg\pip\_internal\ba
     文件       8764  2018-11-15 15:39  spider_images\venv\Lib\site-packages\pip-10.0.1-py3.7.egg\pip\_internal\ba
     文件       2773  2018-11-15 15:39  spider_images\venv\Lib\site-packages\pip-10.0.1-py3.7.egg\pip\_internal\build_env.py
     文件       7023  2018-11-15 15:39  spider_images\venv\Lib\site-packages\pip-10.0.1-py3.7.egg\pip\_internal\cache.py
     文件      16679  2018-11-15 15:39  spider_images\venv\Lib\site-packages\pip-10.0.1-py3.7.egg\pip\_internal\cmdoptions.py
     文件       1500  2018-11-15 15:39  spider_images\venv\Lib\site-packages\pip-10.0.1-py3.7.egg\pip\_internal\commands\check.py
     文件       3018  2018-11-15 15:39  spider_images\venv\Lib\site-packages\pip-10.0.1-py3.7.egg\pip\_internal\commands\completion.py
     文件       7343  2018-11-15 15:39  spider_images\venv\Lib\site-packages\pip-10.0.1-py3.7.egg\pip\_internal\commands\configuration.py
     文件       9092  2018-11-15 15:39  spider_images\venv\Lib\site-packages\pip-10.0.1-py3.7.egg\pip\_internal\commands\download.py
     文件       3320  2018-11-15 15:39  spider_images\venv\Lib\site-packages\pip-10.0.1-py3.7.egg\pip\_internal\commands\freeze.py
............此處省略387個文件信息
評論
共有 條評論