資源簡介
用Python寫網絡爬蟲PDF&源碼
代碼片段和文件信息
# -*- coding: utf-8 -*-
import urllib2
import urlparse
def download1(url):
    """Simple downloader.

    Fetch ``url`` with ``urllib2.urlopen`` and return the response body
    exactly as read. No error handling is done here: any exception raised
    while opening or reading the URL propagates to the caller.
    """
    return urllib2.urlopen(url).read()
def download2(url):
    """Download function that catches errors.

    Prints a progress line, then fetches ``url``. If ``urllib2.URLError``
    is raised, the error reason is printed and ``None`` is returned instead
    of the page body. Other exception types are not caught.

    Returns:
        The response body on success, or ``None`` after a ``URLError``.
    """
    # Single-argument print(...) produces the same output as the Python 2
    # statement form "print 'Downloading:', url".
    print('Downloading: %s' % (url,))
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print('Download error: %s' % (e.reason,))
        html = None
    return html
def download3(url, num_retries=2):
    """Download function that also retries 5XX errors.

    Args:
        url: Address to fetch.
        num_retries: How many more attempts are allowed when the failure
            carries an HTTP status code in the 500-599 range.

    Returns:
        The response body on success, or ``None`` if the download failed
        and was not (or could no longer be) retried.
    """
    print('Downloading: %s' % (url,))
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print('Download error: %s' % (e.reason,))
        html = None
        if num_retries > 0:
            # Only errors that expose a status code (HTTP errors) in the
            # 5XX range are retried; everything else gives up at once.
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download3(url, num_retries - 1)
    return html
def download4(url, user_agent='wswp', num_retries=2):
    """Download function that includes user agent support.

    Args:
        url: Address to fetch.
        user_agent: Value sent in the ``User-agent`` request header.
        num_retries: How many more attempts are allowed when the failure
            carries an HTTP status code in the 500-599 range.

    Returns:
        The response body on success, or ``None`` if the download failed
        and was not (or could no longer be) retried.
    """
    print('Downloading: %s' % (url,))
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print('Download error: %s' % (e.reason,))
        html = None
        if num_retries > 0:
            # Only errors that expose a status code (HTTP errors) in the
            # 5XX range are retried; everything else gives up at once.
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download4(url, user_agent, num_retries - 1)
    return html
def download5(url, user_agent='wswp', proxy=None, num_retries=2):
    """Download function with support for proxies.

    Args:
        url: Address to fetch.
        user_agent: Value sent in the ``User-agent`` request header.
        proxy: Optional proxy address. When given, it is registered for the
            URL's own scheme only (e.g. ``http``) via ``ProxyHandler``.
        num_retries: How many more attempts are allowed when the failure
            carries an HTTP status code in the 500-599 range.

    Returns:
        The response body on success, or ``None`` if the download failed
        and was not (or could no longer be) retried.
    """
    print('Downloading: %s' % (url,))
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    opener = urllib2.build_opener()
    if proxy:
        # Map the scheme of the requested URL to the proxy address.
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read()
    except urllib2.URLError as e:
        print('Download error: %s' % (e.reason,))
        html = None
        if num_retries > 0:
            # Only errors that expose a status code (HTTP errors) in the
            # 5XX range are retried; everything else gives up at once.
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download5(url, user_agent, proxy, num_retries - 1)
    return html
# Public alias: callers use ``download`` and get the most complete variant.
download = download5


if __name__ == '__main__':
    # Script entry point: fetch the example site and print whatever
    # download() returned.
    print(download('http://example.webscraping.com'))
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件        174  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\.hg_archival.txt
     文件       2364  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter01\common.py
     文件        553  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter01\iteration_crawler1.py
     文件        846  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter01\iteration_crawler2.py
     文件        931  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter01\li
     文件       1149  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter01\li
     文件       4649  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter01\li
     文件        445  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter01\sitemap_crawler.py
     文件        554  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter02\bs_example.py
     文件        462  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter02\common.py
     文件       4816  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter02\li
     文件        371  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter02\lxm
     文件       2293  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter02\performance.py
     文件        333  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter02\regex_example.py
     文件        700  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter02\scrape_callback1.py
     文件        940  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter02\scrape_callback2.py
     文件       3686  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter03\disk_cache.py
     文件       3230  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter03\downloader.py
     文件       3183  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter03\li
     文件       2356  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter03\mongo_cache.py
     文件        818  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter04\alexa_cb.py
     文件        564  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter04\alexa_fn.py
     文件       3026  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter04\mongo_queue.py
     文件       2736  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter04\process_crawler.py
     文件        471  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter04\process_test.py
     文件        375  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter04\sequential_test.py
     文件       2491  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter04\threaded_crawler.py
     文件        475  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter04\threaded_test.py
     文件       2747  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter05\browser_render.py
     文件       1101  2015-09-28 13:29  用Python寫網絡爬蟲PDF&源碼\用Python寫爬蟲-源碼\chapter05\search1.py
............此處省略156個文件信息
評論
共有 條評論