資源簡介
python3實現爬取中國天氣網天氣并寫入csv,若有疑問可參照對應博客或詢問

代碼片段和文件信息
import queue
import re
import time
import urllib.request
import urllib.robotparser
from datetime import datetime
from urllib.parse import urljoin, urlparse, urlsplit
def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1,
                 headers=None, user_agent='wswp', proxy=None, num_retries=1,
                 scrape_callback=None):
    """Crawl from the given seed URL, following links matched by link_regex.

    Args:
        seed_url: URL to start crawling from.
        link_regex: regex that discovered links must match to be queued.
        delay: minimum seconds between requests to the same domain.
        max_depth: stop following links found at this depth (-1 = unlimited).
        max_urls: stop after downloading this many URLs (-1 = unlimited).
        headers: extra HTTP headers to send with each request.
        user_agent: User-agent header value, also checked against robots.txt.
        proxy: optional proxy passed through to download().
        num_retries: download retry count passed through to download().
        scrape_callback: optional callable(url, html) returning extra links.
    """
    # Queue of URLs still to be crawled (pop() from the end -> depth-first).
    crawl_queue = [seed_url]
    # URLs already seen, mapped to the depth at which they were discovered.
    seen = {seed_url: 0}
    # How many URLs have been downloaded so far.
    num_urls = 0
    rp = get_robots(seed_url)
    throttle = Throttle(delay)
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent
    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        # Honour the robots.txt restrictions for this URL.
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url, headers, proxy=proxy, num_retries=num_retries)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])
            # Only follow links if the maximum depth has not been reached.
            if depth != max_depth:
                if link_regex:
                    # Keep only the links that match the supplied regex.
                    links.extend(link for link in get_links(html)
                                 if re.match(link_regex, link))
                for link in links:
                    link = normalize(seed_url, link)
                    # Skip links that were already queued or visited.
                    if link not in seen:
                        seen[link] = depth + 1
                        # Only enqueue links within the seed URL's domain.
                        if same_domain(seed_url, link):
                            crawl_queue.append(link)
            # Stop once the download budget has been used up.
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print('Blocked by robots.txt:', url)
# Throttling
class Throttle:
    """Throttle downloads by sleeping between requests to the same domain."""

    def __init__(self, delay):
        # Minimum number of seconds between downloads of the same domain.
        self.delay = delay
        # Timestamp of the last access, keyed by domain.
        self.domains = {}

    def wait(self, url):
        """Sleep if this URL's domain was accessed too recently."""
        domain = urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            # total_seconds() instead of .seconds: .seconds truncates to
            # whole seconds and ignores the days component of the timedelta,
            # so sub-second delays were rounded away and day-old accesses
            # could trigger a spurious sleep.
            elapsed = (datetime.now() - last_accessed).total_seconds()
            sleep_secs = self.delay - elapsed
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()
#下載網址
def?download(url?headers?proxy?num_retries?data=None):
????print?(‘Downloading:‘?url)
????request?=?urllib.request.Request(url?data?headers)
????opener?=?urllib.request.build_opener()
????if?proxy:
????????proxy_params?=?{urlparse.urlparse(url).scheme:?proxy}
????????opener.add_handler(urllib.request.ProxyHandler(proxy_params))
????try:
????????response?=?opener.open(request)
????????html?=?response.read()
????????code?=?response.code
 屬性            大小     日期    時間   名稱
-----------  ---------  ---------- -----  ----
     文件       5333  2018-11-09 20:23  weather_reptile\li
     文件       1162  2018-11-09 20:23  weather_reptile\scrape_callback.py
     目錄          0  2018-11-11 19:44  weather_reptile
-----------  ---------  ---------- -----  ----
             6495                        3
- 上一篇:nulindai.py
- 下一篇:超限學習機—邏輯回歸Python代碼
評論
共有 條評論