資源簡介
代碼片段和文件信息
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re
from collections import deque
from urllib.parse import urlsplit

import requests.exceptions
from bs4 import BeautifulSoup
# Queue of URLs that still need to be crawled, seeded with the start page.
new_urls = deque(['https://www.baidu.com/'])
# Set of URLs that have already been crawled.
processed_urls = set()
# Starts empty; presumably collects e-mail addresses found while crawling --
# the code that fills it is not visible in this excerpt (TODO confirm).
emails = set()
#?一個一個地處理url,直到我們耗盡隊列
while?len(new_urls):
????#?將下一個url從隊列移動到處理的url集合
????url?=?new_urls.popleft()
????processed_urls.add(url)
????#?提取基本url以解析相對鏈接
????parts?=?urlsplit(url)
????base_url?=?“{0.scheme}://{0.netloc}“.format(parts)
????path?=?url[:url.rfind(‘/‘)+1]?if?‘/‘?in?parts.path?else?url
????#?獲取url的內容
????print(“Processing?%s“?%?url)
????try:
????????response?=?requests.get(url)
????except?(requests.exceptions.MissingSc
- 上一篇:Python貪吃蛇游戲
- 下一篇:scrapy抓取安居客數據
評論
共有 條評論