91av视频/亚洲h视频/操亚洲美女/外国一级黄色毛片 - 国产三级三级三级三级

資源簡介

抓取CSDN博客文章的簡單爬蟲python源碼

資源截圖

代碼片段和文件信息

# coding=utf-8
import sys

# Python 2 only: force the process-wide default string encoding to UTF-8.
# reload(sys) is needed because site.py removes sys.setdefaultencoding
# after interpreter start-up.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

import urllib2
import re
import random
import time
from bs4 import BeautifulSoup
import leancloud
# NOTE(review): the lowercase name ``object`` is kept exactly as found; the
# leancloud SDK normally exposes ``Object`` -- confirm this is not an
# extraction artifact.
from leancloud import object
from leancloud import LeanCloudError

# When a "next article" link is detected, its URL is parked here; once the
# traversal of the "previous article" chain has finished, the crawler comes
# back and walks the "next article" chain from it.
WAIT_URL = None
SEARCH_TYPE = 1  # 1 means "next article"


class Get_First_Url:
    """Download a CSDN blog home page and find the URL of its first article.

    Constructing an instance performs the HTTP request immediately. After
    construction the instance exposes:

    * ``url``      -- the home page address that was passed in,
    * ``page``     -- the response body decoded as UTF-8,
    * ``type``     -- 1 or 2, depending on which page layout matched
                      (set by :meth:`getFirstUrl`),
    * ``beginurl`` -- the value returned by :meth:`getFirstUrl`.
    """

    # Candidate User-Agent values; one is chosen at random per request.
    # Each entry must be followed by a comma: without it, adjacent string
    # literals are concatenated into a single element.
    _USER_AGENTS = [
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
        "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
        "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
    ]

    def __init__(self, url2):
        """Fetch the blog home page at ``url2`` and resolve ``beginurl``.

        Errors raised by ``urllib2.urlopen`` or by the UTF-8 decode are not
        caught here and propagate to the caller.
        """
        self.url = url2
        print('\n')
        print('開始獲取第一篇博客地址')
        print('博客主頁地址: ' + self.url)

        req = urllib2.Request(self.url)
        req.add_header('User-Agent', random.choice(self._USER_AGENTS))
        req.add_header('Host', 'blog.csdn.net')
        req.add_header('Accept', '*/*')
        req.add_header('Referer', 'http://blog.csdn.net/mangoer_ys?viewmode=list')
        # The original read an undefined local name ``url`` here; the page
        # being requested is self.url. The header itself is kept so the
        # outgoing request stays the same as before.
        req.add_header('GET', self.url)

        response = urllib2.urlopen(req)
        try:
            self.page = response.read().decode('utf-8')
        finally:
            # Release the connection even if reading or decoding fails.
            response.close()

        self.beginurl = self.getFirstUrl()

    # Get the first article listed on the blog home page.
    def getFirstUrl(self):
        """Return the absolute URL of the first article, or ``"nourl"``.

        Two page layouts are tried in order: a ``<span class="link_title">``
        (``self.type`` becomes 1) and, failing that, an
        ``<h3 class="list_c_t">`` (``self.type`` becomes 2). ``self.type`` is
        left at 2 when neither layout matches.
        """
        # Name the parser explicitly so the parse result does not depend on
        # which optional parser libraries happen to be installed.
        soup = BeautifulSoup(self.page, 'html.parser')

        self.type = 1
        title_node = soup.find('span', class_='link_title')
        if title_node is None:
            title_node = soup.find('h3', class_='list_c_t')  # a different blog theme
            self.type = 2
            if title_node is None:
                return "nourl"

        try:
            return 'http://blog.csdn.net' + title_node.a['href']
        except (AttributeError, KeyError, TypeError):
            # No <a> child (None is not subscriptable) or no href attribute.
            return "nourl"


class?CSDN_Blog_Spider:
????def?__init__(self?url2?type):
????????self.url?=?url2
????????self.type?=?type
????????if?type?==?4:
????????????global?WAIT_URL
????????????WAIT_URL?=?url2
????????????print?‘已記錄待爬下一篇地址‘?+?url2
????????????print(‘正在爬取網頁地址:?‘?+?self.url)
????????else:
????????????print(‘正在爬取網頁地址:?‘?+?self.url)
????????user_agents?=?[
????????????‘Mozilla/5.0?(Windows;?U;?Windows?NT?5.1;?it;?rv:1.8.1.11)?Gecko/20071127?Firefox/2.0.0.11‘
????????

?屬性????????????大小?????日期????時間???名稱
-----------?---------??----------?-----??----
?????文件???????14565??2016-06-25?00:04??test.py

評論

共有 條評論