91av视频/亚洲h视频/操亚洲美女/外国一级黄色毛片 - 国产三级三级三级三级

資源簡介

Python豆瓣爬蟲以及爬蟲爬好的圖書分類數據,文件格式為xlsx,含分類、圖書名、作者名、出版社、評分等信息

資源截圖

代碼片段和文件信息

# -*- coding: UTF-8 -*-

import sys
import time
import urllib
import urllib2
import requests
import numpy as np
from bs4 import BeautifulSoup
from openpyxl import Workbook

# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# that implicit str/unicode conversions use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf8')



# Request headers to rotate through; each entry carries a different
# browser User-Agent string.
hds = [
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
]


def book_spider(book_tag):
    """Collect book records for one Douban tag by walking its listing pages.

    Pages are requested 15 entries at a time (``start=page_num*15``), with a
    random pause of up to 5 seconds before each request and a User-Agent
    header picked from ``hds`` by page number.

    Args:
        book_tag: Tag name; it is URL-quoted and inserted into the listing URL.

    Returns:
        A list of ``[title, rating, people_num, author_info, pub_info]``
        lists, one per ``<dd>`` entry found, in page order.

    The crawl stops when 200 consecutive attempts yield no listing (either
    the request failed or the listing container was absent), or when the
    listing container has at most one child node.
    """
    page_num = 0
    book_list = []
    try_times = 0  # consecutive attempts that produced no listing

    while True:
        url = ('http://www.douban.com/tag/' + urllib.quote(book_tag)
               + '/book?start=' + str(page_num * 15))
        time.sleep(np.random.rand() * 5)

        # urllib2 with a rotating User-Agent is used here; the earlier
        # requests.get(url) variant was noted to get the IP forbidden easily.
        try:
            req = urllib2.Request(url, headers=hds[page_num % len(hds)])
            source_code = urllib2.urlopen(req).read()
            plain_text = str(source_code)
        except (urllib2.HTTPError, urllib2.URLError) as e:
            print(e)
            # Count failed requests toward the same retry budget so a
            # persistent error cannot loop forever.
            try_times += 1
            if try_times >= 200:
                break
            continue

        soup = BeautifulSoup(plain_text)
        list_soup = soup.find('div', {'class': 'mod book-list'})

        try_times += 1
        if list_soup is None and try_times < 200:
            continue
        elif list_soup is None or len(list_soup) <= 1:
            break  # Break when no information got after 200 times requesting

        for book_info in list_soup.findAll('dd'):
            title_link = book_info.find('a', {'class': 'title'})
            title = title_link.string.strip()
            book_url = title_link.get('href')

            # The description is a '/'-separated string; the last three
            # fields are treated as publication info, the rest as authors.
            desc_tag = book_info.find('div', {'class': 'desc'})
            desc = desc_tag.string if desc_tag is not None else None
            desc_list = desc.strip().split('/') if desc is not None else []

            author_parts = desc_list[0:-3]
            if author_parts:
                author_info = '作者/譯者: ' + '/'.join(author_parts)
            else:
                author_info = '作者/譯者: 暫無'

            pub_parts = desc_list[-3:]
            if pub_parts:
                pub_info = '出版信息: ' + '/'.join(pub_parts)
            else:
                pub_info = '出版信息: 暫無'

            try:
                rating = book_info.find('span', {'class': 'rating_nums'}).string.strip()
            except AttributeError:
                # No rating span, or the span has no single string.
                rating = '0.0'

            try:
                people_num = get_people_num(book_url)
                people_num = people_num.strip('人評價')
            except Exception:
                # Best effort: any failure fetching the count falls back to
                # '0', while Ctrl-C still interrupts the crawl.
                people_num = '0'

            book_list.append([title, rating, people_num, author_info, pub_info])
            try_times = 0  # set 0 when got valid information
        page_num += 1
        print('Downloading Information From Page %d' % page_num)
    return book_list


def?get_people_num(url):
????#url=‘http://book.douban.com/subj

 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2019-04-24 17:40  douban-spider-and-statics\
     文件      205376  2018-12-07 13:29  douban-spider-and-statics\book_list-個人管理-時間管理-投資-文化-宗教.xlsx
     文件     4149089  2018-12-07 13:29  douban-spider-and-statics\book_list-傳記-哲學-編程-創業-理財-社會學-佛教.xlsx
     文件       40548  2018-12-07 13:29  douban-spider-and-statics\book_list-名著.xlsx
     文件      107861  2018-12-07 13:29  douban-spider-and-statics\book_list-商業-理財-管理.xlsx
     文件     5495727  2018-12-07 13:29  douban-spider-and-statics\book_list-心理-判斷與決策-算法-數據結構-經濟-歷史.xlsx
     文件     1833209  2018-12-07 13:29  douban-spider-and-statics\book_list-思想-科技-科學-web-股票-愛情-兩性.xlsx
     文件      379429  2018-12-07 13:29  douban-spider-and-statics\book_list-攝影-設計-音樂-旅行-教育-成長-情感-育兒-健康-養生.xlsx
     文件       55049  2018-12-07 13:29  douban-spider-and-statics\book_list-數學.xlsx
     文件      125082  2018-12-07 13:29  douban-spider-and-statics\book_list-科幻-思維-金融.xlsx
     文件      188029  2018-12-07 13:29  douban-spider-and-statics\book_list-科普-經典-生活-心靈-文學.xlsx
     文件      890982  2018-12-07 13:29  douban-spider-and-statics\book_list-計算機-機器學習-linux-android-數據庫-互聯網.xlsx
     文件        5331  2018-12-07 13:29  douban-spider-and-statics\doubanSpider.py

評論

共有 條評論