資源簡介
使用 Python 實現的網站爬蟲，用於找出評分最高的前 100 部電影。
代碼片段和文件信息
# -*- coding: UTF-8 -*-
import?sys
import?time
import?urllib
import?urllib2
# import requests
import?numpy?as?np
from?bs4?import?BeautifulSoup
from?openpyxl?import?Workbook
# Python 2 only: the interpreter's startup removes sys.setdefaultencoding,
# so the sys module is reloaded to get it back, and the implicit
# str/unicode conversion codec is then switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')
# Pool of browser User-Agent request headers. Each entry is a headers dict
# that can be passed directly to a request; the spider picks one by indexing
# with the page number modulo the pool size.
hds = [
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
]
def?book_spider(book_tag):
????page_num=0;
????book_list=[]
????try_times=0
????
????while(1):
????????#url=‘http://www.douban.com/tag/%E5%B0%8F%E8%AF%B4/book?start=0‘?#?For?Test
????????url=‘http://www.douban.com/tag/‘+urllib.quote(book_tag)+‘/book?start=‘+str(page_num*15)
????????time.sleep(np.random.rand()*5)
????????
????????#Last?Version
????????try:
????????????req?=?urllib2.Request(url?headers=hds[page_num%len(hds)])
????????????source_code?=?urllib2.urlopen(req).read()
????????????plain_text=str(source_code)???
????????except?(urllib2.HTTPError?urllib2.URLError)?e:
????????????print?e
????????????continue
??
????????##Previous?Version?IP?is?easy?to?be?Forbidden
????????#source_code?=?requests.get(url)?
????????#plain_text?=?source_code.text??
????????
????????soup?=?BeautifulSoup(plain_text)
????????list_soup?=?soup.find(‘div‘?{‘class‘:?‘mod?book-list‘})
????????
????????try_times+=1;
????????if?list_soup==None?and?try_times<200:
????????????continue
????????elif?list_soup==None?or?len(list_soup)<=1:
????????????break?#?Break?when?no?informatoin?got?after?200?times?requesting
????????
????????for?book_info?in?list_soup.findAll(‘dd‘):
????????????title?=?book_info.find(‘a‘?{‘class‘:‘title‘}).string.strip()
????????????desc?=?book_info.find(‘div‘?{‘class‘:‘desc‘}).string.strip()
????????????desc_list?=?desc.split(‘/‘)
????????????book_url?=?book_info.find(‘a‘?{‘class‘:‘title‘}).get(‘href‘)
????????????
????????????try:
????????????????author_info?=?‘作者/譯者:?‘?+?‘/‘.join(desc_list[0:-3])
????????????except:
????????????????author_info?=‘作者/譯者:?暫無‘
????????????try:
????????????????pub_info?=?‘出版信息:?‘?+?‘/‘.join(desc_list[-3:])
????????????except:
????????????????pub_info?=?‘出版信息:?暫無‘
????????????try:
????????????????rating?=?book_info.find(‘span‘?{‘class‘:‘rating_nums‘}).string.strip()
????????????except:
????????????????rating=‘0.0‘
????????????try:
????????????????#people_num?=?book_info.findAll(‘s
評論
共有 條評論