91av视频/亚洲h视频/操亚洲美女/外国一级黄色毛片 - 国产三级三级三级三级

  • 大小: 5KB
    文件類型: .py
    金幣: 1
    下載: 0 次
    發布日期: 2021-06-02
  • 語言: Python
  • 標簽: 豆瓣 爬蟲

資源簡介

使用 Python 實現的網站爬蟲,找出評分最高的前 100 部電影。

資源截圖

代碼片段和文件信息

# -*- coding: UTF-8 -*-

import?sys
import?time
import?urllib
import?urllib2
# import requests
import?numpy?as?np
from?bs4?import?BeautifulSoup
from?openpyxl?import?Workbook

# Python 2 only: make UTF-8 the process-wide default codec for implicit
# str <-> unicode conversions. Later interpreters have no `reload` builtin
# and no `sys.setdefaultencoding`, so the step is skipped there.
if sys.version_info[0] == 2:
    # site.py removes sys.setdefaultencoding at startup; reloading the
    # module makes the function available again.
    reload(sys)
    sys.setdefaultencoding('utf8')



# Pool of browser-like request headers. book_spider picks one per request
# with hds[page_num % len(hds)], so consecutive pages are fetched under
# different User-Agent strings.
hds = [
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
]


def?book_spider(book_tag):
????page_num=0;
????book_list=[]
????try_times=0
????
????while(1):
????????#url=‘http://www.douban.com/tag/%E5%B0%8F%E8%AF%B4/book?start=0‘?#?For?Test
????????url=‘http://www.douban.com/tag/‘+urllib.quote(book_tag)+‘/book?start=‘+str(page_num*15)
????????time.sleep(np.random.rand()*5)
????????
????????#Last?Version
????????try:
????????????req?=?urllib2.Request(url?headers=hds[page_num%len(hds)])
????????????source_code?=?urllib2.urlopen(req).read()
????????????plain_text=str(source_code)???
????????except?(urllib2.HTTPError?urllib2.URLError)?e:
????????????print?e
????????????continue
??
????????##Previous?Version?IP?is?easy?to?be?Forbidden
????????#source_code?=?requests.get(url)?
????????#plain_text?=?source_code.text??
????????
????????soup?=?BeautifulSoup(plain_text)
????????list_soup?=?soup.find(‘div‘?{‘class‘:?‘mod?book-list‘})
????????
????????try_times+=1;
????????if?list_soup==None?and?try_times<200:
????????????continue
????????elif?list_soup==None?or?len(list_soup)<=1:
????????????break?#?Break?when?no?informatoin?got?after?200?times?requesting
????????
????????for?book_info?in?list_soup.findAll(‘dd‘):
????????????title?=?book_info.find(‘a‘?{‘class‘:‘title‘}).string.strip()
????????????desc?=?book_info.find(‘div‘?{‘class‘:‘desc‘}).string.strip()
????????????desc_list?=?desc.split(‘/‘)
????????????book_url?=?book_info.find(‘a‘?{‘class‘:‘title‘}).get(‘href‘)
????????????
????????????try:
????????????????author_info?=?‘作者/譯者:?‘?+?‘/‘.join(desc_list[0:-3])
????????????except:
????????????????author_info?=‘作者/譯者:?暫無‘
????????????try:
????????????????pub_info?=?‘出版信息:?‘?+?‘/‘.join(desc_list[-3:])
????????????except:
????????????????pub_info?=?‘出版信息:?暫無‘
????????????try:
????????????????rating?=?book_info.find(‘span‘?{‘class‘:‘rating_nums‘}).string.strip()
????????????except:
????????????????rating=‘0.0‘
????????????try:
????????????????#people_num?=?book_info.findAll(‘s

評論

共有 條評論