資源簡介
根據豆瓣所有的電影,分析各國各地區各類別時間年份評分數量等各個參數之間的聯系,大體上進行分析,主要比較世界電影和中國,以及中國大陸和中國港臺電影之間的差別,分析各參數之間是否存在關聯性及對評分產生的影響;數據來源于豆瓣,我對評分不做主觀表現,我只對數據進行分析展示,能力偏弱,但圖像不弱。

代碼片段和文件信息
# -*- coding: utf-8 -*-
from urllib import request
from download import down_html
from parase import parase_html
from output import output_all
from url_manager import manage_url

# Douban movie tag page (cloud view).
root_url = "https://movie.douban.com/tag/?view=cloud"
class?DouBan_Spider(object):
????def?__init__(self):
????????self.category_dic?=?{}?#用來存儲所有熱門分類的名字和對應(yīng)的電影數(shù)目catename和catename_num兩個屬性
????????self.down_class?=?down_html.DownHtml()??#下載網(wǎng)頁
????????self.parase_class?=?parase_html.ParaseHtml()?#解析網(wǎng)頁
????????self.output_class?=?output_all.OutPut()?#存儲信息
????????self.manage_class?=?manage_url.UrlManager()?#鏈接管理
????????self.tag_right?=?1
????????self.tag_error?=?0
    # 獲取分類下所有熱門分類
????def?get_hotcategory(selfurl):
????????print(“get?all?category!“)
????????page_content?=?self.down_class.download(url)
????????self.category_dic?=?self.parase_class.parase_category(page_content)
????????self.output_class.output_category(self.category_dic)?#將類別信息寫入本地文件
    # 得到某個類別下所有電影的鏈接
????def?get_one_cate_all_movie_href(selftag_url):
????????page_content?=?self.down_class.download(tag_url)
????????page_num?=?self.parase_class.parase_pagenum(page_content)?#得到該分類總共多少頁
????????movies_href?=?[]
????????try:
????????????for?page?in?range(int(page_num)):
????????????????page_url?=?“https://movie.douban.com/tag/%E8%8B%B1%E5%9B%BD?start=“+str(page*20)+“&type=T“
????????????????tag_page_content?=?self.down_class.download(page_url)
????????????????movies_href?=?self.parase_class.parase_page_all_movies(tag_page_contentmovies_href)
????????????????print(“all:“page_num“??right:“self.tag_right“??error:“self.tag_error“??page:“page+1“??URL?獲取完畢“)
????????????????self.tag_right?+=1
????????except?Exception?as?e:
????????????print(e)
????????????self.tag_error+=1
????????????pass
????????print(“該類別下對應(yīng)的電影數(shù)目為:\t“l(fā)en(movies_href))
????????self.output_class.output_all_movies_href(movies_href)
????????print(“開始獲取該類別下對應(yīng)的電影信息:\n?“)
????????self.get_one_movie_message(movies_href)??????#該類別對應(yīng)的鏈接抓取完畢,進行這些鏈接對應(yīng)電影信息的抓取
    # 下載每個電影的詳細信息
????def?get_one_movie_message(selfmovie_link):
????????all_count?=?1
????????error?=?0
????????self.manage_class.add_new_urls(movie_link)?#將一個類別對應(yīng)的鏈接全部加載到manage_url管理的新的鏈接中
????????while(self.manage_class.has_new_url()):
????????????try:
????????????????one_url?=?self.manage_class.get_new_url()??#獲取一個url
????????????????#?one_url=“?https://movie.douban.com/subject/1297970/“
????????????????print(“Right:“all_count“??URL:“one_url.strip()“??““Error:“error)
????????????????page_content_one?=?self.down_class.download(one_url)??#下載該網(wǎng)頁對應(yīng)的源代碼
????????????????one_movic_dic?=?self.parase_class.parase_one_movie_message(page_content_one)?#解析得到一部電影的具體數(shù)據(jù)
????????????????id?=?one_url.split(“/“)[-2]
????????????????self.output_class.output_one_movie_message(one_movic_dicid)?#將該部電影的數(shù)據(jù)輸出
????????????????all_count+=1
????????????except?Exception?as?e:
????????????????error?+=1
????????????????print(e)
????????????????pass
????#獲取每部電影的短評
????def?get_one_movie_short_dis(selfmovie_url):
??
?屬性????????????大小?????日期????時間???名稱
-----------?---------??----------?-----??----
?????目錄???????????0??2016-09-21?09:03??Douban_Movies_Analysic-master\
?????目錄???????????0??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\
?????目錄???????????0??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\.idea\
?????文件?????????918??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\.idea\DouBan_Spider.iml
?????文件?????????545??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\.idea\misc.xml
?????文件?????????961??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\.idea\modules.xml
?????文件?????????183??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\.idea\vcs.xml
?????文件???????48555??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\.idea\workspace.xml
?????目錄???????????0??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\download\
?????文件???????????0??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\download\__init__.py
?????目錄???????????0??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\download\__pycache__\
?????文件?????????153??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\download\__pycache__\__init__.cpython-34.pyc
?????文件????????3914??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\download\__pycache__\down_html.cpython-34.pyc
?????文件????????5884??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\download\down_html.py
?????目錄???????????0??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\file_output\
?????文件????????3198??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\file_output\category.csv
?????目錄???????????0??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\file_output\duanping\
?????文件???????68500??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\file_output\duanping\5045678?.txt
?????文件?????1143200??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\file_output\movie.csv
?????文件?????1449138??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\file_output\movie_summary.txt
?????文件?????6072632??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\file_output\movies_li
?????目錄???????????0??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\file_output\yingping\
?????文件?????2042984??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\file_output\yingping\5045678?.txt
?????文件????????6343??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\main.py
?????目錄???????????0??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\output\
?????文件???????????0??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\output\__init__.py
?????目錄???????????0??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\output\__pycache__\
?????文件?????????151??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\output\__pycache__\__init__.cpython-34.pyc
?????文件????????2790??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\output\__pycache__\output_all.cpython-34.pyc
?????文件????????2762??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\output\output_all.py
?????目錄???????????0??2016-09-21?09:03??Douban_Movies_Analysic-master\DouBan_Spider\parase\
............此處省略113個文件信息
評論
共有 條評論