資源簡介
平臺部分主要是 Hadoop 分布式系統,并在該系統之上整合了 Spark、HBase、Hive、Sqoop、Mahout 等組件,進而進行相關的數據分析。
該項目主要分為以下幾部分:
1:數據采集
主要是基于豆瓣電影的數據,進行分析,所以首先要爬取相關的電影數據,對應的源代碼在DouBan_Spider目錄下,主要是采用Python + BeautifulSoup + urllib進行數據采集
2:ETL預處理
3:數據分析
4:可視化
代碼封裝完好,
適用于影視情感分析、影評分析、電影類型分析以及推薦系統的建立。
該項目主要分為以下幾部分:
1:數據采集
主要是基于豆瓣電影的數據,進行分析,所以首先要爬取相關的電影數據,對應的源代碼在DouBan_Spider目錄下,主要是采用Python + BeautifulSoup + urllib進行數據采集
2:ETL預處理
3:數據分析
4:可視化
代碼封裝完好,
適用于影視情感分析、影評分析、電影類型分析以及推薦系統的建立。

代碼片段和文件信息
# -*-coding:utf-8-*-
from urllib import request

from download import down_html
from parase import parase_html
from output import output_all
from url_manager import manage_url

# Douban movie tag index page (cloud view); presumably the crawl's entry
# point -- confirm against the caller that passes it to the spider.
root_url = "https://movie.douban.com/tag/?view=cloud"
class?DouBan_Spider(object):
????def?__init__(self):
????????self.category_dic?=?{}?#用來存儲所有熱門分類的名字和對應的電影數目catename和catename_num兩個屬性
????????self.down_class?=?down_html.DownHtml()??#下載網頁
????????self.parase_class?=?parase_html.ParaseHtml()?#解析網頁
????????self.output_class?=?output_all.OutPut()?#存儲信息
????????self.manage_class?=?manage_url.UrlManager()?#鏈接管理
????????self.tag_right?=?1
????????self.tag_error?=?0
    # 獲取分類下所有熱門分類
????def?get_hotcategory(selfurl):
????????print(“get?all?category!“)
????????page_content?=?self.down_class.download(url)
????????self.category_dic?=?self.parase_class.parase_category(page_content)
????????self.output_class.output_category(self.category_dic)?#將類別信息寫入本地文件
    # 得到某個類別下所有電影的鏈接
????def?get_one_cate_all_movie_href(selftag_url):
????????page_content?=?self.down_class.download(tag_url)
????????page_num?=?self.parase_class.parase_pagenum(page_content)?#得到該分類總共多少頁
????????movies_href?=?[]
????????try:
????????????for?page?in?range(int(page_num)):
????????????????page_url?=?“https://movie.douban.com/tag/%E8%8B%B1%E5%9B%BD?start=“+str(page*20)+“&type=T“
????????????????tag_page_content?=?self.down_class.download(page_url)
????????????????movies_href?=?self.parase_class.parase_page_all_movies(tag_page_contentmovies_href)
????????????????print(“all:“page_num“??right:“self.tag_right“??error:“self.tag_error“??page:“page+1“??URL?獲取完畢“)
????????????????self.tag_right?+=1
????????except?Exception?as?e:
????????????print(e)
????????????self.tag_error+=1
????????????pass
????????print(“該類別下對應的電影數目為:\t“len(movies_href))
????????self.output_class.output_all_movies_href(movies_href)
????????print(“開始獲取該類別下對應的電影信息:\n?“)
????????self.get_one_movie_message(movies_href)??????#該類別對應的鏈接抓取完畢,進行這些鏈接對應電影信息的抓取
    # 下載每個電影的詳細信息
????def?get_one_movie_message(selfmovie_link):
????????all_count?=?1
????????error?=?0
????????self.manage_class.add_new_urls(movie_link)?#將一個類別對應的鏈接全部加載到manage_url管理的新的鏈接中
????????while(self.manage_class.has_new_url()):
????????????try:
????????????????one_url?=?self.manage_class.get_new_url()??#獲取一個url
????????????????#?one_url=“?https://movie.douban.com/subject/1297970/“
????????????????print(“Right:“all_count“??URL:“one_url.strip()“??““Error:“error)
????????????????page_content_one?=?self.down_class.download(one_url)??#下載該網頁對應的源代碼
????????????????one_movic_dic?=?self.parase_class.parase_one_movie_message(page_content_one)?#解析得到一部電影的具體數據
????????????????id?=?one_url.split(“/“)[-2]
????????????????self.output_class.output_one_movie_message(one_movic_dicid)?#將該部電影的數據輸出
????????????????all_count+=1
????????????except?Exception?as?e:
????????????????error?+=1
????????????????print(e)
????????????????pass
    # 獲取每部電影的短評
????def?get_one_movie_short_dis(selfmovie_url):
??
?屬性????????????大小?????日期????時間???名稱
-----------?---------??----------?-----??----
?????目錄???????????0??2019-11-26?02:25??Douban_Movies_Analysic-master\
?????目錄???????????0??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\
?????目錄???????????0??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\.idea\
?????文件?????????918??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\.idea\DouBan_Spider.iml
?????文件?????????545??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\.idea\misc.xm
?????文件?????????961??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\.idea\modules.xm
?????文件?????????183??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\.idea\vcs.xm
?????文件???????48555??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\.idea\workspace.xm
?????目錄???????????0??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\download\
?????文件???????????0??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\download\__init__.py
?????目錄???????????0??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\download\__pycache__\
?????文件?????????153??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\download\__pycache__\__init__.cpython-34.pyc
?????文件????????3914??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\download\__pycache__\down_html.cpython-34.pyc
?????文件????????5884??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\download\down_html.py
?????目錄???????????0??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\file_output\
?????文件????????3198??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\file_output\category.csv
?????目錄???????????0??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\file_output\duanping\
?????文件???????68500??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\file_output\duanping\5045678?.txt
?????文件?????1143200??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\file_output\movie.csv
?????文件?????1449138??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\file_output\movie_summary.txt
?????文件?????6072632??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\file_output\movies_li
?????目錄???????????0??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\file_output\yingping\
?????文件?????2042984??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\file_output\yingping\5045678?.txt
?????文件????????6343??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\main.py
?????目錄???????????0??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\output\
?????文件???????????0??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\output\__init__.py
?????目錄???????????0??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\output\__pycache__\
?????文件?????????151??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\output\__pycache__\__init__.cpython-34.pyc
?????文件????????2790??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\output\__pycache__\output_all.cpython-34.pyc
?????文件????????2762??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\output\output_all.py
?????目錄???????????0??2019-11-26?02:25??Douban_Movies_Analysic-master\DouBan_Spider\parase\
............此處省略113個文件信息
評論
共有 條評論