資源簡介
大數據課程的課程設計,基於 Win10、Hadoop 2.8.3、Python 3.6 以及 MySQL 8.0。

代碼片段和文件信息
# Run from the command line: python connectHDFS.py ratings.csv > result.csv
import numpy as np
import matplotlib.pyplot as plt
from hdfs import *
import os
import math
import pandas as pd
import pymysql
from mrjob.job import MRJob
from mrjob.step import MRStep
from matplotlib.pyplot import MultipleLocator

# NOTE(review): the connection settings, including the password, are
# hard-coded and the connection is opened at import time; consider moving
# them to configuration before sharing or deploying this script.
db = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='123456', db='test', charset='utf8')
cursor = db.cursor()  # create the database access cursor

# Input data files, resolved relative to the current working directory.
user_movie = 'u.data'
movie_information = 'u.item'
user_information = 'u.user'

plt.rcParams['font.sans-serif'] = ['SimHei']  # set the default font
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign '-' from rendering as a box in saved figures
def mr_PreTreatment():  # data preprocessing: generates ratings.csv
    """Join ``u.data`` with ``u.item`` and write the result to ``ratings.csv``.

    Each non-blank line of ``u.data`` is split on tabs and each non-blank
    line of ``u.item`` (read as ISO-8859-1) is split on ``|``.  The second
    field of every ``u.data`` row is replaced by the second ``u.item`` field
    whose first field matches it, and one line of the form
    ``field0|replacement|field2`` is written per row, encoded as UTF-8.

    All three paths are relative to the current working directory.

    Raises:
        KeyError: if a ``u.data`` row refers to a key absent from ``u.item``.
        IndexError: if a non-blank line has fewer fields than required.
    """
    user_items = []
    with open('u.data') as data_file:
        for line in data_file:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            # Strip the terminator so a 3-field row does not carry '\n'
            # into the output and produce a doubled newline.
            user_items.append(line.rstrip('\r\n').split('\t'))

    items_hash = {}
    with open('u.item', encoding='ISO-8859-1') as item_file:
        for line in item_file:
            if not line.strip():
                continue
            fields = line.split('|')
            items_hash[fields[0]] = fields[1]

    # Resolve every row before opening the output so that a failed lookup
    # does not leave a partially written ratings.csv behind.
    for ui in user_items:
        ui[1] = items_hash[ui[1]]

    with open('ratings.csv', 'w', encoding='utf-8') as out_file:
        for ui in user_items:
            out_file.write(ui[0] + '|' + ui[1] + '|' + ui[2] + '\n')
def score_analysis():  # movie rating analysis
    """Count how often each rating occurs and draw the counts as a bar chart.

    Reads the file named by the module-level ``user_movie``, takes the third
    tab-separated field of every line as the rating, and tallies it in a
    dict pre-seeded with the keys "1" through "5".  A new 19x10 figure is
    created and the bars are drawn in subplot position 232.

    Raises:
        KeyError: if a rating other than "1".."5" is encountered.
        ValueError: if a line has fewer than three tab-separated fields.
    """
    scores = {"1": 0, "2": 0, "3": 0, "4": 0, "5": 0}
    with open(user_movie) as ratings_file:
        for line in ratings_file:
            _, _, score = line.split('\t')[0:3]
            scores[score] += 1

    x1 = list(scores.keys())  # ratings (the dict keys) go on the x axis
    y1 = list(scores.values())  # number of ratings per score goes on the y axis

    plt.figure(figsize=(19, 10))
    plt.subplot(232)
    plt.bar(x1, y1, color='slateblue', width=0.95)
    plt.title("電影評分統計圖", fontsize=14)
    plt.xlabel("影片評分 (0-5)", fontsize=14)
    plt.ylabel("評分人數", fontsize=14)
def movie_year_analysis():  # movie release-year analysis
    """Count movies per release year and draw the counts as a line chart.

    Reads the file named by the module-level ``movie_information`` (as
    ISO-8859-1), takes the last four characters of the third ``|``-separated
    field of every line as the release year, and tallies it.  Lines whose
    release-date field is empty are skipped.  The line is drawn in subplot
    position 235 of the current figure.

    Raises:
        KeyError: if a release year falls outside 1922-1998.
    """
    # Pre-seed every year from 1922 to 1998 so the x axis is contiguous and
    # ordered even for years that have no movies.
    yearCounts = {str(year): 0 for year in range(1922, 1999)}

    with open(movie_information, encoding='ISO-8859-1') as item_file:
        for line in item_file:
            release_date = line.split('|')[2]
            release_year = release_date[-4:]
            if release_year == "":
                continue
            yearCounts[release_year] += 1

    x2 = list(yearCounts.keys())  # x-axis values as a list
    y2 = list(yearCounts.values())  # y-axis values as a list

    plt.subplot(235)
    plt.plot(x2, y2, label='電影數量', color='cornflowerblue')
    plt.legend(loc="upper right")  # place the legend in the upper-right corner
    ax = plt.gca()
    ax.xaxis.set_major_locator(MultipleLocator(5))  # one major x tick every 5 positions
    plt.xticks(rotation=-60)  # rotate the x tick labels by -60 degrees
    plt.title("電影年份統計圖", fontsize=14)
    plt.xlabel("年份", fontsize=14)
    plt.ylabel("電影數量", fontsize=14)
    # plt.show()
def?occuptin_analysis():???
????occuption_count?=?{}
????for?line?in?open(user_information):?
????????occuption?=?line.split(‘|‘)[3]
????????occuption_count[occuption]?=?occuption_count.get(occuption0)?+?1?#統計職業及其對應人數
????sort_occuption_counts?=?sorted(occuption_count.items()key=lambda?k:?
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2020-07-05 12:35  .vscode\
     文件          57  2020-07-05 12:35  .vscode\settings.json
     文件       13257  2020-07-17 10:52  connectHDFS.py
     文件        1184  2020-07-16 10:22  mr1.py
     文件        2824  2020-07-16 15:44  mr2.py
     文件         712  2020-07-16 09:48  preprocess.py
     文件     3066030  2020-07-17 08:12  ratings.csv
     文件     6798328  2020-07-17 08:12  result.csv
     文件         818  2020-07-13 01:33  temp.py
     文件         497  2020-07-09 20:59  test.py
     文件        7644  2020-07-12 22:14  test2.py
     文件        2654  2020-07-12 10:09  test3.py
     文件         770  2020-07-12 10:12  test4.py
     文件        2295  2020-07-12 11:52  test5.py
     文件        1146  2020-07-12 22:18  test6.py
     文件     1979173  2020-07-10 21:55  u.data
     文件      236344  2020-07-10 21:55  u.item
     文件       22628  2000-07-20 05:09  u.user
評論
共有 條評論