資源簡介
Kaggle入門賽房價預測,包括pandas數據預處理,使用sklearn線性回歸預測結果,輸出結果表格

代碼片段和文件信息
#coding=UTF-8
"""
Explore the house-price data set: distribution plots, correlations and
missing values.

See the data preprocessing tutorial on Kaggle kernels; Chinese version:
https://www.leiphone.com/news/201704/Py7Mu3TwRF97pWc7.html
"""
#invite people for the Kaggle party
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
#bring in the six packs
# Both CSV files are read from the current working directory; the original
# note ("../" file directory) presumably means the path prefix should be
# adjusted when the data lives elsewhere.
df_train = pd.read_csv(r"train.csv")
df_test = pd.read_csv(r"test.csv")
print(df_test.shape)
print(df_train.shape)
#check the decoration
# print df_train.columns
#descriptive statistics summary
# print df_train['SalePrice'].describe()
#histogram
# NOTE(review): distplot is deprecated in recent seaborn releases -- confirm
# against the installed version before relying on it.
sns.distplot(df_train[u'SalePrice'])
# sns.plt.show()
# Skewness and kurtosis of the sale price
print("skewness:{} Kurtosis:{}".format(df_train['SalePrice'].skew(), df_train['SalePrice'].kurt()))
# Relationship between sale price and numeric variables
var = 'GrLivArea'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000))
plt.show()
var = 'TotalBsmtSF'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000))
plt.show()
# Box plot of 'OverallQual' against 'SalePrice'
var = 'OverallQual'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000)
plt.show()
# Box plot of 'YearBuilt' against 'SalePrice'
var = 'YearBuilt'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
f, ax = plt.subplots(figsize=(16, 8))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000)
plt.xticks(rotation=90)
# plt.show()
# Correlation matrix
# NOTE(review): newer pandas may require numeric_only=True here when the
# frame holds non-numeric columns -- confirm against the installed version.
corrmat = df_train.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, square=True)
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.show()
# Correlation between SalePrice and the other variables
k = 10  # number of variables for heatmap
# Pick the k columns with the largest correlation to SalePrice.
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(df_train[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
                 yticklabels=cols.values, xticklabels=cols.values)
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.show()
# Scatter plots between SalePrice and the related variables
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
# NOTE(review): newer seaborn releases renamed `size` to `height` -- confirm
# against the installed version.
sns.pairplot(df_train[cols], size=2.5)
plt.show()
# Missing data: per-column count and fraction of null values, largest first
total = df_train.isnull().sum().sort_values(ascending=False)
percent = (df_train.isnull().sum() / df_train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
print(missing_data.head(20))
# Handling missing data
print(missing_data[missing_data['Total'] > 1].index)
# del df_train[missing_data[missing_data['Total']>1].index]
# Drop every column that has more than one missing value. The axis is passed
# by keyword because newer pandas no longer accepts it positionally.
df_train = df_train.drop((missing_data[missing_data['Total'] > 1]).index, axis=1)
# df_train= df_train.drop((missi...   (the snippet is truncated here in the source)
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件       8549  2017-12-06 15:08  房價預測\kernelhouse.py
     文件         79  2017-12-06 15:05  房價預測\readme.txt
     文件      29035  2017-11-19 20:27  房價預測\submission.csv
     文件     452864  2017-11-19 11:48  房價預測\test.csv
     文件     460676  2017-11-09 18:40  房價預測\train.csv
     目錄          0  2017-12-06 15:08  房價預測
----------- ---------  ---------- -----  ----
               951203                    6
評論
共有 條評論