資源簡介
這是Kaggle泰坦尼克號競賽的Python數據分析源代碼,註釋非常詳細,準確率達0.81。
這是著名的泰坦尼克號機器學習競賽——非常適合作為你的第一個挑戰,讓你投入機器學習競賽,並熟悉Kaggle平臺的運作方式。
競賽內容很簡單:使用機器學習建立一個模型,預測哪些乘客在泰坦尼克號沉船事故中幸存下來。
代碼片段和文件信息
# -*- coding: utf-8 -*-
"""
@ Created by PyCharm
@ file: version.py
@ author: Zhang Zhihao
@ email: 3382885270@qq.com
@ date: 2018/11/16 9:22
@ version: 1.0.0
"""
# Standard library
import re
import warnings

# Third-party. Every name imported by the original header is kept; exact
# duplicate import lines were collapsed.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn
import seaborn as sns
from sklearn.ensemble import (
    GradientBoostingClassifier,
    RandomForestClassifier,
    RandomForestRegressor,
)
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler
# Data dictionary for the Titanic CSV columns:
#   PassengerId => passenger ID
#   Pclass      => ticket class (1st / 2nd / 3rd)
#   Name        => passenger name
#   Sex         => sex
#   Age         => age
#   SibSp       => number of siblings / spouses aboard
#   Parch       => number of parents / children aboard
#   Ticket      => ticket information
#   Fare        => fare
#   Cabin       => cabin
#   Embarked    => port of embarkation

# Load the labelled training set and the unlabelled submission (test) set.
input_df = pd.read_csv('train.csv', header=0)
submit_df = pd.read_csv('test.csv', header=0)

# Stack both sets so that every preprocessing step is applied to all rows.
# ignore_index=True rebuilds a fresh 0..n-1 index directly, which replaces the
# former reset_index() call followed by dropping the generated 'index' column.
df = pd.concat([input_df, submit_df], ignore_index=True)

# Missing values in train.csv.
print("查看train.csv中缺少數據:\n", input_df.isnull().sum())
# Counts recorded by the original author:
# Age            177
# Cabin          687
# Embarked         2
print("-" * 40)

# Missing values in test.csv.
print("查看test.csv中缺少數據:\n", submit_df.isnull().sum())
# Counts recorded by the original author:
# Age             86
# Fare             1
# Cabin          327

# Missing values in the combined frame.
print("查看合并后缺失的數據:\n", df.isnull().sum())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly instead of being wrapped in print() (which would emit an
# extra "None" line).
df.info()

# Labels of the training rows only; the submission rows carry no 'Survived'.
target_train = input_df['Survived']

# Summary of the combined frame as recorded by the original author:
# column:  12
# Age missing  263  type: float64
# Cabin missing  1014  type: object
# Embarked missing  2  type: object
# Fare missing  1  type: float64
# Name missing  0  type: object
# Parch missing  0  type: int64
# PassengerId missing  0  type: int64
# Pclass missing  0  type: int64
# Sex missing  0  type: object
# SibSp missing  0  type: int64
# Survived missing  418  type: float64
# Ticket missing  0  type: object
## Handle missing values

# Cabin (cabin), Ticket (ticket information) and PassengerId (passenger ID)
# are ignored from here on, so drop those columns.
df = df.drop(['Cabin', 'Ticket', 'PassengerId'], axis=1)

# Age
# # version1 (disabled): fill every missing age with the column mean.
# average_age = df["Age"].mean()
# df['Age'][df.Age.isnull()] = average_age
# # Visualization
# fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(15, 5))
# axis1.set_title('Original Age values')
# axis2.set_title('New Age values')
# df['Age'].plot(kind='hist', bins=70, ax=axis1)
# df['Age'].plot(kind='hist', bins=70, ax=axis2)
# plt.show()

# version2: draw random integer ages from [mean - std, mean + std) instead of
# a single constant, so the filled values look closer to the real distribution.
average_age = df["Age"].mean()
std_age = df["Age"].std()
age_is_missing = df["Age"].isnull()
count_nan_age = age_is_missing.sum()

# randint works on integer bounds; truncate the float statistics explicitly
# rather than relying on NumPy's implicit handling of float arguments.
rand = np.random.randint(
    int(average_age - std_age),
    int(average_age + std_age),
    size=count_nan_age,
)

# Assign through a single .loc call: the former chained form
# df['Age'][mask] = rand may write to a temporary copy and leave df unchanged.
df.loc[age_is_missing, 'Age'] = rand
# # Visualization
# fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(15, 5))
# axis1.set_tit  (snippet truncated here in the source)
- 上一篇:python隨機子空間法.py
- 下一篇:python自動登陸該網站并網站內容
評論
共有 條評論