資源簡介
機器學習算法XGboost、LightGBM、Catboost的代碼架構,滿足基本的數據分析,回歸、二分類、多分類。
代碼片段和文件信息
import pandas as pd
import numpy as np
import scipy as sp


# File reading; `f` is the file path (including the file name).
def red_csv_file(f, logging=False):
    """Read a CSV file into a DataFrame, optionally printing a summary.

    Parameters
    ----------
    f : str
        Path to the CSV file to load.
    logging : bool, optional
        When True, also print the head, the column names, ``describe()``
        and ``info()`` of the loaded frame.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.
    """
    # NOTE(review): the name looks like a typo for `read_csv_file`; kept
    # as-is so any existing callers keep working.
    print("=================讀 取 文 件===================")
    data = pd.read_csv(f)
    if logging:
        print(data.head(5))
        print(data.columns.values)
        print(data.describe())
        print(data.info())
    return data
# Generic LogisticRegression pipeline: one-hot encode categorical features,
# standard-scale numeric features, sparse-stack everything, then fit.
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# 1. Load data (placeholders — replace with the real train/test frames;
# as written the empty frames cannot supply a 'label' column).
df_train = pd.DataFrame()
df_test = pd.DataFrame()
y_train = df_train['label'].values

# 2. Scaler for the numeric features (fitted on train data only, below).
ss = StandardScaler()

# 3. Feature processing / re-encoding
# 3.1 Categorical variables: one-hot encode each and sparse-hstack them.
enc = OneHotEncoder()
feats = ['creativeID', 'adID', 'campaignID']
for i, feat in enumerate(feats):
    x_train = enc.fit_transform(df_train[feat].values.reshape(-1, 1))
    # Transform test with the encoder fitted on train so the one-hot
    # column layout matches (re-fitting on test would misalign columns).
    x_test = enc.transform(df_test[feat].values.reshape(-1, 1))
    if i == 0:
        X_train, X_test = x_train, x_test
    else:
        X_train = sparse.hstack((X_train, x_train))
        X_test = sparse.hstack((X_test, x_test))

# 3.2 Numeric variables: scale with statistics learned on train only
# (scaling the test set with its own statistics would leak/shift).
feats = ['price', 'age']
x_train = ss.fit_transform(df_train[feats].values)
x_test = ss.transform(df_test[feats].values)
X_train = sparse.hstack((X_train, x_train))
X_test = sparse.hstack((X_test, x_test))

# 4. Model training
lr = LogisticRegression()
lr.fit(X_train, y_train)
# Predicted probability of the positive class for the test set.
proba_test = lr.predict_proba(X_test)[:, 1]
# LightGBM binary classification
import lightgbm as lgb
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

print("Loading Data ... ")
# Load the data; load_data() is defined elsewhere and is expected to
# return (train features, train labels, test features).
train_x, train_y, test_x = load_data()

# Split the training data into train / validation sets.
# NOTE(review): the original comment claimed a 7:3 split, but
# test_size=0.05 actually yields 95:5 — confirm the intended ratio.
X, val_X, y, val_y = train_test_split(
    train_x,
    train_y,
    test_size=0.05,
    random_state=1,
    stratify=train_y,  # keep the label distribution identical in both splits
)
X_train = X
y_train = y
X_test = val_X
y_test = val_y

# Build the LightGBM datasets; the validation set references the train
# set so they share the same feature binning.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss', 'auc'},
    'num_leaves': 5,
    'max_depth': 6,
    'min_data_in_leaf': 450,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,   # fraction of features sampled per tree
    'bagging_fraction': 0.95,  # fraction of rows sampled per bagging round
    'bagging_freq': 5,
    'lambda_l1': 1,
    # NOTE(review): original comment said smaller lambda_l2 means stronger
    # L2 regularization — it is the opposite: larger values regularize more.
    'lambda_l2': 0.001,
    'min_gain_to_split': 0.2,
    'verbose': 5,
    'is_unbalance': True,  # reweight classes for the imbalanced label
}
# train
print('Start training...')
gbm?=?lgb.train(params
????????????????lgb_train
????????????????num_boost_round=10000
????????????????valid_sets=lgb_eval
????????????????early_stopping_round
評論
共有 條評論