Kaggle (4) Titanic Survivors: Ensembling and Stacking
Kaggle-related posts
- Kaggle (1) Studying Simple Matplotlib & Visualization Tips
- Kaggle (2) Studying the Titanic Tutorial 1 & 2
- Kaggle (3) Titanic Survivors: From EDA to Classification
- Kaggle (4) Titanic Survivors: Ensembling and Stacking
- Kaggle (5) Observing and Preparing Data Using Metadata
https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python
Using the Titanic survivor data, I worked through (transcribed) this ensembling and stacking kernel.
import pandas as pd
import numpy as np
import re
import sklearn
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.model_selection import KFold
Feature Exploration, Feature Engineering, and Feature Cleaning
We look through the data and use feature engineering to encode the categorical features numerically.
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
PassengerId=test['PassengerId']
train.head()
| | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
full_data=[train, test]
# Get the length of each passenger's name
train['Name_length']=train['Name'].apply(len)
test['Name_length']=test['Name'].apply(len)
# Encode Cabin as 0 or 1 (1 if a cabin value is present)
train['Has_Cabin'] = train["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
test['Has_Cabin'] = test["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
# Family size = siblings/spouses + parents/children + self
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
# Mark passengers travelling alone as 1, others as 0
for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
# Fill missing Embarked values with 'S'
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
# Fill missing Fare values with the training-set median
for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())
# Bin Fare into quartiles (qcut splits the data into bins with equal counts)
train['CategoricalFare'] = pd.qcut(train['Fare'], 4)
# Fill in and bin Age (cut splits the range into bins of equal width)
for dataset in full_data:
    age_avg = dataset['Age'].mean()                  # mean
    age_std = dataset['Age'].std()                   # standard deviation
    age_null_count = dataset['Age'].isnull().sum()   # number of missing values
    # fill the missing ages with random integers within one standard deviation of the mean
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)
train['CategoricalAge'] = pd.cut(train['Age'], 5)
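# Aside (not in the original notebook): a quick look at how the two binnings differ.
# qcut gives quantile bins with roughly equal counts, cut gives equal-width bins.
print(train['CategoricalFare'].value_counts(sort=False))   # roughly 223 passengers per Fare quartile
print(train['CategoricalAge'].value_counts(sort=False))    # five ~16-year-wide Age intervals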
# Search the name and return the title (Mr, Mrs, ...)
def get_title(name):
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""
for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(get_title)
# Group the rare titles and normalise the spelling variants
for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
# Map the remaining categorical columns to integers
for dataset in full_data:
    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    # Mapping Title
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)
    # Mapping Embarked
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    # Mapping Fare into the quartile bins found above
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
    # Mapping Age into the equal-width bins found above
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] = 4
# Drop the columns we no longer need
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
train = train.drop(drop_elements, axis = 1)
train = train.drop(['CategoricalAge', 'CategoricalFare'], axis = 1)
test = test.drop(drop_elements, axis = 1)
Visualization
train.head()
| | Survived | Pclass | Sex | Age | Parch | Fare | Embarked | Name_length | Has_Cabin | FamilySize | IsAlone | Title |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 23 | 0 | 2 | 0 | 1 |
| 1 | 1 | 1 | 0 | 2 | 0 | 3 | 1 | 51 | 1 | 2 | 0 | 3 |
| 2 | 1 | 3 | 0 | 1 | 0 | 1 | 0 | 22 | 0 | 1 | 1 | 2 |
| 3 | 1 | 1 | 0 | 2 | 0 | 3 | 0 | 44 | 1 | 2 | 0 | 3 |
| 4 | 0 | 3 | 1 | 2 | 0 | 1 | 0 | 24 | 0 | 1 | 1 | 1 |
# Pearson correlation of the features, visualised as a heatmap
colormap = plt.cm.RdBu
plt.figure(figsize=(14,12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(train.astype(float).corr(),linewidths=0.1,vmax=1.0,
square=True, cmap=colormap, linecolor='white', annot=True)
<AxesSubplot:title={'center':'Pearson Correlation of Features'}>
We can see that not many features are strongly correlated with one another.
This means there is little redundant or superfluous information in the data.
g = sns.pairplot(train[[u'Survived', u'Pclass', u'Sex', u'Age', u'Parch', u'Fare', u'Embarked',
u'FamilySize', u'Title']], hue='Survived', palette = 'seismic',size=1.2,diag_kind = 'kde',diag_kws=dict(shade=True),plot_kws=dict(s=10) )
g.set(xticklabels=[])
<seaborn.axisgrid.PairGrid at 0x2828c74e9a0>
Ensembling
We create the SklearnHelper class to wrap the estimators needed for training.
Any learning method that sklearn provides can be passed into its `__init__`,
and we can also pass in whatever extra parameters we want to set.
ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 0
NFOLDS = 5
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)

    def fit(self, x, y):
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        print(self.clf.fit(x, y).feature_importances_)
# Out-of-fold (OOF) predictions: every training row is predicted by a model
# that never saw it, and the test predictions are averaged over the folds.
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)   # predictions for the held-out fold
        oof_test_skf[i, :] = clf.predict(x_test)    # test-set predictions from this fold's model

    oof_test[:] = oof_test_skf.mean(axis=0)         # average the per-fold test predictions
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
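Each get_oof call hands back two column vectors: leakage-free predictions for every training row and fold-averaged predictions for the test rows. As a rough sketch of the shapes involved (assuming the standard 891-row train / 418-row test split):
# Sketch only; the real calls appear after the first-level models are defined.
# some_oof_train, some_oof_test = get_oof(some_model, x_train, y_train, x_test)
# some_oof_train.shape -> (891, 1)   # one out-of-fold prediction per training row
# some_oof_test.shape  -> (418, 1)   # mean of the NFOLDS per-fold test-set predictions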
Creating the first-level models
- Random Forest classifier
- Extra Trees classifier
- AdaBoost classifier
- Gradient Boosting classifier
- Support Vector Machine
Parameters
n_jobs: number of cores used for training
n_estimators: number of trees in each ensemble model
max_depth: maximum tree depth
verbose: whether to print progress text during training
rf_params = {
'n_jobs' : -1,
'n_estimators' : 500,
'warm_start' : True,
'max_depth' : 6,
'min_samples_leaf' : 2,
'max_features' : 'sqrt',
'verbose' : 0
}
et_params = {
'n_jobs' : -1,
'n_estimators' : 500,
'max_depth' : 8,
'min_samples_leaf' : 2,
'verbose' : 0
}
ada_params = {
'n_estimators' : 500,
'learning_rate' : 0.75
}
gb_params = {
'n_estimators' : 500,
'max_depth' : 5,
'min_samples_leaf' : 2,
'verbose' : 0
}
svc_params = {
'kernel' : 'linear',
'C' : 0.025
}
rf = SklearnHelper(clf = RandomForestClassifier,seed = SEED,params = rf_params)
et = SklearnHelper(clf = ExtraTreesClassifier,seed = SEED,params = et_params)
ada = SklearnHelper(clf = AdaBoostClassifier,seed = SEED,params = ada_params)
gb = SklearnHelper(clf = GradientBoostingClassifier,seed = SEED,params = gb_params)
svc = SklearnHelper(clf = SVC,seed = SEED,params = svc_params)
# Build NumPy arrays (ravel flattens the result into a 1-D array)
y_train = train['Survived'].ravel()
train = train.drop(['Survived'], axis=1)
x_train = train.values
x_test = test.values
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test)
rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test)
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test)
gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test)
svc_oof_train, svc_oof_test = get_oof(svc,x_train, y_train, x_test)
print("Training is complete")
Training is complete
rf_feature = rf.feature_importances(x_train,y_train)
et_feature = et.feature_importances(x_train, y_train)
ada_feature = ada.feature_importances(x_train, y_train)
gb_feature = gb.feature_importances(x_train,y_train)
[0.10435218 0.2102886 0.03567688 0.01960763 0.04623086 0.02985483
0.13117994 0.04982096 0.07059698 0.01160042 0.29079073]
[0.11874701 0.38179518 0.0293706 0.01702077 0.05590766 0.02766201
0.0468396 0.08340196 0.04583293 0.02149851 0.17192377]
[0.03 0.01 0.016 0.062 0.04 0.01 0.692 0.012 0.056 0.002 0.07 ]
[0.08808198 0.0160359 0.04796194 0.01384621 0.05556242 0.0240539
0.17261786 0.03568649 0.11134606 0.00579069 0.42901655]
rf_features = [0.10474135, 0.21837029, 0.04432652, 0.02249159, 0.05432591, 0.02854371,
               0.07570305, 0.01088129, 0.24247496, 0.13685733, 0.06128402]
et_features = [0.12165657, 0.37098307, 0.03129623, 0.01591611, 0.05525811, 0.028157,
               0.04589793, 0.02030357, 0.17289562, 0.04853517, 0.08910063]
ada_features = [0.028, 0.008, 0.012, 0.05866667, 0.032, 0.008,
                0.04666667, 0.0, 0.05733333, 0.73866667, 0.01066667]
gb_features = [0.06796144, 0.03889349, 0.07237845, 0.02628645, 0.11194395, 0.04778854,
               0.05965792, 0.02774745, 0.07462718, 0.4593142, 0.01340093]
cols = train.columns.values
feature_dataframe = pd.DataFrame( {'features': cols,
'Random Forest feature importances': rf_features,
'Extra Trees feature importances': et_features,
'AdaBoost feature importances': ada_features,
'Gradient Boost feature importances': gb_features
})
trace = go.Scatter(
y = feature_dataframe['Random Forest feature importances'].values,
x = feature_dataframe['features'].values,
mode='markers',
marker=dict(
sizemode = 'diameter',
sizeref = 1,
size = 25,
color = feature_dataframe['Random Forest feature importances'].values,
colorscale='Portland',
showscale=True
),
text = feature_dataframe['features'].values
)
data = [trace]
layout= go.Layout(
autosize= True,
title= 'Random Forest Feature Importance',
hovermode= 'closest',
yaxis=dict(
title= 'Feature Importance',
ticklen= 5,
gridwidth= 2
),
showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter2010')
trace = go.Scatter(
y = feature_dataframe['Extra Trees feature importances'].values,
x = feature_dataframe['features'].values,
mode='markers',
marker=dict(
sizemode = 'diameter',
sizeref = 1,
size = 25,
color = feature_dataframe['Extra Trees feature importances'].values,
colorscale='Portland',
showscale=True
),
text = feature_dataframe['features'].values
)
data = [trace]
layout= go.Layout(
autosize= True,
title= 'Extra Trees Feature Importance',
hovermode= 'closest',
yaxis=dict(
title= 'Feature Importance',
ticklen= 5,
gridwidth= 2
),
showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter2010')
trace = go.Scatter(
y = feature_dataframe['AdaBoost feature importances'].values,
x = feature_dataframe['features'].values,
mode='markers',
marker=dict(
sizemode = 'diameter',
sizeref = 1,
size = 25,
color = feature_dataframe['AdaBoost feature importances'].values,
colorscale='Portland',
showscale=True
),
text = feature_dataframe['features'].values
)
data = [trace]
layout= go.Layout(
autosize= True,
title= 'AdaBoost Feature Importance',
hovermode= 'closest',
yaxis=dict(
title= 'Feature Importance',
ticklen= 5,
gridwidth= 2
),
showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter2010')
trace = go.Scatter(
y = feature_dataframe['Gradient Boost feature importances'].values,
x = feature_dataframe['features'].values,
mode='markers',
marker=dict(
sizemode = 'diameter',
sizeref = 1,
size = 25,
color = feature_dataframe['Gradient Boost feature importances'].values,
colorscale='Portland',
showscale=True
),
text = feature_dataframe['features'].values
)
data = [trace]
layout= go.Layout(
autosize= True,
title= 'Gradient Boosting Feature Importance',
hovermode= 'closest',
yaxis=dict(
title= 'Feature Importance',
ticklen= 5,
gridwidth= 2
),
showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter2010')
Making the second-level predictions
base_predictions_train = pd.DataFrame( {'RandomForest': rf_oof_train.ravel(),
'ExtraTrees': et_oof_train.ravel(),
'AdaBoost': ada_oof_train.ravel(),
'GradientBoost': gb_oof_train.ravel()
})
base_predictions_train.head()
| | RandomForest | ExtraTrees | AdaBoost | GradientBoost |
|---|---|---|---|---|
| 0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 1.0 | 1.0 | 1.0 | 1.0 |
| 2 | 1.0 | 0.0 | 1.0 | 1.0 |
| 3 | 1.0 | 1.0 | 1.0 | 1.0 |
| 4 | 0.0 | 0.0 | 0.0 | 0.0 |
data = [
go.Heatmap(
z= base_predictions_train.astype(float).corr().values ,
x=base_predictions_train.columns.values,
y= base_predictions_train.columns.values,
colorscale='Viridis',
showscale=True,
reversescale = True
)
]
py.iplot(data, filename='labelled-heatmap')
x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test), axis=1)
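# The second-level training data now has one column per first-level model:
# x_train.shape == (891, 5) and x_test.shape == (418, 5) for the standard Titanic split.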
gbm = xgb.XGBClassifier(
#learning_rate = 0.02,
n_estimators= 2000,
max_depth= 4,
min_child_weight= 2,
#gamma=1,
gamma=0.9,
subsample=0.8,
colsample_bytree=0.8,
objective= 'binary:logistic',
nthread= -1,
scale_pos_weight=1).fit(x_train, y_train)
predictions = gbm.predict(x_test)
StackingSubmission = pd.DataFrame({ 'PassengerId': PassengerId,
'Survived': predictions })
StackingSubmission.to_csv("StackingSubmission.csv", index=False)