
Kaggle (4) Titanic Survivors: Ensembling and Stacking

Reference Kaggle kernel:

https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python

I worked through ensembling and stacking on the Titanic survivor data, transcribing the kernel above.

import pandas as pd
import numpy as np
import re
import sklearn
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.model_selection import KFold

Feature exploration, feature engineering, and feature cleaning

We explore the data and use feature engineering to encode the categorical features numerically.

train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
PassengerId=test['PassengerId']

train.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
full_data=[train, test]
# Compute the length of each name

train['Name_length']=train['Name'].apply(len)
test['Name_length']=test['Name'].apply(len)
# Encode Cabin as 0/1 (whether the passenger has a cabin)

train['Has_Cabin'] = train["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
test['Has_Cabin'] = test["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
# Compute family size

for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
# Mark passengers traveling alone as 1, others as 0
for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
# Fill nulls in Embarked with 'S'

for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
# Fill nulls in Fare with the training set's median fare

for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())
# Bin Fare into quartiles (qcut puts roughly equal numbers of samples in each bin)

train['CategoricalFare'] = pd.qcut(train['Fare'], 4)
# Bin Age (cut splits the range into bins of equal width)

for dataset in full_data:
    age_avg = dataset['Age'].mean()  # mean
    age_std = dataset['Age'].std()  # standard deviation
    age_null_count = dataset['Age'].isnull().sum()  # number of nulls
    # fill the nulls with random ages drawn from [mean - std, mean + std]
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)
train['CategoricalAge'] = pd.cut(train['Age'], 5)
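
As the comments note, qcut bins by sample quantiles (roughly equal counts per bin) while cut bins by equal-width intervals. A quick illustration, not part of the original kernel:

# Illustration: qcut gives (roughly) equal-sized bins, cut gives equal-width bins
demo = pd.Series([1, 2, 3, 4, 100])
print(pd.qcut(demo, 2).value_counts())  # about the same number of values per bin
print(pd.cut(demo, 2).value_counts())   # equal-width bins, so the counts differ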
# Extract the title from each passenger's name
def get_title(name):
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""
for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(get_title)
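
For example, get_title picks out the honorific that follows each passenger's surname. A quick check:

# Quick check of get_title on a sample name
print(get_title('Braund, Mr. Owen Harris'))  # Mr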
# Consolidate rare and variant titles into a few categories

for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
# Convert the features to numbers

for dataset in full_data:
    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    # Map Title to numbers

    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare']= 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] = 4
# Drop the columns that have been replaced by engineered features

drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
train = train.drop(drop_elements, axis = 1)
train = train.drop(['CategoricalAge', 'CategoricalFare'], axis = 1)
test  = test.drop(drop_elements, axis = 1)
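
After the drops, train should differ from test only by the Survived column. A quick sanity check, not in the original kernel:

# Sanity check: train and test now share the same feature columns, apart from Survived
print(set(train.columns) - set(test.columns))  # expected: {'Survived'}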

Visualization

train.head()
Survived Pclass Sex Age Parch Fare Embarked Name_length Has_Cabin FamilySize IsAlone Title
0 0 3 1 1 0 0 0 23 0 2 0 1
1 1 1 0 2 0 3 1 51 1 2 0 3
2 1 3 0 1 0 1 0 22 0 1 1 2
3 1 1 0 2 0 3 0 44 1 2 0 3
4 0 3 1 2 0 1 0 24 0 1 1 1
# Pearson correlation of the features, shown as a heatmap

colormap = plt.cm.RdBu
plt.figure(figsize=(14,12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(train.astype(float).corr(),linewidths=0.1,vmax=1.0, 
            square=True, cmap=colormap, linecolor='white', annot=True)
[Figure: Pearson Correlation of Features heatmap]

Not many features are strongly correlated with one another, which suggests there is little redundant or unnecessary information in the data.

g = sns.pairplot(train[[u'Survived', u'Pclass', u'Sex', u'Age', u'Parch', u'Fare', u'Embarked',
                        u'FamilySize', u'Title']], hue='Survived', palette='seismic', height=1.2,
                 diag_kind='kde', diag_kws=dict(shade=True), plot_kws=dict(s=10))
g.set(xticklabels=[])
[Figure: pairplot of the features, colored by Survived]

Ensembling

We build the classes needed for training through a SklearnHelper class. Any classifier provided by sklearn can be passed into __init__, along with whatever parameters we want to set.

ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 0
NFOLDS = 5
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)

    def fit(self, x, y):
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        print(self.clf.fit(x, y).feature_importances_)
# Generate out-of-fold (OOF) predictions: train on K-1 folds, predict the held-out fold,
# and average the K test-set predictions
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

Creating the first-level models

  1. Random Forest classifier
  2. Extra Trees classifier
  3. AdaBoost classifier
  4. Gradient Boosting classifier
  5. Support Vector Machine

Parameters:
n_jobs: number of cores used during training
n_estimators: number of classification trees in the model
max_depth: maximum tree depth
verbose: whether to print progress text during training

rf_params = {
    'n_jobs' : -1,
    'n_estimators' : 500,
    'warm_start' : True,
    'max_depth' : 6,
    'min_samples_leaf' : 2,
    'max_features' : 'sqrt',
    'verbose' : 0
}

et_params = {
    'n_jobs' : -1,
    'n_estimators' : 500,
    'max_depth' : 8,
    'min_samples_leaf' : 2,
    'verbose' : 0
}

ada_params = {
    'n_estimators' : 500,
    'learning_rate' : 0.75
}

gb_params = {
    'n_estimators' : 500,
    'max_depth' : 5,
    'min_samples_leaf' : 2,
    'verbose' : 0
}

svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
}
rf = SklearnHelper(clf = RandomForestClassifier,seed = SEED,params = rf_params)
et = SklearnHelper(clf = ExtraTreesClassifier,seed = SEED,params = et_params)
ada = SklearnHelper(clf = AdaBoostClassifier,seed = SEED,params = ada_params)
gb = SklearnHelper(clf = GradientBoostingClassifier,seed = SEED,params = gb_params)
svc = SklearnHelper(clf = SVC,seed = SEED,params = svc_params)
# Build NumPy arrays (ravel flattens an array to 1-D)

y_train = train['Survived'].ravel()
train = train.drop(['Survived'], axis=1)
x_train = train.values 
x_test = test.values
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) 
rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) 
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) 
gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) 
svc_oof_train, svc_oof_test = get_oof(svc,x_train, y_train, x_test) 

print("Training is complete")
Training is complete
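
Each *_oof_train array holds out-of-fold predictions for the training set, so it can be scored against y_train to compare the first-level models. A rough check, not part of the original kernel (accuracy_score is from sklearn.metrics):

from sklearn.metrics import accuracy_score

# Out-of-fold accuracy of each first-level model (illustrative)
for name, oof in [('ExtraTrees', et_oof_train), ('RandomForest', rf_oof_train),
                  ('AdaBoost', ada_oof_train), ('GradientBoost', gb_oof_train),
                  ('SVC', svc_oof_train)]:
    print(name, accuracy_score(y_train, oof.ravel()))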
rf_feature = rf.feature_importances(x_train,y_train)
et_feature = et.feature_importances(x_train, y_train)
ada_feature = ada.feature_importances(x_train, y_train)
gb_feature = gb.feature_importances(x_train,y_train)
[0.10435218 0.2102886  0.03567688 0.01960763 0.04623086 0.02985483
 0.13117994 0.04982096 0.07059698 0.01160042 0.29079073]
[0.11874701 0.38179518 0.0293706  0.01702077 0.05590766 0.02766201
 0.0468396  0.08340196 0.04583293 0.02149851 0.17192377]
[0.03  0.01  0.016 0.062 0.04  0.01  0.692 0.012 0.056 0.002 0.07 ]
[0.08808198 0.0160359  0.04796194 0.01384621 0.05556242 0.0240539
 0.17261786 0.03568649 0.11134606 0.00579069 0.42901655]
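
Note that feature_importances() only prints the values and returns None, so rf_feature and friends above are actually None; the lists below are the printed values copied by hand (they do not exactly match the output above, presumably because they came from a different run). A small variant that returns the values instead, as a sketch, would avoid the manual copy:

# Sketch: a drop-in replacement for SklearnHelper.feature_importances that returns the values
def feature_importances(self, x, y):
    return self.clf.fit(x, y).feature_importances_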
rf_features = [0.10474135, 0.21837029, 0.04432652, 0.02249159, 0.05432591, 0.02854371,
               0.07570305, 0.01088129, 0.24247496, 0.13685733, 0.06128402]
et_features = [0.12165657, 0.37098307, 0.03129623, 0.01591611, 0.05525811, 0.028157,
               0.04589793, 0.02030357, 0.17289562, 0.04853517, 0.08910063]
ada_features = [0.028, 0.008, 0.012, 0.05866667, 0.032, 0.008,
                0.04666667, 0., 0.05733333, 0.73866667, 0.01066667]
gb_features = [0.06796144, 0.03889349, 0.07237845, 0.02628645, 0.11194395, 0.04778854,
               0.05965792, 0.02774745, 0.07462718, 0.4593142, 0.01340093]
cols = train.columns.values
feature_dataframe = pd.DataFrame( {'features': cols,
     'Random Forest feature importances': rf_features,
     'Extra Trees  feature importances': et_features,
      'AdaBoost feature importances': ada_features,
    'Gradient Boost feature importances': gb_features
    })
trace = go.Scatter(
    y = feature_dataframe['Random Forest feature importances'].values,
    x = feature_dataframe['features'].values,
    mode='markers',
    marker=dict(
        sizemode = 'diameter',
        sizeref = 1,
        size = 25,
        color = feature_dataframe['Random Forest feature importances'].values,
        colorscale='Portland',
        showscale=True
    ),
    text = feature_dataframe['features'].values
)
data = [trace]

layout= go.Layout(
    autosize= True,
    title= 'Random Forest Feature Importance',
    hovermode= 'closest',
    yaxis=dict(
        title= 'Feature Importance',
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter2010')

trace = go.Scatter(
    y = feature_dataframe['Extra Trees  feature importances'].values,
    x = feature_dataframe['features'].values,
    mode='markers',
    marker=dict(
        sizemode = 'diameter',
        sizeref = 1,
        size = 25,
        color = feature_dataframe['Extra Trees  feature importances'].values,
        colorscale='Portland',
        showscale=True
    ),
    text = feature_dataframe['features'].values
)
data = [trace]

layout= go.Layout(
    autosize= True,
    title= 'Extra Trees Feature Importance',
    hovermode= 'closest',
    yaxis=dict(
        title= 'Feature Importance',
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter2010')

trace = go.Scatter(
    y = feature_dataframe['AdaBoost feature importances'].values,
    x = feature_dataframe['features'].values,
    mode='markers',
    marker=dict(
        sizemode = 'diameter',
        sizeref = 1,
        size = 25,
        color = feature_dataframe['AdaBoost feature importances'].values,
        colorscale='Portland',
        showscale=True
    ),
    text = feature_dataframe['features'].values
)
data = [trace]

layout= go.Layout(
    autosize= True,
    title= 'AdaBoost Feature Importance',
    hovermode= 'closest',
    yaxis=dict(
        title= 'Feature Importance',
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter2010')

trace = go.Scatter(
    y = feature_dataframe['Gradient Boost feature importances'].values,
    x = feature_dataframe['features'].values,
    mode='markers',
    marker=dict(
        sizemode = 'diameter',
        sizeref = 1,
        size = 25,
        color = feature_dataframe['Gradient Boost feature importances'].values,
        colorscale='Portland',
        showscale=True
    ),
    text = feature_dataframe['features'].values
)
data = [trace]

layout= go.Layout(
    autosize= True,
    title= 'Gradient Boosting Feature Importance',
    hovermode= 'closest',
    yaxis=dict(
        title= 'Feature Importance',
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter2010')
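
The four scatter plots above differ only in the column being plotted and the title, so a small helper could remove the repetition. A sketch, not in the original kernel:

# Sketch: one helper that reproduces the importance scatter plots above
def plot_importance(df, col, title):
    trace = go.Scatter(
        y=df[col].values,
        x=df['features'].values,
        mode='markers',
        marker=dict(sizemode='diameter', sizeref=1, size=25,
                    color=df[col].values, colorscale='Portland', showscale=True),
        text=df['features'].values)
    layout = go.Layout(autosize=True, title=title, hovermode='closest',
                       yaxis=dict(title='Feature Importance', ticklen=5, gridwidth=2),
                       showlegend=False)
    py.iplot(go.Figure(data=[trace], layout=layout), filename='scatter2010')

plot_importance(feature_dataframe, 'Random Forest feature importances', 'Random Forest Feature Importance')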

Second-level predictions with the stacked features

base_predictions_train = pd.DataFrame( {'RandomForest': rf_oof_train.ravel(),
     'ExtraTrees': et_oof_train.ravel(),
     'AdaBoost': ada_oof_train.ravel(),
      'GradientBoost': gb_oof_train.ravel()
    })
base_predictions_train.head()
RandomForest ExtraTrees AdaBoost GradientBoost
0 0.0 0.0 0.0 0.0
1 1.0 1.0 1.0 1.0
2 1.0 0.0 1.0 1.0
3 1.0 1.0 1.0 1.0
4 0.0 0.0 0.0 0.0
data = [
    go.Heatmap(
        z= base_predictions_train.astype(float).corr().values ,
        x=base_predictions_train.columns.values,
        y= base_predictions_train.columns.values,
          colorscale='Viridis',
            showscale=True,
            reversescale = True
    )
]
py.iplot(data, filename='labelled-heatmap')
x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test), axis=1)
gbm = xgb.XGBClassifier(
    # learning_rate=0.02,
    n_estimators=2000,
    max_depth=4,
    min_child_weight=2,
    # gamma=1,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=-1,
    scale_pos_weight=1).fit(x_train, y_train)
predictions = gbm.predict(x_test)
StackingSubmission = pd.DataFrame({ 'PassengerId': PassengerId,
                            'Survived': predictions })
StackingSubmission.to_csv("StackingSubmission.csv", index=False)
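
As an optional check before submitting (not in the original kernel), the second-level XGBoost model can be roughly cross-validated on the stacked out-of-fold features:

from sklearn.model_selection import cross_val_score

# Rough 5-fold accuracy of the second-level model on the stacked features (illustrative;
# slightly optimistic because the OOF features were built from the same training data)
scores = cross_val_score(
    xgb.XGBClassifier(n_estimators=2000, max_depth=4, min_child_weight=2, gamma=0.9,
                      subsample=0.8, colsample_bytree=0.8, objective='binary:logistic'),
    x_train, y_train, cv=5)
print(scores.mean())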