
Kaggle (4) Titanic Survivors: Ensembling and Stacking

Reference Kaggle kernel:

https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python

I worked through ensembling and stacking on the Titanic survivor data, transcribing the kernel above.

import pandas as pd
import numpy as np
import re
import sklearn
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.model_selection import KFold

Feature exploration, feature engineering, and feature cleaning

We explore the data and use feature engineering to encode the categorical features numerically.

train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
PassengerId=test['PassengerId']

train.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
full_data=[train, test]
# Compute the length of each name

train['Name_length']=train['Name'].apply(len)
test['Name_length']=test['Name'].apply(len)
# Encode Cabin as 0/1 (whether the passenger has a cabin)

train['Has_Cabin'] = train["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
test['Has_Cabin'] = test["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
# Compute family size

for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
# Mark passengers traveling alone as 1, others as 0
for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
# Fill nulls in Embarked with 'S'

for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
# Fill nulls in Fare with the training set's median fare

for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())
# Bin Fare into quartiles (qcut puts roughly equal numbers of samples in each bin)

train['CategoricalFare'] = pd.qcut(train['Fare'], 4)
# Bin Age (cut splits the range into bins of equal width)

for dataset in full_data:
    age_avg = dataset['Age'].mean()  # mean
    age_std = dataset['Age'].std()  # standard deviation
    age_null_count = dataset['Age'].isnull().sum()  # number of nulls
    # fill the nulls with random ages drawn from [mean - std, mean + std]
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)
train['CategoricalAge'] = pd.cut(train['Age'], 5)
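
As the comments note, qcut bins by sample quantiles (roughly equal counts per bin) while cut bins by equal-width intervals. A quick illustration, not part of the original kernel:

# Illustration: qcut gives (roughly) equal-sized bins, cut gives equal-width bins
demo = pd.Series([1, 2, 3, 4, 100])
print(pd.qcut(demo, 2).value_counts())  # about the same number of values per bin
print(pd.cut(demo, 2).value_counts())   # equal-width bins, so the counts differ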
# Extract the title from each passenger's name
def get_title(name):
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""
for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(get_title)
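
For example, get_title picks out the honorific that follows each passenger's surname. A quick check:

# Quick check of get_title on a sample name
print(get_title('Braund, Mr. Owen Harris'))  # Mr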
# Consolidate rare and variant titles into a few categories

for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
# Convert the features to numbers

for dataset in full_data:
    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    # Map Title to numbers

    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare']= 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] = 4
# Drop the columns that have been replaced by engineered features

drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
train = train.drop(drop_elements, axis = 1)
train = train.drop(['CategoricalAge', 'CategoricalFare'], axis = 1)
test  = test.drop(drop_elements, axis = 1)
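
After the drops, train should differ from test only by the Survived column. A quick sanity check, not in the original kernel:

# Sanity check: train and test now share the same feature columns, apart from Survived
print(set(train.columns) - set(test.columns))  # expected: {'Survived'}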

Visualization

train.head()
Survived Pclass Sex Age Parch Fare Embarked Name_length Has_Cabin FamilySize IsAlone Title
0 0 3 1 1 0 0 0 23 0 2 0 1
1 1 1 0 2 0 3 1 51 1 2 0 3
2 1 3 0 1 0 1 0 22 0 1 1 2
3 1 1 0 2 0 3 0 44 1 2 0 3
4 0 3 1 2 0 1 0 24 0 1 1 1
# Pearson correlation of the features, shown as a heatmap

colormap = plt.cm.RdBu
plt.figure(figsize=(14,12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(train.astype(float).corr(),linewidths=0.1,vmax=1.0, 
            square=True, cmap=colormap, linecolor='white', annot=True)
[Figure: Pearson Correlation of Features heatmap]

Not many features are strongly correlated with one another, which suggests there is little redundant or unnecessary information in the data.

g = sns.pairplot(train[[u'Survived', u'Pclass', u'Sex', u'Age', u'Parch', u'Fare', u'Embarked',
                        u'FamilySize', u'Title']], hue='Survived', palette='seismic', height=1.2,
                 diag_kind='kde', diag_kws=dict(shade=True), plot_kws=dict(s=10))
g.set(xticklabels=[])
[Figure: pairplot of the features, colored by Survived]

Ensembling

We build the classes needed for training through a SklearnHelper class. Any classifier provided by sklearn can be passed into __init__, along with whatever parameters we want to set.

ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 0
NFOLDS = 5
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)

    def fit(self, x, y):
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        print(self.clf.fit(x, y).feature_importances_)
# Generate out-of-fold (OOF) predictions: train on K-1 folds, predict the held-out fold,
# and average the K test-set predictions
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

Creating the first-level models

  1. Random Forest classifier
  2. Extra Trees classifier
  3. AdaBoost classifier
  4. Gradient Boosting classifier
  5. Support Vector Machine

Parameters:
n_jobs: number of cores used during training
n_estimators: number of classification trees in the model
max_depth: maximum tree depth
verbose: whether to print progress text during training

rf_params = {
    'n_jobs' : -1,
    'n_estimators' : 500,
    'warm_start' : True,
    'max_depth' : 6,
    'min_samples_leaf' : 2,
    'max_features' : 'sqrt',
    'verbose' : 0
}

et_params = {
    'n_jobs' : -1,
    'n_estimators' : 500,
    'max_depth' : 8,
    'min_samples_leaf' : 2,
    'verbose' : 0
}

ada_params = {
    'n_estimators' : 500,
    'learning_rate' : 0.75
}

gb_params = {
    'n_estimators' : 500,
    'max_depth' : 5,
    'min_samples_leaf' : 2,
    'verbose' : 0
}

svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
}
rf = SklearnHelper(clf = RandomForestClassifier,seed = SEED,params = rf_params)
et = SklearnHelper(clf = ExtraTreesClassifier,seed = SEED,params = et_params)
ada = SklearnHelper(clf = AdaBoostClassifier,seed = SEED,params = ada_params)
gb = SklearnHelper(clf = GradientBoostingClassifier,seed = SEED,params = gb_params)
svc = SklearnHelper(clf = SVC,seed = SEED,params = svc_params)
# Build NumPy arrays (ravel flattens an array to 1-D)

y_train = train['Survived'].ravel()
train = train.drop(['Survived'], axis=1)
x_train = train.values 
x_test = test.values
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) 
rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) 
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) 
gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) 
svc_oof_train, svc_oof_test = get_oof(svc,x_train, y_train, x_test) 

print("Training is complete")
Training is complete
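
Each *_oof_train array holds out-of-fold predictions for the training set, so it can be scored against y_train to compare the first-level models. A rough check, not part of the original kernel (accuracy_score is from sklearn.metrics):

from sklearn.metrics import accuracy_score

# Out-of-fold accuracy of each first-level model (illustrative)
for name, oof in [('ExtraTrees', et_oof_train), ('RandomForest', rf_oof_train),
                  ('AdaBoost', ada_oof_train), ('GradientBoost', gb_oof_train),
                  ('SVC', svc_oof_train)]:
    print(name, accuracy_score(y_train, oof.ravel()))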
rf_feature = rf.feature_importances(x_train,y_train)
et_feature = et.feature_importances(x_train, y_train)
ada_feature = ada.feature_importances(x_train, y_train)
gb_feature = gb.feature_importances(x_train,y_train)
[0.10435218 0.2102886  0.03567688 0.01960763 0.04623086 0.02985483
 0.13117994 0.04982096 0.07059698 0.01160042 0.29079073]
[0.11874701 0.38179518 0.0293706  0.01702077 0.05590766 0.02766201
 0.0468396  0.08340196 0.04583293 0.02149851 0.17192377]
[0.03  0.01  0.016 0.062 0.04  0.01  0.692 0.012 0.056 0.002 0.07 ]
[0.08808198 0.0160359  0.04796194 0.01384621 0.05556242 0.0240539
 0.17261786 0.03568649 0.11134606 0.00579069 0.42901655]
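
Note that feature_importances() only prints the values and returns None, so rf_feature and friends above are actually None; the lists below are the printed values copied by hand (they do not exactly match the output above, presumably because they came from a different run). A small variant that returns the values instead, as a sketch, would avoid the manual copy:

# Sketch: a drop-in replacement for SklearnHelper.feature_importances that returns the values
def feature_importances(self, x, y):
    return self.clf.fit(x, y).feature_importances_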
rf_features = [0.10474135, 0.21837029, 0.04432652, 0.02249159, 0.05432591, 0.02854371,
               0.07570305, 0.01088129, 0.24247496, 0.13685733, 0.06128402]
et_features = [0.12165657, 0.37098307, 0.03129623, 0.01591611, 0.05525811, 0.028157,
               0.04589793, 0.02030357, 0.17289562, 0.04853517, 0.08910063]
ada_features = [0.028, 0.008, 0.012, 0.05866667, 0.032, 0.008,
                0.04666667, 0., 0.05733333, 0.73866667, 0.01066667]
gb_features = [0.06796144, 0.03889349, 0.07237845, 0.02628645, 0.11194395, 0.04778854,
               0.05965792, 0.02774745, 0.07462718, 0.4593142, 0.01340093]
cols = train.columns.values
feature_dataframe = pd.DataFrame( {'features': cols,
     'Random Forest feature importances': rf_features,
     'Extra Trees  feature importances': et_features,
      'AdaBoost feature importances': ada_features,
    'Gradient Boost feature importances': gb_features
    })
trace = go.Scatter(
    y = feature_dataframe['Random Forest feature importances'].values,
    x = feature_dataframe['features'].values,
    mode='markers',
    marker=dict(
        sizemode = 'diameter',
        sizeref = 1,
        size = 25,
        color = feature_dataframe['Random Forest feature importances'].values,
        colorscale='Portland',
        showscale=True
    ),
    text = feature_dataframe['features'].values
)
data = [trace]

layout= go.Layout(
    autosize= True,
    title= 'Random Forest Feature Importance',
    hovermode= 'closest',
    yaxis=dict(
        title= 'Feature Importance',
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter2010')

trace = go.Scatter(
    y = feature_dataframe['Extra Trees  feature importances'].values,
    x = feature_dataframe['features'].values,
    mode='markers',
    marker=dict(
        sizemode = 'diameter',
        sizeref = 1,
        size = 25,
        color = feature_dataframe['Extra Trees  feature importances'].values,
        colorscale='Portland',
        showscale=True
    ),
    text = feature_dataframe['features'].values
)
data = [trace]

layout= go.Layout(
    autosize= True,
    title= 'Extra Trees Feature Importance',
    hovermode= 'closest',
    yaxis=dict(
        title= 'Feature Importance',
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter2010')

trace = go.Scatter(
    y = feature_dataframe['AdaBoost feature importances'].values,
    x = feature_dataframe['features'].values,
    mode='markers',
    marker=dict(
        sizemode = 'diameter',
        sizeref = 1,
        size = 25,
        color = feature_dataframe['AdaBoost feature importances'].values,
        colorscale='Portland',
        showscale=True
    ),
    text = feature_dataframe['features'].values
)
data = [trace]

layout= go.Layout(
    autosize= True,
    title= 'AdaBoost Feature Importance',
    hovermode= 'closest',
    yaxis=dict(
        title= 'Feature Importance',
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter2010')

trace = go.Scatter(
    y = feature_dataframe['Gradient Boost feature importances'].values,
    x = feature_dataframe['features'].values,
    mode='markers',
    marker=dict(
        sizemode = 'diameter',
        sizeref = 1,
        size = 25,
        color = feature_dataframe['Gradient Boost feature importances'].values,
        colorscale='Portland',
        showscale=True
    ),
    text = feature_dataframe['features'].values
)
data = [trace]

layout= go.Layout(
    autosize= True,
    title= 'Gradient Boosting Feature Importance',
    hovermode= 'closest',
    yaxis=dict(
        title= 'Feature Importance',
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter2010')
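
The four scatter plots above differ only in the column being plotted and the title, so a small helper could remove the repetition. A sketch, not in the original kernel:

# Sketch: one helper that reproduces the importance scatter plots above
def plot_importance(df, col, title):
    trace = go.Scatter(
        y=df[col].values,
        x=df['features'].values,
        mode='markers',
        marker=dict(sizemode='diameter', sizeref=1, size=25,
                    color=df[col].values, colorscale='Portland', showscale=True),
        text=df['features'].values)
    layout = go.Layout(autosize=True, title=title, hovermode='closest',
                       yaxis=dict(title='Feature Importance', ticklen=5, gridwidth=2),
                       showlegend=False)
    py.iplot(go.Figure(data=[trace], layout=layout), filename='scatter2010')

plot_importance(feature_dataframe, 'Random Forest feature importances', 'Random Forest Feature Importance')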

Second-level predictions with the stacked features

base_predictions_train = pd.DataFrame( {'RandomForest': rf_oof_train.ravel(),
     'ExtraTrees': et_oof_train.ravel(),
     'AdaBoost': ada_oof_train.ravel(),
      'GradientBoost': gb_oof_train.ravel()
    })
base_predictions_train.head()
RandomForest ExtraTrees AdaBoost GradientBoost
0 0.0 0.0 0.0 0.0
1 1.0 1.0 1.0 1.0
2 1.0 0.0 1.0 1.0
3 1.0 1.0 1.0 1.0
4 0.0 0.0 0.0 0.0
data = [
    go.Heatmap(
        z= base_predictions_train.astype(float).corr().values ,
        x=base_predictions_train.columns.values,
        y= base_predictions_train.columns.values,
          colorscale='Viridis',
            showscale=True,
            reversescale = True
    )
]
py.iplot(data, filename='labelled-heatmap')
x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test), axis=1)
gbm = xgb.XGBClassifier(
    # learning_rate=0.02,
    n_estimators=2000,
    max_depth=4,
    min_child_weight=2,
    # gamma=1,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=-1,
    scale_pos_weight=1).fit(x_train, y_train)
predictions = gbm.predict(x_test)
StackingSubmission = pd.DataFrame({ 'PassengerId': PassengerId,
                            'Survived': predictions })
StackingSubmission.to_csv("StackingSubmission.csv", index=False)
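
As an optional check before submitting (not in the original kernel), the second-level XGBoost model can be roughly cross-validated on the stacked out-of-fold features:

from sklearn.model_selection import cross_val_score

# Rough 5-fold accuracy of the second-level model on the stacked features (illustrative;
# slightly optimistic because the OOF features were built from the same training data)
scores = cross_val_score(
    xgb.XGBClassifier(n_estimators=2000, max_depth=4, min_child_weight=2, gamma=0.9,
                      subsample=0.8, colsample_bytree=0.8, objective='binary:logistic'),
    x_train, y_train, cv=5)
print(scores.mean())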