/ PROGRAMMING

캐글 (2)
타이타닉 튜토리얼 1,2 공부하기

kaggle 관련 글

타이타닉 튜토리얼 1,2 공부하기

kaggle 타이타닉 튜토리얼을 필사하였다.
해당 유튜브를 따라서 필사하였고 블로그에 자세한 설명도 나와있었다.
https://www.youtube.com/watch?v=_iqz7tFhox0&list=PLC_wC_PMBL5MnqmgTLqDgu4tO8mrQakuF
https://kaggle-kr.tistory.com/17?category=868316
https://kaggle-kr.tistory.com/18?category=868316

# 분석에 필요한 패키지
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# plt의 스타일 지정
plt.style.use('seaborn')
sns.set(font_scale=2.5) 

# 결측치를 알기 쉽게 하는 패키지
import missingno as msno

# warning무시
import warnings
warnings.filterwarnings('ignore')

# notebook에서 바로 그림 확인하는 코드
%matplotlib inline
# 데이터 불러오기
df_train=pd.read_csv('train.csv')
df_test=pd.read_csv('test.csv')
df_test.head()
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S
# 데이터 확인
df_train.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
# 데이터 행렬 확인
df_train.shape
(891, 12)
# 데이터 셋 특징 확인
df_train.describe()
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
# max 값만 확인
df_train.max()
PassengerId                            891
Survived                                 1
Pclass                                   3
Name           van Melkebeke, Mr. Philemon
Sex                                   male
Age                                     80
SibSp                                    8
Parch                                    6
Ticket                           WE/P 5735
Fare                               512.329
dtype: object
df_test.describe()
PassengerId Pclass Age SibSp Parch Fare
count 418.000000 418.000000 332.000000 418.000000 418.000000 417.000000
mean 1100.500000 2.265550 30.272590 0.447368 0.392344 35.627188
std 120.810458 0.841838 14.181209 0.896760 0.981429 55.907576
min 892.000000 1.000000 0.170000 0.000000 0.000000 0.000000
25% 996.250000 1.000000 21.000000 0.000000 0.000000 7.895800
50% 1100.500000 3.000000 27.000000 0.000000 0.000000 14.454200
75% 1204.750000 3.000000 39.000000 1.000000 0.000000 31.500000
max 1309.000000 3.000000 76.000000 8.000000 9.000000 512.329200
# column값 확인
df_train.columns
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
# 각 column의 null 데이터 비율 확인 {:>10}:오른쪽 정렬
for col in df_train.columns:
    msg='column: {:>10}\t Percent of NaN value: {:.2f}%'.format(col,100*(df_train[col].isnull().sum()/df_train[col].shape[0]))
    print(msg)
column: PassengerId	 Percent of NaN value: 0.00%
column:   Survived	 Percent of NaN value: 0.00%
column:     Pclass	 Percent of NaN value: 0.00%
column:       Name	 Percent of NaN value: 0.00%
column:        Sex	 Percent of NaN value: 0.00%
column:        Age	 Percent of NaN value: 19.87%
column:      SibSp	 Percent of NaN value: 0.00%
column:      Parch	 Percent of NaN value: 0.00%
column:     Ticket	 Percent of NaN value: 0.00%
column:       Fare	 Percent of NaN value: 0.00%
column:      Cabin	 Percent of NaN value: 77.10%
column:   Embarked	 Percent of NaN value: 0.22%
# 값 확인
df_train[col]
0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object
# null 값 확인
df_train[col].isnull()
0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: Embarked, Length: 891, dtype: bool
# null 값의 합
df_train[col].isnull().sum()
2
# shape를 통해 총 데이터 갯수 확인하기
df_train[col].isnull().sum()/df_train[col].shape[0]
0.002244668911335578
for col in df_test.columns:
    msg='column: {:>10}\t Percent of NaN value: {:.2f}%'.format(col,100*(df_test[col].isnull().sum()/df_test[col].shape[0]))
    print(msg)
column: PassengerId	 Percent of NaN value: 0.00%
column:     Pclass	 Percent of NaN value: 0.00%
column:       Name	 Percent of NaN value: 0.00%
column:        Sex	 Percent of NaN value: 0.00%
column:        Age	 Percent of NaN value: 20.57%
column:      SibSp	 Percent of NaN value: 0.00%
column:      Parch	 Percent of NaN value: 0.00%
column:     Ticket	 Percent of NaN value: 0.00%
column:       Fare	 Percent of NaN value: 0.24%
column:      Cabin	 Percent of NaN value: 78.23%
column:   Embarked	 Percent of NaN value: 0.00%
# missingno를 통해 확인하기
msno.matrix(df=df_train.iloc[:,:],figsize=(8,8),color=(0.8,0.5,0.2))
<AxesSubplot:>

output_15_1

# iloc으로 가져오고 싶은 위치 찾기
df_train.iloc[:,-1]
0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object
msno.bar(df=df_train.iloc[:,:],figsize=(8,8),color=(0.8,0.5,0.2))
<AxesSubplot:>

output_17_1

# pie plot과 count-plot 그래프 그리기
# 도화지를 준비하는 과정 (1,2): 행렬 
f, ax=plt.subplots(1,2,figsize=(18,8)) 
#'Survived'에 있는 값 count하기, 떨어뜨리기, 글자 규칙, 그리는 위치, 그림자
df_train['Survived'].value_counts().plot.pie(explode=[0,0.1],autopct='%1.1f%%',ax=ax[0],shadow=True)
ax[0].set_title('Pie plot-Survived') # 제목
ax[0].set_ylabel('')
sns.countplot('Survived',data=df_train,ax=ax[1]) #countplot을 [1] 위치에 그리기
ax[1].set_title('Count plot-Survived')
plt.show()

output_18_0

2.1 PClass

# class 별 생존자 수 count는 객체가 몇명인가
df_train[['Pclass','Survived']].groupby(['Pclass'],as_index=True).count()
Survived
Pclass
1 216
2 184
3 491
# sum은 숫자 자체의 데이터의 갯수 [0,1]에서 1을 다 더한 값
df_train[['Pclass','Survived']].groupby(['Pclass'],as_index=True).sum()
Survived
Pclass
1 136
2 87
3 119
# crosstab을 통해 비교 (margin은 All 표현,style.background_gradient를 통해 색상 조절)
pd.crosstab(df_train['Pclass'],df_train['Survived'],margins=True).style.background_gradient(cmap='summer_r')
Survived 0 1 All
Pclass
1 80 136 216
2 97 87 184
3 372 119 491
All 549 342 891
# 평균 알아보기 (as_index를 통해 그래프 그리기 설정, sort_values를 통한 오름차순, ascending=False는 내림차순)
df_train[['Pclass','Survived']].groupby(['Pclass'],as_index=True).mean().sort_values(by='Survived',ascending=False)
Survived
Pclass
1 0.629630
2 0.472826
3 0.242363
# 그래프 기리기
df_train[['Pclass','Survived']].groupby(['Pclass'],as_index=True).mean().sort_values(by='Survived',ascending=False).plot()
<AxesSubplot:xlabel='Pclass'>

output_24_1

# as_index=False일때는 Pclass도 같이 그린다
df_train[['Pclass','Survived']].groupby(['Pclass'],as_index=False).mean().sort_values(by='Survived',ascending=False).plot()
<AxesSubplot:>

output_25_1

# 막대그래프 그리기
df_train[['Pclass','Survived']].groupby(['Pclass'],as_index=True).mean().sort_values(by='Survived',ascending=False).plot.bar()
<AxesSubplot:xlabel='Pclass'>

output_26_1

y_position=1.02
f,ax=plt.subplots(1,2,figsize=(18,8))
# Class별 탑승자 수
df_train['Pclass'].value_counts().plot.bar(color=['#CD7F32','#FFDF00','#D3D3D3'],ax=ax[0])
ax[0].set_title('Number of passenger By Pclass',y=y_position)
ax[0].set_ylabel('Count')
# Class별 Survived와 Dead 구분 (hue를 통해 색깔 구분)
sns.countplot('Pclass',hue='Survived',data=df_train,ax=ax[1])
ax[1].set_title('Pclass:Survived vs Dead',y=y_position)
plt.show()

output_27_0

2.2 Sex

f,ax=plt.subplots(1,2,figsize=(18,8))
df_train[['Sex','Survived']].groupby(['Sex'],as_index=True).mean().plot.bar(ax=ax[0])
ax[0].set_title('Survived vs Sex')
sns.countplot('Sex',hue='Survived',data=df_train,ax=ax[1])
ax[1].set_title('Sex:Survived vs Dead')
plt.show()

output_29_0

df_train[['Sex','Survived']].groupby(['Sex'],as_index=False).mean()
Sex Survived
0 female 0.742038
1 male 0.188908
pd.crosstab(df_train['Sex'],df_train['Survived'],margins=True).style.background_gradient(cmap='summer_r')
Survived 0 1 All
Sex
female 81 233 314
male 468 109 577
All 549 342 891

2.2 Both Sex and Pclass

# factorplot 그래프 그리기
# 선은 error bar
sns.factorplot('Pclass','Survived',hue='Sex',data=df_train,size=6,aspect=1.5)
<seaborn.axisgrid.FacetGrid at 0x2248b7c1c10>

output_33_1

# 축과 보는 방향을 바꾼 것
sns.factorplot(x='Sex',y='Survived',col='Pclass',data=df_train,saturation=.5,size=9,aspect=1)
<seaborn.axisgrid.FacetGrid at 0x2248b8ad310>

output_34_1

sns.factorplot(x='Sex',y='Survived',hue='Pclass',data=df_train,saturation=.5,size=9,aspect=1)
<seaborn.axisgrid.FacetGrid at 0x2248ba293a0>

output_35_1

Age

print('제일 나이 많은 탑승객: {:.1f} years'.format(df_train['Age'].max()))
print('제일 나이 어린 탑승객: {:.1f} years'.format(df_train['Age'].min()))
print('탑승객 평균 나이: {:.1f} years'.format(df_train['Age'].mean()))
제일 나이 많은 탑승객: 80.0 years
제일 나이 어린 탑승객: 0.4 years
탑승객 평균 나이: 29.7 years
# kdeplot(커널 밀도 함수) 그리기 (히스토그램과 유사)
fig,ax=plt.subplots(1,1,figsize=(9,5))
sns.kdeplot(df_train[df_train['Survived']==1]['Age'],ax=ax)
sns.kdeplot(df_train[df_train['Survived']==0]['Age'],ax=ax)
plt.legend(['Survived'==1,'Survived'==0])
plt.show()

output_38_0

# 히스토그램
df_train[df_train['Survived']==1]['Age'].hist()
<AxesSubplot:>

output_39_1

그래프 그리는 다양한 방법

f=plt.figure(figsize=(10,10))
a=np.arange(100)
b=np.sin(a)
plt.plot(b)
[<matplotlib.lines.Line2D at 0x2248d260070>]

output_41_1

f,ax=plt.subplots(1,1,figsize=(10,10))
a=np.arange(100)
b=np.sin(a)
plt.plot(b)
[<matplotlib.lines.Line2D at 0x2248d2c4040>]

output_42_1

plt.figure(figsize=(10,10))
a=np.arange(100)
b=np.sin(a)
plt.plot(b)
[<matplotlib.lines.Line2D at 0x2248bd55280>]

output_43_1

# 탑승객의 연령별 분포
plt.figure(figsize=(8,6))
df_train['Age'][df_train['Pclass']==1].plot(kind='kde')
df_train['Age'][df_train['Pclass']==2].plot(kind='kde')
df_train['Age'][df_train['Pclass']==3].plot(kind='kde')
plt.xlabel('Age')
plt.title('Age Distribution within classes')
plt.legend(['1st Class','2nd Class','3rd Class'])
<matplotlib.legend.Legend at 0x2248d2f8a00>

output_44_1

# 히스토그램은 겹치면 보이지 않음
plt.figure(figsize=(8,6))
df_train['Age'][df_train['Pclass']==1].plot(kind='hist')
df_train['Age'][df_train['Pclass']==2].plot(kind='hist')
df_train['Age'][df_train['Pclass']==3].plot(kind='hist')
plt.xlabel('Age')
plt.title('Age Distribution within classes')
plt.legend(['1st Class','2nd Class','3rd Class'])
<matplotlib.legend.Legend at 0x2248b74fd30>

output_45_1

fig,ax=plt.subplots(1,1,figsize=(9,5))
sns.kdeplot(df_train[(df_train['Survived']==0)&(df_train['Pclass']==1)]['Age'],ax=ax)
sns.kdeplot(df_train[(df_train['Survived']==1)&(df_train['Pclass']==1)]['Age'],ax=ax)
plt.legend(['Survived==1','Survived==0'])
plt.title('1st class')
plt.show()

output_46_0

# 히스토그램은 겹치면 보이지 않음
plt.figure(figsize=(8,6))
df_train['Age'][(df_train['Pclass']==1)&(df_train['Survived']==0)].plot(kind='hist')
df_train['Age'][(df_train['Pclass']==1)&(df_train['Survived']==1)].plot(kind='hist')
plt.xlabel('Age')
plt.title('Age Distribution within classes')
Text(0.5, 1.0, 'Age Distribution within classes')

output_47_1

fig,ax=plt.subplots(1,1,figsize=(9,5))
sns.kdeplot(df_train[(df_train['Survived']==0)&(df_train['Pclass']==2)]['Age'],ax=ax)
sns.kdeplot(df_train[(df_train['Survived']==1)&(df_train['Pclass']==2)]['Age'],ax=ax)
plt.legend(['Survived==1','Survived==0'])
plt.title('2nd class')
plt.show()

output_48_0

# 히스토그램은 겹치면 보이지 않음
plt.figure(figsize=(8,6))
df_train['Age'][(df_train['Pclass']==2)&(df_train['Survived']==0)].plot(kind='hist')
df_train['Age'][(df_train['Pclass']==2)&(df_train['Survived']==1)].plot(kind='hist')
plt.xlabel('Age')
plt.title('Age Distribution within classes')
Text(0.5, 1.0, 'Age Distribution within classes')

output_49_1

fig,ax=plt.subplots(1,1,figsize=(9,5))
sns.kdeplot(df_train[(df_train['Survived']==0)&(df_train['Pclass']==3)]['Age'],ax=ax)
sns.kdeplot(df_train[(df_train['Survived']==1)&(df_train['Pclass']==3)]['Age'],ax=ax)
plt.legend(['Survived==1','Survived==0'])
plt.title('3rd class')
plt.show()

output_50_0

# 히스토그램은 겹치면 보이지 않음
plt.figure(figsize=(8,6))
df_train['Age'][(df_train['Pclass']==3)&(df_train['Survived']==0)].plot(kind='hist')
df_train['Age'][(df_train['Pclass']==3)&(df_train['Survived']==1)].plot(kind='hist')
plt.xlabel('Age')
plt.title('Age Distribution within classes')
Text(0.5, 1.0, 'Age Distribution within classes')

output_51_1

change_age_range_survival_ratio=[]

for i in range(1,80):
    change_age_range_survival_ratio.append(df_train[df_train['Age']<i]['Survived'].sum()/len(df_train[df_train['Age']<i]['Survived']))
    
plt.figure(figsize=(7,7))
plt.plot(change_age_range_survival_ratio)
plt.title('Survial rate change depending on range of Age',y=1.02)
plt.ylabel=('Survival rate')
plt.xlabel('Range of Age(0~x)')
plt.show()

output_52_0

i=10
df_train[df_train['Age']<i]['Survived'].sum() / len(df_train[df_train['Age']<i]['Survived'])
0.6129032258064516

Pclass, Sex, Age

f,ax=plt.subplots(1,2,figsize=(18,8))
sns.violinplot('Pclass','Age',hue='Survived',data=df_train,scale='count',split=True,ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(0,110,10))

sns.violinplot('Sex','Age',hue='Survived',data=df_train,scale='count',split=True,ax=ax[1])
ax[1].set_title('Sex and Age vs Survived')
ax[1].set_yticks(range(0,110,10))
plt.show()

output_55_0

# split=False
f,ax=plt.subplots(1,2,figsize=(18,8))
sns.violinplot('Pclass','Age',hue='Survived',data=df_train,scale='count',split=False,ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(0,110,10))

sns.violinplot('Sex','Age',hue='Survived',data=df_train,scale='count',split=False,ax=ax[1])
ax[1].set_title('Sex and Age vs Survived')
ax[1].set_yticks(range(0,110,10))
plt.show()

output_56_0

# scale 차이 같은 면적이기 때문에 count보다 숫자의 개념이 보기 힘듬
f,ax=plt.subplots(1,2,figsize=(18,8))
sns.violinplot('Pclass','Age',hue='Survived',data=df_train,scale='area',split=False,ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(0,110,10))

sns.violinplot('Sex','Age',hue='Survived',data=df_train,scale='area',split=False,ax=ax[1])
ax[1].set_title('Sex and Age vs Survived')
ax[1].set_yticks(range(0,110,10))
plt.show()

output_57_0

Embarked

# Embarked 비율
f, ax= plt.subplots(1,1, figsize=(7,7))
df_train[['Embarked','Survived']].groupby(['Embarked'],as_index=True).mean().sort_values(by='Survived',ascending=False).plot.bar(ax=ax)
<AxesSubplot:xlabel='Embarked'>

output_59_1

# sort_values
df_train[['Embarked','Survived']].groupby(['Embarked'],as_index=True).mean().sort_values(by='Survived')
Survived
Embarked
S 0.336957
Q 0.389610
C 0.553571
# 내림차순
df_train[['Embarked','Survived']].groupby(['Embarked'],as_index=True).mean().sort_values(by='Survived',ascending=False)
Survived
Embarked
C 0.553571
Q 0.389610
S 0.336957
# sort_index
df_train[['Embarked','Survived']].groupby(['Embarked'],as_index=True).mean().sort_index()
Survived
Embarked
C 0.553571
Q 0.389610
S 0.336957
f, ax=plt.subplots(2,2,figsize=(20,15))
sns.countplot('Embarked',data=df_train,ax=ax[0,0])
ax[0,0].set_title('(1) No. Of Passengers Boarded')

sns.countplot('Embarked', hue='Sex',data=df_train,ax=ax[0,1])
ax[0,1].set_title('(2) Male-Feamle split for embarked')

sns.countplot('Embarked', hue='Survived',data=df_train,ax=ax[1,0])
ax[1,0].set_title('(3) Embarked vs Survived')

sns.countplot('Embarked', hue='Pclass',data=df_train,ax=ax[1,1])
ax[1,1].set_title('(4) Embarked vs Pclass')

# 좌우간격, 상하간격 맞추기
plt.subplots_adjust(wspace=0.2,hspace=0.5)
plt.show()

png

Family - Sibsp + Parch

df_train['FamilySize']=df_train['SibSp']+df_train['Parch']+1
print('Maximum size of Family:',df_train['FamilySize'].max())
print('Minimum size of Family:',df_train['FamilySize'].min())
Maximum size of Family: 11
Minimum size of Family: 1
f, ax=plt.subplots(1,3,figsize=(40,10))
sns.countplot('FamilySize',data=df_train,ax=ax[0])
ax[0].set_title('(1) No. Of Passenger Boarded',y=1.02)

sns.countplot('FamilySize',hue='Survived',data=df_train,ax=ax[1])
ax[1].set_title('(2) Survived countplot depending on FamilSize',y=1.02)

df_train[['FamilySize','Survived']].groupby(['FamilySize'],as_index=True).mean().sort_values(by='Survived',ascending=False).plot.bar(ax=ax[2])
ax[2].set_title('(3) Survived rate depending on FamilySize',y=1.02)

plt.subplots_adjust(wspace=0.2,hspace=0.5)
plt.show()

png

Fare

df_test.loc[df_test.Fare.isnull(), 'Fare'] = df_test['Fare'].mean()
df_train['Fare'] = df_train['Fare'].map(lambda i: np.log(i) if i > 0 else 0)
df_test['Fare'] = df_test['Fare'].map(lambda i: np.log(i) if i > 0 else 0)
fig, ax=plt.subplots(1,1,figsize=(8,8))
g=sns.distplot(df_train['Fare'],color='b',label='Skweness {:.2f}'.format(df_train['Fare'].skew()),ax=ax)
g=g.legend(loc='best')

png

df_train['Fare']=df_train['Fare'].map(lambda i:np.log(i) if i>0 else 0)
df_train['Ticket'].value_counts()
1601        7
347082      7
CA. 2343    7
347088      6
3101295     6
           ..
PC 17318    1
31418       1
345765      1
244270      1
244278      1
Name: Ticket, Length: 681, dtype: int64

Fill Null in Age

df_train['Age'].isnull().sum()
177
df_train['Age'].mean()
29.69911764705882
# str로 변환한 뒤 extract와 정규표현식을 통해 추출
df_train['Initial']= df_train.Name.str.extract('([A-Za-z]+)\.')
df_test['Initial']= df_test.Name.str.extract('([A-Za-z]+)\.') 
pd.crosstab(df_train['Initial'],df_train['Sex']).T.style.background_gradient(cmap='summer_r')
Initial Capt Col Countess Don Dr Jonkheer Lady Major Master Miss Mlle Mme Mr Mrs Ms Rev Sir
Sex
female 0 0 1 0 1 0 1 0 0 182 2 1 0 125 1 0 0
male 1 2 0 1 6 1 0 2 40 0 0 0 517 0 0 6 1
df_train['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don', 'Dona'],
                        ['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr', 'Mr'],inplace=True)

df_test['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don', 'Dona'],
                        ['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr', 'Mr'],inplace=True)
df_train.groupby('Initial').mean()
PassengerId Survived Pclass Age SibSp Parch Fare FamilySize
Initial
Master 414.975000 0.575000 2.625000 4.574167 2.300000 1.375000 1.190112 4.675000
Miss 411.741935 0.704301 2.284946 21.860000 0.698925 0.537634 1.085686 2.236559
Mr 455.880907 0.162571 2.381853 32.739609 0.293006 0.151229 0.932798 1.444234
Mrs 456.393701 0.795276 1.984252 35.981818 0.692913 0.818898 1.207905 2.511811
Other 564.444444 0.111111 1.666667 45.888889 0.111111 0.111111 0.958425 1.222222
df_train.groupby('Initial')['Survived'].mean().plot.bar()
<AxesSubplot:xlabel='Initial'>

png

df_train.loc[(df_train['Age'].isnull())&(df_train['Initial']=='Mr'),'Age']=33
df_train.loc[(df_train['Age'].isnull())&(df_train['Initial']=='Mrs'),'Age']=36
df_train.loc[(df_train['Age'].isnull())&(df_train['Initial']=='Master'),'Age']=5
df_train.loc[(df_train['Age'].isnull())&(df_train['Initial']=='Miss'),'Age']=22
df_train.loc[(df_train['Age'].isnull())&(df_train['Initial']=='Other'),'Age']=46

df_test.loc[(df_test['Age'].isnull())&(df_test['Initial']=='Mr'),'Age']=33
df_test.loc[(df_test['Age'].isnull())&(df_test['Initial']=='Mrs'),'Age']=36
df_test.loc[(df_test['Age'].isnull())&(df_test['Initial']=='Master'),'Age']=5
df_test.loc[(df_test['Age'].isnull())&(df_test['Initial']=='Miss'),'Age']=22
df_test.loc[(df_test['Age'].isnull())&(df_test['Initial']=='Other'),'Age']=46
df_train.loc[(df_train['Initial']=='Mr'),'Age'].isnull().sum
<bound method Series.sum of 0      False
4      False
5      False
6      False
12     False
       ...  
881    False
883    False
884    False
889    False
890    False
Name: Age, Length: 529, dtype: bool>

Fill Null in Embarked and categorize Age

df_train['Embarked'].isnull().sum()
2
df_train['Embarked'].fillna('S',inplace=True)
df_train['Embarked'].isnull().sum()
0
df_train.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize Initial
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 0.683603 NaN S 2 Mr
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 1.450832 C85 C 2 Mrs
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 0.727559 NaN S 1 Miss
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 1.379314 C123 S 2 Mrs
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 0.735091 NaN S 1 Mr
df_train.loc[df_train['Age']<10,'Age_cat']=0
df_train.loc[(df_train['Age']>=10)&(df_train['Age']<20),'Age_cat']=1
df_train.loc[(df_train['Age']>=20)&(df_train['Age']<30),'Age_cat']=2
df_train.loc[(df_train['Age']>=30)&(df_train['Age']<40),'Age_cat']=3
df_train.loc[(df_train['Age']>=40)&(df_train['Age']<50),'Age_cat']=4
df_train.loc[(df_train['Age']>=50)&(df_train['Age']<60),'Age_cat']=5
df_train.loc[(df_train['Age']>=60)&(df_train['Age']<70),'Age_cat']=6
df_train.loc[df_train['Age']>=70,'Age_cat']=7


df_test.loc[df_test['Age']<10,'Age_cat']=0
df_test.loc[(df_test['Age']>=10)&(df_test['Age']<20),'Age_cat']=1
df_test.loc[(df_test['Age']>=20)&(df_test['Age']<30),'Age_cat']=2
df_test.loc[(df_test['Age']>=30)&(df_test['Age']<40),'Age_cat']=3
df_test.loc[(df_test['Age']>=40)&(df_test['Age']<50),'Age_cat']=4
df_test.loc[(df_test['Age']>=50)&(df_test['Age']<60),'Age_cat']=5
df_test.loc[(df_test['Age']>=60)&(df_test['Age']<70),'Age_cat']=6
df_test.loc[df_test['Age']>=70,'Age_cat']=7

df_train.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize Initial Age_cat
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 0.683603 NaN S 2 Mr 2.0
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 1.450832 C85 C 2 Mrs 3.0
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 0.727559 NaN S 1 Miss 2.0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 1.379314 C123 S 2 Mrs 3.0
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 0.735091 NaN S 1 Mr 3.0
df_test.head()
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Initial Age_cat
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 2.057860 NaN Q Mr 3.0
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 1.945910 NaN S Mrs 4.0
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 2.270836 NaN Q Mr 6.0
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 2.159003 NaN S Mr 2.0
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 2.508582 NaN S Mrs 2.0
def category_age(x):
    if x<10:
        return 0
    elif x<20:
        return 1
    elif x<30:
        return 2
    elif x<40:
        return 3
    elif x<50:
        return 4
    elif x<60:
        return 5
    elif x<70:
        return 6
    else:
        return 7
df_train['Age_cat_2']=df_train['Age'].apply(category_age)
df_train.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize Initial Age_cat Age_cat_2
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 0.683603 NaN S 2 Mr 2.0 2
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 1.450832 C85 C 2 Mrs 3.0 3
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 0.727559 NaN S 1 Miss 2.0 2
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 1.379314 C123 S 2 Mrs 3.0 3
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 0.735091 NaN S 1 Mr 3.0 3
(df_train['Age_cat']==df_train['Age_cat_2']).all()
True
df_train.drop(['Age','Age_cat_2'],axis=1,inplace=True)
df_test.drop(['Age'],axis=1,inplace=True)

Change string to categorical and Pearson coefficient

df_train.Initial.unique()
array(['Mr', 'Mrs', 'Miss', 'Master', 'Other'], dtype=object)
df_train.loc[df_train['Initial']=='Master','Initial']
7      Master
16     Master
50     Master
59     Master
63     Master
65     Master
78     Master
125    Master
159    Master
164    Master
165    Master
171    Master
176    Master
182    Master
183    Master
193    Master
261    Master
278    Master
305    Master
340    Master
348    Master
386    Master
407    Master
445    Master
480    Master
489    Master
549    Master
709    Master
751    Master
755    Master
787    Master
788    Master
802    Master
803    Master
819    Master
824    Master
827    Master
831    Master
850    Master
869    Master
Name: Initial, dtype: object
df_train['Initial'] = df_train['Initial'].map({'Master': 0, 'Miss': 1, 'Mr': 2, 'Mrs': 3, 'Other': 4})
df_test['Initial'] = df_test['Initial'].map({'Master': 0, 'Miss': 1, 'Mr': 2, 'Mrs': 3, 'Other': 4})
df_train.Embarked.unique()
array(['S', 'C', 'Q'], dtype=object)
df_train['Embarked'].value_counts()
S    646
C    168
Q     77
Name: Embarked, dtype: int64
df_train['Embarked']=df_train['Embarked'].map({'C':0,'Q':1,'S':2})
df_test['Embarked']=df_test['Embarked'].map({'C':0,'Q':1,'S':2})
df_train.head()
PassengerId Survived Pclass Name Sex SibSp Parch Ticket Fare Cabin Embarked FamilySize Initial Age_cat
0 1 0 3 Braund, Mr. Owen Harris male 1 0 A/5 21171 0.683603 NaN 2 2 2 2.0
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 1 0 PC 17599 1.450832 C85 0 2 3 3.0
2 3 1 3 Heikkinen, Miss. Laina female 0 0 STON/O2. 3101282 0.727559 NaN 2 1 1 2.0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 1 0 113803 1.379314 C123 2 2 3 3.0
4 5 0 3 Allen, Mr. William Henry male 0 0 373450 0.735091 NaN 2 1 2 3.0
df_train.Embarked.isnull().any()
False
df_train['Sex'].unique()
array(['male', 'female'], dtype=object)
df_train['Sex']=df_train['Sex'].map({'female':0,'male':1})
df_test['Sex']=df_test['Sex'].map({'female':0,'male':1})
heatmap_data=df_train[['Survived','Pclass','Sex','Fare','Embarked','FamilySize','Initial','Age_cat']]
heatmap_data.corr()
Survived Pclass Sex Fare Embarked FamilySize Initial Age_cat
Survived 1.000000 -0.338481 -0.543351 0.332593 -0.167675 0.016639 -0.085529 -0.095002
Pclass -0.338481 1.000000 0.131900 -0.659932 0.162098 0.065997 -0.133054 -0.314809
Sex -0.543351 0.131900 1.000000 -0.271514 0.108262 -0.200988 0.051687 0.122917
Fare 0.332593 -0.659932 -0.271514 1.000000 -0.177469 0.410847 -0.016650 0.068385
Embarked -0.167675 0.162098 0.108262 -0.177469 1.000000 0.066516 0.026550 -0.033173
FamilySize 0.016639 0.065997 -0.200988 0.410847 0.066516 1.000000 -0.204574 -0.280537
Initial -0.085529 -0.133054 0.051687 -0.016650 0.026550 -0.204574 1.000000 0.481309
Age_cat -0.095002 -0.314809 0.122917 0.068385 -0.033173 -0.280537 0.481309 1.000000
colormap=plt.cm.BuGn
plt.figure(figsize=(12,10))
plt.title('Pearson Correlation of Features',y=1.05,size=15)
sns.heatmap(heatmap_data.astype(float).corr(),linewidths=0.1,vmax=2,square=True,cmap=colormap,linecolor='white',annot=True,annot_kws={'size':16},fmt='.2f')
<AxesSubplot:title={'center':'Pearson Correlation of Features'}>

png

One-hot encoding on the Initial and Embarked

df_test.head()
PassengerId Pclass Name Sex SibSp Parch Ticket Fare Cabin Embarked Initial Age_cat
0 892 3 Kelly, Mr. James 1 0 0 330911 2.057860 NaN 1 2 3.0
1 893 3 Wilkes, Mrs. James (Ellen Needs) 0 1 0 363272 1.945910 NaN 2 3 4.0
2 894 2 Myles, Mr. Thomas Francis 1 0 0 240276 2.270836 NaN 1 2 6.0
3 895 3 Wirz, Mr. Albert 1 0 0 315154 2.159003 NaN 2 2 2.0
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) 0 1 1 3101298 2.508582 NaN 2 3 2.0
df_train = pd.get_dummies(df_train, columns=['Initial'], prefix='Initial')
df_test = pd.get_dummies(df_test, columns=['Initial'], prefix='Initial')
df_train = pd.get_dummies(df_train, columns=['Embarked'], prefix='Embarked')
df_test = pd.get_dummies(df_test, columns=['Embarked'], prefix='Embarked')
df_train.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin'], axis=1, inplace=True)
df_test.drop(['PassengerId', 'Name',  'SibSp', 'Parch', 'Ticket', 'Cabin'], axis=1, inplace=True)
df_test.head()
Pclass Sex Fare Age_cat Initial_0 Initial_1 Initial_2 Initial_3 Initial_4 Embarked_0 Embarked_1 Embarked_2
0 3 1 2.057860 3.0 0 0 1 0 0 0 1 0
1 3 0 1.945910 4.0 0 0 0 1 0 0 0 1
2 2 1 2.270836 6.0 0 0 1 0 0 0 1 0
3 3 1 2.159003 2.0 0 0 1 0 0 0 0 1
4 3 0 2.508582 2.0 0 0 0 1 0 0 0 1
df_train.head()
Survived Pclass Sex Fare FamilySize Age_cat Initial_0 Initial_1 Initial_2 Initial_3 Initial_4 Embarked_0 Embarked_1 Embarked_2
0 0 3 1 0.683603 2 2.0 0 0 1 0 0 0 0 1
1 1 1 0 1.450832 2 3.0 0 0 0 1 0 1 0 0
2 1 3 0 0.727559 1 2.0 0 1 0 0 0 0 0 1
3 1 1 0 1.379314 2 3.0 0 0 0 1 0 0 0 1
4 0 3 1 0.735091 1 3.0 0 0 1 0 0 0 0 1

Machine learningl(Randomforest)

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
df_train.head()
Survived Pclass Sex Fare FamilySize Age_cat Initial_0 Initial_1 Initial_2 Initial_3 Initial_4 Embarked_0 Embarked_1 Embarked_2
0 0 3 1 0.683603 2 2.0 0 0 1 0 0 0 0 1
1 1 1 0 1.450832 2 3.0 0 0 0 1 0 1 0 0
2 1 3 0 0.727559 1 2.0 0 1 0 0 0 0 0 1
3 1 1 0 1.379314 2 3.0 0 0 0 1 0 0 0 1
4 0 3 1 0.735091 1 3.0 0 0 1 0 0 0 0 1
df_test.head()
Pclass Sex Fare Age_cat Initial_0 Initial_1 Initial_2 Initial_3 Initial_4 Embarked_0 Embarked_1 Embarked_2
0 3 1 2.057860 3.0 0 0 1 0 0 0 1 0
1 3 0 1.945910 4.0 0 0 0 1 0 0 0 1
2 2 1 2.270836 6.0 0 0 1 0 0 0 1 0
3 3 1 2.159003 2.0 0 0 1 0 0 0 0 1
4 3 0 2.508582 2.0 0 0 0 1 0 0 0 1
X_train=df_train.drop('Survived',axis=1).values
target_label=df_train['Survived'].values
X_test=df_test.values
X_tr, X_vld, y_tr, y_vld = train_test_split(X_train, target_label, test_size=0.3, random_state=2018)
model = RandomForestClassifier()
model.fit(X_tr, y_tr)
prediction = model.predict(X_vld)
print('총 {}명 중 {:.2f}% 정확도로 생존 맞춤'.format(y_vld.shape[0], 100 * metrics.accuracy_score(prediction, y_vld)))
총 268명 중 82.09% 정확도로 생존 맞춤

feature importance and prediction on test set

model.feature_importances_
array([0.09818595, 0.10792619, 0.32805606, 0.09146926, 0.1232598 ,
       0.01214762, 0.04206886, 0.11624336, 0.02958443, 0.0041871 ,
       0.01512252, 0.01363153, 0.01811731])
from pandas import Series

feature_importance = model.feature_importances_
Series_feat_imp = Series(feature_importance, index=df_test.columns)
plt.figure(figsize=(8, 8))
Series_feat_imp.sort_values(ascending=True).plot.barh()
plt.xlabel('Feature importance')
plt.ylabel('Feature')
plt.show()

output

submission = pd.read_csv('gender_submission.csv')
submission.head()
  PassengerId Survived
0 892 0
1 893 1
2 894 0
3 895 0
4 896 1
prediction = model.predict(X_test)
submission['Survived'] = prediction