[파이썬]캐글 타이타닉 데이터 탐색 #9, #10(Feature Engineering)


캐글 타이타닉 데이터 탐색 #9, #10(Feature Engineering)

#9, #10 동영상에서는 Fill Null in Age와 Fill Null in Embarked and Categorize Age를 다룬다.

참고 : You Han Lee 유튜브

df_train['Age'].isnull().sum()
177
# Pull the title out of each passenger name: the run of letters that sits
# immediately before a period (e.g. "Mr" in "Braund, Mr. Owen Harris").
# The pattern is a raw string so `\.` reaches the regex engine as an escaped
# dot instead of being treated as a (invalid) string escape, and expand=False
# makes extract() return a Series for the single capture group.
df_train['Initial'] = df_train['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
df_test['Initial'] = df_test['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
df_train.head()
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedFamilySizeinitialInitial
0103Braund, Mr. Owen Harrismale22.010A/5 211711.981001NaNS2MrMr
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 175994.266662C85C2MrsMrs
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012822.070022NaNS1MissMiss
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.0101138033.972177C123S2MrsMrs
4503Allen, Mr. William Henrymale35.0003734502.085672NaNS1MrMr
pd.crosstab(df_train['Initial'], df_train['Sex']).T.style.background_gradient(cmap='summer_r')
InitialCaptColCountessDonDrJonkheerLadyMajorMasterMissMlleMmeMrMrsMsRevSir
Sex
female001010100182210125100
male12016102400005170061
# Collapse the rare titles into the five groups used afterwards
# (Mr, Mrs, Miss, Master, Other). 'Master' is already one of them and is
# therefore not listed.
# The French title is spelled 'Mlle' (see the crosstab columns); the earlier
# spelling 'Mile' never matched, which left 'Mlle' as its own group.
# NOTE(review): 'Dona' is mapped to 'Mr' exactly as before; it looks like a
# feminine title, so confirm whether 'Mrs' was intended.
title_groups = {
    'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
    'Dr': 'Mr', 'Major': 'Mr',
    'Lady': 'Mrs', 'Countess': 'Mrs',
    'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other',
    'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr', 'Dona': 'Mr',
}
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: an in-place call on `df[col]` may act on a temporary object
# and leave the frame unchanged.
df_train['Initial'] = df_train['Initial'].replace(title_groups)
df_test['Initial'] = df_test['Initial'].replace(title_groups)
# Per-title averages of the numeric columns. numeric_only=True skips the
# string columns (Name, Sex, Ticket, ...), which cannot be averaged.
df_train.groupby('Initial').mean(numeric_only=True)
PassengerIdSurvivedPclassAgeSibSpParchFareFamilySize
Initial
Master414.9750000.5750002.6250004.5741672.3000001.3750003.3407104.675000
Miss408.8641300.7010872.29891321.8310810.7065220.5434783.1134252.250000
Mlle676.5000001.0000001.00000024.0000000.0000000.0000004.0702511.000000
Mr455.8809070.1625712.38185332.7396090.2930060.1512292.6515071.444234
Mrs456.3937010.7952761.98425235.9818180.6929130.8188983.4437512.511811
Other564.4444440.1111111.66666745.8888890.1111110.1111112.6416051.222222
df_train.groupby('Initial')['Survived'].mean().plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x7fde203fc5f8>

png

df_all = pd.concat([df_train, df_test])
df_all.head()
AgeCabinEmbarkedFamilySizeFareInitialNameParchPassengerIdPclassSexSibSpSurvivedTicketinitial
022.0NaNS2.01.981001MrBraund, Mr. Owen Harris013male10.0A/5 21171Mr
138.0C85C2.04.266662MrsCumings, Mrs. John Bradley (Florence Briggs Th...021female11.0PC 17599Mrs
226.0NaNS1.02.070022MissHeikkinen, Miss. Laina033female01.0STON/O2. 3101282Miss
335.0C123S2.03.972177MrsFutrelle, Mrs. Jacques Heath (Lily May Peel)041female11.0113803Mrs
435.0NaNS1.02.085672MrAllen, Mr. William Henry053male00.0373450Mr
# Per-title averages over train + test combined. numeric_only=True skips the
# string columns (Name, Sex, Ticket, ...), which cannot be averaged.
df_all.groupby('Initial').mean(numeric_only=True)
AgeFamilySizeFareParchPassengerIdPclassSibSpSurvived
Initial
Master5.4826424.67500015.4426771.377049658.8524592.6557382.0491800.575000
Miss21.8141042.25000014.0968610.498099616.5399242.3422050.6577950.701087
Mlle24.0000001.0000004.0702510.000000676.5000001.0000000.0000001.000000
Mr32.5563971.44423410.0039410.159533658.8313882.3592740.2866410.162571
Mrs37.0348842.51181123.8969960.824121685.6733671.9296480.6582910.795276
Other44.9230771.22222224.5230340.153846714.9230771.6153850.2307690.111111
# Fetch the row whose index label is 1 (the second row here) with .loc
df_train.loc[1, :]
PassengerId                                                    2
Survived                                                       1
Pclass                                                         1
Name           Cumings, Mrs. John Bradley (Florence Briggs Th...
Sex                                                       female
Age                                                           38
SibSp                                                          1
Parch                                                          0
Ticket                                                  PC 17599
Fare                                                     4.26666
Cabin                                                        C85
Embarked                                                       C
FamilySize                                                     2
initial                                                      Mrs
Initial                                                      Mrs
Name: 1, dtype: object
# Use .loc with a boolean mask to keep only the rows where Survived == 1
df_train.loc[df_train['Survived'] ==1 ].head()
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedFamilySizeinitialInitial
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 175994.266662C85C2MrsMrs
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012822.070022NaNS1MissMiss
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.0101138033.972177C123S2MrsMrs
8913Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)female27.0023477422.409941NaNS3MrsMrs
91012Nasser, Mrs. Nicholas (Adele Achem)female14.0102377363.403555NaNC2MrsMrs
df_train.loc[(df_train['Age'].isnull()) & (df_train['Initial'] == 'Mr')].head()
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedFamilySizeinitialInitial
5603Moran, Mr. JamesmaleNaN003308772.135148NaNQ1MrMr
171812Williams, Mr. Charles EugenemaleNaN002443732.564949NaNS1MrMr
262703Emir, Mr. Farred ChehabmaleNaN0026311.977547NaNC1MrMr
293003Todoroff, Mr. LaliomaleNaN003492162.066331NaNS1MrMr
363713Mamee, Mr. HannamaleNaN0026771.978128NaNC1MrMr
# Fill every missing Age with a fixed value chosen by the passenger's title.
# The same (title, age) pairs are applied to the train frame first and then
# to the test frame; only rows whose Age is still null are overwritten.
age_for_title = (
    ('Mr', 33),
    ('Mrs', 37),
    ('Master', 5),
    ('Miss', 22),
    ('Other', 45),
)
for frame in (df_train, df_test):
    for title, fill_age in age_for_title:
        needs_fill = frame['Age'].isnull() & (frame['Initial'] == title)
        frame.loc[needs_fill, 'Age'] = fill_age


df_train['Age'].isnull().sum()
0
df_test['Age'].isnull().sum()
0
df_train['Embarked'].isnull().sum()
2
df_train.shape
(891, 15)
# Fill the missing Embarked values with 'S'. The result is assigned back to
# the column rather than using fillna(..., inplace=True) on `df[col]`, which
# may operate on a temporary object and leave the frame unchanged.
df_train['Embarked'] = df_train['Embarked'].fillna('S')
df_train['Embarked'].isnull().sum()
0
df_train['Age_cat'] = 0
df_train.head()
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedFamilySizeinitialInitialAge_cat
0103Braund, Mr. Owen Harrismale22.010A/5 211711.981001NaNS2MrMr0
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 175994.266662C85C2MrsMrs0
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012822.070022NaNS1MissMiss0
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.0101138033.972177C123S2MrsMrs0
4503Allen, Mr. William Henrymale35.0003734502.085672NaNS1MrMr0
# "Hard-coded" bucketing: assign Age_cat with explicit boolean masks, one
# per decade (0: <10, 1: 10-19, ..., 6: 60-69, 7: 70 and above).
for frame in (df_train, df_test):
    # Start every row at bucket 0 (the Age < 10 case) so the column exists in
    # BOTH frames before the masked writes; previously only df_train was
    # initialised and df_test's column was created by its first masked write.
    frame['Age_cat'] = 0
    for bucket in range(1, 7):
        lower, upper = bucket * 10, (bucket + 1) * 10
        in_decade = (lower <= frame['Age']) & (frame['Age'] < upper)
        frame.loc[in_decade, 'Age_cat'] = bucket
    # The last bucket is open-ended.
    frame.loc[70 <= frame['Age'], 'Age_cat'] = 7
df_train.head()
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedFamilySizeinitialInitialAge_cat
0103Braund, Mr. Owen Harrismale22.010A/5 211711.981001NaNS2MrMr2
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 175994.266662C85C2MrsMrs3
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012822.070022NaNS1MissMiss2
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.0101138033.972177C123S2MrsMrs3
4503Allen, Mr. William Henrymale35.0003734502.085672NaNS1MrMr3
def category_age(x):
    """Return the decade bucket index (0-7) for the age ``x``.

    Bucket 0 holds every value below 10, buckets 1-6 hold the decades
    10-19 through 60-69, and bucket 7 holds everything else, i.e. any value
    that is not less than 70.
    """
    upper_bounds = (10, 20, 30, 40, 50, 60, 70)
    # The first bound that x falls under decides the bucket.
    for bucket, bound in enumerate(upper_bounds):
        if x < bound:
            return bucket
    # No bound matched: open-ended top bucket.
    return len(upper_bounds)
df_train['Age_cat_2'] = df_train['Age'].apply(category_age)
# Check that the apply-based buckets match the hard-coded ones on every row
(df_train['Age_cat'] == df_train['Age_cat_2']).all()
True
# Age has been replaced by Age_cat, and Age_cat_2 was only a cross-check,
# so remove those columns from the frames in place.
df_train.drop(columns=['Age', 'Age_cat_2'], inplace=True)
df_test.drop(columns=['Age'], inplace=True)





© 2018. by statssy

Powered by statssy