美文网首页数据工程师数据咖程序员
泰坦尼克(Titanic)的数据分析(特征工程部分)

泰坦尼克(Titanic)的数据分析(特征工程部分)

作者: 苟雨 | 来源:发表于2017-03-12 22:12 被阅读1154次

    泰坦尼克数据集是一个好的可选数据集对于kaggle的新手,
    而且很多获胜的kaggle竞赛的团队都对这个数据集有很好的分析.

    import numpy as np
    import pandas as pd
    import re
    import sklearn
    train_ = pd.read_csv('train.csv')
    # test_ = pd.read_csv('test.csv')
    print(train_.head())
    PassengerId = train_['PassengerId']
    print('follow is passengerid')
    print(PassengerId[:5])
    PassengerId  Survived  Pclass  \
    
    0            1         0       3
    
    1            2         1       1
    
    2            3         1       3
    
    3            4         1       1
    
    4            5         0       3
    
                                                   Name     Sex   Age  SibSp  \
    
    0                            Braund, Mr. Owen Harris    male  22.0      1
    
    1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1
    
    2                             Heikkinen, Miss. Laina  female  26.0      0
    
    3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1
    
    4                           Allen, Mr. William Henry    male  35.0      0
    
      Parch            Ticket     Fare Cabin Embarked
    
    0      0         A/5 21171   7.2500   NaN        S
    
    1      0          PC 17599  71.2833   C85        C
    
    2      0  STON/O2. 3101282   7.9250   NaN        S
    
    3      0            113803  53.1000  C123        S
    
    4      0            373450   8.0500   NaN        S
    
    follow is passengerid
    
    0    1
    
    1    2
    
    2    3
    
    3    4
    
    4    5
    
    Name: PassengerId, dtype: int64
    
    
    In [3]:
    
    print (train_[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean())
    
    print (train_[["Sex", "Survived"]].groupby(['Sex'], as_index=False).mean())
    
    Pclass  Survived
    
    0       1  0.629630
    
    1       2  0.472826
    
    2       3  0.242363
    
    Sex  Survived
    
    0  female  0.742038
    
    1    male  0.188908
    
    In [4]:
    
    train_['Age'].mean()
    
    Out[4]:
    
    29.69911764705882
    

    数据清理

    feature engineering 特征工程,构造出我们需要的特征
    加上一些自己推出的特征

    # 计算名字的长度
    train_['Name_length'] = train_['Name'].apply(len)
    # 将旅客是否住在头等舱二值化
    train_['Has_Cabin'] = train_["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
    # 发现房子的大小
    train_['FamilySize'] = train_['SibSp'] + train_['Parch'] + 1
    # 发现是否独居
    train_['IsAlone'] = 0
    train_.loc[train_['FamilySize'] == 1, 'IsAlone'] = 1
    # 移除所有穿上人员的 Embarked 的NULL值
    train_['Embarked'] = train_['Embarked'].fillna('S')
    train_['Fare'] = train_['Fare'].fillna(train_['Fare'].median())
    train_['CategoricalFare'] = pd.qcut(train_['Fare'], 4)
    age_avg = train_['Age'].mean()
    age_std = train_['Age'].std()
    age_null_count = train_['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    train_['Age'][np.isnan(train_['Age'])] = age_null_random_list
    train_['Age'] = train_['Age'].astype(int)
    train_['CategoricalAge'] = pd.cut(train_['Age'], 5)
    # 定义函数导出旅客的Title
    def get_title(name):
       title_search = re.search('([A-Za-z]+)\.',name)
       if title_search:
           return title_search.group(1)
       return ''
    train_['Title'] = train_['Name'].apply(get_title)
    train_['Title'] = train_['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    train_['Title'] = train_['Title'].replace('Mlle', 'Miss')
    train_['Title'] = train_['Title'].replace('Ms', 'Miss')
    train_['Title'] = train_['Title'].replace('Mme', 'Mrs')
    #  映射 Sex
    train_['Sex'] = train_['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    # 映射 titles
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    train_['Title'] = train_['Title'].map(title_mapping)
    train_['Title'] = train_['Title'].fillna(0)
    # 映射 Embarked
    train_['Embarked'] = train_['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    # 映射 Fare
    train_.loc[ train_['Fare'] <= 7.91, 'Fare'] = 0
    train_.loc[(train_['Fare'] > 7.91) & (train_['Fare'] <= 14.454), 'Fare'] = 1
    train_.loc[(train_['Fare'] > 14.454) & (train_['Fare'] <= 31), 'Fare']   = 2
    train_.loc[ train_['Fare'] > 31, 'Fare'] = 3
    train_['Fare'] = train_['Fare'].astype(int)
    # 映射 Age
    train_.loc[ train_['Age'] <= 16, 'Age'] = 0
    train_.loc[(train_['Age'] > 16) & (train_['Age'] <= 32), 'Age'] = 1
    train_.loc[(train_['Age'] > 32) & (train_['Age'] <= 48), 'Age'] = 2
    train_.loc[(train_['Age'] > 48) & (train_['Age'] <= 64), 'Age'] = 3
    train_.loc[train_['Age'] > 64, 'Age']
    
    Out[5]:
    33     66
    54     65
    96     71
    116    70
    280    65
    456    65
    493    71
    630    80
    672    70
    745    70
    851    74
    Name: Age, dtype: int64
    

    特征选择

    drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
    
    train_ = train_.drop(drop_elements, axis = 1)
    
    train_ = train_.drop(['CategoricalAge', 'CategoricalFare'], axis = 1)
    
    # test_  = test_.drop(drop_elements, axis = 1)
    
    In [7]:
    
    train_.head()
    
    Out[7]:
    
    Survived    Pclass  Sex Age Parch   Fare    Embarked    Name_length Has_Cabin   FamilySize  IsAlone Title
    
    0   0   3   1   1   0   0   0   23  0   2   0   1
    
    1   1   1   0   2   0   3   1   51  1   2   0   3
    
    2   1   3   0   1   0   1   0   22  0   1   1   2
    
    3   1   1   0   2   0   3   0   44  1   2   0   3
    
    4   0   3   1   2   0   1   0   24  0   1   1   1
    
    

    可视化

    Pearson Correlation Heatmap(Pearson相关性热力图)
    # 可视化特征之间的关联
    
    import seaborn as sns
    
    import matplotlib.pyplot as plt
    
    %matplotlib inline
    
    colormap = plt.cm.viridis
    
    plt.figure(figsize=(12,12))
    
    plt.title('Pearson Correlation of Features', y=1.05, size=15)
    
    sns.heatmap(train_.astype(float).corr(),linewidths=0.1,vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=True)
    
    
    __results___11_1.png

    1.图中可以告诉我们特征之间没有太多的强关联
    这对我们的模型来说是一个好消息,因为这意味着训练数据中没有太多的冗余信息,我们每一个特征都可以提供一个独立的信息
    2.两个关联最强的特征是Family size and Parch

    最后让我们生成一个 pairplots图来观察每个特征与其它特征之间的关系

    g = sns.pairplot(train_[[u'Survived', u'Pclass', u'Sex', u'Age', u'Parch', u'Fare', u'Embarked',
    
          u'FamilySize', u'Title']], hue='Survived', palette = 'seismic',size=1.2,diag_kind = 'kde',diag_kws=dict(shade=True),plot_kws=dict(s=10) )
    
    g.set(xticklabels=[])
    
    

    用模型来选择特征的重要性

    这里使用随机森林来得到每个特征的重要程度

    In [60]:
    import sklearn
    import plotly.offline as py
    py.init_notebook_mode(connected=True)
    import plotly.graph_objs as go
    import plotly.tools as tls
    
    from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
    from sklearn.svm import SVC
    from sklearn.cross_validation import KFold
    
    # 使用面向对象的编程方式(OOP),用python类来帮助我们生成多个实例,方便后面多个模型的构建与训练
    
    # 下面创建SkleanHelper类它允许我们扩展内建的方法
    
    ntrain = train_.shape[0]
    
    SEED = 0 # 用于后面多个实例seed参数的重置
    
    NFOLDS = 5 # set folds for out-of-fold prediction
    
    kf = KFold(ntrain,n_folds=NFOLDS,random_state=SEED)
    
    class SklearnHelper(object):
    
       def __init__(self,clf,seed=0,params=None):
    
           params['random_state'] = seed
    
           self.clf = clf(**params)
    
       def train(self, x_train, y_train):
    
           self.clf.fit(x_train, y_train)
    
       def predict(self, x):
    
           return self.clf.predict(x)
    
    
    
    def fit(self,x,y):
    
    return self.clf.fit(x,y)
    
    
    
    def feature_importances(self,x,y):
    
    print(self.clf.fit(x,y).feature_importances_)
    

    这里有5个模型,它们都来自于sklearn工具包

    # Random Forest classifier
    # Extra Trees classifier
    # AdaBoost classifer
    # Gradient Boosting classifer
    # Support Vector Machine
    

    参数(parameters)

    # Random Forest parameters
    
    rf_params = {
    
    'n_jobs': -1,
    
    'n_estimators': 500,
    
    'warm_start': True,
    
    #'max_features': 0.2,
    
    'max_depth': 6,
    
    'min_samples_leaf': 2,
    
    'max_features' : 'sqrt',
    
    'verbose': 0
    
    }
    
    # Extra Trees Parameters
    
    et_params = {
    
       'n_jobs': -1,
    
       'n_estimators':500,
    
       #'max_features': 0.5,
    
       'max_depth': 8,
    
       'min_samples_leaf': 2,
    
       'verbose': 0
    
    }
    
    # AdaBoost parameters
    
    ada_params = {
    
       'n_estimators': 500,
    
       'learning_rate' : 0.75
    
    }
    
    # Gradient Boosting parameters
    
    gb_params = {
    
       'n_estimators': 500,
    
        #'max_features': 0.2,
    
       'max_depth': 5,
    
       'min_samples_leaf': 2,
    
       'verbose': 0
    
    }
    
    # Support Vector Classifier parameters
    
    svc_params = {
    
    'kernel' : 'linear',
    
    'C' : 0.025
    
    }
    

    并且我们创建5个对象来分别训练不同的模型

    rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
    
    et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
    
    ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
    
    gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
    
    svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)
    
    

    创建用于训练和测试的Numpy数组

    y_train = train_['Survived'].ravel() # 把Survived作为分类的标签(label)
    
    train = train_.drop(['Survived'], axis=1) # 除去Survived
    
    x_train = train_.values
    

    从不同的分类模型中得到不同的特征比重

    # .featureimportances 可以返回每个特征的重要性比重
    
    rf_feature = rf.feature_importances(x_train,y_train)
    
    et_feature = et.feature_importances(x_train, y_train)
    
    ada_feature = ada.feature_importances(x_train, y_train)
    
    gb_feature = gb.feature_importances(x_train,y_train)
    
    [ 0.65595918  0.03713221  0.09610253  0.00581797  0.00419678  0.01383511
    
    0.00504756  0.02717307  0.02183406  0.0192449   0.00290235  0.11075428]
    
    [ 0.72430616  0.03342865  0.12294606  0.00239191  0.00269613  0.01104144
    
    0.00395793  0.00814264  0.02822738  0.00713839  0.00575401  0.04996929]
    
    [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
    
    [ 0.148  0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
    
    0.   ]
    

    由于返回的数据并不能直接使用,所以复制粘贴一下,

    rf_features = [0.10474135,  0.21837029,  0.04432652,  0.02249159,  0.05432591,  0.02854371
    
    ,0.07570305,  0.01088129 , 0.24247496,  0.13685733 , 0.06128402]
    
    et_features = [ 0.12165657,  0.37098307  ,0.03129623 , 0.01591611 , 0.05525811 , 0.028157
    
    ,0.04589793 , 0.02030357 , 0.17289562 , 0.04853517,  0.08910063]
    
    ada_features = [0.028 ,   0.008  ,      0.012   ,     0.05866667,   0.032 ,       0.008
    
    ,0.04666667 ,  0.     ,      0.05733333,   0.73866667,   0.01066667]
    
    gb_features = [ 0.06796144 , 0.03889349 , 0.07237845 , 0.02628645 , 0.11194395,  0.04778854
    
    ,0.05965792 , 0.02774745,  0.07462718,  0.4593142 ,  0.01340093]
    
    

    创建一个包含特征重要性的Dataframe用于可视化

    cols = train.columns.values
    
    print(cols)
    
    print(rf_feature)
    
    feature_dataframe = pd.DataFrame( {'features': cols,
    
    'Random Forest feature importances': rf_features,
    
    'Extra Trees  feature importances': et_features,
    
    'AdaBoost feature importances': ada_features,
    
    'Gradient Boost feature importances': gb_features
    
    })
    
    feature_dataframe.head()
    
    ['Pclass' 'Sex' 'Age' 'Parch' 'Fare' 'Embarked' 'Name_length' 'Has_Cabin'
    
    'FamilySize' 'IsAlone' 'Title']
    
    None
    
    Out[109]:
    
    AdaBoost feature importances    Extra Trees feature importances Gradient Boost feature importances  Random Forest feature importances   features
    
    0   0.028000    0.121657    0.067961    0.104741    Pclass
    
    1   0.008000    0.370983    0.038893    0.218370    Sex
    
    2   0.012000    0.031296    0.072378    0.044327    Age
    
    3   0.058667    0.015916    0.026286    0.022492    Parch
    
    4   0.032000    0.055258    0.111944    0.054326    Fare
    
    

    绘制散点图描述每个模型输出的特征重要度

    trace = go.Scatter(
    
    y = feature_dataframe['Random Forest feature importances'].values,
    
    x = feature_dataframe['features'].values,
    
    mode='markers',
    
    marker=dict(
    
    sizemode = 'diameter',
    
    sizeref = 1,
    
    size = 25,
    
    #       size= feature_dataframe['AdaBoost feature importances'].values,
    
    #color = np.random.randn(500), #set color equal to a variable
    
    color = feature_dataframe['Random Forest feature importances'].values,
    
    colorscale='Portland',
    
    showscale=True
    
    ),
    
    text = feature_dataframe['features'].values
    
    )
    
    data = [trace]
    
    layout= go.Layout(
    
       autosize= True,
    
       title= 'Random Forest Feature Importance',
    
       hovermode= 'closest',
    
    #     xaxis= dict(
    
    #         title= 'Pop',
    
    #         ticklen= 5,
    
    #         zeroline= False,
    
    #         gridwidth= 2,
    
    #     ),
    
       yaxis=dict(
    
           title= 'Feature Importance',
    
           ticklen= 5,
    
           gridwidth= 2
    
       ),
    
       showlegend= False
    
    )
    
    fig = go.Figure(data=data, layout=layout)
    
    py.iplot(fig,filename='scatter2010')
    
    # Scatter plot
    
    trace = go.Scatter(
    
    y = feature_dataframe['Extra Trees  feature importances'].values,
    
    x = feature_dataframe['features'].values,
    
    mode='markers',
    
    marker=dict(
    
    sizemode = 'diameter',
    
    sizeref = 1,
    
    size = 25,
    
    #       size= feature_dataframe['AdaBoost feature importances'].values,
    
    #color = np.random.randn(500), #set color equal to a variable
    
    color = feature_dataframe['Extra Trees  feature importances'].values,
    
    colorscale='Portland',
    
    showscale=True
    
    ),
    
    text = feature_dataframe['features'].values
    
    )
    
    data = [trace]
    
    layout= go.Layout(
    
       autosize= True,
    
       title= 'Extra Trees Feature Importance',
    
       hovermode= 'closest',
    
    #     xaxis= dict(
    
    #         title= 'Pop',
    
    #         ticklen= 5,
    
    #         zeroline= False,
    
    #         gridwidth= 2,
    
    #     ),
    
       yaxis=dict(
    
           title= 'Feature Importance',
    
           ticklen= 5,
    
           gridwidth= 2
    
       ),
    
       showlegend= False
    
    )
    
    fig = go.Figure(data=data, layout=layout)
    
    py.iplot(fig,filename='scatter2010')
    
    # Scatter plot
    
    trace = go.Scatter(
    
    y = feature_dataframe['AdaBoost feature importances'].values,
    
    x = feature_dataframe['features'].values,
    
    mode='markers',
    
    marker=dict(
    
    sizemode = 'diameter',
    
    sizeref = 1,
    
    size = 25,
    
    #       size= feature_dataframe['AdaBoost feature importances'].values,
    
    #color = np.random.randn(500), #set color equal to a variable
    
    color = feature_dataframe['AdaBoost feature importances'].values,
    
    colorscale='Portland',
    
    showscale=True
    
    ),
    
    text = feature_dataframe['features'].values
    
    )
    
    data = [trace]
    
    layout= go.Layout(
    
       autosize= True,
    
       title= 'AdaBoost Feature Importance',
    
       hovermode= 'closest',
    
    #     xaxis= dict(
    
    #         title= 'Pop',
    
    #         ticklen= 5,
    
    #         zeroline= False,
    
    #         gridwidth= 2,
    
    #     ),
    
       yaxis=dict(
    
           title= 'Feature Importance',
    
           ticklen= 5,
    
           gridwidth= 2
    
       ),
    
       showlegend= False
    
    )
    
    fig = go.Figure(data=data, layout=layout)
    
    py.iplot(fig,filename='scatter2010')
    
    # Scatter plot
    
    trace = go.Scatter(
    
    y = feature_dataframe['Gradient Boost feature importances'].values,
    
    x = feature_dataframe['features'].values,
    
    mode='markers',
    
    marker=dict(
    
    sizemode = 'diameter',
    
    sizeref = 1,
    
    size = 25,
    
    #       size= feature_dataframe['AdaBoost feature importances'].values,
    
    #color = np.random.randn(500), #set color equal to a variable
    
    color = feature_dataframe['Gradient Boost feature importances'].values,
    
    colorscale='Portland',
    
    showscale=True
    
    ),
    
    text = feature_dataframe['features'].values
    
    )
    
    data = [trace]
    
    layout= go.Layout(
    
       autosize= True,
    
       title= 'Gradient Boosting Feature Importance',
    
       hovermode= 'closest',
    
    #     xaxis= dict(
    
    #         title= 'Pop',
    
    #         ticklen= 5,
    
    #         zeroline= False,
    
    #         gridwidth= 2,
    
    #     ),
    
       yaxis=dict(
    
           title= 'Feature Importance',
    
           ticklen= 5,
    
           gridwidth= 2
    
       ),
    
       showlegend= False
    
    )
    
    fig = go.Figure(data=data, layout=layout)
    
    py.iplot(fig,filename='scatter2010')
    

    由于图片太多这里就不贴了 ,可以访问github

    计算每个模型对每个特征重要性的平均值

    feature_dataframe['mean'] = feature_dataframe.mean(axis= 1) # axis = 1 表示操作第二轴(横轴)
    
    feature_dataframe.head(3)
    
    Out[111]:
    
    AdaBoost feature importances    Extra Trees feature importances Gradient Boost feature importances  Random Forest feature importances   features    mean
    
    0   0.028   0.121657    0.067961    0.104741    Pclass  0.080590
    
    1   0.008   0.370983    0.038893    0.218370    Sex 0.159062
    
    2   0.012   0.031296    0.072378    0.044327    Age 0.040000
    

    绘制的到的平均值

    y = feature_dataframe['mean'].values
    
    x = feature_dataframe['features'].values
    
    data = [go.Bar(
    
               x= x,
    
                y= y,
    
               width = 0.5,
    
               marker=dict(
    
                  color = feature_dataframe['mean'].values,
    
               colorscale='Portland',
    
               showscale=True,
    
               reversescale = False
    
               ),
    
               opacity=0.6
    
           )]
    
    layout= go.Layout(
    
       autosize= True,
    
       title= 'Barplots of Mean Feature Importance',
    
       hovermode= 'closest',
    
    #     xaxis= dict(
    
    #         title= 'Pop',
    
    #         ticklen= 5,
    
    #         zeroline= False,
    
    #         gridwidth= 2,
    
    #     ),
    
       yaxis=dict(
    
           title= 'Feature Importance',
    
           ticklen= 5,
    
           gridwidth= 2
    
       ),
    
       showlegend= False
    
    )
    
    fig = go.Figure(data=data, layout=layout)
    
    py.iplot(fig, filename='bar-direct-labels')
    

    相关文章

      网友评论

        本文标题:泰坦尼克(Titanic)的数据分析(特征工程部分)

        本文链接:https://www.haomeiwen.com/subject/gpmxnttx.html