
Principal Component Analysis: Employee Attrition Analysis

Author: a_big_cat | Published 2021-03-29 08:38

    Introduction

    Principal component analysis (PCA) recombines the original variables into a set of mutually uncorrelated composite variables that retain as much of the original information as possible, thereby reducing dimensionality. Low-dimensional data is easier to visualize, which makes it easier to inspect the structure of the data.
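
    In the notation used below, this is the standard PCA recipe the post follows: standardize the data matrix X, form its covariance matrix, eigendecompose it, and project onto the top-k eigenvectors:

    \Sigma = \frac{1}{n-1} X^{\top} X, \qquad \Sigma\, w_i = \lambda_i w_i, \qquad Z = X W_k

    where X is the standardized data, the eigenvectors w_i are sorted by decreasing eigenvalue \lambda_i, and W_k stacks the top k of them as columns.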

    # satisfaction_level: the employee's reported satisfaction level (0-1)
    # last_evaluation: score from the employee's last performance evaluation (0-1)
    # number_project: number of projects the employee has worked on
    # average_montly_hours: average monthly working hours (the misspelling of "monthly" is in the dataset itself)
    # time_spend_company: number of years the employee has spent at the company
    # Work_accident: whether the employee had a workplace accident (0/1)
    # left: whether the employee left the company (0/1, the target)
    # promotion_last_5years: whether the employee was promoted in the last five years (0/1)
    # sales: the department the employee works in
    # salary: salary level (low / medium / high)
    

    1. Loading the data

    import numpy as np
    import pandas as pd

    # Load the HR dataset (the path assumes the CSV sits next to the notebook)
    HR_comma_sep = pd.read_csv('./HR_comma_sep.csv')
    HR_comma_sep.head()
    
       satisfaction_level  last_evaluation  number_project  average_montly_hours  time_spend_company  Work_accident  left  promotion_last_5years  sales  salary
    0                0.38             0.53               2                   157                   3              0     1                      0  sales     low
    1                0.80             0.86               5                   262                   6              0     1                      0  sales  medium
    2                0.11             0.88               7                   272                   4              0     1                      0  sales  medium
    3                0.72             0.87               5                   223                   5              0     1                      0  sales     low
    4                0.37             0.52               2                   159                   3              0     1                      0  sales     low

    2. Data exploration

    
    
    HR_comma_sep.shape
    
    
    
    (14999, 10)
    
    
    
    HR_comma_sep.info()
    
    
    
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 14999 entries, 0 to 14998
    Data columns (total 10 columns):
     #   Column                 Non-Null Count  Dtype  
    ---  ------                 --------------  -----  
     0   satisfaction_level     14999 non-null  float64
     1   last_evaluation        14999 non-null  float64
     2   number_project         14999 non-null  int64  
     3   average_montly_hours   14999 non-null  int64  
     4   time_spend_company     14999 non-null  int64  
     5   Work_accident          14999 non-null  int64  
     6   left                   14999 non-null  int64  
     7   promotion_last_5years  14999 non-null  int64  
     8   sales                  14999 non-null  object 
     9   salary                 14999 non-null  object 
    dtypes: float64(2), int64(6), object(2)
    memory usage: 1.1+ MB
    
    
    
    HR_comma_sep.describe().T
    
    
    
                             count        mean        std    min     25%     50%     75%    max
    satisfaction_level     14999.0    0.612834   0.248631   0.09    0.44    0.64    0.82    1.0
    last_evaluation        14999.0    0.716102   0.171169   0.36    0.56    0.72    0.87    1.0
    number_project         14999.0    3.803054   1.232592   2.00    3.00    4.00    5.00    7.0
    average_montly_hours   14999.0  201.050337  49.943099  96.00  156.00  200.00  245.00  310.0
    time_spend_company     14999.0    3.498233   1.460136   2.00    3.00    3.00    4.00   10.0
    Work_accident          14999.0    0.144610   0.351719   0.00    0.00    0.00    0.00    1.0
    left                   14999.0    0.238083   0.425924   0.00    0.00    0.00    0.00    1.0
    promotion_last_5years  14999.0    0.021268   0.144281   0.00    0.00    0.00    0.00    1.0
    
    
    import matplotlib.pyplot as plt
    import seaborn as sns
    %matplotlib inline

    # Heatmap of pairwise correlations between the numeric columns
    correlation = HR_comma_sep.corr(numeric_only=True)  # numeric_only is required on pandas >= 2.0
    plt.figure(figsize=(10, 10))
    sns.heatmap(correlation, vmax=1, square=True, annot=True, cmap='cubehelix')
    plt.title('Correlation between different features')
    
    
    
    Text(0.5, 1, 'Correlation between different features')
    
    [Figure: correlation heatmap of the numeric features]
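
    The heatmap covers only the numeric columns; corr() skips the categorical sales and salary columns. A small optional sketch for inspecting those two columns and the attrition rate across salary levels (uses only the DataFrame loaded above):

    # Distribution of the two categorical columns excluded from corr()
    print(HR_comma_sep['salary'].value_counts())
    print(HR_comma_sep['sales'].value_counts())

    # Attrition rate (mean of `left`) per salary level
    print(HR_comma_sep.groupby('salary')['left'].mean())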

    3. Model development

    
    
    # Drop the categorical columns and the target label before PCA
    HR_comma_sep_X = HR_comma_sep.drop(labels=['sales', 'salary', 'left'], axis=1)
    HR_comma_sep_X.head()
    
    
    
       satisfaction_level  last_evaluation  number_project  average_montly_hours  time_spend_company  Work_accident  promotion_last_5years
    0                0.38             0.53               2                   157                   3              0                      0
    1                0.80             0.86               5                   262                   6              0                      0
    2                0.11             0.88               7                   272                   4              0                      0
    3                0.72             0.87               5                   223                   5              0                      0
    4                0.37             0.52               2                   159                   3              0                      0
    
    
    from sklearn.preprocessing import StandardScaler

    # Standardize each feature to zero mean and unit variance
    HR_comma_sep_X_std = StandardScaler().fit_transform(HR_comma_sep_X)
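
    A quick sanity check (not in the original post): after StandardScaler, every column should have mean ~0 and standard deviation ~1.

    # Verify the standardization: columns centered with unit variance
    print(HR_comma_sep_X_std.mean(axis=0).round(6))  # all ~0
    print(HR_comma_sep_X_std.std(axis=0).round(6))   # all 1.0 (ddof=0, matching StandardScaler)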
    
    
    
    
    
    # Covariance matrix of the standardized data (its mean is already ~0 after scaling)
    mean_vec = np.mean(HR_comma_sep_X_std, axis=0)
    cov_mat = (HR_comma_sep_X_std - mean_vec).T.dot((HR_comma_sep_X_std - mean_vec)) / (HR_comma_sep_X_std.shape[0] - 1)
    print('Covariance matrix \n%s' % cov_mat)
    
    
    
    Covariance matrix 
    [[ 1.00006668  0.10502822 -0.14297912 -0.02004945 -0.1008728   0.05870115
       0.02560689]
     [ 0.10502822  1.00006668  0.34935588  0.33976445  0.1315995  -0.00710476
      -0.00868435]
     [-0.14297912  0.34935588  1.00006668  0.41723845  0.19679901 -0.00474086
      -0.00606436]
     [-0.02004945  0.33976445  0.41723845  1.00006668  0.12776343 -0.01014356
      -0.00354465]
     [-0.1008728   0.1315995   0.19679901  0.12776343  1.00006668  0.00212056
       0.06743742]
     [ 0.05870115 -0.00710476 -0.00474086 -0.01014356  0.00212056  1.00006668
       0.03924805]
     [ 0.02560689 -0.00868435 -0.00606436 -0.00354465  0.06743742  0.03924805
       1.00006668]]
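
    The same matrix comes straight from NumPy; a one-line cross-check. (The diagonal is 1.00006668 rather than exactly 1 because the covariance uses an n-1 denominator while StandardScaler divides by n: 14999/14998 ≈ 1.0000667.)

    # np.cov expects variables in rows, hence the transpose; its default ddof matches the n-1 above
    cov_mat_np = np.cov(HR_comma_sep_X_std.T)
    print(np.allclose(cov_mat, cov_mat_np))  # True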
    
    
    
    # Eigendecomposition of the covariance matrix
    eig_vals, eig_vecs = np.linalg.eig(cov_mat)
    print('Eigenvectors \n%s' % eig_vecs)
    print('\nEigenvalues \n%s' % eig_vals)
    
    
    
    Eigenvectors 
    [[-0.08797699 -0.29189921  0.27784886  0.33637135  0.79752505  0.26786864
      -0.09438973]
     [ 0.50695734  0.30996609 -0.70780994  0.07393548  0.33180877  0.1101505
      -0.13499526]
     [ 0.5788351  -0.77736008 -0.00657105 -0.19677589 -0.10338032 -0.10336241
      -0.02293518]
     [ 0.54901653  0.45787675  0.63497294 -0.25170987  0.10388959 -0.01034922
      -0.10714981]
     [ 0.31354922  0.05287224  0.12200054  0.78782241 -0.28404472  0.04036861
       0.42547869]
     [-0.01930249  0.04433104 -0.03622859 -0.05762997  0.37489883 -0.8048393
       0.45245222]
     [ 0.00996933  0.00391698 -0.04873036 -0.39411153  0.10557298  0.50589173
       0.75836313]]
    
    Eigenvalues 
    [1.83017431 0.54823098 0.63363587 0.84548166 1.12659606 0.95598647
     1.06036136]
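
    Because a covariance matrix is symmetric, np.linalg.eigh is the better-suited routine: it is numerically more stable and returns the eigenvalues already sorted in ascending order, which would make the manual sort below unnecessary. A minimal equivalent:

    # eigh is specialized for symmetric/Hermitian matrices
    eig_vals_h, eig_vecs_h = np.linalg.eigh(cov_mat)
    print(eig_vals_h)  # ascending order; reverse for largest-first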
    
    
    
    # Build (eigenvalue, eigenvector) tuples
    eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]
    # Sort the tuples by eigenvalue, largest first
    eig_pairs.sort(key=lambda x: x[0], reverse=True)
    print('Eigenvalues in descending order:')
    for i in eig_pairs:
        print(i[0])
    
    
    
    Eigenvalues in descending order:
    1.830174313875499
    1.1265960639915473
    1.0603613622840846
    0.9559864740066265
    0.8454816637143464
    0.633635874483021
    0.5482309765420602
    
    
    
    # Percentage of total variance explained by each component, largest first
    tot = sum(eig_vals)
    var_exp = [(i / tot) * 100 for i in sorted(eig_vals, reverse=True)]
    plt.figure(figsize=(6, 4))
    plt.bar(range(7), var_exp, alpha=0.5, align='center', label='individual explained variance')
    plt.ylabel('Explained variance (%)')
    plt.xlabel('Principal components')
    plt.legend(loc='best')
    plt.tight_layout()
    
    
    
    [Figure: bar chart of the variance explained by each principal component]
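
    The bar chart shows the individual ratios; printing the running total makes the 90% cutoff explicit (a small addition to the code above):

    # Cumulative explained variance in percent
    cum_var_exp = np.cumsum(var_exp)
    print(cum_var_exp)  # the first six components together exceed 90%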
    
    
    # Projection matrix W: the top two eigenvectors stacked as columns
    matrix_w = np.hstack((eig_pairs[0][1].reshape(7, 1),
                          eig_pairs[1][1].reshape(7, 1)))
    print('Matrix W:\n', matrix_w)
    
    
    
    Matrix W:
     [[-0.08797699  0.79752505]
     [ 0.50695734  0.33180877]
     [ 0.5788351  -0.10338032]
     [ 0.54901653  0.10388959]
     [ 0.31354922 -0.28404472]
     [-0.01930249  0.37489883]
     [ 0.00996933  0.10557298]]
    
    
    
    # Project the standardized data onto the two-dimensional principal subspace
    Y = HR_comma_sep_X_std.dot(matrix_w)
    Y
    
    
    
    array([[-1.90035018, -1.12083103],
           [ 2.1358322 ,  0.2493369 ],
           [ 3.05891625, -1.68312693],
           ...,
           [-2.0507165 , -1.182032  ],
           [ 2.91418496, -1.42752606],
           [-1.91543672, -1.17021407]])
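
    This 2-D projection is what enables the visual inspection promised in the introduction. One optional way to plot it, coloring each employee by the `left` column (matplotlib is already imported above):

    # Scatter of the first two principal components, colored by attrition
    plt.figure(figsize=(8, 6))
    plt.scatter(Y[:, 0], Y[:, 1], c=HR_comma_sep['left'], cmap='coolwarm', s=5, alpha=0.5)
    plt.xlabel('Principal component 1')
    plt.ylabel('Principal component 2')
    plt.colorbar(label='left')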
    
    
    
    from sklearn.decomposition import PCA

    # Cumulative explained variance as a function of the number of components
    pca = PCA().fit(HR_comma_sep_X_std)
    plt.plot(np.cumsum(pca.explained_variance_ratio_))
    plt.xlim(0, 7)
    plt.xlabel('Number of components')
    plt.ylabel('Cumulative explained variance')
    
    
    
    Text(0, 0.5, 'Cumulative explained variance')
    
    [Figure: cumulative explained variance versus number of components]
    
    
    from sklearn.decomposition import PCA

    # Reduce to the first six principal components
    sklearn_pca = PCA(n_components=6)
    Y_sklearn = sklearn_pca.fit_transform(HR_comma_sep_X_std)
    print(Y_sklearn)
    Y_sklearn.shape
    
    
    
    [[-1.90035018 -1.12083103 -0.0797787   0.03228437 -0.07256447  0.06063013]
     [ 2.1358322   0.2493369   0.0936161   0.50676925  1.2487747  -0.61378158]
     [ 3.05891625 -1.68312693 -0.301682   -0.4488635  -1.12495888  0.29066929]
     ...
     [-2.0507165  -1.182032   -0.04594506  0.02441143 -0.01553247  0.24980658]
     [ 2.91418496 -1.42752606 -0.36333357 -0.31517759 -0.97107375  0.51444624]
     [-1.91543672 -1.17021407 -0.07024077  0.01486762 -0.09545357  0.01773844]]
    
    
    
    
    
    (14999, 6)
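
    Two quick checks (not in the original post): the total variance retained by the six components, and agreement between sklearn and the manual projection. PCA components are only defined up to sign, so the comparison uses absolute values.

    # Variance retained by the 6 components; backs the 90% claim in the summary
    print(sklearn_pca.explained_variance_ratio_.sum())  # ~0.92

    # sklearn's first two components match the manual Y up to sign
    print(np.allclose(np.abs(Y), np.abs(Y_sklearn[:, :2])))  # True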
    

    4. Summary

    The cumulative variance of the first six principal components exceeds 90%, so the 7-dimensional feature space can be reduced to a 6-dimensional subspace with little loss of information.
