美文网首页
降维算法--PCA

降维算法--PCA

作者: ForgetThatNight | 来源:发表于2018-07-07 10:37 被阅读19次
    import numpy as np
    import pandas as pd
    df = pd.read_csv('iris.data')
    df.head()
    
    df.columns=['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class']
    df.head()
    
    # split data table into data X and class labels y
    
    X = df.ix[:,0:4].values
    y = df.ix[:,4].values
    
    from matplotlib import pyplot as plt
    import math
    
    label_dict = {1: 'Iris-Setosa',
                  2: 'Iris-Versicolor',
                  3: 'Iris-Virgnica'}
    
    feature_dict = {0: 'sepal length [cm]',
                    1: 'sepal width [cm]',
                    2: 'petal length [cm]',
                    3: 'petal width [cm]'}
    
    
    plt.figure(figsize=(8, 6))
    for cnt in range(4):
        plt.subplot(2, 2, cnt+1)
        for lab in ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica'):
            plt.hist(X[y==lab, cnt],
                         label=lab,
                         bins=10,
                         alpha=0.3,)
        plt.xlabel(feature_dict[cnt])
        plt.legend(loc='upper right', fancybox=True, fontsize=8)
    
    plt.tight_layout()
    plt.show()
    
    from sklearn.preprocessing import StandardScaler
    X_std = StandardScaler().fit_transform(X)
    print (X_std)
    

    输出 :
    [[-1.1483555 -0.11805969 -1.35396443 -1.32506301]
    [-1.3905423 0.34485856 -1.41098555 -1.32506301]
    [-1.51163569 0.11339944 -1.29694332 -1.32506301]
    [-1.02726211 1.27069504 -1.35396443 -1.32506301]
    [-0.54288852 1.9650724 -1.18290109 -1.0614657 ]
    。。。
    [ 0.78913885 -0.11805969 0.81283789 1.04731282]
    [ 0.42585866 0.8077768 0.92688012 1.4427088 ]
    [ 0.06257847 -0.11805969 0.75581678 0.78371551]]

    mean_vec = np.mean(X_std, axis=0)
    cov_mat = (X_std - mean_vec).T.dot((X_std - mean_vec)) / (X_std.shape[0]-1)
    print('Covariance matrix \n%s' %cov_mat)
    

    输出 :
    Covariance matrix
    [[ 1.00675676 -0.10448539 0.87716999 0.82249094]
    [-0.10448539 1.00675676 -0.41802325 -0.35310295]
    [ 0.87716999 -0.41802325 1.00675676 0.96881642]
    [ 0.82249094 -0.35310295 0.96881642 1.00675676]]

    print('NumPy covariance matrix: \n%s' %np.cov(X_std.T))
    

    输出 :
    NumPy covariance matrix:
    [[ 1.00675676 -0.10448539 0.87716999 0.82249094]
    [-0.10448539 1.00675676 -0.41802325 -0.35310295]
    [ 0.87716999 -0.41802325 1.00675676 0.96881642]
    [ 0.82249094 -0.35310295 0.96881642 1.00675676]]

    cov_mat = np.cov(X_std.T)
    
    eig_vals, eig_vecs = np.linalg.eig(cov_mat)
    
    print('Eigenvectors \n%s' %eig_vecs)
    print('\nEigenvalues \n%s' %eig_vals)
    

    输出 :
    Eigenvectors
    [[ 0.52308496 -0.36956962 -0.72154279 0.26301409]
    [-0.25956935 -0.92681168 0.2411952 -0.12437342]
    [ 0.58184289 -0.01912775 0.13962963 -0.80099722]
    [ 0.56609604 -0.06381646 0.63380158 0.52321917]]

    Eigenvalues
    [ 2.92442837 0.93215233 0.14946373 0.02098259]

    # Make a list of (eigenvalue, eigenvector) tuples
    eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:,i]) for i in range(len(eig_vals))]
    print (eig_pairs)
    print ('----------')
    # Sort the (eigenvalue, eigenvector) tuples from high to low
    eig_pairs.sort(key=lambda x: x[0], reverse=True)
    
    # Visually confirm that the list is correctly sorted by decreasing eigenvalues
    print('Eigenvalues in descending order:')
    for i in eig_pairs:
        print(i[0])
    

    输出 :
    [(2.9244283691111144, array([ 0.52308496, -0.25956935, 0.58184289, 0.56609604])), (0.93215233025350641, array([-0.36956962, -0.92681168, -0.01912775, -0.06381646])), (0.14946373489813314, array([-0.72154279, 0.2411952 , 0.13962963, 0.63380158])), (0.020982592764270606, array([ 0.26301409, -0.12437342, -0.80099722, 0.52321917]))]


    Eigenvalues in descending order:
    2.92442836911
    0.932152330254
    0.149463734898
    0.0209825927643

    tot = sum(eig_vals)
    var_exp = [(i / tot)*100 for i in sorted(eig_vals, reverse=True)]
    print (var_exp)
    cum_var_exp = np.cumsum(var_exp)
    cum_var_exp
    

    输出 :
    [72.620033326920336, 23.147406858644135, 3.7115155645845164, 0.52104424985101538]
    Out[49]:
    array([ 72.62003333, 95.76744019, 99.47895575, 100. ])

    a = np.array([1,2,3,4])
    print (a)
    print ('-----------')
    print (np.cumsum(a))
    

    输出 :
    [1 2 3 4]


    [ 1 3 6 10]

    
    plt.figure(figsize=(6, 4))
    
    plt.bar(range(4), var_exp, alpha=0.5, align='center',
                label='individual explained variance')
    plt.step(range(4), cum_var_exp, where='mid',
                 label='cumulative explained variance')
    plt.ylabel('Explained variance ratio')
    plt.xlabel('Principal components')
    plt.legend(loc='best')
    plt.tight_layout()
    plt.show()
    
    matrix_w = np.hstack((eig_pairs[0][1].reshape(4,1),
                          eig_pairs[1][1].reshape(4,1)))
    
    print('Matrix W:\n', matrix_w)
    

    输出 :
    Matrix W:
    [[ 0.52308496 -0.36956962]
    [-0.25956935 -0.92681168]
    [ 0.58184289 -0.01912775]
    [ 0.56609604 -0.06381646]]

    Y = X_std.dot(matrix_w)
    Y
    

    输出 :
    array([[-2.10795032, 0.64427554],
    [-2.38797131, 0.30583307],
    [-2.32487909, 0.56292316],
    [-2.40508635, -0.687591 ],
    。。。。
    [ 1.99464025, -1.04517619],
    [ 1.85977129, -0.37934387],
    [ 1.54200377, 0.90808604],
    [ 1.50925493, -0.26460621],
    [ 1.3690965 , -1.01583909],
    [ 0.94680339, 0.02182097]])

    plt.figure(figsize=(6, 4))
    for lab, col in zip(('Iris-setosa', 'Iris-versicolor', 'Iris-virginica'),
                            ('blue', 'red', 'green')):
         plt.scatter(X[y==lab, 0],
                    X[y==lab, 1],
                    label=lab,
                    c=col)
    plt.xlabel('sepal_len')
    plt.ylabel('sepal_wid')
    plt.legend(loc='best')
    plt.tight_layout()
    plt.show()
    
    plt.figure(figsize=(6, 4))
    for lab, col in zip(('Iris-setosa', 'Iris-versicolor', 'Iris-virginica'),
                            ('blue', 'red', 'green')):
         plt.scatter(Y[y==lab, 0],
                    Y[y==lab, 1],
                    label=lab,
                    c=col)
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.legend(loc='lower center')
    plt.tight_layout()
    plt.show()
    

    相关文章

      网友评论

          本文标题:降维算法--PCA

          本文链接:https://www.haomeiwen.com/subject/xifiuftx.html