美文网首页
使用matplotlib进行数据可视化

使用matplotlib进行数据可视化

作者: 热苏斯 | 来源:发表于2018-12-01 11:56 被阅读0次
    ###########################################
    # Suppress matplotlib user warnings
    # Necessary for newer version of matplotlib
    import warnings
    warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib")
    #
    # Display inline matplotlib plots with IPython.
    # get_ipython() returns None when no IPython shell is running, so the
    # magic is only issued when a shell is actually present; this keeps the
    # module importable from a plain Python interpreter.
    from IPython import get_ipython
    _ipython_shell = get_ipython()
    if _ipython_shell is not None:
        _ipython_shell.run_line_magic('matplotlib', 'inline')
    ###########################################
    
    import matplotlib.pyplot as plt
    import matplotlib.cm as cm
    import pandas as pd
    import numpy as np
    
    def pca_results(good_data, pca):
        '''
        Tabulate and plot the outcome of a fitted PCA.

        good_data: the data the PCA was fitted on; its keys() supply the
                   feature (column) names of the weights table
        pca: object exposing components_ and explained_variance_ratio_

        Draws a bar chart of the feature weights of every component and
        writes each component's explained variance ratio above the chart.

        return: a DataFrame indexed by 'Dimension N' whose first column is
                'Explained Variance', followed by one column per feature
                (all values rounded to 4 decimals)
        '''

        # One label per principal component: 'Dimension 1', 'Dimension 2', ...
        n_components = len(pca.components_)
        dimensions = ['Dimension {}'.format(k + 1) for k in range(n_components)]

        # Feature weights of each component, one row per dimension
        feature_names = list(good_data.keys())
        components = pd.DataFrame(np.round(pca.components_, 4),
                                  columns = feature_names,
                                  index = dimensions)

        # Explained variance ratios as a single-column table
        ratio_column = pca.explained_variance_ratio_.reshape(n_components, 1)
        variance_ratios = pd.DataFrame(np.round(ratio_column, 4),
                                       columns = ['Explained Variance'],
                                       index = dimensions)

        # Bar chart of the feature weights, grouped by dimension
        fig, ax = plt.subplots(figsize = (14,8))
        components.plot.bar(ax = ax)
        ax.set_ylabel("Feature Weights")
        ax.set_xticklabels(dimensions, rotation=0)

        # Annotate each group of bars with its explained variance ratio,
        # placed slightly above the current top of the y-axis
        for position, ratio in enumerate(pca.explained_variance_ratio_):
            top_of_axis = ax.get_ylim()[1]
            caption = "Explained Variance\n          %.4f"%(ratio)
            ax.text(position-0.40, top_of_axis + 0.05, caption)

        # Variance column first, then the feature weights
        summary = pd.concat([variance_ratios, components], axis = 1)
        return summary
    
    def cluster_results(reduced_data, preds, centers, pca_samples):
        '''
        Visualizes the PCA-reduced cluster data in two dimensions
        Adds cues for cluster centers and student-selected sample data

        reduced_data: DataFrame with 'Dimension 1' and 'Dimension 2' columns
        preds: cluster label for each row; becomes the 'Cluster' column that
               the points are grouped and colored by
        centers: sequence of cluster centers; the first two coordinates of
                 each are drawn as a numbered white disc
        pca_samples: 2-D array whose columns 0 and 1 are drawn as black crosses

        return: None (the figure is drawn as a side effect)
        '''

        predictions = pd.DataFrame(preds, columns = ['Cluster'])
        plot_data = pd.concat([predictions, reduced_data], axis = 1)

        # Generate the cluster plot
        fig, ax = plt.subplots(figsize = (14,8))

        # Color map. pyplot.get_cmap is used because matplotlib.cm.get_cmap
        # was deprecated in matplotlib 3.7 and removed in 3.9.
        cmap = plt.get_cmap('gist_rainbow')

        # Cluster labels are spread evenly over the colormap range [0, 1].
        # With a single center the span would be zero, so clamp it to 1 to
        # avoid a ZeroDivisionError.
        color_span = max(len(centers) - 1, 1)

        # Color the points based on assigned cluster
        for i, cluster in plot_data.groupby('Cluster'):
            cluster.plot(ax = ax, kind = 'scatter', x = 'Dimension 1', y = 'Dimension 2',
                         color = cmap(i * 1.0 / color_span), label = 'Cluster %i'%(i), s=30)

        # Plot centers with indicators: a white disc with the center's number on top
        for i, c in enumerate(centers):
            ax.scatter(x = c[0], y = c[1], color = 'white', edgecolors = 'black',
                       alpha = 1, linewidth = 2, marker = 'o', s=200)
            ax.scatter(x = c[0], y = c[1], marker='$%d$'%(i), alpha = 1, s=100)

        # Plot transformed sample points
        ax.scatter(x = pca_samples[:,0], y = pca_samples[:,1],
                   s = 150, linewidth = 4, color = 'black', marker = 'x')

        # Set plot title
        ax.set_title("Cluster Learning on PCA-Reduced Data - Centroids Marked by Number\nTransformed Sample Data Marked by Black Cross")
    
    
    def biplot(good_data, reduced_data, pca):
        '''
        Draw a biplot: the reduced data as a scatterplot, overlaid with one
        red arrow per original feature showing that feature's projection.

        good_data: original data, before transformation.
                   Needs to be a pandas dataframe with valid column names,
                   which are used as the arrow labels
        reduced_data: the reduced data; its 'Dimension 1' and 'Dimension 2'
                      columns are plotted
        pca: pca object that contains the components_ attribute

        return: the matplotlib axes the biplot was drawn on
                (for any additional customization)

        This procedure is inspired by the script:
        https://github.com/teddyroland/python-biplot
        '''

        # Scaling factors that make the arrows (and their labels) easier to see
        arrow_scale = 7.0
        label_scale = 8.0

        fig, ax = plt.subplots(figsize = (14,8))

        # Scatterplot of the reduced data
        dim1 = reduced_data.loc[:, 'Dimension 1']
        dim2 = reduced_data.loc[:, 'Dimension 2']
        ax.scatter(x=dim1, y=dim2, facecolors='b', edgecolors='b', s=70, alpha=0.5)

        # After the transpose, row k is labelled with the k-th column of good_data
        loadings = pca.components_.T

        # Projections of the original features: an arrow from the origin
        # plus the feature name placed just beyond the arrow tip
        for idx, loading in enumerate(loadings):
            comp_x, comp_y = loading[0], loading[1]
            ax.arrow(0, 0, arrow_scale*comp_x, arrow_scale*comp_y,
                     head_width=0.2, head_length=0.2, linewidth=2, color='red')
            ax.text(comp_x*label_scale, comp_y*label_scale, good_data.columns[idx],
                    color='black', ha='center', va='center', fontsize=18)

        ax.set_xlabel("Dimension 1", fontsize=14)
        ax.set_ylabel("Dimension 2", fontsize=14)
        ax.set_title("PC plane with original feature projections.", fontsize=16)
        return ax
        
    
    def channel_results(reduced_data, outliers, pca_samples, data_path = "customers.csv"):
        '''
        Visualizes the PCA-reduced cluster data in two dimensions using the full dataset
        Data is labeled by "Channel" and cues added for student-selected sample data

        reduced_data: DataFrame with 'Dimension 1' and 'Dimension 2' columns
        outliers: positions of the rows to drop from the full dataset before
                  its 'Channel' column is paired with reduced_data
        pca_samples: iterable of transformed samples; the first two
                     coordinates of each are circled and numbered
        data_path: CSV file holding the full dataset with a 'Channel' column
                   (defaults to "customers.csv")

        return: False if the dataset could not be loaded, otherwise None
                (the figure is drawn as a side effect)
        '''

        # Check that the dataset is loadable. Only I/O failures (OSError) and
        # parsing/decoding failures (ValueError subclasses) are treated as
        # "not loadable"; KeyboardInterrupt and programming errors propagate.
        try:
            full_data = pd.read_csv(data_path)
        except (OSError, ValueError):
            print("Dataset could not be loaded. Is the file missing?")
            return False

        # Create the Channel DataFrame, dropping the outlier rows and
        # renumbering so it can be concatenated column-wise with reduced_data
        channel = pd.DataFrame(full_data['Channel'], columns = ['Channel'])
        channel = channel.drop(channel.index[outliers]).reset_index(drop = True)
        labeled = pd.concat([reduced_data, channel], axis = 1)

        # Generate the cluster plot
        fig, ax = plt.subplots(figsize = (14,8))

        # Color map. pyplot.get_cmap is used because matplotlib.cm.get_cmap
        # was deprecated in matplotlib 3.7 and removed in 3.9.
        cmap = plt.get_cmap('gist_rainbow')

        # Color the points based on assigned Channel.
        # Channel values are used as 1-based positions into labels.
        labels = ['Hotel/Restaurant/Cafe', 'Retailer']
        for i, group in labeled.groupby('Channel'):
            group.plot(ax = ax, kind = 'scatter', x = 'Dimension 1', y = 'Dimension 2',
                       color = cmap((i-1)*1.0/2), label = labels[i-1], s=30)

        # Plot transformed sample points: a hollow circle around each sample
        # and its index drawn slightly up and to the right
        for i, sample in enumerate(pca_samples):
            ax.scatter(x = sample[0], y = sample[1],
                       s = 200, linewidth = 3, color = 'black', marker = 'o', facecolors = 'none')
            ax.scatter(x = sample[0]+0.25, y = sample[1]+0.3, marker='$%d$'%(i), alpha = 1, s=125)

        # Set plot title
        ax.set_title("PCA-Reduced Data Labeled by 'Channel'\nTransformed Sample Data Circled")
    

    相关文章

      网友评论

          本文标题:使用matplotlib进行数据可视化

          本文链接:https://www.haomeiwen.com/subject/zvilcqtx.html