美文网首页
Notebook - Quick Visualization and EDA for Beginners

Notebook - Quick Visualization and EDA for Beginners

作者: 左心Chris | 来源:发表于2019-10-28 14:54 被阅读0次

    https://www.kaggle.com/fatihbilgin/quick-visualization-and-eda-for-beginners

    1 Importing

    import numpy as np
    import pandas as pd 
    import matplotlib.pyplot as plt
    import seaborn as sns
    import pandas_profiling as pp
    import plotly.graph_objs as go
    from plotly.offline import iplot
    import plotly.express as px
    
    # Read the train and test CSVs in one pass; in both frames the 'Id'
    # column is used as the row index instead of a regular data column.
    df_train, df_test = (
        pd.read_csv(csv_path, index_col='Id')
        for csv_path in (
            '../input/learn-together/train.csv',
            '../input/learn-together/test.csv',
        )
    )
    
    

    2 EDA

    # First look at the training frame: leading rows, dtype/null summary,
    # and transposed summary statistics (one row per column).
    df_train.head()
    df_train.info()
    df_train.describe().T
    # Labels of the columns from position 10 up to (but excluding) the last
    # one. The attribute is `columns`; the previous spelling `colums` raised
    # AttributeError and stopped the cell before the conversion below ran.
    df_train.iloc[:, 10:-1].columns
    # Change those columns to categorical ones. The test frame is sliced with
    # 10: (no -1) — presumably because it has no trailing target column;
    # TODO confirm against the CSV schema.
    # NOTE(review): assigning through .iloc may keep the existing column
    # dtypes on some pandas versions — check df_train.dtypes afterwards.
    df_train.iloc[:,10:-1] = df_train.iloc[:,10:-1].astype("category")
    df_test.iloc[:,10:] = df_test.iloc[:,10:].astype("category")
    

    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.astype.html

    热力图

    # Annotated heatmap of the pairwise column correlations of df_train,
    # drawn on an 8x6 figure with one-decimal cell labels.
    heat_fig, heat_ax = plt.subplots(figsize=(8, 6))
    correlations = df_train.corr()
    sns.heatmap(
        correlations,
        annot=True,
        linewidths=.5,
        fmt='.1f',
        ax=heat_ax,
    )

    plt.show()
    

    3 Data visualization

    scatter 图

    # Scatter plot of the two hydrology-distance columns: vertical distance
    # on the x axis, horizontal distance on the y axis, semi-transparent
    # markers so overlapping points stay visible.
    hydrology_ax = df_train.plot.scatter(
        x='Vertical_Distance_To_Hydrology',
        y='Horizontal_Distance_To_Hydrology',
        alpha=0.5,
        color='darkblue',
        figsize=(12, 9),
    )

    # Label the axes object returned by the plot call directly.
    hydrology_ax.set_title('Vertical And Horizontal Distance To Hydrology')
    hydrology_ax.set_xlabel("Vertical Distance")
    hydrology_ax.set_ylabel("Horizontal Distance")

    plt.show()
    
    

    box plot

    # One plotly box trace per hydrology-distance column; each spec is
    # (source column, legend name, marker colour).
    trace1, trace2 = (
        go.Box(y=df_train[column], name=label, marker={'color': rgb})
        for column, label, rgb in (
            ("Vertical_Distance_To_Hydrology", 'Vertical Distance', 'rgb(0,145,119)'),
            ("Horizontal_Distance_To_Hydrology", 'Horizontal Distance', 'rgb(5, 79, 174)'),
        )
    )
    data = [trace1, trace2]

    # Fixed-size 700x500 figure with a light grey paper and plot background.
    layout = {
        'autosize': False,
        'width': 700,
        'height': 500,
        'title': 'Distance To Hydrology',
        'paper_bgcolor': 'rgb(243, 243, 243)',
        'plot_bgcolor': 'rgb(243, 243, 243)',
        'margin': {'l': 40, 'r': 30, 'b': 80, 't': 100},
    }
    fig = {'data': data, 'layout': layout}

    iplot(fig)
    

    histogram

    # Two 30-bin histograms side by side: vertical distance on the left
    # axes, horizontal distance on the right, each with explicit x ticks.
    hist_fig, hist_axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 7))
    left_ax, right_ax = hist_axes

    df_train['Vertical_Distance_To_Hydrology'].plot.hist(
        ax=left_ax, bins=30, edgecolor='black', color='crimson',
    )
    left_ax.set_title('Vertical Distance To Hydrology')
    # Ticks every 50 units from -150 up to 300.
    x1 = list(range(-150, 350, 50))
    left_ax.set_xticks(x1)

    df_train['Horizontal_Distance_To_Hydrology'].plot.hist(
        ax=right_ax, bins=30, edgecolor='black', color='darkmagenta',
    )
    right_ax.set_title('Horizontal Distance To Hydrology')
    # Ticks every 100 units from 0 up to 900.
    x2 = list(range(0, 1000, 100))
    right_ax.set_xticks(x2)

    plt.show()
    
    

    bar plot

    # Column-wise totals over the columns from position 14 up to (excluding)
    # the last one — presumably the one-hot soil-type indicators, so each
    # total is the number of rows flagged 1; verify against the CSV schema.
    soil_columns = df_train.iloc[:, 14:-1]
    soil_types = soil_columns.sum(axis=0)

    plt.figure(figsize=(18, 9))
    sns.barplot(x=soil_types.index, y=soil_types.values, palette="rocket")

    plt.title('Count of Soil Types With Value 1', color='darkred', fontsize=12)
    plt.ylabel('Total')
    # Tilt the tick labels so the column names do not overlap.
    plt.xticks(rotation=75)

    plt.show()
    

    4 Pandas Profiling

    # Build a pandas-profiling report for the training frame and save it
    # to "report.html".
    profile = pp.ProfileReport(df_train)
    profile.to_file("report.html")

    # Bare trailing expression: in a notebook cell this is what gets
    # displayed as the cell output.
    profile
    

    相关文章

      网友评论

          本文标题:Notebook - Quick Visualization and EDA for Beginners

          本文链接:https://www.haomeiwen.com/subject/qoavvctx.html