美文网首页
Notebook - Quick Visualization and EDA for Beginners

Notebook - Quick Visualization and EDA for Beginners

作者: 左心Chris | 来源:发表于2019-10-28 14:54 被阅读0次

https://www.kaggle.com/fatihbilgin/quick-visualization-and-eda-for-beginners

1 Importing

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as pp
import plotly.graph_objs as go
from plotly.offline import iplot
import plotly.express as px

# Read the train and test CSV files with identical options; the 'Id' column
# becomes the row index of each frame.
df_train, df_test = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in (
        '../input/learn-together/train.csv',
        '../input/learn-together/test.csv',
    )
)

2 EDA

# Quick structural overview of the training data: first rows, column
# dtypes / non-null counts, and transposed summary statistics.
df_train.head()
df_train.info()
df_train.describe().T

# Names of the columns from position 10 up to (but excluding) the last one.
# The attribute is `.columns`; the misspelled `.colums` raises AttributeError.
df_train.iloc[:, 10:-1].columns

# Change those columns to categorical ones.  The train slice stops before the
# last column while the test slice runs to the end -- presumably the last
# train column is the target, which the test file lacks (verify).
# NOTE(review): depending on the pandas version, assigning through `.iloc`
# may write the values into the existing columns without changing their
# dtype -- check `df_train.dtypes` to confirm the cast took effect.
df_train.iloc[:, 10:-1] = df_train.iloc[:, 10:-1].astype("category")
df_test.iloc[:, 10:] = df_test.iloc[:, 10:].astype("category")

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.astype.html

热力图

# Annotated heatmap of the pairwise column correlations of the training set.
f, ax = plt.subplots(figsize=(8, 6))
correlations = df_train.corr()
sns.heatmap(
    correlations,
    ax=ax,
    annot=True,      # print the coefficient inside every cell
    fmt='.1f',       # one decimal place per annotation
    linewidths=.5,
)

plt.show()

3 Data visualization

scatter 图

# Scatter plot of vertical (x) against horizontal (y) distance to hydrology.
scatter_ax = df_train.plot.scatter(
    x='Vertical_Distance_To_Hydrology',
    y='Horizontal_Distance_To_Hydrology',
    alpha=0.5,
    color='darkblue',
    figsize=(12, 9),
)

# Label the axes object returned by the plot call.
scatter_ax.set_title('Vertical And Horizontal Distance To Hydrology')
scatter_ax.set_xlabel("Vertical Distance")
scatter_ax.set_ylabel("Horizontal Distance")

plt.show()

box plot

# One box trace per hydrology-distance column: (column, legend name, colour).
box_specs = (
    ("Vertical_Distance_To_Hydrology", 'Vertical Distance', 'rgb(0,145,119)'),
    ("Horizontal_Distance_To_Hydrology", 'Horizontal Distance', 'rgb(5, 79, 174)'),
)
trace1, trace2 = (
    go.Box(y=df_train[column], name=label, marker={"color": colour})
    for column, label, colour in box_specs
)

# Fixed-size figure; paper and plot area share the same background colour.
background = 'rgb(243, 243, 243)'
layout = {
    "autosize": False,
    "width": 700,
    "height": 500,
    "title": 'Distance To Hydrology',
    "paper_bgcolor": background,
    "plot_bgcolor": background,
    "margin": {"l": 40, "r": 30, "b": 80, "t": 100},
}

data = [trace1, trace2]
fig = {"data": data, "layout": layout}

iplot(fig)

histogram

# Two side-by-side 30-bin histograms: vertical distance on the left axes,
# horizontal distance on the right axes.
f, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 7))
left_ax, right_ax = ax

df_train["Vertical_Distance_To_Hydrology"].plot.hist(
    ax=left_ax, bins=30, edgecolor='black', color='crimson'
)
left_ax.set_title('Vertical Distance To Hydrology')
x1 = list(range(-150, 350, 50))   # ticks every 50 units from -150 to 300
left_ax.set_xticks(x1)

df_train["Horizontal_Distance_To_Hydrology"].plot.hist(
    ax=right_ax, bins=30, edgecolor='black', color='darkmagenta'
)
right_ax.set_title('Horizontal Distance To Hydrology')
x2 = list(range(0, 1000, 100))    # ticks every 100 units from 0 to 900
right_ax.set_xticks(x2)

plt.show()

bar plot

# Column-wise totals for the columns from position 14 up to (but excluding)
# the last one -- the columns the plot title refers to as soil types.
soil_columns = df_train.iloc[:, 14:-1]
soil_types = soil_columns.sum(axis=0)

plt.figure(figsize=(18, 9))
bar_ax = sns.barplot(
    x=soil_types.index,
    y=soil_types.values,
    palette="rocket",
)

# Rotate the column names so the x tick labels stay legible.
plt.xticks(rotation=75)
bar_ax.set_ylabel('Total')
bar_ax.set_title('Count of Soil Types With Value 1', color='darkred', fontsize=12)

plt.show()

4 Pandas Profiling

# Build a pandas-profiling report for the training frame and write it to
# "report.html".
profile = pp.ProfileReport(df_train)
profile.to_file("report.html")

# Bare expression: a notebook renders the report as the cell output.
profile

相关文章

网友评论

      本文标题:Notebook - Quick Visualization and EDA for Beginners

      本文链接:https://www.haomeiwen.com/subject/qoavvctx.html