https://www.kaggle.com/fatihbilgin/quick-visualization-and-eda-for-beginners
1 Importing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as pp
import plotly.graph_objs as go
from plotly.offline import iplot
import plotly.express as px
# Load the train and test splits; the 'Id' column becomes the row index
# of each frame.
train_path = '../input/learn-together/train.csv'
test_path = '../input/learn-together/test.csv'
df_train = pd.read_csv(train_path, index_col='Id')
df_test = pd.read_csv(test_path, index_col='Id')
2 EDA
# First look at the training frame. Apart from info(), which prints its own
# summary, these are bare expressions whose results are not stored.
df_train.head()        # first rows
df_train.info()        # per-column dtype and non-null summary
df_train.describe().T  # summary statistics, transposed: one row per column
# Labels of the columns from position 10 up to (excluding) the last one.
# Fixed: the attribute is `columns`; the original `colums` typo raised
# AttributeError.
df_train.iloc[:, 10:-1].columns
# Change columns to categorical ones.
# Train: columns from position 10 up to (excluding) the last column.
# Test: columns from position 10 through the end.
# NOTE(review): the converted values are written back through .iloc; whether
# the stored column dtypes actually become "category" may depend on the
# pandas version -- confirm with df_train.dtypes. Later cells call .corr()
# and .sum() on these columns, so check them if the dtypes do change.
df_train.iloc[:,10:-1] = df_train.iloc[:,10:-1].astype("category")
df_test.iloc[:,10:] = df_test.iloc[:,10:].astype("category")
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.astype.html
heatmap
# Annotated heatmap of the pairwise correlations between the training
# columns, with each cell labelled to one decimal place.
f, ax = plt.subplots(figsize=(8, 6))
corr_matrix = df_train.corr()
sns.heatmap(
    corr_matrix,
    annot=True,
    linewidths=.5,
    fmt='.1f',
    ax=ax,
)
plt.show()
3 Data visualization
scatter plot
# Scatter plot of vertical (x axis) against horizontal (y axis) distance
# to hydrology.
scatter_options = {
    'kind': 'scatter',
    'x': 'Vertical_Distance_To_Hydrology',
    'y': 'Horizontal_Distance_To_Hydrology',
    'alpha': 0.5,
    'color': 'darkblue',
    'figsize': (12, 9),
}
scatter_ax = df_train.plot(**scatter_options)
# Label the axes object returned by the plot call.
scatter_ax.set_title('Vertical And Horizontal Distance To Hydrology')
scatter_ax.set_xlabel("Vertical Distance")
scatter_ax.set_ylabel("Horizontal Distance")
plt.show()
box plot
# Side-by-side plotly box plots of the two hydrology distance columns.
trace1 = go.Box(
    y=df_train["Vertical_Distance_To_Hydrology"],
    name='Vertical Distance',
    marker={'color': 'rgb(0,145,119)'},
)
trace2 = go.Box(
    y=df_train["Horizontal_Distance_To_Hydrology"],
    name='Horizontal Distance',
    marker={'color': 'rgb(5, 79, 174)'},
)
data = [trace1, trace2]

# Fixed-size figure (autosize off) with matching paper and plot backgrounds.
layout = {
    'autosize': False,
    'width': 700,
    'height': 500,
    'title': 'Distance To Hydrology',
    'paper_bgcolor': 'rgb(243, 243, 243)',
    'plot_bgcolor': 'rgb(243, 243, 243)',
    'margin': {'l': 40, 'r': 30, 'b': 80, 't': 100},
}

fig = {'data': data, 'layout': layout}
iplot(fig)
histogram
# Histograms of the two hydrology distance columns, drawn side by side with
# 30 bins each and explicit x tick positions.
x1 = list(range(-150, 350, 50))
x2 = list(range(0, 1000, 100))

# One entry per panel: (column, panel title, bar colour, x tick positions).
hist_specs = [
    ('Vertical_Distance_To_Hydrology', 'Vertical Distance To Hydrology',
     'crimson', x1),
    ('Horizontal_Distance_To_Hydrology', 'Horizontal Distance To Hydrology',
     'darkmagenta', x2),
]

f, ax = plt.subplots(1, 2, figsize=(15, 7))
for panel, (column, panel_title, bar_colour, tick_positions) in zip(ax, hist_specs):
    df_train[column].plot.hist(
        ax=panel,
        bins=30,
        edgecolor='black',
        color=bar_colour,
    )
    panel.set_title(panel_title)
    panel.set_xticks(tick_positions)
plt.show()
bar plot
# Bar chart of per-column totals for the columns from position 14 up to
# (excluding) the last one; the chart title labels these as soil types.
soil_columns = df_train.iloc[:, 14:-1]
soil_types = soil_columns.sum(axis=0)

plt.figure(figsize=(18, 9))
sns.barplot(
    x=soil_types.index,
    y=soil_types.values,
    palette="rocket",
)
# Rotate the tick labels.
plt.xticks(rotation=75)
plt.ylabel('Total')
title_style = {'color': 'darkred', 'fontsize': 12}
plt.title('Count of Soil Types With Value 1', **title_style)
plt.show()
4 Pandas Profiling
# Build a pandas-profiling report for the training frame and write it out
# as HTML.
report_path = "report.html"
report = pp.ProfileReport(df_train)
report.to_file(report_path)
# Bare expression: the report object is evaluated as the final statement.
report
网友评论