# pandas
import pandas as pd
# Read the CSV at `path` with pandas' 'python' parser engine.
data = pd.read_csv(path, engine='python')

# Preview the data. These are bare expressions: they only show output in an
# interactive session (REPL / notebook); in a script the results are discarded.
data.head()
data.shape
data.columns.values

# Save as tab-separated UTF-8 text without the index column.
# NOTE: this writes to the same `path` that was read above.
data.to_csv(
    path,
    sep='\t',
    encoding='utf-8',
    index=False,
)
# Operations
# Filtering
year = data['year']
# Cap geo_count at 90 with a single .loc assignment on `data` itself.
# Assigning through a Series pulled out of the frame
# (geo_count[geo_count > 90] = 90) is chained assignment: whether `data` is
# updated depends on the pandas version / Copy-on-Write mode.
data.loc[data['geo_count'] > 90, 'geo_count'] = 90
# Take the column after capping so `geo_count` always reflects the capped values.
geo_count = data['geo_count']
def func(x):
    """Return ``x`` capped at an upper bound of 90.

    Values greater than 90 become 90; every other value is returned
    unchanged. Intended for element-wise use, e.g. ``Series.apply(func)``.
    """
    return 90 if x > 90 else x
# Element-wise capping of ec_count, shown two ways: first with the named
# function `func` (cap at 90), then with an inline lambda (cap at 85).
data['ec_count'] = (
    data['ec_count']
    .apply(func)
    .apply(lambda value: 85 if value > 85 else value)
)
# Grouping
# Distinct category codes (only displayed in an interactive session).
data['anzsic06'].unique()

# Mean geo_count per anzsic06 category, as a one-column frame named
# "anzsic06_mean" and indexed by anzsic06.
groupby = (
    data.groupby('anzsic06')['geo_count']
    .mean()
    .to_frame("anzsic06_mean")
)

# Attach the per-category mean to every row of the original frame (left join).
data = data.merge(groupby, how='left', on='anzsic06')
# Combining
# Row-wise: split the frame at row 1000, then stack the two pieces again.
data_part_1 = data.iloc[0:1000]
data_part_2 = data.iloc[1000:]
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent. ignore_index=True renumbers the resulting rows 0..n-1.
data_stacked = pd.concat([data_part_1, data_part_2], ignore_index=True)

# Column-wise: append a Series as an extra column (aligned on the index).
# The result has two columns labelled 'ec_count', since `data` already
# contains that column.
data_new_col = data['ec_count']
data_concat = pd.concat([data, data_new_col], axis=1)
# matplotlib
# 数据同上
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
# Histogram of geo_count: 20 bins, normalised to a density.
num_bins = 20
n, bins, patches = plt.hist(data['geo_count'], bins=num_bins, density=True)
plt.show()

# Histogram of a filtered subset: only geo_count values below 100,
# with the same bin count.
geo_count = data['geo_count']
smaller_geo_count = geo_count.loc[geo_count < 100]
n, bins, patches = plt.hist(smaller_geo_count, bins=num_bins, density=True)
plt.show()
# Scatter plot of ec_count against geo_count.
# plt.plot's default format joins consecutive points with a line, which is
# not a scatter plot; the 'o' format string draws markers only.
_ = plt.plot(data['geo_count'], data['ec_count'], 'o')
plt.show()
# Box plot: ec_count for year 2000 vs. year 2001, values below 100 only.
ec_2000 = data.loc[data['year'] == 2000, 'ec_count']
ec_2001 = data.loc[data['year'] == 2001, 'ec_count']
_ = plt.boxplot([
    ec_2000[ec_2000 < 100],
    ec_2001[ec_2001 < 100],
])
# Reading the plot: the line inside each box is the median, the box spans
# the 25th-75th percentiles, and points beyond the whiskers are outliers.
plt.show()
# 网友评论