数据源和格式
pandas 数据源下载地址:https://video.mugglecode.com/data_pd
本次分析只关心A列(Beverage_category,饮品类别)和D列(Calories,热量)。
代码
import matplotlib.pyplot as plt
import pandas as pd
import os
# Compare the calories of the beverage categories on a coffee-shop menu.
file_path = '/Users/miraco/PycharmProjects/DataMining/data_pd/coffee_menu.csv'
outpath = './coffee_stat/ouptput'
# os.makedirs (unlike os.mkdir) creates missing intermediate directories too.
# exist_ok=True makes the call a no-op when the directory already exists, so
# no separate os.path.exists() check (which could race with another process)
# is needed.
os.makedirs(outpath, exist_ok=True)
def collect_data(path=None):
    """Load the menu CSV into a DataFrame.

    Args:
        path: Optional location of the CSV file. When omitted, the
            module-level ``file_path`` is read, which is what this function
            always did before the parameter existed.

    Returns:
        The two-dimensional ``pandas.DataFrame`` produced by ``pd.read_csv``.
    """
    source = file_path if path is None else path
    return pd.read_csv(source)
def inspect_data(data_df):
    """Print an overview of ``data_df`` to stdout.

    The report contains, in order: the row/column counts, the first rows,
    the per-column dtype summary and the descriptive statistics, each
    followed by a divider line.

    Args:
        data_df: The DataFrame to describe.

    Returns:
        None. The function only prints.
    """
    divider = '-----------------------------------------------------'
    n_rows, n_cols = data_df.shape

    print(f'数据一共有{n_rows}行, {n_cols}列')
    print(divider)

    print('数据预览:')
    # head() shows only the first few rows, which keeps the preview short.
    print(data_df.head())
    print(divider)

    print('数据的基本信息:')
    # info() lists each column's dtype: text columns (and numeric columns
    # that contain noisy values) are read as object, clean numbers as
    # int/float. It prints by itself and returns None, so it must not be
    # wrapped in print() or a stray "None" line is emitted.
    data_df.info()
    print(divider)

    print('数据统计信息')
    # describe() reports count, mean, std, min, quartiles and max.
    print(data_df.describe())
    print(divider)
def analyze_data(data_df):
    """Summarise calories per beverage category.

    Prints the distinct values of the ``Beverage_category`` column, then
    groups the rows by that column.

    Args:
        data_df: DataFrame with ``Beverage_category`` and ``Calories`` columns.

    Returns:
        A ``(count, mean)`` tuple of Series indexed by category: the number
        of ``Calories`` entries and their average for each category.
    """
    category_key = 'Beverage_category'

    # Selecting one column by name yields a Series; unique() removes the
    # duplicate values from it.
    distinct_categories = data_df[category_key].unique()
    print('饮品类别')
    print(distinct_categories)
    print('-----------------------------------------------------')

    # groupby() returns a GroupBy object; picking 'Calories' from it limits
    # the aggregations below to that column.
    calories_by_category = data_df.groupby(category_key)['Calories']
    return calories_by_category.count(), calories_by_category.mean()
def _save_bar_chart(series, title, filename):
    """Draw ``series`` as a bar chart on its own figure, save it and show it."""
    # An explicit figure/axes pair keeps this chart independent of whatever
    # pyplot currently regards as the "current" axes.
    fig, ax = plt.subplots()
    series.plot(kind='bar', ax=ax)
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(os.path.join(outpath, filename))
    plt.show()
    # Release the figure so later charts never draw on top of this one.
    plt.close(fig)


def save_and_show_results(cate_count, cate_mean_cal):
    """Write both per-category results to CSV and plot each as a bar chart.

    Files are written into the module-level ``outpath`` directory:
    ``cate_count.csv``, ``cate_mean_cal.csv``, ``category_count.png`` and
    ``category_ave_cals.png``.

    Args:
        cate_count: Series of entry counts per category.
        cate_mean_cal: Series of average calories per category.

    Returns:
        None.
    """
    cate_count.to_csv(os.path.join(outpath, 'cate_count.csv'))
    cate_mean_cal.to_csv(os.path.join(outpath, 'cate_mean_cal.csv'))

    # Each chart gets a fresh figure. Plotting through implicit pyplot state
    # can draw the second chart over the first on backends where plt.show()
    # leaves the figure open.
    _save_bar_chart(cate_count, 'Category Count', 'category_count.png')
    _save_bar_chart(cate_mean_cal, 'Category Average Cals', 'category_ave_cals.png')
def main():
    """Run the pipeline: load the data, inspect it, analyse it, report it."""
    # Load the menu data.
    menu_df = collect_data()

    # Print an overview of what was loaded.
    inspect_data(menu_df)

    # Aggregate per category, then save and display the two results.
    per_category_count, per_category_mean = analyze_data(menu_df)
    save_and_show_results(per_category_count, per_category_mean)


if __name__ == '__main__':
    main()
运行结果
数据一共有242行, 18列
-----------------------------------------------------
数据预览:
Beverage_category ... Caffeine (mg)
0 Coffee ... 175
1 Coffee ... 260
2 Coffee ... 330
3 Coffee ... 410
4 Classic Espresso Drinks ... 75
[5 rows x 18 columns]
-----------------------------------------------------
数据的基本信息:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242 entries, 0 to 241
Data columns (total 18 columns):
Beverage_category 242 non-null object
Beverage 242 non-null object
Beverage_prep 242 non-null object
Calories 242 non-null int64
Total Fat (g) 242 non-null object
Trans Fat (g) 242 non-null float64
Saturated Fat (g) 242 non-null float64
Sodium (mg) 242 non-null int64
Total Carbohydrates (g) 242 non-null int64
Cholesterol (mg) 242 non-null int64
Dietary Fibre (g) 242 non-null int64
Sugars (g) 242 non-null int64
Protein (g) 242 non-null float64
Vitamin A (% DV) 242 non-null object
Vitamin C (% DV) 242 non-null object
Calcium (% DV) 242 non-null object
Iron (% DV) 242 non-null object
Caffeine (mg) 241 non-null object
dtypes: float64(3), int64(6), object(9)
memory usage: 34.1+ KB
None
-----------------------------------------------------
数据统计信息
Calories Trans Fat (g) ... Sugars (g) Protein (g)
count 242.000000 242.000000 ... 242.000000 242.000000
mean 193.871901 1.307025 ... 32.962810 6.978512
std 102.863303 1.640259 ... 19.730199 4.871659
min 0.000000 0.000000 ... 0.000000 0.000000
25% 120.000000 0.100000 ... 18.000000 3.000000
50% 185.000000 0.500000 ... 32.000000 6.000000
75% 260.000000 2.000000 ... 43.750000 10.000000
max 510.000000 9.000000 ... 84.000000 20.000000
[8 rows x 9 columns]
-----------------------------------------------------
饮品类别
['Coffee' 'Classic Espresso Drinks' 'Signature Espresso Drinks'
'Tazo® Tea Drinks' 'Shaken Iced Beverages' 'Smoothies'
'Frappuccino® Blended Coffee' 'Frappuccino® Light Blended Coffee'
'Frappuccino® Blended Crème']
-----------------------------------------------------
这个竟然会自动配色
练习
使用柱状图可视化 PM2.5数值
- 题目要求:
- 使用Pandas查看数据文件的基本信息
- 使用Pandas进行数据分析及可视化
- Beijing_PM.csv,包含了2013-2015年北京每小时的PM2.5值。每行记录为1小时的数据。
- 共7列数据,分别表示:
- year: 年,2013-2015
- month: 月,1-12
- day: 日,1-31
- hour: 小时,0-23
- season: 季节,1-4(注意不是自然季度:数据中1月对应的 season 为4)
- PM_China: 中国环保部检测的PM2.5值
- PM_US: 美国使馆检测的PM2.5值
# -*- coding: utf-8 -*-
import pandas as pd
import matplotlib.pyplot as plt
import os
# Input file for this exercise: the hourly Beijing PM2.5 readings.
file_path = '/Users/miraco/PycharmProjects/DataMining/data_pd/Beijing_PM.csv'
# NOTE(review): this is the same output directory the coffee-menu example
# uses; it is left unchanged so results are written where they were before.
outpath = './coffee_stat/ouptput'
# os.makedirs (unlike os.mkdir) also creates missing intermediate directories;
# exist_ok=True makes it a no-op when the directory already exists.
os.makedirs(outpath, exist_ok=True)

# 1. Read the CSV data file.
data_df = pd.read_csv(file_path)

# 2. Look at the basic information of the data.
print('数据预览:')
print(data_df.head())
print('数据文件的基本信息:')
# info() prints its report itself and returns None, so it is not wrapped in
# print() (that would emit a stray "None" line).
data_df.info()
print('数据内容的统计信息:')
print(data_df.describe())

# 3. Group by the 'year' column and average the PM_China readings.
year_average_pm = data_df.groupby('year')['PM_China'].mean()

# 4. Save the result.
year_average_pm.to_csv(os.path.join(outpath, 'year_average_pm.csv'))

# 5. Draw the bar chart.
year_average_pm.plot(kind='bar')
plt.tight_layout()
plt.show()
需要注意的是,数据中有些列含有缺失值(NA/NaN),pandas 在计算均值等统计量时会自动跳过这些缺失值。
运行结果
数据预览:
year month day hour season PM_China PM_US
0 2013 1 1 0 4 NaN 31.0
1 2013 1 1 1 4 NaN 32.0
2 2013 1 1 2 4 NaN 21.0
3 2013 1 1 3 4 NaN 16.0
4 2013 1 1 4 4 NaN 15.0
数据文件的基本信息:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26280 entries, 0 to 26279
Data columns (total 7 columns):
year 26280 non-null int64
month 26280 non-null int64
day 26280 non-null int64
hour 26280 non-null int64
season 26280 non-null int64
PM_China 20508 non-null float64
PM_US 25970 non-null float64
dtypes: float64(2), int64(5)
memory usage: 1.4 MB
None
数据内容的统计信息:
year month ... PM_China PM_US
count 26280.000000 26280.000000 ... 20508.000000 25970.000000
mean 2014.000000 6.526027 ... 92.560806 94.094686
std 0.816512 3.447917 ... 88.027434 93.806554
min 2013.000000 1.000000 ... 3.000000 1.000000
25% 2013.000000 4.000000 ... 28.000000 27.000000
50% 2014.000000 7.000000 ... 68.000000 66.000000
75% 2015.000000 10.000000 ... 127.000000 126.000000
max 2015.000000 12.000000 ... 672.000000 886.000000
[8 rows x 7 columns]
这个图也是自动赋色的
关键语句
# Read the CSV data file into a DataFrame.
data_df = pd.read_csv('/Users/miraco/PycharmProjects/DataMining/data_pd/Beijing_PM.csv')
# Group the rows by the 'year' column and average PM_China within each year.
year_average_pm = data_df.groupby('year')['PM_China'].mean()
# Save the per-year averages as a CSV file inside the output directory.
year_average_pm.to_csv(os.path.join(outpath,'year_average_pm.csv'))
# Draw the per-year averages as a bar chart.
year_average_pm.plot(kind='bar')
网友评论