import pandas as pd
pew = pd.read_csv('data/pew.csv')
print(pew.iloc[:5,:7])
religion <$10k $10-20k $20-30k $30-40k $40-50k $50-75k
0 Agnostic 27 34 60 81 76 137
1 Atheist 12 27 37 52 35 70
2 Buddhist 27 21 30 34 33 58
3 Catholic 418 617 732 670 638 1116
4 Don’t know/refused 15 14 15 11 10 35
上图就是宽数据,方便观察与输入。表中宗教,收入,计数三种数据,但是不同收入等级却做成列名,不方便数据分析,需要把收入也做成一列,所以要把宽数据整理成长数据。
melt:
- id_vars:一个容器(列表,元素或ndarray),指定保持原样的变量
- value_vars:指定需要转化的列,默认为未被id_vars指定的列
- var_name:指定被融合的列(变量)的新列名
- value_name:指定被融合的列的值的列名
需要固定单列
pew_long = pd.melt(pew,
id_vars='religion',
var_name='income',
value_name='count'
)
print(pew_long.head())
religion income count
0 Agnostic <$10k 27
1 Atheist <$10k 12
2 Buddhist <$10k 27
3 Catholic <$10k 418
4 Don’t know/refused <$10k 15
需要固定多列
billboard = pd.read_csv('data/billboard.csv')
print(billboard.iloc[0:5, 0:16])
year artist track time date.entered wk1 wk2 \
0 2000 2 Pac Baby Don't Cry (Keep... 4:22 2000-02-26 87 82.0
1 2000 2Ge+her The Hardest Part Of ... 3:15 2000-09-02 91 87.0
2 2000 3 Doors Down Kryptonite 3:53 2000-04-08 81 70.0
3 2000 3 Doors Down Loser 4:24 2000-10-21 76 76.0
4 2000 504 Boyz Wobble Wobble 3:35 2000-04-15 57 34.0
wk3 wk4 wk5 wk6 wk7 wk8 wk9 wk10 wk11
0 72.0 77.0 87.0 94.0 99.0 NaN NaN NaN NaN
1 92.0 NaN NaN NaN NaN NaN NaN NaN NaN
2 68.0 67.0 66.0 57.0 54.0 53.0 51.0 51.0 51.0
3 72.0 69.0 67.0 65.0 55.0 59.0 62.0 61.0 61.0
4 25.0 17.0 17.0 31.0 36.0 49.0 53.0 57.0 64.0
billboard_long = pd.melt(billboard,
id_vars=['year', 'artist', 'track', 'time', 'date.entered'],
var_name='week',
value_name='rating')
print(billboard_long.head())
year artist track time date.entered week rating
0 2000 2 Pac Baby Don't Cry (Keep... 4:22 2000-02-26 wk1 87.0
1 2000 2Ge+her The Hardest Part Of ... 3:15 2000-09-02 wk1 91.0
2 2000 3 Doors Down Kryptonite 3:53 2000-04-08 wk1 81.0
3 2000 3 Doors Down Loser 4:24 2000-10-21 wk1 76.0
4 2000 504 Boyz Wobble Wobble 3:35 2000-04-15 wk1 57.0
列名中包含多个变量(包含多个变量的列)
ebola = pd.read_csv('data/country_timeseries.csv')
print(ebola.iloc[:5, [0,1,2,3,10,11]])
Date Day Cases_Guinea Cases_Liberia Deaths_Guinea Deaths_Liberia
0 1/5/2015 289 2776.0 NaN 1786.0 NaN
1 1/4/2015 288 2775.0 NaN 1781.0 NaN
2 1/3/2015 287 2769.0 8166.0 1767.0 3496.0
3 1/2/2015 286 NaN 8157.0 NaN 3496.0
4 12/31/2014 284 2730.0 8115.0 1739.0 3471.0
比如ebola数据中的Cases_Guinea,Cases_Liberia,Deaths_Guinea和Deaths_Liberia,Cases和Deaths表示病例数和死亡数,Guinea和Liberia是国家名。
ebola_long = pd.melt(ebola,id_vars=['Date', 'Day'])
print(ebola_long.head())
Date Day variable value
0 1/5/2015 289 Cases_Guinea 2776.0
1 1/4/2015 288 Cases_Guinea 2775.0
2 1/3/2015 287 Cases_Guinea 2769.0
3 1/2/2015 286 Cases_Guinea NaN
4 12/31/2014 284 Cases_Guinea 2730.0
需要将variable列按照‘_’拆分
variable_split = ebola_long.variable.str.split('_') # 访问字符串方法
print(type(variable_split))
print(variable_split[0])
print(type(variable_split[0]))
<class 'pandas.core.series.Series'>
['Cases', 'Guinea']
<class 'list'>
status_values = variable_split.str.get(0) # 访问字符串方法
country_values = variable_split.str.get(1)
print(status_values[:5])
print(country_values[:5])
0 Cases
1 Cases
2 Cases
3 Cases
4 Cases
Name: variable, dtype: object
0 Guinea
1 Guinea
2 Guinea
3 Guinea
4 Guinea
Name: variable, dtype: object
ebola_long['status'] = status_values
ebola_long['country'] = country_values
print(ebola_long.head())
Date Day variable value status country
0 1/5/2015 289 Cases_Guinea 2776.0 Cases Guinea
1 1/4/2015 288 Cases_Guinea 2775.0 Cases Guinea
2 1/3/2015 287 Cases_Guinea 2769.0 Cases Guinea
3 1/2/2015 286 Cases_Guinea NaN Cases Guinea
4 12/31/2014 284 Cases_Guinea 2730.0 Cases Guinea
# 分隔后字符串的顺序与原数据顺序一致,可以直接拼接
variable_split = ebola_long.variable.str.split('_', expand=True)
print(variable_split.head())
variable_split.columns = ['status', 'country']
ebola_parsed = pd.concat([ebola_long, variable_split], axis=1)
print(ebola_parsed.head())
0 1
0 Cases Guinea
1 Cases Guinea
2 Cases Guinea
3 Cases Guinea
4 Cases Guinea
Date Day variable value status country status country
0 1/5/2015 289 Cases_Guinea 2776.0 Cases Guinea Cases Guinea
1 1/4/2015 288 Cases_Guinea 2775.0 Cases Guinea Cases Guinea
2 1/3/2015 287 Cases_Guinea 2769.0 Cases Guinea Cases Guinea
3 1/2/2015 286 Cases_Guinea NaN Cases Guinea Cases Guinea
4 12/31/2014 284 Cases_Guinea 2730.0 Cases Guinea Cases Guinea
weather = pd.read_csv('data/weather.csv')
print(weather.iloc[:5,:11])
id year month element d1 d2 d3 d4 d5 d6 d7
0 MX17004 2010 1 tmax NaN NaN NaN NaN NaN NaN NaN
1 MX17004 2010 1 tmin NaN NaN NaN NaN NaN NaN NaN
2 MX17004 2010 2 tmax NaN 27.3 24.1 NaN NaN NaN NaN
3 MX17004 2010 2 tmin NaN 14.4 14.4 NaN NaN NaN NaN
4 MX17004 2010 3 tmax NaN NaN NaN NaN 32.1 NaN NaN
气象数据记录每周每天的最高气温和最低气温,但是最高和最低都在element列中,我们想得到一行就是一天的所有信息,包含最高和最低。
weather_melt = pd.melt(weather,
id_vars=['id', 'year', 'month', 'element'],
var_name='day',
value_name='temp')
print(weather_melt.head())
id year month element day temp
0 MX17004 2010 1 tmax d1 NaN
1 MX17004 2010 1 tmin d1 NaN
2 MX17004 2010 2 tmax d1 NaN
3 MX17004 2010 2 tmin d1 NaN
4 MX17004 2010 3 tmax d1 NaN
df.pivot_table() 将分层的列平铺
weather_tidy = weather_melt.pivot_table(index=['id', 'year', 'month', 'day'],
columns='element',
values='temp')
print(weather_tidy.head())
element tmax tmin
id year month day
MX17004 2010 1 d1 NaN NaN
d10 NaN NaN
d11 NaN NaN
d12 NaN NaN
d13 NaN NaN
# 补上索引
weather_tidy_flat = weather_tidy.reset_index()
print(weather_tidy_flat.head())
element id year month day tmax tmin
0 MX17004 2010 1 d1 NaN NaN
1 MX17004 2010 1 d10 NaN NaN
2 MX17004 2010 1 d11 NaN NaN
3 MX17004 2010 1 d12 NaN NaN
4 MX17004 2010 1 d13 NaN NaN
# 一步到位
weather_tidy = weather_melt.pivot_table(index=['id', 'year', 'month', 'day'],
columns='element',
values='temp').reset_index()
print(weather_tidy.head())
element id year month day tmax tmin
0 MX17004 2010 1 d1 NaN NaN
1 MX17004 2010 1 d10 NaN NaN
2 MX17004 2010 1 d11 NaN NaN
3 MX17004 2010 1 d12 NaN NaN
4 MX17004 2010 1 d13 NaN NaN
print(billboard_long.head())
year artist track time date.entered week rating
0 2000 2 Pac Baby Don't Cry (Keep... 4:22 2000-02-26 wk1 87.0
1 2000 2Ge+her The Hardest Part Of ... 3:15 2000-09-02 wk1 91.0
2 2000 3 Doors Down Kryptonite 3:53 2000-04-08 wk1 81.0
3 2000 3 Doors Down Loser 4:24 2000-10-21 wk1 76.0
4 2000 504 Boyz Wobble Wobble 3:35 2000-04-15 wk1 57.0
print(billboard_long[billboard_long.track == 'Loser'].head())
year artist track time date.entered week rating
3 2000 3 Doors Down Loser 4:24 2000-10-21 wk1 76.0
320 2000 3 Doors Down Loser 4:24 2000-10-21 wk2 76.0
637 2000 3 Doors Down Loser 4:24 2000-10-21 wk3 72.0
954 2000 3 Doors Down Loser 4:24 2000-10-21 wk4 69.0
1271 2000 3 Doors Down Loser 4:24 2000-10-21 wk5 67.0
表中包含两类信息:歌曲信息和周排行。最好把所有的歌曲信息提取,加上编号,做成一张歌曲信息表。这样可以去掉很多重复信息。
billboard_songs = billboard_long[['year', 'artist', 'track', 'time']]
print(billboard_songs.shape)
billboard_songs = billboard_songs.drop_duplicates() # 去掉重复的行
print(billboard_songs.shape)
(24092, 4)
(317, 4)
# 为每一行分配唯一的值
billboard_songs['id'] = range(len(billboard_songs))
print(billboard_songs.head())
year artist track time id
0 2000 2 Pac Baby Don't Cry (Keep... 4:22 0
1 2000 2Ge+her The Hardest Part Of ... 3:15 1
2 2000 3 Doors Down Kryptonite 3:53 2
3 2000 3 Doors Down Loser 4:24 3
4 2000 504 Boyz Wobble Wobble 3:35 4
# 将歌曲id匹配到排名表中
billboard_ratings = billboard_long.merge(billboard_songs,
on=['year', 'artist', 'track', 'time'])
print(billboard_ratings.head())
year artist track time date.entered week rating id
0 2000 2 Pac Baby Don't Cry (Keep... 4:22 2000-02-26 wk1 87.0 0
1 2000 2 Pac Baby Don't Cry (Keep... 4:22 2000-02-26 wk2 82.0 0
2 2000 2 Pac Baby Don't Cry (Keep... 4:22 2000-02-26 wk3 72.0 0
3 2000 2 Pac Baby Don't Cry (Keep... 4:22 2000-02-26 wk4 77.0 0
4 2000 2 Pac Baby Don't Cry (Keep... 4:22 2000-02-26 wk5 87.0 0
# 取出想要的排行表列子集
billboard_ratings = billboard_ratings[['id', 'date.entered', 'week', 'rating']]
print(billboard_ratings.head())
id date.entered week rating
0 0 2000-02-26 wk1 87.0
1 0 2000-02-26 wk2 82.0
2 0 2000-02-26 wk3 72.0
3 0 2000-02-26 wk4 77.0
4 0 2000-02-26 wk5 87.0
使用循环读取多个子文件并合并成完整的文件
list_df = []
for csv_filename in filename_list:
df = pd.read_csv(csv_filename)
list_df.append(df)
df_concat = pd.concat(ist_df)
网友评论