pandas库安装:
pip3 install pandas
Collecting pandas
Downloading https://files.pythonhosted.org/packages/78/78/50ef81a903eccc4e90e278a143c9a0530f05199f6221d2e1b21025852982/pandas-0.23.4-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (14.6MB)
100% |████████████████████████████████| 14.7MB 56kB/s
Requirement already satisfied: numpy>=1.9.0 in /Users/.virtualenvs/py3env/lib/python3.6/site-packages (from pandas) (1.15.4)
Collecting pytz>=2011k (from pandas)
Retrying (Retry(total=4, connect=None, read=None, redirect=None, status=None)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='pypi.org', port=443): Read timed out. (read timeout=15)",)': /simple/pytz/
Downloading https://files.pythonhosted.org/packages/61/28/1d3920e4d1d50b19bc5d24398a7cd85cc7b9a75a490570d5a30c57622d34/pytz-2018.9-py2.py3-none-any.whl (510kB)
100% |████████████████████████████████| 512kB 43kB/s
Collecting python-dateutil>=2.5.0 (from pandas)
Downloading https://files.pythonhosted.org/packages/74/68/d87d9b36af36f44254a8d512cbfc48369103a3b9e474be9bdfe536abfc45/python_dateutil-2.7.5-py2.py3-none-any.whl (225kB)
100% |████████████████████████████████| 235kB 26kB/s
Requirement already satisfied: six>=1.5 in /Users/.virtualenvs/py3env/lib/python3.6/site-packages (from python-dateutil>=2.5.0->pandas) (1.11.0)
Installing collected packages: pytz, python-dateutil, pandas
Successfully installed pandas-0.23.4 python-dateutil-2.7.5 pytz-2018.9
pandas的Series一维数组应用方法
from pandas import Series, DataFrame
import pandas as pd
obj = Series([4, 5, 6, -7])#pandas一维数组定义
print(obj)
#输出结果如下,是一组带索引的数据(左列为索引,右列为值)
0 4
1 5
2 6
3 -7
dtype: int64
print( obj.index)
#输出结果RangeIndex(start=0, stop=4, step=1)
print ( obj.values)
#输出结果[ 4 5 6 -7]
字典的key必须是可哈希(不可变)的对象,并且在字典中唯一;如果对相同的key重复赋值,后面的value会覆盖前面的value;列表(['a'])和集合({'b'})是可变对象,内容可能变化,无法哈希,因此不能作为字典的key。
obj2 = Series([4, 7, -5, 3], index=['d', 'b', 'c', 'a'])# 定义带自定义索引的Series(用法类似字典)
print(obj2)
#输出结果如下
d 4
b 7
c -5
a 3
dtype: int64
obj2['c'] = 6# 可以直接给对应索引给值
print(obj2)
# 输出结果如下
d 4
b 7
c 6
a 3
dtype: int64
print ('f' in obj2)#可查找是否存在此索引
#输出结果False
sdata = {
'beijing': 35000,
'shanghai': 71000,
'guangzhou': 16000,
'shenzhen': 5000}
obj3 = Series(sdata)#把字典转换为一维数组
print( obj3)
#输出结果如下
beijing 35000
shanghai 71000
guangzhou 16000
shenzhen 5000
dtype: int64
obj3.index = ['bj', 'sh', 'gz', 'sz']# 修改索引(按位置依次替换,新标签顺序需与原数据顺序一致)
print( obj3)
# 输出结果如下
bj 35000
sh 71000
gz 16000
sz 5000
dtype: int64
pandas的DataFrame二维表格数据应用方法
from pandas import Series, DataFrame
#字典中添加列表方式定义多维数据表格
data = {'city': ['shanghai', 'shanghai', 'shanghai', 'beijing', 'beijing'],
'year': [2016, 2017, 2018, 2017, 2018],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data)
print(frame)
#输出结果如下
city year pop
0 shanghai 2016 1.5
1 shanghai 2017 1.7
2 shanghai 2018 3.6
3 beijing 2017 2.4
4 beijing 2018 2.9
frame2 = DataFrame(data, columns=['year', 'city', 'pop'])#通过columns参数自定义列的排列顺序
print(frame2)
#输出结果如下
year city pop
0 2016 shanghai 1.5
1 2017 shanghai 1.7
2 2018 shanghai 3.6
3 2017 beijing 2.4
4 2018 beijing 2.9
print(frame2['city'])#提取列值
#输出结果如下
0 shanghai
1 shanghai
2 shanghai
3 beijing
4 beijing
Name: city, dtype: object
print(frame2.year)#提取列值另一种方法
#输出结果如下
0 2016
1 2017
2 2018
3 2017
4 2018
Name: year, dtype: int64
frame2['new'] = 100#新增列
print(frame2)
#输出结果如下
year city pop new
0 2016 shanghai 1.5 100
1 2017 shanghai 1.7 100
2 2018 shanghai 3.6 100
3 2017 beijing 2.4 100
4 2018 beijing 2.9 100
frame2['cap'] = frame2.city == 'beijing'#带判断新增列
print( frame2)
#输出结果如下
year city pop new cap
0 2016 shanghai 1.5 100 False
1 2017 shanghai 1.7 100 False
2 2018 shanghai 3.6 100 False
3 2017 beijing 2.4 100 True
4 2018 beijing 2.9 100 True
#另一种字典中嵌套方式定义多维数据表格
pop = {'beijing': {2008: 1.5, 2009: 2.0},
'shanghai': {2008: 2.0, 2009: 3.6}
}
frame3 = DataFrame(pop)
print(frame3)
#输出结果如下
beijing shanghai
2008 1.5 2.0
2009 2.0 3.6
print(frame3.T)#列行互换
#输出结果如下
2008 2009
beijing 1.5 2.0
shanghai 2.0 3.6
obj4 = Series([4.5, 7.2, -5.3, 3.6], index=['b', 'd', 'c', 'a'])
obj5 = obj4.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)#调整索引顺序,并给新增的索引'e'填充默认值0
print(obj5)
#输出结果如下
a 3.6
b 4.5
c -5.3
d 7.2
e 0.0
dtype: float64
obj6 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
print( obj6.reindex(range(6),method='bfill'))#重建索引并填充空值:‘ffill’用前一个有效值向后填充,‘bfill’用后一个有效值向前填充(后面没有有效值时仍为NaN)
#输出结果如下
0 blue
1 purple
2 purple
3 yellow
4 yellow
5 NaN
dtype: object
from numpy import nan as NA #导入空值NaN,并命名为NA
data = Series([1, NA, 2])#给空值
print(data.dropna())#删除空值
#输出结果如下
0 1.0
2 2.0
dtype: float64
data2 = DataFrame([[1., 6.5, 3], [1., NA, NA], [NA, NA, NA]
])
data2[4] = NA#新增列名为4的一列,并全部赋空值
print(data2)
#输出结果如下
0 1 2 4
0 1.0 6.5 3.0 NaN
1 1.0 NaN NaN NaN
2 NaN NaN NaN NaN
print(data2.dropna(how='all'))#删除整行为空的行
#输出结果如下
0 1 2 4
0 1.0 6.5 3.0 NaN
1 1.0 NaN NaN NaN
print(data2.dropna(axis=1, how='all'))#删除整列为空的列
#输出结果如下
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
data2.fillna(0)
print(data2.fillna(0, inplace=True))#填充缺失值为0;inplace=True表示直接修改data2本身,此时返回值为None
#输出结果None
print(data2)#更新结果后输出被修改
#输出结果如下
0 1 2 4
0 1.0 6.5 3.0 0.0
1 1.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
层次化索引
import numpy as np
#建立两层索引
data3 = Series(np.random.randn(10),
index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
[1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
print (data3)
#输出结果如下
a 1 -0.606962
2 -0.793390
3 0.515835
b 1 -0.269941
2 -0.613685
3 -0.078791
c 1 1.622026
2 -0.342152
d 2 -0.331359
3 0.719142
dtype: float64
print ( data3['b':'c'])#取索引对应值
#输出结果如下
b 1 -0.269941
2 -0.613685
3 -0.078791
c 1 1.622026
2 -0.342152
dtype: float64
print(data3.unstack())#一维层次化索引转换为二维dataframe数组
#输出结果如下
1 2 3
a -0.606962 -0.793390 0.515835
b -0.269941 -0.613685 -0.078791
c 1.622026 -0.342152 NaN
d NaN -0.331359 0.719142
print(data3.unstack().stack())#还原一维层次化索引
#输出结果如下
a 1 -0.606962
2 -0.793390
3 0.515835
b 1 -0.269941
2 -0.613685
3 -0.078791
c 1 1.622026
2 -0.342152
d 2 -0.331359
3 0.719142
dtype: float64
网友评论