1.abs(),返回DataFrame每个数值的绝对值,前提是所有元素均为数值型
import pandas as pd
import numpy as np
df=pd.read_excel('南京银行.xlsx',index_col='Date')
df1=df[:5]
df1.iat[0,1]=-df1.iat[0,1]
df1
Open High Low Close Turnover Volume
Date
2017-09-15 8.06 -8.08 8.03 8.04 195.43 24272800
2017-09-18 8.05 8.13 8.03 8.06 200.76 24867600
2017-09-19 8.03 8.06 7.94 8.00 433.76 54253100
2017-09-20 7.97 8.06 7.95 8.03 319.94 39909700
2017-09-21 8.02 8.10 7.99 8.04 241.94 30056600
df1.abs()
Open High Low Close Turnover Volume
Date
2017-09-15 8.06 8.08 8.03 8.04 195.43 24272800.0
2017-09-18 8.05 8.13 8.03 8.06 200.76 24867600.0
2017-09-19 8.03 8.06 7.94 8.00 433.76 54253100.0
2017-09-20 7.97 8.06 7.95 8.03 319.94 39909700.0
2017-09-21 8.02 8.10 7.99 8.04 241.94 30056600.0
2.add(other, axis='columns', level=None, fill_value=None) 将某个序列或表中的元素与本表中的元素相加,默认匹配列元素
ar1=[8.1,8.2,8.0,8.15,200.00,32000000]
cl1=['Open','High','Low','Close','Turnover','Volume']
se1=pd.Series(data=ar1,index=cl1)
se1
Open 8.10
High 8.20
Low 8.00
Close 8.15
Turnover 200.00
Volume 32000000.00
dtype: float64
df1.add(se1)
Open High Low Close Turnover Volume
Date
2017-09-15 16.16 0.12 16.03 16.19 395.43 56272800.0
2017-09-18 16.15 16.33 16.03 16.21 400.76 56867600.0
2017-09-19 16.13 16.26 15.94 16.15 633.76 86253100.0
2017-09-20 16.07 16.26 15.95 16.18 519.94 71909700.0
2017-09-21 16.12 16.30 15.99 16.19 441.94 62056600.0
df1.add(df1)
Open High Low Close Turnover Volume
Date
2017-09-15 16.12 -16.16 16.06 16.08 390.86 48545600
2017-09-18 16.10 16.26 16.06 16.12 401.52 49735200
2017-09-19 16.06 16.12 15.88 16.00 867.52 108506200
2017-09-20 15.94 16.12 15.90 16.06 639.88 79819400
2017-09-21 16.04 16.20 15.98 16.08 483.88 60113200
3.add_prefix()和add_suffix()为列名添加前缀或后缀
df1.add_prefix('list')
listOpen listHigh listLow listClose listTurnover listVolume
Date
2017-09-15 8.06 8.08 8.03 8.04 195.43 24272800
2017-09-18 8.05 8.13 8.03 8.06 200.76 24867600
2017-09-19 8.03 8.06 7.94 8.00 433.76 54253100
2017-09-20 7.97 8.06 7.95 8.03 319.94 39909700
2017-09-21 8.02 8.10 7.99 8.04 241.94 30056600
df1.add_suffix('list')
Openlist Highlist Lowlist Closelist Turnoverlist Volumelist
Date
2017-09-15 8.06 8.08 8.03 8.04 195.43 24272800
2017-09-18 8.05 8.13 8.03 8.06 200.76 24867600
2017-09-19 8.03 8.06 7.94 8.00 433.76 54253100
2017-09-20 7.97 8.06 7.95 8.03 319.94 39909700
2017-09-21 8.02 8.10 7.99 8.04 241.94 30056600
4.agg(func, axis=0, *args, **kwargs),聚合(aggregate)运算,对行或列应用一个或多个聚合函数,常用的函数有min,max,prod,mean,std,var,median等
#所有列只做一种运算
df1.agg(sum)
Open 4.013000e+01
High 4.043000e+01
Low 3.994000e+01
Close 4.017000e+01
Turnover 1.391830e+03
Volume 1.733598e+08
dtype: float64
#所有列做两种运算
df1.agg(['sum','min'])
Open High Low Close Turnover Volume
sum 40.13 40.43 39.94 40.17 1391.83 173359800
min 7.97 8.06 7.94 8.00 195.43 24272800
#不同列做不同运算
df1.agg({'Open':['sum','min'],'Close':['sum','max']})
Close Open
max 8.06 NaN
min NaN 7.97
sum 40.17 40.13
5.align(),将DataFrame与Series或另一个DataFrame按索引(标签)对齐,返回对齐后的一对新对象;join参数常用取值有inner(内联),outer(外联),left(左联),right(右联)
df2=df[3:5]
df2
Out[68]:
Open High Low Close Turnover Volume
Date
2017-09-20 7.97 8.06 7.95 8.03 319.94 39909700
2017-09-21 8.02 8.10 7.99 8.04 241.94 30056600
df1.align(df2,join='inner') #返回的为元组类型对象
( Open High Low Close Turnover Volume
Date
2017-09-20 7.97 8.06 7.95 8.03 319.94 39909700
2017-09-21 8.02 8.10 7.99 8.04 241.94 30056600,
Open High Low Close Turnover Volume
Date
2017-09-20 7.97 8.06 7.95 8.03 319.94 39909700
2017-09-21 8.02 8.10 7.99 8.04 241.94 30056600)
df1.align(df2,join='left')
Out[69]:
( Open High Low Close Turnover Volume
Date
2017-09-15 8.06 8.08 8.03 8.04 195.43 24272800
2017-09-18 8.05 8.13 8.03 8.06 200.76 24867600
2017-09-19 8.03 8.06 7.94 8.00 433.76 54253100
2017-09-20 7.97 8.06 7.95 8.03 319.94 39909700
2017-09-21 8.02 8.10 7.99 8.04 241.94 30056600,
Open High Low Close Turnover Volume
Date
2017-09-15 NaN NaN NaN NaN NaN NaN
2017-09-18 NaN NaN NaN NaN NaN NaN
2017-09-19 NaN NaN NaN NaN NaN NaN
2017-09-20 7.97 8.06 7.95 8.03 319.94 39909700.0
2017-09-21 8.02 8.10 7.99 8.04 241.94 30056600.0)
df1.align(df2,join='left')[0]
Out[70]:
Open High Low Close Turnover Volume
Date
2017-09-15 8.06 8.08 8.03 8.04 195.43 24272800
2017-09-18 8.05 8.13 8.03 8.06 200.76 24867600
2017-09-19 8.03 8.06 7.94 8.00 433.76 54253100
2017-09-20 7.97 8.06 7.95 8.03 319.94 39909700
2017-09-21 8.02 8.10 7.99 8.04 241.94 30056600
6.all()和any(),沿选定的轴分别判断元素是否全部为真(非零且非空)或是否存在任意一个为真的元素,返回值为Boolean类型
df1.all(axis=0)
Out[72]:
Open True
High True
Low True
Close True
Turnover True
Volume True
dtype: bool
df1.all(axis=1)
Out[73]:
Date
2017-09-15 True
2017-09-18 True
2017-09-19 True
2017-09-20 True
2017-09-21 True
dtype: bool
df1.any()
Out[74]:
Open True
High True
Low True
Close True
Turnover True
Volume True
dtype: bool
7.append(),在此表格尾部添加其他对象的行,返回一个新的对象(注意:append()已被弃用并在pandas 2.0中移除,新代码请改用pd.concat())
df2=df[5:7]
df2
Out[93]:
Open High Low Close Turnover Volume
Date
2017-09-22 8.01 8.10 8.00 8.08 300.13 37212200
2017-09-25 8.06 8.07 7.97 7.99 262.30 32754500
df1.append(df2)
Out[94]:
Open High Low Close Turnover Volume
Date
2017-09-15 8.06 8.08 8.03 8.04 195.43 24272800
2017-09-18 8.05 8.13 8.03 8.06 200.76 24867600
2017-09-19 8.03 8.06 7.94 8.00 433.76 54253100
2017-09-20 7.97 8.06 7.95 8.03 319.94 39909700
2017-09-21 8.02 8.10 7.99 8.04 241.94 30056600
2017-09-22 8.01 8.10 8.00 8.08 300.13 37212200
2017-09-25 8.06 8.07 7.97 7.99 262.30 32754500
#更高效的方法
>>> pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)],
... ignore_index=True)
A
0 0
1 1
2 2
3 3
4 4
8.apply(func, axis=0, broadcast=False, raw=False, reduce=None, args=(), **kwds) 对于DataFrame的行或列应用某个函数
df1.apply(np.mean,axis=0)
Out[96]:
Open 8.026000e+00
High 8.086000e+00
Low 7.988000e+00
Close 8.034000e+00
Turnover 2.783660e+02
Volume 3.467196e+07
dtype: float64
df1.apply(np.max,axis=1)
Out[97]:
Date
2017-09-15 24272800.0
2017-09-18 24867600.0
2017-09-19 54253100.0
2017-09-20 39909700.0
2017-09-21 30056600.0
dtype: float64
9.applymap(func) 对DataFrame的元素应用某个函数
df1.applymap(lambda x:'%.3f' %x)
Out[100]:
Open High Low Close Turnover Volume
Date
2017-09-15 8.060 8.080 8.030 8.040 195.430 24272800.000
2017-09-18 8.050 8.130 8.030 8.060 200.760 24867600.000
2017-09-19 8.030 8.060 7.940 8.000 433.760 54253100.000
2017-09-20 7.970 8.060 7.950 8.030 319.940 39909700.000
2017-09-21 8.020 8.100 7.990 8.040 241.940 30056600.000
10.as_blocks()和as_matrix(),分别用于将DataFrame转化为以数据类型为键值的字典和将DataFrame转化为二维数组(注意:这两个方法均已被弃用并在新版pandas中移除,取二维数组请改用df.to_numpy()或df.values)
df1.as_blocks()
Out[105]:
{'float64': Open High Low Close Turnover
Date
2017-09-15 8.06 8.08 8.03 8.04 195.43
2017-09-18 8.05 8.13 8.03 8.06 200.76
2017-09-19 8.03 8.06 7.94 8.00 433.76
2017-09-20 7.97 8.06 7.95 8.03 319.94
2017-09-21 8.02 8.10 7.99 8.04 241.94, 'int64': Volume
Date
2017-09-15 24272800
2017-09-18 24867600
2017-09-19 54253100
2017-09-20 39909700
2017-09-21 30056600}
df1.as_matrix()
Out[106]:
array([[ 8.06000000e+00, 8.08000000e+00, 8.03000000e+00,
8.04000000e+00, 1.95430000e+02, 2.42728000e+07],
[ 8.05000000e+00, 8.13000000e+00, 8.03000000e+00,
8.06000000e+00, 2.00760000e+02, 2.48676000e+07],
[ 8.03000000e+00, 8.06000000e+00, 7.94000000e+00,
8.00000000e+00, 4.33760000e+02, 5.42531000e+07],
[ 7.97000000e+00, 8.06000000e+00, 7.95000000e+00,
8.03000000e+00, 3.19940000e+02, 3.99097000e+07],
[ 8.02000000e+00, 8.10000000e+00, 7.99000000e+00,
8.04000000e+00, 2.41940000e+02, 3.00566000e+07]])
11.asfreq(freq, method=None, how=None, normalize=False, fill_value=None),将时间序列转化为特定的频度
#创建一个具有4个分钟时间戳的序列
>>> index=pd.date_range('1/1/2017',periods=4,freq='T')
>>> series=pd.Series([0.0,None,2.0,3.0],index=index)
>>> df=pd.DataFrame({'S':series})
>>> df
S
2017-01-01 00:00:00 0.0
2017-01-01 00:01:00 NaN
2017-01-01 00:02:00 2.0
2017-01-01 00:03:00 3.0
#将序列升采样为以30秒为间隔的时间序列
>>> df.asfreq(freq='30S')
S
2017-01-01 00:00:00 0.0
2017-01-01 00:00:30 NaN
2017-01-01 00:01:00 NaN
2017-01-01 00:01:30 NaN
2017-01-01 00:02:00 2.0
2017-01-01 00:02:30 NaN
2017-01-01 00:03:00 3.0
#再次升采样,并将填充值设为5.0,可以发现并不改变升采样之前的数值
>>> df.asfreq(freq='30S',fill_value=5.0)
S
2017-01-01 00:00:00 0.0
2017-01-01 00:00:30 5.0
2017-01-01 00:01:00 NaN
2017-01-01 00:01:30 5.0
2017-01-01 00:02:00 2.0
2017-01-01 00:02:30 5.0
2017-01-01 00:03:00 3.0
#再次升采样,提供一个方法,对于空值,用后面的一个值填充
>>> df.asfreq(freq='30S',method='bfill')
S
2017-01-01 00:00:00 0.0
2017-01-01 00:00:30 NaN
2017-01-01 00:01:00 NaN
2017-01-01 00:01:30 2.0
2017-01-01 00:02:00 2.0
2017-01-01 00:02:30 3.0
2017-01-01 00:03:00 3.0
12.asof(where, subset=None),返回指定位置(where)处的值;若该处为空值,则向前取最近一个非NaN的行/值
>>> df.asof(index[0])
S 0.0
Name: 2017-01-01 00:00:00, dtype: float64
>>> df.asof(index)
S
2017-01-01 00:00:00 0.0
2017-01-01 00:01:00 0.0
2017-01-01 00:02:00 2.0
2017-01-01 00:03:00 3.0
13.assign(**kwargs),向DataFrame添加新的列,返回一个新的对象包括了原来的列和新增加的列
>>> df=pd.DataFrame({'A':range(1,11),'B':np.random.randn(10)})
>>> df
A B
0 1 0.540750
1 2 0.099605
2 3 0.165043
3 4 -1.379514
4 5 0.357865
5 6 -0.060789
6 7 -0.544788
7 8 -0.347995
8 9 0.372269
9 10 -0.212716
>>> df.assign(ln_A=lambda x:np.log(x.A))
A B ln_A
0 1 0.540750 0.000000
1 2 0.099605 0.693147
2 3 0.165043 1.098612
3 4 -1.379514 1.386294
4 5 0.357865 1.609438
5 6 -0.060789 1.791759
6 7 -0.544788 1.945910
7 8 -0.347995 2.079442
8 9 0.372269 2.197225
9 10 -0.212716 2.302585
#assign返回新对象,并不修改原df,因此前一次assign添加的列不会出现在这次的结果中;一次调用也可以同时传入多个关键字参数添加多列
>>> df.assign(abs_B=lambda x:np.abs(x.B))
A B abs_B
0 1 0.540750 0.540750
1 2 0.099605 0.099605
2 3 0.165043 0.165043
3 4 -1.379514 1.379514
4 5 0.357865 0.357865
5 6 -0.060789 0.060789
6 7 -0.544788 0.544788
7 8 -0.347995 0.347995
8 9 0.372269 0.372269
9 10 -0.212716 0.212716
14.astype(dtype, copy=True, errors='raise', **kwargs) 将pandas对象数据类型设置为指定类型
>>> ser=pd.Series([5,6],dtype='int32')
>>> ser
0 5
1 6
dtype: int32
>>> ser.astype('int64')
0 5
1 6
dtype: int64
#转换为类目类型
>>> ser.astype('category')
0 5
1 6
dtype: category
Categories (2, int64): [5, 6]
>>>
#转换为定制化排序的类目类型(注意:向astype直接传ordered/categories的写法已被弃用,新版pandas请改用pd.CategoricalDtype(categories=[1,2],ordered=True))
>>> ser.astype('category',ordered=True,categories=[1,2])
0 NaN
1 NaN
dtype: category
Categories (2, int64): [1 < 2]
15. at_time()和between_time() 取某一时刻或某段时间相应的数据
df1.at_time('9:00AM')
Out[115]:
Empty DataFrame
Columns: [Open, High, Low, Close, Turnover, Volume]
Index: []
df1.between_time('9:00AM','9:30AM')
Out[114]:
Empty DataFrame
Columns: [Open, High, Low, Close, Turnover, Volume]
Index: []
df1.at_time('00:00AM')
Out[116]:
Open High Low Close Turnover Volume
Date
2017-09-15 8.06 8.08 8.03 8.04 195.43 24272800
2017-09-18 8.05 8.13 8.03 8.06 200.76 24867600
2017-09-19 8.03 8.06 7.94 8.00 433.76 54253100
2017-09-20 7.97 8.06 7.95 8.03 319.94 39909700
2017-09-21 8.02 8.10 7.99 8.04 241.94 30056600
16.bfill(axis=None, inplace=False, limit=None, downcast=None)和fillna(method='bfill')效用等同
17.boxplot(column=None, by=None, ax=None, fontsize=None, rot=0, grid=True, figsize=None, layout=None, return_type=None, **kwds)
根据DataFrame的列元素或者可选分组绘制箱线图
df1.boxplot('Open')
Out[117]: <matplotlib.axes._subplots.AxesSubplot at 0x20374716860>
df1.boxplot(['Open','Close'])
Out[118]: <matplotlib.axes._subplots.AxesSubplot at 0x2037477da20>
18.mode(axis=0, numeric_only=False)取众数
Signature: df.mode(axis=0, numeric_only=False)
Docstring:
Gets the mode(s) of each element along the axis selected. Adds a row
for each mode per label, fills in gaps with nan.
Note that there could be multiple values returned for the selected
axis (when more than one item share the maximum frequency), which is
the reason why a dataframe is returned. If you want to impute missing
values with the mode in a dataframe ``df``, you can just do this:
``df.fillna(df.mode().iloc[0])``
Parameters
----------
axis : {0 or 'index', 1 or 'columns'}, default 0
* 0 or 'index' : get mode of each column
* 1 or 'columns' : get mode of each row
numeric_only : boolean, default False
if True, only apply to numeric columns
import pandas as pd
df = pd.DataFrame({'A': [1, 2, 1, 1, 1, 2, 3],
'B':[3,2,1,7,7,4,4]})
df.mode()
Out[1]:
A B
0 1.0 4
1 NaN 7
df = pd.DataFrame({'A': [1, 2, 1, 1, 1, 2, 3],
'B':[3,2,7,7,7,4,4]})
df.mode()
Out[2]:
A B
0 1 7
df = pd.DataFrame({'A': [1, 2, 1, 1, 1, 2, 3],
'B':[3,1,1,7,7,4,4]})
df.mode()
Out[3]:
A B
0 1.0 1
1 NaN 4
2 NaN 7
df = pd.DataFrame({'A': [1, 2, 2, 1, 1, 2, 3],
'B':[3,2,1,7,6,4,4]})
df.mode()
Out[5]:
A B
0 1 4.0
1 2 NaN
df = pd.DataFrame({'A': [1, 2, 2, 1, 1, 2, 3],
'B':[3,2,1,7,7,4,4]})
df.mode()
Out[6]:
A B
0 1 4
1 2 7
df = pd.DataFrame({'A': [1, 4, 2, 1, 3, 2, 3],
'B':[3,2,7,7,7,4,4]})
df.mode()
Out[8]:
A B
0 1 7.0
1 2 NaN
2 3 NaN
19 pandas:对dataframe进行groupby后求众数mode
有如下一个dataframe,打算对a的每一个类别求b的众数(mode)。
dir(df.groupby('a'))
可以看到其中是没有mode函数的,因此不能直接使用df.groupby('a').mode().reset_index()
注意:.reset_index()的作用是将Series对象转变成DataFrame。
既然df.groupby('a')没有mode函数,又考虑到聚合函数agg可能能够接用外部函数,搜索网上的解决办法,经过尝试,在此总结并列出以下几种解决方案(当然也可以直接跳到最后一个解决方案)。
(1) 使用scipy.stats.mode():df中的B类别有两个众数,返回的结果B类别的众数取了较小的结果
>>> from scipy import stats
>>> df.groupby('a').agg(lambda x: stats.mode(x)[0][0]).reset_index()
a b
0 A 1
1 B 2
(2) 使用value_counts()
(1) 先看value_counts()的作用:可以看到得到的结果中的index是取值,内容是计数,并且结果按计数降序排列,因此取index[0]得到出现次数最多的值;当存在多个并列的众数时,本例中恰好返回了较大的那个(并列时的先后顺序并无严格保证)
>>> ss = pd.Series([1,2,2,3,3])
>>> ss
0 1
1 2
2 2
3 3
4 3
dtype: int64
>>> ss.value_counts()
3 2
2 2
1 1
dtype: int64
>>> ss.value_counts().index[0]
3
(2) 应用到dataframe的groupby之后的聚合函数中:
>>> df.groupby('a').agg(lambda x: x.value_counts().index[0]).reset_index()
a b
0 A 1
1 B 3
(3)使用pd.Series.mode():该函数是返回Series的众数的,当众数有多个时,会返回一个list,里面包含了所有众数
>>> df.groupby('a').agg(pd.Series.mode).reset_index()
a b
0 A 1
1 B [2, 3]
(4)使用pd.Series.mode()和np.mean():上述结果显然不是我想要的,但是如果对有多个众数的结果取均值作为新的众数,是较为满意的结果,则可以这样
>>> import numpy as np
>>> df.groupby('a').agg(lambda x: np.mean(pd.Series.mode(x))).reset_index()
a b
0 A 1.0
1 B 2.5
20 Pandas 描述统计函数
在进行统计描述时,pandas对三个数据对象的轴参数规定如下:
Series: 没有轴参数
DataFrame: “index” (axis=0, default), “columns” (axis=1)
Panel: “items” (axis=0), “major” (axis=1, default), “minor” (axis=2)
统计描述参数如下
df.count() #非空元素计算
df.min() #最小值
df.max() #最大值
df.idxmin() #最小值的位置,类似于R中的which.min函数
df.idxmax() #最大值的位置,类似于R中的which.max函数
df.quantile(0.1) #10%分位数
df.sum() #求和
df.mean() #均值
df.median() #中位数
df.mode() #众数
df.var() #方差
df.std() #标准差
df.mad() #平均绝对偏差
df.skew() #偏度
df.kurt() #峰度
df.describe() #一次性输出多个描述性统计指标
count
Number of non-null observations
观测值的个数
sum
Sum of values
求和
mean
Mean of values
求平均值
mad
Mean absolute deviation
平均绝对方差
median
Arithmetic median of values
中位数
min
Minimum
最小值
max
Maximum
最大值
mode
Mode
众数
abs
Absolute Value
绝对值
prod
Product of values
乘积
std
Bessel-corrected sample standard deviation
标准差
var
Unbiased variance
方差
sem
Standard error of the mean
标准误
skew
Sample skewness (3rd moment)
偏度系数
kurt
Sample kurtosis (4th moment)
峰度
quantile
Sample quantile (value at %)
分位数
cumsum
Cumulative sum
累加
cumprod
Cumulative product
累乘
cummax
Cumulative maximum
累最大值
cummin
Cumulative minimum
累最小值
cov()
covariance
协方差
corr()
correlation
相关系数
rank()
rank by values
排名
pct_change()
time change
时间序列变
21 pandas replace函数使用小结
import numpy as np
import pandas as pd
df = pd.read_csv('emp.csv')
df
#Series对象值替换
s = df.iloc[2]#获取行索引为2数据
#单值替换
s.replace('?',np.nan)#用np.nan替换?
s.replace({'?':'NA'})#用NA替换?
#多值替换
s.replace(['?',r'$'],[np.nan,'NA'])#列表值替换
s.replace({'?':np.nan,'$':'NA'})#字典映射
#同缺失值填充方法类似
s.replace(['?','$'],method='pad')#向前填充
s.replace(['?','$'],method='ffill')#向前填充
s.replace(['?','$'],method='bfill')#向后填充
#limit参数控制填充次数
s.replace(['?','$'],method='bfill',limit=1)
#DataFrame对象值替换
#单值替换
df.replace('?',np.nan)#用np.nan替换?
df.replace({'?':'NA'})#用NA替换?
#按列指定单值替换
df.replace({'EMPNO':'?'},np.nan)#用np.nan替换EMPNO列中?
df.replace({'EMPNO':'?','ENAME':'.'},np.nan)#用np.nan替换EMPNO列中?和ENAME中.
#多值替换
df.replace(['?','.','$'],[np.nan,'NA','None'])##用np.nan替换?用NA替换. 用None替换$
df.replace({'?':'NA','$':None})#用NA替换? 用None替换$
df.replace({'?','$'},{'NA',None})#用NA替换? 用None替换$
#正则替换
df.replace(r'\?|\.|\$',np.nan,regex=True)#用np.nan替换?或.或$原字符
df.replace([r'\?',r'\$'],np.nan,regex=True)#用np.nan替换?和$
df.replace([r'\?',r'\$'],[np.nan,'NA'],regex=True)#用np.nan替换?用NA替换$符号
df.replace(regex={r'\?':None})
#value参数显示传递
df.replace(regex=[r'\?|\.|\$'],value=np.nan)#用np.nan替换?或.或$原字符
df['BrandName'].replace(
to_replace=['ABC', 'AB'],
value='A',
inplace=True
)
网友评论