美文网首页
11-数值计算和统计基础

11-数值计算和统计基础

作者: 蓝剑狼 | 来源:发表于2018-08-26 22:49 被阅读11次

常用数学、统计方法

import numpy as np
import pandas as pd

# Demo frame: key1 and key2 are float columns that each contain one NaN;
# key3 mixes integers and strings, so pandas stores it with dtype object.
df = pd.DataFrame(
    {
        'key1': [4, 5, 3, np.nan, 2],
        'key2': [1, 2, np.nan, 4, 5],
        'key3': [1, 2, 3, 'j', 'k'],
    },
    index=['a', 'b', 'c', 'd', 'e'],
)
print("1".center(40,'*'))
print(df)
print("2".center(40,'*'))
print(df['key1'].dtype,df['key2'].dtype,df['key3'].dtype)


# np.nan marks a missing value.
# .mean() computes the arithmetic mean; by default it works column by column.
# numeric_only=True restricts the calculation to the numeric columns.  Older
# pandas releases dropped the object column key3 silently; newer releases
# raise a TypeError instead, so the restriction is spelled out explicitly.
m1 = df.mean(numeric_only=True)
print("3".center(40,'*'))
print(m1,type(m1))
# A single column can be selected by label and averaged on its own.
print('单独统计一列:',df['key2'].mean())

# axis parameter: the default 0 aggregates down each column, axis=1
# aggregates across each row, giving one mean per row label.
m2 = df.mean(axis=1, numeric_only=True)
print("4".center(40,'*'))
print(m2)

# skipna parameter: whether NaN is ignored (default True).  With
# skipna=False, any column that contains a NaN yields NaN as its result.
m3 = df.mean(skipna=False, numeric_only=True)
print("5".center(40,'*'))
print(m3)
#执行结果
*******************1********************
   key1  key2 key3
a   4.0   1.0    1
b   5.0   2.0    2
c   3.0   NaN    3
d   NaN   4.0    j
e   2.0   5.0    k
*******************2********************
float64 float64 object
*******************3********************
key1    3.5
key2    3.0
dtype: float64 <class 'pandas.core.series.Series'>
单独统计一列: 3.0
*******************4********************
a    2.5
b    3.5
c    3.0
d    4.0
e    3.5
dtype: float64
*******************5********************
key1   NaN
key2   NaN
dtype: float64
# 主要数学计算方法,可用于Series和DataFrame(1)

# Sample frame: key1 holds the integers 0..9, key2 holds ten random floats
# scaled into the range [0, 10).
df = pd.DataFrame({
    'key1': np.arange(10),
    'key2': 10 * np.random.rand(10),
})
print(df)
print('-----')

# Each call below reduces every column to one value and returns a Series
# indexed by column name (max is shown for the single column key2 instead).
print(df.count(), '→ count统计非Na值的数量\n')
print(df.min(), '→ min统计最小值\n', df.key2.max(), '→ max统计最大值\n')
print(df.quantile(0.75), '→ quantile统计分位数,参数q确定位置\n')
print(df.sum(), '→ sum求和\n')
print(df.mean(), '→ mean求平均值\n')
print(df.median(), '→ median求算数中位数,50%分位数\n')
print(df.std(), '\n', df.var(), '→ std,var分别求标准差,方差\n')
print(df.skew(), '→ skew样本的偏度\n')
print(df.kurtosis(), '→ kurt样本的峰度\n')
#执行结果
   key1      key2
0     0  3.738954
1     1  3.832567
2     2  6.699210
3     3  4.084607
4     4  7.456708
5     5  8.323144
6     6  9.040738
7     7  5.164880
8     8  0.094538
9     9  7.399022
-----
key1    10
key2    10
dtype: int64 → count统计非Na值的数量

key1    0.000000
key2    0.094538
dtype: float64 → min统计最小值
 9.040737765606417 → max统计最大值

key1    6.750000
key2    7.442286
Name: 0.75, dtype: float64 → quantile统计分位数,参数q确定位置

key1    45.000000
key2    55.834368
dtype: float64 → sum求和

key1    4.500000
key2    5.583437
dtype: float64 → mean求平均值

key1    4.500000
key2    5.932045
dtype: float64 → median求算数中位数,50%分位数

key1    3.027650
key2    2.718797
dtype: float64 
 key1    9.166667
key2    7.391858
dtype: float64 → std,var分别求标准差,方差

key1    0.000000
key2   -0.722995
dtype: float64 → skew样本的偏度

key1   -1.200000
key2    0.285023
dtype: float64 → kurt样本的峰度
# 主要数学计算方法,可用于Series和DataFrame(2)

# Running totals of key1 and key2, stored as two new columns.
df['key1_s'], df['key2_s'] = df['key1'].cumsum(), df['key2'].cumsum()
print(df, '→ cumsum样本的累计和\n')

# Running products of key1 and key2, stored as two more columns.
df['key1_p'], df['key2_p'] = df['key1'].cumprod(), df['key2'].cumprod()
print(df, '→ cumprod样本的累计积\n')

# cummax / cummin are applied to the whole frame here, so every column
# (the derived ones included) is shown as its running maximum / minimum.
print(df.cummax(), '\n', df.cummin(), '→ cummax,cummin分别求累计最大值,累计最小值\n')
#执行结果
   key1      key2  key1_s     key2_s
0     0  3.738954       0   3.738954
1     1  3.832567       1   7.571522
2     2  6.699210       3  14.270731
3     3  4.084607       6  18.355338
4     4  7.456708      10  25.812046
5     5  8.323144      15  34.135191
6     6  9.040738      21  43.175928
7     7  5.164880      28  48.340808
8     8  0.094538      36  48.435346
9     9  7.399022      45  55.834368 → cumsum样本的累计和

   key1      key2  key1_s     key2_s  key1_p        key2_p
0     0  3.738954       0   3.738954       0  3.738954e+00
1     1  3.832567       1   7.571522       0  1.432979e+01
2     2  6.699210       3  14.270731       0  9.599830e+01
3     3  4.084607       6  18.355338       0  3.921153e+02
4     4  7.456708      10  25.812046       0  2.923889e+03
5     5  8.323144      15  34.135191       0  2.433595e+04
6     6  9.040738      21  43.175928       0  2.200150e+05
7     7  5.164880      28  48.340808       0  1.136351e+06
8     8  0.094538      36  48.435346       0  1.074280e+05
9     9  7.399022      45  55.834368       0  7.948625e+05 → cumprod样本的累计积

   key1      key2  key1_s     key2_s  key1_p        key2_p
0   0.0  3.738954     0.0   3.738954     0.0  3.738954e+00
1   1.0  3.832567     1.0   7.571522     0.0  1.432979e+01
2   2.0  6.699210     3.0  14.270731     0.0  9.599830e+01
3   3.0  6.699210     6.0  18.355338     0.0  3.921153e+02
4   4.0  7.456708    10.0  25.812046     0.0  2.923889e+03
5   5.0  8.323144    15.0  34.135191     0.0  2.433595e+04
6   6.0  9.040738    21.0  43.175928     0.0  2.200150e+05
7   7.0  9.040738    28.0  48.340808     0.0  1.136351e+06
8   8.0  9.040738    36.0  48.435346     0.0  1.136351e+06
9   9.0  9.040738    45.0  55.834368     0.0  1.136351e+06 
    key1      key2  key1_s    key2_s  key1_p    key2_p
0   0.0  3.738954     0.0  3.738954     0.0  3.738954
1   0.0  3.738954     0.0  3.738954     0.0  3.738954
2   0.0  3.738954     0.0  3.738954     0.0  3.738954
3   0.0  3.738954     0.0  3.738954     0.0  3.738954
4   0.0  3.738954     0.0  3.738954     0.0  3.738954
5   0.0  3.738954     0.0  3.738954     0.0  3.738954
6   0.0  3.738954     0.0  3.738954     0.0  3.738954
7   0.0  3.738954     0.0  3.738954     0.0  3.738954
8   0.0  0.094538     0.0  3.738954     0.0  3.738954
9   0.0  0.094538     0.0  3.738954     0.0  3.738954 → cummax,cummin分别求累计最大值,累计最小值

# 唯一值:.unique()

# Source Series: one character per element, with several repeats.
s = pd.Series([*'asdvasdcfgg'])

# unique() returns the distinct values as a NumPy array, in the order in
# which they first appear.
sq = s.unique()
print("1".center(40, '*'))
print(s)
print("2".center(40, '*'))
print(sq, type(sq))
# Wrapping the array in pd.Series turns it back into a Series.
print("3".center(40, '*'))
print(pd.Series(sq))

# ndarray.sort() orders the unique values in place.
sq.sort()
print("4".center(40, '*'))
print(sq)
#执行结果
*******************1********************
0     a
1     s
2     d
3     v
4     a
5     s
6     d
7     c
8     f
9     g
10    g
dtype: object
*******************2********************
['a' 's' 'd' 'v' 'c' 'f' 'g'] <class 'numpy.ndarray'>
*******************3********************
0    a
1    s
2    d
3    v
4    c
5    f
6    g
dtype: object
*******************4********************
['a' 'c' 'd' 'f' 'g' 's' 'v']
# value_counts() builds a new Series that maps each distinct value of s to
# the number of times it occurs.
# sort parameter: order the result by count; it defaults to True and is
# switched off here.
# The older top-level spelling pd.value_counts(s, sort=False) gives the same
# result, but is deprecated in recent pandas releases.
sc = s.value_counts(sort=False)
print(sc)
#执行结果
c    1
d    2
v    1
g    2
s    2
f    1
a    2
dtype: int64
# 成员资格:.isin()

# Inputs: an integer Series 10..14 and a frame with one string column and
# one integer column (4..12).
s = pd.Series(np.arange(10, 15))
df = pd.DataFrame({
    'key1': [*'asdcbvasd'],
    'key2': np.arange(4, 13),
})
print("1".center(40, '*'))
print(s)
print(df)

# isin() takes the candidate values as a list and returns a boolean Series
# or DataFrame of the same shape, True where the element is in that list.
print("2".center(40, '*'))
print(s.isin([5, 14]))
print(df.isin(['a', 'bc', '10', 8]))
#执行结果
*******************1********************
0    10
1    11
2    12
3    13
4    14
dtype: int32
  key1  key2
0    a     4
1    s     5
2    d     6
3    c     7
4    b     8
5    v     9
6    a    10
7    s    11
8    d    12
*******************2********************
0    False
1    False
2    False
3    False
4     True
dtype: bool
    key1   key2
0   True  False
1  False  False
2  False  False
3  False  False
4  False   True
5  False  False
6   True  False
7  False  False
8  False  False

相关文章

网友评论

      本文标题:11-数值计算和统计基础

      本文链接:https://www.haomeiwen.com/subject/hubpbftx.html