美文网首页
11-数值计算和统计基础

11-数值计算和统计基础

作者: 蓝剑狼 | 来源:发表于2018-08-26 22:49 被阅读11次

    常用数学、统计方法

    import numpy as np
    import pandas as pd
    
    df = pd.DataFrame({'key1':[4,5,3,np.nan,2],
                     'key2':[1,2,np.nan,4,5],
                     'key3':[1,2,3,'j','k']},
                     index = ['a','b','c','d','e'])
    print("1".center(40,'*'))
    print(df)
    print("2".center(40,'*'))
    print(df['key1'].dtype,df['key2'].dtype,df['key3'].dtype)
    
    
    m1 = df.mean()
    print("3".center(40,'*'))
    print(m1,type(m1))
    print('单独统计一列:',df['key2'].mean())
    # np.nan :空值
    # .mean()计算均值
    # 只统计数字列(注意:pandas 2.0 起 numeric_only 默认为 False,本例含非数值列 key3,需写成 df.mean(numeric_only=True),否则会报错)
    # 可以通过索引单独统计一列
    
    m2 = df.mean(axis=1)
    print("4".center(40,'*'))
    print(m2)
    # axis参数:默认为0,以列来计算,axis=1,以行来计算,这里就按照行来汇总了
    
    m3 = df.mean(skipna=False)
    print("5".center(40,'*'))
    print(m3)
    # skipna参数:是否忽略NaN,默认True,如False,有NaN的列统计结果仍为NaN
    #执行结果
    *******************1********************
       key1  key2 key3
    a   4.0   1.0    1
    b   5.0   2.0    2
    c   3.0   NaN    3
    d   NaN   4.0    j
    e   2.0   5.0    k
    *******************2********************
    float64 float64 object
    *******************3********************
    key1    3.5
    key2    3.0
    dtype: float64 <class 'pandas.core.series.Series'>
    单独统计一列: 3.0
    *******************4********************
    a    2.5
    b    3.5
    c    3.0
    d    4.0
    e    3.5
    dtype: float64
    *******************5********************
    key1   NaN
    key2   NaN
    dtype: float64
    
    # 主要数学计算方法,可用于Series和DataFrame(1)
    
    df = pd.DataFrame({'key1':np.arange(10),
                      'key2':np.random.rand(10)*10})
    print(df)
    print('-----')
    
    print(df.count(),'→ count统计非Na值的数量\n')
    print(df.min(),'→ min统计最小值\n',df['key2'].max(),'→ max统计最大值\n')
    print(df.quantile(q=0.75),'→ quantile统计分位数,参数q确定位置\n')
    print(df.sum(),'→ sum求和\n')
    print(df.mean(),'→ mean求平均值\n')
    print(df.median(),'→ median求算数中位数,50%分位数\n')
    print(df.std(),'\n',df.var(),'→ std,var分别求标准差,方差\n')
    print(df.skew(),'→ skew样本的偏度\n')
    print(df.kurt(),'→ kurt样本的峰度\n')
    #执行结果
       key1      key2
    0     0  3.738954
    1     1  3.832567
    2     2  6.699210
    3     3  4.084607
    4     4  7.456708
    5     5  8.323144
    6     6  9.040738
    7     7  5.164880
    8     8  0.094538
    9     9  7.399022
    -----
    key1    10
    key2    10
    dtype: int64 → count统计非Na值的数量
    
    key1    0.000000
    key2    0.094538
    dtype: float64 → min统计最小值
     9.040737765606417 → max统计最大值
    
    key1    6.750000
    key2    7.442286
    Name: 0.75, dtype: float64 → quantile统计分位数,参数q确定位置
    
    key1    45.000000
    key2    55.834368
    dtype: float64 → sum求和
    
    key1    4.500000
    key2    5.583437
    dtype: float64 → mean求平均值
    
    key1    4.500000
    key2    5.932045
    dtype: float64 → median求算数中位数,50%分位数
    
    key1    3.027650
    key2    2.718797
    dtype: float64 
     key1    9.166667
    key2    7.391858
    dtype: float64 → std,var分别求标准差,方差
    
    key1    0.000000
    key2   -0.722995
    dtype: float64 → skew样本的偏度
    
    key1   -1.200000
    key2    0.285023
    dtype: float64 → kurt样本的峰度
    
    # 主要数学计算方法,可用于Series和DataFrame(2)
    
    df['key1_s'] = df['key1'].cumsum()
    df['key2_s'] = df['key2'].cumsum()
    print(df,'→ cumsum样本的累计和\n')
    
    df['key1_p'] = df['key1'].cumprod()
    df['key2_p'] = df['key2'].cumprod()
    print(df,'→ cumprod样本的累计积\n')
    
    print(df.cummax(),'\n',df.cummin(),'→ cummax,cummin分别求累计最大值,累计最小值\n')
    # 结果中每一列(包括key1和key2)的值都会被替换为截至该行的累计最大值/累计最小值
    #执行结果
       key1      key2  key1_s     key2_s
    0     0  3.738954       0   3.738954
    1     1  3.832567       1   7.571522
    2     2  6.699210       3  14.270731
    3     3  4.084607       6  18.355338
    4     4  7.456708      10  25.812046
    5     5  8.323144      15  34.135191
    6     6  9.040738      21  43.175928
    7     7  5.164880      28  48.340808
    8     8  0.094538      36  48.435346
    9     9  7.399022      45  55.834368 → cumsum样本的累计和
    
       key1      key2  key1_s     key2_s  key1_p        key2_p
    0     0  3.738954       0   3.738954       0  3.738954e+00
    1     1  3.832567       1   7.571522       0  1.432979e+01
    2     2  6.699210       3  14.270731       0  9.599830e+01
    3     3  4.084607       6  18.355338       0  3.921153e+02
    4     4  7.456708      10  25.812046       0  2.923889e+03
    5     5  8.323144      15  34.135191       0  2.433595e+04
    6     6  9.040738      21  43.175928       0  2.200150e+05
    7     7  5.164880      28  48.340808       0  1.136351e+06
    8     8  0.094538      36  48.435346       0  1.074280e+05
    9     9  7.399022      45  55.834368       0  7.948625e+05 → cumprod样本的累计积
    
       key1      key2  key1_s     key2_s  key1_p        key2_p
    0   0.0  3.738954     0.0   3.738954     0.0  3.738954e+00
    1   1.0  3.832567     1.0   7.571522     0.0  1.432979e+01
    2   2.0  6.699210     3.0  14.270731     0.0  9.599830e+01
    3   3.0  6.699210     6.0  18.355338     0.0  3.921153e+02
    4   4.0  7.456708    10.0  25.812046     0.0  2.923889e+03
    5   5.0  8.323144    15.0  34.135191     0.0  2.433595e+04
    6   6.0  9.040738    21.0  43.175928     0.0  2.200150e+05
    7   7.0  9.040738    28.0  48.340808     0.0  1.136351e+06
    8   8.0  9.040738    36.0  48.435346     0.0  1.136351e+06
    9   9.0  9.040738    45.0  55.834368     0.0  1.136351e+06 
        key1      key2  key1_s    key2_s  key1_p    key2_p
    0   0.0  3.738954     0.0  3.738954     0.0  3.738954
    1   0.0  3.738954     0.0  3.738954     0.0  3.738954
    2   0.0  3.738954     0.0  3.738954     0.0  3.738954
    3   0.0  3.738954     0.0  3.738954     0.0  3.738954
    4   0.0  3.738954     0.0  3.738954     0.0  3.738954
    5   0.0  3.738954     0.0  3.738954     0.0  3.738954
    6   0.0  3.738954     0.0  3.738954     0.0  3.738954
    7   0.0  3.738954     0.0  3.738954     0.0  3.738954
    8   0.0  0.094538     0.0  3.738954     0.0  3.738954
    9   0.0  0.094538     0.0  3.738954     0.0  3.738954 → cummax,cummin分别求累计最大值,累计最小值
    
    
    # 唯一值:.unique()
    
    s = pd.Series(list('asdvasdcfgg'))
    sq = s.unique()
    print("1".center(40,'*'))
    print(s)
    print("2".center(40,'*'))
    print(sq,type(sq))
    print("3".center(40,'*'))
    print(pd.Series(sq))
    # 得到一个唯一值数组
    # 通过pd.Series重新变成新的Series
    
    sq.sort()
    print("4".center(40,'*'))
    print(sq)
    # 重新排序
    #执行结果
    *******************1********************
    0     a
    1     s
    2     d
    3     v
    4     a
    5     s
    6     d
    7     c
    8     f
    9     g
    10    g
    dtype: object
    *******************2********************
    ['a' 's' 'd' 'v' 'c' 'f' 'g'] <class 'numpy.ndarray'>
    *******************3********************
    0    a
    1    s
    2    d
    3    v
    4    c
    5    f
    6    g
    dtype: object
    *******************4********************
    ['a' 'c' 'd' 'f' 'g' 's' 'v']
    
    sc = s.value_counts(sort = False)  # 也可以这样写:pd.value_counts(s, sort = False)
    print(sc)
    # 得到一个新的Series,统计每个不同值出现的次数(频数)
    # sort参数:排序,默认为True
    #执行结果
    c    1
    d    2
    v    1
    g    2
    s    2
    f    1
    a    2
    dtype: int64
    
    # 成员资格:.isin()
    
    s = pd.Series(np.arange(10,15))
    df = pd.DataFrame({'key1':list('asdcbvasd'),
                      'key2':np.arange(4,13)})
    print("1".center(40,'*'))
    print(s)
    print(df)
    
    print("2".center(40,'*'))
    print(s.isin([5,14]))
    print(df.isin(['a','bc','10',8]))
    # 待匹配的值需要放在列表[]中传入
    # 得到一个布尔值的Series或者Dataframe
    #执行结果
    *******************1********************
    0    10
    1    11
    2    12
    3    13
    4    14
    dtype: int32
      key1  key2
    0    a     4
    1    s     5
    2    d     6
    3    c     7
    4    b     8
    5    v     9
    6    a    10
    7    s    11
    8    d    12
    *******************2********************
    0    False
    1    False
    2    False
    3    False
    4     True
    dtype: bool
        key1   key2
    0   True  False
    1  False  False
    2  False  False
    3  False  False
    4  False   True
    5  False  False
    6   True  False
    7  False  False
    8  False  False
    
    

    相关文章

      网友评论

          本文标题:11-数值计算和统计基础

          本文链接:https://www.haomeiwen.com/subject/hubpbftx.html