美文网首页
pandas 和numpy库处理数据(2)

pandas 和numpy库处理数据(2)

作者: 2b17a8a8f301 | 来源:发表于2018-12-24 10:21 被阅读0次

    接上篇:对数据进行基本的统计分析(各统计量不再逐一解释,主要是均值、方差等的计算)

    >>> df
       num  class   name    sex  english  sport  army  math  possity  space
    0   10      1   mary  woman       80     80    90  75.0       60     65
    1   28      1   land    man       80     50    69  70.0       58     70
    2   15      2   asnx    man       80     69    80  75.0       90     94
    3   18      4  david    man       90     80    86  85.0       95     62
    4   19      2    gry  woman       90     50    64   NaN       64     85
    5   20      2  kitty  woman       84     58    97  94.0       63     21
    6   14      3   lury  woman       98     77    88   0.0       55     40
    7   21      1   facy    man       55     68    94  52.0       36     48
    >>> df['sport'].describe()
    count     8.000000
    mean     66.500000
    std      12.535663
    min      50.000000
    25%      56.000000
    50%      68.500000
    75%      77.750000
    max      80.000000
    Name: sport, dtype: float64
    >>> df.describe()
                 num     class    english    ...           math    possity      space
    count   8.000000  8.000000   8.000000    ...       7.000000   8.000000   8.000000
    mean   18.125000  2.000000  82.125000    ...      64.428571  65.125000  60.625000
    std     5.383507  1.069045  12.699128    ...      31.245571  19.067081  23.820384
    min    10.000000  1.000000  55.000000    ...       0.000000  36.000000  21.000000
    25%    14.750000  1.000000  80.000000    ...      61.000000  57.250000  46.000000
    50%    18.500000  2.000000  82.000000    ...      75.000000  61.500000  63.500000
    75%    20.250000  2.250000  90.000000    ...      80.000000  70.500000  73.750000
    max    28.000000  4.000000  98.000000    ...      94.000000  95.000000  94.000000
    
    [8 rows x 8 columns]
    >>> df['english'].size
    8
    >>> df['english'].max()
    98
    >>> df['english'].min()
    55
    >>> df['english'].sum()
    657
    >>> df['english'].mean()
    82.125
    >>> df['english'].var()
    161.26785714285714
    >>> df['english'].std()
    12.699128204048383
    >>> np.mean(df['english'])
    82.125
    >>> np.average(df['english'])
    82.125
    >>> df.median()
    num        18.5
    class       2.0
    english    82.0
    sport      68.5
    army       87.0
    math       75.0
    possity    61.5
    space      63.5
    dtype: float64
    >>> df.mode()
       num  class   name    sex  english  sport  army  math  possity  space
    0   10    1.0   asnx    man     80.0   50.0    64  75.0       36     21
    1   14    2.0  david  woman      NaN   80.0    69   NaN       55     40
    2   15    NaN   facy    NaN      NaN    NaN    80   NaN       58     48
    3   18    NaN    gry    NaN      NaN    NaN    86   NaN       60     62
    4   19    NaN  kitty    NaN      NaN    NaN    88   NaN       63     65
    5   20    NaN   land    NaN      NaN    NaN    90   NaN       64     70
    6   21    NaN   lury    NaN      NaN    NaN    94   NaN       90     85
    7   28    NaN   mary    NaN      NaN    NaN    97   NaN       95     94
    
    >>> df.mode()
       num  class   name    sex  english  sport  army  math  possity  space
    0   10    1.0   asnx    man     80.0   50.0    64  75.0       36     21
    1   14    2.0  david  woman      NaN   80.0    69   NaN       55     40
    2   15    NaN   facy    NaN      NaN    NaN    80   NaN       58     48
    3   18    NaN    gry    NaN      NaN    NaN    86   NaN       60     62
    4   19    NaN  kitty    NaN      NaN    NaN    88   NaN       63     65
    5   20    NaN   land    NaN      NaN    NaN    90   NaN       64     70
    6   21    NaN   lury    NaN      NaN    NaN    94   NaN       90     85
    7   28    NaN   mary    NaN      NaN    NaN    97   NaN       95     94
    >>> df
       num  class   name    sex  english  sport  army  math  possity  space
    0   10      1   mary  woman       80     80    90  75.0       60     65
    1   28      1   land    man       80     50    69  70.0       58     70
    2   15      2   asnx    man       80     69    80  75.0       90     94
    3   18      4  david    man       90     80    86  85.0       95     62
    4   19      2    gry  woman       90     50    64   NaN       64     85
    5   20      2  kitty  woman       84     58    97  94.0       63     21
    6   14      3   lury  woman       98     77    88   0.0       55     40
    7   21      1   facy    man       55     68    94  52.0       36     48
    >>> df.groupby('class')[['english','sport','army']].mean()
             english  sport       army
    class                             
    1      71.666667   66.0  84.333333
    2      84.666667   59.0  80.333333
    3      98.000000   77.0  88.000000
    4      90.000000   80.0  86.000000
    >>> df.groupby(['class','sex'])['english'].agg(['sum','size','mean','var']).rename(columns={'sum':'total','size':'number'})
    
                 total  number  mean    var
    class sex                              
    1     man      135       2  67.5  312.5
          woman     80       1  80.0    NaN
    2     man       80       1  80.0    NaN
          woman    174       2  87.0   18.0
    3     woman     98       1  98.0    NaN
    4     man       90       1  90.0    NaN
    >>> #建立透视表
    >>> df.pivot_table(index=['class','name'])
                 army  english  math  num  possity  space  sport
    class name                                                  
    1     facy     94       55  52.0   21       36     48     68
          land     69       80  70.0   28       58     70     50
          mary     90       80  75.0   10       60     65     80
    2     asnx     80       80  75.0   15       90     94     69
          gry      64       90   NaN   19       64     85     50
          kitty    97       84  94.0   20       63     21     58
    3     lury     88       98   0.0   14       55     40     77
    4     david    86       90  85.0   18       95     62     80
    >>> df
       num  class   name    sex  english  sport  army  math  possity  space
    0   10      1   mary  woman       80     80    90  75.0       60     65
    1   28      1   land    man       80     50    69  70.0       58     70
    2   15      2   asnx    man       80     69    80  75.0       90     94
    3   18      4  david    man       90     80    86  85.0       95     62
    4   19      2    gry  woman       90     50    64   NaN       64     85
    5   20      2  kitty  woman       84     58    97  94.0       63     21
    6   14      3   lury  woman       98     77    88   0.0       55     40
    7   21      1   facy    man       55     68    94  52.0       36     48
    >>> #相关系数
    >>> df['english'].corr(df['sport'])
    0.0785215353368861
    >>> df['english'].corr(df['army'])
    -0.28518424251841296
    >>> df.loc[:,['english','sport','army','math','possity','space']].corr()
              english     sport      army      math   possity     space
    english  1.000000  0.078522 -0.285184 -0.210888  0.486667  0.020484
    sport    0.078522  1.000000  0.604026 -0.275197  0.239372 -0.140894
    army    -0.285184  0.604026  1.000000 -0.010708 -0.191855 -0.744345
    math    -0.210888 -0.275197 -0.010708  1.000000  0.449533  0.180691
    possity  0.486667  0.239372 -0.191855  0.449533  1.000000  0.445185
    space    0.020484 -0.140894 -0.744345  0.180691  0.445185  1.000000
    
    

    相关文章

      网友评论

          本文标题:pandas 和numpy库处理数据(2)

          本文链接:https://www.haomeiwen.com/subject/pxsakqtx.html