美文网首页
DataFrame的groupby

DataFrame的groupby

作者: Chaweys | 来源:发表于2020-11-23 23:32 被阅读0次

    #coding=utf-8
    import pandas as pd
    import numpy as np
    
    df=pd.read_csv("groupby.csv")
    print(df)
    '''
      company  salary  age
    0       A      13   40
    1       A      49   18
    2       C      15   16
    3       B      40   49
    4       A      48   40
    5       A      12   27
    6       B      17   18
    7       B       5   39
    8       B      37   19
    9       B      15   18
    '''
    
    
    #df.groupby()按哪些字段进行组,返回一个DataFrameGroupBy对象
    print(df.groupby("company"))
    '''
    <pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001E63E723E48>
    '''
    
    
    #通过list()查看DataFrameGroupBy对象的内部情况
    print(list(df.groupby("company")))
    '''
    解释:groupby的过程就是将原有的DataFrame按照groupby的字段(这里是company),划分为若干个分组DataFrame对象
    
    [
    ('A',   
          company  salary  age
    0       A          13   40
    1       A          49   18
    4       A          48   40
    5       A          12   27),
    ('B',   
           company  salary  age
    3       B          40   49
    6       B          17   18
    7       B           5   39
    8       B          37   19
    9       B          15   18), 
    ('C',   
           company  salary  age
    2       C          15   16)
    ]
    '''
    
    
    #get_group()查看分组之后其中某个分组DataFrame
    g=df.groupby("company")
    print(g.get_group("A"))
    '''
      company  salary  age
    0       A      13   40
    1       A      49   18
    4       A      48   40
    5       A      12   27
    '''
    
    
    #通过groupby分组之后,可用通过agg()函数指定聚合操作
    '''
    聚合函数:
    min     最小值
    max     最大值
    sum     求和
    mean    求平均值
    median  求中位数
    count   计数
    nunique 计算去重后个数(类似sql的 count distinct)
    '''
    
    #计算不同公司员工的平均年龄和平均薪资
    g=df.groupby("company")
    print(g.agg({"salary":"mean","age":"mean"}))
    print(g.agg("mean"))
    '''
               age  salary
    company               
    A        31.25    30.5
    B        28.60    22.8
    C        16.00    15.0
             salary    age
    company               
    A          30.5  31.25
    B          22.8  28.60
    C          15.0  16.00
    '''
    
    
    #指定列,行进行操作
    #计算不同公司员工的平均年龄以及薪水的中位数
    g=df.groupby("company")
    print(g.agg({"salary":"median","age":"mean"}))
    '''
               age  salary
    company               
    A        31.25    30.5
    B        28.60    17.0
    C        16.00    15.0
    '''
    
    
    #计算不同公司员工的平均年龄以及薪水的中位数,均值,总人数
    g=df.groupby("company")
    print(g.agg({"age":"mean","salary":["median","mean","count"]}))
    '''
            salary                age
            median  mean count   mean
    company                          
    A         30.5  30.5     4  31.25
    B         17.0  22.8     5  28.60
    C         15.0  15.0     1  16.00
    '''
    
    
    
    '''
    count:计数,不包含NaN的值
    size: 计数,包含NaN的值
    nuinque: 计算去重后的个数(类似sql count distinct)
    '''
    df=pd.DataFrame({"Name":["Alice","Bob","Mallory","Mallory","Bob","Mallory"],
                     "City":["Seattle","Seattle","Portland","Seattle","Seattle","Portland"],
                     "Val":[4,3,3,np.nan,np.nan,4]})
    
    print(df)
    '''
           City     Name  Val
    0   Seattle    Alice  4.0
    1   Seattle      Bob  3.0
    2  Portland  Mallory  3.0
    3   Seattle  Mallory  NaN
    4   Seattle      Bob  NaN
    5  Portland  Mallory  4.0
    '''
    
    g=df.groupby("City")
    print(g.agg({"Val":["count","size"]}))
    '''
    由结果得知:Seattle的count=2,size=4,因为size统计了两个NaN的记录
               Val     
             count size
    City               
    Portland     2    2
    Seattle      2    4
    '''
    
    
    print(g.agg({"Val":["count","size"],"Name":"nunique"}))
    '''
    由结果得知:Seattle分组之后,去重的姓名有三个;Portland分组之后,去重的姓名只有一个
                Val         Name
             count size  nunique
    City                       
    Portland     2    2       1
    Seattle      2    4       3
    '''
    

    相关文章

      网友评论

          本文标题:DataFrame的groupby

          本文链接:https://www.haomeiwen.com/subject/lyfeiktx.html