美文网首页数据科学和人工智能技术笔记程序员
数据科学和人工智能技术笔记 十九、数据整理(6)

数据科学和人工智能技术笔记 十九、数据整理(6)

作者: 布客飞龙 | 来源:发表于2019-01-01 22:08 被阅读21次

    十九、数据整理(6)

    作者:Chris Albon

    译者:飞龙

    协议:CC BY-NC-SA 4.0

    在列中搜索某个值

    # 导入模块
    import pandas as pd
    
    raw_data = {'first_name': ['Jason', 'Jason', 'Tina', 'Jake', 'Amy'], 
            'last_name': ['Miller', 'Miller', 'Ali', 'Milner', 'Cooze'], 
            'age': [42, 42, 36, 24, 73], 
            'preTestScore': [4, 4, 31, 2, 3],
            'postTestScore': [25, 25, 57, 62, 70]}
    df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
    df
    
    first_name last_name age preTestScore postTestScore
    0 Jason Miller 42 4 25
    1 Jason Miller 42 4 25
    2 Tina Ali 36 31 57
    3 Jake Milner 24 2 62
    4 Amy Cooze 73 3 70
    # 在列中寻找值在哪里
    # 保留 postTestScore 大于 50 的行的 preTestScore,其余位置为 NaN
    df['preTestScore'].where(df['postTestScore'] > 50)
    
    '''
    0     NaN
    1     NaN
    2    31.0
    3     2.0
    4     3.0
    Name: preTestScore, dtype: float64 
    '''
    

    选择包含特定值的行和列

    # 导入模块
    import pandas as pd
    
    # 设置 ipython 的最大行显示
    pd.set_option('display.max_row', 1000)
    
    # 设置 ipython 的最大显示列数
    pd.set_option('display.max_columns', 50)
    
    # 创建示例数据帧
    data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
            'year': [2012, 2012, 2013, 2014, 2014], 
            'reports': [4, 24, 31, 2, 3]}
    df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
    df
    
    name reports year
    Cochice Jason 4 2012
    Pima Molly 24 2012
    Santa Cruz Tina 31 2013
    Maricopa Jake 2 2014
    Yuma Amy 3 2014
    # 按照列值抓取行
    value_list = ['Tina', 'Molly', 'Jason']
    
    df[df.name.isin(value_list)]
    
    name reports year
    Cochice Jason 4 2012
    Pima Molly 24 2012
    Santa Cruz Tina 31 2013
    # 获取列值不是某个值的行
    df[~df.name.isin(value_list)]
    
    name reports year
    Maricopa Jake 2 2014
    Yuma Amy 3 2014

    选择具有特定值的行

    import pandas as pd
    
    # 创建示例数据帧
    data = {'name': ['Jason', 'Molly'], 
            'country': [['Syria', 'Lebanon'],['Spain', 'Morocco']]}
    df = pd.DataFrame(data)
    df
    
    country name
    0 [Syria, Lebanon] Jason
    1 [Spain, Morocco] Molly
    df[df['country'].map(lambda country: 'Syria' in country)]
    
    country name
    0 [Syria, Lebanon] Jason

    使用多个过滤器选择行

    import pandas as pd
    
    # 创建示例数据帧
    data = {'name': ['A', 'B', 'C', 'D', 'E'], 
            'score': [1,2,3,4,5]}
    df = pd.DataFrame(data)
    df
    
    name score
    0 A 1
    1 B 2
    2 C 3
    3 D 4
    4 E 5
    # 选择数据帧的行,其中 df.score 大于 1 且小于 5
    df[(df['score'] > 1) & (df['score'] < 5)]
    
    name score
    1 B 2
    2 C 3
    3 D 4

    根据条件选择数据帧的行

    # 导入模块
    import pandas as pd
    import numpy as np
    
    # 创建数据帧
    raw_data = {'first_name': ['Jason', 'Molly', np.nan, np.nan, np.nan], 
            'nationality': ['USA', 'USA', 'France', 'UK', 'UK'], 
            'age': [42, 52, 36, 24, 70]}
    df = pd.DataFrame(raw_data, columns = ['first_name', 'nationality', 'age'])
    df
    
    first_name nationality age
    0 Jason USA 42
    1 Molly USA 52
    2 NaN France 36
    3 NaN UK 24
    4 NaN UK 70
    # 方法 1:使用布尔变量
    # 如果国籍是美国,则变量为 TRUE
    american = df['nationality'] == "USA"
    
    # 如果年龄大于 50,则变量为 TRUE
    elderly = df['age'] > 50
    
    # 选择所有国籍为美国且年龄大于 50 的案例
    df[american & elderly]
    
    first_name nationality age
    1 Molly USA 52
    # 方法 2:使用变量属性
    # 选择所有不缺少名字且国籍为美国的案例
    df[df['first_name'].notnull() & (df['nationality'] == "USA")]
    
    first_name nationality age
    0 Jason USA 42
    1 Molly USA 52

    数据帧简单示例

    # 导入模块
    import pandas as pd
    
    raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
            'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'], 
            'age': [42, 52, 36, 24, 73], 
            'preTestScore': [4, 24, 31, 2, 3],
            'postTestScore': [25, 94, 57, 62, 70]}
    df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
    df
    
    first_name last_name age preTestScore postTestScore
    0 Jason Miller 42 4 25
    1 Molly Jacobson 52 24 94
    2 Tina Ali 36 31 57
    3 Jake Milner 24 2 62
    4 Amy Cooze 73 3 70
    # 创建第二个数据帧
    raw_data_2 = {'first_name': ['Sarah', 'Gueniva', 'Know', 'Sara', 'Cat'], 
            'last_name': ['Mornig', 'Jaker', 'Alom', 'Ormon', 'Koozer'], 
            'age': [53, 26, 72, 73, 24], 
            'preTestScore': [13, 52, 72, 26, 26],
            'postTestScore': [82, 52, 56, 234, 254]}
    df_2 = pd.DataFrame(raw_data_2, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
    df_2
    
    first_name last_name age preTestScore postTestScore
    0 Sarah Mornig 53 13 82
    1 Gueniva Jaker 26 52 52
    2 Know Alom 72 72 56
    3 Sara Ormon 73 26 234
    4 Cat Koozer 24 26 254
    # 创建第三个数据帧
    raw_data_3 = {'first_name': ['Sarah', 'Gueniva', 'Know', 'Sara', 'Cat'], 
            'last_name': ['Mornig', 'Jaker', 'Alom', 'Ormon', 'Koozer'],
             'postTestScore_2': [82, 52, 56, 234, 254]}
    df_3 = pd.DataFrame(raw_data_3, columns = ['first_name', 'last_name', 'postTestScore_2'])
    df_3
    
    first_name last_name postTestScore_2
    0 Sarah Mornig 82
    1 Gueniva Jaker 52
    2 Know Alom 56
    3 Sara Ormon 234
    4 Cat Koozer 254

    排序数据帧的行

    # 导入模块
    import pandas as pd
    
    data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
            'year': [2012, 2012, 2013, 2014, 2014], 
            'reports': [1, 2, 1, 2, 3],
            'coverage': [2, 2, 3, 3, 3]}
    df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
    df
    
    coverage name reports year
    Cochice 2 Jason 1 2012
    Pima 2 Molly 2 2012
    Santa Cruz 3 Tina 1 2013
    Maricopa 3 Jake 2 2014
    Yuma 3 Amy 3 2014
    # 按报告对数据框的行降序排序
    df.sort_values(by='reports', ascending=0)
    
    coverage name reports year
    Yuma 3 Amy 3 2014
    Pima 2 Molly 2 2012
    Maricopa 3 Jake 2 2014
    Cochice 2 Jason 1 2012
    Santa Cruz 3 Tina 1 2013
    # 按 coverage 然后是报告对数据帧的行升序排序
    df.sort_values(by=['coverage', 'reports'])
    
    coverage name reports year
    Cochice 2 Jason 1 2012
    Pima 2 Molly 2 2012
    Santa Cruz 3 Tina 1 2013
    Maricopa 3 Jake 2 2014
    Yuma 3 Amy 3 2014

    将经纬度坐标变量拆分为单独的变量

    import pandas as pd
    import numpy as np
    
    raw_data = {'geo': ['40.0024, -105.4102', '40.0068, -105.266', '39.9318, -105.2813', np.nan]}
    df = pd.DataFrame(raw_data, columns = ['geo'])
    df
    
    geo
    0 40.0024, -105.4102
    1 40.0068, -105.266
    2 39.9318, -105.2813
    3 NaN
    --- ---
    # 为要放置的循环结果创建两个列表
    # Accumulators for the parsed coordinates, one entry per row of df['geo']
    lat = []
    lon = []

    # Walk every raw "latitude, longitude" string in the column
    for row in df['geo']:
        try:
            # Parse both halves BEFORE appending anything, so a malformed row
            # can never leave lat and lon with different lengths
            lat_text, lon_text = row.split(',')[:2]
            lat_value, lon_value = float(lat_text), float(lon_text)
        except (AttributeError, ValueError):
            # AttributeError: the cell is missing (NaN is a float, no .split)
            # ValueError: no comma in the row, or a field that is not numeric
            # np.nan is the canonical spelling of the missing-value constant
            lat_value = lon_value = np.nan
        lat.append(lat_value)
        lon.append(lon_value)

    # Attach the parsed coordinates as two new float columns
    df['latitude'] = lat
    df['longitude'] = lon
    
    df
    
    geo latitude longitude
    0 40.0024, -105.4102 40.0024 -105.4102
    1 40.0068, -105.266 40.0068 -105.266
    2 39.9318, -105.2813 39.9318 -105.2813
    3 NaN NaN NaN

    数据流水线

    # 创建一些原始数据
    raw_data = [1,2,3,4,5,6,7,8,9,10]
    
    # 定义产生 input+6 的生成器
    def add_6(numbers):
        """Lazily yield every item of ``numbers`` increased by 6."""
        yield from (value + 6 for value in numbers)
    
    # 定义产生 input-2 的生成器
    def subtract_2(numbers):
        """Lazily yield every item of ``numbers`` decreased by 2."""
        yield from (value - 2 for value in numbers)
    
    # 定义产生 input*100 的生成器
    def multiply_by_100(numbers):
        """Lazily yield every item of ``numbers`` multiplied by 100."""
        yield from (value * 100 for value in numbers)
    
    # 流水线的第一步
    step1 = add_6(raw_data)
    
    # 流水线的第二步
    step2 = subtract_2(step1)
    
    # 流水线的第三步
    pipeline = multiply_by_100(step2)
    
    # 原始数据的第一个元素
    next(pipeline)
    
    # 500 
    
    # 原始数据的第二个元素
    next(pipeline)
    
    # 600 
    
    # 处理所有数据
    # Use a dedicated loop variable: reusing `raw_data` here would rebind the
    # input list's name to the last result, so re-running the pipeline fails
    for result in pipeline:
        print(result)
    
    '''
    700
    800
    900
    1000
    1100
    1200
    1300
    1400
    '''
    

    数据帧中的字符串整理

    # 导入模块
    import pandas as pd
    import numpy as np
    import re as re
    
    # Sample records; the third e-mail is deliberately missing (NaN) so the
    # string methods below can be seen propagating missing values
    raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
            'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
            'email': ['jas203@gmail.com', 'momomolly@gmail.com', np.nan, 'battler@milner.com', 'Ames1234@yahoo.com'],
            'preTestScore': [4, 24, 31, 2, 3],
            'postTestScore': [25, 94, 57, 62, 70]}
    # Passing `columns` fixes the column order of the resulting frame
    df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'email', 'preTestScore', 'postTestScore'])
    df
    
    first_name last_name email preTestScore postTestScore
    0 Jason Miller jas203@gmail.com 4 25
    1 Molly Jacobson momomolly@gmail.com 24 94
    2 Tina Ali NaN 31 57
    3 Jake Milner battler@milner.com 2 62
    4 Amy Cooze Ames1234@yahoo.com 3 70
    # 电子邮件列中的哪些字符串包含 'gmail'
    df['email'].str.contains('gmail')
    
    '''
    0     True
    1     True
    2      NaN
    3    False
    4    False
    Name: email, dtype: object 
    '''
    
    pattern = '([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'
    
    df['email'].str.findall(pattern, flags=re.IGNORECASE)
    
    '''
    0       [(jas203, gmail, com)]
    1    [(momomolly, gmail, com)]
    2                          NaN
    3     [(battler, milner, com)]
    4     [(Ames1234, yahoo, com)]
    Name: email, dtype: object 
    '''
    
    # Take the first (user, domain, tld) tuple found in each address.
    # `str.match` only reports True/False per row, which would make the later
    # `matches.str[1]` lookup fail; missing e-mails stay NaN here.
    matches = df['email'].str.findall(pattern, flags=re.IGNORECASE).str[0]
    matches
    
    '''
    /Users/chrisralbon/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: In future versions of pandas, match will change to always return a bool indexer.
      if __name__ == '__main__':
    
    0       (jas203, gmail, com)
    1    (momomolly, gmail, com)
    2                        NaN
    3     (battler, milner, com)
    4     (Ames1234, yahoo, com)
    Name: email, dtype: object 
    '''
    
    matches.str[1]
    
    '''
    0     gmail
    1     gmail
    2       NaN
    3    milner
    4     yahoo
    Name: email, dtype: object 
    '''
    

    和 Pandas 一起使用列表推导式

    # 导入模块
    import pandas as pd
    
    # 设置 ipython 的最大行显示
    pd.set_option('display.max_row', 1000)
    
    # 设置 ipython 的最大显示列数
    pd.set_option('display.max_columns', 50)
    
    data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
            'year': [2012, 2012, 2013, 2014, 2014], 
            'reports': [4, 24, 31, 2, 3]}
    df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
    df
    
    name reports year
    Cochice Jason 4 2012
    Pima Molly 24 2012
    Santa Cruz Tina 31 2013
    Maricopa Jake 2 2014
    Yuma Amy 3 2014

    作为循环的列表推导式。

    # 创建变量
    next_year = []
    
    # 对于 df.year 的每一行
    for row in df['year']:
        # 为这一行添加 1 并将其附加到 next_year
        next_year.append(row + 1)
    
    # 创建 df.next_year
    df['next_year'] = next_year
    
    # 查看数据帧
    df
    
    name reports year next_year
    Cochice Jason 4 2012 2013
    Pima Molly 24 2012 2013
    Santa Cruz Tina 31 2013 2014
    Maricopa Jake 2 2014 2015
    Yuma Amy 3 2014 2015

    作为列表推导式。

    # 对于 df.year 中的每一行,从行中减去 1
    df['previous_year'] = [row-1 for row in df['year']]
    
    df
    
    name reports year next_year previous_year
    Cochice Jason 4 2012 2013 2011
    Pima Molly 24 2012 2013 2011
    Santa Cruz Tina 31 2013 2014 2012
    Maricopa Jake 2 2014 2015 2013
    Yuma Amy 3 2014 2015 2013

    使用 Seaborn 来可视化数据帧

    import pandas as pd
    %matplotlib inline
    import random
    import matplotlib.pyplot as plt
    import seaborn as sns
    
    df = pd.DataFrame()
    
    df['x'] = random.sample(range(1, 100), 25)
    df['y'] = random.sample(range(1, 100), 25)
    
    df.head()
    
    x y
    0 18 25
    1 42 67
    2 52 77
    3 4 34
    4 14 69
    # 散点图
    # Pass the column names by keyword: recent seaborn releases reject
    # positional x/y, while the keyword form works on old and new versions
    sns.lmplot(x='x', y='y', data=df, fit_reg=False)
    
    # <seaborn.axisgrid.FacetGrid at 0x114563b00> 
    
    png
    # 密度图
    sns.kdeplot(df.y)
    
    # <matplotlib.axes._subplots.AxesSubplot at 0x113ea2ef0> 
    
    png
    sns.kdeplot(df.y, df.x)
    
    # <matplotlib.axes._subplots.AxesSubplot at 0x113d7fef0> 
    
    png
    sns.distplot(df.x)
    
    # <matplotlib.axes._subplots.AxesSubplot at 0x114294160> 
    
    png
    # 直方图
    plt.hist(df.x, alpha=.3)
    sns.rugplot(df.x);
    
    png
    # 箱形图
    sns.boxplot([df.y, df.x])
    
    # <matplotlib.axes._subplots.AxesSubplot at 0x1142b8b38> 
    
    png
    # 提琴图
    sns.violinplot([df.y, df.x])
    
    # <matplotlib.axes._subplots.AxesSubplot at 0x114444a58> 
    
    png
    # 热力图
    sns.heatmap([df.y, df.x], annot=True, fmt="d")
    
    # <matplotlib.axes._subplots.AxesSubplot at 0x114530c88> 
    
    png
    # 聚类图
    sns.clustermap(df)
    
    # <seaborn.matrix.ClusterGrid at 0x116f313c8> 
    
    png

    Pandas 数据结构

    # 导入模块
    import pandas as pd
    

    序列 101

    序列是一维数组(类似 R 的向量)。

    # 创建 floodingReports 数量的序列
    floodingReports = pd.Series([5, 6, 2, 9, 12])
    floodingReports
    
    '''
    0     5
    1     6
    2     2
    3     9
    4    12
    dtype: int64 
    '''
    

    请注意,第一列数字(0 到 4)是索引。

    # 将县名设置为 floodingReports 序列的索引
    floodingReports = pd.Series([5, 6, 2, 9, 12], index=['Cochise County', 'Pima County', 'Santa Cruz County', 'Maricopa County', 'Yuma County'])
    floodingReports
    
    '''
    Cochise County        5
    Pima County           6
    Santa Cruz County     2
    Maricopa County       9
    Yuma County          12
    dtype: int64 
    '''
    
    floodingReports['Cochise County']
    
    # 5 
    
    floodingReports[floodingReports > 6]
    
    '''
    Maricopa County     9
    Yuma County        12
    dtype: int64 
    '''
    

    从字典中创建 Pandas 序列。

    注意:执行此操作时,字典的键将成为序列索引。

    # 创建字典
    fireReports_dict = {'Cochise County': 12, 'Pima County': 342, 'Santa Cruz County': 13, 'Maricopa County': 42, 'Yuma County' : 52}
    
    # 将字典转换为 pd.Series,然后查看它
    fireReports = pd.Series(fireReports_dict); fireReports
    
    '''
    Cochise County        12
    Maricopa County       42
    Pima County          342
    Santa Cruz County     13
    Yuma County           52
    dtype: int64 
    '''
    
    fireReports.index = ["Cochice", "Pima", "Santa Cruz", "Maricopa", "Yuma"]
    fireReports
    
    '''
    Cochice        12
    Pima           42
    Santa Cruz    342
    Maricopa       13
    Yuma           52
    dtype: int64 
    '''
    

    数据帧 101

    数据帧就像 R 的数据帧。

    # 从等长列表或 NumPy 数组的字典中创建数据帧
    data = {'county': ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'], 
            'year': [2012, 2012, 2013, 2014, 2014], 
            'reports': [4, 24, 31, 2, 3]}
    df = pd.DataFrame(data)
    df
    
    county reports year
    0 Cochice 4 2012
    1 Pima 24 2012
    2 Santa Cruz 31 2013
    3 Maricopa 2 2014
    4 Yuma 3 2014
    # 使用 columns 属性设置列的顺序
    dfColumnOrdered = pd.DataFrame(data, columns=['county', 'year', 'reports'])
    dfColumnOrdered
    
    county year reports
    0 Cochice 2012 4
    1 Pima 2012 24
    2 Santa Cruz 2013 31
    3 Maricopa 2014 2
    4 Yuma 2014 3
    # 添加一列
    dfColumnOrdered['newsCoverage'] = pd.Series([42.3, 92.1, 12.2, 39.3, 30.2])
    dfColumnOrdered
    
    county year reports newsCoverage
    0 Cochice 2012 4 42.3
    1 Pima 2012 24 92.1
    2 Santa Cruz 2013 31 12.2
    3 Maricopa 2014 2 39.3
    4 Yuma 2014 3 30.2
    # 删除一列
    del dfColumnOrdered['newsCoverage']
    dfColumnOrdered
    
    county year reports
    0 Cochice 2012 4
    1 Pima 2012 24
    2 Santa Cruz 2013 31
    3 Maricopa 2014 2
    4 Yuma 2014 3
    # 转置数据帧
    dfColumnOrdered.T
    
    0 1 2 3 4
    county Cochice Pima Santa Cruz Maricopa Yuma
    year 2012 2012 2013 2014 2014
    reports 4 24 31 2 3

    Pandas 时间序列基础

    # 导入模块
    from datetime import datetime
    import pandas as pd
    %matplotlib inline
    import matplotlib.pyplot as pyplot
    
    data = {'date': ['2014-05-01 18:47:05.069722', '2014-05-01 18:47:05.119994', '2014-05-02 18:47:05.178768', '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.280592', '2014-05-03 18:47:05.332662', '2014-05-03 18:47:05.385109', '2014-05-04 18:47:05.436523', '2014-05-04 18:47:05.486877'], 
            'battle_deaths': [34, 25, 26, 15, 15, 14, 26, 25, 62, 41]}
    df = pd.DataFrame(data, columns = ['date', 'battle_deaths'])
    print(df)
    
    '''
     date  battle_deaths
    0  2014-05-01 18:47:05.069722             34
    1  2014-05-01 18:47:05.119994             25
    2  2014-05-02 18:47:05.178768             26
    3  2014-05-02 18:47:05.230071             15
    4  2014-05-02 18:47:05.230071             15
    5  2014-05-02 18:47:05.280592             14
    6  2014-05-03 18:47:05.332662             26
    7  2014-05-03 18:47:05.385109             25
    8  2014-05-04 18:47:05.436523             62
    9  2014-05-04 18:47:05.486877             41 
    '''
    
    df['date'] = pd.to_datetime(df['date'])
    
    df.index = df['date']
    del df['date']
    df
    
    battle_deaths
    date
    2014-05-01 18:47:05.069722 34
    2014-05-01 18:47:05.119994 25
    2014-05-02 18:47:05.178768 26
    2014-05-02 18:47:05.230071 15
    2014-05-02 18:47:05.230071 15
    2014-05-02 18:47:05.280592 14
    2014-05-03 18:47:05.332662 26
    2014-05-03 18:47:05.385109 25
    2014-05-04 18:47:05.436523 62
    2014-05-04 18:47:05.486877 41
    # 查看 2014 年的所有观测
    # Select rows through .loc: current pandas treats df['2014'] as a column
    # lookup and raises KeyError instead of matching the DatetimeIndex
    df.loc['2014']
    
    battle_deaths
    date
    2014-05-01 18:47:05.069722 34
    2014-05-01 18:47:05.119994 25
    2014-05-02 18:47:05.178768 26
    2014-05-02 18:47:05.230071 15
    2014-05-02 18:47:05.230071 15
    2014-05-02 18:47:05.280592 14
    2014-05-03 18:47:05.332662 26
    2014-05-03 18:47:05.385109 25
    2014-05-04 18:47:05.436523 62
    2014-05-04 18:47:05.486877 41
    # 查看 2014 年 5 月的所有观测
    # Select rows through .loc: current pandas treats df['2014-05'] as a
    # column lookup and raises KeyError instead of matching the DatetimeIndex
    df.loc['2014-05']
    
    battle_deaths
    date
    2014-05-01 18:47:05.069722 34
    2014-05-01 18:47:05.119994 25
    2014-05-02 18:47:05.178768 26
    2014-05-02 18:47:05.230071 15
    2014-05-02 18:47:05.230071 15
    2014-05-02 18:47:05.280592 14
    2014-05-03 18:47:05.332662 26
    2014-05-03 18:47:05.385109 25
    2014-05-04 18:47:05.436523 62
    2014-05-04 18:47:05.486877 41
    # 查看 2014.5.3 及之后的所有观测
    df[datetime(2014, 5, 3):]
    
    battle_deaths
    date
    2014-05-03 18:47:05.332662 26
    2014-05-03 18:47:05.385109 25
    2014-05-04 18:47:05.436523 62
    2014-05-04 18:47:05.486877 41

    2014 年 5 月 3 日至 5 月 4 日之间的观测

    # 查看 2014.5.3~4 的所有观测
    df['5/3/2014':'5/4/2014']
    
    battle_deaths
    date
    2014-05-03 18:47:05.332662 26
    2014-05-03 18:47:05.385109 25
    2014-05-04 18:47:05.436523 62
    2014-05-04 18:47:05.486877 41
    # 截断 2014.5.2 之后的观测
    df.truncate(after='5/3/2014')
    
    battle_deaths
    date
    2014-05-01 18:47:05.069722 34
    2014-05-01 18:47:05.119994 25
    2014-05-02 18:47:05.178768 26
    2014-05-02 18:47:05.230071 15
    2014-05-02 18:47:05.230071 15
    2014-05-02 18:47:05.280592 14
    # 2014.5 的观测
    # Select rows through .loc: current pandas treats df['5-2014'] as a
    # column lookup and raises KeyError instead of matching the DatetimeIndex
    df.loc['5-2014']
    
    battle_deaths
    date
    2014-05-01 18:47:05.069722 34
    2014-05-01 18:47:05.119994 25
    2014-05-02 18:47:05.178768 26
    2014-05-02 18:47:05.230071 15
    2014-05-02 18:47:05.230071 15
    2014-05-02 18:47:05.280592 14
    2014-05-03 18:47:05.332662 26
    2014-05-03 18:47:05.385109 25
    2014-05-04 18:47:05.436523 62
    2014-05-04 18:47:05.486877 41
    # 计算每个时间戳的观测数
    df.groupby(level=0).count()
    
    battle_deaths
    date
    2014-05-01 18:47:05.069722 1
    2014-05-01 18:47:05.119994 1
    2014-05-02 18:47:05.178768 1
    2014-05-02 18:47:05.230071 2
    2014-05-02 18:47:05.280592 1
    2014-05-03 18:47:05.332662 1
    2014-05-03 18:47:05.385109 1
    2014-05-04 18:47:05.436523 1
    2014-05-04 18:47:05.486877 1
    # 每天的 battle_deaths 均值
    df.resample('D').mean()
    
    battle_deaths
    date
    2014-05-01 29.5
    2014-05-02 17.5
    2014-05-03 25.5
    2014-05-04 51.5
    # 每天的 battle_deaths 总数
    df.resample('D').sum()
    
    battle_deaths
    date
    2014-05-01 59
    2014-05-02 70
    2014-05-03 51
    2014-05-04 103
    # 绘制每天的总死亡人数
    df.resample('D').sum().plot()
    
    # <matplotlib.axes._subplots.AxesSubplot at 0x11187a940> 
    
    png

    相关文章

      网友评论

        本文标题:数据科学和人工智能技术笔记 十九、数据整理(6)

        本文链接:https://www.haomeiwen.com/subject/uybolqtx.html