美文网首页
9. 日月光华 Python数据分析-Pandas-数据清理

9. 日月光华 Python数据分析-Pandas-数据清理

作者: 薛东弗斯 | 来源:发表于2023-07-09 06:22 被阅读0次

    重复值处理

    import pandas as pd
    import numpy as np
    
    # Build a small frame in which rows 1 and 4 repeat the row before them.
    data = pd.DataFrame({'k1': ['one'] * 3 + ['two'] * 2, 'k2': [1, 1, 2, 3, 3]})
    data
    #     k1  k2
    # 0  one   1
    # 1  one   1
    # 2  one   2
    # 3  two   3
    # 4  two   3

    data.duplicated()   # True marks a row that repeats an earlier row
    # 0    False
    # 1     True
    # 2    False
    # 3    False
    # 4     True
    # dtype: bool

    data.duplicated().sum()   # number of duplicated rows
    # 2

    data.drop_duplicates()   # drop duplicated rows, returning a new frame
    #     k1  k2
    # 0  one   1
    # 2  one   2
    # 3  two   3

    data.drop_duplicates(subset='k1')   # only column k1 is compared
    #     k1  k2
    # 0  one   1
    # 3  two   3

    data['k3'] = 1
    data
    #     k1  k2  k3
    # 0  one   1   1
    # 1  one   1   1
    # 2  one   2   1
    # 3  two   3   1
    # 4  two   3   1

    data.duplicated(['k1', 'k3'])   # compare on columns k1 and k3 only
    # 0    False
    # 1     True
    # 2     True
    # 3    False
    # 4     True
    # dtype: bool

    # inplace=True mutates the object it is called on, so demonstrate it on a
    # copy; `data` itself keeps all five rows.
    deduped = data.copy()
    deduped.drop_duplicates(['k1', 'k3'], inplace=True)   # keep='first' by default: the first row of each duplicate group is kept
    deduped
    #     k1  k2  k3
    # 0  one   1   1
    # 3  two   3   1
    

    数值替换

    # Start from the full five-row frame so the outputs shown below are reproducible.
    data = pd.DataFrame({'k1': ['one'] * 3 + ['two'] * 2, 'k2': [1, 1, 2, 3, 3]})
    data['k3'] = 1
    data
    #     k1  k2  k3
    # 0  one   1   1
    # 1  one   1   1
    # 2  one   2   1
    # 3  two   3   1
    # 4  two   3   1

    data['k1'].replace('two', 'three')   # returns a new Series; `data` is left unchanged
    # 0      one
    # 1      one
    # 2      one
    # 3    three
    # 4    three
    # Name: k1, dtype: object

    data.replace(1, 100, inplace=True)   # replaces the value 1 in every column, modifying `data` itself
    data
    #     k1   k2   k3
    # 0  one  100  100
    # 1  one  100  100
    # 2  one    2  100
    # 3  two    3  100
    # 4  two    3  100
    

    过滤缺失值

    data = pd.Series([2, np.nan, 4, np.nan, 8.5])   # one-dimensional Series with two missing values
    data
    # 0    2.0
    # 1    NaN
    # 2    4.0
    # 3    NaN
    # 4    8.5
    # dtype: float64

    data.isnull().sum()   # number of missing values
    # 2

    data.notnull().sum()   # number of non-missing values; notnull() alone returns a boolean mask, not a count
    # 3

    data[data.notnull()]   # boolean-mask selection of the non-missing entries
    # 0    2.0
    # 2    4.0
    # 4    8.5
    # dtype: float64

    data.dropna(inplace=True)
    data
    # 0    2.0
    # 2    4.0
    # 4    8.5
    # dtype: float64

    np.random.seed(0)   # fixed seed so the values shown below are the same on every run
    data = pd.DataFrame(np.random.randn(4, 3), index=list('abcd'), columns=['aa', 'bb', 'cc'])
    data.iloc[1:, :2] = np.nan
    data.iloc[1, 2] = np.nan
    data
    #          aa        bb        cc
    # a  1.764052  0.400157  0.978738
    # b       NaN       NaN       NaN
    # c       NaN       NaN -0.103219
    # d       NaN       NaN  1.454274

    data.isnull()
    #       aa     bb     cc
    # a  False  False  False
    # b   True   True   True
    # c   True   True  False
    # d   True   True  False

    data.dropna()   # a row is dropped as soon as it contains a single missing value
    #          aa        bb        cc
    # a  1.764052  0.400157  0.978738

    data.dropna(how='all')   # a row is dropped only when every value in it is missing
    #          aa        bb        cc
    # a  1.764052  0.400157  0.978738
    # c       NaN       NaN -0.103219
    # d       NaN       NaN  1.454274

    data   # dropna() returned new frames above; `data` itself is unchanged
    #          aa        bb        cc
    # a  1.764052  0.400157  0.978738
    # b       NaN       NaN       NaN
    # c       NaN       NaN -0.103219
    # d       NaN       NaN  1.454274

    data.iloc[0, 0] = np.nan   # column aa is now entirely missing
    data
    #    aa        bb        cc
    # a NaN  0.400157  0.978738
    # b NaN       NaN       NaN
    # c NaN       NaN -0.103219
    # d NaN       NaN  1.454274

    data.dropna(axis=1, how='all')   # axis=1 works on columns: drop only columns that are entirely NaN (the default axis=0 works on rows)
    #          bb        cc
    # a  0.400157  0.978738
    # b       NaN       NaN
    # c       NaN -0.103219
    # d       NaN  1.454274
    

    填充缺失值

    # Build the example frame from a fixed seed so this section runs and
    # reproduces the displayed values on its own.
    np.random.seed(0)
    data = pd.DataFrame(np.random.randn(4, 3), index=list('abcd'), columns=['aa', 'bb', 'cc'])
    data.iloc[1:, :2] = np.nan
    data.iloc[1, 2] = np.nan
    data.iloc[0, 0] = np.nan
    data
    #    aa        bb        cc
    # a NaN  0.400157  0.978738
    # b NaN       NaN       NaN
    # c NaN       NaN -0.103219
    # d NaN       NaN  1.454274

    data.fillna({'aa': 1, 'bb': 100})   # per-column fill values; 'cc' is not listed, so its NaN stays
    #     aa          bb        cc
    # a  1.0    0.400157  0.978738
    # b  1.0  100.000000       NaN
    # c  1.0  100.000000 -0.103219
    # d  1.0  100.000000  1.454274

    data = pd.Series([2, np.nan, 4, np.nan, 8.5])
    data.fillna(data.mean())   # mean() skips NaN: (2 + 4 + 8.5) / 3
    # 0    2.000000
    # 1    4.833333
    # 2    4.000000
    # 3    4.833333
    # 4    8.500000
    # dtype: float64
    

    相关文章

      网友评论

          本文标题:9. 日月光华 Python数据分析-Pandas-数据清理

          本文链接:https://www.haomeiwen.com/subject/mvzdudtx.html