美文网首页
Pandas数据分析基础流程及操作

Pandas数据分析基础流程及操作

作者: 筱木杉 | 来源:发表于2020-04-23 21:00 被阅读0次

    Pandas基础操作

    准备:

    导入库¶

    import pandas as pd # 导入pandas库并简写为pd
    import numpy as np # 导入numpy库并简写为np
    

    一、数据导入

    pd.read_csv(filename) # 导入csv格式文件中的数据
    pd.read_table(filename) # 导入有分隔符的文本 (如TSV) 中的数据 
    pd.read_excel(filename) # 导入Excel格式文件中的数据 
    pd.read_sql(query, connection_object) # 导入SQL数据表/数据库中的数据 
    pd.read_json(json_string) # 导入JSON格式的字符串,URL地址或者文件中的数据 
    pd.read_html(url) # 导入经过解析的URL地址中包含的数据框 (DataFrame) 数据 
    pd.read_clipboard() # 导入系统粘贴板里面的数据 
    pd.DataFrame(dict) # 导入Python字典 (dict) 里面的数据,其中key是数据框的表头,value是数据框的内容。
    

    实际数据导入

    data = pd.read_csv('D:\\数据分析\\天池数据项目数据\\淘宝用户行为分析\\UserBehavior\\UserBehavior_test.csv')
    # D:\\数据分析\\天池数据项目数据\\淘宝用户行为分析\\UserBehavior\\UserBehavior_test.csv 为本地csv文件目录
    

    二、数据导出

    df.to_csv(filename) # 将数据框 (DataFrame)中的数据导出到csv格式的文件中 
    df.to_excel(filename) # 将数据框 (DataFrame)中的数据导出到Excel格式的文件中 
    df.to_sql(table_name,connection_object) # 将数据框 (DataFrame)中的数据导出到SQL数据表/数据库中 
    df.to_json(filename) # 将数据框 (DataFrame)中的数据导出到JSON格式的文件中
    

    三、数据结构及创建

    Series

    1、通过list创建

    from pandas import Series,DataFrame
    data0 = [1,2,3,4]
    s = Series(data0,index=['a','b','c','d'])
    print(s)
    
    a    1
    b    2
    c    3
    d    4
    dtype: int64
    

    2、通过数组创建

    s1 = np.array(['a','s','d','f','g'])
    print(s1)
    
    ['a' 's' 'd' 'f' 'g']
    

    3、通过字典创建

    data1 = {'a':1,'b':2,'c':3,'d':4}
    s2 = pd.Series(data1,index=['a','d','e','b'])
    print(s2)
    
    a    1.0
    d    4.0
    e    NaN
    b    2.0
    dtype: float64
    
    上述结果中索引顺序与传入的index参数保持一致,字典中缺少的元素使用NaN(Not a Number,非数字)填充

    DataFrame

    1、从列表创建DataFrame

    a、通过单个列表或列表字典创建
    data3 = [1,2,3,4,5,6]
    s3 = pd.DataFrame(data3)
    print(s3)
    
       0
    0  1
    1  2
    2  3
    3  4
    4  5
    5  6
    
    data4 = [['lijie',20],['zhangsan',45],['wangwu',30]]
    s4 = pd.DataFrame(data4,columns=['name','age'])
    print(s4)
    
           name  age
    0     lijie   20
    1  zhangsan   45
    2    wangwu   30
    
    b、通过字典列表创建
    data5 = [{'a':1,'b':2},{'c':5,'d':8,'e':10}]
    s5 = pd.DataFrame(data5)
    print(s5)
    
         a    b    c    d     e
    0  1.0  2.0  NaN  NaN   NaN
    1  NaN  NaN  5.0  8.0  10.0
    
    2、从ndarrays/Lists的字典来创建DataFrame
    data6 = {'name':['tom','lose','steve','bob'],'age':[12,21,30,14]}
    s6 = pd.DataFrame(data6)
    print(s6)
    
        name  age
    0    tom   12
    1   lose   21
    2  steve   30
    3    bob   14
    
    3、从系列字典创建DataFrame
    data7 = {'one':pd.Series([1,2,3],index = ['a','b','c']),
            'two':pd.Series([1,2,3,4],index = ['a','b','c','d'])}
    s7 = pd.DataFrame(data7)
    print(s7)
    
       one  two
    a  1.0    1
    b  2.0    2
    c  3.0    3
    d  NaN    4
    

    四、数据查看

    1、基础属性

    a、Series

    s8 = Series([1,2,3,4,5])
    print(s8)
    
    0    1
    1    2
    2    3
    3    4
    4    5
    dtype: int64
    
    s8.axes # 返回行轴标签列表
    
    [RangeIndex(start=0, stop=5, step=1)]
    
    s8.dtype # 返回对象的数据类型
    
    dtype('int64')
    
    s8.empty # 如果系列为空,则返回True
    
    False
    
    s8.ndim # 返回底层数据的维数
    
    1
    
    s8.size # 返回基础数据中的元素数
    
    5
    
    s8.values # 将系列作为ndarray返回
    array([1, 2, 3, 4, 5], dtype=int64)
    s8.head() # 返回前n行
    
    0    1
    1    2
    2    3
    3    4
    4    5
    dtype: int64
    
    s8.tail() # 返回最后n行
    
    0    1
    1    2
    2    3
    3    4
    4    5
    dtype: int64
    

    b、DataFrame

    data9 = {'one':pd.Series([1,2,3],index = ['a','b','c']),
            'two':pd.Series([1,2,3,4],index = ['a','b','c','d'])}
    s9 = pd.DataFrame(data9)
    print(s9)
    
       one  two
    a  1.0    1
    b  2.0    2
    c  3.0    3
    d  NaN    4
    
    s9.T # 转置行与列
    
    a   b   c   d
    one 1.0 2.0 3.0 NaN
    two 1.0 2.0 3.0 4.0
    
    s9.axes # 返回一个列,行轴标签和列轴标签作为唯一的成员。
    
    [Index(['a', 'b', 'c', 'd'], dtype='object'),
     Index(['one', 'two'], dtype='object')]
    
    s9.dtypes # 返回此对象中的数据类型(dtypes)
    
    one    float64
    two      int64
    dtype: object
    
    s9.empty # 如果NDFrame完全为空(无元素,即任一轴的长度为0),则返回True
    
    False
    
    s9.ndim # 轴/数组维度大小
    
    2
    
    s9.shape # 返回表示DataFrame的维度的元组
    
    (4, 2)
    
    s9.size # NDFrame中的元素数
    
    8
    
    s9.values # NDFrame的Numpy表示
    
    array([[ 1.,  1.],
           [ 2.,  2.],
           [ 3.,  3.],
           [nan,  4.]])
    
    s9.index # 查看索引
    
    Index(['a', 'b', 'c', 'd'], dtype='object')
    
    s9.columns # 查看列名
    
    Index(['one', 'two'], dtype='object')
    
    s9.info() # 查看数据框的索引、数据类型以及内存信息
    
    <class 'pandas.core.frame.DataFrame'>
    Index: 4 entries, a to d
    Data columns (total 2 columns):
     #   Column  Non-Null Count  Dtype  
    ---  ------  --------------  -----  
     0   one     3 non-null      float64
     1   two     4 non-null      int64  
    dtypes: float64(1), int64(1)
    memory usage: 256.0+ bytes
    

    2、描述统计

    data10 = {'Name':pd.Series(['Tom','James','Ricky','Vin','Steve','Minsu','Jack',
       'Lee','David','Gasper','Betina','Andres']),
       'Age':pd.Series([25,26,25,23,30,29,23,34,40,30,51,46]),
       'Rating':pd.Series([4.23,3.24,3.98,2.56,3.20,4.6,3.8,3.78,2.98,4.80,4.10,3.65])}
    s10 = pd.DataFrame(data10)
    print(s10)
    
          Name  Age  Rating
    0      Tom   25    4.23
    1    James   26    3.24
    2    Ricky   25    3.98
    3      Vin   23    2.56
    4    Steve   30    3.20
    5    Minsu   29    4.60
    6     Jack   23    3.80
    7      Lee   34    3.78
    8    David   40    2.98
    9   Gasper   30    4.80
    10  Betina   51    4.10
    11  Andres   46    3.65
    
    s10.sum() # 求和
    
    Name      TomJamesRickyVinSteveMinsuJackLeeDavidGasperBe...
    Age                                                     382
    Rating                                                44.92
    dtype: object
    
    s10.count() # 非空元素
    
    Name      12
    Age       12
    Rating    12
    dtype: int64
    
    s10.min() # 最小值
    
    Name      Andres
    Age           23
    Rating      2.56
    dtype: object
    
    s10.max() # 最大值
    
    Name      Vin
    Age        51
    Rating    4.8
    dtype: object
    
    s10.cumsum() # 累计求和
    
    Name    Age Rating
    0   Tom 25  4.23
    1   TomJames    51  7.47
    2   TomJamesRicky   76  11.45
    3   TomJamesRickyVin    99  14.01
    4   TomJamesRickyVinSteve   129 17.21
    5   TomJamesRickyVinSteveMinsu  158 21.81
    6   TomJamesRickyVinSteveMinsuJack  181 25.61
    7   TomJamesRickyVinSteveMinsuJackLee   215 29.39
    8   TomJamesRickyVinSteveMinsuJackLeeDavid  255 32.37
    9   TomJamesRickyVinSteveMinsuJackLeeDavidGasper    285 37.17
    10  TomJamesRickyVinSteveMinsuJackLeeDavidGasperBe...   336 41.27
    11  TomJamesRickyVinSteveMinsuJackLeeDavidGasperBe...   382 44.92
    
    s10['Age'].mean() # 年龄平均值
    
    31.833333333333332
    
    s10.describe() # 统计信息摘要
    
    Age Rating
    count   12.000000   12.000000
    mean    31.833333   3.743333
    std 9.232682    0.661628
    min 23.000000   2.560000
    25% 25.000000   3.230000
    50% 29.500000   3.790000
    75% 35.500000   4.132500
    max 51.000000   4.800000
    

    五、数据清洗

    1、重命名

    s.columns = ['a','b'] # 重命名数据框的列名称
    data.head(5) # data为最开始导入的csv数据
    
    1   2268318 2520377 pv  1511544070
    0   1   2333346.0   2520771.0   pv  1.511562e+09
    1   1   2576651.0   149192.0    pv  1.511573e+09
    2   1   2576651.0   149192.0    pv  1.511573e+09
    3   1   4365585.0   2520377.0   pv  1.511596e+09
    4   1   4606018.0   2735466.0   NaN 1.511616e+09
    
    data.columns = ["user_id","item_id","catagory_id","behavior_type","timestamp"]
    data.head(5)
    
    user_id item_id catagory_id behavior_type   timestamp
    0   1   2333346.0   2520771.0   pv  1.511562e+09
    1   1   2576651.0   149192.0    pv  1.511573e+09
    2   1   2576651.0   149192.0    pv  1.511573e+09
    3   1   4365585.0   2520377.0   pv  1.511596e+09
    4   1   4606018.0   2735466.0   NaN 1.511616e+09
    
    s.rename() # 重命名列/行,传入的函数或字典值必须是1对1的,没有包含在字典或者Series中的标签将保持原来的名称。字典中包含df中没有的标签,不会报错
    
    s11 = data.rename(columns={'user_id':'a','item_id':'b','catagory_id':'c','behavior_type':'d','timestamp':'e'})
    s11.head()
    
    a   b   c   d   e
    0   1   2333346.0   2520771.0   pv  1.511562e+09
    1   1   2576651.0   149192.0    pv  1.511573e+09
    2   1   2576651.0   149192.0    pv  1.511573e+09
    3   1   4365585.0   2520377.0   pv  1.511596e+09
    4   1   4606018.0   2735466.0   NaN 1.511616e+09
    
    s.set_index(),可以设置单索引和复合索引
    
    s12 = data.set_index('user_id') # 将user_id作为索引(单索引)
    s12
    
    item_id catagory_id behavior_type   timestamp
    user_id             
    1   2333346.0   2520771.0   pv  1.511562e+09
    1   2576651.0   149192.0    pv  1.511573e+09
    1   2576651.0   149192.0    pv  1.511573e+09
    1   4365585.0   2520377.0   pv  1.511596e+09
    1   4606018.0   2735466.0   NaN 1.511616e+09
    ... ... ... ... ...
    100 4182583.0   1258177.0   pv  1.511751e+09
    100 2337874.0   1194311.0   pv  1.511751e+09
    100 3658601.0   2342116.0   pv  1.511758e+09
    100 5153036.0   2342116.0   pv  1.511759e+09
    100 598929.0    2429887.0   buy 1.511759e+09
    100 rows × 4 columns
    
    s13 = data.set_index(['user_id','item_id']) # 将user_id和item_id共同作为索引(复合索引)
    s13
    
    catagory_id behavior_type   timestamp
    user_id item_id         
    1   2333346.0   2520771.0   pv  1.511562e+09
    2576651.0   149192.0    pv  1.511573e+09
    2576651.0   149192.0    pv  1.511573e+09
    4365585.0   2520377.0   pv  1.511596e+09
    4606018.0   2735466.0   NaN 1.511616e+09
    ... ... ... ... ...
    100 4182583.0   1258177.0   pv  1.511751e+09
    2337874.0   1194311.0   pv  1.511751e+09
    3658601.0   2342116.0   pv  1.511758e+09
    5153036.0   2342116.0   pv  1.511759e+09
    598929.0    2429887.0   buy 1.511759e+09
    100 rows × 3 columns
    

    2、重复值

    duplicated() 查找重复值

    data.duplicated()
    
    0     False
    1     False
    2     False
    3     False
    4     False
          ...  
    95    False
    96    False
    97    False
    98    False
    99    False
    Length: 100, dtype: bool
    

    drop_duplicates() 删除重复值

    data.drop_duplicates()
    
    user_id item_id catagory_id behavior_type   timestamp
    0   1   2333346.0   2520771.0   pv  1.511562e+09
    1   1   2576651.0   149192.0    pv  1.511573e+09
    2   1   2576651.0   149192.0    pv  1.511573e+09
    3   1   4365585.0   2520377.0   pv  1.511596e+09
    4   1   4606018.0   2735466.0   NaN 1.511616e+09
    ... ... ... ... ... ...
    95  100 4182583.0   1258177.0   pv  1.511751e+09
    96  100 2337874.0   1194311.0   pv  1.511751e+09
    97  100 3658601.0   2342116.0   pv  1.511758e+09
    98  100 5153036.0   2342116.0   pv  1.511759e+09
    99  100 598929.0    2429887.0   buy 1.511759e+09
    100 rows × 5 columns
    

    3、空值、缺失值

    data.isnull() # True为空,反之非空
    
    user_id item_id catagory_id behavior_type   timestamp
    0   False   False   False   False   False
    1   False   False   False   False   False
    2   False   False   False   False   False
    3   False   False   False   False   False
    4   False   False   False   True    False
    ... ... ... ... ... ...
    95  False   False   False   False   False
    96  False   False   False   False   False
    97  False   False   False   False   False
    98  False   False   False   False   False
    99  False   False   False   False   False
    100 rows × 5 columns
    
    data.isnull().values==True # 可用data.isnull().values==True来定位
    array([[False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False,  True, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False,  True],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False,  True, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False,  True, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False,  True],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False],
           [False, False, False, False, False]])
    
    data.notnull() # true为非空,反之为空
    
    user_id item_id catagory_id behavior_type   timestamp
    0   True    True    True    True    True
    1   True    True    True    True    True
    2   True    True    True    True    True
    3   True    True    True    True    True
    4   True    True    True    False   True
    ... ... ... ... ... ...
    95  True    True    True    True    True
    96  True    True    True    True    True
    97  True    True    True    True    True
    98  True    True    True    True    True
    99  True    True    True    True    True
    100 rows × 5 columns
    
    data.fillna(0) # 用0填充空值
    
    user_id item_id catagory_id behavior_type   timestamp
    0   1   2333346.0   2520771.0   pv  1.511562e+09
    1   1   2576651.0   149192.0    pv  1.511573e+09
    2   1   2576651.0   149192.0    pv  1.511573e+09
    3   1   4365585.0   2520377.0   pv  1.511596e+09
    4   1   4606018.0   2735466.0   0   1.511616e+09
    ... ... ... ... ... ...
    95  100 4182583.0   1258177.0   pv  1.511751e+09
    96  100 2337874.0   1194311.0   pv  1.511751e+09
    97  100 3658601.0   2342116.0   pv  1.511758e+09
    98  100 5153036.0   2342116.0   pv  1.511759e+09
    99  100 598929.0    2429887.0   buy 1.511759e+09
    100 rows × 5 columns
    
    data.dropna(how='any', inplace=True) # 删除 含空值的行
    

    4、字符处理

    空格

    data['behavior_type'].value_counts()
    
    pv     87
    fav     5
    buy     3
    Name: behavior_type, dtype: int64
    
    data.replace('p','pv')
    
    user_id item_id catagory_id behavior_type   timestamp
    0   1   2333346.0   2520771.0   pv  1.511562e+09
    1   1   2576651.0   149192.0    pv  1.511573e+09
    2   1   2576651.0   149192.0    pv  1.511573e+09
    3   1   4365585.0   2520377.0   pv  1.511596e+09
    5   1   230380.0    411153.0    pv  1.511645e+09
    ... ... ... ... ... ...
    95  100 4182583.0   1258177.0   pv  1.511751e+09
    96  100 2337874.0   1194311.0   pv  1.511751e+09
    97  100 3658601.0   2342116.0   pv  1.511758e+09
    98  100 5153036.0   2342116.0   pv  1.511759e+09
    99  100 598929.0    2429887.0   buy 1.511759e+09
    95 rows × 5 columns
    

    去除空格

    data['behavior_type'].map(str.strip) # 去除两端空格
    
    0      pv
    1      pv
    2      pv
    3      pv
    5      pv
         ... 
    95     pv
    96     pv
    97     pv
    98     pv
    99    buy
    Name: behavior_type, Length: 95, dtype: object
    
    data['behavior_type'].map(str.lstrip) # 去除左端空格
    
    0      pv
    1      pv
    2      pv
    3      pv
    5      pv
         ... 
    95     pv
    96     pv
    97     pv
    98     pv
    99    buy
    Name: behavior_type, Length: 95, dtype: object
    
    data['behavior_type'].map(str.rstrip) # 去除右端空格
    
    0      pv
    1      pv
    2      pv
    3      pv
    5      pv
         ... 
    95     pv
    96     pv
    97     pv
    98     pv
    99    buy
    Name: behavior_type, Length: 95, dtype: object
    

    大小写

    data['behavior_type'].map(str.upper) # 大写
    
    0      PV
    1      PV
    2      PV
    3      PV
    5      PV
         ... 
    95     PV
    96     PV
    97     PV
    98     PV
    99    BUY
    Name: behavior_type, Length: 95, dtype: object
    
    data['behavior_type'].map(str.lower) # 小写
    
    0      pv
    1      pv
    2      pv
    3      pv
    5      pv
         ... 
    95     pv
    96     pv
    97     pv
    98     pv
    99    buy
    Name: behavior_type, Length: 95, dtype: object
    
    data['behavior_type'].map(str.title) # 首字母大写
    
    0      Pv
    1      Pv
    2      Pv
    3      Pv
    5      Pv
         ... 
    95     Pv
    96     Pv
    97     Pv
    98     Pv
    99    Buy
    Name: behavior_type, Length: 95, dtype: object
    
    data['behavior_type'].value_counts()
    
    pv     87
    fav     5
    buy     3
    Name: behavior_type, dtype: int64
    
    # data.replace('Pv','PV') 替换
    

    5、异常和极端值

    发现方法:
    a、可通过describe()方法,观察统计的各项数据,发现异常值
    b、借助散点图、箱型图、正态分布图等进行异常值的发现
    处理办法
    a、删除——先将异常值替换为na,然后用dropna()删除
    b、视为缺失值——先将异常值替换为na,然后用缺失值处理方法处理(填充,插值等)
    

    6、更改数据格式

    data['timestamp'].dtype
    
    dtype('float64')
    
    data['timestamp'].astype('int32') # 将 'float64' 转化为 'int32'
    
    0     1511561733
    1     1511572885
    2     1511572886
    3     1511596146
    5     1511644942
             ...    
    95    1511750991
    96    1511751022
    97    1511757958
    98    1511758581
    99    1511759458
    Name: timestamp, Length: 95, dtype: int32
    
    data['date'] = pd.to_datetime(data['timestamp'])
    # 注意:此处timestamp为秒级时间戳,应指定 unit='s',即 pd.to_datetime(data['timestamp'], unit='s');
    # 若不指定,默认按纳秒解析,会得到如下1970年附近的错误日期
    
    data['date']
    
    0    1970-01-01 00:00:01.511561733
    1    1970-01-01 00:00:01.511572885
    2    1970-01-01 00:00:01.511572886
    3    1970-01-01 00:00:01.511596146
    5    1970-01-01 00:00:01.511644942
                      ...             
    95   1970-01-01 00:00:01.511750991
    96   1970-01-01 00:00:01.511751022
    97   1970-01-01 00:00:01.511757958
    98   1970-01-01 00:00:01.511758581
    99   1970-01-01 00:00:01.511759458
    Name: date, Length: 95, dtype: datetime64[ns]
    

    7、分列

    str.split(self,pat = None,n = -1,expand = False )
    
    pat str,可选:要分割的字符串或正则表达式。如果未指定,则按空白字符分割。
    n int,默认为-1(全部):限制输出的分割数。None、0和-1均被解释为返回所有拆分。
    expand bool,默认为False:是否将拆分的字符串展开为单独的列。如果为True,则返回DataFrame / MultiIndex扩展维;如果为False,则返回包含字符串列表的Series / Index。
    
    s14 = pd.Series(["this is a regular sentence","https://docs.python.org/3/tutorial/index.html",np.nan])
    s14
    
    0                       this is a regular sentence
    1    https://docs.python.org/3/tutorial/index.html
    2                                              NaN
    dtype: object
    
    s14.str.split()
    
    0                   [this, is, a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object
    
    s14.str.rsplit()
    
    0                   [this, is, a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object
    
    s14.str.split(n=2)
    
    0                     [this, is, a regular sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object
    
    s14.str.rsplit(n=2)
    
    0                     [this is a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object
    
    s14.str.rsplit(pat='/')
    0                         [this is a regular sentence]
    1    [https:, , docs.python.org, 3, tutorial, index...
    2                                                  NaN
    dtype: object
    
    s14.str.split(pat='/',expand=True)
    
    0   1   2   3   4   5
    0   this is a regular sentence  None    None    None    None    None
    1   https:      docs.python.org 3   tutorial    index.html
    2   NaN NaN NaN NaN NaN NaN
    

    六、数据预处理

    1、数据分组、分段

    groupby() 分组
    data.groupby(['behavior_type']).count() # 按照'behavior_type'对data进行分组并计数
    
    user_id item_id catagory_id timestamp   date
    behavior_type                   
    buy 3   3   3   3   3
    fav 5   5   5   5   5
    pv  87  87  87  87  87
    
    data.groupby(['behavior_type','item_id']).sum() # 按照'behavior_type'和'item_id'对data进行分组并求和
    
    user_id catagory_id timestamp
    behavior_type   item_id         
    buy 598929.0    100 2429887.0   1.511759e+09
    1603476.0   100 2951233.0   1.511580e+09
    2971043.0   100 4869428.0   1.511618e+09
    fav 2158340.0   100 2429887.0   1.511743e+09
    2971043.0   100 4869428.0   1.511565e+09
    ... ... ... ... ...
    pv  4954999.0   3   1233459.0   4.536499e+09
    4973305.0   1   2520771.0   1.511969e+09
    5002615.0   2   5040754.0   3.023684e+09
    5100093.0   100 2945933.0   1.511552e+09
    5153036.0   100 2342116.0   1.511759e+09
    74 rows × 3 columns
    
    cut()/qcut() 分段
    score_list = np.random.randint(25, 100, size=20) # 分数列表
    print(score_list)
    
    [38 62 72 26 60 50 61 86 38 97 72 47 61 59 83 73 48 56 59 38]
    
    bins = [0, 59, 70, 80, 100] # 指定分数区间
    print(bins)
    
    [0, 59, 70, 80, 100]
    
    score_cut = pd.cut(score_list, bins) # 对分数按照分数区间进行分段
    print(score_cut)
    
    [(0, 59], (59, 70], (70, 80], (0, 59], (59, 70], ..., (70, 80], (0, 59], (0, 59], (0, 59], (0, 59]]
    Length: 20
    Categories (4, interval[int64]): [(0, 59] < (59, 70] < (70, 80] < (80, 100]]
    
    pd.value_counts(score_cut) # 查看各分数区间的人数
    
    (0, 59]      10
    (59, 70]      4
    (80, 100]     3
    (70, 80]      3
    dtype: int64
    

    2、 筛选、排序

    sort_values() 按列进行排序
    data.sort_values('user_id',ascending=True) # 按照'user_id'对data进行升序排序,ascending为False则为降序
    
    user_id item_id catagory_id behavior_type   timestamp   date
    0   1   2333346.0   2520771.0   pv  1.511562e+09    1970-01-01 00:00:01.511561733
    30  1   3157558.0   2520771.0   pv  1.511985e+09    1970-01-01 00:00:01.511984943
    31  1   2087357.0   2131531.0   pv  1.512005e+09    1970-01-01 00:00:01.512004568
    32  1   4170517.0   149192.0    pv  1.512011e+09    1970-01-01 00:00:01.512011019
    33  1   1340922.0   4690421.0   pv  1.512041e+09    1970-01-01 00:00:01.512041260
    ... ... ... ... ... ... ...
    72  100 2971043.0   4869428.0   buy 1.511618e+09    1970-01-01 00:00:01.511617549
    73  100 2337874.0   1194311.0   pv  1.511680e+09    1970-01-01 00:00:01.511680485
    74  100 4182583.0   1258177.0   pv  1.511681e+09    1970-01-01 00:00:01.511680521
    76  100 4919803.0   2881542.0   pv  1.511684e+09    1970-01-01 00:00:01.511684032
    99  100 598929.0    2429887.0   buy 1.511759e+09    1970-01-01 00:00:01.511759458
    95 rows × 6 columns
    
    data.sort_values(['user_id','item_id'],ascending=[True,False])
    
    user_id item_id catagory_id behavior_type   timestamp   date
    14  1   5002615.0   2520377.0   pv  1.511839e+09    1970-01-01 00:00:01.511839385
    16  1   5002615.0   2520377.0   pv  1.511844e+09    1970-01-01 00:00:01.511844273
    28  1   4973305.0   2520771.0   pv  1.511969e+09    1970-01-01 00:00:01.511969365
    37  1   4954999.0   411153.0    pv  1.512061e+09    1970-01-01 00:00:01.512061318
    42  1   4954999.0   411153.0    pv  1.512173e+09    1970-01-01 00:00:01.512172732
    ... ... ... ... ... ... ...
    70  100 1603476.0   2951233.0   buy 1.511580e+09    1970-01-01 00:00:01.511579908
    58  100 1220136.0   4869428.0   pv  1.511551e+09    1970-01-01 00:00:01.511550908
    65  100 704268.0    223690.0    pv  1.511564e+09    1970-01-01 00:00:01.511563606
    92  100 598929.0    2429887.0   pv  1.511743e+09    1970-01-01 00:00:01.511742899
    99  100 598929.0    2429887.0   buy 1.511759e+09    1970-01-01 00:00:01.511759458
    95 rows × 6 columns
    
    sort_index() 按行进行排序

    3、 连接和组合

    append()
    df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3'],
                        'C': ['C0', 'C1', 'C2', 'C3'],
                        'D': ['D0', 'D1', 'D2', 'D3']},
                       index=[0, 1, 2, 3])
    df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
                        'B': ['B4', 'B5', 'B6', 'B7'],
                        'C': ['C4', 'C5', 'C6', 'C7'],
                        'D': ['D4', 'D5', 'D6', 'D7']},
                       index=[4, 5, 6, 7])
    df1.append(df2) # 在数据框df1的末尾追加数据框df2,其中df1和df2的列应该一致
    
    A   B   C   D
    0   A0  B0  C0  D0
    1   A1  B1  C1  D1
    2   A2  B2  C2  D2
    3   A3  B3  C3  D3
    4   A4  B4  C4  D4
    5   A5  B5  C5  D5
    6   A6  B6  C6  D6
    7   A7  B7  C7  D7
    
    concat()
    df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                       'B': ['B0', 'B1', 'B2', 'B3'],
                       'C': ['C0', 'C1', 'C2', 'C3'],
                       'D': ['D0', 'D1', 'D2', 'D3']},
                      index=[0, 1, 2, 3])
    df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
                       'B': ['B4', 'B5', 'B6', 'B7'],
                       'C': ['C4', 'C5', 'C6', 'C7'],
                       'D': ['D4', 'D5', 'D6', 'D7']},
                      index=[4, 5, 6, 7])
    pd.concat([df1,df2],axis=1) # 沿列方向拼接df1和df2,按行索引对齐,索引不重叠处以NaN填充(见下方输出)
    
    A   B   C   D   A   B   C   D
    0   A0  B0  C0  D0  NaN NaN NaN NaN
    1   A1  B1  C1  D1  NaN NaN NaN NaN
    2   A2  B2  C2  D2  NaN NaN NaN NaN
    3   A3  B3  C3  D3  NaN NaN NaN NaN
    4   NaN NaN NaN NaN A4  B4  C4  D4
    5   NaN NaN NaN NaN A5  B5  C5  D5
    6   NaN NaN NaN NaN A6  B6  C6  D6
    7   NaN NaN NaN NaN A7  B7  C7  D7
    
    join()
    df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],           
                         'B': ['B0', 'B1', 'B2', 'B3'],
                         'key': ['K0', 'K1', 'K0', 'K1']})
       
    
    df2 = pd.DataFrame({'C': ['C0', 'C1'],
                          'D': ['D0', 'D1']},
                         index=['K0', 'K1'])
       
    
    df1.join(df2, on='key') # 将df1的'key'列与df2的行索引做连接(默认左连接)
    
    A   B   key C   D
    0   A0  B0  K0  C0  D0
    1   A1  B1  K1  C1  D1
    2   A2  B2  K0  C0  D0
    3   A3  B3  K1  C1  D1
    
    merge()
    left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                         'A': ['A0', 'A1', 'A2', 'A3'],
                         'B': ['B0', 'B1', 'B2', 'B3']})
    right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], 
                          'C': ['C0', 'C1', 'C2', 'C3'], 
                          'D': ['D0', 'D1', 'D2', 'D3']})
    result = pd.merge(left, right, on='key')
    print(result)
    
      key   A   B   C   D
    0  K0  A0  B0  C0  D0
    1  K1  A1  B1  C1  D1
    2  K2  A2  B2  C2  D2
    3  K3  A3  B3  C3  D3
    

    七、数据索引、提取

    df[col] 以数组 Series 的形式返回选取的列
    
    df = pd.DataFrame(np.random.rand(5,5),columns=list('ABCDE'))
    df['C']
    
    0    0.340454
    1    0.116156
    2    0.234191
    3    0.516992
    4    0.354863
    Name: C, dtype: float64
    
    df[[col1,col2]] 以新的数据框(DataFrame)的形式返回选取的列
    
    df = pd.DataFrame(np.random.rand(5,5),columns=list('ABCDE'))
    df[['B','E']]
    
    B   E
    0   0.313619    0.848954
    1   0.571887    0.337651
    2   0.666888    0.086449
    3   0.423615    0.368037
    4   0.453995    0.479666
    
    df.loc['index_one'] 按照索引选取
    
    s = pd.Series(np.array(['I','Love','Data']))
    s.loc[1]
    
    'Love'
    
    df.iloc[0] 按照位置选取
    
    df = pd.DataFrame(np.random.rand(5,5),columns=list('ABCDE'))
    df.iloc[0,:] # 选取第一行
    
    A    0.960768
    B    0.433873
    C    0.777235
    D    0.799733
    E    0.643273
    Name: 0, dtype: float64
    
    df = pd.DataFrame(np.random.rand(5,5),columns=list('ABCDE'))
    df.iloc[0,0] # 选取第一行第一个元素
    
    0.7353611931210813
    

    时间序列

    八、数据统计

    描述统计

    data.describe()
    
    user_id item_id catagory_id timestamp
    count   95.000000   9.500000e+01    9.500000e+01    9.500000e+01
    mean    45.810526   2.831879e+06    2.494733e+06    1.511826e+09
    std 49.538787   1.398492e+06    1.421917e+06    2.213639e+05
    min 1.000000    4.625900e+04    1.491920e+05    1.511551e+09
    25% 1.000000    2.087357e+06    1.289235e+06    1.511682e+09
    50% 1.000000    2.791761e+06    2.465336e+06    1.511751e+09
    75% 100.000000  3.875118e+06    3.213828e+06    1.511980e+09
    max 100.000000  5.153036e+06    4.869428e+06    1.512282e+09
    
    data.groupby(['behavior_type']).count().mean()
    
    user_id        31.666667
    item_id        31.666667
    catagory_id    31.666667
    timestamp      31.666667
    date           31.666667
    dtype: float64
    
    更多见数据查看

    函数应用

    apply:应用在DataFrame的行或列中
    applymap:应用在DataFrame的每个元素中
    map:应用在单独一列(Series)的每个元素中
    

    九、数据透视表

    pd.pivot_table()
    

    相关文章

      网友评论

          本文标题:Pandas数据分析基础流程及操作

          本文链接:https://www.haomeiwen.com/subject/iqnyvhtx.html