import numpy as np
import pandas as pd
# Build a 9x6 frame of standard-normal samples labelled 'a' through 'f'.
# NOTE: the sample outputs pasted in the comments below were captured from
# separate runs (e.g. `describe` reports count 8 for column c while `info`
# reports 9 non-null), and the values change on every execution anyway.
data = pd.DataFrame(np.random.randn(9, 6), columns=['a', 'b', 'c', 'd', 'e', 'f'])

# Show the last 5 rows.
data.tail(5)
# a b c d e f
# 4 -0.783951 1.060859 0.199606 -0.209257 0.197639 -0.114448
# 5 1.421739 -0.646007 0.742340 -1.510422 -0.219927 0.225591
# 6 -0.524613 -0.058266 0.170682 0.293186 -1.505537 0.625299
# 7 -0.688985 -0.083023 -0.274630 -0.455460 -1.323609 -0.212020
# 8 0.396852 -0.086073 -0.374000 1.958154 -0.920253 0.530094

# Print a structural summary: index range, per-column non-null counts, dtypes.
data.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 9 entries, 0 to 8
# Data columns (total 6 columns):
# a 9 non-null float64
# b 9 non-null float64
# c 9 non-null float64
# d 9 non-null float64
# e 9 non-null float64
# f 9 non-null float64
# dtypes: float64(6)
# memory usage: 512.0 bytes

# Per-column descriptive statistics (count, mean, std, min, quartiles, max).
data.describe()
# a b c d e f
# count 9.000000 9.000000 8.000000 9.000000 9.000000 9.000000
# mean -0.018660 -0.262793 0.368564 0.117029 0.248969 0.235428
# std 1.221575 1.026121 0.764823 0.879644 0.685090 0.991456
# min -2.142549 -1.641552 -0.647248 -1.192345 -1.313795 -1.445769
# 25% -1.041688 -1.038682 -0.146752 -0.312632 0.133815 -0.521155
# 50% 0.447488 -0.242379 0.393846 0.190798 0.486768 0.357102
# 75% 0.567235 0.632706 0.837352 0.822798 0.623940 0.496832
# max 1.403796 1.162124 1.527134 1.394901 1.024189 1.614408

# Sum along axis 0, i.e. one total per column.
data.sum(axis=0)
# a -0.167941
# b -2.365137
# c 2.948514
# d 1.053265
# e 2.240720
# f 2.118855
# dtype: float64

# Look up the largest value of column a through the label returned by idxmax.
data['a'].loc[data['a'].idxmax()]
# 1.4037960380489185
# Rebind `data` to a 5x7 frame of random integers in 1..9 (the upper bound
# 10 is exclusive); no column labels are given, so they default to 0..6.
# Sample outputs in the comments come from one particular run.
data = pd.DataFrame(np.random.randint(low=1, high=10, size=(5, 7)))
data
# 0 1 2 3 4 5 6
# 0 5 3 6 7 2 1 8
# 1 7 6 9 8 4 9 4
# 2 1 9 7 6 3 9 1
# 3 2 3 6 8 9 6 2
# 4 6 6 3 1 3 7 6

# Sorted distinct values across the whole frame.
np.unique(data.values)
# array([1, 2, 3, 4, 5, 6, 7, 8, 9])

# Distinct values of the row at position 2 (positions start at 0),
# listed in order of first appearance.
data.iloc[2].unique()
# array([1, 9, 7, 6, 3])

# How often each value occurs in the second-to-last column.
data.iloc[:, -2].value_counts()
# 9 2
# 1 1
# 6 1
# 7 1
# Name: 5, dtype: int64
# A small Series of single-letter labels used to demonstrate counting
# and membership filtering.
s = pd.Series(list('abbbbac'))

# Frequency of each distinct value, most common first.
s.value_counts()
# b 4
# a 2
# c 1
# dtype: int64

# Keep only the entries whose value is 'a' or 'c' (original index retained).
s.loc[s.isin(['a', 'c'])]
# 0 a
# 5 a
# 6 c
# dtype: object
# Apply a function to each column (axis=0): here the range max - min.
# The sample output below comes from one particular run.
data.apply(lambda col: col.max() - col.min(), axis=0)
# 0 7
# 1 6
# 2 5
# 3 8
# 4 3
# 5 2
# 6 7
# dtype: int64

# Apply a function to every single element of the frame.
# DataFrame.applymap was deprecated in pandas 2.1 in favour of DataFrame.map,
# so use whichever one the installed pandas provides.
elementwise = data.map if hasattr(pd.DataFrame, 'map') else data.applymap
elementwise(lambda x: x**2 + x + 3)
# Rebind `data` to a fresh 9x6 frame of standard-normal samples ('a'..'f').
data = pd.DataFrame(np.random.randn(9, 6), columns=list('abcdef'))

# Series.apply: run a function on every value of one column (here: add 10).
# The sample output below comes from one particular run.
data['a'].apply(lambda x: x + 10)
# 0 9.801785
# 1 9.736178
# 2 9.717085
# 3 10.699237
# 4 11.847702
# 5 10.039552
# 6 10.537041
# 7 9.298466
# 8 9.151704
# Name: a, dtype: float64
data
# (The source article showed a screenshot of the frame here. The bare
# `image.png` placeholder raised NameError when executed, so it is now a comment.)

# Apply a function to every element of the (still all-numeric) frame.
# DataFrame.applymap was deprecated in pandas 2.1 in favour of DataFrame.map,
# so use whichever one the installed pandas provides.
elementwise = data.map if hasattr(pd.DataFrame, 'map') else data.applymap
elementwise(lambda x: x**2 + x + 3)
# (screenshot of the element-wise result omitted)

# Add a string column; the list length must match the 9 rows of the frame.
data['g'] = ['ssd', 'ddff', 'zsd', 'sdf', 'bfff', 'xxf', 'zxc', 'sadff', 'sdfsdd']
data
# (screenshot of the frame with the new column omitted)

# Title-case every string in column g; str.title is passed directly instead
# of being wrapped in a lambda.
data['g'] = data['g'].apply(str.title)
data
# (screenshot of the frame with title-cased column g omitted)
# (The source page's reader-comments section followed here.)