#coding=utf-8
import pandas as pd
import numpy as np
# Small demo frame: one column with a missing value (company), one purely
# categorical column (gender) and one numeric column (num).
data = {
    "company": ["B", "A", "B", np.nan, "C"],
    "gender": ["female", "female", "male", "male", "female"],
    "num": [40, 31, 28, 28, 50],
}
df = pd.DataFrame(data)
print(df)
#   company  gender  num
# 0       B  female   40
# 1       A  female   31
# 2       B    male   28
# 3     NaN    male   28
# 4       C  female   50

# Example 1: value_counts() counts how often each distinct value occurs.
# It is most useful on categorical columns, but it also works on numeric
# ones. Missing values are left out of the counts by default.
print(df.gender.value_counts())
# female    3
# male      2
print(df.company.value_counts())
# B    2
# A    1
# C    1
# (the relative order of the tied entries A and C may differ between
# pandas versions; the NaN row is not counted)
print(df.gender.value_counts(normalize=True))
# normalize=True returns relative frequencies (fractions that sum to 1),
# not percentages.
# female    0.6
# male      0.4

# Example 2: isna() marks missing values: True where a value is missing,
# False otherwise.
print(df.isna())
#    company  gender    num
# 0    False   False  False
# 1    False   False  False
# 2    False   False  False
# 3     True   False  False
# 4    False   False  False
print(df.company.isna())
# 0    False
# 1    False
# 2    False
# 3     True
# 4    False
# Name: company, dtype: bool
print(df.gender.isna())
# 0    False
# 1    False
# 2    False
# 3    False
# 4    False
# Name: gender, dtype: bool

# A boolean mask can be passed to .loc to select only the rows whose
# company is missing.
print(df.loc[df.company.isna()])
#   company gender  num
# 3     NaN   male   28

# Example 3: any()
# On a large frame the full isna() mask is impractical to read, so reduce
# it with any() to get one flag per column: does the column contain at
# least one missing value?
print(df.isna().any())
# company     True
# gender     False
# num        False
# dtype: bool

# Example 4: dropna() removes rows that contain missing values.
# Without inplace=True a new frame is returned and df itself is unchanged.
print(df.dropna())
#   company  gender  num
# 0       B  female   40
# 1       A  female   31
# 2       B    male   28
# 4       C  female   50

# Example 5: filling missing values.
#   fillna(value=...)  replaces every missing value with the given value
#   ffill()            replaces a missing value with the previous value
#   bfill()            replaces a missing value with the next value
# ffill()/bfill() are used instead of fillna(method="pad"/"backfill")
# because the `method` argument of fillna is deprecated in recent pandas.
# None of these calls modify df; each returns a new frame.
print(df.fillna(value=0))
#   company  gender  num
# 0       B  female   40
# 1       A  female   31
# 2       B    male   28
# 3       0    male   28
# 4       C  female   50
# The missing value has been replaced by 0.

filled_forward = df.ffill()
print(filled_forward)
#   company  gender  num
# 0       B  female   40
# 1       A  female   31
# 2       B    male   28
# 3       B    male   28
# 4       C  female   50
# The missing value has been replaced by the previous value "B".

filled_backward = df.bfill()
print(filled_backward)
#   company  gender  num
# 0       B  female   40
# 1       A  female   31
# 2       B    male   28
# 3       C    male   28
# 4       C  female   50
# The missing value has been replaced by the next value "C".

# Example 6: sort_values(by, ascending) sorts rows by a column.
print(df.sort_values(by="num"))
#   company  gender  num
# 2       B    male   28
# 3     NaN    male   28
# 1       A  female   31
# 0       B  female   40
# 4       C  female   50
print(df.sort_values(by="num", ascending=False))
# ascending=False sorts in descending order; the default (True) sorts in
# ascending order.
#   company  gender  num
# 4       C  female   50
# 0       B  female   40
# 1       A  female   31
# 2       B    male   28
# 3     NaN    male   28