整理了一些利用pandas和numpy对文件进行预处理的常用方法,数据为加州房价预测数据,仅供参考(to be continued,我太懒了- -!)
import numpy as np  # used later in this script (np.shape, np.nan)
import pandas as pd


# Data-loading helper
def load_housing_data(path):
    """Read the CSV file at ``path`` and return it as a pandas DataFrame.

    Args:
        path: Location of the CSV file; passed unchanged to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv(path)``.
    """
    return pd.read_csv(path)
# Load the data set and take a first look at it
path = "D://housing.csv"
df = load_housing_data(path)
print(df.loc[:, "total_rooms"].head())  # first five rows of one column (head() defaults to 5)
print(df.dtypes)  # dtype of every column
print(df.index)  # row index
print(df.columns)  # column labels
print(df.describe())  # summary statistics
print(df.transpose())  # transposed frame
print(df.sort_values("total_bedrooms", ascending=False).head(20))  # sort descending by one column, keep the first 20
print(df["housing_median_age"].head(4))  # first four rows of one column
# Indexing and selection with pandas
# NOTE: DataFrame.ix was removed in pandas 1.0.  Use .iloc for positions and
# .loc for labels.  Below, column positions are converted to labels through
# df.columns[...] so the rows keep label-based, end-inclusive slicing
# (assumes the default integer row index produced by read_csv).
print(df.iloc[0:3, 0:10])  # positional slice, contiguous
print(df.iloc[0:6, [1, 3, 6, 9]])  # positional slice, non-contiguous columns
print(df.loc[0:6, df.columns[[1, 3, 6, 8, 9]]])  # row labels 0..6, columns picked by position
print(df.loc[:3, ["longitude", "latitude", "housing_median_age", "total_rooms"]])
print(df.loc[:, ["longitude", "latitude", "housing_median_age", "total_rooms"]])
print(df[df["housing_median_age"] > 41].sort_values(by='housing_median_age', ascending=True))  # filter rows on a column condition
df.iloc[0, 0] = 99999  # overwrite a single cell
print(df.head(2))
print(np.shape(df))  # shape of the data set
df[df.housing_median_age > 41] = 1000  # overwrites every column of the matching rows
print(df.loc[[1, 2, 4], df.columns[[1, 3, 6, 8, 9]]])  # explicit row labels, columns picked by position
print(df.head(10))
df['total_rooms'] = np.nan  # set the whole column to NaN
df = df.head(10)
print(df.isnull())  # missing values are marked True
# NOTE: every row now holds NaN in total_rooms, so dropping rows that contain
# NaN leaves an empty frame.
df = df.dropna(axis=0)  # drop rows that contain NaN
print(df.head(10))
df.to_csv("D://housing12345.csv")  # export the result to the D: drive
# DataFrame.ix was removed in pandas 1.0: rows are selected by label with .loc
# and the first three column positions are converted to labels.
df1 = df.loc[0:4, df.columns[0:3]]
df2 = df.loc[8:10, df.columns[0:3]]
print(df1)
print("xxxxxxxxx")
print(df2)
print("xxxxxxxxx")
print(pd.concat([df1, df2], axis=0))  # row-wise concatenation
print("inner")
print(pd.concat([df1, df2], axis=1, join='inner'))  # column-wise; join='inner'/'outer' aligns the index like a SQL inner / full outer join
print("outer")
print(pd.concat([df1, df2], axis=1, join='outer'))
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
print(pd.concat([df1, df2]))
print(pd.concat([df2, df1]))
# Two small frames sharing a 'key' column, used to demonstrate merge
left = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': [f"A{i}" for i in range(4)],
    'B': [f"B{i}" for i in range(4)],
})
right = pd.DataFrame({
    'key': ['K0', 'K1', 'K5', 'K4'],
    'C': [f"C{i}" for i in range(4)],
    'D': [f"D{i}" for i in range(4)],
})
print(left)
print(right)
# merge works like a SQL join: on= names the join column and how= selects
# inner / outer / left / right.
for join_type in ('inner', 'outer', 'left', 'right'):
    print(left.merge(right, how=join_type, on='key'))
网友评论