数据归一化
sklearn.preprocessing.MinMaxScaler
公式 (x-min(x))/(max(x)-min(x))
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# MinMaxScaler rescales each feature to a fixed range via
# (x - min(x)) / (max(x) - min(x)).
# BUG FIX: sklearn scalers require a 2-D array of shape
# (n_samples, n_features); the original passed a 1-D vector, which raises
# ValueError in fit_transform. Reshape to a single column first.
data = np.random.randint(1, 10, 5).reshape(-1, 1)
scaler = MinMaxScaler()
res = scaler.fit_transform(data)
# Recover the original values from the scaled ones.
scaler.inverse_transform(res)
# feature_range sets the target interval; the default is [0, 1].
scaler = MinMaxScaler(feature_range=[1, 2])
res = scaler.fit_transform(data)
sklearn.preprocessing.StandardScaler
import numpy as np
from sklearn.preprocessing import StandardScaler

# StandardScaler standardizes each column to zero mean and unit variance:
# (x - mean(x)) / std(x), computed per feature.
data = np.random.randint(1, 10, 6).reshape(2, -1)
scaler = StandardScaler()
res = scaler.fit_transform(data)
# Per-feature statistics learned during fit.
scaler.mean_
scaler.var_
# The transformed data is centered and scaled.
res.mean()
res.std()
# Map the standardized values back to the original scale.
scaler.inverse_transform(res)
SimpleImputer 填充空缺值 strategy= median/mean/most_frequent/constant
from sklearn.impute import SimpleImputer

# SimpleImputer fills missing values; strategy is one of 'mean' (default),
# 'median', 'most_frequent' or 'constant'.
# NOTE(review): assumes `data` is a pandas DataFrame with 'Age' and
# 'Embarked' columns (Titanic-style) — not the NumPy array defined earlier
# in this file; confirm against the surrounding notebook context.
Age = data.loc[:, 'Age'].values.reshape(-1, 1)

imp_mean = SimpleImputer()                                # mean (default)
imp_mean = imp_mean.fit_transform(Age)

imp_median = SimpleImputer(strategy='median')             # median
imp_median = imp_median.fit_transform(Age)

imp_0 = SimpleImputer(strategy='constant', fill_value=0)  # constant 0
imp_0 = imp_0.fit_transform(Age)

# Categorical column: fill gaps with the most frequent label, in place.
Embarked = data.loc[:, 'Embarked'].values.reshape(-1, 1)
imp_most = SimpleImputer(strategy='most_frequent')
data.loc[:, 'Embarked'] = imp_most.fit_transform(Embarked)
LabelEncoder 标签类使用
from sklearn.preprocessing import LabelEncoder

# LabelEncoder maps target labels to integers 0..n_classes-1.
# NOTE(review): assumes `data` is a DataFrame with an 'Embarked' column.
le = LabelEncoder().fit(data['Embarked'])
label = le.transform(data['Embarked'])
# Distinct classes seen during fit, and the reverse mapping.
le.classes_
le.inverse_transform(label)
OrdinalEncoder
from sklearn.preprocessing import OrdinalEncoder

# OrdinalEncoder converts each categorical feature column to integer codes.
# NOTE(review): assumes `data` is a DataFrame; columns 1..-2 are replaced
# in place with their encoded values.
encoder = OrdinalEncoder()
data.iloc[:, 1:-1] = encoder.fit_transform(data.iloc[:, 1:-1])
OneHotEncoder 转化为独热码
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder expands a categorical column into one-hot (dummy) columns.
# BUG FIX: `pd` was used here before the later `import pandas as pd` line
# in the original file order, so the import is pulled into this snippet.
# NOTE(review): assumes `data` is a DataFrame with a 'Sex' column.
X = data['Sex']
X = pd.DataFrame(X).dropna()
enc = OneHotEncoder(categories='auto')
res = enc.fit_transform(X).toarray()
# BUG FIX: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the current API.
enc.get_feature_names_out()
Binarizer 二值化
import pandas as pd
from sklearn.preprocessing import Binarizer

# Binarizer thresholds a numeric column: values > threshold become 1,
# everything else 0.
# NOTE(review): assumes `data` is a DataFrame whose first column is
# numeric (e.g. Age) — confirm against the surrounding context.
col = data.iloc[:, 0].values.reshape(-1, 1)
X = pd.DataFrame(col).dropna()
binarizer = Binarizer(threshold=30)
transformer = binarizer.fit_transform(X)
KBinsDiscretizer
from sklearn.preprocessing import KBinsDiscretizer

# KBinsDiscretizer bins continuous values into n_bins intervals.
# encode='onehot-dense' returns a dense one-hot array;
# strategy='uniform' makes all bins equal width.
kbd = KBinsDiscretizer(
    n_bins=3,
    encode='onehot-dense',
    strategy='uniform',
)
kbd.fit_transform(X)
特征过滤
方差过滤 ,以整个特征列的方差计算, 通常选择阈值=0或是很小的阈值
import numpy as np
from sklearn.feature_selection import VarianceThreshold

# Variance filtering: drop feature columns whose variance does not exceed
# a threshold. The default threshold is 0, so only constant
# (zero-variance) columns are removed — the original comment's
# "variance less than 0" is impossible.
selector = VarianceThreshold()
X_var0 = selector.fit_transform(X)
X_var0
# Use the median of the per-column variances as the threshold, which
# drops the lower-variance half of the features.
median_var = np.median(X.var().values)
var_selector = VarianceThreshold(median_var).fit_transform(X)
还有根据数据分布过滤 例如卡方过滤 , 用到再看吧
特征选择 SelectFromModel
from sklearn.linear_model import LogisticRegression as LR
from sklearn.datasets import load_breast_cancer
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel

# Embedded feature selection: SelectFromModel keeps the features whose
# coefficient importance (here the L1 norm of coef_, norm_order=1) is
# above the default threshold, using a logistic regression as the judge.
data = load_breast_cancer()
X = data.data
y = data.target
lr = LR(penalty='l2', solver='liblinear', C=0.5)
# Baseline cross-validated accuracy on all features.
cross_val_score(lr, X, y, cv=10).mean()
X.shape
# BUG FIX: the original referenced the undefined name `LR_` on the next
# two lines (NameError); the estimator defined above is `lr`.
X_embedded = SelectFromModel(lr, norm_order=1).fit_transform(X, y)
cross_val_score(lr, X_embedded, y, cv=10).mean()
X_embedded.shape
随机森林回归填充缺失值
from sklearn.linear_model import LogisticRegression as LR
from sklearn.datasets import load_breast_cancer
import numpy as np
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Fill missing values in one column by regressing it on the remaining
# features (plus the label) with a random forest.
data = load_breast_cancer()
X = data.data
y = data.target
X.shape
# Knock out ~30 entries of column 3. Note randint draws with replacement,
# so duplicates may leave slightly fewer than 30 distinct NaN rows.
nan_index = np.random.randint(0, 569, 30)
X[nan_index, 3] = np.nan
X = pd.DataFrame(X)
y = pd.DataFrame(y)
# Treat column 3 as the regression target.
y_fill = X.iloc[:, 3]
y_fill.shape
y_train = y_fill[y_fill.notnull()]
y_train.shape
y_test = y_fill[y_fill.isnull()]
y_test.shape
# Predictors: every other feature column plus the original label.
# BUG FIX: the original concat omitted axis=1, stacking the frames
# vertically (1138 mostly-NaN rows) instead of joining them side by side.
data = pd.concat([X.iloc[:, X.columns != 3], y], axis=1)
X_train = data.iloc[y_train.index, :]
X_train.shape
X_test = data.iloc[y_test.index, :]
X_test.shape
rfc = RandomForestRegressor(n_estimators=100)
rfc.fit(X_train, y_train)
pred = rfc.predict(X_test)
# Write predictions back into the missing slots; both y_test.index and the
# isnull mask enumerate rows in ascending order, so the alignment matches.
X.loc[X.loc[:, 3].isnull(), 3] = pred
网友评论