美文网首页
学习笔记:数据预处理

学习笔记:数据预处理

作者: zeolite | 来源:发表于2021-06-17 15:20 被阅读0次

    数据归一化
    Preprocessing.MinMaxScaler
    公式 (x-min(x))/(max(x)-min(x))

    import numpy as np
    from sklearn.preprocessing import MinMaxScaler
    # MinMaxScaler: rescales each feature via (x - min(x)) / (max(x) - min(x)).

    # BUG FIX: scalers require a 2-D (n_samples, n_features) array; a flat
    # 1-D array makes fit_transform raise ValueError. Reshape to one column.
    data = np.random.randint(1, 10, 5).reshape(-1, 1)

    scaler = MinMaxScaler()
    res = scaler.fit_transform(data)

    # Recover the original values from the scaled result.
    scaler.inverse_transform(res)

    # feature_range sets the target interval (default [0, 1]).
    scaler = MinMaxScaler(feature_range=[1, 2])
    res = scaler.fit_transform(data)
    

    Preprocessing.StandardScaler

    from sklearn.preprocessing import StandardScaler
    import numpy as np

    # Six random integers in [1, 10), arranged as a 2x3 matrix.
    data = np.random.randint(1, 10, 6)
    data = data.reshape(2, -1)

    # Standardize each column to zero mean and unit variance.
    std_scaler = StandardScaler()
    res = std_scaler.fit_transform(data)

    # Per-column statistics learned during fit.
    std_scaler.mean_
    std_scaler.var_

    # After transformation the overall mean is ~0 and the std is ~1.
    res.mean()
    res.std()

    # Map the standardized values back to the original scale.
    std_scaler.inverse_transform(res)
    

    SimpleImputer 填充空缺值 strategy= median/mean/most_frequent/constant

    from sklearn.impute import SimpleImputer

    # SimpleImputer fills missing values; it expects 2-D (n_samples, 1) input,
    # so pull the Age column out as a single-column array.
    # NOTE(review): `data` appears to be a Titanic-style DataFrame defined
    # elsewhere — confirm against the surrounding notebook.
    Age = data.loc[:, 'Age'].values.reshape(-1, 1)

    # Fit-and-transform in one chained call per strategy:
    # mean (the default), median, and a constant fill of 0.
    imp_mean = SimpleImputer().fit_transform(Age)
    imp_median = SimpleImputer(strategy='median').fit_transform(Age)
    imp_0 = SimpleImputer(strategy='constant', fill_value=0).fit_transform(Age)

    # For a categorical column, fill gaps with the most frequent value
    # and write the result back into the DataFrame.
    Embarked = data.loc[:, 'Embarked'].values.reshape(-1, 1)
    imp_most = SimpleImputer(strategy='most_frequent')
    data.loc[:, 'Embarked'] = imp_most.fit_transform(Embarked)
    

    LabelEncoder 标签类使用

    from sklearn.preprocessing import LabelEncoder

    # Encode the Embarked labels as integers 0..n_classes-1.
    le = LabelEncoder()
    label = le.fit_transform(data['Embarked'])

    # Distinct classes seen during fit, in encoded order.
    le.classes_

    # Decode the integer codes back to the original labels.
    le.inverse_transform(label)
    

    OrdinalEncoder

    from sklearn.preprocessing import OrdinalEncoder

    # Replace every column between the first and last with its ordinal
    # integer codes, writing the result back in place.
    cat_block = data.iloc[:, 1:-1]
    data.iloc[:, 1:-1] = OrdinalEncoder().fit_transform(cat_block)
    

    OneHotEncoder 转化为独热码

    from sklearn.preprocessing import OneHotEncoder
    import pandas as pd  # FIX: pd was used below without an import in this snippet

    # One-hot encode the Sex column; drop missing rows first.
    X = data['Sex']
    X = pd.DataFrame(X).dropna()

    enc = OneHotEncoder(categories='auto')
    res = enc.fit_transform(X).toarray()

    # BUG FIX: get_feature_names() was deprecated in scikit-learn 1.0 and
    # removed in 1.2; get_feature_names_out() is the current API.
    enc.get_feature_names_out()
    

    Binarizer 二值化

    from sklearn.preprocessing import Binarizer
    import pandas as pd

    # Take the first column as a (n, 1) array and drop missing rows.
    X = pd.DataFrame(data.iloc[:, 0].values.reshape(-1, 1)).dropna()

    # Threshold at 30: values above 30 become 1, the rest become 0.
    transformer = Binarizer(threshold=30).fit_transform(X)
    

    KBinsDiscretizer

    from sklearn.preprocessing import KBinsDiscretizer

    # Cut the feature into 3 equal-width bins ('uniform') and emit the bin
    # membership as a dense one-hot array.
    kbd = KBinsDiscretizer(n_bins=3, encode='onehot-dense', strategy='uniform')
    kbd.fit_transform(X)
    

    特征过滤
    方差过滤,以整个特征列的方差计算,通常选择阈值=0或是很小的阈值

    import numpy as np
    from sklearn.feature_selection import VarianceThreshold
    selector=VarianceThreshold()
    # Default threshold is 0: drops features (columns) whose variance is 0,
    # i.e. constant columns. (The original note said "variance < 0", which is
    # impossible — VarianceThreshold removes features with variance <= threshold.)
    X_var0=selector.fit_transform(X)
    X_var0
    
    # Drop features whose variance is below the median of all column variances.
    var_selector=VarianceThreshold(np.median(X.var().values)).fit_transform(X)
    

    还有根据数据分布过滤,例如卡方过滤,用到再看吧

    特征选择 SelectFromModel

    from sklearn.linear_model import LogisticRegression as LR
    from sklearn.datasets import load_breast_cancer
    import numpy as np
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import cross_val_score
    from sklearn.feature_selection import SelectFromModel

    data = load_breast_cancer()
    X = data.data
    y = data.target

    # Baseline: 10-fold CV accuracy of L2 logistic regression on all features.
    lr = LR(penalty='l2', solver='liblinear', C=0.5)
    cross_val_score(lr, X, y, cv=10).mean()
    X.shape

    # BUG FIX: the original referenced an undefined name `LR_` (NameError);
    # the estimator defined above is `lr`.
    # SelectFromModel keeps features whose coefficient L1-norm importance
    # exceeds the (mean) threshold, then we re-score on the reduced matrix.
    X_embedded = SelectFromModel(lr, norm_order=1).fit_transform(X, y)
    cross_val_score(lr, X_embedded, y, cv=10).mean()
    X_embedded.shape
    

    随机森林回归填充缺失值

    from sklearn.linear_model import LogisticRegression as LR
    from sklearn.datasets import load_breast_cancer
    import numpy as np
    from sklearn.metrics import accuracy_score
    import pandas as pd
    from sklearn.ensemble import RandomForestRegressor

    data = load_breast_cancer()
    X = data.data
    y = data.target

    X.shape

    # Knock out up to 30 random entries in column 3 so there is something to fill.
    nan_index = np.random.randint(0, 569, 30)
    X[nan_index, 3] = np.nan

    X = pd.DataFrame(X)
    y = pd.DataFrame(y)

    # The column with missing values becomes the regression target.
    y_fill = X.iloc[:, 3]
    y_fill.shape

    # Rows where column 3 is known train the regressor ...
    y_train = y_fill[y_fill.notnull()]
    y_train.shape

    # ... rows where it is missing are what we predict.
    y_test = y_fill[y_fill.isnull()]
    y_test.shape

    # BUG FIX: concat column-wise (axis=1). The original omitted axis and
    # stacked the frames row-wise, producing a malformed 1138-row matrix.
    data = pd.concat([X.iloc[:, X.columns != 3], y], axis=1)

    # Select rows by index label with .loc (the Series keep the original index).
    X_train = data.loc[y_train.index, :]
    X_train.shape

    X_test = data.loc[y_test.index, :]
    X_test.shape

    rfc = RandomForestRegressor(n_estimators=100)
    rfc.fit(X_train, y_train)
    pred = rfc.predict(X_test)

    # Write the predictions back into the missing slots of column 3.
    X.loc[X.loc[:, 3].isnull(), 3] = pred
    

    相关文章

      网友评论

          本文标题:学习笔记:数据预处理

          本文链接:https://www.haomeiwen.com/subject/hmufyltx.html