7.cross_validation_and_grid_search

Author: 许志辉Albert | Published 2021-01-29 10:14

    1. Model evaluation and parameter tuning

    # Import tool libraries
    import warnings
    warnings.filterwarnings('ignore')
    %matplotlib inline
    import matplotlib as mpl
    import numpy as np
    from matplotlib import pyplot as plt
    mpl.rcParams['legend.numpoints'] = 1
    
    # Basic modeling workflow
    from sklearn.datasets import make_blobs
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    # Build a dataset
    X, y = make_blobs(random_state=0)
    # Split into train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    # Initialize a model object and fit it
    logreg = LogisticRegression().fit(X_train, y_train)
    # Evaluate the model on the test set
    logreg.score(X_test, y_test)
    

    1.1 Cross-validation

    # `tools` is the author's local helper library (mirroring mglearn's plotting utilities)
    from tools import *
    plots.plot_cross_validation()
    
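    The plot illustrates how cross-validation partitions the data into folds. If the local plotting helper is unavailable, the fold indices can be inspected directly; a minimal sketch (the 10-sample toy array is illustrative, not from the original post):

    import numpy as np
    from sklearn.model_selection import KFold

    # 10 toy samples split into 5 folds: each fold serves once as the test set
    X_demo = np.arange(10).reshape(-1, 1)
    for fold, (train_idx, test_idx) in enumerate(KFold(n_splits=5).split(X_demo)):
        print("fold %d: train=%s test=%s" % (fold, train_idx, test_idx))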

    1.1.1 K-fold cross-validation

    from sklearn.datasets import load_iris
    iris = load_iris()
    print(iris.target)
    
    Note that iris.target is sorted by class, so an unshuffled k-fold split can leave entire classes out of a training fold; stratified k-fold avoids this by preserving the class ratio in every fold:

    plots.plot_stratified_cross_validation()
    

    1.1.2 Cross-validation in sklearn

    from sklearn.model_selection import cross_val_score
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression

    iris = load_iris()
    logreg = LogisticRegression()

    scores = cross_val_score(logreg, iris.data, iris.target)
    print("cross-validation scores: ", scores)
    
    scores = cross_val_score(logreg, iris.data, iris.target, cv=5)
    scores
    
    scores.mean()
    
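    A related utility worth knowing (a hedged aside, not in the original post): cross_validate also reports fit/score times and, optionally, the training scores:

    from sklearn.model_selection import cross_validate

    # return_train_score=True also records the score on each training fold
    res = cross_validate(logreg, iris.data, iris.target, cv=5, return_train_score=True)
    print(res['test_score'])
    print(res['train_score'])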

    1.1.3 Different data-splitting strategies

    1.1.3.1 K-fold cross-validation (manually specifying the splitter)

    from sklearn.model_selection import KFold
    kfold = KFold(n_splits=5)
    cross_val_score(logreg, iris.data, iris.target, cv=kfold)
    
    # Without shuffling, each test fold of the sorted iris labels holds a single class,
    # so the scores collapse
    kfold = KFold(n_splits=3)
    cross_val_score(logreg, iris.data, iris.target, cv=kfold)
    
    kfold = KFold(n_splits=3, shuffle=True, random_state=0)
    cross_val_score(logreg, iris.data, iris.target, cv=kfold)
    

    1.1.3.2 Leave-one-out cross-validation

    from sklearn.model_selection import LeaveOneOut
    loo = LeaveOneOut()
    scores = cross_val_score(logreg, iris.data, iris.target, cv=loo)
    print("number of cv iterations: ", len(scores))
    print("mean accuracy: ", scores.mean())
    
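    Leave-one-out fits one model per sample, so it gets expensive on larger datasets; a quick check of the number of fits (a minimal sketch):

    # On iris this is 150 fits, one per sample
    print("number of fits: ", loo.get_n_splits(iris.data))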

    1.1.3.3 Shuffle-split cross-validation

    from sklearn.model_selection import ShuffleSplit
    # 10 random splits, each drawing 5 training samples and 5 test samples
    shuffle_split = ShuffleSplit(test_size=5, train_size=5, n_splits=10)
    cross_val_score(logreg, iris.data, iris.target, cv=shuffle_split)
    
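    ShuffleSplit also accepts fractions of the dataset instead of absolute counts; a minimal variant (the 50/50 split is an illustrative choice, not from the original post):

    from sklearn.model_selection import ShuffleSplit, cross_val_score

    # Half the data for training, half for testing, in each of 10 random splits
    shuffle_split = ShuffleSplit(test_size=.5, train_size=.5, n_splits=10)
    cross_val_score(logreg, iris.data, iris.target, cv=shuffle_split)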

    1.1.3.4 Cross-validation with grouped labels

    print("根据标签类别分层抽样的kfold")
    plots.plot_label_kfold()
    
    from sklearn.model_selection import GroupKFold
    from sklearn.datasets import make_blobs
    # Build a dataset
    X, y = make_blobs(n_samples=12, random_state=0)
    # Fix a set of group labels and run group-wise cross-validation;
    # GroupKFold keeps samples that share a label in the same fold
    # (the original called StratifiedKFold, which ignores group labels)
    labels = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]
    cross_val_score(logreg, X, y, groups=labels, cv=GroupKFold(n_splits=3))
    
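    For the literal stratified splitter that the original heading names, StratifiedKFold preserves the class ratio of y in every fold; a minimal sketch on iris:

    from sklearn.model_selection import StratifiedKFold, cross_val_score

    # Each fold keeps roughly the same proportion of the three iris classes
    cross_val_score(logreg, iris.data, iris.target,
                    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=0))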

    1.2 Grid Search

    1.2.1 Manually looping over hyperparameters (training set + validation set)

    We fit the model on the training set, and use a held-out set for evaluation and parameter selection.

    # naive grid search implementation
    from sklearn.svm import SVC
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)
    print("Size of training set: %d   size of test set: %d" % (X_train.shape[0], X_test.shape[0]))

    best_score = 0

    # Loop over the candidate parameter lists
    for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
        for C in [0.001, 0.01, 0.1, 1, 10, 100]:
            # Initialize a model object with this parameter combination
            svm = SVC(gamma=gamma, C=C)
            svm.fit(X_train, y_train)
            # Evaluate the SVM
            score = svm.score(X_test, y_test)
            # Keep the best score and the corresponding parameters
            if score > best_score:
                best_score = score
                best_parameters = {'C': C, 'gamma': gamma}
    print("Best score: ", best_score)
    print("Best parameters: ", best_parameters)
    
    best_score
    

    1.2.2 Training set + validation set + test set

    • Training set: fit and train the model
    • Validation set: try different parameter combinations; used for parameter tuning
    • Test set: final model evaluation

    plots.plot_threefold_split()
    
    from sklearn.svm import SVC
    # Split off the test set from the training+validation data
    X_trainval, X_test, y_trainval, y_test = train_test_split(iris.data, iris.target, random_state=0)

    # Split the remainder into the actual training set and the validation set
    X_train, X_valid, y_train, y_valid = train_test_split(X_trainval, y_trainval, random_state=1)

    print("Training set size: %d, validation set size: %d, test set size: %d" % (X_train.shape[0], X_valid.shape[0], X_test.shape[0]))
    best_score = 0
    
    for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
        for C in [0.001, 0.01, 0.1, 1, 10, 100]:
            svm = SVC(gamma=gamma, C=C)
            svm.fit(X_train, y_train)
            # Evaluate on the validation set
            score = svm.score(X_valid, y_valid)
            # Keep the best score
            if score > best_score:
                best_score = score
                best_parameters = {'C': C, 'gamma': gamma}
    
    # Retrain on training+validation data with the best parameters, then evaluate on the test set
    svm = SVC(**best_parameters)
    svm.fit(X_trainval, y_trainval)
    test_score = svm.score(X_test, y_test)
    print("Best score on the validation set: ", best_score)
    print("Best parameters: ", best_parameters)
    print("Test-set score with the parameters chosen on the validation set: ", test_score)
    

    1.2.3 GridSearchCV = grid search (generates candidate hyperparameter combinations) + cross-validation (the evaluation scheme)

    1.2.4 RandomizedSearchCV

    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
                  'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
    param_grid

    # Hyperparameter 1: 5 candidate values
    # Hyperparameter 2: 5 candidate values
    # Hyperparameter 3: 6 candidate values
    # With 5-fold cross-validation, how many models are fit for evaluation?
    # 5*5*6 combinations * 5 folds + 1 final refit = 751 fits

    # Can this be sped up? (parallelization, more resources, ...)

    # e.g. a coarse grid for a tree model:
    # depth: [3, 5, 7, 10]
    # min_child: [20, 50, 100]
    # lr: [0.01, 0.1, 1, 10]
    # suppose the best combination found is depth=7, min_child=50, lr=0.1
    # -> neighborhood search: refine the grid around that point
    # [10, 20, 10]
    
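    A minimal sketch of the neighborhood-search idea above (the "best" values C=1, gamma=0.1 and the refined grid are illustrative assumptions, not results from the original post):

    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC

    # Suppose a coarse grid picked C=1, gamma=0.1; rerun the search
    # on a finer grid centered on those values
    fine_param_grid = {'C': [0.3, 0.5, 1, 2, 3],
                       'gamma': [0.03, 0.05, 0.1, 0.2, 0.3]}
    fine_search = GridSearchCV(SVC(), fine_param_grid, cv=5)
    fine_search.fit(X_train, y_train)   # X_train/y_train from the earlier split
    print(fine_search.best_params_, fine_search.best_score_)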
    # param_grid is the list of candidate parameters.
    # GridSearchCV is a grid-search cross-validation object: after fit, it has trained
    # and cross-validated every parameter combination in param_grid.
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC
    grid_search = GridSearchCV(SVC(), param_grid, cv=5)
    X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)

    import warnings
    warnings.filterwarnings("ignore")

    grid_search.fit(X_train, y_train)
    
    grid_search.score(X_test, y_test)
    
    # Best parameters and best cross-validation score

    print(grid_search.best_params_)
    print(grid_search.best_score_)
    
    grid_search.best_estimator_
    
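    For the RandomizedSearchCV named in the 1.2.4 heading, which the code above does not show, a minimal sketch: instead of trying all 36 combinations it samples a fixed number at random (n_iter=10 and random_state=0 are illustrative choices):

    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.svm import SVC

    # Sample 10 of the 36 combinations at random instead of trying them all
    random_search = RandomizedSearchCV(SVC(), param_distributions=param_grid,
                                       n_iter=10, cv=5, random_state=0)
    random_search.fit(X_train, y_train)
    print(random_search.best_params_)
    print(random_search.best_score_)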

    1.2.5 Appendix: reference table of candidate hyperparameters for common models

    (The reference table appeared as an image in the original post.)

    1.2.6 Inspecting the cross-validation results

    grid_search.cv_results_
    
    import numpy as np
    import tools  # the author's local helper library (mirrors mglearn.tools)

    scores = grid_search.cv_results_['mean_test_score']
    scores = np.array(scores).reshape(6, 6)

    # plot the mean cross-validation scores as a heatmap
    tools.heatmap(scores, xlabel='gamma', ylabel='C', xticklabels=param_grid['gamma'],
                  yticklabels=param_grid['C'], cmap='viridis')
    
    fig, axes = plt.subplots(1, 3, figsize=(13, 5))

    # Three grids: linear spacing, log spacing on gamma only, log spacing on both.
    # C and gamma must be positive, so the log-scaled ranges use np.logspace.
    param_grid_linear = {'C': np.linspace(1, 2, 6), 'gamma': np.linspace(1, 2, 6)}

    param_grid_one_log = {'C': np.linspace(1, 2, 6), 'gamma': np.logspace(-3, 2, 6)}

    param_grid_range = {'C': np.logspace(-3, 2, 6), 'gamma': np.logspace(-7, -2, 6)}

    for param_grid, ax in zip([param_grid_linear, param_grid_one_log, param_grid_range], axes):
        grid_search = GridSearchCV(SVC(), param_grid, cv=5)
        grid_search.fit(X_train, y_train)
        scores = grid_search.cv_results_['mean_test_score']
        scores = np.array(scores).reshape(6, 6)

        # plot the mean cross-validation scores
        scores_image = tools.heatmap(scores, xlabel='gamma', ylabel='C',
                                     xticklabels=param_grid['gamma'],
                                     yticklabels=param_grid['C'],
                                     cmap='viridis', ax=ax)

    plt.colorbar(scores_image, ax=axes.tolist())
    

    1.2.7 Nested cross-validation (manual data splitting)

    In some problems we cannot simply split the data at random. For example, in a classification problem with imbalanced classes (far from 1:1), a purely random split can distort the class proportions; more often we split manually, making sure each fold keeps the same class ratio.

    scores = cross_val_score(GridSearchCV(SVC(), param_grid, cv=5), iris.data, iris.target, cv=5)
    print("Cross-validation scores: ", scores)
    print("Mean cross-validation score: ", scores.mean())
    
    def nested_cv(X, y, inner_cv, outer_cv, Classifier, parameter_grid):
        outer_scores = []
        # Manually split into training and test sets (outer folds)
        for training_samples, test_samples in outer_cv.split(X, y):
            # Initialize the best parameters found so far
            best_params = {}
            best_score = -np.inf
            # Loop over the candidate parameter combinations
            for parameters in parameter_grid:
                # Record the cross-validation scores for this combination
                cv_scores = []
                # Split the outer training set again into an inner training set and a validation set
                for inner_train, inner_test in inner_cv.split(X[training_samples], y[training_samples]):
                    # Note: the inner indices are relative to the outer training subset
                    clf = Classifier(**parameters)
                    clf.fit(X[training_samples][inner_train], y[training_samples][inner_train])
                    score = clf.score(X[training_samples][inner_test], y[training_samples][inner_test])
                    cv_scores.append(score)
                # Mean score across the inner folds
                mean_score = np.mean(cv_scores)
                if mean_score > best_score:
                    best_score = mean_score
                    best_params = parameters
            # Refit on the whole outer training set with the best parameters,
            # then evaluate on the outer test fold
            clf = Classifier(**best_params)
            clf.fit(X[training_samples], y[training_samples])
            outer_scores.append(clf.score(X[test_samples], y[test_samples]))
        return outer_scores
    
    from sklearn.model_selection import ParameterGrid, StratifiedKFold
    nested_cv(iris.data, iris.target, StratifiedKFold(5), StratifiedKFold(5), SVC, ParameterGrid(param_grid))
    
    
