美文网首页
7.cross_validation_and_grid_sear

7.cross_validation_and_grid_sear

作者: 许志辉Albert | 来源:发表于2021-01-29 10:14 被阅读0次

1.模型的评估与参数调优

#引入工具库
import warnings
warings.filterwarnings('ignore')
%matplotlib.inline
import matplotlib as mpl 
import numpy as np
from matplotlib import pyplot as plt
mpl.rcParams['legend.numpoints'] = 1
# Basic modeling workflow
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Build a synthetic dataset
X, y = make_blobs(random_state=0)
# Split it into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# Instantiate the model and fit it on the training split
logreg = LogisticRegression().fit(X_train, y_train)
# Score the fitted model on the held-out test split
logreg.score(X_test, y_test)

1.1 交叉验证/Cross-validation

# Star-import from the local `tools` package; the calls below rely on it to
# bring `plots` into scope.
# NOTE(review): later cells also use `tools.heatmap`, so this import presumably
# exposes a `tools` submodule as well — confirm before narrowing the import.
from tools import *
# NOTE(review): presumably draws the cross-validation illustration — confirm in tools.plots.
plots.plot_cross_validation()
1

1.1.1 K折交叉验证

# Load the iris dataset and print its target (class label) array.
from sklearn.datasets import load_iris
iris = load_iris()
print(iris.target)
2
# NOTE(review): presumably draws the stratified cross-validation illustration — confirm in tools.plots.
plots.plot_stratified_cross_validation()
3

1.1.2 sklearn中的交叉验证

# Cross-validate a logistic regression on iris with the default splitting.
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

iris = load_iris()
logreg = LogisticRegression()

# No `cv` argument: scikit-learn's default cross-validation strategy is used.
scores = cross_val_score(logreg, iris.data, iris.target)
print("cross-validation scores: ", scores)
4
# Same evaluation, explicitly requesting 5 folds; the bare name displays the
# score array in the notebook.
scores = cross_val_score(logreg, iris.data, iris.target, cv=5)
scores
5
# Average of the per-fold scores computed above.
scores.mean()
6

1.1.3不同的数据切分方式

1.1.3.1 k折交叉验证(手动指定k折切分)

# Cross-validate with an explicitly constructed 5-fold splitter.
from sklearn.model_selection import KFold
# The keyword is `n_splits`; `n_split` raises a TypeError.
kfold = KFold(n_splits=5)
cross_val_score(logreg, iris.data, iris.target, cv=kfold)
7
# Repeat with a 3-fold splitter (no shuffling requested).
kfold = KFold(n_splits = 3)
cross_val_score(logreg , iris.data , iris.target , cv = kfold)
8
# 3-fold splitter again, now shuffling the samples with a fixed seed first.
kfold = KFold(n_splits=3, shuffle=True, random_state=0)
cross_val_score(logreg, iris.data, iris.target, cv=kfold)
9

1.1.3.2 留一交叉验证

# Leave-one-out cross-validation: pass a LeaveOneOut splitter as `cv`,
# then report how many scores came back and their mean.
from sklearn.model_selection import LeaveOneOut
loo  = LeaveOneOut()
scores = cross_val_score(logreg , iris.data , iris.target , cv = loo)
print("number of cv iterations: ", len(scores))
print("mean accuracy: ", scores.mean())
10

1.1.3.3 乱序分割交叉验证

# Shuffle-split cross-validation: 10 independent random train/test draws.
from sklearn.model_selection import ShuffleSplit
# Fractions: half of the data for training and half for testing in each draw.
# (Integer values such as 5 would mean five *samples*, far too few for
# three iris classes.) The keyword is `n_splits`, not `n_split`.
shuffle_split = ShuffleSplit(test_size=0.5, train_size=0.5, n_splits=10)
cross_val_score(logreg, iris.data, iris.target, cv=shuffle_split)
11

1.1.3.4 分层抽样交叉验证

# Print a caption, then call the plots helper.
# NOTE(review): presumably draws the label/group-based k-fold illustration — confirm in tools.plots.
print("根据标签类别分层抽样的kfold")
plots.plot_label_kfold()
12
# Stratified k-fold cross-validation on a small synthetic dataset.
from sklearn.model_selection import StratifiedKFold
from tools.datasets import make_blobs
# Build the dataset
X, y = make_blobs(n_samples=12, random_state=0)
# Fix a set of labels for the samples
labels = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]
# `groups` must be passed by keyword (everything after `y` is keyword-only in
# current scikit-learn). StratifiedKFold stratifies on `y`; to split by
# `labels` instead, a group-aware splitter such as GroupKFold is needed.
cross_val_score(logreg, X, y, groups=labels, cv=StratifiedKFold(n_splits=3))
13

1.2 网格搜索/Grid Search

1.2.1 手动遍历超参数进行超参数选择(训练集+验证集)

我们用训练集进行模型拟合,用验证集做效果评估和参数选择

# Naive grid search: fit one SVC per (gamma, C) pair on the training split and
# keep the pair that scores best on the test split.
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)
print("Size of training set: %d   size of test set: %d" % (X_train.shape[0], X_test.shape[0]))

best_score = 0
# Defined up front so the final print cannot hit an unbound name.
best_parameters = None

# Loop over every combination of candidate values
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # Build the model with the current parameter pair
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train, y_train)
        # Evaluate the SVC
        score = svm.score(X_test, y_test)
        # Keep the highest score and the parameters that produced it
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}
print("最高得分: ", best_score)
print("最好的参数: ", best_parameters)
14
# Display the best score found by the loop above.
best_score
15

1.2.2 训练集+测试集+验证集

  • 训练集:拟合和训练模型
  • 验证集:使用不同参数组在验证集上试验,用于参数调优
  • 测试集:模型评估
# The helper is a single function named `plot_threefold_split`; the original
# `plot_threefold.split()` looked up a non-existent attribute.
# NOTE(review): name taken from the mglearn-style plots module — confirm in tools.plots.
plots.plot_threefold_split()
16
# Grid search with a three-way split: choose parameters on a validation set,
# then report the final score on an untouched test set.
from sklearn.svm import SVC
# First split: (train + validation) vs. test
X_trainval, X_test, y_trainval, y_test = train_test_split(iris.data, iris.target, random_state=0)

# Second split: actual training set vs. validation set
X_train, X_valid, y_train, y_valid = train_test_split(X_trainval, y_trainval, random_state=1)

print("训练集数据量: %d,验证集数据量: %d,测试集数据量: %d" % (X_train.shape[0], X_valid.shape[0], X_test.shape[0]))
best_score = 0

for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train, y_train)
        # Evaluate on the validation set only
        score = svm.score(X_valid, y_valid)
        # Keep the highest score and its parameters
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}

# Refit on train + validation with the selected parameters, then evaluate on
# the test data
svm = SVC(**best_parameters)
svm.fit(X_trainval, y_trainval)
test_score = svm.score(X_test, y_test)
print("验证集上最高得分: ", best_score)
print("最佳参数: ", best_parameters)
print("验证集选出最好的参数上测试集的得分为: ", test_score)
17

1.2.3 GridSearchCV = grid_search(产出候选超参数) + cross_validation(评估方式)

1.2.4 RandomizedSearchCV

# Candidate values for the two SVC hyperparameters searched below (6 x 6 grid).
param_grid = {'C' : [0.001 , 0.01 , 0.1, 1 , 10, 100],
                        'gamma' :[0.001 , 0.01 , 0.1 ,1 , 10,100]}
param_grid

# Thought exercise:
# Hyperparameter 1: 5 candidate values
# Hyperparameter 2: 5 candidate values
# Hyperparameter 3: 6 candidate values
# With 5-fold cross-validation, how many models have to be fitted for evaluation?
# 5 * 5 * 6 * 5 + 1  (presumably the "+1" is the final refit with the best parameters)

# Is there a way to speed this up? (parallelism, more resources, ...)

# depth: [3, 5, 7, 10]
# min_child: [20, 50, 100]
# lr: [0.01, 0.1, 1, 10]
# e.g. best combination so far: 7, 50, 0.1
# then search the neighbourhood of that point
# [10,20,10]
18
# `param_grid` holds the candidate values for each hyperparameter.
# GridSearchCV is the grid-search-with-cross-validation object: calling `fit`
# trains and cross-validates every parameter combination from the grid.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
grid_search = GridSearchCV(SVC(), param_grid, cv=5)
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)

import warnings
warnings.filterwarnings("ignore")

grid_search.fit(X_train, y_train)
19
# Score the fitted grid search object on the held-out test split.
grid_search.score(X_test , y_test)
20
# Best parameters and best score found by the search

print(grid_search.best_params_)
print(grid_search.best_score_)
21
# Display the estimator stored on the search object as `best_estimator_`.
grid_search.best_estimator_
22

1.2.5 附:模型候选超参数参考表

23

1.2.6 检视交叉验证的结果

# The attribute is `cv_results_` (plural); `cv_result_` raises AttributeError.
grid_search.cv_results_
24
import numpy as np
# Mean cross-validated test score per parameter combination, arranged as a
# 6 x 6 table: one row per C value, one column per gamma value.
scores = grid_search.cv_results_['mean_test_score']
scores = np.array(scores).reshape(6, 6)

# Plot the mean cross-validation scores
tools.heatmap(scores, xlabel='gamma', ylabel='C', xticklabels=param_grid['gamma'], yticklabels=param_grid['C'], cmap='viridis')
25
# Compare three differently scaled search grids side by side.
fig, axes = plt.subplots(1, 3, figsize=(13, 5))

# Both parameters on a narrow linear scale
param_grid_linear = {'C': np.linspace(1, 2, 6), 'gamma': np.linspace(1, 2, 6)}

# C linear, gamma logarithmic. `logspace` rather than `linspace`:
# linspace(-3, 2, 6) would produce non-positive values.
param_grid_one_log = {'C': np.linspace(1, 2, 6), 'gamma': np.logspace(-3, 2, 6)}

# Both logarithmic, with gamma in a much smaller range
param_grid_range = {'C': np.logspace(-3, 2, 6), 'gamma': np.logspace(-7, -2, 6)}

# NOTE: the loop rebinds the module-level `param_grid` and `grid_search`;
# after the loop they refer to the last grid (`param_grid_range`).
for param_grid, ax in zip([param_grid_linear, param_grid_one_log, param_grid_range], axes):

    grid_search = GridSearchCV(SVC(), param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    scores = grid_search.cv_results_['mean_test_score']
    scores = np.array(scores).reshape(6, 6)

    # Plot the mean cross-validation scores
    scores_image = tools.heatmap(scores, xlabel='gamma', ylabel='C', xticklabels=param_grid['gamma'], yticklabels=param_grid['C'], cmap='viridis', ax=ax)

# One shared colorbar for all three panels (taken from the last heatmap)
plt.colorbar(scores_image, ax=axes.tolist())
26

1.2.7 手动切分数据交叉验证

1.2.7 手动切分数据交叉验证

在有一些问题中,我们不能直接对数据进行随机切分,比如分类问题中,如果类别是不均衡的(非1:1),我们不能直接随机切分,更多的情况下,我们会手动切分,并且保证每个fold中的样本比例一致。

# Nested cross-validation: the outer 5-fold loop scores a GridSearchCV
# estimator, which runs its own inner 5-fold search on each outer training part.
scores = cross_val_score(GridSearchCV(SVC(), param_grid, cv=5), iris.data, iris.target, cv=5)
print("交叉验证得分: ", scores)
print("平均交叉验证得分: ", scores.mean())
27
def nested_cv(X, y, inner_cv, outer_cv, Classifier, parameter_grid):
    """Run nested cross-validation and return the outer-fold test scores.

    For every outer split, each parameter setting is scored by
    cross-validation on the outer training portion only. The best setting is
    then refit on that whole portion and scored on the outer test portion.

    Args:
        X: Sample array supporting integer-array indexing.
        y: Target array aligned with ``X``.
        inner_cv: Splitter whose ``split(X, y)`` yields ``(train, test)``
            index arrays; used for parameter selection.
        outer_cv: Splitter with the same protocol; used for the final scores.
        Classifier: Callable that builds an estimator from keyword parameters.
            The estimator must provide ``fit(X, y)`` and ``score(X, y)``.
        parameter_grid: Iterable of parameter dicts to try.

    Returns:
        A list with one score per outer split.
    """
    # Materialize once so a one-shot iterable survives several outer folds.
    candidates = list(parameter_grid)
    outer_scores = []
    # Manual outer split into training and test samples
    for training_samples, test_samples in outer_cv.split(X, y):
        X_outer = X[training_samples]
        y_outer = y[training_samples]
        # Best parameters seen so far for this outer fold
        best_params = {}
        best_score = -np.inf
        for parameters in candidates:
            # Scores of this parameter setting across the inner folds
            cv_scores = []
            # The inner indices are positions within the outer-training
            # subset, so they must index X_outer / y_outer. Indexing the full
            # X / y would pick the wrong rows and leak outer-test samples.
            for inner_train, inner_test in inner_cv.split(X_outer, y_outer):
                clf = Classifier(**parameters)
                clf.fit(X_outer[inner_train], y_outer[inner_train])
                cv_scores.append(clf.score(X_outer[inner_test], y_outer[inner_test]))
            # Average over the inner folds
            mean_score = np.mean(cv_scores)
            if mean_score > best_score:
                best_score = mean_score
                best_params = parameters
        # Refit the winner on the full outer-training portion and evaluate it
        clf = Classifier(**best_params)
        clf.fit(X_outer, y_outer)
        outer_scores.append(clf.score(X[test_samples], y[test_samples]))
    return outer_scores
# Run the hand-written nested cross-validation with stratified 5-fold
# splitters for both the inner and the outer loop.
from sklearn.model_selection import ParameterGrid, StratifiedKFold
nested_cv(iris.data, iris.target, StratifiedKFold(5), StratifiedKFold(5), SVC, ParameterGrid(param_grid))

28

相关文章

网友评论

      本文标题:7.cross_validation_and_grid_sear

      本文链接:https://www.haomeiwen.com/subject/bmbdzktx.html