美文网首页数据分析工具--python篇
sklearn模型评估--Cross-validation

sklearn模型评估--Cross-validation

作者: 粉红狐狸_dhf | 来源:发表于2019-03-05 20:37 被阅读0次

Cross-validation: evaluating estimator performance

出处:https://scikit-learn.org/stable/modules/cross_validation.html

import numpy as np

from sklearn import datasets, preprocessing, svm
from sklearn.model_selection import cross_val_score, train_test_split

'''官方文档说明:https://scikit-learn.org/stable/modules/cross_validation.html'''
# Official documentation:
# https://scikit-learn.org/stable/modules/cross_validation.html

# Load the iris dataset that ships with scikit-learn.
iris = datasets.load_iris()

# Notebook-style expression: the value is only echoed in an interactive
# session and is discarded when this runs as a script.
iris.data.shape, iris.target.shape
# ((150, 4), (150,))

'''交叉验证-----法一:train_test_split'''
# Cross-validation, method 1: hold out part of the data with train_test_split.
# test_size=0.4 reserves 40% of the samples for testing; random_state=0 pins
# the split so repeated runs produce the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.4, random_state=0
)

X_train.shape, y_train.shape
# ((90, 4), (90,))

X_test.shape, y_test.shape
# ((60, 4), (60,))

# Fit a linear-kernel SVM on the training split, then score it on the
# held-out split.
clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
clf.score(X_test, y_test)

'''The simplest way to use cross-validation is to call the cross_val_score helper function

on the estimator and the dataset.'''

# Cross-validation, method 2: cross_val_score. Pass in the full dataset; the
# number of folds is chosen through cv.
clf = svm.SVC(kernel='linear', C=1)
scores = cross_val_score(clf, iris.data, iris.target, cv=5, scoring='f1_macro')
# If scoring is omitted, the estimator's own score method is used instead.

print(scores)

# The metric requested above is macro-averaged F1, not accuracy, so the
# summary is labelled accordingly: mean and two standard deviations across
# the folds.
print("F1 (macro): %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
# The original post recorded 0.98 (+/- 0.03) for this run.

'''对验证集和训练集同时标准化处理'''
# Standardize the training split and the held-out split with the same scaler.
# The scaler is fitted on the training split only and then applied to both.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.4,
    random_state=0,
)

scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train_transformed = scaler.transform(X_train)
X_test_transformed = scaler.transform(X_test)

# Train on the standardized training data, score on the standardized test data.
clf = svm.SVC(C=1)
clf.fit(X_train_transformed, y_train)
clf.score(X_test_transformed, y_test)  # 0.9333...

''' Pipeline 对标准化处理和交叉验证同时进行'''
# Pipeline: chain the standardization step and the classifier into one
# estimator so that both are cross-validated together.
from sklearn.pipeline import make_pipeline

clf = make_pipeline(preprocessing.StandardScaler(), svm.SVC(C=1))

# Notebook-style expression: the array of per-fold scores is only echoed in
# an interactive session.
cross_val_score(clf, iris.data, iris.target, cv=5)

#-----------cross_validate----------

''' The cross_validate function and multiple metric evaluation:不同于cross_val_score,允许评估多个值'''
# Unlike cross_val_score, cross_validate can evaluate several metrics at once.
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score  # imported by the original post; not used below

scoring = ['precision_macro', 'recall_macro']
clf = svm.SVC(kernel='linear', C=1, random_state=0)
scores = cross_validate(
    clf,
    iris.data,
    iris.target,
    scoring=scoring,
    cv=5,
    return_train_score=False,
)

# The result is a dict keyed by timing and per-metric test-score names.
sorted(scores.keys())
# ['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro']

print(scores['test_recall_macro'])

#--------Cross validation iterators------------

'''根据不同的交叉验证策略生成数据集分割。应用于独立同分布'''
# Generate dataset splits according to different cross-validation
# strategies; the iterators in this part are meant for i.i.d. data.

'''1.KFold--Note that KFold is not affected by classes or groups,of equal sizes (if possible)'''
from sklearn.model_selection import KFold

X = ["a", "b", "c", "d"]
kf = KFold(n_splits=2)

# Each iteration yields the train indices and the test indices of one split.
for train, test in kf.split(X):
    print("%s %s" % (train, test))

'''2.Repeated KFold--repeats K-Fold n times with different randomization in each repetition'''
# NOTE: the original text said "repeats Stratified K-Fold", which describes
# RepeatedStratifiedKFold; RepeatedKFold repeats plain K-Fold.
from sklearn.model_selection import RepeatedKFold

X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
random_state = 12883823  # fixed seed so the repeated shuffles are reproducible
rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=random_state)

# Each iteration yields the train indices and the test indices of one split.
for train, test in rkf.split(X):
    print("%s %s" % (train, test))

'''3.Each learning set is created by taking all the samples except one,

the test set being the sample left out.'''

'''作为一般规则,大多数作者和经验证据表明,5或10倍的交叉验证应该优先于LOO。'''
# As a general rule, most authors and empirical evidence suggest that 5- or
# 10-fold cross-validation should be preferred to leave-one-out (LOO).
from sklearn.model_selection import LeaveOneOut

X = [1, 2, 3, 4]
loo = LeaveOneOut()

# Each iteration yields the train indices and the test indices of one split.
for train, test in loo.split(X):
    print("%s %s" % (train, test))

'''Cross-validation iterators with stratification based on class labels样本标签非平衡问题'''
# These iterators address imbalanced class labels.

'''1.StratifiedKFold is a variation of k-fold which returns stratified folds:

each set contains approximately the same percentage of samples of each target class as the complete set.'''
from sklearn.model_selection import StratifiedKFold

X = np.ones(10)
y = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]  # 4 samples of class 0, 6 of class 1
skf = StratifiedKFold(n_splits=3)

# The labels y are passed to split() alongside X.
for train, test in skf.split(X, y):
    print("%s %s" % (train, test))

# RepeatedStratifiedKFold:
# can be used to repeat Stratified K-Fold n times with different
# randomization in each repetition.

'''Cross-validation iterators for grouped data'''

'''GroupKFold is a variation of k-fold which

ensures that the same group is not represented in both testing and training sets. '''
from sklearn.model_selection import GroupKFold

X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]
y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"]
groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]  # one group id per sample
gkf = GroupKFold(n_splits=3)

# The group ids are passed to split() alongside X and y.
for train, test in gkf.split(X, y, groups=groups):
    print("%s %s" % (train, test))

相关文章

  • sklearn模型评估--Cross-validation

    Cross--validation: evaluating estimator performance 出处:ht...

  • sklearn数据集

    sklearn数据集的划分 训练集:用于训练,构建模型测试集:在模型检验时使用,用于评估模型是否有效 •sklea...

  • UD机器学习 - C2 机器学习基础

    训练和测试模型:sklearn基本语句,pandas和numpy,train_test_split评估指标:混淆矩...

  • 使用sklearn进行数据挖掘

    数据挖掘的步骤 数据挖掘通常包括数据采集、数据分析、特征工程、训练模型、模型评估等步骤。 sklearn工作流程 ...

  • sklearn中的fit fit_transform和trans

    数据挖掘通常包括数据采集,数据分析,特征工程,训练模型,模型评估等步骤。使用sklearn工具可以方便地进行特征工...

  • sklearn中的模型评估-构建评估函数

    https://www.cnblogs.com/harvey888/p/6964741.html 理论:https...

  • sklearn库-算法调用,模型评估

    一般情况下sklearn在算法调用的时候,先进行数据集的切割,为了以后的模型评估做准备,然后进行算法的调用对数据集...

  • 5、sklearn模型建立及评估

    前置工作 填充数据 编码分类变量 划分数据集 模型搭建 模型评估 模型评估是为了知道模型的泛化能力,主要指标有:1...

  • sklearn库-特征工程

    sklearn作为python机器学习的一个常用库,可以用来做特征工程,算法的调用,模型的评估 特征工程 特征工程...

  • sklearn-9模型持久化

    在用sklearn训练了模型之后,需要保存模型,以便日后使用。下面介绍了关于如何持久化sklearn模型的方法。 ...

网友评论

    本文标题:sklearn模型评估--Cross-validation

    本文链接:https://www.haomeiwen.com/subject/ygzouqtx.html