（十八）adaboosting多分类算法

作者: 羽天驿 | 来源:发表于2020-04-07 08:35 被阅读0次

（十八）adaboosting多分类算法
AdaBoosting推导过程
分类算法与数据挖掘
（十七）boosting\adaBoosting--迭代算法
大数据算法：分类算法
ml-多类别分类：一对多
逻辑回归
读书笔记
sklearn的常用函数以及参数——1. 分类算法
KNN算法介绍

一、算法的案例

算法推导：
https://www.cnblogs.com/liuwu265/p/4692347.html

import numpy as np

from sklearn.ensemble import AdaBoostClassifier,AdaBoostRegressor

from sklearn import datasets

from sklearn.model_selection import train_test_split

from sklearn import tree

d:\python3.7.4\lib\importlib\_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject
  return f(*args, **kwds)

# Goal / project requirement: predict the iris species from the flower's attributes.
# Aside on business understanding: the credit limit offered by lending products
# (e.g. Alipay Jiebei, WeChat Weilidai) is predicted by an algorithm -- that is a
# regression problem, with features such as gender, age, native place, education,
# employer, phone number, ...
# Small demo project: X holds the four (already cleaned) features, y the target classes.
# return_X_y is passed by keyword: the positional form load_iris(True) is
# deprecated since scikit-learn 0.23 and rejected from 1.0 onwards.
X, y = datasets.load_iris(return_X_y=True)

# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1024)

# Multi-class problem: the labels are the classes 0, 1, 2 rather than +1 / -1.
# n_estimators is a hyper-parameter; a suitable value has to be found by trying
# several candidates in code (model selection).
# algorithm='SAMME' is the discrete multi-class variant. The alternative value is
# 'SAMME.R' (with a dot -- 'SAMMER' is not a valid option), which works on class
# probabilities and typically converges in fewer boosting rounds.
# NOTE: recent scikit-learn releases deprecate 'SAMME.R'.
ada = AdaBoostClassifier(
    n_estimators=3,
    learning_rate=1.0,
    algorithm='SAMME',
)
# Let the algorithm learn the relationship between X_train and y_train.
ada.fit(X_train, y_train)

accuracy = ada.score(X_test, y_test)
y_ = ada.predict(X_test)
proba_ = ada.predict_proba(X_test)

print('--------------------算法准确率：',accuracy)
display(y_, proba_)

--------------------算法准确率： 0.9666666666666667



array([1, 0, 2, 2, 0, 0, 1, 2, 1, 0, 0, 0, 1, 2, 1, 0, 1, 0, 1, 0, 2, 0,
       1, 0, 2, 1, 2, 2, 2, 2])



array([[0.33156225, 0.38881125, 0.2796265 ],
       [0.3719898 , 0.34785853, 0.28015167],
       [0.2783866 , 0.31174493, 0.40986847],
       [0.2783866 , 0.31174493, 0.40986847],
       [0.3719898 , 0.34785853, 0.28015167],
       [0.3719898 , 0.34785853, 0.28015167],
       [0.33156225, 0.38881125, 0.2796265 ],
       [0.2783866 , 0.31174493, 0.40986847],
       [0.33156225, 0.38881125, 0.2796265 ],
       [0.3719898 , 0.34785853, 0.28015167],
       [0.3719898 , 0.34785853, 0.28015167],
       [0.3719898 , 0.34785853, 0.28015167],
       [0.33156225, 0.38881125, 0.2796265 ],
       [0.2783866 , 0.31174493, 0.40986847],
       [0.2796265 , 0.38881125, 0.33156225],
       [0.3719898 , 0.34785853, 0.28015167],
       [0.33156225, 0.38881125, 0.2796265 ],
       [0.3719898 , 0.34785853, 0.28015167],
       [0.2796265 , 0.38881125, 0.33156225],
       [0.3719898 , 0.34785853, 0.28015167],
       [0.2783866 , 0.31174493, 0.40986847],
       [0.3719898 , 0.34785853, 0.28015167],
       [0.33156225, 0.38881125, 0.2796265 ],
       [0.3719898 , 0.34785853, 0.28015167],
       [0.2783866 , 0.31174493, 0.40986847],
       [0.33156225, 0.38881125, 0.2796265 ],
       [0.2783866 , 0.31174493, 0.40986847],
       [0.2783866 , 0.31174493, 0.40986847],
       [0.2783866 , 0.31174493, 0.40986847],
       [0.2783866 , 0.31174493, 0.40986847]])

算法中弱学习器的权重

# Weights (the "say") of the individual weak learners in the fitted ensemble.
# These are per-estimator weights, not per-sample weights.
ada.estimator_weights_

array([1.42403469, 2.14358936, 2.72369906])

# The fitted weak learners that make up the ensemble.
ada.estimators_

[DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=1, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1784306878, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=1, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1759305051, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=1, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=369271939, splitter='best')]

# Inspect the base model used by the regression variant of AdaBoost.
adaregressor = AdaBoostRegressor(n_estimators=3)

# return_X_y is passed by keyword: the positional form load_boston(True) is
# deprecated since scikit-learn 0.23 and rejected from 1.0 onwards.
# NOTE: load_boston itself was removed in scikit-learn 1.2; on newer versions a
# different regression dataset has to be substituted here.
# NOTE: this rebinds X and y to the Boston data, replacing the iris arrays.
X, y = datasets.load_boston(return_X_y=True)

adaregressor.fit(X, y)

AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=3, random_state=None)

# Indexing the fitted ensemble returns the corresponding base estimator.
adaregressor[0]

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=3,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=1077777532, splitter='best')

from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor

g1 = GradientBoostingClassifier()  # classification
g2 = GradientBoostingRegressor()  # regression

# Fit the classifier on the iris training split and show its first boosting stage.
g1.fit(X_train, y_train)
display(g1[0])
# return_X_y is passed by keyword: the positional form load_boston(True) is
# deprecated since scikit-learn 0.23 and rejected from 1.0 onwards.
# NOTE: load_boston was removed in scikit-learn 1.2.
# NOTE: this rebinds X and y to the Boston regression data.
X, y = datasets.load_boston(return_X_y=True)
# Fit the regressor on the Boston data and show its first boosting stage.
g2.fit(X, y)
display(g2[0])

array([DecisionTreeRegressor(ccp_alpha=0.0, criterion='friedman_mse', max_depth=3,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=RandomState(MT19937) at 0x22C1DF98048,
                      splitter='best'),
       DecisionTreeRegressor(ccp_alpha=0.0, criterion='friedman_mse', max_depth=3,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=RandomState(MT19937) at 0x22C1DF98048,
                      splitter='best'),
       DecisionTreeRegressor(ccp_alpha=0.0, criterion='friedman_mse', max_depth=3,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=RandomState(MT19937) at 0x22C1DF98048,
                      splitter='best')], dtype=object)



array([DecisionTreeRegressor(ccp_alpha=0.0, criterion='friedman_mse', max_depth=3,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=RandomState(MT19937) at 0x22C1DF98048,
                      splitter='best')], dtype=object)

# Draw the first weak learner: a very simple tree of depth 1.
# The weak learners inside this AdaBoost model are all such simple trees.
_ = tree.plot_tree(ada.estimators_[0], filled=True)

[图片上传失败...(image-9071e0-1586219711847)]

计算第一棵树弱学习器权重，更新样本权重

# Must match the learning_rate the AdaBoostClassifier was constructed with.
learning_rate = 1.0

# Iris is a three-class problem. The class count is read from y_train because
# X and y were rebound to the Boston regression data in earlier cells, so
# np.unique(y) would no longer list the iris classes.
num = len(np.unique(y_train))

ada.estimator_weights_

array([1.42403469, 2.14358936, 2.72369906])

# Initialise the sample weights: every training sample starts with the same weight.
# The size is derived from y_train instead of hard-coding 120, so the cell keeps
# working if test_size or the dataset changes.
w0 = np.full(shape=len(y_train), fill_value=1 / len(y_train))
# Weighted error of the first weak learner
# (y1: its predictions on the training set, y_train: ground truth).
y1 = ada[0].predict(X_train)
e1 = ((y1 != y_train) * w0).sum()
# Weight of the first weak learner, i.e. how much say it gets in the final vote:
# learning_rate * (log((1 - e) / e) + log(num - 1)), rounded to 8 decimals.
a1 = np.round(learning_rate * (np.log((1 - e1) / e1) + np.log(num - 1)), 8)

# Update the sample weights: misclassified samples are scaled by exp(a1),
# correctly classified ones are left unchanged; then re-normalise to sum to 1.
w1 = w0 * np.exp(a1 * (y_train != y1))
w1 /= w1.sum()
w1

array([0.00411523, 0.01709402, 0.01709402, 0.01709402, 0.00411523,
       0.00411523, 0.00411523, 0.00411523, 0.00411523, 0.00411523,
       0.00411523, 0.00411523, 0.00411523, 0.01709402, 0.00411523,
       0.01709402, 0.01709402, 0.00411523, 0.00411523, 0.01709402,
       0.01709402, 0.01709402, 0.00411523, 0.00411523, 0.00411523,
       0.01709402, 0.01709402, 0.00411523, 0.01709402, 0.00411523,
       0.01709402, 0.01709402, 0.00411523, 0.00411523, 0.01709402,
       0.00411523, 0.00411523, 0.01709402, 0.00411523, 0.00411523,
       0.00411523, 0.00411523, 0.00411523, 0.00411523, 0.00411523,
       0.00411523, 0.00411523, 0.00411523, 0.00411523, 0.00411523,
       0.00411523, 0.01709402, 0.01709402, 0.01709402, 0.01709402,
       0.00411523, 0.00411523, 0.00411523, 0.00411523, 0.01709402,
       0.01709402, 0.00411523, 0.01709402, 0.01709402, 0.01709402,
       0.00411523, 0.01709402, 0.00411523, 0.00411523, 0.01709402,
       0.00411523, 0.00411523, 0.01709402, 0.00411523, 0.01709402,
       0.00411523, 0.00411523, 0.00411523, 0.00411523, 0.00411523,
       0.01709402, 0.00411523, 0.00411523, 0.00411523, 0.01709402,
       0.00411523, 0.01709402, 0.01709402, 0.00411523, 0.00411523,
       0.00411523, 0.01709402, 0.00411523, 0.00411523, 0.00411523,
       0.00411523, 0.00411523, 0.01709402, 0.00411523, 0.00411523,
       0.00411523, 0.00411523, 0.00411523, 0.01709402, 0.00411523,
       0.00411523, 0.00411523, 0.00411523, 0.00411523, 0.00411523,
       0.00411523, 0.00411523, 0.00411523, 0.00411523, 0.00411523,
       0.01709402, 0.00411523, 0.01709402, 0.00411523, 0.01709402])

# Draw the second weak learner: again a very simple tree of depth 1.
# The weak learners inside this AdaBoost model are all such simple trees.
_ = tree.plot_tree(ada.estimators_[1], filled=True)

[图片上传失败...(image-3fa51b-1586219711847)]

计算第二棵树弱学习器权重，更新样本权重

# Reference values from the fitted model, to compare with the manual computation.
ada.estimator_weights_

array([1.42403469, 2.14358936, 2.72369906])

# Weighted error of the second weak learner
# (y2: its predictions on the training set, y_train: ground truth).
y2 = ada[1].predict(X_train)
e2 = np.sum((y2 != y_train) * w1)
print('---------------',e2)
# Equivalent formulation (w1 is normalised to sum to 1):
#   np.mean(np.average((y2 != y_train), weights=w1, axis=0))
# Weight of the second weak learner, i.e. how much say it gets in the final vote.
a2 = np.round(
    learning_rate * (np.log((1 - e2) / e2) + np.log(num - 1)),
    8,
)
print('第二个弱分类器权重：',a2)

# Update the sample weights, then normalise them so they sum to 1.
w2 = w1 * np.exp(a2 * (y_train != y2))
w2 = w2 / w2.sum()
w2

--------------- 0.18993352316858095
第二个弱分类器权重： 2.14358936





array([0.01444444, 0.007034  , 0.007034  , 0.007034  , 0.01444444,
       0.01444444, 0.01444444, 0.01444444, 0.00169337, 0.01444444,
       0.01444444, 0.01444444, 0.00169337, 0.007034  , 0.00169337,
       0.007034  , 0.007034  , 0.00169337, 0.01444444, 0.007034  ,
       0.007034  , 0.007034  , 0.00169337, 0.00169337, 0.01444444,
       0.007034  , 0.007034  , 0.01444444, 0.007034  , 0.01444444,
       0.007034  , 0.007034  , 0.01444444, 0.01444444, 0.007034  ,
       0.00169337, 0.00169337, 0.007034  , 0.00169337, 0.01444444,
       0.00169337, 0.01444444, 0.00169337, 0.01444444, 0.01444444,
       0.01444444, 0.00169337, 0.01444444, 0.00169337, 0.01444444,
       0.00169337, 0.007034  , 0.007034  , 0.007034  , 0.007034  ,
       0.01444444, 0.00169337, 0.01444444, 0.00169337, 0.007034  ,
       0.007034  , 0.00169337, 0.007034  , 0.007034  , 0.007034  ,
       0.01444444, 0.06      , 0.01444444, 0.00169337, 0.007034  ,
       0.00169337, 0.00169337, 0.007034  , 0.01444444, 0.007034  ,
       0.01444444, 0.00169337, 0.00169337, 0.01444444, 0.00169337,
       0.007034  , 0.01444444, 0.00169337, 0.00169337, 0.007034  ,
       0.00169337, 0.007034  , 0.007034  , 0.01444444, 0.00169337,
       0.01444444, 0.007034  , 0.00169337, 0.01444444, 0.00169337,
       0.01444444, 0.01444444, 0.007034  , 0.00169337, 0.00169337,
       0.00169337, 0.00169337, 0.00169337, 0.007034  , 0.00169337,
       0.01444444, 0.01444444, 0.00169337, 0.01444444, 0.00169337,
       0.00169337, 0.01444444, 0.01444444, 0.00169337, 0.01444444,
       0.007034  , 0.01444444, 0.007034  , 0.01444444, 0.007034  ])

# Draw the third weak learner: again a very simple tree of depth 1.
# The weak learners inside this AdaBoost model are all such simple trees.
_ = tree.plot_tree(ada.estimators_[2], filled=True)

[图片上传失败...(image-984dc8-1586219711847)]

计算第三棵树弱学习器权重，更新样本权重

# Reference values from the fitted model, to compare with the manual computation.
ada.estimator_weights_

array([1.42403469, 2.14358936, 2.72369906])

# Weighted error of the third weak learner
# (y3: its predictions on the training set, y_train: ground truth).
y3 = ada[2].predict(X_train)
e3 = np.sum((y3 != y_train) * w2)
print('---------------',e3)
# Equivalent formulation (w2 is normalised to sum to 1):
#   np.mean(np.average((y3 != y_train), weights=w2, axis=0))
# Weight of the third weak learner, i.e. how much say it gets in the final vote.
a3 = np.round(
    learning_rate * (np.log((1 - e3) / e3) + np.log(num - 1)),
    8,
)
print('第三个弱分类器权重：',a3)

# Update the sample weights, then normalise them so they sum to 1.
w3 = w2 * np.exp(a3 * (y_train != y3))
w3 = w3 / w3.sum()

--------------- 0.11603230434326026
第三个弱分类器权重： 2.72369906

手动计算得到的三个弱分类器权重，与 sklearn 给出的 ada.estimator_weights_（标准答案）完全一致

# Gather the three hand-computed weak-learner weights into one array.
estimator_weights = np.array([a1,a2,a3])
estimator_weights

array([1.42403469, 2.14358936, 2.72369906])

预测一下

# Predict on the test set again: per-class probabilities and hard class labels.
proba_ = ada.predict_proba(X_test)
y_ = ada.predict(X_test)
display(y_, proba_)

array([1, 0, 2, 2, 0, 0, 1, 2, 1, 0, 0, 0, 1, 2, 1, 0, 1, 0, 1, 0, 2, 0,
       1, 0, 2, 1, 2, 2, 2, 2])



array([[0.33156225, 0.38881125, 0.2796265 ],
       [0.3719898 , 0.34785853, 0.28015167],
       [0.2783866 , 0.31174493, 0.40986847],
       [0.2783866 , 0.31174493, 0.40986847],
       [0.3719898 , 0.34785853, 0.28015167],
       [0.3719898 , 0.34785853, 0.28015167],
       [0.33156225, 0.38881125, 0.2796265 ],
       [0.2783866 , 0.31174493, 0.40986847],
       [0.33156225, 0.38881125, 0.2796265 ],
       [0.3719898 , 0.34785853, 0.28015167],
       [0.3719898 , 0.34785853, 0.28015167],
       [0.3719898 , 0.34785853, 0.28015167],
       [0.33156225, 0.38881125, 0.2796265 ],
       [0.2783866 , 0.31174493, 0.40986847],
       [0.2796265 , 0.38881125, 0.33156225],
       [0.3719898 , 0.34785853, 0.28015167],
       [0.33156225, 0.38881125, 0.2796265 ],
       [0.3719898 , 0.34785853, 0.28015167],
       [0.2796265 , 0.38881125, 0.33156225],
       [0.3719898 , 0.34785853, 0.28015167],
       [0.2783866 , 0.31174493, 0.40986847],
       [0.3719898 , 0.34785853, 0.28015167],
       [0.33156225, 0.38881125, 0.2796265 ],
       [0.3719898 , 0.34785853, 0.28015167],
       [0.2783866 , 0.31174493, 0.40986847],
       [0.33156225, 0.38881125, 0.2796265 ],
       [0.2783866 , 0.31174493, 0.40986847],
       [0.2783866 , 0.31174493, 0.40986847],
       [0.2783866 , 0.31174493, 0.40986847],
       [0.2783866 , 0.31174493, 0.40986847]])

print('算法预测类别是：\n',y_)
# The index of the largest probability in each row is the predicted class.
np.argmax(proba_, axis=-1)

算法预测类别是：
 [1 0 2 2 0 0 1 2 1 0 0 0 1 2 1 0 1 0 1 0 2 0 1 0 2 1 2 2 2 2]





array([1, 0, 2, 2, 0, 0, 1, 2, 1, 0, 0, 0, 1, 2, 1, 0, 1, 0, 1, 0, 2, 0,
       1, 0, 2, 1, 2, 2, 2, 2], dtype=int64)

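$F(x) = \sum\limits_{t=1}^{N}\alpha_t \cdot f_t(x)$；多分类（SAMME）时，对每个类别 $k$ 分别累加投票：$F_k(x) = \sum\limits_{t=1}^{N}\alpha_t \cdot \mathbb{I}\big(f_t(x) = k\big)$，最终预测类别为 $\arg\max\limits_{k} F_k(x)$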

# Class predictions of the first weak learner alone on the test set.
ada[0].predict(X_test)

array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1])

# One-hot encode each weak learner's test predictions with NumPy broadcasting:
# comparing the prediction vector against a column of class labels yields one
# boolean row per class, which is transposed (one row per sample) and cast to 0/1.
# The labels come from ada.classes_ rather than a hard-coded [[0], [1], [2]],
# so the encoding follows whatever classes the model was trained on.
y1_ = (ada[0].predict(X_test) == ada.classes_[:, np.newaxis]).T.astype(np.int8)

y2_ = (ada[1].predict(X_test) == ada.classes_[:, np.newaxis]).T.astype(np.int8)
y2_

array([[1, 0, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1]], dtype=int8)

# One-hot encode the third weak learner's test predictions via broadcasting.
# The class labels come from ada.classes_ rather than a hard-coded [[0], [1], [2]].
y3_ = (ada[2].predict(X_test) == ada.classes_[:, np.newaxis]).T.astype(np.int8)
y3_

array([[0, 1, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1]], dtype=int8)

# Weighted vote: each weak learner adds its weight to the class it predicted.
pred = a1 * y1_ + a2 * y2_ + a3 * y3_
# Scale by the total learner weight, then by (number of classes - 1).
pred = pred / estimator_weights.sum()
pred = pred / (num - 1)
# Exponentiate and normalise each row so the class scores sum to 1.
proba = np.e ** pred / (np.e ** pred).sum(axis=1, keepdims=True)
proba[:5]

array([[0.33156225, 0.38881125, 0.2796265 ],
       [0.3719898 , 0.34785853, 0.28015167],
       [0.2783866 , 0.31174493, 0.40986847],
       [0.2783866 , 0.31174493, 0.40986847],
       [0.3719898 , 0.34785853, 0.28015167]])

# Reference: the model's own probabilities for the first five test samples,
# to compare with the manually computed proba.
ada.predict_proba(X_test)[:5]

array([[0.33156225, 0.38881125, 0.2796265 ],
       [0.3719898 , 0.34785853, 0.28015167],
       [0.2783866 , 0.31174493, 0.40986847],
       [0.2783866 , 0.31174493, 0.40986847],
       [0.3719898 , 0.34785853, 0.28015167]])

二、

（十八）adaboosting多分类算法
一、算法的案例算法推导：https://www.cnblogs.com/liuwu265/p/4692347.h...
AdaBoosting推导过程
一、AdaBoosting算法 Adaboosting中的A是adaptive的意思，所以AdaBoosting表...
分类算法与数据挖掘
################分类算法与数据挖掘---也就是回归于分类算法--对应于Y的0/1算法 ####分类...
（十七）boosting\adaBoosting--迭代算法
一、原理的描述 Adaboost是一种迭代算法，其核心思想是针对同一个训练集训练不同的分类器(弱分类器)，然后把这...
大数据算法：分类算法
KNN分类算法 KNN算法，即K近邻（K Nearest Neighbour）算法，是一种基本的分类算法。其主要原...
ml-多类别分类：一对多
目标：使用逻辑回归来解决多类别分类问题分类算法：“一对多”(one-vs-all) 将多个类中的一个类记为正向类(...
逻辑回归
逻辑回归是一种解决分类问题的机器学习算法。逻辑回归可以视为回归算法也可以视为分类算法，但通常用于分类，#####...
读书笔记
读书笔记/人生算法十八关之第四-九关【标题】人生算法十八关之四到九关【书籍】人生算法【01】人生算法十八关之...
sklearn的常用函数以及参数——1. 分类算法
sklearn可实现的函数或者功能有以下几种：分类算法回归算法聚类算法降维算法模型优化文本预处理其中分类算法和回...
KNN算法介绍
一、算法介绍邻近算法，或者说K最近邻(kNN，k-NearestNeighbor)分类算法是数据挖掘分类技术中最...