一、算法的案例
算法推导:
https://www.cnblogs.com/liuwu265/p/4692347.html
import numpy as np
from sklearn.ensemble import AdaBoostClassifier,AdaBoostRegressor
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import tree
d:\python3.7.4\lib\importlib\_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject
return f(*args, **kwds)
# Goal / project requirement: predict the iris class from the flower's attributes.
# Business analogy: Alipay "Jiebei", WeChat "Weilidai" --
# the credit limit they offer is predicted by an algorithm (a regression problem).
# Features there: gender, age, hometown, education, employer, phone, ...
# (business understanding)
# return_X_y is passed by keyword: scikit-learn >= 1.0 no longer accepts it positionally.
X, y = datasets.load_iris(return_X_y=True)  # small project: cleaned data X (four features), cleaned target y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1024)
# Multi-class problem: the labels are 0, 1, 2 rather than +1 / -1.
# n_estimators is a hyper-parameter; a suitable value is found by trying
# several in code (model selection).
ada = AdaBoostClassifier(n_estimators=3, algorithm='SAMME', learning_rate=1.0)
# algorithm='SAMME' boosts on discrete class predictions. The alternative is
# spelled 'SAMME.R' (not 'SAMMER'); it boosts on predicted class probabilities
# and, per the scikit-learn docs, typically converges in fewer iterations.
# NOTE(review): 'SAMME.R' only exists in older scikit-learn releases -- confirm
# against the installed version before using it.
ada.fit(X_train, y_train)  # the algorithm's job: learn the mapping X_train -> y_train
y_ = ada.predict(X_test)
proba_ = ada.predict_proba(X_test)
accuracy = ada.score(X_test, y_test)
print('--------------------算法准确率:', accuracy)
display(y_, proba_)
--------------------算法准确率: 0.9666666666666667
array([1, 0, 2, 2, 0, 0, 1, 2, 1, 0, 0, 0, 1, 2, 1, 0, 1, 0, 1, 0, 2, 0,
1, 0, 2, 1, 2, 2, 2, 2])
array([[0.33156225, 0.38881125, 0.2796265 ],
[0.3719898 , 0.34785853, 0.28015167],
[0.2783866 , 0.31174493, 0.40986847],
[0.2783866 , 0.31174493, 0.40986847],
[0.3719898 , 0.34785853, 0.28015167],
[0.3719898 , 0.34785853, 0.28015167],
[0.33156225, 0.38881125, 0.2796265 ],
[0.2783866 , 0.31174493, 0.40986847],
[0.33156225, 0.38881125, 0.2796265 ],
[0.3719898 , 0.34785853, 0.28015167],
[0.3719898 , 0.34785853, 0.28015167],
[0.3719898 , 0.34785853, 0.28015167],
[0.33156225, 0.38881125, 0.2796265 ],
[0.2783866 , 0.31174493, 0.40986847],
[0.2796265 , 0.38881125, 0.33156225],
[0.3719898 , 0.34785853, 0.28015167],
[0.33156225, 0.38881125, 0.2796265 ],
[0.3719898 , 0.34785853, 0.28015167],
[0.2796265 , 0.38881125, 0.33156225],
[0.3719898 , 0.34785853, 0.28015167],
[0.2783866 , 0.31174493, 0.40986847],
[0.3719898 , 0.34785853, 0.28015167],
[0.33156225, 0.38881125, 0.2796265 ],
[0.3719898 , 0.34785853, 0.28015167],
[0.2783866 , 0.31174493, 0.40986847],
[0.33156225, 0.38881125, 0.2796265 ],
[0.2783866 , 0.31174493, 0.40986847],
[0.2783866 , 0.31174493, 0.40986847],
[0.2783866 , 0.31174493, 0.40986847],
[0.2783866 , 0.31174493, 0.40986847]])
算法中弱学习器的权重
# Weights of the weak learners in the fitted ensemble (one value per
# estimator) -- these are NOT per-sample weights.
ada.estimator_weights_
array([1.42403469, 2.14358936, 2.72369906])
# List of the fitted weak learners held by the ensemble.
ada.estimators_
[DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
max_depth=1, max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort='deprecated',
random_state=1784306878, splitter='best'),
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
max_depth=1, max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort='deprecated',
random_state=1759305051, splitter='best'),
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
max_depth=1, max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort='deprecated',
random_state=369271939, splitter='best')]
# Base model used by the regression variant of AdaBoost.
adaregressor = AdaBoostRegressor(n_estimators=3)
# return_X_y is passed by keyword: scikit-learn >= 1.0 no longer accepts it positionally.
# NOTE(review): load_boston was removed in scikit-learn 1.2; this line needs an
# older release or a replacement regression dataset.
# Caution: this rebinds X and y, so they no longer hold the iris data afterwards.
X, y = datasets.load_boston(return_X_y=True)
adaregressor.fit(X, y)
AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
n_estimators=3, random_state=None)
# Indexing the fitted ensemble returns the weak learner at that position;
# here, the first regression tree.
adaregressor[0]
DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=3,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort='deprecated',
random_state=1077777532, splitter='best')
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor
g1 = GradientBoostingClassifier()  # classification
g2 = GradientBoostingRegressor()  # regression
g1.fit(X_train, y_train)
# First boosting stage of the classifier (displayed as an array of regression trees).
display(g1[0])
# return_X_y is passed by keyword: scikit-learn >= 1.0 no longer accepts it positionally.
# NOTE(review): load_boston was removed in scikit-learn 1.2; this line needs an
# older release or a replacement regression dataset.
# Caution: this rebinds X and y, so they no longer hold the iris data afterwards.
X, y = datasets.load_boston(return_X_y=True)
g2.fit(X, y)
# First boosting stage of the regressor.
display(g2[0])
array([DecisionTreeRegressor(ccp_alpha=0.0, criterion='friedman_mse', max_depth=3,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort='deprecated',
random_state=RandomState(MT19937) at 0x22C1DF98048,
splitter='best'),
DecisionTreeRegressor(ccp_alpha=0.0, criterion='friedman_mse', max_depth=3,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort='deprecated',
random_state=RandomState(MT19937) at 0x22C1DF98048,
splitter='best'),
DecisionTreeRegressor(ccp_alpha=0.0, criterion='friedman_mse', max_depth=3,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort='deprecated',
random_state=RandomState(MT19937) at 0x22C1DF98048,
splitter='best')], dtype=object)
array([DecisionTreeRegressor(ccp_alpha=0.0, criterion='friedman_mse', max_depth=3,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort='deprecated',
random_state=RandomState(MT19937) at 0x22C1DF98048,
splitter='best')], dtype=object)
# Plot the first weak learner: a simple tree of depth 1.
# AdaBoost is built from simple trees like this one.
# The return value is bound to `_`, presumably to keep the notebook from echoing it.
_ = tree.plot_tree(ada[0],filled=True)
[图片上传失败...(image-9071e0-1586219711847)]
计算第一棵树弱学习器权重,更新样本权重
learning_rate = 1.0
# Iris is a three-class problem. The class count is taken from y_train because
# `y` was rebound to the Boston regression targets in the cells above, so
# np.unique(y) would no longer describe the iris labels.
num = np.unique(y_train).size
# Reference weights computed by scikit-learn, to compare against the manual ones.
ada.estimator_weights_
array([1.42403469, 2.14358936, 2.72369906])
# Initialise the sample weights: every training sample starts with the same
# weight, 1 / (number of training samples). The count is read from y_train
# instead of being hard-coded so the cell survives a different split size.
w0 = np.full(shape=len(y_train), fill_value=1 / len(y_train))
# Weighted error of the first weak learner:
# y1 is its prediction on the training set, y_train is the ground truth.
y1 = ada[0].predict(X_train)
e1 = ((y1 != y_train)*w0).sum()
# Weight of the weak learner (how much "say" it gets in the final vote --
# like the voice of a doctor at the hospital). The log(num - 1) term is the
# multi-class correction; rounding to 8 decimals matches the displayed
# scikit-learn values.
a1 = np.round(learning_rate*(np.log((1-e1)/e1) + np.log(num - 1)),8)
# Update the sample weights: misclassified samples are scaled by exp(a1),
# correctly classified ones are unchanged, then everything is renormalised.
w1 = w0*np.exp(a1*(y_train != y1))
w1 /= w1.sum()
w1
array([0.00411523, 0.01709402, 0.01709402, 0.01709402, 0.00411523,
0.00411523, 0.00411523, 0.00411523, 0.00411523, 0.00411523,
0.00411523, 0.00411523, 0.00411523, 0.01709402, 0.00411523,
0.01709402, 0.01709402, 0.00411523, 0.00411523, 0.01709402,
0.01709402, 0.01709402, 0.00411523, 0.00411523, 0.00411523,
0.01709402, 0.01709402, 0.00411523, 0.01709402, 0.00411523,
0.01709402, 0.01709402, 0.00411523, 0.00411523, 0.01709402,
0.00411523, 0.00411523, 0.01709402, 0.00411523, 0.00411523,
0.00411523, 0.00411523, 0.00411523, 0.00411523, 0.00411523,
0.00411523, 0.00411523, 0.00411523, 0.00411523, 0.00411523,
0.00411523, 0.01709402, 0.01709402, 0.01709402, 0.01709402,
0.00411523, 0.00411523, 0.00411523, 0.00411523, 0.01709402,
0.01709402, 0.00411523, 0.01709402, 0.01709402, 0.01709402,
0.00411523, 0.01709402, 0.00411523, 0.00411523, 0.01709402,
0.00411523, 0.00411523, 0.01709402, 0.00411523, 0.01709402,
0.00411523, 0.00411523, 0.00411523, 0.00411523, 0.00411523,
0.01709402, 0.00411523, 0.00411523, 0.00411523, 0.01709402,
0.00411523, 0.01709402, 0.01709402, 0.00411523, 0.00411523,
0.00411523, 0.01709402, 0.00411523, 0.00411523, 0.00411523,
0.00411523, 0.00411523, 0.01709402, 0.00411523, 0.00411523,
0.00411523, 0.00411523, 0.00411523, 0.01709402, 0.00411523,
0.00411523, 0.00411523, 0.00411523, 0.00411523, 0.00411523,
0.00411523, 0.00411523, 0.00411523, 0.00411523, 0.00411523,
0.01709402, 0.00411523, 0.01709402, 0.00411523, 0.01709402])
# Plot the second weak learner: a simple tree of depth 1.
# AdaBoost is built from simple trees like this one.
# The return value is bound to `_`, presumably to keep the notebook from echoing it.
_ = tree.plot_tree(ada[1],filled=True)
[图片上传失败...(image-3fa51b-1586219711847)]
第二棵树,计算
# Reference weak-learner weights from scikit-learn (target for the second tree's manual result).
ada.estimator_weights_
array([1.42403469, 2.14358936, 2.72369906])
# Weighted error of the second weak learner, measured with the sample
# weights w1 produced after the first tree.
y2 = ada[1].predict(X_train)  # y2: second weak learner's prediction; y_train: ground truth
e2 = ((y2 != y_train)*w1).sum()
print('---------------',e2)
# Alternative formulation of the same error, kept for reference:
# estimator_error = np.mean(np.average((y2 != y_train), weights=w1, axis=0))
# print('+++++++++++++++',estimator_error)
# Weight of the second weak learner (how much "say" it gets in the final vote).
a2 = np.round(learning_rate*(np.log((1-e2)/e2) + np.log(num - 1)),8)
print('第二个弱分类器权重:',a2)
# Update the sample weights: misclassified samples are scaled by exp(a2).
w2 = w1*np.exp(a2*(y_train != y2))
w2 /= w2.sum()  # normalisation so the weights sum to 1
w2
--------------- 0.18993352316858095
第二个弱分类器权重: 2.14358936
array([0.01444444, 0.007034 , 0.007034 , 0.007034 , 0.01444444,
0.01444444, 0.01444444, 0.01444444, 0.00169337, 0.01444444,
0.01444444, 0.01444444, 0.00169337, 0.007034 , 0.00169337,
0.007034 , 0.007034 , 0.00169337, 0.01444444, 0.007034 ,
0.007034 , 0.007034 , 0.00169337, 0.00169337, 0.01444444,
0.007034 , 0.007034 , 0.01444444, 0.007034 , 0.01444444,
0.007034 , 0.007034 , 0.01444444, 0.01444444, 0.007034 ,
0.00169337, 0.00169337, 0.007034 , 0.00169337, 0.01444444,
0.00169337, 0.01444444, 0.00169337, 0.01444444, 0.01444444,
0.01444444, 0.00169337, 0.01444444, 0.00169337, 0.01444444,
0.00169337, 0.007034 , 0.007034 , 0.007034 , 0.007034 ,
0.01444444, 0.00169337, 0.01444444, 0.00169337, 0.007034 ,
0.007034 , 0.00169337, 0.007034 , 0.007034 , 0.007034 ,
0.01444444, 0.06 , 0.01444444, 0.00169337, 0.007034 ,
0.00169337, 0.00169337, 0.007034 , 0.01444444, 0.007034 ,
0.01444444, 0.00169337, 0.00169337, 0.01444444, 0.00169337,
0.007034 , 0.01444444, 0.00169337, 0.00169337, 0.007034 ,
0.00169337, 0.007034 , 0.007034 , 0.01444444, 0.00169337,
0.01444444, 0.007034 , 0.00169337, 0.01444444, 0.00169337,
0.01444444, 0.01444444, 0.007034 , 0.00169337, 0.00169337,
0.00169337, 0.00169337, 0.00169337, 0.007034 , 0.00169337,
0.01444444, 0.01444444, 0.00169337, 0.01444444, 0.00169337,
0.00169337, 0.01444444, 0.01444444, 0.00169337, 0.01444444,
0.007034 , 0.01444444, 0.007034 , 0.01444444, 0.007034 ])
# Plot the third weak learner: a simple tree of depth 1.
# AdaBoost is built from simple trees like this one.
# The return value is bound to `_`, presumably to keep the notebook from echoing it.
_ = tree.plot_tree(ada[2],filled=True)
[图片上传失败...(image-984dc8-1586219711847)]
第三棵树,计算
# Reference weak-learner weights from scikit-learn (target for the third tree's manual result).
ada.estimator_weights_
array([1.42403469, 2.14358936, 2.72369906])
# Weighted error of the third weak learner, measured with the sample
# weights w2 produced after the second tree.
y3 = ada[2].predict(X_train)  # y3: third weak learner's prediction; y_train: ground truth
e3 = ((y3 != y_train)*w2).sum()
print('---------------',e3)
# Leftover reference snippet from the second-tree cell (it still uses y2 / w1):
# estimator_error = np.mean(np.average((y2 != y_train), weights=w1, axis=0))
# print('+++++++++++++++',estimator_error)
# Weight of the third weak learner (how much "say" it gets in the final vote).
a3 = np.round(learning_rate*(np.log((1-e3)/e3) + np.log(num - 1)),8)
print('第三个弱分类器权重:',a3)
# Update the sample weights: misclassified samples are scaled by exp(a3).
w3 = w2*np.exp(a3*(y_train != y3))
w3 /= w3.sum()  # normalisation so the weights sum to 1
--------------- 0.11603230434326026
第三个弱分类器权重: 2.72369906
三个弱分类器的权重,和标准答案一模一样
# Collect the three hand-computed weak-learner weights into one array so they
# can be compared with ada.estimator_weights_.
estimator_weights = np.array([a1,a2,a3])
estimator_weights
array([1.42403469, 2.14358936, 2.72369906])
预测一下
# Predicted class labels for the test set.
y_ = ada.predict(X_test)
# Per-class probabilities reported by the AdaBoost ensemble.
proba_ = ada.predict_proba(X_test)
display(y_,proba_)
array([1, 0, 2, 2, 0, 0, 1, 2, 1, 0, 0, 0, 1, 2, 1, 0, 1, 0, 1, 0, 2, 0,
1, 0, 2, 1, 2, 2, 2, 2])
array([[0.33156225, 0.38881125, 0.2796265 ],
[0.3719898 , 0.34785853, 0.28015167],
[0.2783866 , 0.31174493, 0.40986847],
[0.2783866 , 0.31174493, 0.40986847],
[0.3719898 , 0.34785853, 0.28015167],
[0.3719898 , 0.34785853, 0.28015167],
[0.33156225, 0.38881125, 0.2796265 ],
[0.2783866 , 0.31174493, 0.40986847],
[0.33156225, 0.38881125, 0.2796265 ],
[0.3719898 , 0.34785853, 0.28015167],
[0.3719898 , 0.34785853, 0.28015167],
[0.3719898 , 0.34785853, 0.28015167],
[0.33156225, 0.38881125, 0.2796265 ],
[0.2783866 , 0.31174493, 0.40986847],
[0.2796265 , 0.38881125, 0.33156225],
[0.3719898 , 0.34785853, 0.28015167],
[0.33156225, 0.38881125, 0.2796265 ],
[0.3719898 , 0.34785853, 0.28015167],
[0.2796265 , 0.38881125, 0.33156225],
[0.3719898 , 0.34785853, 0.28015167],
[0.2783866 , 0.31174493, 0.40986847],
[0.3719898 , 0.34785853, 0.28015167],
[0.33156225, 0.38881125, 0.2796265 ],
[0.3719898 , 0.34785853, 0.28015167],
[0.2783866 , 0.31174493, 0.40986847],
[0.33156225, 0.38881125, 0.2796265 ],
[0.2783866 , 0.31174493, 0.40986847],
[0.2783866 , 0.31174493, 0.40986847],
[0.2783866 , 0.31174493, 0.40986847],
[0.2783866 , 0.31174493, 0.40986847]])
# Print the ensemble's predicted labels ...
print('算法预测类别是:\n',y_)
# ... and take the most probable class per row of proba_ for comparison.
proba_.argmax(axis = -1)
算法预测类别是:
[1 0 2 2 0 0 1 2 1 0 0 0 1 2 1 0 1 0 1 0 2 0 1 0 2 1 2 2 2 2]
array([1, 0, 2, 2, 0, 0, 1, 2, 1, 0, 0, 0, 1, 2, 1, 0, 1, 0, 1, 0, 2, 0,
1, 0, 2, 1, 2, 2, 2, 2], dtype=int64)
# Predictions of the first weak learner alone on the test set.
ada[0].predict(X_test)
array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 1, 1, 1, 1, 1])
# NumPy broadcasting makes this one-hot encoding a one-liner: a column of
# predicted labels is compared against the row of class ids [0, 1, 2],
# yielding one row per sample with a 1 in the predicted class's column.
y1_ = (ada[0].predict(X_test)[:, np.newaxis] == np.arange(3)).astype(np.int8)
y2_ = (ada[1].predict(X_test)[:, np.newaxis] == np.arange(3)).astype(np.int8)
y2_
array([[1, 0, 0],
[1, 0, 0],
[0, 0, 1],
[0, 0, 1],
[1, 0, 0],
[1, 0, 0],
[1, 0, 0],
[0, 0, 1],
[1, 0, 0],
[1, 0, 0],
[1, 0, 0],
[1, 0, 0],
[1, 0, 0],
[0, 0, 1],
[0, 0, 1],
[1, 0, 0],
[1, 0, 0],
[1, 0, 0],
[0, 0, 1],
[1, 0, 0],
[0, 0, 1],
[1, 0, 0],
[1, 0, 0],
[1, 0, 0],
[0, 0, 1],
[1, 0, 0],
[0, 0, 1],
[0, 0, 1],
[0, 0, 1],
[0, 0, 1]], dtype=int8)
# One-hot encode the third weak learner's test predictions: a column of
# predicted labels is compared against the row of class ids [0, 1, 2].
y3_ = (ada[2].predict(X_test)[:, np.newaxis] == np.arange(3)).astype(np.int8)
y3_
array([[0, 1, 0],
[0, 1, 0],
[0, 0, 1],
[0, 0, 1],
[0, 1, 0],
[0, 1, 0],
[0, 1, 0],
[0, 0, 1],
[0, 1, 0],
[0, 1, 0],
[0, 1, 0],
[0, 1, 0],
[0, 1, 0],
[0, 0, 1],
[0, 1, 0],
[0, 1, 0],
[0, 1, 0],
[0, 1, 0],
[0, 1, 0],
[0, 1, 0],
[0, 0, 1],
[0, 1, 0],
[0, 1, 0],
[0, 1, 0],
[0, 0, 1],
[0, 1, 0],
[0, 0, 1],
[0, 0, 1],
[0, 0, 1],
[0, 0, 1]], dtype=int8)
# Weighted vote: each weak learner adds its weight to the class it predicted.
pred = y1_*a1 + y2_*a2 + y3_*a3
# Scale by the total weak-learner weight ...
pred/=estimator_weights.sum()
# ... and by (number of classes - 1).
pred/=(num -1)
# Softmax across the class axis, so each row becomes a probability vector.
proba = np.e**pred/((np.e**pred).sum(axis = 1).reshape(-1,1))
# Show the first five rows for comparison with ada.predict_proba.
proba[:5]
array([[0.33156225, 0.38881125, 0.2796265 ],
[0.3719898 , 0.34785853, 0.28015167],
[0.2783866 , 0.31174493, 0.40986847],
[0.2783866 , 0.31174493, 0.40986847],
[0.3719898 , 0.34785853, 0.28015167]])
# scikit-learn's own probabilities for the first five test samples, for comparison.
ada.predict_proba(X_test)[:5]
array([[0.33156225, 0.38881125, 0.2796265 ],
[0.3719898 , 0.34785853, 0.28015167],
[0.2783866 , 0.31174493, 0.40986847],
[0.2783866 , 0.31174493, 0.40986847],
[0.3719898 , 0.34785853, 0.28015167]])
网友评论