from xgboost import XGBRegressor as XGBR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.linear_model import LinearRegression as LinearR
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold, cross_val_score as CVS, train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
import datetime
from sklearn.model_selection import train_test_split
import xgboost as xgb
xgb.set_config(verbosity=0)
from xgboost import XGBClassifier as XGBC
from sklearn.datasets import make_blobs #自创数据集
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import confusion_matrix as cm, recall_score as recall, roc_auc_score as auc
feature = pd.read_csv('C:/Users/Administrator.DESKTOP-4UQ3Q0K/Desktop/TCR_CMV_pred-master/code/feature_ac_h1.tsv',sep='\t')
X = feature[['TCR_sum', 'uniques']]
y = feature['status']
X = np.array(X)
y = np.array(y)
from sklearn.preprocessing import StandardScaler
#将所有数据转化为(0,1)间的正态分布
X = StandardScaler().fit_transform(X)
Xtrain, Xtest, Ytrain, Ytest = TTS(X,y,test_size=0.3,random_state=420) #随机抽样
#在sklearn下建模#
clf = XGBC().fit(Xtrain,Ytrain)
ypred = clf.predict(Xtest)
clf.score(Xtest,Ytest) #默认模型评估指标 - 准确率
cm(Ytest,ypred,labels=[1,0]) #多数类写在前面
#15和23是分类正确,10和0是分类错误。15个是少数类中分类正确的,0个是少数类中分类错误的
recall(Ytest,ypred)
auc(Ytest,clf.predict_proba(Xtest)[:,1])
#负/正样本比例
clf_ = XGBC(scale_pos_weight=3).fit(Xtrain,Ytrain) #scale_pos_weight:正类/负类的数,处理样本不均衡的问题
ypred_ = clf_.predict(Xtest)
clf_.score(Xtest,Ytest)
#随着样本权重逐渐增加,模型的recall,auc和准确率如何变化?
for i in [1,2,3,4,5]:
clf_ = XGBC(scale_pos_weight=i).fit(Xtrain,Ytrain)
ypred_ = clf_.predict(Xtest)
print(i)
print("\tAccuracy:{}".format(clf_.score(Xtest,Ytest)))
print("\tRecall:{}".format(recall(Ytest,ypred_)))
print("\tAUC:{}".format(auc(Ytest,clf_.predict_proba(Xtest)[:,1])))

dtrain = xgb.DMatrix(Xtrain,Ytrain)
dtest = xgb.DMatrix(Xtest,Ytest)
#看看xgboost库自带的predict接口
param = {'silent':True,'objective':'binary:logistic',"eta":0.1,"scale_pos_weight":1}
num_round = 100
bst = xgb.train(param, dtrain, num_round)
preds = bst.predict(dtest)
#自己设定阈值
ypred = preds.copy()
ypred[preds > 0.5] = 1
ypred[ypred != 1] = 0
开始调scale_pos_weight
#写明参数
scale_pos_weight = [180,200,220]
names = ["negative vs positive: 180"
,"negative vs positive: 200"
,"negative vs positive: 220"]
[*zip(names,scale_pos_weight)]
#导入模型评估指标
from sklearn.metrics import accuracy_score as accuracy, recall_score as recall, roc_auc_score as auc
for name,i in zip(names,scale_pos_weight):
param = {'silent':True,'objective':'binary:logistic'
,"eta":0.1,"scale_pos_weight":i}
num_round = 100
clf = xgb.train(param, dtrain, num_round)
preds = clf.predict(dtest)
ypred = preds.copy()
ypred[preds > 0.5] = 1
ypred[ypred != 1] = 0
print(name)
print("\tAccuracy:{}".format(accuracy(Ytest,ypred)))
print("\tRecall:{}".format(recall(Ytest,ypred)))
print("\tAUC:{}".format(auc(Ytest,preds)))

#当然我们也可以尝试不同的阈值
for name,i in zip(names,scale_pos_weight):
for thres in [0.3,0.5,0.7,0.9]:
param= {'silent':True,'objective':'binary:logistic'
,"eta":0.1,"scale_pos_weight":i}
clf = xgb.train(param, dtrain, num_round)
preds = clf.predict(dtest)
ypred = preds.copy()
ypred[preds > thres] = 1
ypred[ypred != 1] = 0
print("{},thresholds:{}".format(name,thres))
print("\tAccuracy:{}".format(accuracy(Ytest,ypred)))
print("\tRecall:{}".format(recall(Ytest,ypred)))
print("\tAUC:{}".format(auc(Ytest,preds)))
#最好的是negative vs positive: 200,thresholds:0.5
# Accuracy:0.85416
# Recall:0.81818
# AUC:0.935353
调阈值好像没什么进步
def plot_learning_curve(estimator,title, X, y,
ax=None, #选择子图
ylim=None, #设置纵坐标的取值范围
cv=None, #交叉验证
n_jobs=None #设定索要使用的线程
):
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
import numpy as np
train_sizes, train_scores, test_scores = learning_curve(estimator, X, y
,shuffle=True
,cv=cv
,random_state=420
,n_jobs=n_jobs)
if ax == None:
ax = plt.gca()
else:
ax = plt.figure()
ax.set_title(title)
if ylim is not None:
ax.set_ylim(*ylim)
ax.set_xlabel("Training examples")
ax.set_ylabel("Score")
ax.grid() #绘制网格,不是必须
ax.plot(train_sizes, np.mean(train_scores, axis=1), 'o-'
, color="r",label="Training score")
ax.plot(train_sizes, np.mean(test_scores, axis=1), 'o-'
, color="g",label="Test score")
ax.legend(loc="best")
return ax
cv = KFold(n_splits=10, shuffle = True, random_state=42) #交叉验证模式
plot_learning_curve(XGBC(n_estimators=100,random_state=420)
,"xgb",Xtrain,Ytrain,ax=None,cv=cv)
plt.show()

选择n_estimators(学习曲线)
#=====【TIME WARNING:25 seconds】=====#
axisx = range(10,1010,50)
rs = []
for i in axisx:
reg = XGBC(n_estimators=i,random_state=420)
rs.append(CVS(reg,Xtrain,Ytrain,cv=cv).mean())
print(axisx[rs.index(max(rs))],max(rs))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="red",label="XGB")
plt.legend()
plt.show()

#======【TIME WARNING: 20s】=======#
axisx = range(10,50,5)
rs = []
var = []
ge = []
for i in axisx:
reg = XGBC(n_estimators=i,random_state=420)
cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
#记录1-偏差
rs.append(cvresult.mean())
#记录方差
var.append(cvresult.var())
#计算泛化误差的可控部分
ge.append((1 - cvresult.mean())**2+cvresult.var())
#打印R2最高所对应的参数取值,并打印这个参数下的方差
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
#打印方差最低时对应的参数取值,并打印这个参数下的R2
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
#打印泛化误差可控部分的参数取值,并打印这个参数下的R2,方差以及泛化误差的可控部分
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="red",label="XGB")
plt.legend()
plt.show()

这来表示在10到50之间每隔5取一棵树时,15棵树的效果是最好的。
于是在1到25之间每1取一棵树
axisx = range(1,25,1)
rs = []
var = []
ge = []
for i in axisx:
reg = XGBC(n_estimators=i,random_state=420)
cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
rs.append(cvresult.mean())
var.append(cvresult.var())
ge.append((1 - cvresult.mean())**2+cvresult.var())
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)*0.01
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
#添加方差线
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()

网友评论