sklearn is one of the most commonly used and most convenient machine learning libraries in Python. This post summarizes several common machine learning algorithms; everything is very simple, so beginners who want to get started quickly should find it useful.
- Splitting data into training and test sets
from sklearn.model_selection import train_test_split
#X is the feature matrix and y the labels (target values); test_size is the fraction held out for testing, typically 0.2 or 0.3; random_state seeds the random number generator so that repeated runs produce the same split: for example, passing random_state=1 every time (with the other parameters unchanged) yields an identical split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)
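For classification tasks with imbalanced classes, it often helps to keep class proportions consistent between the two splits. A minimal sketch using the stratify parameter (assuming y holds class labels):
#stratify=y preserves the class distribution in both splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=20)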
- Logistic regression
from sklearn import linear_model
logistic = linear_model.LogisticRegression()
#To apply regularization, set the penalty parameter: 'l1' (more effective against collinearity, and yields sparse coefficients) or 'l2'. For class-imbalanced datasets, add the class_weight parameter; set the weights yourself or pass 'balanced' to let the model compute them. Note: in recent sklearn versions, penalty='l1' requires a solver that supports it, such as 'liblinear' or 'saga'.
logistic = linear_model.LogisticRegression(penalty='l1', solver='liblinear', class_weight='balanced')
logistic.fit(X_train, y_train)
y_pred = logistic.predict(X_test)
#to get class probabilities instead of hard labels, use:
y_pred = logistic.predict_proba(X_test)
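predict_proba returns one column per class. If you want to pick your own decision threshold, a minimal sketch for the binary case (the 0.5 threshold here is illustrative, not a recommendation):
#column 1 holds the positive-class probability in a binary problem
y_pred_label = (logistic.predict_proba(X_test)[:, 1] >= 0.5).astype(int)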
For more detail on tuning logistic regression, see https://blog.csdn.net/sun_shengyun/article/details/53811483
- SVM classification
from sklearn import svm
model = svm.SVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
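SVC defaults to an RBF kernel; the kernel type, the penalty parameter C, and probability estimates are all configurable. A minimal sketch (the values below are illustrative, not tuned):
#probability=True enables predict_proba at some extra training cost
model = svm.SVC(kernel='rbf', C=1.0, probability=True)
model.fit(X_train, y_train)
y_prob = model.predict_proba(X_test)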
- RandomForest classification
from sklearn.ensemble import RandomForestClassifier
#use out-of-bag (OOB) samples to estimate how well the model generalizes
rf0 = RandomForestClassifier(oob_score=True)
rf0.fit(X_train, y_train)
y_pred = rf0.predict(X_test)
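The out-of-bag estimate enabled above is stored on the fitted model and can be read directly:
#OOB accuracy, a generalization estimate computed from the samples left out of each bootstrap
print(rf0.oob_score_)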
- AdaBoost classification
from sklearn.ensemble import AdaBoostClassifier
#100 base estimators (boosting rounds), learning rate 0.1
clf = AdaBoostClassifier(n_estimators=100, learning_rate=0.1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
- Gradient boosting classification
from sklearn.ensemble import GradientBoostingClassifier
#100 boosting stages, learning rate 0.1
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
- Neural network (Bernoulli RBM + logistic regression)
from sklearn import linear_model
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
logistic = linear_model.LogisticRegression()
rbm = BernoulliRBM(random_state=0, verbose=True)
classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
rbm.learning_rate = 0.1
rbm.n_iter = 20
rbm.n_components = 100
#C is the inverse of the regularization strength: a larger C means weaker regularization
logistic.C = 1000
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
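One caveat: BernoulliRBM assumes binary or [0, 1]-scaled inputs, so raw features usually need scaling first. A minimal sketch with MinMaxScaler (whether this suits your data is an assumption; adapt the preprocessing as needed):
from sklearn.preprocessing import MinMaxScaler
#scale features into [0, 1] before they reach the RBM
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
classifier.fit(X_train_scaled, y_train)
y_pred = classifier.predict(X_test_scaled)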
- Ridge regression
Ridge regression adds an L2-norm penalty to the ordinary least squares RSS, shrinking the coefficients to reduce overfitting and mitigate collinearity.
Although smaller coefficients do reduce variance, ridge cannot shrink any coefficient exactly to zero, so the model keeps a long list of features and stays hard to interpret. That is ridge regression's drawback.
# Ridge regression
from sklearn import linear_model
X = [[0, 0], [1, 1], [2, 2]]
y = [0, 1, 2]
clf = linear_model.Ridge(alpha=0.1) #alpha sets the regularization strength
clf.fit(X, y) #fit the parameters
print(clf.coef_) #coefficients
print(clf.intercept_) #intercept
print(clf.predict([[3, 3]])) #predict on new data
print(clf.score(X, y)) #R^2, goodness of fit
print(clf.get_params()) #get the parameter settings
print(clf.set_params(fit_intercept=False)) #reset parameters, e.g. fit_intercept=False disables the intercept
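In practice alpha is usually chosen by cross-validation rather than fixed by hand; sklearn provides RidgeCV for this. A minimal sketch (the alpha grid below is illustrative):
#RidgeCV picks the best alpha from the grid by cross-validation
reg = linear_model.RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0])
reg.fit(X, y)
print(reg.alpha_) #the selected regularization strength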
- Lasso regression
Lasso adds an L1-norm penalty to the ordinary least squares RSS, shrinking the coefficients to reduce overfitting and mitigate collinearity.
Lasso coefficients can shrink exactly to zero, so it performs genuine feature selection.
# Lasso regression
from sklearn import linear_model
X = [[0, 0], [1, 1], [2, 2]]
y = [0, 1, 2]
clf = linear_model.Lasso(alpha=0.1) #alpha sets the regularization strength
clf.fit(X, y) #fit the parameters
print(clf.coef_) #coefficients
print(clf.intercept_) #intercept
print(clf.predict([[3, 3]])) #predict on new data
print(clf.score(X, y)) #R^2, goodness of fit
print(clf.get_params()) #get the parameter settings
print(clf.set_params(fit_intercept=False)) #reset parameters, e.g. fit_intercept=False disables the intercept
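As with ridge, alpha can be selected by cross-validation via LassoCV. A minimal sketch (the alpha grid is illustrative; cv=3 just fits this tiny toy dataset):
#LassoCV picks alpha by cross-validation along the regularization path
reg = linear_model.LassoCV(alphas=[0.01, 0.1, 1.0], cv=3)
reg.fit(X, y)
print(reg.alpha_) #the selected regularization strength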
- Parameter tuning
Every classification or regression algorithm exposes many tunable parameters, so how should they be set? sklearn provides a module to help find good values: GridSearchCV. The example below uses AdaBoost on the iris dataset to find the best learning_rate.
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import load_iris
#load the iris dataset
iris = load_iris()
learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
model = AdaBoostClassifier()
param_grid = dict(learning_rate=learning_rate)
#set up 10-fold stratified cross-validation
kfold = StratifiedKFold(n_splits=10, shuffle=True)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(iris.data, iris.target)
#print the best learning_rate
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
Out[349]: Best: -0.160830 using {'learning_rate': 0.3}
#of the six candidate learning rates, 0.3 scores best
#the full search results can be printed as well
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
Out[350]: -0.462098 (0.000000) with: {'learning_rate': 0.0001}
-0.347742 (0.106847) with: {'learning_rate': 0.001}
-0.237053 (0.056082) with: {'learning_rate': 0.01}
-0.184642 (0.085079) with: {'learning_rate': 0.1}
-0.163586 (0.117306) with: {'learning_rate': 0.2}
-0.160830 (0.135698) with: {'learning_rate': 0.3}
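Once the search finishes, GridSearchCV by default refits the best parameter combination on the full data, and that model is available directly; a minimal sketch (assuming a held-out X_test like the one created earlier):
#the refitted model with the best parameters found by the search
best_model = grid_result.best_estimator_
y_pred = best_model.predict(X_test)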
- Evaluating results: accuracy, confusion matrix, AUC, etc.
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
#compute accuracy
accuracy = accuracy_score(y_test, y_pred)
#compute AUC, typically for binary classification; hard labels work here, but passing the positive-class probabilities from predict_proba gives a more faithful AUC
auc = roc_auc_score(y_test, y_pred)
#compute the confusion matrix, typically for binary classification
conMat = confusion_matrix(y_test, y_pred)
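For a per-class breakdown of precision, recall, and F1, classification_report is also handy; a minimal sketch:
from sklearn.metrics import classification_report
#precision, recall, F1, and support for each class in one table
print(classification_report(y_test, y_pred))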