Getting Started with Machine Learning: Classification

Author: 雷小厮 | Published 2017-07-12 16:45

    Decision Trees

    1. Drawing a decision tree

    from sklearn.datasets import load_iris  # use the iris dataset
    from sklearn import tree
    iris = load_iris()
    # iris.data: the features to classify on
    # iris.target: the class labels
    clf = tree.DecisionTreeClassifier(max_depth=2)  # limit the tree to a depth of 2
    clf.fit(iris.data, iris.target)
    clf.predict(iris.data)
    # export the decision tree to an image (requires Graphviz)
    from io import StringIO  # sklearn.externals.six was removed from newer scikit-learn
    import pydotplus
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_jpg('tree.jpg')  # writes tree.jpg
    
    (Figure: tree.jpg, the exported decision tree)
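    If Graphviz/pydotplus is not available, newer scikit-learn (0.21+) can render the same tree with matplotlib alone; a minimal sketch, reusing only the clf fitted above:

    import matplotlib.pyplot as plt
    plt.figure(figsize=(8, 5))
    tree.plot_tree(clf, feature_names=iris.feature_names, class_names=iris.target_names, filled=True)
    plt.show()  # or plt.savefig('tree.png')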

    2. Plotting the decision boundary
    Only two features can be used, since the plot is two-dimensional.
    Step 1: build the model.

    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.datasets import load_iris
    from sklearn import tree
    iris = load_iris()
    X = iris.data[:, [2, 3]]  # the 3rd and 4th features: petal length and petal width
    y = iris.target
    clf = tree.DecisionTreeClassifier(max_depth=2)
    clf.fit(X, y)
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1  # x-axis range of the boundary plot
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1  # y-axis range of the boundary plot
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])  # predict every point on the grid
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.contourf(xx, yy, Z, alpha=0.4, cmap=plt.cm.rainbow)  # colored background = predicted regions
    plt.scatter(X[:, 0], X[:, 1], c=y, alpha=1, cmap=plt.cm.RdYlBu)
    plt.title('Decision Tree')
    plt.xlabel('Petal.Length')
    plt.ylabel('Petal.Width')
    plt.show()
    
    (Figure: decision boundary of the decision tree)

    Logistic Regression

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    iris = load_iris()
    clf = LogisticRegression()
    clf.fit(iris.data,iris.target)
    clf.predict(iris.data)
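    predict() only returns the most likely class; logistic regression also exposes per-class probabilities via predict_proba, which is worth a quick look (using the clf fitted just above):

    probs = clf.predict_proba(iris.data[:5])  # probability of each of the 3 classes, first 5 samples
    print(probs.round(3))
    print(clf.predict(iris.data[:5]))  # predict() is just the argmax of these probabilities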
    
    Plotting the decision boundary for logistic regression
    (refit on the two plotted features; a model fitted on all four features cannot predict on a 2-D grid):
    X = iris.data[:, [2, 3]]  # petal length and petal width
    y = iris.target
    clf = LogisticRegression()
    clf.fit(X, y)
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.contourf(xx, yy, Z, alpha=0.4, cmap=plt.cm.rainbow)  # alpha: transparency; cmap: color map
    plt.scatter(X[:, 0], X[:, 1], c=y, alpha=1, cmap=plt.cm.RdYlBu)
    plt.title('Logistic Regression')
    plt.xlabel('Petal.Length')
    plt.ylabel('Petal.Width')
    plt.show()
    
    (Figure: decision boundary of logistic regression)

    SVM

    from sklearn.datasets import load_iris
    from sklearn.svm import SVC
    from sklearn.linear_model import LogisticRegression
    iris = load_iris()
    clf = SVC(C=100, kernel='linear')  # kernel: see the SVC docs for the options; C is the regularization strength: a small C widens the margin and tolerates points crossing it
    clf.fit(iris.data, iris.target)
    clf.predict(iris.data)
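    A fitted SVC also records which training points became support vectors, which makes the effect of C visible; a quick inspection using standard SVC attributes:

    print(clf.n_support_)               # number of support vectors per class
    print(clf.support_vectors_.shape)   # the support vectors themselves
    # a smaller C widens the margin, so more points touch or cross it and become support vectors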
    
    SVM vs. logistic regression
    import numpy as np
    import matplotlib.pyplot as plt

    def plot_estimator(estimator, X, y):
        # plot the decision boundary of a fitted 2-feature classifier
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
        Z = estimator.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        plt.figure()
        plt.contourf(xx, yy, Z, alpha=0.4, cmap=plt.cm.rainbow)
        plt.scatter(X[:, 0], X[:, 1], c=y, alpha=1, cmap=plt.cm.RdYlBu)
        plt.xlabel('Petal.Length')
        plt.ylabel('Petal.Width')
        plt.show()

    X = iris.data[0:100, [2, 3]]  # first 100 rows: only the setosa and versicolor classes
    y = iris.target[0:100]
    clf1 = SVC(kernel='linear')
    clf1.fit(X, y)
    clf2 = LogisticRegression()
    clf2.fit(X, y)

    plot_estimator(clf1, X, y)
    plot_estimator(clf2, X, y)
    
    (Figure: SVM vs. logistic regression decision boundaries)
    Comparing SVM kernels
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.datasets import load_iris
    from sklearn.svm import SVC

    iris = load_iris()
    X = iris.data[:, [2, 3]]
    y = iris.target
    clf1 = SVC(kernel='rbf')
    clf1.fit(X, y)
    clf2 = SVC(kernel='poly')
    clf2.fit(X, y)
    clf3 = SVC(kernel='linear')
    clf3.fit(X, y)
    # rbf and poly are nonlinear kernels and take longer to train
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
    f, axarr = plt.subplots(1, 3, sharex='col', sharey='row', figsize=(20, 5))
    for idx, clf, title in zip([0, 1, 2], [clf1, clf2, clf3], ['rbf', 'poly', 'linear']):
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        axarr[idx].contourf(xx, yy, Z, alpha=0.4, cmap=plt.cm.RdYlBu)
        axarr[idx].scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.brg)
        axarr[idx].set_title(title)
    plt.show()
    
    (Figure: decision boundaries for the rbf, poly, and linear kernels)
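    The comment above claims rbf and poly take longer to train; a minimal sketch to measure that directly (timings vary by machine, and on 150 samples all three are fast):

    import time
    for kernel in ['rbf', 'poly', 'linear']:
        t0 = time.time()
        SVC(kernel=kernel).fit(X, y)
        print(kernel, round(time.time() - t0, 4), 'seconds')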

    Neural Networks

    from sklearn.datasets import load_digits
    import matplotlib.pyplot as plt
    from sklearn.neural_network import MLPClassifier
    from sklearn.preprocessing import StandardScaler
    import numpy as np
    digits = load_digits()  # built-in dataset: handwritten digit recognition
    fig = plt.figure(figsize=(6, 6))
    fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
    for i in range(36):
        ax = fig.add_subplot(6, 6, i + 1, xticks=[], yticks=[])
        ax.imshow(digits.images[i], cmap=plt.cm.binary, interpolation='nearest')
        ax.text(0, 7, str(digits.target[i]), color='red', fontsize=20)  # label each image with its true digit
    plt.show()
    
    (Figure: the first 36 handwritten digits)
    scaler = StandardScaler()
    scaler.fit(digits.data)
    X_scaled = scaler.transform(digits.data)
    # standardize the data to zero mean and unit variance
    mlp = MLPClassifier(hidden_layer_sizes=(30, 30, 30), activation='logistic', max_iter=100)
    # three hidden layers of 30 units; see help(MLPClassifier) for the other parameters
    mlp.fit(X_scaled, digits.target)
    predicted = mlp.predict(X_scaled)
    fig = plt.figure(figsize=(6, 6))
    fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
    for i in range(36):
        ax = fig.add_subplot(6, 6, i + 1, xticks=[], yticks=[])
        ax.imshow(digits.images[i], cmap=plt.cm.binary, interpolation='nearest')
        ax.text(0, 7, '{}-{}'.format(digits.target[i], predicted[i]), color='red', fontsize=20)  # true-predicted
    plt.show()
    
    (Figure: each digit labeled true-predicted)
    # check the accuracy
    res = [i == j for i, j in zip(digits.target, predicted)]
    print(sum(res) / len(digits.target))  # about 94.5% with max_iter=100; reaches 100% with max_iter=1000 (accuracy on the training set)
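    The accuracy above is measured on the same data the network was trained on, so it overstates real performance; a minimal sketch with a held-out test set (the 30% split and random_state=0 are arbitrary choices):

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, digits.target, test_size=0.3, random_state=0)
    mlp2 = MLPClassifier(hidden_layer_sizes=(30, 30, 30), activation='logistic', max_iter=1000)
    mlp2.fit(X_train, y_train)
    print(mlp2.score(X_test, y_test))  # accuracy on digits the model never saw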
    

    Random Forests

    def plot_estimator(estimator, X, y, title):
        # same helper as above, plus a title argument
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
        Z = estimator.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        plt.figure()
        plt.contourf(xx, yy, Z, alpha=0.4, cmap=plt.cm.rainbow)
        plt.scatter(X[:, 0], X[:, 1], c=y, alpha=1, cmap=plt.cm.RdYlBu)
        plt.title(title)
        plt.xlabel('Sepal.Length')
        plt.ylabel('Sepal.Width')
        plt.show()
    from sklearn.ensemble import RandomForestClassifier
    iris = load_iris()
    X = iris.data[:, [0, 1]]  # this time use sepal length and sepal width
    y = iris.target
    clf = RandomForestClassifier(n_estimators=100, criterion='gini', random_state=None)
    # n_estimators is the number of trees; more trees give a more stable fit, with diminishing returns
    clf.fit(X, y)
    plot_estimator(clf, X, y, 'RandomForestClassifier')  # plot the decision boundary
    
    (Figure: decision boundary with n_estimators=100)
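    A fitted random forest also reports how much each feature contributed to its splits; a quick look at the two features used here (feature_importances_ sums to 1):

    for name, imp in zip(iris.feature_names[:2], clf.feature_importances_):
        print(name, round(imp, 3))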

    Comparing the Classifiers

    from sklearn.svm import SVC
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.tree import DecisionTreeClassifier
    
    x = iris.data[:,[0,1]]
    y = iris.target
    clf1 = SVC(kernel='rbf')
    clf1.fit(x,y)
    clf2 = DecisionTreeClassifier()
    clf2.fit(x,y)
    clf3 = RandomForestClassifier(n_estimators=10,criterion='entropy')
    clf3.fit(x,y)
    clf4 = LogisticRegression()
    clf4.fit(x,y)
    plot_estimator(clf1,x,y,'rbf')
    plot_estimator(clf2,x,y,'DecisionTree')
    plot_estimator(clf3,x,y,'RandomForest')
    plot_estimator(clf4,x,y,'LogisticRegression')
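    The boundary plots show the shape of each model's decisions, not their quality; a minimal cross-validated comparison of the four models (5-fold is an arbitrary choice, and cross_val_score refits fresh copies internally):

    from sklearn.model_selection import cross_val_score
    for name, model in [('SVC-rbf', clf1), ('DecisionTree', clf2),
                        ('RandomForest', clf3), ('LogisticRegression', clf4)]:
        scores = cross_val_score(model, x, y, cv=5)
        print(name, round(scores.mean(), 3))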
    
