Getting Started with Machine Learning: Classification

Author: 雷小厮 | Published 2017-07-12 16:45

    Decision Trees

    1. Drawing a decision tree

    from sklearn.datasets import load_iris  # use the iris dataset
    from sklearn import tree
    iris = load_iris()
    # iris.data: the features to classify on
    # iris.target: the class labels
    clf = tree.DecisionTreeClassifier(max_depth=2)  # limit the tree to a depth of 2
    clf.fit(iris.data, iris.target)
    clf.predict(iris.data)
    # export the decision tree to an image (requires Graphviz)
    from io import StringIO  # sklearn.externals.six was removed from newer scikit-learn
    import pydotplus
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_jpg('tree.jpg')  # writes tree.jpg
    
    (Figure: tree.jpg, the exported decision tree)
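    If Graphviz/pydotplus is not available, newer scikit-learn (0.21+) can render the same tree with matplotlib alone; a minimal sketch, reusing only the clf fitted above:

    import matplotlib.pyplot as plt
    plt.figure(figsize=(8, 5))
    tree.plot_tree(clf, feature_names=iris.feature_names, class_names=iris.target_names, filled=True)
    plt.show()  # or plt.savefig('tree.png')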

    2. Plotting the decision boundary
    Only two features can be used, since the plot is two-dimensional.
    Step 1: build the model.

    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.datasets import load_iris
    from sklearn import tree
    iris = load_iris()
    X = iris.data[:, [2, 3]]  # the 3rd and 4th features: petal length and petal width
    y = iris.target
    clf = tree.DecisionTreeClassifier(max_depth=2)
    clf.fit(X, y)
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1  # x-axis range of the boundary plot
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1  # y-axis range of the boundary plot
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])  # predict every point on the grid
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.contourf(xx, yy, Z, alpha=0.4, cmap=plt.cm.rainbow)  # colored background = predicted regions
    plt.scatter(X[:, 0], X[:, 1], c=y, alpha=1, cmap=plt.cm.RdYlBu)
    plt.title('Decision Tree')
    plt.xlabel('Petal.Length')
    plt.ylabel('Petal.Width')
    plt.show()
    
    (Figure: decision boundary of the decision tree)

    Logistic Regression

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    iris = load_iris()
    clf = LogisticRegression()
    clf.fit(iris.data,iris.target)
    clf.predict(iris.data)
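    predict() only returns the most likely class; logistic regression also exposes per-class probabilities via predict_proba, which is worth a quick look (using the clf fitted just above):

    probs = clf.predict_proba(iris.data[:5])  # probability of each of the 3 classes, first 5 samples
    print(probs.round(3))
    print(clf.predict(iris.data[:5]))  # predict() is just the argmax of these probabilities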
    
    Plotting the decision boundary for logistic regression
    (refit on the two plotted features; a model fitted on all four features cannot predict on a 2-D grid):
    X = iris.data[:, [2, 3]]  # petal length and petal width
    y = iris.target
    clf = LogisticRegression()
    clf.fit(X, y)
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.contourf(xx, yy, Z, alpha=0.4, cmap=plt.cm.rainbow)  # alpha: transparency; cmap: color map
    plt.scatter(X[:, 0], X[:, 1], c=y, alpha=1, cmap=plt.cm.RdYlBu)
    plt.title('Logistic Regression')
    plt.xlabel('Petal.Length')
    plt.ylabel('Petal.Width')
    plt.show()
    
    (Figure: decision boundary of logistic regression)

    SVM

    from sklearn.datasets import load_iris
    from sklearn.svm import SVC
    from sklearn.linear_model import LogisticRegression
    iris = load_iris()
    clf = SVC(C=100, kernel='linear')  # kernel: see the SVC docs for the options; C is the regularization strength: a small C widens the margin and tolerates points crossing it
    clf.fit(iris.data, iris.target)
    clf.predict(iris.data)
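    A fitted SVC also records which training points became support vectors, which makes the effect of C visible; a quick inspection using standard SVC attributes:

    print(clf.n_support_)               # number of support vectors per class
    print(clf.support_vectors_.shape)   # the support vectors themselves
    # a smaller C widens the margin, so more points touch or cross it and become support vectors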
    
    SVM vs. logistic regression
    import numpy as np
    import matplotlib.pyplot as plt

    def plot_estimator(estimator, X, y):
        # plot the decision boundary of a fitted 2-feature classifier
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
        Z = estimator.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        plt.figure()
        plt.contourf(xx, yy, Z, alpha=0.4, cmap=plt.cm.rainbow)
        plt.scatter(X[:, 0], X[:, 1], c=y, alpha=1, cmap=plt.cm.RdYlBu)
        plt.xlabel('Petal.Length')
        plt.ylabel('Petal.Width')
        plt.show()

    X = iris.data[0:100, [2, 3]]  # first 100 rows: only the setosa and versicolor classes
    y = iris.target[0:100]
    clf1 = SVC(kernel='linear')
    clf1.fit(X, y)
    clf2 = LogisticRegression()
    clf2.fit(X, y)

    plot_estimator(clf1, X, y)
    plot_estimator(clf2, X, y)
    
    (Figure: SVM vs. logistic regression decision boundaries)
    Comparing SVM kernels
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.datasets import load_iris
    from sklearn.svm import SVC

    iris = load_iris()
    X = iris.data[:, [2, 3]]
    y = iris.target
    clf1 = SVC(kernel='rbf')
    clf1.fit(X, y)
    clf2 = SVC(kernel='poly')
    clf2.fit(X, y)
    clf3 = SVC(kernel='linear')
    clf3.fit(X, y)
    # rbf and poly are nonlinear kernels and take longer to train
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
    f, axarr = plt.subplots(1, 3, sharex='col', sharey='row', figsize=(20, 5))
    for idx, clf, title in zip([0, 1, 2], [clf1, clf2, clf3], ['rbf', 'poly', 'linear']):
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        axarr[idx].contourf(xx, yy, Z, alpha=0.4, cmap=plt.cm.RdYlBu)
        axarr[idx].scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.brg)
        axarr[idx].set_title(title)
    plt.show()
    
    (Figure: decision boundaries for the rbf, poly, and linear kernels)
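    The comment above claims rbf and poly take longer to train; a minimal sketch to measure that directly (timings vary by machine, and on 150 samples all three are fast):

    import time
    for kernel in ['rbf', 'poly', 'linear']:
        t0 = time.time()
        SVC(kernel=kernel).fit(X, y)
        print(kernel, round(time.time() - t0, 4), 'seconds')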

    Neural Networks

    from sklearn.datasets import load_digits
    import matplotlib.pyplot as plt
    from sklearn.neural_network import MLPClassifier
    from sklearn.preprocessing import StandardScaler
    import numpy as np
    digits = load_digits()  # built-in dataset: handwritten digit recognition
    fig = plt.figure(figsize=(6, 6))
    fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
    for i in range(36):
        ax = fig.add_subplot(6, 6, i + 1, xticks=[], yticks=[])
        ax.imshow(digits.images[i], cmap=plt.cm.binary, interpolation='nearest')
        ax.text(0, 7, str(digits.target[i]), color='red', fontsize=20)  # label each image with its true digit
    plt.show()
    
    (Figure: the first 36 handwritten digits)
    scaler = StandardScaler()
    scaler.fit(digits.data)
    X_scaled = scaler.transform(digits.data)
    # standardize the data to zero mean and unit variance
    mlp = MLPClassifier(hidden_layer_sizes=(30, 30, 30), activation='logistic', max_iter=100)
    # three hidden layers of 30 units; see help(MLPClassifier) for the other parameters
    mlp.fit(X_scaled, digits.target)
    predicted = mlp.predict(X_scaled)
    fig = plt.figure(figsize=(6, 6))
    fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
    for i in range(36):
        ax = fig.add_subplot(6, 6, i + 1, xticks=[], yticks=[])
        ax.imshow(digits.images[i], cmap=plt.cm.binary, interpolation='nearest')
        ax.text(0, 7, '{}-{}'.format(digits.target[i], predicted[i]), color='red', fontsize=20)  # true-predicted
    plt.show()
    
    (Figure: each digit labeled true-predicted)
    # check the accuracy
    res = [i == j for i, j in zip(digits.target, predicted)]
    print(sum(res) / len(digits.target))  # about 94.5% with max_iter=100; reaches 100% with max_iter=1000 (accuracy on the training set)
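    The accuracy above is measured on the same data the network was trained on, so it overstates real performance; a minimal sketch with a held-out test set (the 30% split and random_state=0 are arbitrary choices):

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, digits.target, test_size=0.3, random_state=0)
    mlp2 = MLPClassifier(hidden_layer_sizes=(30, 30, 30), activation='logistic', max_iter=1000)
    mlp2.fit(X_train, y_train)
    print(mlp2.score(X_test, y_test))  # accuracy on digits the model never saw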
    

    Random Forests

    def plot_estimator(estimator, X, y, title):
        # same helper as above, plus a title argument
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
        Z = estimator.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        plt.figure()
        plt.contourf(xx, yy, Z, alpha=0.4, cmap=plt.cm.rainbow)
        plt.scatter(X[:, 0], X[:, 1], c=y, alpha=1, cmap=plt.cm.RdYlBu)
        plt.title(title)
        plt.xlabel('Sepal.Length')
        plt.ylabel('Sepal.Width')
        plt.show()
    from sklearn.ensemble import RandomForestClassifier
    iris = load_iris()
    X = iris.data[:, [0, 1]]  # this time use sepal length and sepal width
    y = iris.target
    clf = RandomForestClassifier(n_estimators=100, criterion='gini', random_state=None)
    # n_estimators is the number of trees; more trees give a more stable fit, with diminishing returns
    clf.fit(X, y)
    plot_estimator(clf, X, y, 'RandomForestClassifier')  # plot the decision boundary
    
    (Figure: decision boundary with n_estimators=100)
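    A fitted random forest also reports how much each feature contributed to its splits; a quick look at the two features used here (feature_importances_ sums to 1):

    for name, imp in zip(iris.feature_names[:2], clf.feature_importances_):
        print(name, round(imp, 3))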

    Comparing the Classifiers

    from sklearn.svm import SVC
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.tree import DecisionTreeClassifier
    
    x = iris.data[:,[0,1]]
    y = iris.target
    clf1 = SVC(kernel='rbf')
    clf1.fit(x,y)
    clf2 = DecisionTreeClassifier()
    clf2.fit(x,y)
    clf3 = RandomForestClassifier(n_estimators=10,criterion='entropy')
    clf3.fit(x,y)
    clf4 = LogisticRegression()
    clf4.fit(x,y)
    plot_estimator(clf1,x,y,'rbf')
    plot_estimator(clf2,x,y,'DecisionTree')
    plot_estimator(clf3,x,y,'RandomForest')
    plot_estimator(clf4,x,y,'LogisticRegression')
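    The boundary plots show the shape of each model's decisions, not their quality; a minimal cross-validated comparison of the four models (5-fold is an arbitrary choice, and cross_val_score refits fresh copies internally):

    from sklearn.model_selection import cross_val_score
    for name, model in [('SVC-rbf', clf1), ('DecisionTree', clf2),
                        ('RandomForest', clf3), ('LogisticRegression', clf4)]:
        scores = cross_val_score(model, x, y, cv=5)
        print(name, round(scores.mean(), 3))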
    
