二元逻辑回归实现鸢尾花数据分类（python）

作者: 马尔代夫Maldives | 来源:发表于2019-03-09 18:17 被阅读0次

机器学习之逻辑回归
二元逻辑回归实现鸢尾花数据分类（python）
基于逻辑回归的分类预测
2018-10-19
从0开始实现逻辑回归算法(LogicRegression)
Python3机器学习实践：Softmax回归【实例：鸢尾花分类
逻辑回归算法（二）
README.md
逻辑回归
2018-10-20

说明：

本文利用python实现二元逻辑回归，没有加正则项。挑选iris数据前100个样本作为训练集，它们分属于两个类别（0类和1类），样本特征选择第1列（花萼长度x1）和第2列（花萼宽度x2）。
程序以函数形式实现，附录中给出封装式程序实现。代码均以矩阵计算的思路编写。
四部分程序是连续的，拷贝在一起就能完整运行。
没考虑过拟合、欠拟合等问题。

1.查看鸢尾花数据集

from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
import numpy as np

#%matplotlib

# Load the iris data set and keep the feature matrix and the class labels.
iris = load_iris()
data = iris.data      # numpy.ndarray of measurements
target = iris.target  # numpy.ndarray of class labels

# Author's notes on the data (uncomment a print to check):
#   print(data.shape)   -> (150, 4): 150 samples with 4 attributes each
#   print(target.shape) -> (150,): one label (0, 1 or 2) per sample; the first
#                          100 samples are classes 0 and 1, the last 50 class 2

# Figure 1: column 0 (x1) against column 1 (x2), one marker style per class.
plt.figure(1)
index_0 = np.where(target == 0)  # positions of the class-0 samples
index_1 = np.where(target == 1)  # positions of the class-1 samples
index_2 = np.where(target == 2)  # positions of the class-2 samples
for rows, style in (
    (index_0, {'marker': 'x', 'color': 'b', 'label': '0'}),
    (index_1, {'marker': 'o', 'color': 'r', 'label': '1'}),
    (index_2, {'marker': 's', 'color': 'k', 'label': '2'}),
):
    plt.scatter(data[rows, 0], data[rows, 1], s=15, **style)
plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.legend(loc='upper left')

上述程序的结果如下：

鸢尾花.png

2.二元逻辑回归程序实现

#定义二元逻辑回归需要的各个函数
def h_sigmod(W,X):
    """Evaluate the logistic hypothesis h(X) for the coefficients ``W``.

    The product of ``W`` with the transpose of ``X`` is passed through
    ``sigmod``. With ``W`` as a 1×n row vector and ``X`` as an m×n sample
    matrix (the shapes the training code builds), the result is 1×m.
    """
    return sigmod(np.dot(W, X.T))

def sigmod(X):
    """Apply the logistic function 1 / (1 + exp(-X)) to ``X``."""
    decay = np.exp(-X)
    return 1/(1+decay)

def train(X,y,learn_rate=0.001,num_iters=5000):
    """Fit binary logistic-regression coefficients with batch gradient steps.

    Parameters
    ----------
    X : 2-D array, one row per sample. The caller is expected to have
        prepended the constant column of ones that pairs with w0.
    y : column vector (m×1) holding the 0/1 class of every sample.
    learn_rate : step size applied to every update.
    num_iters : number of update steps (defaults to the original 5000).

    Returns
    -------
    (W, Loss) : ``W`` is the 1×n row vector of coefficients; ``Loss`` is a
        list with one entry per iteration, each the mean cross-entropy of
        that iteration stored as a one-element ndarray (``loss[0]``).
    """
    num_train,num_feature = X.shape
    # Small random start, reshaped into a single row vector.
    W = 0.001*np.random.randn(num_feature,1).reshape(1,-1)
    Loss = []  # loss value of every iteration

    for _ in range(num_iters):
        # Use the X that was passed in, not a module-level array.
        h = h_sigmod(W,X)  # 1×m

        # loss is mathematically a scalar but is carried as a 1×1 ndarray.
        loss = -(np.log(h).dot(y) + np.log(1-h).dot(1-y))
        loss = loss/num_train

        # (y - h(X)) * X; adding it to W below descends the loss.
        dW = (y.T-h).dot(X)
        W += learn_rate*dW  # W = W + a*(y - h(X))*X

        Loss.append(loss[0])

    return W,Loss

#利用训练得到的W系数对结果进行预测
def predict(W,X_test):
    """Classify the rows of ``X_test`` with the trained coefficients ``W``.

    h(X) is computed by ``h_sigmod`` and thresholded: values of at least
    0.5 become class 1, everything else class 0.
    """
    probabilities = h_sigmod(W, X_test)
    return np.where(probabilities >= 0.5, 1, 0)

3.利用鸢尾花数据对模型进行训练得到系数

## Train the model on part of the iris data and inspect the result ##########
# Extract the training subset from the raw data.
# The first 100 samples belong to classes 0 and 1; this example performs
# binary classification, so the class-2 samples are left out.
X = data[0:100,[0,1]]  # features x1 and x2: sepal length and sepal width
y = target[0:100].reshape(-1,1)  # class (0 or 1) of every sample, as a column vector
# print(X[:5])
# print(y[-5:])

# Prepend a column of ones as x0 so that it lines up with the intercept w0.
# This step is essential: without it the model has no intercept term.
one = np.ones((X.shape[0],1))
X_train = np.hstack((one,X))

W,Loss = train(X_train,y)  # fit: returns the coefficients and the loss history
print('预测的系数：[w0,w1,w2]={}'.format(W))  # print the trained coefficients

# Figure 2: loss value per iteration.
plt.figure(2)
plt.plot(Loss)
plt.xlabel('number of iteration')
plt.ylabel('Loss')
plt.show()

# Figure 3: training samples together with the learned decision boundary.
plt.figure(3)
# Flatten the (100, 1) label column first. np.where on the 2-D column returns
# (row_idx, col_idx), and using that pair to index the rows of X makes the
# all-zero column indices count as row numbers, which re-plots sample 0 in
# both classes.
label = y.ravel()
index_0 = np.where(label==0)
plt.scatter(X[index_0,0],X[index_0,1],marker='x',color = 'b',label = '0',s = 15)
index_1 = np.where(label==1)
plt.scatter(X[index_1,0],X[index_1,1],marker='o',color = 'r',label = '1',s = 15)

# Decision boundary: w0 + w1*x1 + w2*x2 = 0 solved for x2.
x1 = np.arange(4,7.5,0.5)
x2 = (- W[0][0] - W[0][1]*x1) / W[0][2]
plt.plot(x1,x2,color = 'black')
plt.xlabel('X1')
plt.ylabel('X2')
plt.legend(loc = 'upper left')
plt.show()  # figure 3 is created after the previous show(), so display it explicitly

##利用鸢尾花部分数据训练模型并查看结果

输出结果：

预测的系数：[w0,w1,w2]=[[-2.01401491  5.05730268 -8.19245827]]

损失函数.png

分类边界.png

4.利用前面得到的模型数据预测对其他数据进行分类

# Prediction test 1
# Feed the first 20 rows of X_train back to the classifier; if every
# prediction is 0 the classification of these samples is accurate.
X_test = X_train[:20, [0, 1, 2]]
result = predict(W, X_test)  # classify with the trained coefficients W
print("测试一结果:{}".format(result))  # expect all zeros

# Prediction test 2
# Temporarily use features x1 and x2 of the class-2 samples as test data.
# Judging from the first figure, on these two features alone the class-2
# flowers fall on the class-1 side, so every output should be 1.
Test_data = data[100:, [0, 1]]
one = np.ones((len(Test_data), 1))  # x0 column matching the intercept w0
X_test = np.concatenate((one, Test_data), axis=1)
result = predict(W, X_test)  # classify with the trained coefficients W
print("测试二结果:{}".format(result))  # expect all ones

输出分类结果：

测试一结果:[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]] #全正确
测试二结果:[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1]] #全正确

附录：上述过程的封装式实现如下

# -*- coding: utf-8 -*-
"""
Created on Thu Mar  7 19:50:46 2019

@author: sean
"""

from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
import numpy as np


# 多cell输出
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"  # IPython option; per the original note it enables several outputs per cell

#%matplotlib

# Load the iris data set and keep the feature matrix and the class labels.
iris = load_iris()
data = iris.data      # numpy.ndarray of measurements
target = iris.target  # numpy.ndarray of class labels

# Author's notes on the data (uncomment a print to check):
#   print(data.shape)   -> (150, 4): 150 samples with 4 attributes each
#   print(target.shape) -> (150,): one label (0, 1 or 2) per sample; the first
#                          100 samples are classes 0 and 1, the last 50 class 2

# Figure 1: column 0 (x1) against column 1 (x2), one marker style per class.
plt.figure(1)
index_0 = np.where(target == 0)  # positions of the class-0 samples
index_1 = np.where(target == 1)  # positions of the class-1 samples
index_2 = np.where(target == 2)  # positions of the class-2 samples
for rows, style in (
    (index_0, {'marker': 'x', 'color': 'b', 'label': '0'}),
    (index_1, {'marker': 'o', 'color': 'r', 'label': '1'}),
    (index_2, {'marker': 's', 'color': 'k', 'label': '2'}),
):
    plt.scatter(data[rows, 0], data[rows, 1], s=15, **style)
plt.xlabel('sepal length (x1)')
plt.ylabel('sepal width (x2)')
plt.legend(loc='upper left')

# Prepare the data ##########################################################
# The first 100 samples belong to classes 0 and 1; this example performs
# binary classification, so the class-2 samples are left out.
X = data[:100, [0, 1]]  # features x1 and x2: sepal length and sepal width
y = target[:100].reshape(-1, 1)  # class (0 or 1) of every sample, as a column vector
# print(X[:5])
# print(y[-5:])
# Prepend a column of ones as x0 so that it lines up with the intercept w0.
one = np.ones((len(X), 1))
X_train = np.concatenate((one, X), axis=1)


#构造一个二元逻辑回归分类器########################################################################
class logistic(object):
    """Binary logistic-regression classifier trained with batch gradient steps.

    Attributes set by ``train``:
        W    -- 1×n row vector of coefficients (``None`` before training).
        Loss -- list with the mean cross-entropy of every training iteration.
    """

    def __init__(self):
        self.W = None
        self.Loss = []

    def train(self,X,y,learn_rate = 0.001,num_iters = 5000):
        """Fit the coefficients to the samples ``X`` and their 0/1 labels ``y``.

        ``X`` is a 2-D array with one row per sample; the caller is expected
        to have prepended the column of ones that pairs with w0. ``y`` holds
        one 0/1 label per sample (column vector or 1-D). The weights are
        re-initialised and the loss history restarted on every call, so
        ``Loss`` always describes the most recent fit only.
        """
        _,num_feature = X.shape
        # Small random start, reshaped into a single row vector.
        self.W = 0.001*np.random.randn(num_feature,1).reshape((1,-1))
        self.Loss = []  # drop the history of any earlier fit

        for _ in range(num_iters):
            loss,dW = self.compute_loss(X,y)
            self.W += learn_rate*dW  # W = W + a*(y - h(X))*X
            self.Loss.append(loss[0][0])

    def compute_loss(self,X,y):
        """Return ``(loss, dW)`` for the current coefficients on ``X``/``y``.

        ``loss`` is the mean cross-entropy as a 1×1 ndarray; ``dW`` is
        ``(y - h(X)) * X``, the step direction that ``train`` adds to ``W``.
        """
        y = np.asarray(y).reshape(-1,1)  # accept 1-D labels as well as a column
        num_train = X.shape[0]
        h = self.h_sigmod(X)  # 1×m

        # loss is mathematically a scalar but is carried as a 1×1 ndarray.
        loss = -(np.log(h).dot(y) + np.log(1-h).dot(1-y))
        loss = loss/num_train

        # Use the X that was passed in, not a module-level array.
        dW = (y.T-h).dot(X)

        return loss,dW

    def h_sigmod(self,X):
        """Evaluate h(X) = sigmod(W · Xᵀ); 1×m for an m-row ``X``."""
        g = np.dot(self.W,X.T)  # 1×m
        return self.sigmod(g)

    def sigmod(self,X):
        """Apply the logistic function 1 / (1 + exp(-X)) to ``X``."""
        return 1/(1+np.exp(-X))

    def predict(self,X):
        """Classify the rows of ``X``: 1 where h(X) >= 0.5, otherwise 0."""
        h = self.h_sigmod(X)
        y_pred = np.where(h>=0.5,1,0)
        return y_pred

# Create a binary logistic-regression classifier object #####################
my_logistic = logistic()  # build the classifier
my_logistic.train(X_train,y)  # fit on X_train and y; this sets W and Loss
Loss = my_logistic.Loss
W = my_logistic.W
#print( 'W={}'.format(my_logistic.W) )
#print( 'Loss={}'.format(my_logistic.Loss) )

# Figure 2: loss value per iteration.
plt.figure(2)
plt.plot(Loss)
plt.xlabel('Iteration number')
plt.ylabel('Loss value')
plt.show()

# Figure 3: the first 100 iris samples on features x1 and x2, together with
# the boundary found by the logistic regression.
plt.figure(3)
# Flatten the (100, 1) label column first. np.where on the 2-D column returns
# (row_idx, col_idx), and using that pair to index the rows of X makes the
# all-zero column indices count as row numbers, which re-plots sample 0 in
# both classes.
label = y.ravel()
index_0 = np.where(label==0)
plt.scatter(X[index_0,0],X[index_0,1],marker='x',color = 'b',label = '0',s = 15)
index_1 = np.where(label==1)
plt.scatter(X[index_1,0],X[index_1,1],marker='o',color = 'r',label = '1',s = 15)
# Decision boundary from the trained coefficients:
# w0 + w1*x1 + w2*x2 = 0 solved for x2.
x1 = np.arange(4,7.5,0.5)
x2 = (- W[0][0] - W[0][1]*x1) / W[0][2]
plt.plot(x1,x2,color = 'black')
plt.xlabel('X1')
plt.ylabel('X2')
plt.legend(loc = 'upper left')
plt.show()  # figure 3 is created after the previous show(), so display it explicitly

# Classify new samples with the trained parameters.
# Prediction test 1
# Feed the first 20 rows of X_train back to the classifier; if every
# prediction is 0 the classification of these samples is accurate.
X_test= X_train[:20,[0,1,2]]
result = my_logistic.predict(X_test)  # predict the test data
print("测试一结果:{}".format(result))  # check how many entries are 0

# Prediction test 2
# Temporarily use features x1 and x2 of the class-2 samples as test data.
# Judging from the first figure, on these two features alone the class-2
# flowers fall on the class-1 side, so every output should be 1.
Test_data = data[100:,[0,1]]
one = np.ones((Test_data.shape[0],1))
X_test = np.hstack((one,Test_data))

result = my_logistic.predict(X_test)
print("测试二结果:{}".format(result))  # expect all ones

机器学习之逻辑回归
逻辑回归场景简介：使用逻辑回归实现对鸢尾花的分类预测。基于python基本库的实现模型说明逻辑回归是一个分类...
二元逻辑回归实现鸢尾花数据分类（python）
说明：本文利用python实现二元逻辑回归，没有加正则项。挑选iris数据前100个样本作为训练集，他们分属于两...
基于逻辑回归的分类预测
基于逻辑回归的分类预测学习目标了解逻辑回归的理论掌握逻辑回归的sklearn函数调用并将运用到鸢尾花数据集预测 ...
2018-10-19
Python与数据挖掘(二)——逻辑回归逻辑回归一般用来实现分类一、算法原理 1、预测函数 2、将预测函数的输...
从0开始实现逻辑回归算法(LogicRegression)
从0开始实现逻辑回归算法(LogicRegression) 逻辑回归（LR）算法是一个比较常见的二元分类算法，通常...
Python3机器学习实践：Softmax回归【实例：鸢尾花分类
Softmax回归可看作逻辑回归的扩展，用于解决多分类问题。鸢尾花数据集是机器学习中比较常用的，数据集包含150...
逻辑回归算法（二）
一、线型回归模型（Linear Regression）为了更好的实现分类，逻辑回归诞生了。 [逻辑回归是假设数据...
README.md
ex1 - 线性回归 ex2 - 逻辑回归(二元分类) ex3 - 逻辑回归(多元分类) 与神经网络 ex4 -...
逻辑回归
逻辑回归是一个分类算法，利用回归来做分类.它可以处理二元分类以及多元分类，逻辑回归与线性回归不同主要体现在以下两点...
2018-10-20
Python与数据挖掘(二)——逻辑回归 2、算法实现 import pandas as pdfrom sklea...