机器学习-Logistics

作者: 伍小劲 | 来源:发表于2019-01-16 17:21 被阅读0次

机器学习-Logistics
机器学习——softmax计算
机器学习实战之python实现logistics算法
Logistic回归与最大熵模型-理论推导
连载 | 机器学习基石 Lec10：Logistics Regr
013-物流术语
Logistics
物流行业相关单词
Logistic Regression（吴恩达课程Octave代
Logistics回归

机器学习-Logistics

使用Minst数据集进行分类

from numpy import *
import operator
import os
import numpy as np
import time
from scipy.special import expit
import scipy.optimize as opt
import matplotlib.pyplot as plt
from matplotlib import cm
from os import listdir
from mpl_toolkits.mplot3d import Axes3D
import struct
import math
from sklearn.metrics import classification_report


#读取图片
def read_image(file_name):
    #先用二进制方式把文件都读进来
    file_handle=open(file_name,"rb")  #以二进制打开文档
    file_content=file_handle.read()   #读取到缓冲区中
    offset=0
    head = struct.unpack_from('>IIII', file_content, offset)  # 取前4个整数，返回一个元组
    offset += struct.calcsize('>IIII')
    imgNum = head[1]  #图片数
    rows = head[2]   #宽度
    cols = head[3]  #高度

    images=np.empty((imgNum , 784))#empty，是它所常见的数组内的所有元素均为空，没有实际意义，它是创建数组最快的方法
    image_size=rows*cols#单个图片的大小
    fmt='>' + str(image_size) + 'B'#单个图片的format

    for i in range(imgNum):
        images[i] = np.array(struct.unpack_from(fmt, file_content, offset))
        # images[i] = np.array(struct.unpack_from(fmt, file_content, offset)).reshape((rows, cols))
        offset += struct.calcsize(fmt)
    return images

#读取标签
def read_label(file_name):
    file_handle = open(file_name, "rb")  # 以二进制打开文档
    file_content = file_handle.read()  # 读取到缓冲区中

    head = struct.unpack_from('>II', file_content, 0)  # 取前2个整数，返回一个元组
    offset = struct.calcsize('>II')

    labelNum = head[1]  # label数
    # print(labelNum)
    bitsString = '>' + str(labelNum) + 'B'  # fmt格式：'>47040000B'
    label = struct.unpack_from(bitsString, file_content, offset)  # 取data数据，返回一个元组
    return np.array(label)

# 读取文件
def loadDataSet_Minst():
    train_x_filename="data\\train-images.idx3-ubyte"
    train_y_filename="data\\train-labels.idx1-ubyte"
    test_x_filename="data\\t10k-images.idx3-ubyte"
    test_y_filename="data\\t10k-labels.idx1-ubyte"
    train_x=read_image(train_x_filename)
    train_y=read_label(train_y_filename)
    test_x=read_image(test_x_filename)
    test_y=read_label(test_y_filename)

    # # # #调试的时候让速度快点，就先减少数据集大小
#     train_x=train_x[0:1000,:]
#     train_y=train_y[0:1000]
#     test_x=test_x[0:500,:]
#     test_y=test_y[0:500]
    print("train_x_shape: ", train_x.shape)
    print("train_y_shape: ", train_y.shape)
    print("test_x_shape: ", test_x.shape)
    print("test_y_shape: ", test_y.shape)

    return train_x, test_x, train_y, test_y

# 分类向量
def classifyVector(inX,weights):#这里的inX相当于test_data,以回归系数和特征向量作为输入来计算对应的sigmoid
    prob=sigmoid(sum(inX*weights))
    if prob>0.5:return 1.0
    else: return 0.0

正则化代价函数

$J\left( \theta \right)=\frac{1}{m}\sum\limits_{i=1}^{m}{[-{{y}^{(i)}}\log \left( {{h}_{\theta }}\left( {{x}^{(i)}} \right) \right)-\left( 1-{{y}^{(i)}} \right)\log \left( 1-{{h}_{\theta }}\left( {{x}^{(i)}} \right) \right)]}+\frac{\lambda }{2m}\sum\limits_{j=1}^{n}{\theta _{j}^{2}}$

# 计算正则化代价函数
def logstic_cost(theta, X, y, l):
    m = X.shape[0]#m为样本数
    h_theta = expit(np.dot(X, theta)).reshape((m,1))
    logistic_term = (np.dot(np.log(h_theta).T,y)+np.dot(np.log(1-h_theta).T, (1-y))) / (-m)
    regularized_term = (l / (2 * m)) * np.power(theta, 2).sum()
    return logistic_term + regularized_term

正则化梯度

$\frac{\partial J\left( \theta \right)}{\partial {{\theta }_{j}}}=\left( \frac{1}{m}\sum\limits_{i=1}^{m}{({{h}_{\theta }}\left( {{x}^{(i)}} \right)-{{y}^{(i)}})x_{_{j}}^{(i)}}\right)+\frac{\lambda }{m}{{\theta }_{j}}\text{ }\text{ for j}\ge \text{1}$

# 计算梯度
def gradient(theta, X, y, l):
    m = X.shape[0]
    h_theta = expit(np.dot(X, theta)).reshape((m,1))
    logistic_term =(np.dot(X.T,(h_theta-y)))/m
    regularized_theta = (l / m) * theta[1:]
    regularized_term = np.concatenate((np.array([[0]]), regularized_theta), axis=0)
    return logistic_term + regularized_term

# 模型训练
def train_model(train_x,train_y,theta,learning_rate,iterationNum,numClass,lambd):#theta是n+1行的列向量
    m=train_x.shape[0]
    n=train_x.shape[1]
    train_x=np.insert(train_x,0,values=1,axis=1)
    J_theta = np.zeros((iterationNum,numClass))

    for k in range(numClass):
        print("类别 - ", k, " - 模型训练：")
        real_y=np.zeros((m,1))
        index=train_y==k#index中存放的是train_y中等于0的索引
        real_y[index]=1#在real_y中修改相应的index对应的值为1，先分类0和非0
        log_cost = []

        for j in range(iterationNum):
            temp_theta = theta[:,k].reshape((n+1,1))#这里是（785，1）
            J_theta[j,k] = logstic_cost(temp_theta,train_x,real_y,lambd)
            temp_theta = temp_theta - learning_rate*gradient(temp_theta,train_x,real_y,lambd)
            theta[:, k] = temp_theta.reshape((n+1,))#这里是（785，1）
            
            #对代价进行作图
            log_cost.append(J_theta[j,k])
            cost_plot(j,log_cost,path=10)

    return theta#返回的theta是n*numClass矩阵

# 代价函数作图
def cost_plot(epoch_len, cost, path):
    if (epoch_len%path == 0 and epoch_len/path>0 ):
        print("迭代周期-epoch:", epoch_len)
        plt.figure(figsize=(16,5))
        #画一张代价函数图
        plt.subplot(121)
        plt.plot(np.array(range(epoch_len+1)), cost, marker='.')
        plt.grid(True)
        plt.xlabel('epoch')
        plt.ylabel('cost')
        plt.title('Cost Function')
        plt.show()

# 训练集验证
def predict_train(train_x,train_y,theta):#这里的theta是学习得来的最好的theta，是n*numClass的矩阵
    errorCount=0
    train_x = np.insert(train_x, 0, values=1, axis=1)
    m = train_x.shape[0]

    h_theta=expit(np.dot(train_x,theta))#h_theta是m*numClass的矩阵，因为test_x是m*n，theta是n*numClass
    h_theta_max = h_theta.max(axis=1)  # 获得每行的最大值,h_theta_max是m*1的矩阵，列向量
    h_theta_max_postion=h_theta.argmax(axis=1)#获得每行的最大值的label
    for i in range(m):
        if train_y[i]!=h_theta_max_postion[i]:
            errorCount+=1

    error_rate = float(errorCount) / m
    true_rate = (1-error_rate)*100
    print("训练集准确率：%2f %%" % true_rate)
    print("*************************************")
    print("训练集分类报告")
    print(classification_report(train_y, h_theta_max_postion))
    return error_rate


# 测试集验证
def predict_test(test_x,test_y,theta):#这里的theta是学习得来的最好的theta，是n*numClass的矩阵
    errorCount=0
    test_x = np.insert(test_x, 0, values=1, axis=1)
    m = test_x.shape[0]

    h_theta=expit(np.dot(test_x,theta))#h_theta是m*numClass的矩阵，因为test_x是m*n，theta是n*numClass
    h_theta_max = h_theta.max(axis=1)  # 获得每行的最大值,h_theta_max是m*1的矩阵，列向量
    h_theta_max_postion=h_theta.argmax(axis=1)#获得每行的最大值的label
    for i in range(m):
        if test_y[i]!=h_theta_max_postion[i]:
            errorCount+=1

    error_rate = float(errorCount) / m
    true_rate = (1-error_rate)*100
    print("验证集准确率：%2f %%" % true_rate)
    print("*************************************")
    print("验证集分类报告")
    print(classification_report(test_y, h_theta_max_postion))
    return error_rate

# 主函数验证
if __name__=='__main__':
    print("读入数据...")
    time1=time.time()
    train_x, test_x, train_y, test_y = loadDataSet_Minst()
#     train_x, test_x, train_y, test_y = loadDataSet('data\\Customer.txt', test_size=0.1)
    time2=time.time()
    print("读入数据耗时：",time2-time1,"second")
    print("*************************************")

    numClass=len(set(test_y)) #这里是类别，这里取的是标签的集合的长度，也就表示类别的个数
    iteration = 100
    learning_rate = 0.00001
    n=test_x.shape[1]+1
    lambd=0 #正则项 l代表的是lambda的值，0代表lambda=0； 1代表lambda=1
    
    #这里的theta应该是785*10
    theta=np.zeros((n,numClass))# theta=np.random.rand(n,1)#随机构造n*numClass的矩阵,因为有numClass个分类器，所以应该返回的是numClass个列向量（n*1）
#     theta=np.random.random((n,numClass))
    
    print("开始训练数据...")
    theta_new = train_model(train_x, train_y, theta, learning_rate, iteration, numClass,lambd)
    time3 = time.time()
    print("训练数据耗时： ", time3 - time2, "second")
    predict_train(train_x, train_y, theta_new)
    print("*************************************")

    print("开始预测数据...")
    predict_test(test_x, test_y, theta_new)
#     mulitPredict(test_x,test_y,theta_new)
    time4=time.time()
    print("预测数据耗时： ",time4-time3,"second")


# 主函数验证
if __name__=='__main__':
    print("读入数据...")
    time1=time.time()
#     train_x, test_x, train_y, test_y = loadDataSet_Minst()
    train_x, test_x, train_y, test_y = loadDataSet('data\\Customer.txt', test_size=0.1)
    time2=time.time()
    print("读入数据耗时：",time2-time1,"second")
    print("*************************************")

    numClass=len(set(test_y)) #这里是类别，这里取的是标签的集合的长度，也就表示类别的个数
    iteration = 1500
    learning_rate = 0.25
    n=test_x.shape[1]+1
    l=0
    #正则项 l代表的是lambda的值，0代表lambda=0； 1代表lambda=1
    
    #这里的theta应该是785*10
    theta=np.zeros((n,numClass))# theta=np.random.rand(n,1)#随机构造n*numClass的矩阵,因为有numClass个分类器，所以应该返回的是numClass个列向量（n*1）
    
    print("开始训练数据...")
    theta_new = train_model(train_x, train_y, theta, learning_rate, iteration, numClass,l)
    time3 = time.time()
    print("训练数据耗时： ", time3 - time2, "second")
    predict_train(train_x, train_y, theta_new)
    print("*************************************")

    print("开始预测数据...")
    predict_test(test_x, test_y, theta_new)
#     mulitPredict(test_x,test_y,theta_new)
    time4=time.time()
    print("预测数据耗时： ",time4-time3,"second")


#     numClass=len(set(test_y))
#     iteration = 1000
#     learning_rate = 0.00001
#     lambd=0
#     准确度达到：91%
#     
#     iteration = 10000
#     learning_rate = 0.00001
#     lambd=0
#     准确度达到：92%

本文使用Python实现Logistics，对Minst数据集进行分类，最终效果也仅仅到达92%。

机器学习-Logistics
机器学习-Logistics 使用Minst数据集进行分类正则化代价函数正则化梯度本文使用Python实现L...
机器学习——softmax计算
概念与应用 Softmax是机器学习中一个非常重要的工具，他可以兼容 logistics 算法、可以独立作为机器学...
机器学习实战之python实现logistics算法
从疝气病症预测病马的死亡率
Logistic回归与最大熵模型-理论推导
1. logistics 回归模型 1.1 logistics模型构建对于数据集有Logistics模型的基本思...
连载 | 机器学习基石 Lec10：Logistics Regr
Tips：符号主要参照 Lec1，部分参照其他Lec～上一节介绍了线性回归，线性回归是输出一个 score，可以...
013-物流术语
一、物流名词术语 1 article物品 2 logistics物流 3 logistics activity物流...
Logistics
Logistics department Best regards Purchase department Tec...
物流行业相关单词
基本概念术语 1.物流 logistics2.物品 article3.物流活动 logistics activit...
Logistic Regression（吴恩达课程Octave代
详细代码参考：github Logistics回归实例：建立Logistics模型，根据学生的两门考试成绩，判断...
Logistics回归
一、概念 Logistic Regression（逻辑回归）是机器学习中一个非常非常常见的模型，在实际生产环境中也...