机器学习-Logistics
使用Minst数据集进行分类
from numpy import *
import operator
import os
import numpy as np
import time
from scipy.special import expit
import scipy.optimize as opt
import matplotlib.pyplot as plt
from matplotlib import cm
from os import listdir
from mpl_toolkits.mplot3d import Axes3D
import struct
import math
from sklearn.metrics import classification_report
#读取图片
def read_image(file_name):
#先用二进制方式把文件都读进来
file_handle=open(file_name,"rb") #以二进制打开文档
file_content=file_handle.read() #读取到缓冲区中
offset=0
head = struct.unpack_from('>IIII', file_content, offset) # 取前4个整数,返回一个元组
offset += struct.calcsize('>IIII')
imgNum = head[1] #图片数
rows = head[2] #宽度
cols = head[3] #高度
images=np.empty((imgNum , 784))#empty,是它所常见的数组内的所有元素均为空,没有实际意义,它是创建数组最快的方法
image_size=rows*cols#单个图片的大小
fmt='>' + str(image_size) + 'B'#单个图片的format
for i in range(imgNum):
images[i] = np.array(struct.unpack_from(fmt, file_content, offset))
# images[i] = np.array(struct.unpack_from(fmt, file_content, offset)).reshape((rows, cols))
offset += struct.calcsize(fmt)
return images
#读取标签
def read_label(file_name):
file_handle = open(file_name, "rb") # 以二进制打开文档
file_content = file_handle.read() # 读取到缓冲区中
head = struct.unpack_from('>II', file_content, 0) # 取前2个整数,返回一个元组
offset = struct.calcsize('>II')
labelNum = head[1] # label数
# print(labelNum)
bitsString = '>' + str(labelNum) + 'B' # fmt格式:'>47040000B'
label = struct.unpack_from(bitsString, file_content, offset) # 取data数据,返回一个元组
return np.array(label)
# 读取文件
def loadDataSet_Minst():
train_x_filename="data\\train-images.idx3-ubyte"
train_y_filename="data\\train-labels.idx1-ubyte"
test_x_filename="data\\t10k-images.idx3-ubyte"
test_y_filename="data\\t10k-labels.idx1-ubyte"
train_x=read_image(train_x_filename)
train_y=read_label(train_y_filename)
test_x=read_image(test_x_filename)
test_y=read_label(test_y_filename)
# # # #调试的时候让速度快点,就先减少数据集大小
# train_x=train_x[0:1000,:]
# train_y=train_y[0:1000]
# test_x=test_x[0:500,:]
# test_y=test_y[0:500]
print("train_x_shape: ", train_x.shape)
print("train_y_shape: ", train_y.shape)
print("test_x_shape: ", test_x.shape)
print("test_y_shape: ", test_y.shape)
return train_x, test_x, train_y, test_y
# 分类向量
def classifyVector(inX,weights):#这里的inX相当于test_data,以回归系数和特征向量作为输入来计算对应的sigmoid
prob=sigmoid(sum(inX*weights))
if prob>0.5:return 1.0
else: return 0.0
正则化代价函数
# 计算正则化代价函数
def logstic_cost(theta, X, y, l):
m = X.shape[0]#m为样本数
h_theta = expit(np.dot(X, theta)).reshape((m,1))
logistic_term = (np.dot(np.log(h_theta).T,y)+np.dot(np.log(1-h_theta).T, (1-y))) / (-m)
regularized_term = (l / (2 * m)) * np.power(theta, 2).sum()
return logistic_term + regularized_term
正则化梯度
# 计算梯度
def gradient(theta, X, y, l):
m = X.shape[0]
h_theta = expit(np.dot(X, theta)).reshape((m,1))
logistic_term =(np.dot(X.T,(h_theta-y)))/m
regularized_theta = (l / m) * theta[1:]
regularized_term = np.concatenate((np.array([[0]]), regularized_theta), axis=0)
return logistic_term + regularized_term
# 模型训练
def train_model(train_x,train_y,theta,learning_rate,iterationNum,numClass,lambd):#theta是n+1行的列向量
m=train_x.shape[0]
n=train_x.shape[1]
train_x=np.insert(train_x,0,values=1,axis=1)
J_theta = np.zeros((iterationNum,numClass))
for k in range(numClass):
print("类别 - ", k, " - 模型训练:")
real_y=np.zeros((m,1))
index=train_y==k#index中存放的是train_y中等于0的索引
real_y[index]=1#在real_y中修改相应的index对应的值为1,先分类0和非0
log_cost = []
for j in range(iterationNum):
temp_theta = theta[:,k].reshape((n+1,1))#这里是(785,1)
J_theta[j,k] = logstic_cost(temp_theta,train_x,real_y,lambd)
temp_theta = temp_theta - learning_rate*gradient(temp_theta,train_x,real_y,lambd)
theta[:, k] = temp_theta.reshape((n+1,))#这里是(785,1)
#对代价进行作图
log_cost.append(J_theta[j,k])
cost_plot(j,log_cost,path=10)
return theta#返回的theta是n*numClass矩阵
# 代价函数作图
def cost_plot(epoch_len, cost, path):
if (epoch_len%path == 0 and epoch_len/path>0 ):
print("迭代周期-epoch:", epoch_len)
plt.figure(figsize=(16,5))
#画一张代价函数图
plt.subplot(121)
plt.plot(np.array(range(epoch_len+1)), cost, marker='.')
plt.grid(True)
plt.xlabel('epoch')
plt.ylabel('cost')
plt.title('Cost Function')
plt.show()
# 训练集验证
def predict_train(train_x,train_y,theta):#这里的theta是学习得来的最好的theta,是n*numClass的矩阵
errorCount=0
train_x = np.insert(train_x, 0, values=1, axis=1)
m = train_x.shape[0]
h_theta=expit(np.dot(train_x,theta))#h_theta是m*numClass的矩阵,因为test_x是m*n,theta是n*numClass
h_theta_max = h_theta.max(axis=1) # 获得每行的最大值,h_theta_max是m*1的矩阵,列向量
h_theta_max_postion=h_theta.argmax(axis=1)#获得每行的最大值的label
for i in range(m):
if train_y[i]!=h_theta_max_postion[i]:
errorCount+=1
error_rate = float(errorCount) / m
true_rate = (1-error_rate)*100
print("训练集准确率:%2f %%" % true_rate)
print("*************************************")
print("训练集分类报告")
print(classification_report(train_y, h_theta_max_postion))
return error_rate
# 测试集验证
def predict_test(test_x,test_y,theta):#这里的theta是学习得来的最好的theta,是n*numClass的矩阵
errorCount=0
test_x = np.insert(test_x, 0, values=1, axis=1)
m = test_x.shape[0]
h_theta=expit(np.dot(test_x,theta))#h_theta是m*numClass的矩阵,因为test_x是m*n,theta是n*numClass
h_theta_max = h_theta.max(axis=1) # 获得每行的最大值,h_theta_max是m*1的矩阵,列向量
h_theta_max_postion=h_theta.argmax(axis=1)#获得每行的最大值的label
for i in range(m):
if test_y[i]!=h_theta_max_postion[i]:
errorCount+=1
error_rate = float(errorCount) / m
true_rate = (1-error_rate)*100
print("验证集准确率:%2f %%" % true_rate)
print("*************************************")
print("验证集分类报告")
print(classification_report(test_y, h_theta_max_postion))
return error_rate
# 主函数验证
if __name__=='__main__':
print("读入数据...")
time1=time.time()
train_x, test_x, train_y, test_y = loadDataSet_Minst()
# train_x, test_x, train_y, test_y = loadDataSet('data\\Customer.txt', test_size=0.1)
time2=time.time()
print("读入数据耗时:",time2-time1,"second")
print("*************************************")
numClass=len(set(test_y)) #这里是类别,这里取的是标签的集合的长度,也就表示类别的个数
iteration = 100
learning_rate = 0.00001
n=test_x.shape[1]+1
lambd=0 #正则项 l代表的是lambda的值,0代表lambda=0; 1代表lambda=1
#这里的theta应该是785*10
theta=np.zeros((n,numClass))# theta=np.random.rand(n,1)#随机构造n*numClass的矩阵,因为有numClass个分类器,所以应该返回的是numClass个列向量(n*1)
# theta=np.random.random((n,numClass))
print("开始训练数据...")
theta_new = train_model(train_x, train_y, theta, learning_rate, iteration, numClass,lambd)
time3 = time.time()
print("训练数据耗时: ", time3 - time2, "second")
predict_train(train_x, train_y, theta_new)
print("*************************************")
print("开始预测数据...")
predict_test(test_x, test_y, theta_new)
# mulitPredict(test_x,test_y,theta_new)
time4=time.time()
print("预测数据耗时: ",time4-time3,"second")
# 主函数验证
if __name__=='__main__':
print("读入数据...")
time1=time.time()
# train_x, test_x, train_y, test_y = loadDataSet_Minst()
train_x, test_x, train_y, test_y = loadDataSet('data\\Customer.txt', test_size=0.1)
time2=time.time()
print("读入数据耗时:",time2-time1,"second")
print("*************************************")
numClass=len(set(test_y)) #这里是类别,这里取的是标签的集合的长度,也就表示类别的个数
iteration = 1500
learning_rate = 0.25
n=test_x.shape[1]+1
l=0
#正则项 l代表的是lambda的值,0代表lambda=0; 1代表lambda=1
#这里的theta应该是785*10
theta=np.zeros((n,numClass))# theta=np.random.rand(n,1)#随机构造n*numClass的矩阵,因为有numClass个分类器,所以应该返回的是numClass个列向量(n*1)
print("开始训练数据...")
theta_new = train_model(train_x, train_y, theta, learning_rate, iteration, numClass,l)
time3 = time.time()
print("训练数据耗时: ", time3 - time2, "second")
predict_train(train_x, train_y, theta_new)
print("*************************************")
print("开始预测数据...")
predict_test(test_x, test_y, theta_new)
# mulitPredict(test_x,test_y,theta_new)
time4=time.time()
print("预测数据耗时: ",time4-time3,"second")
# numClass=len(set(test_y))
# iteration = 1000
# learning_rate = 0.00001
# lambd=0
# 准确度达到:91%
#
# iteration = 10000
# learning_rate = 0.00001
# lambd=0
# 准确度达到:92%
本文使用Python实现Logistics,对Minst数据集进行分类,最终效果也仅仅到达92%。
网友评论