# 1) 样本 2) KNN 3) CNN
# 旧瓶装新酒:MNIST数字识别的不同
# -> 网络 每一级 原理和代码
# 1) 样本 yann.lecun.com/exdb/mnist
# 2) knn最近邻域法
# 本质:k个与当前相似的,其中相似性最大的数字作为结果(10图中有8个描述为1)
# 步骤: 1 load data
# 2 距离计算 knn test and train distance 5 * 500 = 2500个距离
# 3 knn 利用距离找到k个最近的图片 test5 与 train500, 从train500中找出4张与测试图片最接近的图片
# 4 解析k个最近的图片-> parse content label(得到label)
# 5 label -> 数字
# 6 检测概率统计
# 小结:
import numpy as np
import random
import tensorflow as tf
# Work around the deprecation warnings emitted by the MNIST helper: lower the
# TF log level to ERROR *before* importing it. The previous level is saved in
# old_v (it is not restored anywhere in the code visible here).
old_v = tf.logging.get_verbosity()
tf.logging.set_verbosity(tf.logging.ERROR)
from tensorflow.examples.tutorials.mnist import input_data
# 1) Load the data set from 'MNIST_data'. one_hot: each label is an array in
#    which exactly one entry is 1 and the rest are 0.
mnist = input_data.read_data_sets('MNIST_data', one_hot = True)
# Experiment settings.
# Pool sizes are read from the loaded arrays instead of being hard-coded, so
# the random indices below can never fall outside the data that was loaded.
trainNum = len(mnist.train.images)
testNum = len(mnist.test.images)
trainSize = 500  # number of training images sampled as KNN reference points
testSize = 5     # number of test images to classify
k = 4            # number of nearest neighbours considered per test image

# Draw trainSize / testSize distinct indices from [0, trainNum) / [0, testNum);
# replace=False means no index is picked twice.
trainIndex = np.random.choice(trainNum, trainSize, replace=False)
testIndex = np.random.choice(testNum, testSize, replace=False)

trainData = mnist.train.images[trainIndex]    # sampled training images
trainLabel = mnist.train.labels[trainIndex]   # their one-hot labels
testData = mnist.test.images[testIndex]       # sampled test images
testLabel = mnist.test.labels[testIndex]      # their one-hot labels

print('trainData.shape = ', trainData.shape)
print('trainLabel.shape = ', trainLabel.shape)
print('testData.shape = ', testData.shape)
print('testLabel.shape = ', testLabel.shape)
# Each row of testLabel is one one-hot label, e.g.
#   [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.] -> digit 1 (testData[0])
#   [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.] -> digit 9 (testData[1])
# There are testSize rows in total.
print('testLabel = ', testLabel)
# Graph inputs. Every image is a flat vector of 784 floats and every label a
# vector of 10 floats; the leading dimension is None so any number of rows can
# be fed. The trailing comments give the shapes fed with the settings above.
trainDataInput = tf.placeholder(tf.float32, shape=[None, 784])   # (500, 784)
trainLabelInput = tf.placeholder(tf.float32, shape=[None, 10])   # (500, 10)
testDataInput = tf.placeholder(tf.float32, shape=[None, 784])    # (5, 784)
testLabelInput = tf.placeholder(tf.float32, shape=[None, 10])    # (5, 10)
# 2) KNN distance.
# Insert a new axis so the test batch (5, 784) becomes (5, 1, 784); subtracting
# it from the training batch (500, 784) then broadcasts to (5, 500, 784), i.e.
# one 784-pixel difference vector per (test image, training image) pair.
f1 = tf.expand_dims(testDataInput, 1)
f2 = tf.subtract(trainDataInput, f1)
# f3: (5, 500). Sum of absolute pixel differences over the pixel axis (axis 2),
# so f3[i, j] is the L1 distance between test image i and training image j.
f3 = tf.reduce_sum(tf.abs(f2), axis=2)

# 3) Find the k training images closest to each test image.
# top_k returns the largest values, so negate the distances first: the k
# largest entries of f4 are the k smallest distances in f3.
f4 = tf.negative(f3)
# f5: the (negated) distances, f6: the indices of those training images.
# Uses the k setting defined with the other experiment settings rather than a
# literal, so changing k there actually changes the neighbour count.
f5, f6 = tf.nn.top_k(f4, k=k)

# 4) Look up the labels of the neighbours: the indices in f6 select rows of
#    trainLabelInput, giving one one-hot label per neighbour.
f7 = tf.gather(trainLabelInput, f6)

# 5) Labels -> digit.
# Summing the one-hot labels over the neighbour axis (axis 1) gives a vote
# count per digit for every test image ...
f8 = tf.reduce_sum(f7, axis=1)
# ... and the index of the largest count is the predicted digit.
f9 = tf.argmax(f8, axis=1)

# 6) Ground-truth digits, used to measure accuracy. This is built from the
#    numpy array testLabel (not from the testLabelInput placeholder), so it
#    needs no feed. All sampled test labels are used.
f10 = tf.argmax(testLabel, axis=1)
with tf.Session() as sess:
    # One feed dict serves every fetch: all sampled test images, the sampled
    # training images and the training labels.
    feed = {
        trainDataInput: trainData,
        testDataInput: testData,
        trainLabelInput: trainLabel,
    }
    # Evaluate every intermediate tensor in a single run so each stage is
    # computed once; the prints below then walk through the stages in order.
    p1, p2, p3, p4, p5, p6, p7, p8, p9, p10 = sess.run(
        (f1, f2, f3, f4, f5, f6, f7, f8, f9, f10), feed_dict=feed)

    # p1: the test images with an extra axis, (5, 1, 784) with the default settings.
    print('p1.shape = ', p1.shape)
    # p2: per-pixel differences, (5, 500, 784); p2[1, 100] is the difference
    # between the 2nd test image and the 101st training image.
    print('p2.shape = ', p2.shape)
    # p3: L1 distances, (5, 500). The values differ on every run because the
    # train/test samples are drawn at random.
    print('p3.shape = ', p3.shape)
    print('p3[0, 0]', p3[0, 0])
    # p4: the negated distances, same shape as p3.
    print('p4.shape = ', p4.shape)
    print('p4[0, 0] = ', p4[0, 0])
    # p5: negated distances of the nearest neighbours, p6: their indices in the
    # training sample; both are (5, 4) with the default settings.
    print('p5.shape = ', p5.shape)
    print('p6.shape = ', p6.shape)
    print('p5[0, 0] = ', p5[0, 0])
    print('p6[0, 0] = ', p6[0, 0])
    print('p5[0] = ', p5[0])
    print('p6[0] = ', p6[0])
    # p7: (5, 4, 10). One group per test image; each row in a group is the
    # one-hot label of one of its nearest training images.
    print('p7.shape = ', p7.shape)
    print('p7 = ', p7)
    # p8: (5, 10). The neighbour labels summed per test image, e.g. four
    # neighbours all labelled 3 give [0. 0. 0. 4. 0. 0. 0. 0. 0. 0.].
    print('p8.shape = ', p8.shape)
    print('p8 = ', p8)
    # p9: predicted digit per test image (index of the largest vote count),
    # e.g. votes [0. 0. 0. 0. 3. 0. 0. 0. 1. 0.] -> 4.
    print('p9.shape = ', p9.shape)
    print('p9 = ', p9)
    # p10: the true digit per test image.
    print('p10 = ', p10)

    # Accuracy in percent: count the positions where prediction and truth
    # agree. Pairing with zip and dividing by the number of labels avoids a
    # hard-coded test count.
    j = sum(1 for predicted, actual in zip(p9, p10) if predicted == actual)
    # Many reference images and very few test images, so this is typically
    # high (60 / 80 / 100 ...).
    print('ac = ', j * 100 / len(p10))
# 准确度为100%或80%……
# 打印值与准确度结果如下:
# ![](https://img.haomeiwen.com/i4479796/35f1659249a40f4b.png)
# ![](https://img.haomeiwen.com/i4479796/e0b57362eb50345b.png)
# 网友评论