美文网首页
在PaddlePaddle中实现MNIST数据集训练:高层API

在PaddlePaddle中实现MNIST数据集训练:高层API

作者: LabVIEW_Python | 来源:发表于2021-02-22 05:24 被阅读0次

    本文从MNIST数据集下载开始,详细介绍在PaddlePaddle中,基于高层API实现MNIST数据集训练

    第一步:将MNIST数据下载到本地,下载链接:http://yann.lecun.com/exdb/mnist/,可以得到四个文件:

    • train-images-idx3-ubyte.gz: 训练集图像数据
    • train-labels-idx1-ubyte.gz: 训练集标签
    • t10k-images-idx3-ubyte.gz: 测试集图像数据
    • t10k-labels-idx1-ubyte.gz: 测试集标签

    第二步:将下载的四个文件以Numpy ndarray类型载入内存。解压文件并读取数据的过程非常标准,大家可以直接用下面的范例程序

    # train-images-idx3-ubyte 文件格式, 参考:http://yann.lecun.com/exdb/mnist/
    '''
    [offset] [type]          [value]          [description] 
    0000     32 bit integer  0x00000803(2051) magic number 
    0004     32 bit integer  60000            number of images 
    0008     32 bit integer  28               number of rows 
    0012     32 bit integer  28               number of columns 
    0016     unsigned byte   ??               pixel 
    0017     unsigned byte   ??               pixel 
    ........ 
    xxxx     unsigned byte   ??               pixel
    Pixels are organized row-wise. Pixel values are 0 to 255. 
    0 means background (white), 255 means foreground (black).
    '''
    def load_images(image_file):
        """Load a gzip-compressed MNIST IDX3 image file into a numpy array.

        Parameters
        ----------
        image_file : str
            Path to a ``*-images-idx3-ubyte.gz`` file.

        Returns
        -------
        np.ndarray
            float32 array of shape (num_images, rows, cols) with raw pixel
            values in [0, 255].

        Raises
        ------
        ValueError
            If the file's magic number is not 2051 (the IDX3 image format).
        """
        # Decompress the whole .gz container into memory.
        with gzip.open(image_file) as f:
            buf = f.read()

        # Header: four big-endian uint32 fields (magic, count, rows, cols).
        magic, num_images, rows, cols = struct.unpack_from('>IIII', buf, 0)
        if magic != 2051:
            raise ValueError(
                f'Not an IDX3 image file (magic={magic}): {image_file}')
        offset = struct.calcsize('>IIII')

        # Read all pixels in one zero-copy pass instead of unpacking a
        # Python tuple of num_images*rows*cols ints via struct.
        pixels = np.frombuffer(buf, dtype=np.uint8,
                               count=int(num_images * rows * cols),
                               offset=offset)
        images = pixels.astype('float32')
        # Return np.ndarray, N*r*c image data.
        return images.reshape(num_images, rows, cols)
    
    
    # train-labels-idx1-ubyte.gz 文件格式
    '''
    [offset] [type]          [value]          [description]
    0000     32 bit integer  0x00000801(2049) magic number (MSB first)
    0004     32 bit integer  60000            number of items
    0008     unsigned byte   ??               label
    0009     unsigned byte   ??               label
    ........
    xxxx     unsigned byte   ??               label
    The labels values are 0 to 9.
    '''
    def load_labels(label_file):
        """Load a gzip-compressed MNIST IDX1 label file into a numpy array.

        Parameters
        ----------
        label_file : str
            Path to a ``*-labels-idx1-ubyte.gz`` file.

        Returns
        -------
        np.ndarray
            int64 array of shape (num_labels,) holding digit labels 0-9.

        Raises
        ------
        ValueError
            If the file's magic number is not 2049 (the IDX1 label format).
        """
        # Decompress the whole .gz container into memory.
        with gzip.open(label_file) as f:
            buf = f.read()
        # Header: two big-endian uint32 fields (magic, count).
        magic, num_labels = struct.unpack_from('>II', buf, 0)
        if magic != 2049:
            raise ValueError(
                f'Not an IDX1 label file (magic={magic}): {label_file}')
        offset = struct.calcsize('>II')
        # Read all labels in one zero-copy pass instead of struct-unpacking
        # a Python tuple of num_labels ints.
        labels = np.frombuffer(buf, dtype=np.uint8,
                               count=int(num_labels), offset=offset)
        # Return np.ndarray label data.
        return labels.astype('int64')
    

    可以用下面的代码来测试图像数据的读入

    # Test code: sanity-check the loaders by printing the array shapes and
    # displaying one sample image from each split.
    import matplotlib.pyplot as plt
    train_images = load_images('train-images-idx3-ubyte.gz')
    test_images  = load_images('t10k-images-idx3-ubyte.gz')
    # Labels are reshaped into (N, 1) column vectors here.
    train_labels = load_labels('train-labels-idx1-ubyte.gz').reshape(-1,1)
    test_labels  = load_labels('t10k-labels-idx1-ubyte.gz').reshape(-1,1)
    print(train_images.shape, train_labels.shape, test_images.shape, test_labels.shape)
    idx = 5  # index of the sample to display
    fig = plt.figure()
    plt.subplot(1,2,1)
    plt.imshow(train_images[idx],cmap='rainbow')
    plt.subplot(1,2,2)
    plt.imshow(test_images[idx],cmap='rainbow')
    plt.show()
    

    (60000, 28, 28) (60000, 1) (10000, 28, 28) (10000, 1)

    展示数据

    第三步:使用飞桨提供的paddle.io.Dataset基类,将数据封装为可迭代的数据源。

    # Reload the raw arrays used for training.
    train_images = load_images('train-images-idx3-ubyte.gz')
    test_images  = load_images('t10k-images-idx3-ubyte.gz')
    # NOTE(review): unlike the earlier test snippet and the complete listing
    # at the end of this article, labels are NOT reshaped to (-1, 1) here —
    # confirm which label shape the loss function expects before relying on
    # this variant.
    train_labels = load_labels('train-labels-idx1-ubyte.gz')
    test_labels  = load_labels('t10k-labels-idx1-ubyte.gz')
    
    # Normalize pixel values from [0, 255] to [0, 1].
    train_images = train_images / 255.0
    test_images  = test_images / 255.0
    
    num_train_samples = train_images.shape[0]
    num_test_samples = test_images.shape[0]
    
    import paddle
    from paddle.io import Dataset
    class TrainDataSet(Dataset):
        """Training-set wrapper around the module-level MNIST arrays.

        Subclasses paddle.io.Dataset so it can be consumed directly by the
        high-level paddle.Model API; samples are read from the module-level
        train_images / train_labels arrays.
        """

        def __init__(self, num_samples):
            """Store the dataset length reported by __len__."""
            super().__init__()
            self.num_samples = num_samples

        def __getitem__(self, index):
            """Return the (image, label) pair at *index* from the globals."""
            return train_images[index], train_labels[index]

        def __len__(self):
            """Return the total number of samples in this dataset."""
            return self.num_samples
    
    class TestDataSet(Dataset):
        """Test-set wrapper around the module-level MNIST arrays.

        Subclasses paddle.io.Dataset so it can be consumed directly by the
        high-level paddle.Model API; samples are read from the module-level
        test_images / test_labels arrays.
        """

        def __init__(self, num_samples):
            """Store the dataset length reported by __len__."""
            super().__init__()
            self.num_samples = num_samples

        def __getitem__(self, index):
            """Return the (image, label) pair at *index* from the globals."""
            return test_images[index], test_labels[index]

        def __len__(self):
            """Return the total number of samples in this dataset."""
            return self.num_samples
    
    # Instantiate the datasets defined above.
    train_dataset = TrainDataSet(num_train_samples)
    test_dataset = TestDataSet(num_test_samples)
    

    第四步:针对顺序的线性网络结构,使用飞桨提供的Sequential类来快速完成组网,这样可以减少类的定义等代码编写。

    # Define the model: a simple MLP — flatten 28x28 -> 784, one hidden
    # layer of 512 ReLU units with dropout, then 10 output logits.
    mnist = paddle.nn.Sequential(
        paddle.nn.Flatten(),
        paddle.nn.Linear(784, 512),
        paddle.nn.ReLU(),
        paddle.nn.Dropout(0.2),
        paddle.nn.Linear(512, 10)
    )
    

    第五步:生成模型实例,并完成损失函数、优化方法和评估方法的配置。

    # Wrap the network in a paddle.Model instance for the high-level
    # configure/train/evaluate API.
    model = paddle.Model(mnist)
    
    # Configure training: Adam optimizer over the model's parameters,
    # cross-entropy loss applied to the raw logits, and accuracy as metric.
    model.prepare(paddle.optimizer.Adam(parameters=model.parameters()),
                  paddle.nn.CrossEntropyLoss(),
                  paddle.metric.Accuracy())
    

    最后一步:用fit()方法启动训练,evaluate()方法实现评估,predict()方法实现预测

    # Train for 5 epochs with mini-batches of 100 samples; verbose=1 prints
    # per-epoch progress.
    model.fit(train_dataset,
              epochs=5,
              batch_size=100,
              verbose=1)
    
    # Evaluate loss/accuracy on the held-out test set.
    eval_result = model.evaluate(test_dataset, verbose=0)
    print(eval_result)
    # Run inference over the test set.
    test_result = model.predict(test_dataset)
    

    完整可运行的代码如下

    import gzip 
    import struct 
    import numpy as np 
    
    # train-images-idx3-ubyte 文件格式, 参考:http://yann.lecun.com/exdb/mnist/
    '''
    [offset] [type]          [value]          [description] 
    0000     32 bit integer  0x00000803(2051) magic number 
    0004     32 bit integer  60000            number of images 
    0008     32 bit integer  28               number of rows 
    0012     32 bit integer  28               number of columns 
    0016     unsigned byte   ??               pixel 
    0017     unsigned byte   ??               pixel 
    ........ 
    xxxx     unsigned byte   ??               pixel
    Pixels are organized row-wise. Pixel values are 0 to 255. 
    0 means background (white), 255 means foreground (black).
    '''
    def load_images(image_file):
        """Load a gzip-compressed MNIST IDX3 image file into a numpy array.

        Parameters
        ----------
        image_file : str
            Path to a ``*-images-idx3-ubyte.gz`` file.

        Returns
        -------
        np.ndarray
            float32 array of shape (num_images, rows, cols) with raw pixel
            values in [0, 255].

        Raises
        ------
        ValueError
            If the file's magic number is not 2051 (the IDX3 image format).
        """
        # Decompress the whole .gz container into memory.
        with gzip.open(image_file) as f:
            buf = f.read()

        # Header: four big-endian uint32 fields (magic, count, rows, cols).
        magic, num_images, rows, cols = struct.unpack_from('>IIII', buf, 0)
        if magic != 2051:
            raise ValueError(
                f'Not an IDX3 image file (magic={magic}): {image_file}')
        offset = struct.calcsize('>IIII')

        # Read all pixels in one zero-copy pass instead of unpacking a
        # Python tuple of num_images*rows*cols ints via struct.
        pixels = np.frombuffer(buf, dtype=np.uint8,
                               count=int(num_images * rows * cols),
                               offset=offset)
        images = pixels.astype('float32')
        # Return np.ndarray, N*r*c image data.
        return images.reshape(num_images, rows, cols)
    
    
    # train-labels-idx1-ubyte.gz 文件格式
    '''
    [offset] [type]          [value]          [description]
    0000     32 bit integer  0x00000801(2049) magic number (MSB first)
    0004     32 bit integer  60000            number of items
    0008     unsigned byte   ??               label
    0009     unsigned byte   ??               label
    ........
    xxxx     unsigned byte   ??               label
    The labels values are 0 to 9.
    '''
    def load_labels(label_file):
        """Load a gzip-compressed MNIST IDX1 label file into a numpy array.

        Parameters
        ----------
        label_file : str
            Path to a ``*-labels-idx1-ubyte.gz`` file.

        Returns
        -------
        np.ndarray
            int64 array of shape (num_labels,) holding digit labels 0-9.

        Raises
        ------
        ValueError
            If the file's magic number is not 2049 (the IDX1 label format).
        """
        # Decompress the whole .gz container into memory.
        with gzip.open(label_file) as f:
            buf = f.read()
        # Header: two big-endian uint32 fields (magic, count).
        magic, num_labels = struct.unpack_from('>II', buf, 0)
        if magic != 2049:
            raise ValueError(
                f'Not an IDX1 label file (magic={magic}): {label_file}')
        offset = struct.calcsize('>II')
        # Read all labels in one zero-copy pass instead of struct-unpacking
        # a Python tuple of num_labels ints.
        labels = np.frombuffer(buf, dtype=np.uint8,
                               count=int(num_labels), offset=offset)
        # Return np.ndarray label data.
        return labels.astype('int64')
    
    # Load all four MNIST splits; labels become (N, 1) column vectors.
    train_images = load_images('train-images-idx3-ubyte.gz')
    test_images  = load_images('t10k-images-idx3-ubyte.gz')
    train_labels = load_labels('train-labels-idx1-ubyte.gz').reshape(-1,1)
    test_labels  = load_labels('t10k-labels-idx1-ubyte.gz').reshape(-1,1)
    
    # Normalize pixel values from [0, 255] to [0, 1].
    train_images = train_images / 255.0
    test_images  = test_images / 255.0
    
    num_train_samples = train_images.shape[0]
    num_test_samples = test_images.shape[0]
    
    import paddle
    from paddle.io import Dataset
    class TrainDataSet(Dataset):
        """Training-set wrapper around the module-level MNIST arrays.

        Subclasses paddle.io.Dataset so it can be consumed directly by the
        high-level paddle.Model API; samples are read from the module-level
        train_images / train_labels arrays.
        """

        def __init__(self, num_samples):
            """Store the dataset length reported by __len__."""
            super().__init__()
            self.num_samples = num_samples

        def __getitem__(self, index):
            """Return the (image, label) pair at *index* from the globals."""
            return train_images[index], train_labels[index]

        def __len__(self):
            """Return the total number of samples in this dataset."""
            return self.num_samples
    
    class TestDataSet(Dataset):
        """Test-set wrapper around the module-level MNIST arrays.

        Subclasses paddle.io.Dataset so it can be consumed directly by the
        high-level paddle.Model API; samples are read from the module-level
        test_images / test_labels arrays.
        """

        def __init__(self, num_samples):
            """Store the dataset length reported by __len__."""
            super().__init__()
            self.num_samples = num_samples

        def __getitem__(self, index):
            """Return the (image, label) pair at *index* from the globals."""
            return test_images[index], test_labels[index]

        def __len__(self):
            """Return the total number of samples in this dataset."""
            return self.num_samples
    
    # Instantiate the datasets defined above.
    train_dataset = TrainDataSet(num_train_samples)
    test_dataset = TestDataSet(num_test_samples)
    
    # Define the model: a simple MLP — flatten 28x28 -> 784, one hidden
    # layer of 512 ReLU units with dropout, then 10 output logits.
    mnist = paddle.nn.Sequential(
        paddle.nn.Flatten(),
        paddle.nn.Linear(784, 512),
        paddle.nn.ReLU(),
        paddle.nn.Dropout(0.2),
        paddle.nn.Linear(512, 10)
    )
    
    # Wrap the network in a paddle.Model instance for the high-level API.
    model = paddle.Model(mnist)
    
    # Configure training: Adam optimizer over the model's parameters,
    # cross-entropy loss applied to the raw logits, and accuracy as metric.
    model.prepare(paddle.optimizer.Adam(parameters=model.parameters()),
                  paddle.nn.CrossEntropyLoss(),
                  paddle.metric.Accuracy())
    
    # Train for 5 epochs with mini-batches of 100 samples; verbose=1 prints
    # per-epoch progress.
    model.fit(train_dataset,
              epochs=5,
              batch_size=100,
              verbose=1)
    
    # Evaluate loss/accuracy on the held-out test set.
    eval_result = model.evaluate(test_dataset, verbose=0)
    print(eval_result)
    # Run inference over the test set.
    # NOTE(review): the indexing test_result[0][idx] below assumes predict
    # returns per-sample outputs in order at its default batch size — confirm
    # against the paddle.Model.predict docs for the installed version.
    test_result = model.predict(test_dataset)
    # Show one prediction next to its input image.
    import matplotlib.pyplot as plt 
    def show_img(img, predict):
        """Display one 28x28 image with the predicted digit in the title."""
        plt.title(f'predict:{predict}')
        plt.imshow(img.reshape([28,28]))
        plt.show()
    idx = 2 # sample index to inspect
    show_img(test_dataset[idx][0], np.argmax(test_result[0][idx]))
    
    运行结果如下(训练日志与预测值展示图略)。

    下一节 《在PaddlePaddle中实现MNIST数据集训练:基础API》

    相关文章

      网友评论

          本文标题:在PaddlePaddle中实现MNIST数据集训练:高层API

          本文链接:https://www.haomeiwen.com/subject/ovmyxltx.html