Kaggle Cats vs. Dogs: the Full Real Dataset

Author: poteman | Published on 2019-08-05 11:54
    • Import the required packages
    # In this exercise you will train a CNN on the FULL Cats-v-dogs dataset
    # This will require you doing a lot of data preprocessing because
    # the dataset isn't split into training and validation for you
    # This code block has all the required inputs
    import os
    import zipfile
    import random
    import tensorflow as tf
    from tensorflow.keras.optimizers import RMSprop
    from tensorflow.keras.preprocessing.image import ImageDataGenerator
    from shutil import copyfile
    
    • Download and unzip the data
    # This code block downloads the full Cats-v-Dogs dataset and stores it as 
    # cats-and-dogs.zip. It then unzips it to /tmp
    # which will create a tmp/PetImages directory containing subdirectories
    # called 'Cat' and 'Dog' (that's how the original researchers structured it)
    # If the URL doesn't work,
    #   visit https://www.microsoft.com/en-us/download/confirmation.aspx?id=54765
    # and right-click the 'Download Manually' link to get a new URL
    
    !wget --no-check-certificate \
        "https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip" \
        -O "/tmp/cats-and-dogs.zip"
    
    local_zip = '/tmp/cats-and-dogs.zip'
    zip_ref = zipfile.ZipFile(local_zip, 'r')
    zip_ref.extractall('/tmp')
    zip_ref.close()
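
    After unzipping, a quick sanity check that the two class folders are in place is cheap; the exact counts are an assumption inferred from the split output further down (about 12,500 usable images per class).

    # Optional sketch: verify the extracted directory structure.
    # Counts of roughly 12,500 per class are expected, including a couple of
    # zero-length files that the split step below will skip.
    print(len(os.listdir('/tmp/PetImages/Cat/')))
    print(len(os.listdir('/tmp/PetImages/Dog/')))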
    
    • Create the directories
    # Use os.mkdir to create your directories
    # You will need a directory for cats-v-dogs, and subdirectories for training
    # and testing. These in turn will need subdirectories for 'cats' and 'dogs'
    
    
    try:
        os.mkdir('/tmp/cats-v-dogs')
        os.mkdir('/tmp/cats-v-dogs/training')
        os.mkdir('/tmp/cats-v-dogs/testing')
        os.mkdir('/tmp/cats-v-dogs/training/cats')
        os.mkdir('/tmp/cats-v-dogs/training/dogs')
        os.mkdir('/tmp/cats-v-dogs/testing/cats')
        os.mkdir('/tmp/cats-v-dogs/testing/dogs')
        
        print("done")
    except OSError:
        pass
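
    An equivalent, slightly more defensive sketch uses os.makedirs with exist_ok=True, which creates parent directories as needed and does not silently swallow unrelated OSErrors:

    # Alternative sketch: re-running this cell is harmless with exist_ok=True.
    base_dir = '/tmp/cats-v-dogs'
    for split in ('training', 'testing'):
        for label in ('cats', 'dogs'):
            os.makedirs(os.path.join(base_dir, split, label), exist_ok=True)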
    
    • Split the data
    # Write a python function called split_data which takes
    # a SOURCE directory containing the files
    # a TRAINING directory that a portion of the files will be copied to
    # a TESTING directory that a portion of the files will be copied to
    # a SPLIT SIZE to determine the portion
    # The files should also be randomized, so that the training set is a random
    # X% of the files, and the test set is the remaining files
    # SO, for example, if SOURCE is PetImages/Cat, and SPLIT SIZE is .9
    # Then 90% of the images in PetImages/Cat will be copied to the TRAINING dir
    # and 10% of the images will be copied to the TESTING dir
    # Also -- All images should be checked, and if they have a zero file length,
    # they will not be copied over
    #
    # os.listdir(DIRECTORY) gives you a listing of the contents of that directory
    # os.path.getsize(PATH) gives you the size of the file
    # copyfile(source, destination) copies a file from source to destination
    # random.sample(list, len(list)) shuffles a list
    def split_data(SOURCE, TRAINING, TESTING, SPLIT_SIZE):
      
        files = []
        for filename in os.listdir(SOURCE):
          file = SOURCE + filename
          if os.path.getsize(file) > 0:
            files.append(filename)
          else:
            print(filename + " is zero length, so ignoring")
        
        training_length = int(len(files) * SPLIT_SIZE)
        shuffled_set = random.sample(files, len(files))
        training_set = shuffled_set[:training_length]
        testing_set = shuffled_set[training_length:]
        
        for filename in training_set:
          this_file = SOURCE + filename
          destination = TRAINING + filename
          copyfile(this_file, destination)
        
        for filename in testing_set:
          this_file = SOURCE + filename
          destination = TESTING + filename
          copyfile(this_file, destination)
        
        
    CAT_SOURCE_DIR = "/tmp/PetImages/Cat/"
    TRAINING_CATS_DIR = "/tmp/cats-v-dogs/training/cats/"
    TESTING_CATS_DIR = "/tmp/cats-v-dogs/testing/cats/"
    DOG_SOURCE_DIR = "/tmp/PetImages/Dog/"
    TRAINING_DOGS_DIR = "/tmp/cats-v-dogs/training/dogs/"
    TESTING_DOGS_DIR = "/tmp/cats-v-dogs/testing/dogs/"
    
    split_size = .9
    split_data(CAT_SOURCE_DIR, TRAINING_CATS_DIR, TESTING_CATS_DIR, split_size)
    split_data(DOG_SOURCE_DIR, TRAINING_DOGS_DIR, TESTING_DOGS_DIR, split_size)
    
    # Expected output
    # 666.jpg is zero length, so ignoring
    # 11702.jpg is zero length, so ignoring
    
    • Inspect the file counts
    print(len(os.listdir('/tmp/cats-v-dogs/training/cats/')))
    print(len(os.listdir('/tmp/cats-v-dogs/training/dogs/')))
    print(len(os.listdir('/tmp/cats-v-dogs/testing/cats/')))
    print(len(os.listdir('/tmp/cats-v-dogs/testing/dogs/')))
    
    # Expected output:
    # 11250
    # 11250
    # 1250
    # 1250
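
    If these counts come out larger than expected, it is usually because split_data was run more than once: it copies files rather than moving them, so copies from an earlier run stay in place. A small sketch to reset the split directories before re-splitting:

    # Optional sketch: empty the training/testing folders before re-running split_data.
    for d in [TRAINING_CATS_DIR, TESTING_CATS_DIR, TRAINING_DOGS_DIR, TESTING_DOGS_DIR]:
        for f in os.listdir(d):
            os.remove(os.path.join(d, f))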
    
    • Build the model
    # DEFINE A KERAS MODEL TO CLASSIFY CATS V DOGS
    # USE AT LEAST 3 CONVOLUTION LAYERS
    from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
    model = tf.keras.models.Sequential([
        Conv2D(16, (3,3), activation='relu', input_shape=(150,150,3)),
        MaxPooling2D(2,2),
        Conv2D(32, (3,3), activation='relu'),
        MaxPooling2D(2,2),
        Conv2D(64, (3,3), activation='relu'),
        MaxPooling2D(2,2),
        Conv2D(128, (3,3), activation='relu'),
        MaxPooling2D(2,2),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(1, activation='sigmoid')
        
    ])
    
    model.compile(optimizer=RMSprop(lr=0.001), loss='binary_crossentropy', metrics=['acc'])
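
    Before training it can help to glance at the architecture; with 150x150x3 inputs and four pooling stages, the final feature map reaching Flatten is 7x7x128.

    # Optional: inspect layer output shapes and parameter counts.
    model.summary()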
    
    • Build the batched data generators
    TRAINING_DIR = "/tmp/cats-v-dogs/training/"
    train_datagen = ImageDataGenerator(rescale=1/255.)
    train_generator = train_datagen.flow_from_directory(
              TRAINING_DIR,
              batch_size=50,
              class_mode='binary',
              target_size=(150,150)
            )
    
    VALIDATION_DIR = "/tmp/cats-v-dogs/testing/"
    validation_datagen = ImageDataGenerator(rescale=1/255.)
    validation_generator = validation_datagen.flow_from_directory(
              VALIDATION_DIR,
              batch_size=50,
              class_mode='binary',
              target_size=(150,150)
            )
    
    # Expected Output:
    # Found 22498 images belonging to 2 classes.
    # Found 2500 images belonging to 2 classes.
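
    Both generators above only rescale pixel values. If validation accuracy stalls, one optional variant (a sketch, not part of the original exercise) is to add augmentation to the training generator only, keeping the validation generator as pure rescaling so the metric stays comparable:

    # Optional sketch: augmented training generator (all parameters are standard
    # ImageDataGenerator arguments; the specific values are illustrative).
    augmented_train_datagen = ImageDataGenerator(
        rescale=1/255.,
        rotation_range=40,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest')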
    
    • Train the model
    import warnings
    warnings.filterwarnings("ignore")
    history = model.fit_generator(train_generator,
                                  epochs=15,
                                  steps_per_epoch=500,
                                  verbose=1,
                                  validation_data=validation_generator,
                                  validation_steps=50,
                                  )
    
    # The expectation here is that the model will train, and that accuracy will be > 90% on both training and validation
    # i.e. acc:A1 and val_acc:A2 will be visible, and both A1 and A2 will be > .9
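
    Once training finishes, persisting the model lets the prediction cell at the end be re-run in a fresh session; the file path here is just an example.

    # Optional sketch: save (and later restore) the trained model.
    model.save('/tmp/cats_v_dogs.h5')
    # model = tf.keras.models.load_model('/tmp/cats_v_dogs.h5')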
    
    • Plot accuracy and loss
    # PLOT LOSS AND ACCURACY
    %matplotlib inline
    
    import matplotlib.image  as mpimg
    import matplotlib.pyplot as plt
    
    #-----------------------------------------------------------
    # Retrieve a list of list results on training and test data
    # sets for each training epoch
    #-----------------------------------------------------------
    acc=history.history['acc']
    val_acc=history.history['val_acc']
    loss=history.history['loss']
    val_loss=history.history['val_loss']
    
    epochs=range(len(acc)) # Get number of epochs
    
    #------------------------------------------------
    # Plot training and validation accuracy per epoch
    #------------------------------------------------
    plt.plot(epochs, acc, 'r', label="Training Accuracy")
    plt.plot(epochs, val_acc, 'b', label="Validation Accuracy")
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.figure()
    
    #------------------------------------------------
    # Plot training and validation loss per epoch
    #------------------------------------------------
    plt.plot(epochs, loss, 'r', label="Training Loss")
    plt.plot(epochs, val_loss, 'b', label="Validation Loss")
    plt.title('Training and validation loss')
    plt.legend()
    
    # Desired output. Charts with training and validation metrics. No crash :)
    
    • Model prediction
    # Here's a codeblock just for fun. You should be able to upload an image here 
    # and have it classified without crashing
    
    import numpy as np
    from google.colab import files
    from tensorflow.keras.preprocessing import image
    
    uploaded = files.upload()
    
    for fn in uploaded.keys():
     
      # predicting images
      path = '/content/' + fn
      img = image.load_img(path, target_size=(150, 150))  # match the model's 150x150 input
      x = image.img_to_array(img)
      x = np.expand_dims(x, axis=0)
    
      images = np.vstack([x])
      classes = model.predict(images, batch_size=10)
      print(classes[0])
      if classes[0]>0.5:
        print(fn + " is a dog")
      else:
        print(fn + " is a cat")
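
    The 0.5 threshold above assumes that flow_from_directory assigned 'cats' to 0 and 'dogs' to 1, which is its default alphabetical ordering; this can be confirmed directly:

    # Sanity check of the label mapping assumed by the dog/cat threshold.
    print(train_generator.class_indices)  # expected: {'cats': 0, 'dogs': 1}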
    
