
Data Processing in Deep Learning - Building the Dataset

Author: 拓季 | Published 2018-01-21 14:04

    The importance of the dataset when building a deep learning model goes without saying. Constructing one typically involves the following steps:

    • Reading in the data and using visualization tools to aid the initial analysis

    • Selecting features based on that analysis and discarding unimportant ones

    • One-hot encoding or mapping categorical data (see the sketch after this list)

    • Standardizing the features and splitting the dataset
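
    The notebook cells below only demonstrate one-hot encoding; as a minimal sketch of the "mapping" alternative (using a made-up `size` column that is not from the original notebooks), an ordinal category can be mapped to integers with pandas:

    import pandas as pd

    # Hypothetical ordinal column mapped to integers instead of being one-hot encoded
    df = pd.DataFrame({'size': ['small', 'large', 'medium']})
    size_map = {'small': 0, 'medium': 1, 'large': 2}
    df['size'] = df['size'].map(size_map)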

    In the preceding projects the actual processing steps differed depending on the data source, so here several classic examples are placed side by side for comparison and summary.

    The code shown in these notes comes from the Udacity Deep Learning Nanodegree and its copyright belongs to Udacity; for the complete Jupyter notebook, see my GitHub.

    Data processing example from the student admission project

    In [1]:
    
    # Importing pandas and numpy
    import pandas as pd
    import numpy as np
    
    # Reading the csv file into a pandas DataFrame
    data = pd.read_csv('student_data.csv')
    
    # Printing out the first 3 rows of our data
    data[:3]
    
    Out[1]:
    
        admit   gre gpa rank
    0   0       380 3.61    3
    1   1       660 3.67    3
    2   1       800 4.00    1
    
    In [2]:
    
    # Importing matplotlib
    import matplotlib.pyplot as plt
    
    # Function to help us plot
    def plot_points(data):
        X = np.array(data[['gre','gpa']])
        y = np.array(data['admit'])
        admitted = X[np.argwhere(y==1)]
        rejected = X[np.argwhere(y==0)]
        plt.scatter([s[0][0] for s in rejected], [s[0][1] for s in rejected], s = 25, color = 'red', edgecolor = 'k')
        plt.scatter([s[0][0] for s in admitted], [s[0][1] for s in admitted], s = 25, color = 'cyan', edgecolor = 'k')
        plt.xlabel('Test (GRE)')
        plt.ylabel('Grades (GPA)')
    
    # Plotting the points
    plot_points(data)
    plt.show()
    
    (Figure: student admission scatter plot, GRE vs. GPA)
    In [3]:
    
    # Make dummy variables for rank
    one_hot_data = pd.concat([data, pd.get_dummies(data['rank'], prefix='rank')], axis=1)
    
    # Drop the previous rank column
    one_hot_data = one_hot_data.drop('rank', axis=1)
    
    # Print the first 3 rows of our data
    one_hot_data[:3]
    
    Out[3]:
    
        admit   gre gpa rank_1  rank_2  rank_3  rank_4
    0   0       380 3.61    0   0       1       0
    1   1       660 3.67    0   0       1       0
    2   1       800 4.00    1   0       0       0
    
    In [4]:
    
    # Copy the data before scaling
    processed_data = one_hot_data[:]
    
    # Scale the columns into [0, 1] using their known maxima (GRE max is 800, GPA max is 4.0)
    processed_data['gre'] = processed_data['gre'] / 800
    processed_data['gpa'] = processed_data['gpa'] / 4.0
    processed_data[:3]
    
    
    Out[4]:
        admit   gre     gpa     rank_1  rank_2  rank_3  rank_4
    0   0       0.475   0.9025  0       0       1       0
    1   1       0.825   0.9175  0       0       1       0
    2   1       1.000   1.0000  1       0       0       0
    
    In [5]:
    
    # Randomly sample 90% of the row indices for training; the remaining rows become the test set
    sample = np.random.choice(processed_data.index, size=int(len(processed_data)*0.9), replace=False)
    train_data, test_data = processed_data.iloc[sample], processed_data.drop(sample)
    
    print("Number of training samples is", len(train_data))
    print("Number of testing samples is", len(test_data))
    print(train_data[:3])
    print(test_data[:3])
    
    Out[5]:
    
    Number of training samples is 360
    Number of testing samples is 40
         admit  gre     gpa  rank_1  rank_2  rank_3  rank_4
    302      1  0.5  0.7875       0       1       0       0
    121      1  0.6  0.6675       0       1       0       0
    249      0  0.8  0.9325       0       0       1       0
        admit    gre     gpa  rank_1  rank_2  rank_3  rank_4
    3       1  0.800  0.7975       0       0       0       1
    12      1  0.950  1.0000       1       0       0       0
    13      0  0.875  0.7700       0       1       0       0
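
    The same 90/10 split could also be done with scikit-learn's train_test_split; a sketch under the assumption that scikit-learn is installed (this is not part of the original notebook):

    from sklearn.model_selection import train_test_split

    # Equivalent random 90/10 split; stratifying on 'admit' keeps the class ratio similar in both sets
    train_data, test_data = train_test_split(processed_data, test_size=0.1,
                                             stratify=processed_data['admit'], random_state=42)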
    
    
    In [6]:
    
    import keras
    
    # Separate data and one-hot encode the output
    # Note: We're also turning the data into numpy arrays, in order to train the model in Keras
    # keras.utils.to_categorical one-hot encodes the targets
    features = np.array(train_data.drop('admit', axis=1))
    targets = np.array(keras.utils.to_categorical(train_data['admit'], 2))
    features_test = np.array(test_data.drop('admit', axis=1))
    targets_test = np.array(keras.utils.to_categorical(test_data['admit'], 2))
    
    print(features[:3])
    print(targets[:3])
    
    [[ 0.5     0.7875  0.      1.      0.      0.    ]
     [ 0.6     0.6675  0.      1.      0.      0.    ]
     [ 0.8     0.9325  0.      0.      1.      0.    ]]
    [[ 0.  1.]
     [ 0.  1.]
     [ 1.  0.]]
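
    With recent TensorFlow releases, Keras ships inside TensorFlow, so the import in the cell above would change slightly; a sketch assuming a TensorFlow 2.x environment (the original notebook uses the standalone keras package):

    # With TensorFlow 2.x, Keras is bundled as tf.keras
    from tensorflow import keras

    targets = keras.utils.to_categorical(train_data['admit'], 2)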
    

    Data processing example from the bike rental project

    In [7]:
    
    %matplotlib inline
    %config InlineBackend.figure_format = 'retina'
    
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    
    
    In [8]:
    
    data_path = 'Bike-Sharing-Dataset/hour.csv'
    rides = pd.read_csv(data_path)
    
    
    In [9]:
    
    # Check which plot styles are available in the current environment
    plt.style.available
    
    Out[9]:
    
    ['seaborn-deep',
     'seaborn-talk',
     'seaborn-paper',
     'bmh',
     'grayscale',
     'seaborn-bright',
     'seaborn-colorblind',
     'ggplot',
     'seaborn-notebook',
     'seaborn-muted',
     'dark_background',
     'seaborn-dark-palette',
     'seaborn-white',
     'seaborn-darkgrid',
     'classic',
     'seaborn-poster',
     'seaborn-pastel',
     'fivethirtyeight',
     'seaborn-ticks',
     '_classic_test',
     'seaborn-whitegrid',
     'seaborn-dark',
     'seaborn']
    
    In [10]:
    
    # Choose the style you prefer
    plt.style.use('ggplot')
    
    # Create the figure and axes explicitly so they can be configured afterwards
    fig, ax = plt.subplots(nrows=1, ncols=1)
    rides[:24 * 10].plot(x='dteday', y='cnt', ax=ax, figsize=(10, 5))  # pass ax=ax so pandas draws on our axes
    ax.legend().set_visible(False)
    ax.set(title='Rental counts in the first 10 days', ylabel='Rental Counts', xlabel='Date');
    # the trailing semicolon keeps Jupyter from printing the list returned by ax.set()
    
    (Figure: bike rental counts over the first 10 days)
    In [11]:
    
    # This demonstrates how to one-hot encode more than one column with pandas
    dummy_fields = ['season', 'weathersit', 'mnth', 'hr', 'weekday']
    for each in dummy_fields:
        dummies = pd.get_dummies(rides[each], prefix=each, drop_first=False)
        rides = pd.concat([rides, dummies], axis=1)
    
    fields_to_drop = ['instant', 'dteday', 'season', 'weathersit', 
                      'weekday', 'atemp', 'mnth', 'workingday', 'hr']
    data = rides.drop(fields_to_drop, axis=1)
    
    In [12]:
    
    # Standardize the quantitative features to zero mean and unit variance
    quant_features = ['casual', 'registered', 'cnt', 'temp', 'hum', 'windspeed']
    # Store scalings in a dictionary so we can convert back later
    scaled_features = {}
    for each in quant_features:
        mean, std = data[each].mean(), data[each].std()
        scaled_features[each] = [mean, std]
        data[each] = (data[each] - mean) / std 
        # standardize in place: subtract the mean and divide by the standard deviation
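
    Storing the mean and standard deviation makes it easy to undo the scaling later, for example to turn model predictions back into real rental counts; a minimal sketch (predictions_scaled is a hypothetical array of scaled predictions, not from the original notebook):

    # Undo the standardization for the 'cnt' column
    mean, std = scaled_features['cnt']
    predictions_scaled = np.array([-0.5, 0.0, 1.2])  # hypothetical scaled predictions
    predictions = predictions_scaled * std + mean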
    
    In [13]:
    
    # Save data for approximately the last 21 days 
    test_data = data[-21*24:]
    
    # Now remove the test data from the data set 
    data = data[:-21*24]
    
    # Separate the data into features and targets
    target_fields = ['cnt', 'casual', 'registered']
    features, targets = data.drop(target_fields, axis=1), data[target_fields]
    test_features, test_targets = test_data.drop(target_fields, axis=1), test_data[target_fields]
    
    
    In [14]:
    
    # Hold out the last 60 days or so of the remaining data as a validation set
    train_features, train_targets = features[:-60*24], targets[:-60*24]
    val_features, val_targets = features[-60*24:], targets[-60*24:]
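
    As in the student admission example, these DataFrames would typically be converted to numpy arrays before training a network; a minimal sketch that keeps only the 'cnt' column as the target (the network itself is outside the scope of these notes):

    # Convert the pandas objects to numpy arrays, keeping only 'cnt' as the target
    train_X, train_y = np.array(train_features), np.array(train_targets['cnt'])
    val_X, val_y = np.array(val_features), np.array(val_targets['cnt'])
    test_X, test_y = np.array(test_features), np.array(test_targets['cnt'])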
    
