pandas notes 2

作者: sherrysack | 来源:发表于2017-06-12 18:24 被阅读0次

    pandas DataFrame + Series addition experiment

    import pandas as pd

    # Change False to True for each block of code to see what it does
    
    # Each demo below prints a DataFrame, a blank line, and then the result of
    # adding a Series to it. print() is called with a single argument so the
    # same code runs on both Python 2 and Python 3.

    # Adding a Series to a square DataFrame
    if False:
        s = pd.Series([1, 2, 3, 4])
        df = pd.DataFrame({
            0: [10, 20, 30, 40],
            1: [50, 60, 70, 80],
            2: [90, 100, 110, 120],
            3: [130, 140, 150, 160]
        })

        print(df)
        print('')  # Create a blank line between outputs
        print(df + s)

    # Adding a Series to a one-row DataFrame
    if False:
        s = pd.Series([1, 2, 3, 4])
        df = pd.DataFrame({0: [10], 1: [20], 2: [30], 3: [40]})

        print(df)
        print('')  # Create a blank line between outputs
        print(df + s)

    # Adding a Series to a one-column DataFrame
    if False:
        s = pd.Series([1, 2, 3, 4])
        df = pd.DataFrame({0: [10, 20, 30, 40]})

        print(df)
        print('')  # Create a blank line between outputs
        print(df + s)

    # Adding when DataFrame column names match Series index
    if False:
        s = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
        df = pd.DataFrame({
            'a': [10, 20, 30, 40],
            'b': [50, 60, 70, 80],
            'c': [90, 100, 110, 120],
            'd': [130, 140, 150, 160]
        })

        print(df)
        print('')  # Create a blank line between outputs
        print(df + s)

    # Adding when DataFrame column names don't match Series index
    if False:
        s = pd.Series([1, 2, 3, 4])
        df = pd.DataFrame({
            'a': [10, 20, 30, 40],
            'b': [50, 60, 70, 80],
            'c': [90, 100, 110, 120],
            'd': [130, 140, 150, 160]
        })

        print(df)
        print('')  # Create a blank line between outputs
        print(df + s)
    

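    pandas 加减乘除的方向确定方法:默认的 +、-、*、/ 会把 Series 的 index 与 DataFrame 的列名(columns)对齐,相当于 axis='columns';如果要让 Series 的 index 与 DataFrame 的行索引(index)对齐,也就是对同一组 index 进行操作,就改用 add、sub、mul、div 方法,并把 axis 设为 'index'。聚合函数的方向类似:例如 df.mean() 返回一个 Series,其中每个元素代表某一列的平均值,df.mean() 等价于 df.mean(axis='index')。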

    • 求每一行的平均值:df.mean(axis='columns')
    • 将某个df减去每一行的平均值df.sub(df.mean(axis = 'columns'), axis = 'index')

    Exercise

    import pandas as pd
    
    # Each demo below prints a DataFrame, a blank line, and then the result of
    # adding the same Series to it in a different way. print() is called with a
    # single argument so the same code runs on both Python 2 and Python 3.

    # Adding using +
    if False:
        s = pd.Series([1, 2, 3, 4])
        df = pd.DataFrame({
            0: [10, 20, 30, 40],
            1: [50, 60, 70, 80],
            2: [90, 100, 110, 120],
            3: [130, 140, 150, 160]
        })

        print(df)
        print('')  # Create a blank line between outputs
        print(df + s)

    # Adding with axis='index'
    if False:
        s = pd.Series([1, 2, 3, 4])
        df = pd.DataFrame({
            0: [10, 20, 30, 40],
            1: [50, 60, 70, 80],
            2: [90, 100, 110, 120],
            3: [130, 140, 150, 160]
        })

        print(df)
        print('')  # Create a blank line between outputs
        print(df.add(s, axis='index'))
        # The functions sub(), mul(), and div() work similarly to add()

    # Adding with axis='columns'
    if False:
        s = pd.Series([1, 2, 3, 4])
        df = pd.DataFrame({
            0: [10, 20, 30, 40],
            1: [50, 60, 70, 80],
            2: [90, 100, 110, 120],
            3: [130, 140, 150, 160]
        })

        print(df)
        print('')  # Create a blank line between outputs
        print(df.add(s, axis='columns'))
        # The functions sub(), mul(), and div() work similarly to add()
        
    # Scores on two exams for ten students; the row labels are student names.
    grades_df = pd.DataFrame(
        {
            'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
            'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60],
        },
        index=[
            'Andre', 'Barry', 'Chris', 'Dan', 'Emilio',
            'Fred', 'Greta', 'Humbert', 'Ivan', 'James',
        ],
    )
    
    def standardize(df):
        '''
        Standardize each column of the given DataFrame.

        Every value is converted to the number of standard deviations it
        lies above or below the mean of its own column. The computation
        uses vectorized DataFrame arithmetic instead of apply().

        df: DataFrame whose columns are all numeric.

        Returns a new DataFrame with the same index and columns as df;
        df itself is not modified. The spread is the sample standard
        deviation (pandas' default, ddof=1). A column whose values are
        all identical, or a DataFrame with a single row, produces NaN
        because the standard deviation is zero or undefined.
        '''
        # mean() and std() each return a Series labelled by column name, and
        # DataFrame-with-Series arithmetic matches those labels against the
        # DataFrame's columns, so every column uses its own statistics.
        column_means = df.mean()
        column_stds = df.std()
        return (df - column_means) / column_stds
    
    def standardize_rows(df):
        '''
        Standardize each row of the given DataFrame.

        Every value is converted to the number of standard deviations it
        lies above or below the mean of its own row. The computation uses
        vectorized DataFrame methods instead of apply().

        df: DataFrame whose columns are all numeric.

        Returns a new DataFrame with the same index and columns as df;
        df itself is not modified. The spread is the sample standard
        deviation (pandas' default, ddof=1). A row whose values are all
        identical, or a DataFrame with a single column, produces NaN
        because the standard deviation is zero or undefined.
        '''
        # Reducing along axis='columns' yields one statistic per row, labelled
        # by the row index.
        row_means = df.mean(axis='columns')
        row_stds = df.std(axis='columns')
        # The plain - and / operators would match those labels against the
        # column names, so sub()/div() with axis='index' are needed to line
        # each statistic up with its own row.
        centered = df.sub(row_means, axis='index')
        return centered.div(row_stds, axis='index')
    

    Exercise about groupby()

    
    import numpy as np
    import pandas as pd
    
    # Demo data: seven integers plus two boolean flags derived from them,
    # with rows labelled 'a' through 'g'.
    values = np.array([1, 3, 2, 4, 1, 6, 4])
    example_df = pd.DataFrame(
        {
            'value': values,
            'even': (values % 2) == 0,
            'above_three': values > 3,
        },
        index=list('abcdefg'),
    )
    
    # Change False to True for each block of code to see what it does
    
    # Standardize each group
    # (print() is called with one argument so this runs on Python 2 and 3)
    if False:
        def standardize(xs):
            # Distance from the group's mean, in units of the group's
            # standard deviation
            return (xs - xs.mean()) / xs.std()
        grouped_data = example_df.groupby('even')
        print(grouped_data['value'].apply(standardize))
        
    # Find second largest value in each group
    # (print() is called with one argument so this runs on Python 2 and 3)
    if False:
        def second_largest(xs):
            # Series.sort() has been removed from pandas; sort_values()
            # returns a new Series sorted in descending order.
            sorted_xs = xs.sort_values(ascending=False)
            # Position 1 of the descending order is the second largest value
            return sorted_xs.iloc[1]
        grouped_data = example_df.groupby('even')
        print(grouped_data['value'].apply(second_largest))
    
    # --- Quiz ---
    # Cumulative entry and exit counter readings for several stations; each
    # row holds a station id ('UNIT'), a time of day and the two counters.
    ridership_df = pd.DataFrame({
        'UNIT': [
            'R051', 'R079', 'R051', 'R079', 'R051',
            'R079', 'R051', 'R079', 'R051',
        ],
        'TIMEn': [
            '00:00:00', '02:00:00', '04:00:00', '06:00:00', '08:00:00',
            '10:00:00', '12:00:00', '14:00:00', '16:00:00',
        ],
        'ENTRIESn': [
            3144312, 8936644, 3144335, 8936658, 3144353,
            8936687, 3144424, 8936819, 3144594,
        ],
        'EXITSn': [
            1088151, 13755385, 1088159, 13755393, 1088177,
            13755598, 1088231, 13756191, 1088275,
        ],
    })
    
    def get_hourly_entries_and_exits(entries_and_exits):
        '''
        Convert cumulative entries and exits into per-reading increases,
        calculated separately for each station (the 'UNIT' column).

        entries_and_exits: DataFrame with a 'UNIT' column identifying the
            station and one or more numeric cumulative-count columns
            (for example 'ENTRIESn' and 'EXITSn'). Rows of the same
            station are assumed to already be in chronological order.

        Returns a new DataFrame with the same index as the input and one
        column per numeric count column. Each value is the increase since
        the previous reading of the same station. The first reading of
        every station has no predecessor and is therefore NaN. Non-numeric
        columns such as the time stamp are left out, since they cannot be
        differenced.
        '''
        # Keep only the numeric counters; 'UNIT' is the grouping key, not data.
        counters = entries_and_exits.drop('UNIT', axis='columns')
        counters = counters.select_dtypes(include=['number'])
        # diff() within each station's group subtracts the previous reading
        # of that same station and keeps the original row labels.
        return counters.groupby(entries_and_exits['UNIT']).diff()
    
    

    相关文章

      网友评论

        本文标题:pandas notes 2

        本文链接:https://www.haomeiwen.com/subject/ewtrqxtx.html