5.17 ② summary statistics


Author: 钊钖 | Published on 2018-05-18 13:33
    # median
    from numpy import median
    values_median = median(values)
    

    Variance tells us how concentrated the data is around the mean,
    and measures how far the average data point is from the mean.
    To calculate variance, subtract the mean from every value, square the results, and then average them.

    Ways to compute a mean:
    • list: sum(values) / len(values)
    • pandas Series: series.mean()
    • NumPy array: array.mean()
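
    As a quick illustration (a sketch assuming nba_stats has already been loaded, as in the examples below), all three approaches give the same result:

    values = nba_stats["pf"].tolist()
    list_mean = sum(values) / len(values)       # plain Python list
    series_mean = nba_stats["pf"].mean()        # pandas Series
    array_mean = nba_stats["pf"].values.mean()  # NumPy array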

    # variance
    import matplotlib.pyplot as plt
    import pandas as pd

    pf_mean = nba_stats['pf'].mean()
    variance = 0
    for p in nba_stats['pf']:
        difference = p - pf_mean
        square_difference = difference ** 2
        variance += square_difference
    variance = variance / len(nba_stats['pf'])
    
    
    
    # standard deviation
    # the square root of variance
    import numpy as np

    def calc_column_deviation(column):
        mean = column.mean()
        variance = 0
        for p in column:
            difference = p - mean
            square_difference = difference ** 2
            variance += square_difference
        variance = variance / len(column)
        return variance ** 0.5

    mp_dev = calc_column_deviation(nba_stats['mp'])
    

    Pandas also provides the std() method on a Series to compute the standard deviation directly.

    mp_dev = nba_stats['mp'].std()
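
    Note that pandas' std() defaults to the sample standard deviation (ddof=1), while calc_column_deviation above divides by n. To reproduce the manual result exactly we can pass ddof=0 (a small aside, not part of the original exercise):

    # population standard deviation (divide by n), matching calc_column_deviation
    mp_dev_population = nba_stats["mp"].std(ddof=0)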

    # Plotting one standard deviation on either side of the mean lets us compare how dense the data is.
    import matplotlib.pyplot as plt
    
    plt.hist(nba_stats["pf"])
    mean = nba_stats["pf"].mean()
    plt.axvline(mean, color="r")
    # We can calculate standard deviation
    # by using the std() method on a pandas series.
    std_dev = nba_stats["pf"].std()
    # Plot a line one standard deviation below the mean.
    plt.axvline(mean - std_dev, color="g")
    # Plot a line one standard deviation above the mean.
    plt.axvline(mean + std_dev, color="g")
    
    # We can see how many of the data points
    # fall within one standard deviation of the mean.
    # The more that fall into this range, the denser the data is.
    plt.show()
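
    # As a quick check (a sketch, not part of the original exercise), we can
    # count the fraction of "pf" values that fall within one standard deviation:
    within_one = nba_stats["pf"][(nba_stats["pf"] >= mean - std_dev) &
                                 (nba_stats["pf"] <= mean + std_dev)]
    print(len(within_one) / len(nba_stats["pf"]))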
     
    # We can calculate how many standard deviations
    # a data point is from the mean
    # by doing some subtraction and division.
    # First, we find the total distance
    # by subtracting the mean.
    total_distance = nba_stats["pf"][0] - mean
    # Then we divide by the standard deviation to
    # find how many standard deviations away the point is.
    standard_deviation_distance = total_distance / std_dev
    
    point_10 = nba_stats["pf"][9]
    point_100 = nba_stats["pf"][99]
    point_10_std = (point_10 - mean) / std_dev
    point_100_std = (point_100 - mean) / std_dev
    

    Normal Distribution
    Make a normal distribution across the range that starts at -10, ends at 10, and has a step of .1:
    points = np.arange(-10, 10, .1)
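
    A minimal sketch of what that could look like end to end (the standard deviation isn't specified here, so the value of 2 below is just an assumption for illustration):

    import numpy as np
    import matplotlib.pyplot as plt
    from scipy.stats import norm

    points = np.arange(-10, 10, .1)
    # normal distribution centered on 0 with an assumed standard deviation of 2
    probabilities = norm.pdf(points, 0, 2)
    plt.plot(points, probabilities)
    plt.show()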

    import numpy as np
    import matplotlib.pyplot as plt
    # The norm module has a pdf function (
    # pdf -  probability density function)
    from scipy.stats import norm
    
    # The arange function generates a numpy vector.
    # The vector below will start at -1
    # and go up to, but not including, 1.
    # It will proceed in "steps" of .01,
    # so the first element will be -1,
    # the second -.99, the third -.98,
    # all the way up to .99.
    points = np.arange(-1,1,0.01)
    
    # The norm.pdf function will take the points vector 
    # and convert it into a probability vector
    
    
    # Each element in the vector will correspond
    # to a point on the normal distribution
    # (earlier and later elements are smaller,
    # with the peak in the center).
    
    
    # The distribution will be centered on 0, 
    # and will have a standard deviation of .3
    probabilities = norm.pdf(points, 0, .3)
    
    # Plot the points values on the x-axis 
    # and the corresponding probabilities on the y-axis
    # See the bell curve?
    plt.plot(points, probabilities)
    plt.show()
    
    
    
    # Housefly wing lengths in millimeters
    wing_lengths = [36, 37, 38, 38, 39, 39, 40, 40, 40,
                    40, 41, 41, 41, 41, 41, 41, 42, 42, 
                    42, 42, 42, 42, 42, 43, 43, 43, 43, 
                    43, 43, 43, 43, 44, 44, 44, 44, 44, 
                    44, 44, 44, 44, 45, 45, 45, 45, 45, 
                    45, 45, 45, 45, 45, 46, 46, 46, 46,
                    46, 46, 46, 46, 46, 46, 47, 47, 47,
                    47, 47, 47, 47, 47, 47, 48, 48, 48, 
                    48, 48, 48, 48, 48, 49, 49, 49, 49, 
                    49, 49, 49, 50, 50, 50, 50, 50, 50,
                    51, 51, 51, 51, 52, 52, 53, 53, 54,
                    55]
    
    
    mean = sum(wing_lengths) / len(wing_lengths)
    variances = [(i - mean) ** 2 for i in wing_lengths]
    variance = sum(variances) / len(variances)
    standard_deviation = variance ** (1/2)

    standard_deviations = [(i - mean) / standard_deviation for i in wing_lengths]

    def within_percentage(deviations, count):
        within = [i for i in deviations if -count <= i <= count]
        return len(within) / len(deviations)
    
    within_one_percentage = within_percentage(standard_deviations, 1)
    within_two_percentage = within_percentage(standard_deviations, 2)
    within_three_percentage = within_percentage(standard_deviations, 3)
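
    For a normal distribution, the 68-95-99.7 rule says roughly 68% of values fall within one standard deviation of the mean, 95% within two, and 99.7% within three. Printing the computed percentages (a small addition for illustration) shows how closely the wing-length data follows that pattern:

    print("Within 1 standard deviation:", within_one_percentage)
    print("Within 2 standard deviations:", within_two_percentage)
    print("Within 3 standard deviations:", within_three_percentage)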
    

    Using Scatterplots to Plot Correlations

    import matplotlib.pyplot as plt
    
    # Plot field goals attempted (the number of
    # shots someone takes in a season) vs. points
    # scored in a season.
    # Field goals attempted is on the x-axis,
    # and points is on the y-axis.
    # As you can tell, they are very strongly correlated;
    # the plot is close to a straight line.
    # The plot also slopes upward,
    # which means that as field goal attempts
    # go up, so do points.
    # That means that the plot is positively correlated.
    plt.scatter(nba_stats["fga"], nba_stats["pts"])
    plt.show()
    

    Measuring Correlation with Pearson's r
    The most common way to measure correlation is to use Pearson's r, which we also call an r-value.
    An r-value ranges from -1 to 1, and indicates how strongly two variables are correlated.
    We can use a function from scipy to calculate Pearson's r.

    from scipy.stats import pearsonr
    
    # The pearsonr function will find the correlation 
    # between two columns of data.
    # It returns the r value and the p value. 
    r, p_value = pearsonr(nba_stats["fga"], nba_stats["pts"])
    # As we can see, this is a very high positive r value 
    # - it's close to 1.
    print(r)
    
    

    Covariance
    Another way to think of correlation is in terms of variance.
    Covariance refers to how two variables vary jointly.
    For each element in the vectors x and y, we:

    • Take the value at each position from 1 to the length of the vectors.
    • Subtract the mean of the vector from those values.
    • Multiply them together at each position, sum all of the resulting values, and divide by the number of elements.
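
    In formula form, this is the (population) covariance computed by the function below:

    cov(\mathbf{x}, \mathbf{y}) = \frac{\sum_{i=1}^{n}(x_{i} - \bar{x})(y_{i} - \bar{y})}{n}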
    def covariance(x, y):
        x_mean = sum(x) / len(x)
        y_mean = sum(y) / len(y)
        x_diffs = [i - x_mean for i in x]
        y_diffs = [i - y_mean for i in y]
        codeviates = [x_diffs[i] * y_diffs[i] for i in range(len(x))]
        return sum(codeviates) / len(codeviates)
    

    # NumPy's cov function returns a 2x2 covariance matrix;
    # the covariance of the two inputs is the off-diagonal entry [0, 1].
    from numpy import cov
    cov(nd_array_a, nd_array_b)
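
    A quick comparison of the hand-rolled function with NumPy's version (a sketch; note that np.cov normalizes by n - 1 by default, so the two values will differ slightly):

    manual_cov = covariance(nba_stats["fga"], nba_stats["pts"])
    numpy_cov = cov(nba_stats["fga"], nba_stats["pts"])[0, 1]
    print(manual_cov, numpy_cov)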

    Calculate Correlation With the std() Method
    We can use the std() method on any pandas DataFrame or Series to calculate the standard deviation.
    We can use the cov function from NumPy to compute covariance, then divide by the product of the two standard deviations to get Pearson's r:

    r = \frac{cov(\mathbf{x}, \mathbf{y})}{\sigma_{x}\sigma_{y}}

    from numpy import cov

    r_fta_blk = cov(nba_stats["fta"], nba_stats["blk"])[0, 1] / (
        nba_stats["fta"].var() * nba_stats["blk"].var()) ** (1/2)
    

    Visualizing the Dataset

    import matplotlib.pyplot as plt
    import pandas as pd
    movie_reviews = pd.read_csv(
        "fandango_score_comparison.csv")
    
    
    fig = plt.figure(figsize=(5, 12))
    ax1 = fig.add_subplot(4,1,1)
    ax2 = fig.add_subplot(4,1,2)
    ax3 = fig.add_subplot(4,1,3)
    ax4 = fig.add_subplot(4,1,4)
    ax1.set_xlim(0,5.0)
    ax2.set_xlim(0,5.0)
    ax3.set_xlim(0,5.0)
    ax4.set_xlim(0,5.0)
    
    movie_reviews["RT_user_norm"].hist(ax=ax1)
    movie_reviews["Metacritic_user_nom"].hist(ax=ax2)
    movie_reviews["Fandango_Ratingvalue"].hist(ax=ax3)
    movie_reviews["IMDB_norm"].hist(ax=ax4)
    plt.show()
    
    # Write a function, named calc_mean,
    # that returns the mean
    # for the values in a Series object.
    # Recall that you can return the values
    # in a Series using the values attribute.
    def calc_mean(series):
        vals = series.values
        mean = sum(vals) / len(vals)
        return mean
    
    # Select just the columns containing normalized user 
    # reviews and assign to a separate Dataframe named user_reviews.
    columns = ["RT_user_norm","Metacritic_user_nom",
               "Fandango_Ratingvalue","IMDB_norm",]
    user_reviews = movie_reviews[columns]
    user_reviews_means = user_reviews.apply(calc_mean)
    
    rt_mean = user_reviews_means["RT_user_norm"]
    mc_mean = user_reviews_means["Metacritic_user_nom"]
    fg_mean = user_reviews_means["Fandango_Ratingvalue"]
    id_mean = user_reviews_means["IMDB_norm"]
    
    print("Rotten Tomatoes (mean):", rt_mean)
    print("Metacritic (mean):", mc_mean)
    print("Fandango (mean):",fg_mean)
    print("IMDB (mean):",id_mean)
    
    # To calculate the variance:
    # write a function, 
    # named calc_variance, that returns 
    # the variance for the values in a Series object.
    
    def calc_variance(series):
        mean = calc_mean(series)
        squared_deviations = (series - mean)**2
        mean_squared_deviations = calc_mean(squared_deviations)
        return mean_squared_deviations
    
    
    cols = ["RT_user_norm",
            "Metacritic_user_nom",
            "Fandango_Ratingvalue",
            "IMDB_norm"]
    user_reviews = movie_reviews[cols]
    user_reviews_variances = user_reviews.apply(calc_variance)
    
    # Calculate the variance and  standard deviation
    # for the RT_user_norm column and 
    # assign to rt_var and rt_stdev respectively.
    
    rt_var = user_reviews_variances["RT_user_norm"]
    mc_var = user_reviews_variances["Metacritic_user_nom"]
    fg_var = user_reviews_variances["Fandango_Ratingvalue"]
    id_var = user_reviews_variances["IMDB_norm"]
    
    rt_stdev = rt_var ** (1/2)
    mc_stdev = mc_var ** (1/2)
    fg_stdev = fg_var ** (1/2)
    id_stdev = id_var ** (1/2)
    
    print("Rotten Tomatoes (variance):", rt_var)
    print("Metacritic (variance):", mc_var)
    print("Fandango (variance):", fg_var)
    print("IMDB (variance):", id_var)
    
    print("Rotten Tomatoes (standard deviation):", rt_stdev)
    print("Metacritic (standard deviation):", mc_stdev)
    print("Fandango (standard deviation):", fg_stdev)
    print("IMDB (standard deviation):", id_stdev)
    
    # Create a matplotlib subplot grid with the following properties:
    # 3 rows by 1 column,
    # figsize of 4 (width) by 8 (height),
    # each Axes instance should have
    # an x-value range of 0.0 to 5.0.
    import matplotlib.pyplot as plt
    
    fig = plt.figure(figsize=(4, 8))
    ax1 = fig.add_subplot(3,1,1)
    ax2 = fig.add_subplot(3,1,2)
    ax3 = fig.add_subplot(3,1,3)
    
    ax1.set_xlim(0.0,5.0)
    ax2.set_xlim(0.0,5.0)
    ax3.set_xlim(0.0,5.0)
    
    ax1.scatter(movie_reviews["RT_user_norm"],movie_reviews["Fandango_Ratingvalue"])
    ax2.scatter(movie_reviews["Metacritic_user_nom"],movie_reviews["Fandango_Ratingvalue"])
    ax3.scatter(movie_reviews["IMDB_norm"],movie_reviews["Fandango_Ratingvalue"])
    
    plt.show()
    
    
    def calc_covariance(series_one, series_two):
        x = series_one.values
        y = series_two.values
        x_mean = calc_mean(series_one)
        y_mean = calc_mean(series_two)
        x_diffs = [i - x_mean for i in x]
        y_diffs = [i - y_mean for i in y]
        codeviates = [x_diffs[i] * y_diffs[i] for i in range(len(x))]
        return sum(codeviates) / len(codeviates)
    
    rt_fg_covar = calc_covariance(movie_reviews["RT_user_norm"], 
                                  movie_reviews["Fandango_Ratingvalue"])
    mc_fg_covar = calc_covariance(movie_reviews["Metacritic_user_nom"], 
                                  movie_reviews["Fandango_Ratingvalue"])
    id_fg_covar = calc_covariance(movie_reviews["IMDB_norm"], 
                                  movie_reviews["Fandango_Ratingvalue"])
    
    def calc_correlation(series_one, series_two):
        cov = calc_covariance(series_one, series_two)
        std_one = calc_variance(series_one) ** (1/2)
        std_two = calc_variance(series_two) ** (1/2)
        return cov / (std_one * std_two)
    
    rt_fg_corr = calc_correlation(movie_reviews['RT_user_norm'],
                                  movie_reviews['Fandango_Ratingvalue'])

    mc_fg_corr = calc_correlation(movie_reviews['Metacritic_user_nom'],
                                  movie_reviews['Fandango_Ratingvalue'])

    id_fg_corr = calc_correlation(movie_reviews['IMDB_norm'],
                                  movie_reviews['Fandango_Ratingvalue'])
    
    print("Correlation between Rotten Tomatoes and Fandango", rt_fg_corr)
    print("Correlation between Metacritic and Fandango", mc_fg_corr)
    print("Correlation between IMDB and Fandango", id_fg_corr)
    
