第三次尝试

作者: Evas77 | 来源:发表于2017-09-21 22:02 被阅读7次
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    
    %matplotlib inline
    %config InlineBackend.figure_format = 'retina'
    
    df = pd.read_csv('HRSalaries.csv')
    df.head()
    
    len(df)
    
    30284
    
    df.Department.value_counts()
    
    POLICE                   12461
    FIRE                      4798
    SANITATION                2092
    WATER MGMT                1796
    AVIATION                  1252
    TRANSPORTATION            1196
    EMERGENCY MGMT            1182
    GENERAL SERVICES           922
    PUBLIC LIBRARY             874
    FAMILY & SUPPORT           719
    HEALTH                     568
    FINANCE                    533
    LAW                        455
    CITY COUNCIL               265
    BUILDINGS                  261
    COMMUNITY DEVELOPMENT      216
    BUSINESS AFFAIRS           177
    DoIT                        97
    MAYOR'S OFFICE              96
    PROCUREMENT                 77
    CULTURAL AFFAIRS            76
    HUMAN RESOURCES             61
    ANIMAL CONTRL               57
    DISABILITIES                29
    TREASURER                   24
    Name: Department, dtype: int64
    
    len(df.Department.unique())
    
    25
    
    salary = df.Annual_Salary
    
    salary.sum() / len(salary)
    
    60836.98560295866
    
    salary.mean()
    
    60836.98560295866
    
    np.mean(salary)
    
    60836.98560295866
    
    df.groupby('Department').Annual_Salary.mean().sort_values(ascending=False)
    
    Department
    DoIT                     73831.979381
    BUILDINGS                72137.885057
    FIRE                     69383.989996
    MAYOR'S OFFICE           68953.677083
    WATER MGMT               64760.186526
    COMMUNITY DEVELOPMENT    64262.597222
    GENERAL SERVICES         63747.808026
    TREASURER                63497.500000
    POLICE                   63127.904984
    TRANSPORTATION           62947.504181
    PROCUREMENT              61452.584416
    HEALTH                   61213.503521
    CULTURAL AFFAIRS         61181.894737
    DISABILITIES             58058.586207
    BUSINESS AFFAIRS         57216.067797
    HUMAN RESOURCES          57108.163934
    LAW                      55917.958242
    AVIATION                 55816.200479
    SANITATION               55555.813576
    FINANCE                  54286.375235
    ANIMAL CONTRL            47604.473684
    PUBLIC LIBRARY           44241.731121
    EMERGENCY MGMT           42845.754653
    CITY COUNCIL             38046.547170
    FAMILY & SUPPORT         31193.307371
    Name: Annual_Salary, dtype: float64
    
    len(salary)
    
    30284
    
    sorted_salary = salary.sort_values()
    sorted_salary
    
    16629      3128
    2247       3132
    20961      3133
    13423      3135
    25422      3135
    16451      3136
    1732       3136
    2215       3140
    23454      3144
    12291      3149
    25959      3151
    19186      3153
    9905       3156
    1752       3182
    25170      3188
    4739       3188
    4858       3190
    3688       3193
    6947       3209
    20065      3215
    17837      3219
    24322      3230
    5840       3232
    25971      3236
    8175       3244
    22631      3253
    22602      3257
    15958      3263
    5194       3265
    18713      3266
              ...  
    23452    128174
    27353    130023
    14122    130788
    18022    131070
    17173    131461
    7333     132249
    20450    132283
    11322    135289
    14042    135623
    14386    136798
    6491     136806
    17347    137121
    7565     137153
    13217    137300
    12748    137305
    10283    137574
    16910    137583
    4165     137584
    29829    137669
    1260     138506
    14746    138546
    15059    138760
    2866     138826
    8904     144240
    3246     144914
    20222    146247
    11657    146776
    3486     157054
    27879    167858
    6724     201448
    Name: Annual_Salary, Length: 30284, dtype: int64
    
    # Manual median of the sorted Series. The two positions are derived from
    # the length instead of being hard-coded: for an even count they are the
    # two middle rows (15141 and 15142 for 30284 rows, zero-based); for an odd
    # count both indices land on the single middle row.
    n = len(sorted_salary)
    (sorted_salary.iloc[(n - 1) // 2] + sorted_salary.iloc[n // 2]) / 2
    
    61836.0
    
    salary.median()
    
    61836.0
    
    plt.hist(salary, bins=50, rwidth=0.9)
    plt.show()
    
    output_14_0.png
    plt.hist(salary, bins=25, rwidth=0.9, range=(100000, 210000))
    plt.show()
    
    output_15_0.png
    salary.mean() > salary.median()
    
    False
    
    fire_salary = df[df.Department == 'FIRE'].Annual_Salary
    plt.hist(fire_salary, bins=50, rwidth=0.9)
    plt.show()
    
    output_17_0.png
    fire_salary.mean()
    
    
    69383.9899958316
    
    fire_salary.median()
    
    66260.0
    
    fire_salary.mean() > fire_salary.median()
    
    True
    
    salary.max() 
    
    201448
    
    salary.min()
    
    3128
    
    salary.max() - salary.min()
    
    198320
    
    Q1 = salary.quantile(0.25)
    
    Q3 = salary.quantile(0.75)
    Q3
    
    68558.5
    
    IQR = Q3 - Q1
    IQR
    
    12886.75
    
    salary.quantile(0.5)
    
    61836.0
    
    salary.median()
    
    61836.0
    
    salary.plot(kind='box', vert=False, figsize=(15, 5))
    plt.show()
    
    output_29_0.png
    doit_salary = df[df.Department == 'DoIT'].Annual_Salary.tolist()
    plt.boxplot(doit_salary)
    plt.show()
    
    output_30_0.png
    build_salary = df[df.Department == 'BUILDINGS'].Annual_Salary.tolist()
    plt.boxplot([build_salary, doit_salary], labels=['BUILDINGS', 'DoIT'])
    plt.show()
    
    output_31_0.png
    import seaborn as sns
    
    sns.boxplot(data=df, y='Department', x='Annual_Salary')
    plt.show()
    
    output_32_0.png
    mean = salary.mean()
    np.sum((salary - mean)**2) / (len(salary) - 1)
    
    271490393.4177519
    
    var = salary.var()
    var
    
    271490393.4177519
    
    np.sqrt(np.sum((salary - mean)**2) / (len(salary) - 1))
    
    16476.965540346071
    
    std = salary.std()
    std
    
    16476.96554034607
    
    len(salary[salary.between(mean - std, mean + std)]) / len(salary)
    
    0.7666094307224938
    
    len(salary[salary.between(mean - 2*std, mean + 2*std)]) / len(salary)
    
    0.933364152687888
    
    score = df.Review_Score
    
    np.sum((salary - salary.mean()) * (score - score.mean())) / (len(salary)-1)
    
    7.747599921809748
    
    np.cov(salary, score)
    
    array([[  2.71490393e+08,   7.74759992e+00],
           [  7.74759992e+00,   1.06173362e+00]])
    
    cov = np.cov(salary, score)[0,1]
    cov
    
    7.7475999218100222
    
    np.cov(salary, score)[0,1] / (salary.std() * score.std())
    
    0.00045633330757004046
    
    np.corrcoef(salary, score)[0,1]
    
    0.00045633330757003586
    
    plt.scatter(score, salary, alpha=0.3)
    plt.show()
    
    output_45_0.png
    position = df[df.Position_Title == 'FIREFIGHTER']
    print(np.corrcoef(position.Annual_Salary, position.Review_Score)[1,0])
    plt.scatter(position.Review_Score, position.Annual_Salary)
    plt.show()
    
    0.0571267765462
    
    output_46_1.png
    #第三课作业
    
    #1、计算 HRSalaries 数据中评分Review_Score 的均值和中位数,并判断其偏度是左偏还是右偏?
    
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    
    %matplotlib inline
    %config InlineBackend.figure_format = 'retina'
    
    df = pd.read_csv('HRSalaries.csv')
    df.head()
    
    # NOTE: despite its name, `salary` holds the Review_Score column from here
    # on; the name is kept because the homework cells below refer to it.
    salary = df.Review_Score
    
    # Mean of Review_Score computed by hand (sum divided by row count).
    salary.sum() / len(salary)
    
    6.4558908994849205
    
    df.groupby('Department').Review_Score.mean().sort_values(ascending=False)
    
    Department
    MAYOR'S OFFICE           6.711458
    ANIMAL CONTRL            6.575439
    BUSINESS AFFAIRS         6.509605
    AVIATION                 6.504073
    COMMUNITY DEVELOPMENT    6.493519
    HEALTH                   6.493486
    HUMAN RESOURCES          6.480328
    BUILDINGS                6.474330
    GENERAL SERVICES         6.472234
    WATER MGMT               6.464310
    POLICE                   6.463117
    FIRE                     6.461171
    EMERGENCY MGMT           6.452369
    DISABILITIES             6.441379
    LAW                      6.439121
    SANITATION               6.433987
    FINANCE                  6.420450
    FAMILY & SUPPORT         6.413908
    PROCUREMENT              6.406494
    PUBLIC LIBRARY           6.399542
    TRANSPORTATION           6.395234
    CITY COUNCIL             6.383019
    DoIT                     6.342268
    CULTURAL AFFAIRS         6.261842
    TREASURER                6.258333
    Name: Review_Score, dtype: float64
    
    # Row count of the Review_Score data. Uses `salary` (bound to
    # df.Review_Score in this homework section) rather than `score`, which is
    # only defined in the earlier Annual_Salary walkthrough.
    len(salary)
    
    30284
    
    salary.mean() > salary.median()
    
    False
    
    # NOTE(review): `fire_salary` is the FIRE department's Annual_Salary series
    # created in the earlier walkthrough, so this comparison says nothing about
    # Review_Score; the Review_Score check is the `salary` comparison above.
    fire_salary.mean() > fire_salary.median()
    
    True
    
    #2、 Review_Score 的IQR是多少?并绘制该数据的box图。
    
    Q1 = salary.quantile(0.25)
    Q1
    
    5.8
    
    Q3 = salary.quantile(0.75)
    Q3
    
    7.2
    
    IQR = Q3 - Q1
    IQR
    
    1.4000000000000004
    
    salary.plot(kind='box', vert=False, figsize=(15, 5))
    plt.show()
    
    output_61_0.png
    #3、Review_Score的标准差是多少?
    
    # Sample standard deviation (n - 1 denominator) of Review_Score by hand.
    # The deviations must be taken from this column's own mean: the global
    # `mean` still holds the Annual_Salary mean from the earlier walkthrough,
    # which is what produced the 60831.53... value recorded below. Centred
    # correctly, the result agrees with salary.std() (about 1.0304).
    np.sqrt(np.sum((salary - salary.mean())**2) / (len(salary) - 1))
    
    60831.534080120342
    
    std = salary.std()
    std
    
    1.030404588021642
    
    # 4. For Review_Score, find the share of values that lie within two
    # standard deviations of the mean.
    score_mean = salary.mean()
    score_std = salary.std()
    # Same approach as the Annual_Salary walkthrough: keep the rows inside
    # [mean - 2*std, mean + 2*std] and divide by the total row count. The
    # result is a fraction; multiply by 100 to express it as a percentage.
    len(salary[salary.between(score_mean - 2*score_std, score_mean + 2*score_std)]) / len(salary)
    
    # 这一课感觉几乎完全看不懂,只能大概理解最前面的一些内容,比如导入库和读取文件。
    # 这些公式我还看不懂,下次再来研究。
    

    相关文章

      网友评论

        本文标题:第三次尝试

        本文链接:https://www.haomeiwen.com/subject/sdotextx.html