import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
df = pd.read_csv('HRSalaries.csv')
df.head()
len(df)
30284
df.Department.value_counts()
POLICE 12461
FIRE 4798
SANITATION 2092
WATER MGMT 1796
AVIATION 1252
TRANSPORTATION 1196
EMERGENCY MGMT 1182
GENERAL SERVICES 922
PUBLIC LIBRARY 874
FAMILY & SUPPORT 719
HEALTH 568
FINANCE 533
LAW 455
CITY COUNCIL 265
BUILDINGS 261
COMMUNITY DEVELOPMENT 216
BUSINESS AFFAIRS 177
DoIT 97
MAYOR'S OFFICE 96
PROCUREMENT 77
CULTURAL AFFAIRS 76
HUMAN RESOURCES 61
ANIMAL CONTRL 57
DISABILITIES 29
TREASURER 24
Name: Department, dtype: int64
len(df.Department.unique())
25
salary = df.Annual_Salary
salary.sum() / len(salary)
60836.98560295866
salary.mean()
60836.98560295866
np.mean(salary)
60836.98560295866
df.groupby('Department').Annual_Salary.mean().sort_values(ascending=False)
Department
DoIT 73831.979381
BUILDINGS 72137.885057
FIRE 69383.989996
MAYOR'S OFFICE 68953.677083
WATER MGMT 64760.186526
COMMUNITY DEVELOPMENT 64262.597222
GENERAL SERVICES 63747.808026
TREASURER 63497.500000
POLICE 63127.904984
TRANSPORTATION 62947.504181
PROCUREMENT 61452.584416
HEALTH 61213.503521
CULTURAL AFFAIRS 61181.894737
DISABILITIES 58058.586207
BUSINESS AFFAIRS 57216.067797
HUMAN RESOURCES 57108.163934
LAW 55917.958242
AVIATION 55816.200479
SANITATION 55555.813576
FINANCE 54286.375235
ANIMAL CONTRL 47604.473684
PUBLIC LIBRARY 44241.731121
EMERGENCY MGMT 42845.754653
CITY COUNCIL 38046.547170
FAMILY & SUPPORT 31193.307371
Name: Annual_Salary, dtype: float64
len(salary)
30284
sorted_salary = salary.sort_values()
sorted_salary
16629 3128
2247 3132
20961 3133
13423 3135
25422 3135
16451 3136
1732 3136
2215 3140
23454 3144
12291 3149
25959 3151
19186 3153
9905 3156
1752 3182
25170 3188
4739 3188
4858 3190
3688 3193
6947 3209
20065 3215
17837 3219
24322 3230
5840 3232
25971 3236
8175 3244
22631 3253
22602 3257
15958 3263
5194 3265
18713 3266
...
23452 128174
27353 130023
14122 130788
18022 131070
17173 131461
7333 132249
20450 132283
11322 135289
14042 135623
14386 136798
6491 136806
17347 137121
7565 137153
13217 137300
12748 137305
10283 137574
16910 137583
4165 137584
29829 137669
1260 138506
14746 138546
15059 138760
2866 138826
8904 144240
3246 144914
20222 146247
11657 146776
3486 157054
27879 167858
6724 201448
Name: Annual_Salary, Length: 30284, dtype: int64
(sorted_salary.iloc[15141] + sorted_salary.iloc[15142]) / 2
61836.0
salary.median()
61836.0
plt.hist(salary, bins=50, rwidth=0.9)
plt.show()
output_14_0.png
plt.hist(salary, bins=25, rwidth=0.9, range=(100000, 210000))
plt.show()
output_15_0.png
salary.mean() > salary.median()
False
fire_salary = df[df.Department == 'FIRE'].Annual_Salary
plt.hist(fire_salary, bins=50, rwidth=0.9)
plt.show()
output_17_0.png
fire_salary.mean()
69383.9899958316
fire_salary.median()
66260.0
fire_salary.mean() > fire_salary.median()
True
salary.max()
201448
salary.min()
3128
salary.max() - salary.min()
198320
Q1 = salary.quantile(0.25)
Q3 = salary.quantile(0.75)
Q3
68558.5
IQR = Q3 - Q1
IQR
12886.75
salary.quantile(0.5)
61836.0
salary.median()
61836.0
salary.plot(kind='box', vert=False, figsize=(15, 5))
plt.show()
output_29_0.png
doit_salary = df[df.Department == 'DoIT'].Annual_Salary.tolist()
plt.boxplot(doit_salary)
plt.show()
output_30_0.png
build_salary = df[df.Department == 'BUILDINGS'].Annual_Salary.tolist()
plt.boxplot([build_salary, doit_salary], labels=['BUILDINGS', 'DoIT'])
plt.show()
output_31_0.png
import seaborn as sns
sns.boxplot(data=df, y='Department', x='Annual_Salary')
plt.show()
output_32_0.png
mean = salary.mean()
np.sum((salary - mean)**2) / (len(salary) - 1)
271490393.4177519
var = salary.var()
var
271490393.4177519
np.sqrt(np.sum((salary - mean)**2) / (len(salary) - 1))
16476.965540346071
std = salary.std()
std
16476.96554034607
len(salary[salary.between(mean - std, mean + std)]) / len(salary)
0.7666094307224938
len(salary[salary.between(mean - 2*std, mean + 2*std)]) / len(salary)
0.933364152687888
score = df.Review_Score
np.sum((salary - salary.mean()) * (score - score.mean())) / (len(salary)-1)
7.747599921809748
np.cov(salary, score)
array([[ 2.71490393e+08, 7.74759992e+00],
[ 7.74759992e+00, 1.06173362e+00]])
cov = np.cov(salary, score)[0,1]
cov
7.7475999218100222
np.cov(salary, score)[0,1] / (salary.std() * score.std())
0.00045633330757004046
np.corrcoef(salary, score)[0,1]
0.00045633330757003586
plt.scatter(score, salary, alpha=0.3)
plt.show()
output_45_0.png
position = df[df.Position_Title == 'FIREFIGHTER']
print(np.corrcoef(position.Annual_Salary, position.Review_Score)[1,0])
plt.scatter(position.Review_Score, position.Annual_Salary)
plt.show()
0.0571267765462
output_46_1.png
#第三课作业
#1、计算 HRSalaries 数据中评分Review_Score 的均值和中位数,并判断其偏度是左偏还是右偏?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
df = pd.read_csv('HRSalaries.csv')
df.head()
salary = df.Review_Score
salary.sum() / len(salary)
6.4558908994849205
df.groupby('Department').Review_Score.mean().sort_values(ascending=False)
Department
MAYOR'S OFFICE 6.711458
ANIMAL CONTRL 6.575439
BUSINESS AFFAIRS 6.509605
AVIATION 6.504073
COMMUNITY DEVELOPMENT 6.493519
HEALTH 6.493486
HUMAN RESOURCES 6.480328
BUILDINGS 6.474330
GENERAL SERVICES 6.472234
WATER MGMT 6.464310
POLICE 6.463117
FIRE 6.461171
EMERGENCY MGMT 6.452369
DISABILITIES 6.441379
LAW 6.439121
SANITATION 6.433987
FINANCE 6.420450
FAMILY & SUPPORT 6.413908
PROCUREMENT 6.406494
PUBLIC LIBRARY 6.399542
TRANSPORTATION 6.395234
CITY COUNCIL 6.383019
DoIT 6.342268
CULTURAL AFFAIRS 6.261842
TREASURER 6.258333
Name: Review_Score, dtype: float64
len(score)
30284
salary.mean() > salary.median()
False
fire_salary.mean() > fire_salary.median()
True
#2、 Review_Score 的IQR是多少?并绘制该数据的box图。
Q1 = salary.quantile(0.25)
Q1
5.8
Q3 = salary.quantile(0.75)
Q3
7.2
IQR = Q3 - Q1
IQR
1.4000000000000004
salary.plot(kind='box', vert=False, figsize=(15, 5))
plt.show()
output_61_0.png
#3、Review_Score的标准差是多少?
np.sqrt(np.sum((salary - mean)**2) / (len(salary) - 1))
60831.534080120342
std = salary.std()
std
1.030404588021642
#4、在Review_Score中,求落在两个标准差内的数据占总数的百分比。
# 这一课感觉完全看不懂了,大概能理解到最前面的一些东西,导入什么库,和读取文件。
#这些公式表示看不懂。下次再来研究它。
网友评论