中心极限定理 & 区间估计
import scipy.stats
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
`%matplotlib inline` 这一句让图像直接内嵌显示在 notebook 中,因此即使不调用 `plt.show()` 也能出图。
标准正态分布
# Standard normal distribution (scipy.stats.norm with its default parameters).
standard_norm = scipy.stats.norm
# Evaluation grid from -4 up to (but excluding) 4 in steps of 0.01.
x = np.arange(-4, 4, 0.01)
# Plot the probability density function over the grid.
plt.plot(x, standard_norm.pdf(x))
plt.show()
错误记录:
拼写错误:`standard` 少写了一个 a,错写成了 `standrd`,所以一直没找到报错的原因。

t 分布
# Student's t distribution object from scipy.stats.
t_dist = scipy.stats.t
# Reference curve: the standard normal density.
plt.plot(x, standard_norm.pdf(x), label='standard normal')
x = np.arange(-4, 4, 0.01)
# t distribution with 1 degree of freedom, drawn on the same axes for comparison.
plt.plot(x, t_dist.pdf(x, df=1), label='t distribution')
plt.legend()
plt.show()
错误记录:
`plt.plot(x, y, label='abc')` 中的参数 `label`,被我误写成了复数形式 `labels`。

置信区间
$$ \bar{x} - |z_{\alpha/2}|\frac{\sigma}{\sqrt{n}} < \mu < \bar{x} + |z_{\alpha/2}|\frac{\sigma}{\sqrt{n}}$$
导入住房面积数据
# The CSV is read without a header row (header=None), so columns are numbered.
house = pd.read_csv('house_size.csv', header=None)
# Select the first column as the series of house-size observations.
house_size = house.iloc[:,0]
print(list(house_size))
314, 119, 217, 326, 342, 318, 130, 465, 383, 396, 507, 283, 250, 326, 279, 363, 229, 303, 367, 246,
247, 262, 209, 294, 112, 249, 354, 355, 272, 277, 377, 411, 223, 232, 445, 333, 336, 349, 611, 516, 233, 275, 395, 241, 127, 228, 305, 321, 235, 226, 288, 503, 305, 280, 318, 281, 227, 279, 171, 290, 336, 284, 380, 314, 316, 476, 309, 293, 160, 300, 319, 396, 275, 212, 344, 305, 280, 331, 359, 283, 136, 322, 359, 202, 188, 187, 457, 340, 262, 288, 318, 381, 289, 205, 373, 200, 320, 213, 261, 357]
假设已知总体的标准差为 86 平方米
# Population standard deviation, assumed known for the z-based interval.
pop_std = 86
# Mean of the sampled house sizes.
sample_mean = house_size.mean()
sample_mean
300.85
# Number of observations in the sample.
sample_size = len(house_size)
sample_size
100
z_score = scipy.stats.norm.isf(0.025) # 95% confidence level: upper-tail probability alpha/2 = 0.025
z_score
1.9599639845400545
# Margin of error: z * sigma / sqrt(n).
margin_error = z_score * pop_std / np.sqrt(sample_size)
margin_error
16.855690267044469
# Bounds of the 95% confidence interval: sample mean -/+ margin of error.
# The lower bound must be named `lower_limit`, because the print below reads
# that name (the previous `c_limit` assignment left it undefined).
lower_limit = sample_mean - margin_error
upper_limit = sample_mean + margin_error
print('95%% Confidence Interval: ( %.1f, %.1f)' % (lower_limit, upper_limit))
95% Confidence Interval: ( 284.0, 317.7)
定义置信区间计算的函数
def ci_z(data, pop_std, confidence):
    """Return a z-based confidence interval for the population mean.

    Args:
        data: Sample observations (anything accepted by ``np.mean`` and
            ``len``).
        pop_std: Population standard deviation, assumed known.
        confidence: Confidence level, e.g. ``0.95``.

    Returns:
        A ``(lower_limit, upper_limit)`` tuple. A confidence of 1 yields
        ``(-inf, inf)``.
    """
    sample_mean = np.mean(data)
    sample_size = len(data)
    # Probability left in each tail; isf() takes an upper-tail probability,
    # so the confidence level is already accounted for in z_score.
    tail_prob = (1 - confidence) / 2
    z_score = scipy.stats.norm.isf(tail_prob)
    margin_error = z_score * pop_std / np.sqrt(sample_size)
    return (sample_mean - margin_error, sample_mean + margin_error)
# 90% z-based confidence interval for the mean house size.
ci_z(house_size, pop_std, 0.90)
(286.70425880821733, 314.99574119178271)
# 95% z-based confidence interval for the mean house size.
ci_z(house_size, pop_std, 0.95)
(283.99430973295557, 317.70569026704447)
# 99% z-based confidence interval for the mean house size.
ci_z(house_size, pop_std, 0.99)
(278.69786798947951, 323.00213201052054)
# Confidence level of 1: the interval becomes unbounded (see the output below).
ci_z(house_size, pop_std, 1)
(-inf, inf)
错误记录1:sqrt 表示开平方(二次方根),需要写成 `np.sqrt(...)`,例如 `np.sqrt(sample_size)`。
错误记录2:计算 z_score 时已经把置信度考虑进去了——`isf(0.025)` 返回的是使右侧尾部概率等于 0.025(即 α/2)的临界值,所以此处不需要再乘以 α/2。
Bootstrap 方法计算置信区间
np.random.choice(house_size, size=10) # draw 10 values at random from house_size, with replacement
array([326, 226, 212, 226, 279, 294, 373, 383, 226, 377])
def bootstrap_mean(data):
    """Return the mean of one bootstrap resample of *data*.

    The resample has the same size as *data* and is drawn with replacement
    (the default behaviour of ``np.random.choice``).
    """
    return np.mean(np.random.choice(data, size=len(data)))
def draw_bootstrap(data, times=1):
    """Return an array of *times* bootstrap sample means of *data*.

    Args:
        data: Sample observations to resample from.
        times: Number of bootstrap resamples to draw.

    Returns:
        A float NumPy array of length *times*; each entry is the mean of one
        resample produced by ``bootstrap_mean``.
    """
    # dtype=float keeps the result a float array even when times == 0.
    return np.array([bootstrap_mean(data) for _ in range(times)], dtype=float)
# Draw 10000 bootstrap sample means and plot their distribution.
bs_mean = draw_bootstrap(house_size, 10000)
# `density=True` normalises the histogram to unit area; it replaces the
# `normed` keyword, which has been removed from Matplotlib.
plt.hist(bs_mean, bins=50, density=True, rwidth=0.9)
plt.show()

# 2.5th and 97.5th percentiles of the bootstrap means: a 95% percentile interval.
np.percentile(bs_mean, [2.5, 97.5])
array([ 283.659, 318.66 ])
作业
用t分布求上述置信区间
import scipy.stats
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'
# Reload the data; header=None means the CSV has no header row.
house = pd.read_csv('house_size.csv', header=None)
# Select the first column as the series of house-size observations.
house_size = house.iloc[:,0]
# Show the first rows as a quick sanity check.
print(house_size.head())
0 314
1 119
2 217
3 326
4 342
Name: 0, dtype: int64
# Student's t distribution object from scipy.stats; printing it shows its repr.
t_dist = scipy.stats.t
print(t_dist)
<scipy.stats._continuous_distns.t_gen object at 0x7f5e05844f98>
# Critical value with upper-tail probability 0.025 for a t distribution with
# 1 degree of freedom.
t_score = scipy.stats.t(df=1).isf(0.025)
print(t_score)
12.7062047364
sample_mean = house_size.mean()
sample_size = len(house_size)
# ddof=1 gives the sample (n - 1) standard deviation needed for a t interval;
# np.std defaults to ddof=0, the population form.
sample_std = np.std(house_size, ddof=1)
print(sample_mean)
print(sample_size)
300.85
100
# Degrees of freedom for the t interval are n - 1, i.e. sample_size - 1
# (not sample_mean - 1). With sample_size = 100 this gives df = 99 and a
# critical value of about 1.984.
t_score = scipy.stats.t(df=sample_size-1).isf(0.025)
print(t_score)
1.96790699855
# Margin of error: t * s / sqrt(n).
ME = t_score*sample_std/np.sqrt(sample_size)
print(ME)
# Error log: s is the sample standard deviation, but I had mistakenly used the sample mean in this calculation.
17.4183977393
# Interval bounds: sample mean minus/plus the margin of error.
lower_limit = sample_mean - ME
upper_limit = sample_mean + ME
print(lower_limit,upper_limit)
283.431602261 318.268397739
来吧,试一下用函数呢
def ci_t(data,confidengce):
    """Return a t-based confidence interval for the population mean.

    Used when the population standard deviation is unknown and is estimated
    from the sample itself.

    Args:
        data: Sample observations (anything accepted by ``np.mean`` and
            ``len``).
        confidengce: Confidence level, e.g. ``0.95``. The misspelled name is
            kept so that existing keyword callers keep working.

    Returns:
        A ``(lower_limit, upper_limit)`` tuple.
    """
    sample_size = len(data)
    sample_mean = np.mean(data)
    # ddof=1 gives the sample (n - 1) standard deviation required by the
    # t interval; np.std defaults to the population form (ddof=0).
    sample_std = np.std(data, ddof=1)
    # Each tail holds (1 - confidence) / 2; df is n - 1.
    t_score = scipy.stats.t(df=sample_size - 1).isf((1 - confidengce) / 2)
    # The standard error divides by sqrt(n), not by sqrt(sample_std).
    margin_error = t_score * sample_std / np.sqrt(sample_size)
    return (sample_mean - margin_error, sample_mean + margin_error)
# 95% t-based confidence interval for the mean house size.
ci_t(house_size,0.95)
(282.18229316004903, 319.51770683995102)
# Near-zero confidence level: the interval collapses toward the sample mean.
ci_t(house_size,0.00001)
(300.84988178885078, 300.85011821114927)
网友评论