关于learning feasible和hoeffding bound的理解,该实验很有意思:
image.png
(a) u=0.5
(b) 样本是抽一枚硬币,样本集D是抽10次,总共100,000次试验,相当于生成了100,000个样本集,这里面有good data, 当然也有bad data. 代码如下:
from random import randint
from matplotlib import pyplot as plt
import os
import json
import collections
import numpy as np
class Coin:
"""1是head"""
def roll(self):
return randint(0, 1)
class Experiment:
def __init__(self):
self.coins = []
self.result = []
self._init_coins()
def _init_coins(self):
for _ in range(1000):
self.coins.append(Coin())
def execute_experiment(self):
for coin in self.coins:
total = 0
for _ in range(10):
total += coin.roll()
self.result.append(total / 10)
v1 = self.result[0]
v_rand = self.result[randint(0, 999)]
v_min = min(self.result)
return v1, v_rand, v_min
def load_data():
with open('v1.json', 'r') as f:
v1_data = json.load(f)
with open('v_rand.json', 'r') as f:
v_rand_data = json.load(f)
with open('v_min.json', 'r') as f:
v_min_data = json.load(f)
print_counts(v1_data)
print_counts(v_rand_data)
print_counts(v_min_data)
plt.subplot(131)
plt.hist(v1_data, bins=11)
plt.subplot(132)
plt.hist(v_rand_data, bins=11)
plt.subplot(133)
plt.hist(v_min_data, range=(0, 1))
plt.show()
return v1_data, v_rand_data, v_min_data
def print_counts(l):
obj = collections.Counter(l)
for k, v in sorted(obj.items()):
print(k, v)
print()
(c) v已经算出来,u=0.5, 把epsilon从0到1逐渐增大,概率P可以用频率来得到,绘图函数时plot_bound
def plot_bound(l_v, legend):
np_v = np.array(l_v)
total_experiments = len(l_v)
abs_diff = np.abs(np_v - 0.5) # u=0.5
N = 10
P = []
upper_bounds = []
epsilons = np.linspace(0, 1, 50, endpoint=False)
for epsilon in epsilons:
P.append(sum(abs_diff > epsilon) / total_experiments)
upper_bounds.append(2 * np.exp(-2 * epsilon ** 2 * N))
plt.plot(epsilons, P)
plt.plot(epsilons, upper_bounds)
plt.legend([legend, 'upper bound'], loc='upper right')
plt.show()
if __name__ == '__main__':
if not os.path.exists('v1.json'):
v1_data = []
v_rand_data = []
v_min_data = []
for _ in range(100000):
experiment = Experiment()
v1, v_rand, v_min = experiment.execute_experiment()
v1_data.append(v1)
v_rand_data.append(v_rand)
v_min_data.append(v_min)
with open('v1.json', 'w') as f:
json.dump(v1_data, f)
with open('v_rand.json', 'w') as f:
json.dump(v_rand_data, f)
with open('v_min.json', 'w') as f:
json.dump(v_min_data, f)
v1_data, v_rand_data, v_min_data = load_data()
plot_bound(v1_data, 'v1')
plot_bound(v_rand_data, 'v_rand')
plot_bound(v_min_data, 'v_min')
绘出下面3幅图:
image.png
image.png
可以发现前两个概率都在Hoeffding bound下面,但是第三个概率在Hoeffding bound上面。原因就是v_min是在data set生成后再选则出来的,而非一开始就固定的。(
h is not fixed before you generate the data set, but you are allowed to change h after you generate the data set
)(d) 1000个硬币就是1000个bin,v_1是固定在一个bin,v_rand是随机的选bin,v_min是找最小的Ein所在的bin,而抽样次数即样本点个数是N=10次。事实上,如果N非常大,即使用最小的Ein来估计u,也是比较精确的。
image.png
网友评论