Consider a classification problem with two classes that have equal prior probabilities, as shown in Figure 2.1.
1) Generate a figure like Fig. 2.1. The blue class is generated from a single Gaussian, while the red class comes from a mixture of two Gaussians.
Answer: To transform a set of samples drawn from a standard Gaussian into samples with covariance $\Sigma$, perform a Cholesky decomposition of $\Sigma$ to obtain a transformation matrix $R$ (with $R^{\mathrm T}R = \Sigma$), and then compute $x' = xR + \mu$ to obtain samples from the new Gaussian (a minimal sketch follows the parameter list below).
The distribution parameters are set as:
blue_sigma = [np.array([[0.7, 0.1], [0.1, 0.6]])]
blue_mu = [np.array([0, 0])]
red_sigma = [np.array([[0.5, 0.4], [0.4, 0.5]]), np.array([[0.6, 0], [0, 0.6]])]
red_mu = [np.array([2.5, 1]), np.array([1, -1])]
points_num = 300
prior = [0.5,0.5]
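A minimal sketch of this transformation (for illustration only; the variable names are mine), using the blue-class parameters above:
import numpy as np
mu = np.array([0, 0])                        # blue_mu[0]
sigma = np.array([[0.7, 0.1], [0.1, 0.6]])   # blue_sigma[0]
L = np.linalg.cholesky(sigma)                # sigma = L @ L.T
z = np.random.randn(300, 2)                  # standard-normal samples
x = z @ L.T + mu                             # samples with mean mu and covariance sigma
print(np.cov(x, rowvar=False))               # should be close to sigma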
The generated distribution:
[Figure: scatter plot of the generated blue and red classes, analogous to Fig. 2.1]
2) Because we know the class priors and the class-conditional densities, it is straightforward to evaluate and plot the true posterior probabilities, as well as the minimum-misclassification-rate decision boundary, as shown in Figure 2.1.
Substituting the sigma and mu of the blue and red classes into the class-conditional Gaussian density

$$p(\mathbf{x} \mid C_k) = \frac{1}{2\pi\,|\Sigma_k|^{1/2}} \exp\!\left(-\tfrac{1}{2}(\mathbf{x}-\boldsymbol{\mu}_k)^{\mathrm T}\Sigma_k^{-1}(\mathbf{x}-\boldsymbol{\mu}_k)\right)$$

(for the red class, a weighted sum of two such terms) gives the conditional probability of each data point. Then, by Bayes' theorem,

$$p(C_{\text{blue}} \mid \mathbf{x}) = \frac{p(\mathbf{x} \mid C_{\text{blue}})\,p(C_{\text{blue}})}{p(\mathbf{x} \mid C_{\text{blue}})\,p(C_{\text{blue}}) + p(\mathbf{x} \mid C_{\text{red}})\,p(C_{\text{red}})},$$

we obtain, over the two-dimensional plane, the posterior probability that a point belongs to one class or the other.
Plotting gives the posterior heatmap shown in the Code section below.
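As a quick illustration of this calculation, here is a minimal sketch (an illustration, not part of the original solution) that evaluates the posterior at a single test point using scipy.stats.multivariate_normal, with the parameters listed in part 1 and the 0.7/0.3 mixture weights used for the red class in the code below:
from scipy.stats import multivariate_normal
import numpy as np
x = np.array([1.0, 0.0])  # an arbitrary test point (illustrative choice)
p_blue = multivariate_normal.pdf(x, mean=[0, 0], cov=[[0.7, 0.1], [0.1, 0.6]])
p_red = (0.7 * multivariate_normal.pdf(x, mean=[2.5, 1], cov=[[0.5, 0.4], [0.4, 0.5]])
         + 0.3 * multivariate_normal.pdf(x, mean=[1, -1], cov=[[0.6, 0], [0, 0.6]]))
posterior_blue = 0.5 * p_blue / (0.5 * p_blue + 0.5 * p_red)  # equal priors of 0.5
print(posterior_blue)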
3) Evaluate the optimal decision boundary for minimizing the misclassification rate (which corresponds to the contour along which the posterior probabilities of the two classes both equal 0.5) and show it as the green curve.
Find the points in the posterior probability matrix prob_mat whose value is approximately 0.5 and plot them.
[Figure: decision boundary (green curve) overlaid on the data]
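An alternative sketch (an assumption on my part, not the method used in the code below): instead of collecting grid points with |p − 0.5| < 0.01, let matplotlib trace the 0.5 level set directly with a contour plot. It uses the parameters from part 1 and the 0.7/0.3 mixture weights from the code below.
import numpy as np
import pylab
from scipy.stats import multivariate_normal
xs = np.arange(-3, 3, 0.02)
X, Y = np.meshgrid(xs, xs)
grid = np.dstack((X, Y))  # shape (len(xs), len(xs), 2)
p_blue = multivariate_normal.pdf(grid, mean=[0, 0], cov=[[0.7, 0.1], [0.1, 0.6]])
p_red = (0.7 * multivariate_normal.pdf(grid, mean=[2.5, 1], cov=[[0.5, 0.4], [0.4, 0.5]])
         + 0.3 * multivariate_normal.pdf(grid, mean=[1, -1], cov=[[0.6, 0], [0, 0.6]]))
posterior = 0.5 * p_blue / (0.5 * p_blue + 0.5 * p_red)
pylab.contour(X, Y, posterior, levels=[0.5], colors='g')  # the decision boundary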
Code
import numpy as np
import pylab
from numpy.linalg import cholesky
%matplotlib inline
# %config InlineBackend.figure_format = 'svg'
# initial dataset
blue_sigma = [np.array([[2,0], [0,2]])]
blue_mu = [np.array([0, 0])]
red_sigma = [np.array([[0.3, 0.3], [0.3, 0.6]]), np.array([[0.5, 0], [0, 0.5]])]  # covariances must be symmetric
red_mu = [np.array([2.5, 1]), np.array([1, -1])]
points_num = 600
prior = [0.5,0.5]
num_points = [prior[0]*points_num,prior[1]*points_num]
red_weights=[0.7,0.3]
def points_gen(mean, cov, num, label):
    # reference: https://www.zhihu.com/question/39823283
    # cholesky(cov) returns the lower-triangular L with L @ L.T = cov,
    # so z @ L.T (+ mean) has covariance cov
    R = cholesky(cov).T
    points = np.random.randn(num, 2) @ R + mean
    # append the class label as a third column
    points = np.column_stack((points, np.full(num, label)))
    return points
data = []
data.append(points_gen(blue_mu[0], blue_sigma[0], int(num_points[0]), 0))
data.append(points_gen(red_mu[0], red_sigma[0], int(num_points[1] * red_weights[0]), 1))
data.append(points_gen(red_mu[1], red_sigma[1], int(num_points[1] * red_weights[1]), 1))
data = np.concatenate(data)
blue_points = data[np.where(data[:, 2] == 0)]
red_points = data[np.where(data[:, 2] == 1)]
pylab.scatter(blue_points[:, 0], blue_points[:, 1])
pylab.scatter(red_points[:, 0], red_points[:, 1])
[Figure: scatter plot of the generated data]
# compute the class-conditional density
def cal_prob(x, sigmas, mus, weights=None):
    # density of a 2-D Gaussian (or Gaussian mixture) at point x
    if weights is not None:
        # mixture of Gaussians
        prob = 0
        for weight, sigma, mu in zip(weights, sigmas, mus):
            tmp_prob = 1 / (2 * np.pi * (np.linalg.det(sigma) ** (1 / 2))) * \
                np.exp(-0.5 * (x - mu) @ np.linalg.inv(sigma) @ (x - mu))
            prob += weight * tmp_prob
    else:
        # single Gaussian
        prob = 1 / (2 * np.pi * (np.linalg.det(sigmas[0]) ** (1 / 2))) * \
            np.exp(-0.5 * (x - mus[0]) @ np.linalg.inv(sigmas[0]) @ (x - mus[0]))
    return prob
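For example, the density of the blue class at the origin can be checked directly (the test point is just an illustrative choice):
print(cal_prob(np.array([0, 0]), blue_sigma, blue_mu))  # 1 / (2*pi*2) ≈ 0.0796 for blue_sigma[0] = 2*I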
xs = np.arange(-3, 3, 0.02)
prob_mat = np.zeros((len(xs), len(xs)))
# build the posterior probability matrix over the grid
# (rows are flipped so that y increases upwards in the heatmap below)
bound_points = []
for index_x, x in enumerate(xs):
    for index_y, y in enumerate(xs):
        blue_cond_prob = cal_prob(np.array([x, y]), blue_sigma, blue_mu)
        red_cond_prob = cal_prob(np.array([x, y]), red_sigma, red_mu, red_weights)
        prior_blue = prior[0]
        prior_red = prior[1]
        # posterior probability of the blue class (Bayes' theorem)
        prob_mat[len(xs) - index_y - 1, index_x] = blue_cond_prob * prior_blue / \
            (red_cond_prob * prior_red + blue_cond_prob * prior_blue)
        # collect grid points where the posterior is approximately 0.5
        if np.abs(prob_mat[len(xs) - index_y - 1, index_x] - 0.5) < 0.01:
            bound_points.append([x, y])
import seaborn as sns
# plot the posterior heatmap
sns.heatmap(prob_mat, cmap='YlGnBu')
[Figure: heatmap of the posterior probability of the blue class]
# plot the decision boundary over the data
pylab.scatter(blue_points[:, 0], blue_points[:, 1])
pylab.scatter(red_points[:, 0], red_points[:, 1])
bound_points = np.array(bound_points)
# the boundary points are collected in grid order, so draw them as unconnected
# markers rather than a joined line
pylab.plot(bound_points[:, 0], bound_points[:, 1], 'g.', markersize=10)
[Figure: generated data with the decision boundary shown in green]