Machine Learning 5: Naive Bayes [with code]

Author: Eric_i33 | Published 2019-08-04 10:58



    Naive Bayes is a classification algorithm built on Bayes' theorem together with a conditional independence assumption on the features. Among the many machine learning algorithms it is simple to implement and efficient, at the price of usually giving up some classification accuracy.
    In terms of model taxonomy, naive Bayes is a probabilistic (or generative) model: what it learns is the mechanism by which the data are generated. See Section 1.2.2 of Chapter 1 of the notes for details.
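
    Concretely, "learning the generating mechanism" means estimating the joint distribution and then deriving the posterior from it; as a one-line reminder (standard textbook form, not part of the original notes):

    $$P(X, Y) = P(Y)\,P(X \mid Y), \qquad P(Y \mid X) = \frac{P(X, Y)}{P(X)}$$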

    1. The conditional independence assumption
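
    In its standard textbook form, the assumption states that the n features are mutually independent once the class label is given, so the class-conditional distribution factorizes:

    $$P(X = x \mid Y = c_k) = P\big(X^{(1)} = x^{(1)}, \ldots, X^{(n)} = x^{(n)} \mid Y = c_k\big) = \prod_{j=1}^{n} P\big(X^{(j)} = x^{(j)} \mid Y = c_k\big)$$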

    2. Defining the dataset

    Dataset and feature space
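
    In the usual notation, the training set consists of N labeled samples, each feature vector has n components, and the labels come from K classes (K = 3 in the iris example below):

    $$T = \{(x_1, y_1), (x_2, y_2), \ldots, (x_N, y_N)\}, \quad x_i \in \mathcal{X} \subseteq \mathbf{R}^n, \quad y_i \in \mathcal{Y} = \{c_1, c_2, \ldots, c_K\}$$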

    3. Hypothesis space

    Deriving the hypothesis function from Bayes' theorem
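
    In the standard derivation, Bayes' theorem combined with the independence assumption gives the class posterior; the denominator is the same for every class, so the classifier only has to maximize the numerator:

    $$P(Y = c_k \mid X = x) = \frac{P(Y = c_k) \prod_{j=1}^{n} P\big(X^{(j)} = x^{(j)} \mid Y = c_k\big)}{\sum_{k'} P(Y = c_{k'}) \prod_{j=1}^{n} P\big(X^{(j)} = x^{(j)} \mid Y = c_{k'}\big)}$$

    $$y = f(x) = \arg\max_{c_k} \; P(Y = c_k) \prod_{j=1}^{n} P\big(X^{(j)} = x^{(j)} \mid Y = c_k\big)$$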

    4. Objective function (maximum likelihood estimation)
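
    Maximizing the likelihood yields counting estimates for the categorical case; with Laplace smoothing (lambda = 1) these are the quantities the code below computes, where S_j is the number of distinct values of feature j. For continuous features the code uses a Gaussian conditional density instead:

    $$P_\lambda(Y = c_k) = \frac{\sum_{i=1}^{N} I(y_i = c_k) + \lambda}{N + K\lambda}, \qquad P_\lambda\big(X^{(j)} = a_{jl} \mid Y = c_k\big) = \frac{\sum_{i=1}^{N} I\big(x_i^{(j)} = a_{jl},\, y_i = c_k\big) + \lambda}{\sum_{i=1}^{N} I(y_i = c_k) + S_j \lambda}$$

    $$P\big(x^{(j)} \mid Y = c_k\big) = \frac{1}{\sqrt{2\pi}\,\sigma_{jk}} \exp\!\left(-\frac{\big(x^{(j)} - \mu_{jk}\big)^2}{2\sigma_{jk}^2}\right)$$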

    5. Optimization algorithm

    A hand-coded naive Bayes implementation:

    # -*- coding: utf-8 -*-
    from __future__ import (absolute_import, division, print_function)
    import numpy as np
    import pandas as pd
    from scipy.stats import norm
    from sklearn.utils import shuffle
    from sklearn.model_selection import train_test_split
    from sklearn.datasets import load_iris
    
    
    class NaiveBayesModel(object):
        def __init__(self, x_train, y_train, x_test):
            self.x_train = x_train
            self.y_train = y_train
            self.x_test = x_test
            self.N, self.n = x_train.shape  # N: number of training samples, n: number of features
        
        
        def prior_probs(self):
            '''Prior probability of each class label, with Laplace smoothing (a three-class problem here)'''
            labels, labels_count = np.unique(self.y_train, return_counts=True)
            label_0, label_1, label_2 = labels[0], labels[1], labels[2]
            # Laplace smoothing: add 1 to each class count and K (the number of classes) to the denominator
            prior_probs_0, prior_probs_1, prior_probs_2 = (labels_count + 1) / (labels_count.sum() + len(labels))
            # class labels
            label_dict = {"label_0": label_0, 
                          "label_1": label_1, 
                          "label_2": label_2}
            # prior probability for each class label
            prior_probs_dict = {"prior_probs_0": prior_probs_0, 
                                "prior_probs_1": prior_probs_1, 
                                "prior_probs_2": prior_probs_2}
            return label_dict, prior_probs_dict
            
    
        def likelihood_probs(self, label):
            '''Likelihood of each test sample given a class label (Laplace smoothing for categorical features)'''
            # array of per-sample likelihoods
            likeli_probs_array = np.array([])
            # loop over test samples (rows)
            for i in range(len(self.x_test)):
                # joint likelihood over all features, initialized to 1
                likeli_probs_res = 1.0
                # loop over features (columns)
                for j in range(self.n):
                    val = self.x_test.iloc[i, j]
                    # categorical feature: conditional probability with Laplace smoothing
                    if isinstance(val, str):
                        # S_j: number of distinct values of feature j (Laplace smoothing denominator)
                        s_j = self.x_train.iloc[:, j].nunique()
                        fen_zi = len(self.x_train.loc[(self.x_train.iloc[:, j] == val) & 
                                                      (self.y_train == label)]) + 1
                        fen_mu = sum(self.y_train == label) + s_j
                        likeli_probs = fen_zi / fen_mu
                    # continuous feature: Gaussian conditional density
                    elif isinstance(val, float):
                        vals = self.x_train[self.y_train == label].iloc[:, j]
                        mean = np.mean(vals)
                        std = np.std(vals)
                        likeli_probs = norm.pdf(val, mean, std)
                    else:
                        raise TypeError("Feature values must be str or float.")
                    # multiply into the joint likelihood (conditional independence assumption)
                    likeli_probs_res *= likeli_probs
                # store this sample's likelihood
                likeli_probs_array = np.append(likeli_probs_array, likeli_probs_res)
            return likeli_probs_array
        
        
        def predict(self, prior_probs_0, prior_probs_1, prior_probs_2, 
                    likeli_probs_array_0, likeli_probs_array_1, likeli_probs_array_2):
            '''Predict by computing unnormalized posteriors (prior * likelihood) and taking the argmax'''
            posterior_probs_0 = (prior_probs_0 * likeli_probs_array_0).reshape(-1, 1)
            posterior_probs_1 = (prior_probs_1 * likeli_probs_array_1).reshape(-1, 1)
            posterior_probs_2 = (prior_probs_2 * likeli_probs_array_2).reshape(-1, 1)
            
            posterior_probs = np.concatenate([posterior_probs_0, 
                                              posterior_probs_1, 
                                              posterior_probs_2], axis=1)
            # maximum a posteriori class (the column index equals the class label 0/1/2)
            y_pred = np.argmax(posterior_probs, axis=1)
            return y_pred
        
        
        def get_score(self, y_true, y_pred):
            '''Accuracy on the test set'''
            score = sum(y_true == y_pred) / len(y_true)
            return score
            
        
        
    if __name__ == "__main__":
        # load the data
        iris = load_iris()
        dataSet = pd.DataFrame(iris.data, columns=["sepal_length", "sepal_width", "petal_length", "petal_width"])
        # discretize two features into three-level categorical variables
        dataSet["sepal_length_mult"] = pd.cut(dataSet.sepal_length, bins=3, 
               right=True, include_lowest=True, 
               labels=["low", "med", "high"])
        
        dataSet["sepal_width_mult"] = pd.cut(dataSet.sepal_width, bins=3, 
               right=True, include_lowest=True, 
               labels=["low", "med", "high"])
        # discretize two features into two-level categorical variables
        dataSet["petal_length_bina"] = pd.cut(dataSet.petal_length, bins=2, 
               right=True, include_lowest=True, 
               labels=["small", "big"])
        
        dataSet["petal_width_mult"] = pd.cut(dataSet.petal_width, bins=2, 
               right=True, include_lowest=True, 
               labels=["small", "big"])
        # class labels (0, 1, 2)
        dataSet["label"] = iris.target
        # shuffle the rows
        dataSet = shuffle(dataSet) # , random_state=0
        
        x = dataSet.iloc[:, 0:8]
        y = dataSet.iloc[:, 8]
        
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)
        
        # hand-coded model
        model = NaiveBayesModel(x_train, y_train, x_test)
        label_dict, prior_probs_dict = model.prior_probs()
        
        label_0 = label_dict.get("label_0")
        label_1 = label_dict.get("label_1")
        label_2 = label_dict.get("label_2")
    
        prior_probs_0 = prior_probs_dict.get("prior_probs_0")
        prior_probs_1 = prior_probs_dict.get("prior_probs_1")
        prior_probs_2 = prior_probs_dict.get("prior_probs_2")  
        
        likeli_probs_array_0 = model.likelihood_probs(label=label_0)
        likeli_probs_array_1 = model.likelihood_probs(label=label_1)
        likeli_probs_array_2 = model.likelihood_probs(label=label_2)
        
        y_pred = model.predict(prior_probs_0, prior_probs_1, prior_probs_2, 
                               likeli_probs_array_0, likeli_probs_array_1, likeli_probs_array_2)
        
        score = model.get_score(y_test, y_pred)
        print(f"NaiveBayesModel 预测准确率:{score}")
    

    NaiveBayesModel prediction accuracy: 0.9333333333333333
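
    As a rough sanity check (not part of the original post), sklearn's GaussianNB can be fitted on the four continuous iris features only; it ignores the discretized columns, so its score is not directly comparable, but it should land in a similar range:

    from sklearn.naive_bayes import GaussianNB
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    # continuous features only; the hand-coded model above also uses the discretized columns
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

    gnb = GaussianNB().fit(X_train, y_train)
    print(f"GaussianNB accuracy (continuous features only): {gnb.score(X_test, y_test)}")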

