美文网首页机器学习与数据挖掘
决策树 CART处理连续型数据的 python实现

决策树 CART处理连续型数据的 python实现

作者: 夹小欣 | 来源:发表于2017-11-03 14:52 被阅读549次

    参考的原文地址,原文还有对CART的分析。
    这个代码用的是误差平方和最小,对连续值进行了二分,可以处理0,1离散值,不能‘好,不好’这种字符串表示特征。

    #!/usr/bin/env python2
    # -*- coding: utf-8 -*-
    import numpy as np
    import pickle
    # =============================================================================
    # 创建CART树(回归树或模型树)
    # 输入:数据集data,叶子节点形式leafType:regressLeaf(回归树)、modelLeaf(模型树)
    # 损失函数errType:误差平方和也分为regressLeaf和modelLeaf
    # 用户自定义阈值参数:误差减少的阈值,子样本集应包含的最少样本个数
    # =============================================================================
    def CreateCart(data,leafType=regressLeaf,errType=regressErr,threshold=(1,4)):
    
        #寻找最优特征与最优切分点
        feature,value = ChooseBest(data,leafType,errType,threshold)
        #停止条件:当结点的样本个数小于阈值,或基尼指数小于阈值,或者没有更多特征时停止
        if feature == None:
            return value
        returnTree = {}
        returnTree['bestSplitFeature'] = feature
        returnTree['bestSplitValue'] = value
        leftSet,rightSet = binarySplit(data,feature,value)
        returnTree['left'] = leftSet
        returnTree['right'] = rightSet
        return returnTree
    
    def ChooseBest(data,leafType,errType,threshold):
        thresholdErr, thresholdSamples = threshold[0],threshold[1]
        #数据中输出相同,不需要继续划分树,输出平均值
        #需要将np array转换成list再做set
        if len(set(data[:,-1].T.tolist()[0])) == 1:
            #回归树返回叶子平均值,模型树返回系数
            return None,leafType(data)
        m,n = data.shape()
        #分别处理回归树、模型树的err计算方法
        Err = errType(data)
        bestErr, bestFindex, bestFval = np.Inf,0,0
        #对于每个特征,根据每个取值将data划分成两个子集,计算两个子集的err
        #取所有可能划分点中,err最小的特征和结点,作为划分点
        #同时,保证划分的子集中样本个数大于阈值thresholdSamples
        #这种划分方式适用于连续型数据,标称型数据如果不能比大小则要用id3的方式划分
        for findex in range(n-1):
            for fval in data[:,findex]:
                left,right = binarySplit(data,findex,fval)
                #阈值判断
                if (left.shape()[0]<thresholdSamples) or (right.shape()[0]<thresholdSamples):
                    continue
                temerr = errType(left) + errType(right)
                #更新最小误差
                if temerr < bestErr:
                    bestErr, bestFindex, bestFval = temerr,findex,fval
        #检验所选最优划分点的误差,与未划分时的差值是否小于阈值thresholdErr
        if Err - bestErr < thresholdErr:
            return None, leafType(data)
        
        return bestFindex,bestFval
                
    def binarySplit(data,findex, fval):
        #np.nonzero选取符合条件的非零值所在的每一纬度的下标
        left = data[np.nonzero(data[:,findex] <= fval)[0],:]
        right = data[np.nonzero(data[:,findex] > fval)[0],:]
        return left, right
    # =============================================================================
    # 回归树、模型树对应的不同处理函数
    # =============================================================================
    #输出叶结点平均值
    def regressLeaf(data):
        #数据集输出列即最后一列的平均值
        return np.mean(data[:,-1])
    #输出系数
    def modelLeaf(data):
        w,x,y = linearSolve(data)
        return w
    #y=kx+b写成矩阵形式
    def linearSolve(data):
        m,n = np.shape(data)
        #mat转换成矩阵,方便后面计算
        x,y = np.mat(np.ones((m,n))),np.mat(np.ones((m,1)))
        x[:,1:n] = data[:,0:(n-1)]
        y = data[:,-1]
        xTx = x.T*x
        #计算行列式,判断是否能够取逆
        if np.linalg.det(xTx) == 0:
            #抛出异常
            raise NameError('matrix cannot do inverse,try increasing the second value of threshold')
        else:
            w = xTx.I*(x.T*y)
            return w,x,y
    #划分后左右数据集的总误差平方和
    def regressErr(data):
        #回归树的叶子节点取的是均值
        #计算误差相当于(每个输出-均值)^2
        #就是方差
        #np.var的方差分母是(n-1)
        return np.var(data[:,-1])*(np.shape(data)[0]-1)
    
    #模型树误差
    def modelErr(data):
        w,x,y = linearSolve(data)
        y_pie = x * w
        return sum(np.power(y-y_pie,2))
    
    # =============================================================================
    # 剪枝
    # =============================================================================
    #从叶子向上,比较剪掉和不剪的误差
    def prune(tree,test):
        #数据集没有数据
        if test.shape()[0] == 0: return getMean(tree)
        #向下递归到叶子
        if isTree(tree['left']) or isTree(tree['right']):
            testleft,testright = binarySplit(test,tree['bestSplitFeature'],tree['bestSplitFeatValue'])
        if isTree(tree['left']):
            tree['left'] = prune(tree['left'],testleft)
        if isTree(tree['right']):
            tree['right'] = prune(tree['right'],testright)
        #找到叶子之后计算误差
        if not isTree(tree['left']) and not isTree(tree['right']):
            leftmean,rightmean = binarySplit(test,tree['bestSplitFeature'],tree['bestSplitFeatValue'])
            errno = sum(np.power(leftmean[:,-1]-tree['left'],2))+sum(np.power(rightmean[:,-1]-tree['right'],2))
            errMerge = sum(np.power(test[:,-1]-getMean(tree),2))
            if errMerge < errno:
                print 'merge'
                return getMean(tree)
            else:
                return tree
        else:
            return tree
    
    def getMean(tree):
        #tree['left']保存左子树数据集,如果是叶子节点保存平均值
        if isTree(tree['left']):tree['left'] = getMean(tree['left'])
        if isTree(tree['right']):tree['right'] = getMean(tree['right'])
        return (tree['left']+tree['right'])/2.
    
    
    #判断是否存在叶结点
    def isTree(obj):
        #type(obj)返回的是obj的类型关键字
        #__name__将类型关键字转化为str
        return (type(obj).__name__=='dict')
    # =============================================================================
    # 预测
    # =============================================================================
    def createForeCast(tree,test,modelEval=regressEvaluation):
        m = len(test)
        y = np.mat(np.zeros((m,1)))
        for i in range(m):
            y = treeForeCast(tree,test[i],modelEval)
        return y
    
    def treeForeCast(tree,test,modelEval=regressEvaluation):
        #到了叶子节点输出结果
        if not isTree(tree): return modelEval(tree,test)
        #向左子树递归
        if test[tree['bestSplitFeature']] <= tree['bestSplitFeature']:
            if isTree(tree['left']):
                return treeForeCast(tree['left'],test,modelEval)
            else:
                return modelEval(tree['left'],test)
        #向右子树递归
        else:
            if isTree(tree['right']):
                return treeForeCast(tree['right'],test,modelEval)
            else:
                return modelEval(tree['right'],test)
        
    def regressEvaluation(tree,test):
        return float(tree)
    # =============================================================================
    # 数据加载
    # =============================================================================
    
    def regressData(filename):
        fr = open(filename)
        #持久化
        return pickle.load(fr)
    
    if __name__ == '__main__':
        trainfilename = 'e:\\python\\ml\\trainDataset.txt'
        testfilename = 'e:\\python\\ml\\testDataset.txt'
    
        trainDataset = regressData(trainfilename)
        testDataset = regressData(testfilename)
        
        cartTree = CreateCart(trainDataset,threshold=(1,4))
        pruneTree = prune(cartTree,testDataset)
        y = createForeCast(cartTree,np.mat([0.3]),modelEval=regressEvaluation)
    

    相关文章

      网友评论

        本文标题:决策树 CART处理连续型数据的 python实现

        本文链接:https://www.haomeiwen.com/subject/lkytmxtx.html