美文网首页
决策树对西瓜数据集2.0二分类

决策树对西瓜数据集2.0二分类

作者: 异想派 | 来源:发表于2017-04-07 00:15 被阅读1971次
    西瓜数据集.jpg

    @生成分类字典

    # -*- coding: UTF-8 -*- 
    
    #设置默认编码,否则中文会乱码
    import sys 
    reload(sys) 
    sys.setdefaultencoding('utf-8') 
    from math import log
    
    #1、获取样例集和属性列表
    def filetodataset(filename):
        """Load a comma-separated sample file into a dataset plus attribute names.

        The first line of the file is the header row; its last column (the
        class column) is dropped from the returned attribute names. Every
        following non-blank line becomes one sample: a list of strings split
        on ',' whose last element is still the class label.

        Args:
            filename: path of the text file to read.

        Returns:
            A (dataset, featname) tuple: the list of samples and the list of
            attribute names.

        Raises:
            IndexError: if the file is empty (there is no header line).
        """
        # `with` closes the handle even when reading fails
        with open(filename,'r') as fr:
            all_lines=fr.readlines()   # one str per line
        featname=all_lines[0].strip().split(',')[:-1]
        # Blank lines (e.g. a trailing empty line) would otherwise become a
        # bogus [''] sample, so they are skipped.
        dataset=[line.strip().split(',') for line in all_lines[1:] if line.strip()]
        return dataset,featname
    
    #2. Compute the Shannon entropy of a dataset
    def calcent(dataset):
        """Return the base-2 Shannon entropy of the class labels in `dataset`.

        The class label of a sample is its last element. An empty dataset
        yields 0.
        """
        total=len(dataset)
        # Tally how many samples carry each class label
        classcount={}
        for sample in dataset:
            label=sample[-1]
            classcount[label]=classcount.get(label,0)+1
        entropy=0
        for label in classcount:
            prob=float(classcount[label])/total
            entropy-=prob*log(prob,2)
        return entropy
    
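    #3. Keep the samples whose chosen attribute has the given value, with that attribute column removed, so the entropy of the subset can be computed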
    def splitdataset(dataset,axis,value):
        """Select the samples whose column `axis` equals `value`.

        Returns a new list holding a copy of every matching sample with the
        `axis` column removed; the input samples are left untouched.
        """
        return [row[:axis]+row[axis+1:] for row in dataset if row[axis]==value]
    
    #4、选择最佳的划分属性
    def choosebestfeaturetosplit(dataset):
        """Return the column index of the attribute with the largest information gain.

        Each sample's last element is its class label, so only the columns
        before it are considered. The gain of an attribute is the entropy of
        the whole dataset minus the size-weighted entropy of the subsets
        obtained by splitting on that attribute.

        Returns:
            The index of the first attribute with the highest gain, or -1 when
            no attribute has a gain strictly greater than 0.
        """
        columncount=len(dataset[0])      # attribute columns plus the class column
        samplecount=float(len(dataset))
        baseshannon=calcent(dataset)     # entropy before any split
        bestfeature=-1
        bestinfogain=0.0
        # range(columncount-1) stops before the class column
        for col in range(columncount-1):
            attrshannon=0
            for value in set(example[col] for example in dataset):
                subset=splitdataset(dataset,col,value)
                # weight each subset's entropy by its share of the samples
                attrshannon+=len(subset)/samplecount*calcent(subset)
            infogain=baseshannon-attrshannon
            if infogain>bestinfogain:
                bestfeature=col
                bestinfogain=infogain
        return bestfeature
    
    
    #5. Rank the class labels of the samples by frequency, most common first (the majority class is the first entry)
    def majorclass(data):
        """Count the class labels in `data` and rank them by frequency.

        The class label of a sample is its last element.

        Returns:
            A list of (label, count) tuples sorted by count in descending
            order; labels with equal counts keep the dict's iteration order.
            The majority class is therefore result[0][0]. An empty `data`
            yields an empty list.
        """
        # Single pass instead of calling list.count() once per sample
        counts={}
        for sample in data:
            label=sample[-1]
            counts[label]=counts.get(label,0)+1
        # items() (rather than iteritems()) works on both Python 2 and 3
        return sorted(counts.items(), key=lambda d:d[1], reverse=True)
    
    
    #6、生成决策树
    def createtree(mydata,labels):
        """Recursively build a decision tree as nested dicts.

        Args:
            mydata: list of samples; the last element of each sample is its
                class label, the preceding elements are attribute values.
            labels: attribute names, one per attribute column of `mydata`.
                The list is not modified.

        Returns:
            Either a class label (leaf) or a dict of the form
            {attribute_name: {attribute_value: subtree_or_leaf, ...}}.
        """
        # Case 1: every sample has the same class -> leaf with that class
        samplelabel=[sample[-1] for sample in mydata]
        if len(set(samplelabel))==1:
            return samplelabel[0]

        # Case 2: no attributes left -> leaf with the most common class.
        # majorclass() returns (label, count) pairs sorted by count, so the
        # label itself is element [0][0].
        if len(mydata[0])==1:
            return majorclass(mydata)[0][0]

        # Case 3: split on the attribute with the best information gain
        bestfeature=choosebestfeaturetosplit(mydata)
        if bestfeature<0:
            # No attribute separates the classes. Using -1 as an index would
            # split on the class column, so fall back to a majority leaf.
            return majorclass(mydata)[0][0]
        bestfeaturelabel=labels[bestfeature]
        mytree={bestfeaturelabel:{}}
        # Build the reduced label list without mutating the caller's list
        sublabels=labels[:bestfeature]+labels[bestfeature+1:]

        # Only values that actually occur in `mydata` get a branch
        for value in set(sample[bestfeature] for sample in mydata):
            mytree[bestfeaturelabel][value]=createtree(splitdataset(mydata,bestfeature,value),sublabels)
        return mytree
    
    
    if __name__=='__main__':
        import json
        # Hard-coded local path to the watermelon dataset file
        filename='/Users/enniu/Desktop/jqxx/xiguaset.txt'
        mytree=createtree(*filetodataset(filename))
        # ensure_ascii=False keeps the Chinese keys and values readable
        # instead of emitting \uXXXX escapes.
        print(json.dumps(mytree, ensure_ascii=False))
    
    

    结果如下:

    {"纹理": {"模糊": "否", "清晰": {"根蒂": {"稍蜷": {"色泽": {"乌黑": {"触感": {"软粘": "否", "硬滑": "是"}}, "青绿": "是"}}, "蜷缩": "是", "硬挺": "否"}}, "稍糊": {"触感": {"软粘": "是", "硬滑": "否"}}}}
    说明
    1、在从根到叶的同一条路径上(即递归的上下游),每个属性只会出现一次,因为划分之后该属性列会从数据集中剔除;但同一个属性可以出现在不同的分支上。

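    2、与机器学习书 P78 的决策树相比,少了“色泽=浅白 → 好瓜”这一分支。原因是本实现只对当前子集中实际出现的属性取值建立分支,而“纹理=清晰、根蒂=稍蜷”的样本里没有色泽为浅白的瓜;书中的算法会遍历该属性的全部取值,并把空子集对应的叶结点标记为父结点中样本最多的类别。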

    参考:
    如何实现并应用决策树算法?
    python 字典中有中文写入文件后变成编码


    @绘制树形图

    
    # -*- coding:utf-8 -*-
    
    import sys 
    reload(sys) 
    sys.setdefaultencoding('utf-8')
    import matplotlib.pyplot as plt
    import json
    # Watermelon decision tree (same as the JSON output of the first script), kept for reference but disabled:
    #mytree={"纹理": {"模糊": "否", "清晰": {"根蒂": {"稍蜷": {"色泽": {"乌黑": {"触感": {"软粘": "否", "硬滑": "是"}}, "青绿": "是"}}, "蜷缩": "是", "硬挺": "否"}}, "稍糊": {"触感": {"软粘": "是", "硬滑": "否"}}}}
    # Small sample tree that the __main__ block plots
    anothertree={'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
    # The same tree written with the subtree branch before the leaf branch (disabled):
    #anothertree={'no surfacing': {1: {'flippers': {0: 'no', 1: 'yes'}},0: 'no'}}
    #print json.dumps(mytree,ensure_ascii=False)
    
    #计算叶节点数目
    def calculateleaf(mytree):
        """Count the leaf nodes of a nested-dict decision tree.

        `mytree` has the form {attribute: {value: subtree_or_leaf, ...}}.
        Only the first key of `mytree` is examined; every branch value that
        is not a dict counts as one leaf.

        Raises:
            IndexError: if `mytree` is an empty dict.
        """
        # list(...) works on Python 3 too, where keys() is a non-indexable view
        firststr=list(mytree)[0]
        numleaf=0
        for child in mytree[firststr].values():
            if isinstance(child,dict):
                numleaf+=calculateleaf(child)
            else:
                numleaf+=1
        return numleaf
    
    # Compute the depth (number of levels) of the tree
    def calculatedepth(mytree):
        """Return the depth of a nested-dict decision tree.

        `mytree` has the form {attribute: {value: subtree_or_leaf, ...}}.
        A node whose branches are all leaves has depth 1; each additional
        level of nested decision nodes adds 1.

        Raises:
            IndexError: if `mytree` is an empty dict.
        """
        # list(...) works on Python 3 too, where keys() is a non-indexable view
        firststr=list(mytree)[0]
        maxdepth=0
        for child in mytree[firststr].values():
            if isinstance(child,dict):
                numdepth=1+calculatedepth(child)
            else:
                numdepth=1   # a leaf contributes a single level
            if numdepth>maxdepth:
                maxdepth=numdepth
        return maxdepth
    
    def plotmidtext(cntrpt,parentpt,txtstring):
        """Write `txtstring` halfway between `cntrpt` and `parentpt`.

        Both points are indexed as (x, y). The text is drawn on the axes
        stored in `createplot.ax1`.
        """
        midpoint=[(parentpt[axis]-cntrpt[axis])/2.0+cntrpt[axis] for axis in (0,1)]
        createplot.ax1.text(midpoint[0],midpoint[1],txtstring)
    
    # Box style used for decision nodes
    decisionnode={"boxstyle":"sawtooth","fc":"0.8"}
    # Box style used for leaf nodes
    leafnode={"boxstyle":"round4","fc":"0.8"}
    # Arrow style for the connector between a node and its parent
    arrow_args={"arrowstyle":"<-"}
    
    def plotnode(nodetext,centerpt,parentpt,nodetype):
        """Draw one boxed node with an arrow connecting it to its parent.

        Args:
            nodetext: text shown inside the box.
            centerpt: position of the box, in axes-fraction coordinates.
            parentpt: position of the other end of the arrow.
            nodetype: bbox style dict (e.g. `decisionnode` or `leafnode`).

        The annotation is added to the axes stored in `createplot.ax1`.
        """
        createplot.ax1.annotate(
            nodetext,
            xy=parentpt,
            xytext=centerpt,
            arrowprops=arrow_args,
            xycoords='axes fraction',
            va='center',
            ha='center',
            bbox=nodetype,
        )
    
    
    def plottree(mytree,parentpt,nodetxt):
        """Recursively draw `mytree` underneath `parentpt`.

        Args:
            mytree: nested dict {attribute: {value: subtree_or_leaf, ...}}.
            parentpt: (x, y) of the parent node the arrow is connected to.
            nodetxt: text written on the edge between this node and its parent.

        Layout state is kept in function attributes initialised by
        createplot(): `plottree.totalw` / `plottree.totald` (leaf count and
        depth of the whole tree), `plottree.xoff` (x of the most recently
        placed leaf) and `plottree.yoff` (y of the level being drawn).
        Progress is traced to stdout.
        """
        numleafs=calculateleaf(mytree)
        firststr=list(mytree)[0]   # list(...) also works with Python 3 key views
        # Centre this decision node above the leaves it spans
        cntrpt=(plottree.xoff+(1.0+float(numleafs))/2.0/plottree.totalw,plottree.yoff)
        print('%s %s' % ('子节点坐标:',cntrpt))
        plotmidtext(cntrpt,parentpt,nodetxt)
        # For the root call made by createplot(), cntrpt equals parentpt
        # (0.5, 1.0), so the connecting arrow has zero length.
        plotnode(firststr,cntrpt,parentpt,decisionnode)
        print('%s %s %s' % ('绘制连接箭头',cntrpt,parentpt))
        seconddict=mytree[firststr]
        # Step one level down: the children are drawn below this node
        plottree.yoff=plottree.yoff-1.0/(1.0*plottree.totald)
        print('%s %s' % ('y轴值:',plottree.yoff))
        for key in seconddict.keys():
            child=seconddict[key]
            if isinstance(child,dict):
                # xoff may already have been advanced by leaves drawn earlier
                print('%s %s' % ('***sandy***',plottree.xoff))
                plottree(child,cntrpt,str(key))
                print('%s %s' % ('***lam***',plottree.xoff))
            else:
                # Each leaf takes the next horizontal slot
                plottree.xoff=plottree.xoff+1.0/plottree.totalw
                plotnode(child,(plottree.xoff,plottree.yoff),cntrpt,leafnode)
                print('%s %s %s' % ('灯灯hoho',(plottree.xoff,plottree.yoff),cntrpt))
                plotmidtext((plottree.xoff,plottree.yoff),cntrpt,str(key))
        # Step back up. Without this, siblings drawn after a subtree would be
        # placed at the subtree's (deeper) level.
        plottree.yoff=plottree.yoff+1.0/plottree.totald
    
    def createplot(intree):
        """Draw the nested-dict decision tree `intree` in a matplotlib window.

        Clears figure 1, creates the axes (stored as `createplot.ax1`),
        initialises the layout state that plottree() keeps in its function
        attributes, draws the tree and shows the figure.
        """
        figure=plt.figure(1,facecolor='white')
        figure.clf()
        ticks=[0,0.2,0.4,0.6,0.8,1]
        # Passing the ticks is optional; they are shown by default
        createplot.ax1=plt.subplot(111,frameon=True,xticks=list(ticks),yticks=list(ticks))
        # Width = number of leaves, depth = number of levels of the whole tree
        plottree.totalw=float(calculateleaf(intree))
        plottree.totald=float(calculatedepth(intree))
        # Start half a slot left of the axes so the first leaf lands on 0.5/totalw
        plottree.xoff=-0.5/plottree.totalw
        plottree.yoff=1.0
        plottree(intree,(0.5,1.0),'')
        plt.show()
    
    if __name__=='__main__':
        # Plot the small sample tree defined at module level
        createplot(anothertree)
    

    @@递归探讨

    当碰到递归时,沿着递归执行到最终结果(即最后停止递归的地方),然后再依次往上层执行

    # -*- coding: UTF-8 -*- 
    def calculatedepth(mytree):
        """Return the depth of a nested-dict tree, printing a trace of the recursion.

        `mytree` has the form {attribute: {value: subtree_or_leaf, ...}}.
        For every branch the key is printed, then which case applied
        (subtree or leaf) together with the depth found for that branch,
        and finally the tuple (numdepth, maxdepth).

        Raises:
            IndexError: if `mytree` is an empty dict.
        """
        maxdepth=0
        # list(...) works on Python 3 too, where keys() is a non-indexable view
        firststr=list(mytree)[0]
        seconddict=mytree[firststr]
        for key in seconddict.keys():
            print(key)
            if isinstance(seconddict[key],dict):
                print('**')
                numdepth=1+calculatedepth(seconddict[key])
                print('%s %s' % ('第1种情况',numdepth))
            else:
                numdepth=1   # a leaf contributes a single level
                print('%s %s' % ('第2种情况',numdepth))
            if numdepth>maxdepth:
                maxdepth=numdepth
            # Double parentheses keep the tuple output "(n, m)" on Python 3 as well
            print((numdepth,maxdepth))
        return maxdepth
    
    # Sample tree for the trace; the subtree under key 1 is written before the leaf under key 0
    mytree={'no surfacing': {1: {'flippers': {0: 'no', 1: 'yes'}},0: 'no'}}
    
    if __name__=='__main__':
        # Run the traced depth computation; the result is kept in `a` and not printed
        a=calculatedepth(mytree)
    

    隐形眼镜数据集.png

    相关文章

      网友评论

          本文标题:决策树对西瓜数据集2.0二分类

          本文链接:https://www.haomeiwen.com/subject/fyycattx.html