决策树对西瓜数据集2.0二分类

作者: 异想派 | 来源:发表于2017-04-07 00:15 被阅读1971次

机器学习：决策树算法代码详细注释笔记
决策树的构建及可视化——帮自己配副隐形眼镜
ML08-决策树
机器学习决策树—Apple的学习笔记
决策树
机器学习实战教程（三）：决策树实战篇（c）
决策树对西瓜数据集2.0二分类
第十四天使用决策树对乳腺癌进行分类
决策树
【Spark Mllib】决策树，随机森林——预测森林植被类型

西瓜数据集.jpg

@生成分类字典

# -*- coding: UTF-8 -*- 

#设置默认编码，否则中文会乱码
import sys 
reload(sys) 
sys.setdefaultencoding('utf-8') 
from math import log

#1、获取样例集和属性列表
def filetodataset(filename):
    """Load a comma-separated sample file into a dataset and a feature list.

    The first line of the file is treated as the header; its last column
    (the class column) is dropped so that only feature names are returned.
    Every following non-blank line becomes one sample: a list of strings
    whose last element is the class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(dataset, featname)`` tuple. ``dataset`` is a list of samples
        (each a list of str) and ``featname`` is the list of feature names.
        An empty file yields ``([], [])``.
    """
    # The context manager closes the file even if reading fails.
    with open(filename, 'r') as fr:
        all_lines = fr.readlines()
    if not all_lines:
        return [], []

    # Drop the last header column: it names the class, not a feature.
    featname = all_lines[0].strip().split(',')[:-1]

    dataset = []
    for line in all_lines[1:]:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. trailing newlines at the end of the
            # file); they would otherwise become bogus [''] samples.
            continue
        dataset.append(line.split(','))
    return dataset, featname

#2、计算香农熵
def calcent(dataset):
    """Return the base-2 Shannon entropy of the class labels in *dataset*.

    The class label of every sample is read from its last element. An empty
    dataset yields 0.
    """
    # Tally how many samples belong to each class.
    counts = {}
    for sample in dataset:
        label = sample[-1]
        counts[label] = counts.get(label, 0) + 1

    total = len(dataset)
    entropy = 0
    for occurrences in counts.values():
        p = float(occurrences) / total
        entropy -= p * log(p, 2)
    return entropy

#3、对特定属性选择特定取值后，将满足该条件的剩余数据集组合留待计算香农熵
def splitdataset(dataset,axis,value):
    """Select the samples whose column *axis* equals *value*, minus that column.

    Returns a new list of new sample lists; the input samples are not
    modified.
    """
    return [
        row[:axis] + row[axis + 1:]
        for row in dataset
        if row[axis] == value
    ]

#4、选择最佳的划分属性
def choosebestfeaturetosplit(dataset):
    """Return the index of the feature column with the largest information gain.

    The last column of every sample is the class label, so only the columns
    before it are candidates. Returns -1 when no candidate has a strictly
    positive gain.
    """
    total = float(len(dataset))
    base_entropy = calcent(dataset)      # entropy of the unsplit sample set
    best_gain, best_index = 0.0, -1

    for index in range(len(dataset[0]) - 1):
        # Weighted entropy that remains after splitting on this column.
        remaining_entropy = 0
        for value in set(row[index] for row in dataset):
            subset = splitdataset(dataset, index, value)
            remaining_entropy += len(subset) / total * calcent(subset)

        gain = base_entropy - remaining_entropy
        if gain > best_gain:
            best_gain, best_index = gain, index
    return best_index


#5、返回样例中类最多的那个类别
def majorclass(data):
    """Return the class label that occurs most often in *data*.

    The class label of every sample is read from its last element. The
    previous implementation returned the whole sorted list of
    ``(label, count)`` pairs, which ended up as a leaf of the decision tree
    instead of a single label.

    Args:
        data: List of samples; the last element of each is its class label.

    Returns:
        The most frequent class label, or ``None`` when *data* is empty.
        Ties are resolved by dict iteration order.
    """
    # Single-pass tally instead of calling list.count() once per sample.
    counts = {}
    for sample in data:
        label = sample[-1]
        counts[label] = counts.get(label, 0) + 1

    if not counts:
        return None
    return max(counts, key=counts.get)


#6、生成决策树
def createtree(mydata,labels):
    """Recursively build a decision tree as nested dicts using information gain.

    Args:
        mydata: Non-empty list of samples; each sample is a list whose last
            element is the class label and whose other elements are feature
            values.
        labels: Feature names, one per feature column of *mydata*. The list
            is not modified.

    Returns:
        A class label when the node is a leaf, otherwise a dict of the form
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.
    """
    classes = [sample[-1] for sample in mydata]

    # Case 1: every sample has the same class -> leaf with that class.
    if len(set(classes)) == 1:
        return classes[0]

    # Case 2: no feature columns left (only the class column remains) ->
    # fall back to majorclass().
    if len(mydata[0]) == 1:
        return majorclass(mydata)

    # Case 3: split on the feature with the best information gain.
    bestfeature = choosebestfeaturetosplit(mydata)
    if bestfeature < 0:
        # No feature gives a positive gain. Using -1 as an index would
        # silently split on the class column, so stop here instead.
        return majorclass(mydata)

    bestfeaturelabel = labels[bestfeature]
    # Build a new list instead of `del labels[bestfeature]` so the caller's
    # list is left untouched.
    sublabels = labels[:bestfeature] + labels[bestfeature + 1:]

    mytree = {bestfeaturelabel: {}}
    for value in set(sample[bestfeature] for sample in mydata):
        subset = splitdataset(mydata, bestfeature, value)
        mytree[bestfeaturelabel][value] = createtree(subset, sublabels)
    return mytree


if __name__=='__main__':
    import json
    # Use the dataset path given on the command line when there is one;
    # otherwise fall back to the original default location.
    if len(sys.argv) > 1:
        filename = sys.argv[1]
    else:
        filename = '/Users/enniu/Desktop/jqxx/xiguaset.txt'
    mydata, featname = filetodataset(filename)
    mytree = createtree(mydata, featname)
    # ensure_ascii=False keeps the Chinese text readable in the output
    # instead of escaping it as \uXXXX sequences. The single-argument,
    # parenthesised print works the same on Python 2 and Python 3.
    print(json.dumps(mytree, ensure_ascii=False))

结果如下：

{"纹理": {"模糊": "否", "清晰": {"根蒂": {"稍蜷": {"色泽": {"乌黑": {"触感": {"软粘": "否", "硬滑": "是"}}, "青绿": "是"}}, "蜷缩": "是", "硬挺": "否"}}, "稍糊": {"触感": {"软粘": "是", "硬滑": "否"}}}}
说明
1、在同一条从根结点到叶结点的路径上，每个属性只会出现一次，因为每次划分后该属性都会从子数据集中剔除；但同一个属性可以出现在不同的分支上。

2、与《机器学习》一书第78页的决策树相比，少了“色泽=浅白 → 好瓜”这一分支的判断。这是因为本代码只对当前子数据集中实际出现的属性取值建立分支（见 createtree 中的 ufeaturevalue），子数据集里没有出现的取值不会生成分支。

参考：
如何实现并应用决策树算法？
python 字典中有中文写入文件后变成编码

@绘制树形图


# -*- coding:utf-8 -*-

import sys 
reload(sys) 
sys.setdefaultencoding('utf-8')
import matplotlib.pyplot as plt
import json
#mytree={"纹理": {"模糊": "否", "清晰": {"根蒂": {"稍蜷": {"色泽": {"乌黑": {"触感": {"软粘": "否", "硬滑": "是"}}, "青绿": "是"}}, "蜷缩": "是", "硬挺": "否"}}, "稍糊": {"触感": {"软粘": "是", "硬滑": "否"}}}}
# Small sample tree (nested dicts: {feature: {value: subtree_or_leaf}}) that
# is plotted when this script is run directly.
anothertree={'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
#anothertree={'no surfacing': {1: {'flippers': {0: 'no', 1: 'yes'}},0: 'no'}}
#print json.dumps(mytree,ensure_ascii=False)

#计算叶节点数目
def calculateleaf(mytree):
    """Count the leaf nodes of a decision tree stored as nested dicts.

    Args:
        mytree: Tree of the form ``{feature: {value: subtree_or_leaf, ...}}``.
            Only its first key is used as the root; every branch value that
            is not a dict counts as one leaf.

    Returns:
        The number of leaves; 0 for an empty tree.
    """
    if not mytree:
        return 0
    # next(iter(...)) fetches the first key on both Python 2 and Python 3;
    # `mytree.keys()[0]` fails on Python 3 because keys() is a view.
    firststr = next(iter(mytree))
    numleaf = 0
    for child in mytree[firststr].values():
        if isinstance(child, dict):
            numleaf += calculateleaf(child)
        else:
            numleaf += 1
    return numleaf

#计算树的层数
def calculatedepth(mytree):
    """Return the depth of a decision tree stored as nested dicts.

    Args:
        mytree: Tree of the form ``{feature: {value: subtree_or_leaf, ...}}``.
            Only its first key is used as the root.

    Returns:
        The number of decision levels on the longest root-to-leaf path: a
        root whose branches are all leaves has depth 1. An empty tree has
        depth 0.
    """
    if not mytree:
        return 0
    # next(iter(...)) fetches the first key on both Python 2 and Python 3.
    firststr = next(iter(mytree))
    maxdepth = 0
    for child in mytree[firststr].values():
        if isinstance(child, dict):
            numdepth = 1 + calculatedepth(child)
        else:
            numdepth = 1   # a leaf ends the path one level below this node
        if numdepth > maxdepth:
            maxdepth = numdepth
    return maxdepth

def plotmidtext(cntrpt,parentpt,txtstring):
    """Write *txtstring* at the midpoint between *cntrpt* and *parentpt*.

    Both points are (x, y) pairs; the text is drawn on ``createplot.ax1``.
    """
    child_x, child_y = cntrpt[0], cntrpt[1]
    midpoint = (
        (parentpt[0] - child_x) / 2.0 + child_x,
        (parentpt[1] - child_y) / 2.0 + child_y,
    )
    createplot.ax1.text(midpoint[0], midpoint[1], txtstring)

# Box style for decision (internal) nodes; plotnode passes it to annotate as bbox.
decisionnode=dict(boxstyle="sawtooth",fc="0.8")
# Box style for leaf nodes; plotnode passes it to annotate as bbox.
leafnode=dict(boxstyle="round4",fc="0.8")
# Arrow style shared by all connectors; plotnode passes it to annotate as arrowprops.
arrow_args=dict(arrowstyle="<-")

def plotnode(nodetext,centerpt,parentpt,nodetype):
    """Draw one boxed node at *centerpt* with an arrow linking it to *parentpt*.

    *nodetype* is the bbox style dict for the node; the annotation is drawn
    on ``createplot.ax1`` with ``xy`` given in axes-fraction coordinates.
    """
    createplot.ax1.annotate(
        nodetext,
        xy=parentpt,
        xytext=centerpt,
        arrowprops=arrow_args,
        xycoords='axes fraction',
        va='center',
        ha='center',
        bbox=nodetype,
    )


def plottree(mytree,parentpt,nodetxt):
    """Recursively draw *mytree* below *parentpt* on ``createplot.ax1``.

    Layout state is kept in function attributes that ``createplot``
    initialises before the first call: ``plottree.totalw`` (leaf count of the
    whole tree), ``plottree.totald`` (depth of the whole tree),
    ``plottree.xoff`` (x of the most recently placed leaf) and
    ``plottree.yoff`` (y of the level currently being drawn).

    Args:
        mytree: Tree of the form ``{feature: {value: subtree_or_leaf, ...}}``.
        parentpt: (x, y) of the parent node the root of *mytree* hangs from.
        nodetxt: Text written on the edge between the parent and this root.
    """
    numleafs = calculateleaf(mytree)
    firststr = next(iter(mytree))   # first key; works on Python 2 and 3
    # Centre this decision node horizontally above the leaves of its subtree.
    cntrpt = (plottree.xoff + (1.0 + float(numleafs)) / 2.0 / plottree.totalw,
              plottree.yoff)
    # The trace output is formatted with %s so that each line is the same
    # text the Python 2 `print a, b` statements produced.
    print('子节点坐标: %s' % (cntrpt,))
    plotmidtext(cntrpt, parentpt, nodetxt)
    # For the root call parentpt equals cntrpt, so the connector has zero length.
    plotnode(firststr, cntrpt, parentpt, decisionnode)
    print('绘制连接箭头 %s %s' % (cntrpt, parentpt))
    seconddict = mytree[firststr]

    # Step one level down for the children of this node.
    parent_y = plottree.yoff
    plottree.yoff = parent_y - 1.0 / (1.0 * plottree.totald)
    print('y轴值: %s' % plottree.yoff)
    for key, child in seconddict.items():
        if isinstance(child, dict):
            print('***sandy*** %s' % plottree.xoff)
            plottree(child, cntrpt, str(key))
            print('***lam*** %s' % plottree.xoff)
        else:
            # Each leaf takes the next horizontal slot.
            plottree.xoff = plottree.xoff + 1.0 / plottree.totalw
            leafpt = (plottree.xoff, plottree.yoff)
            plotnode(child, leafpt, cntrpt, leafnode)
            print('灯灯hoho %s %s' % (leafpt, cntrpt))
            plotmidtext(leafpt, cntrpt, str(key))
    # Restore this node's level before returning. Without this, a sibling
    # drawn after a nested subtree is placed one level too low.
    plottree.yoff = parent_y

def createplot(intree):
    """Draw the decision tree *intree* in figure 1 and show the window.

    Stores the axes on ``createplot.ax1`` (used by ``plotnode`` and
    ``plotmidtext``) and initialises the layout attributes on ``plottree``
    before starting the recursive drawing at the top centre of the axes.
    """
    figure = plt.figure(1, facecolor='white')
    figure.clf()

    # The explicit ticks can be dropped; ticks are shown by default.
    createplot.ax1 = plt.subplot(
        111,
        frameon=True,
        xticks=[0, 0.2, 0.4, 0.6, 0.8, 1],
        yticks=[0, 0.2, 0.4, 0.6, 0.8, 1],
    )

    leaf_total = float(calculateleaf(intree))
    plottree.totalw = leaf_total
    plottree.totald = float(calculatedepth(intree))
    # Start half a leaf slot left of the axes so the first leaf is centred
    # in its slot.
    plottree.xoff = -0.5 / leaf_total
    plottree.yoff = 1.0

    plottree(intree, (0.5, 1.0), '')
    plt.show()

if __name__=='__main__':
    # Plot the sample tree when the script is run directly.
    createplot(anothertree)

@@递归探讨

当碰到递归时，程序会沿着递归调用一路深入，直到到达终止条件（即不再继续递归的地方），然后再逐层返回，依次完成上层的计算

# -*- coding: UTF-8 -*- 
def calculatedepth(mytree):
    """Return the depth of a nested-dict tree, printing a trace of the recursion.

    Args:
        mytree: Tree of the form ``{feature: {value: subtree_or_leaf, ...}}``.
            Only its first key is used as the root.

    Returns:
        The number of decision levels on the longest root-to-leaf path;
        0 for an empty tree.
    """
    if not mytree:
        return 0
    maxdepth = 0
    # next(iter(...)) fetches the first key on both Python 2 and Python 3.
    firststr = next(iter(mytree))
    seconddict = mytree[firststr]
    for key in seconddict:
        # The trace lines are formatted with %s so that the output is the
        # same text the Python 2 print statements produced.
        print(key)
        child = seconddict[key]
        if isinstance(child, dict):
            print('**')
            numdepth = 1 + calculatedepth(child)
            print('第1种情况 %s' % numdepth)
        else:
            numdepth = 1   # a leaf ends the path one level below this node
            print('第2种情况 %s' % numdepth)
        if numdepth > maxdepth:
            maxdepth = numdepth
        # Printed as a tuple, e.g. "(1, 1)".
        print('%s' % ((numdepth, maxdepth),))
    return maxdepth

# Sample tree used to trace the recursion; here the nested subtree (key 1)
# is listed before the leaf (key 0).
mytree={'no surfacing': {1: {'flippers': {0: 'no', 1: 'yes'}},0: 'no'}}

if __name__=='__main__':
    # Run the traced depth computation on the sample tree.
    a=calculatedepth(mytree)

隐形眼镜数据集.png

机器学习：决策树算法代码详细注释笔记
计算给定数据集的香农熵按照给定特征划分数据集选择数据集最好的分类属性获取分类集中概率最大的分类创建决策树递...
决策树的构建及可视化——帮自己配副隐形眼镜
本文以一个新的数据集(隐形眼镜数据集)为基础实现构建决策树、决策树的保存与加载、利用决策树分类、决策树的可视化，前...
ML08-决策树
一、决策树应用体验分类从上面可以看出，决策树对分类具有线性回归无可比拟的优势, 如果对未参与训练的数据集是...
机器学习决策树—Apple的学习笔记
决策树学习的目标：根据给定的训练数据集构建一个决策树模型，使它能够对实例进行正确的分类。决策树学习的本质：从训练...
决策树
模型决策树的学习目标是根据给定的训练数据集，建立一个决策树，能够对实例进行正确分类。决策树算法通常是递归选择最优...
机器学习实战教程（三）：决策树实战篇（c）
四、使用决策树执行分类依靠训练数据构造了决策树之后，我们可以将它用于实际数据的分类。在执行数据分类时，需要决策树...
决策树对西瓜数据集2.0二分类
@生成分类字典结果如下： {"纹理": {"模糊": "否", "清晰": {"根蒂": {"稍蜷": {"色泽...
第十四天使用决策树对乳腺癌进行分类
对乳腺癌数据进行分类这节的目标是用用决策树对乳腺癌数据进行分类，另外还会涉及到决策树的集成。根据之前学习到的知识...
决策树
决策树决策树是一种树状的机器学习模型，模型中以数据属性做为分支结点，最后的分类结果作为叶子结点。下图是西瓜书里所...
【Spark Mllib】决策树，随机森林——预测森林植被类型
数据集处理模型训练决策树有训练分类模型的函数trainClassifier和回归模型的函数trainRegre...