#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 2 10:51:33 2017
@author: lee
"""
from collections import defaultdict
import math
def CreateDataSet():
    data = [[1, 1, 'yes'],
            [1, 1, 'yes'],
            [1, 0, 'no'],
            [0, 1, 'no'],
            [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return data, labels
def CreateTree(data, labels, eps=-1):
    # Work on a copy so the caller's label list is not mutated by the del below
    labels = labels[:]
    # Class label of every training example (last column)
    data_labels = [y[-1] for y in data]
    # Feature values of the first example; the set is empty once every
    # feature column has been consumed
    unique_feature = set(data[0][:-1])
    # (1) If all instances belong to the same class, the tree is a single
    # leaf node. Gotcha: testing len(labels) == 1 is not equivalent --
    # count the class labels instead.
    if data_labels.count(data_labels[0]) == len(data_labels):
        return data_labels[0]
    # (2) If the feature set is empty, return a leaf labelled with the
    # majority class of the training set.
    if len(unique_feature) == 0:
        dic = defaultdict(int)
        for lab in data_labels:
            dic[lab] += 1
        # Taking the max of a dict:
        #   max(dic, key=dic.get)                 -> key with the largest value
        #   max(dic.items(), key=lambda x: x[1])  -> the (key, value) pair
        return max(dic, key=dic.get)
    # (3) General case: choose the best feature.
    # ID3 feature selection (information gain)
    maxfindex, maxgain = Maxinformation_gain(data, labels)
    # C4.5 feature selection (information gain ratio)
    #maxfindex, maxgain = Maxinformation_gain_ratio(data, labels)
    # (4) If the best gain is below the threshold eps, return a leaf
    # labelled with the majority class.
    if maxgain < eps:
        dic = defaultdict(int)
        for lab in data_labels:
            dic[lab] += 1
        return max(dic, key=dic.get)
    # (5) Split on each value of the chosen feature Ag.
    # Take the feature name out and drop it from the remaining labels.
    maxfeature = labels[maxfindex]
    del labels[maxfindex]
    # All values this feature takes in the training data
    featureVal = [value[maxfindex] for value in data]
    # The tree is represented as a nested dict
    idtree = {maxfeature: {}}
    featureValunq = set(featureVal)
    # (6) Build one subtree per feature value; pass eps down so the
    # threshold applies at every depth
    for fval in featureValunq:
        idtree[maxfeature][fval] = CreateTree(SplitData(data, maxfindex, fval), labels[:], eps)
    return idtree
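# A minimal traversal sketch (an addition, not part of the original recipe):
# walk the nested dict produced by CreateTree down to a leaf. The name
# Classify and its arguments are illustrative assumptions; feature_names
# must be the original, unshortened label list.
def Classify(tree, feature_names, example):
    if not isinstance(tree, dict):
        return tree                        # leaf: a class label
    feature = list(tree.keys())[0]         # feature this node splits on
    findex = feature_names.index(feature)  # its column in the example
    return Classify(tree[feature][example[findex]], feature_names, example)
# e.g. Classify(tree, ['no surfacing', 'flippers'], [1, 1]) -> 'yes'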
# Pick the feature with the largest information gain;
# returns (feature index, gain)
def Maxinformation_gain(data, labels):
    feature_gain = {}
    data_labels = [y[-1] for y in data]
    entropyD = entropy(data_labels)
    # information gain of each feature: g(D, A) = H(D) - H(D|A)
    for f in range(len(labels)):
        featureVal = [value[f] for value in data]
        entropyF = ConditionalEntropy(featureVal, data_labels)
        feature_gain[f] = entropyD - entropyF
    result = max(feature_gain.items(), key=lambda x: x[1])
    return result[0], result[1]
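# A hand-worked check on the toy dataset (computed from the formulas above):
# g(D, 'no surfacing') = 0.971 - 0.551 ≈ 0.420 and g(D, 'flippers') ≈ 0.171,
# so 'no surfacing' is chosen as the root split.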
# Split the data set on feature column split_index == split_value,
# dropping that column from the returned rows
def SplitData(data, split_index, split_value):
    subdata = []
    for row in data:
        if row[split_index] == split_value:
            # keep everything except the split column
            subdata.append(row[:split_index] + row[split_index+1:])
    return subdata
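# e.g. SplitData([[1, 1, 'yes'], [0, 1, 'no']], 0, 1) -> [[1, 'yes']]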
# Empirical entropy of a label list: H(D) = -sum_k p_k * log2(p_k)
def entropy(y):
    n = len(y)
    elist = defaultdict(int)
    result = 0.
    for lab in y:
        elist[lab] += 1
    for k in elist:
        # every counted key occurs at least once, so p > 0
        p = elist[k] / float(n)
        result -= p * math.log(p, 2)
    return result
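# e.g. entropy(['yes', 'yes', 'no', 'no', 'no'])
#      = -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.971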
# Empirical conditional entropy: H(D|A) = sum_i |D_i|/|D| * H(D_i),
# where D_i is the subset of D on which feature A takes its i-th value
def ConditionalEntropy(datalist, y):
    data_u = set(datalist)
    data_dic = defaultdict(int)
    y_dic = defaultdict(list)
    n = len(datalist)
    result = 0.
    # count each feature value and group the class labels that go with it
    for i in range(n):
        data_dic[datalist[i]] += 1
        y_dic[datalist[i]].append(y[i])
    for v in data_u:
        result += data_dic[v] / float(n) * entropy(y_dic[v])
    return result
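# e.g. for 'no surfacing' on the toy data, the values are [1, 1, 1, 0, 0]:
#      H(D|A) = (3/5)*H(['yes','yes','no']) + (2/5)*H(['no','no']) ≈ 0.551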
# C4.5: pick the feature with the largest information gain ratio
# g_R(D, A) = g(D, A) / H_A(D); returns (feature index, gain ratio)
def Maxinformation_gain_ratio(data, labels):
    result = {}
    data_labels = [y[-1] for y in data]
    entropyD = entropy(data_labels)
    # gain ratio of each feature
    for f in range(len(labels)):
        featureVal = [value[f] for value in data]
        entropyF = ConditionalEntropy(featureVal, data_labels)
        feature_gain = entropyD - entropyF
        feature_data_en = FeatureDataEntropy(featureVal)
        result[f] = feature_gain / feature_data_en
    # take the (index, ratio) pair with the largest ratio;
    # max(result, key=result.get) alone would return only the key
    best = max(result.items(), key=lambda x: x[1])
    return best[0], best[1]
# Entropy of the data set with respect to feature A's values:
# H_A(D) = -sum_i |D_i|/|D| * log2(|D_i|/|D|)
def FeatureDataEntropy(datalist):
    data_dic = defaultdict(int)
    result = 0.
    n = len(datalist)
    # count each distinct feature value
    for v in datalist:
        data_dic[v] += 1
    for k in data_dic:
        p = data_dic[k] / float(n)
        # subtract so that H_A(D) is non-negative
        result -= p * math.log(p, 2)
    return result
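# e.g. for values [1, 1, 1, 0, 0]:
#      H_A(D) = -(3/5)*log2(3/5) - (2/5)*log2(2/5) ≈ 0.971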
data, label = CreateDataSet()
print CreateTree(data, label)
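# Expected output (dict key order may vary):
# {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}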