# Adapted from: Machine Learning in Action
from math import log
import operator
import pickle
# Compute the Shannon entropy of a dataset: the expected information
# value over all class labels (the last column of each row).
# The higher the entropy, the more mixed (impure) the labels are.
def calc_shannon_ent(data_set):
    ent_num = len(data_set)
    label_count = {}
    for feat in data_set:
        curr_label = feat[-1]
        if curr_label not in label_count:
            label_count[curr_label] = 0
        label_count[curr_label] += 1  # count occurrences of each label
    shannon_ent = 0.0
    for key in label_count:
        prob = float(label_count[key]) / ent_num
        shannon_ent -= prob * log(prob, 2)  # H = -sum(p * log2(p))
    return shannon_ent
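# Quick sanity check (illustrative, not from the book's code): two 'yes'
# and three 'no' labels give H = -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.9710.
assert abs(calc_shannon_ent([[1, 'yes'], [1, 'yes'],
                             [1, 'no'], [0, 'no'], [0, 'no']]) - 0.9710) < 1e-4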
# Split the dataset:
# 1. keep only the rows whose value at position `axis` equals `value`
# 2. remove the `axis` column from those rows
# e.g. for [[1, 0, 0], [1, 1, 0], [0, 1, 2]]
# with axis = 0, value = 1
# the result is [[0, 0], [1, 0]]
def split_dataset(dataset, axis, value):
    ret_set = []
    for feat in dataset:
        if feat[axis] == value:
            reduce_feat = feat[:axis]  # chop out the axis used for splitting
            reduce_feat.extend(feat[axis + 1:])
            ret_set.append(reduce_feat)
    return ret_set
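# Executable version of the example in the comment above (illustrative):
assert split_dataset([[1, 0, 0], [1, 1, 0], [0, 1, 2]], 0, 1) == [[0, 0], [1, 0]]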
# Choose the best (most informative) feature to split on,
# as in the ID3 algorithm.
def choose_best_feature_to_split(dataset):
    feature_num = len(dataset[0]) - 1  # the last column is the label; the rest are features
    base_entropy = calc_shannon_ent(dataset)  # entropy of the whole dataset
    best_info_gain = 0.0
    best_feature = -1
    # compute the post-split entropy for each feature in turn
    for i in range(feature_num):
        feat_list = [example[i] for example in dataset]
        unique_val = set(feat_list)  # the distinct values this feature takes
        # entropy after splitting on feature i: the weighted sum of the
        # entropies of the subsets produced by each distinct value
        new_entropy = 0.0
        for value in unique_val:
            sub_dataset = split_dataset(dataset, i, value)
            prob = len(sub_dataset) / float(len(dataset))
            new_entropy += prob * calc_shannon_ent(sub_dataset)
        # information gain of feature i: lower post-split entropy means
        # purer subsets, so a larger gain means a better split
        info_gain = base_entropy - new_entropy
        if info_gain > best_info_gain:
            best_info_gain = info_gain
            best_feature = i  # remember the best feature so far
    return best_feature
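# Worked numbers for the fish dataset defined at the bottom of this file
# (computed by hand, illustrative):
#   base entropy      = 0.9710
#   gain of feature 0 = 0.9710 - (3/5 * 0.9183 + 2/5 * 0.0) = 0.4200
#   gain of feature 1 = 0.9710 - (4/5 * 1.0000 + 1/5 * 0.0) = 0.1710
# so feature 0 ('no surfacing') is chosen:
assert choose_best_feature_to_split(
    [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]) == 0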
# If every feature has been consumed but the labels are still not unique,
# decide by majority vote: return the label that occurs most often.
def majority_cnt(class_list):
    class_count = {}
    for vote in class_list:
        if vote not in class_count:
            class_count[vote] = 0
        class_count[vote] += 1
    sorted_class_count = sorted(class_count.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_class_count[0][0]
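# Illustrative: with two 'no' votes against one 'yes', 'no' wins.
assert majority_cnt(['yes', 'no', 'no']) == 'no'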
def create_decision_tree(dataset, labels):
    class_list = [example[-1] for example in dataset]  # the label column
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]  # all labels identical: stop splitting and return that class
    if len(dataset[0]) == 1:  # all features consumed, only the label column is left
        return majority_cnt(class_list)
    # pick the best feature; on the sample data below this is 'no surfacing'
    best_feature = choose_best_feature_to_split(dataset)
    best_feature_label = labels[best_feature]
    new_tree = {best_feature_label: {}}
    del labels[best_feature]  # remove the chosen feature's name from the label list
    # the column of the dataset for the best feature, e.g. [1, 1, 1, 0, 0]
    feat_vals = [example[best_feature] for example in dataset]
    # the distinct values in that column, e.g. {1, 0}
    unique_vals = set(feat_vals)
    # recurse into the subset produced by each distinct value
    for value in unique_vals:
        sublabels = labels[:]  # copy, so each branch gets its own label list
        sub_dataset = split_dataset(dataset, best_feature, value)  # drop the chosen feature column
        new_tree[best_feature_label][value] = create_decision_tree(sub_dataset, sublabels)
    return new_tree
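# Caveat worth making explicit: create_decision_tree mutates the labels
# list it receives (via del labels[best_feature]), so the caller's list is
# consumed; that is why the script below rebuilds labels before classifying.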
# Trace on the sample data:
# 1. [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
# 2. column 0 ('no surfacing') has the highest gain, so split on it first:
#    x1 = [1, 1, 'yes'], x2 = [1, 1, 'yes'], x3 = [1, 0, 'no']
#    y1 = [0, 1, 'no'],  y2 = [0, 1, 'no']
# 3. then split by the second column ('flippers'):
#    x1, x2, y1, y2 form one group (flippers = 1)
#    x3 forms the other (flippers = 0)
# Classify a sample by walking the decision tree.
def decision_tree_classify(input_tree, feat_labels, test_vector):
    first_str = list(input_tree.keys())[0]  # the root node is the first feature name
    second_dict = input_tree[first_str]  # the subtree under the root
    feat_idx = feat_labels.index(first_str)  # map the feature name to its column index
    key = test_vector[feat_idx]  # the sample's value for that feature, e.g. 0 or 1
    val_of_feat = second_dict[key]  # follow the branch for that value
    if isinstance(val_of_feat, dict):
        class_label = decision_tree_classify(val_of_feat, feat_labels, test_vector)
    else:
        class_label = val_of_feat  # reached a leaf node: this is the predicted label
    return class_label
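# Note (defensive-use sketch, not in the original code): if the test vector
# contains a feature value never seen in training, second_dict[key] raises
# KeyError; a caller could catch it and fall back to a default label, e.g.:
# try:
#     label = decision_tree_classify(tree, labels, sample)
# except KeyError:
#     label = 'no'  # hypothetical default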
# Serialize the decision tree to disk with pickle.
def store_tree(input_tree, filename):
    with open(filename, 'wb') as fw:  # pickle requires a binary-mode file
        pickle.dump(input_tree, fw)

def grab_tree(filename):
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
data_set = [[1, 1, 'yes'],
[1, 1, 'yes'],
[1, 0, 'no'],
[0, 1, 'no'],
[0, 1, 'no']]
labels = ['no surfacing', 'flippers']
best = choose_best_feature_to_split(data_set)  # -> 0 ('no surfacing')
new_tree = create_decision_tree(data_set, labels)
print('new_tree')
print(new_tree)
# {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
labels = ['no surfacing', 'flippers']  # rebuild: create_decision_tree mutated the old list
result = decision_tree_classify(new_tree, labels, [1, 1])
print('result')
print(result)
# yes
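# Round-trip the tree through pickle (illustrative; 'tree.pkl' is an
# arbitrary filename, written to the current working directory):
store_tree(new_tree, 'tree.pkl')
print(grab_tree('tree.pkl') == new_tree)
# True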