决策树理论
决策树
ID3 信息增益
C4.5 信息增益率
CART 基尼系数
前剪枝,后剪枝
from math import log
import operator
import treePlotter
import matplotlib.pyplot as plt
sklearn决策树
插件graphviz和插件pydotplus安装
在决策树中需要使用插件graphviz和插件pydotplus
第一步是安装graphviz。下载地址:http://www.graphviz.org/ 。无论是Linux还是Windows,安装完成后都要设置环境变量,将graphviz的bin目录加入PATH。例如在Windows上,将 C:/Program Files (x86)/Graphviz2.38/bin/ 加入PATH(下面代码中的路径需要改为本机实际的安装目录)。
import os
os.environ["PATH"] += os.pathsep + 'D:/Program Files (x86)/Graphviz2.38/bin/'
第二步是安装python插件pydotplus,在anaconda3的命令行执行以下命令
conda install -c conda-forge pydotplus
import pydotplus
参数说明
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False, random_state=None,
splitter='best')
训练集和测试集选择:留出法(train_test_split 按 8:2 随机划分;如需交叉验证,可使用 cross_val_score 或 GridSearchCV)
from sklearn.model_selection import train_test_split

# Features are every column except the label column 'class'.
X = data.drop(columns='class')
y = data['class']
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
# Column labels of the training features.
columns = X_train.columns
数据归一化(min-max)或标准化(z-score)
# Min-max scaling: subtract each column's minimum and divide by the column's
# range (max - min).
col_min = data.min()
col_range = data.max() - col_min
sdata = (data - col_min) / col_range
或者
from sklearn.preprocessing import StandardScaler

# One scaler for the features and one for the target; only ss_X is used in
# this snippet.
ss_X, ss_y = StandardScaler(), StandardScaler()
# Fit on the training features only, then apply the same fitted transform to
# the test features.
X_train = ss_X.fit_transform(X_train)
X_test = ss_X.transform(X_test)
tree
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn import tree
import pydotplus
import os
# Load the training table from a GBK-encoded CSV file.
dataSet = pd.read_csv("trainData.csv", encoding="gbk")
# Label column.
target = dataSet["class"]
# Feature columns: the label-based slice from "ele_ind" through "alarm_ind"
# (.loc slices include both endpoints).
# NOTE: a preceding `data = dataSet.drop(["class"], axis=1)` was dead code
# (overwritten here before being read) and has been removed; use
# `dataSet.drop(columns=["class"])` instead of this line if every non-label
# column should be a feature.
data = dataSet.loc[:, "ele_ind":"alarm_ind"]
数据归一化(min-max,将各列缩放到 [0, 1])
# Min-max scale every feature column: (x - min) / (max - min).
# NOTE(review): a constant column has max == min and becomes NaN (0/0) here.
sdata = (data - data.min()) / (data.max() - data.min())

# Hold out 20% of the rows for testing; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    sdata, target, random_state=0, test_size=0.2
)
columns = X_train.columns

# Use the entropy split criterion rather than the estimator's default.
# (A preceding default-constructed classifier was immediately overwritten
# and has been dropped.)
mode = tree.DecisionTreeClassifier(criterion='entropy')
mode = mode.fit(X_train, y_train)

# Column 1 of predict_proba, thresholded at 0.5 into 0/1 predictions.
# NOTE(review): assumes a binary problem whose labels are 0 and 1 -- confirm.
y_prob = mode.predict_proba(X_test)[:, 1]
y_pred = np.where(y_prob > 0.5, 1, 0)

# Accuracy must be measured against the true held-out labels. Scoring against
# y_pred compared the model with its own output and so always looked perfect.
mode.score(X_test, y_test)
import os

import graphviz
import pydotplus
from sklearn import tree

# Append the Graphviz bin directory to PATH for this process.
os.environ["PATH"] = os.environ["PATH"] + os.pathsep + 'D:/Program Files (x86)/Graphviz2.38/bin/'

# Labels for the rendered tree: feature column names, and the distinct values
# of the "class" column converted to strings.
data_feature_name = data.columns[:]
data_target_name = list(map(str, np.unique(dataSet["class"])))

# Export the fitted tree as DOT source (out_file=None returns it as a string),
# then write it out as a PDF.
dot_tree = tree.export_graphviz(
    mode,
    out_file=None,
    feature_names=data_feature_name,
    class_names=data_target_name,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = pydotplus.graph_from_dot_data(dot_tree)
graph.write_pdf("out.pdf")
网友评论