1 An Overview of Sklearn Classification Algorithms
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version
1.1 Choosing a Machine Learning Algorithm

1.2 A First Look at scikit-learn
scikit-learn ships with a few built-in datasets, the best known of which is the Iris dataset.
Columns 3 and 4 of the data matrix hold the petal length and petal width. The class labels have already been encoded as integers:
0 = Iris-Setosa, 1 = Iris-Versicolor, 2 = Iris-Virginica.

1.2.1 Loading the Data
from sklearn import datasets
import numpy as np
iris = datasets.load_iris()
1.2.2 Reading Features and Labels
X = iris.data[:, [2, 3]]  # keep only petal length and petal width
y = iris.target
print('Class labels:', np.unique(y))
Class labels: [0 1 2]
1.2.3 Splitting the Data
We usually split the dataset into a training set and a test set; here we use 70% of the data for training and 30% for testing.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
1.2.4 Checking the Data Shapes
X_train.shape
(105, 2)
X_test.shape
(45, 2)
X.shape
(150, 2)
y_train.shape
(105,)
y_test.shape
(45,)
1.2.5 Standardizing the Features

Gradient-based learners such as the perceptron below converge much more reliably when every feature has zero mean and unit variance, so we fit a StandardScaler on the training set and apply the same transform to both splits.

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
sc.scale_
array([1.79595918, 0.77637684])
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
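As a quick sanity check (this snippet is an addition, not part of the original notebook), the transform is just a z-score computed from the training-set statistics that the fitted scaler stores in mean_ and scale_:

# sc.transform subtracts the training mean and divides by the
# training standard deviation: z = (x - mean) / std
assert np.allclose(X_train_std, (X_train - sc.mean_) / sc.scale_)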
1.3 Building Machine Learning Classification Models
1.3.1 The Perceptron
We first define a plot_decision_regions helper function for visualization, so that we can view the classification results intuitively in a moment.
from sklearn.linear_model import Perceptron
ppn = Perceptron(max_iter=5)
ppn.fit(X_train_std, y_train)
Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=5, n_iter=None, n_iter_no_change=5,
           n_jobs=None, penalty=None, random_state=0, shuffle=True, tol=None,
           validation_fraction=0.1, verbose=0, warm_start=False)
ppn.coef_  # the w in wx + b
ppn.intercept_  # the b in wx + b
y_pred = ppn.predict(X_test_std)
y_pred
array([2, 1, 0, 2, 0, 2, 0, 2, 1, 1, 1, 2, 1, 2, 1, 0, 2, 1, 0, 0, 2, 2,
       0, 0, 2, 0, 0, 1, 1, 0, 2, 2, 0, 2, 2, 1, 0, 2, 1, 1, 2, 0, 2, 0,
       0])
y_test
array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
       0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 1, 1, 1, 2, 0, 2, 0,
       0])
y_pred == y_test
array([ True,  True,  True,  True,  True,  True,  True, False,  True,
        True, False, False,  True, False,  True,  True, False,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True])
print('Misclassified samples: %d' % (y_test != y_pred).sum())
Misclassified samples: 8
from sklearn.metrics import accuracy_score
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
Accuracy: 0.82
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import warnings

def versiontuple(v):
    return tuple(map(int, (v.split("."))))

def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # plot the decision surface on a mesh grid covering the feature range
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # plot the samples of each class
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, c=cmap(idx),
                    marker=markers[idx], label=cl)

    # highlight test samples
    if test_idx:
        if not versiontuple(np.__version__) >= versiontuple('1.9.0'):
            X_test, y_test = X[list(test_idx), :], y[list(test_idx)]
            warnings.warn('Please update to NumPy 1.9.0 or newer')
        else:
            X_test, y_test = X[test_idx, :], y[test_idx]
        plt.scatter(X_test[:, 0],
                    X_test[:, 1],
                    alpha=0.15,
                    linewidths=2,
                    marker='^',
                    edgecolors='black',
                    facecolors='none',
                    s=55, label='test set')
Now let's visualize the perceptron classifier trained on the standardized data:
%matplotlib inline
X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))
plot_decision_regions(X=X_combined_std, y=y_combined,
                      classifier=ppn, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/iris_perceptron_scikit.png', dpi=300)
plt.show()

1.3.2 Logistic Regression

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1000.0, random_state=0)
lr.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std, y_combined, classifier=lr, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/logistic_regression.png', dpi=300)
plt.show()
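One advantage of logistic regression over the perceptron is that it also outputs class-membership probabilities. As an added illustration (not in the original notebook), predict_proba returns one probability per class for a given sample:

# predicted class probabilities for the first test sample
print(lr.predict_proba(X_test_std[0, :].reshape(1, -1)))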

1.3.3 Appendix: Overfitting and Regularization
For background, see the related discussion on Zhihu.
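To make this concrete, here is a minimal sketch (an addition, not part of the original notebook) that trains LogisticRegression with a range of values for the inverse regularization strength C. Smaller C means stronger L2 regularization, which visibly shrinks the learned weights toward zero and thereby combats overfitting:

# train one model per value of C and record the weights of class 1
weights, params = [], []
for c in np.arange(-5, 5):
    lr = LogisticRegression(C=10.0**c, random_state=0)
    lr.fit(X_train_std, y_train)
    weights.append(lr.coef_[1])
    params.append(10.0**c)
weights = np.array(weights)
plt.plot(params, weights[:, 0], label='petal length')
plt.plot(params, weights[:, 1], linestyle='--', label='petal width')
plt.ylabel('weight coefficient')
plt.xlabel('C')
plt.xscale('log')
plt.legend(loc='upper left')
plt.show()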

1.3.4 Maximum-Margin Classification and Support Vector Machines
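An SVM chooses the separating hyperplane that maximizes the margin, i.e. the distance to the closest training samples (the support vectors). For reference (this formula is added context, not from the original notebook), the hard-margin objective is

$$\min_{\mathbf{w},\,b}\ \frac{1}{2}\lVert\mathbf{w}\rVert^{2}\quad\text{subject to}\quad y^{(i)}\bigl(\mathbf{w}^{\top}\mathbf{x}^{(i)}+b\bigr)\ \ge\ 1\ \text{ for all } i.$$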

1.3.5 Handling Non-Linearly Separable Cases with Slack Variables
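Slack variables relax the margin constraints so that some samples may sit inside the margin or on the wrong side of it; the C parameter passed to SVC below is exactly the penalty on these violations. The soft-margin objective (added for reference) is

$$\min_{\mathbf{w},\,b,\,\boldsymbol{\xi}}\ \frac{1}{2}\lVert\mathbf{w}\rVert^{2}+C\sum_{i}\xi_{i}\quad\text{subject to}\quad y^{(i)}\bigl(\mathbf{w}^{\top}\mathbf{x}^{(i)}+b\bigr)\ \ge\ 1-\xi_{i},\quad \xi_{i}\ge 0.$$

A large C punishes violations harshly (narrow margin, higher variance); a small C tolerates them (wider margin, stronger regularization).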

from sklearn.svm import SVC
svm = SVC(kernel='linear', C=1.0, random_state=0)
svm.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std, y_combined, classifier=svm, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/support_vector_machine_linear.png', dpi=300)
plt.show()

1.3.6 Non-Linear Classification with the "Magic" of SVM Kernel Functions
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(0)
X_xor = np.random.randn(200, 2)
y_xor = np.logical_xor(X_xor[:, 0] > 0,
                       X_xor[:, 1] > 0)
y_xor = np.where(y_xor, 1, -1)
plt.scatter(X_xor[y_xor == 1, 0],
            X_xor[y_xor == 1, 1],
            c='b', marker='x',
            label='1')
plt.scatter(X_xor[y_xor == -1, 0],
            X_xor[y_xor == -1, 1],
            c='r',
            marker='s',
            label='-1')
plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.legend(loc='best')
plt.tight_layout()
# plt.savefig('./figures/xor.png', dpi=300)
plt.show()


1.3.7 Using the Kernel Trick to Find a Separating Hyperplane in High-Dimensional Space
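The RBF (Gaussian) kernel used below measures the similarity between two samples and corresponds to an implicit mapping into a very high-dimensional feature space. Its standard definition (added here for reference) is

$$K\bigl(\mathbf{x}^{(i)},\mathbf{x}^{(j)}\bigr)=\exp\Bigl(-\gamma\,\bigl\lVert\mathbf{x}^{(i)}-\mathbf{x}^{(j)}\bigr\rVert^{2}\Bigr).$$

The gamma parameter controls how far a single training sample's influence reaches: compare the smooth boundary from gamma=0.2 with the gamma=100.0 run further down, where the boundary hugs the training points, a clear sign of overfitting.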
svm = SVC(kernel='rbf', random_state=0, gamma=0.10, C=10.0)
svm.fit(X_xor, y_xor)
plot_decision_regions(X_xor, y_xor,
                      classifier=svm)
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/support_vector_machine_rbf_xor.png', dpi=300)
plt.show()

from sklearn.svm import SVC
svm = SVC(kernel='rbf', random_state=0, gamma=0.2, C=1.0)
svm.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std, y_combined,
                      classifier=svm, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/support_vector_machine_rbf_iris_1.png', dpi=300)
plt.show()

svm = SVC(kernel='rbf', random_state=0, gamma=100.0, C=1.0)
svm.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std, y_combined,
                      classifier=svm, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/support_vector_machine_rbf_iris_2.png', dpi=300)
plt.show()

1.3.8 Classification with Decision Trees
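The tree in the next subsection is grown with criterion='entropy'. For reference (added, not in the original notebook), the entropy impurity of a node t over c classes is

$$I_H(t)=-\sum_{i=1}^{c} p(i\mid t)\,\log_2 p(i\mid t),$$

and each split is chosen to maximize the information gain, i.e. the drop in impurity from the parent node to its children.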

1.3.9 Building a Decision Tree
from sklearn.tree import DecisionTreeClassifier
# decision trees are insensitive to feature scaling, so the raw features are used here
tree = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)
tree.fit(X_train, y_train)
X_combined = np.vstack((X_train, X_test))
y_combined = np.hstack((y_train, y_test))
plot_decision_regions(X_combined, y_combined,
                      classifier=tree, test_idx=range(105, 150))
plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/decision_tree_decision.png', dpi=300)
plt.show()

from sklearn.tree import export_graphviz
export_graphviz(tree,
                out_file='tree.dot',
                feature_names=['petal length', 'petal width'])
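The exported tree.dot file can then be rendered to an image with the GraphViz command-line tool, for example by running dot -Tpng tree.dot -o tree.png in a terminal (this requires a local GraphViz installation).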

from IPython.display import Image
from IPython.display import display

if Version(sklearn_version) >= '0.18':
    try:
        import pydotplus
        dot_data = export_graphviz(
            tree,
            out_file=None,
            # the parameters below are new in sklearn 0.18
            feature_names=['petal length', 'petal width'],
            class_names=['setosa', 'versicolor', 'virginica'],
            filled=True,
            rounded=True)
        graph = pydotplus.graph_from_dot_data(dot_data)
        display(Image(graph.create_png()))
    except ImportError:
        print('pydotplus is not installed.')

1.3.10 Tree Ensembles and Random Forests
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(criterion='entropy',
                                n_estimators=10,
                                random_state=1,
                                n_jobs=2)
forest.fit(X_train, y_train)
plot_decision_regions(X_combined, y_combined,
                      classifier=forest, test_idx=range(105, 150))
plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/random_forest.png', dpi=300)
plt.show()
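As a small aside (an addition, not part of the original notebook), a fitted random forest also reports how much each feature contributed to its splits via the feature_importances_ attribute:

# per-feature importance scores; they sum to 1
for name, score in zip(['petal length', 'petal width'], forest.feature_importances_):
    print('%s: %.3f' % (name, score))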

1.3.11 K-Nearest Neighbors
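The classifier below measures closeness with the Minkowski distance (definition added for reference),

$$d\bigl(\mathbf{x}^{(i)},\mathbf{x}^{(j)}\bigr)=\Bigl(\sum_{k}\bigl\lvert x_{k}^{(i)}-x_{k}^{(j)}\bigr\rvert^{p}\Bigr)^{1/p},$$

which with p=2 is exactly the Euclidean distance (and with p=1 the Manhattan distance).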

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
knn.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std, y_combined,
                      classifier=knn, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/k_nearest_neighbors.png', dpi=300)
plt.show()
