在网上没找到比较直观的多标签分类例子,于是自己写了一个,起码在入门这个领域的时候能有个直观的认识。下面是用scikit-learn做多标签分类的一个很简单的实现。
数据准备
这次用的是emotions数据集,文件是arff格式。一共593个instance,共78列:前72列是features,最后6列是labels。
import arff, numpy as np
# Parse the ARFF file from a text-mode handle and close it as soon as parsing
# is done (the original left a binary-mode handle open).
with open('emotions.arff', 'r') as arff_file:
    dataset = arff.load(arff_file)
# Force a float dtype so the data format is well defined; otherwise clf.fit
# fails later on. np.float64 replaces the np.float alias, which was removed
# in NumPy 1.24.
data = np.array(dataset['data'], dtype=np.float64)
data.shape
# output
(593, 78)
# Extract the features: the first 72 columns (everything except the last 6).
data[:, :-6]
# output
array([[ 0.034741, 0.089665, 0.091225, ..., 0.245457, 0.105065,
0.405399],
[ 0.081374, 0.272747, 0.085733, ..., 0.343547, 0.276366,
0.710924],
[ 0.110545, 0.273567, 0.08441 , ..., 0.188693, 0.045941,
0.457372],
...,
[ 0.042903, 0.089283, 0.080263, ..., 0.366192, 0.289227,
0.66168 ],
[ 0.038987, 0.05957 , 0.082053, ..., 0.581526, 0.047156,
0.774458],
[ 0.084866, 0.192814, 0.084549, ..., 0.533746, 0.587807,
1.121553]])
# Extract the labels: the last 6 columns.
data[:, -6:]
# output
array([[ 0., 1., 1., 0., 0., 0.],
[ 1., 0., 0., 0., 0., 1.],
[ 0., 1., 0., 0., 0., 1.],
...,
[ 0., 1., 1., 0., 0., 0.],
[ 0., 0., 0., 1., 1., 0.],
[ 0., 1., 1., 0., 0., 0.]])
# Split the dataset into a training set and a test set.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    data[:, :-6],     # features: first 72 columns
    data[:, -6:],     # labels: last 6 columns
    test_size=0.33,   # hold out 33% of the instances for testing
    random_state=42,  # fixed seed for the shuffle
)
# One-vs-rest classifier wrapping an SVM with a linear kernel.
# The two imports below were missing from the original snippet, which made
# this line fail with NameError.
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
clf1 = OneVsRestClassifier(SVC(kernel='linear'), n_jobs=-1)
# Train on the training split.
clf1.fit(X_train, y_train)
# output
OneVsRestClassifier(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,decision_function_shape=None, degree=3, gamma='auto', kernel='linear', max_iter=-1, probability=False, random_state=None, shrinking=True,tol=0.001, verbose=False),n_jobs=-1)
# Predict the labels for the test set and display them.
predict_class = clf1.predict(X_test)
predict_class
# output
array([[0, 1, 1, 0, 0, 0],
[1, 1, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
...,
[0, 0, 1, 1, 1, 0],
[1, 0, 0, 0, 0, 1],
[0, 0, 1, 1, 1, 0]])
# Accuracy: compares the predicted labels with the actual labels of the test set.
# NOTE(review): for multilabel targets scikit-learn documents score() as subset
# accuracy (a sample counts as correct only if every label matches), which
# would explain the low value below; confirm against the installed version.
clf1.score(X_test, y_test)
# output
0.27040816326530615
网友评论