使用sklearn库：SVM

作者: 还闹不闹 | 来源:发表于2020-05-29 21:23 被阅读0次

使用sklearn库：SVM
2018-12-16
sklearn中随机测试数据：sklearn包中SVM算法库的使
无标题文章
Support Vector Machine(2)
支持向量机-代码解析与SMO算法
机器学习入门
SVM(1)介绍
Sklearn学习笔记（三）
2019-01-25

参考：
scikit-learn (sklearn) 官方文档中文版
 多分类问题OvO,OvR,MvM
在python sklearn使用 SVM做分类
 Sklearn之支持向量机分类
 SVM:利用sklearn 实现SVM分类相关参数说明ING

样本数据集

hua_se  huaban_yeshu    huaban_type
101 1   3
102 1   3
103 2   3
104 1   3
105 3   3
106 1   3
107 3   3
109 4   3
110 2   3
101 27  3
102 28  3
103 28  3
104 29  3
105 2   3
106 27  3
107 29  3
109 30  3
110 30  3
101 4   3
102 4   3
103 3   3
104 2   3
105 1   3
106 1   3
107 2   3
109 2   3
110 4   3
101 29  3
102 30  3
103 30  3
104 29  3
105 27  3
106 28  3
107 29  3
109 29  3
110 30  3
1   1   1
2   2   1
3   3   1
4   1   1
5   4   1
6   3   1
7   4   1
9   2   1
8   3   1
10  1   1
6   2   1
7   3   1
3   1   1
5   3   1
5   4   1
2   3   1
3   2   1
2   3   1
2   2   1
10  1   1
9   3   1
7   1   1
9   4   1
4   3   1
6   3   1
3   1   1
7   1   1
1   2   1
8   4   1
10  4   1
10  27  2
9   29  2
8   29  2
7   30  2
5   29  2
6   27  2
4   27  2
3   28  2
1   29  2
2   29  2
10  29  2
9   29  2
8   30  2
7   30  2
5   27  2
6   28  2
4   28  2
3   29  2
1   27  2
2   30  2
10  30  2
9   29  2
8   30  2
7   28  2
5   29  2
6   30  2
4   30  2
3   30  2
1   27  2
2   29  2

二分类.

import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone joblib package and only fall back to the vendored
# copy on very old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
from sklearn.impute import SimpleImputer


# Binary-classification demo: train an RBF-kernel SVC on two features
# (hua_se, huaban_yeshu) to predict huaban_type, report the hold-out accuracy
# and classify one hand-written sample.

# Show every column when printing a DataFrame.
pd.set_option('display.max_columns', None)
# Show every row when printing a DataFrame.
pd.set_option('display.max_rows', None)
# Widen the console display so wide frames are not wrapped.
pd.set_option('display.width', 10000)
# Keep columns aligned when cells contain East Asian wide characters.
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
# Let numpy print long arrays on a single line.
np.set_printoptions(linewidth=1000)

# Raw string: the original literal relied on the invalid escapes "\s" and "\d"
# being passed through unchanged, which raises a warning on modern Python.
df = pd.read_csv(r'G:\rasa_demo\stack\data\train.csv.txt', sep='\t', encoding='GBK', header=0)
# Keep only the rows at positions 36..96.
# NOTE(review): presumably this drops the leading block of class-3 rows so that
# only two classes remain -- confirm against the actual data file.
df = df[36:97]
# print(df)
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Optional: impute missing values with the most frequent value per column.
# imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent', copy=True)
# df = imp.fit_transform(df)
# df = pd.DataFrame(df)
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# # Check whether the data contains missing values; either approach works.
# # False: the feature has no missing values
# # True: the feature has missing values
# print(df.isnull().any())
# print(np.isnan(df).any())
# print(np.isfinite(df).all())
# # # Inspect the records that contain missing values
# # df_null = pd.isnull(df)
# # df_null = df[df_null == True]
# # print(df_null)
# # Handle missing values; either approach works.
# # Drop the rows that contain missing values
# df.dropna(inplace=True)
# # # Fill the missing values
# # df.fillna('10.0')
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# print(df.columns)
# print(df[['hua_se','huaban_yeshu']])
# print(df.iloc[:,[0,1]])
print(df.iloc[0:3, [2]])
# First two columns are the features, the third column is the label.
X = df.iloc[:, [0, 1]]
Y = df.iloc[:, [2]]
print(X.shape, Y.shape)
# Y = Y.values.reshape(-1,1)
print(Y.values.ravel())
# Flatten the single-column label frame to a 1-D array before splitting;
# 20% of the rows are held out for testing, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y.values.ravel(), test_size=0.2, random_state=35)
# print(type(X_test))

svm_double_clf_model = SVC(
    C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
svm_double_clf_model.fit(X_train, Y_train)

# Accuracy = fraction of test samples whose prediction equals the true label.
result = svm_double_clf_model.predict(X_test)
accuracy = float(np.mean(result == Y_test))
print('准确率：', accuracy)

# # Save the trained model (.pkl is the extension conventionally used here)
# joblib.dump(svm_double_clf_model, r'G:\rasa_demo\stack\model\svm_double_clf_model.pkl')
# print('模型保存成功！')
# # Load the previously trained model
# model = joblib.load(r'G:\rasa_demo\stack\model\svm_double_clf_model.pkl')
# pred_y = model.predict(X_test)

# Classify one hand-written sample; the column names match the training frame.
pred_x = {'hua_se': 12, 'huaban_yeshu': 7}
tmp = pd.DataFrame(pred_x, index=[0])
print(tmp)
print(svm_double_clf_model.predict(tmp))

多分类.

import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone joblib package and only fall back to the vendored
# copy on very old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Multi-class demo: train an RBF-kernel SVC on two features
# (hua_se, huaban_yeshu) to predict huaban_type over the whole data set,
# report the hold-out accuracy, persist the model and classify one sample.

# Show every column when printing a DataFrame.
pd.set_option('display.max_columns', None)
# Show every row when printing a DataFrame.
pd.set_option('display.max_rows', None)
# Widen the console display so wide frames are not wrapped.
pd.set_option('display.width', 10000)
# Keep columns aligned when cells contain East Asian wide characters.
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
# Let numpy print long arrays on a single line.
np.set_printoptions(linewidth=1000)

# Raw string: the original literal relied on the invalid escapes "\s" and "\d"
# being passed through unchanged, which raises a warning on modern Python.
df = pd.read_csv(r'G:\rasa_demo\stack\data\train.csv.txt', sep='\t', encoding='GBK', header=0)
# print(df)
# print(df.columns)
# print(df[['hua_se','huaban_yeshu']])
# print(df.iloc[:,[0,1]])
print(df.iloc[0:3, [2]])
# First two columns are the features, the third column is the label.
X = df.iloc[:, [0, 1]]
Y = df.iloc[:, [2]]
print(X.shape, Y.shape)
# Y = Y.values.reshape(-1,1)
print(Y.values.ravel())
# Flatten the single-column label frame to a 1-D array before splitting;
# 20% of the rows are held out for testing, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y.values.ravel(), test_size=0.2, random_state=35)
# print(type(X_test))

# svm_double_clf_model = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
svm_multi_clf_model = SVC(
    C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
# Alternative kernels:
# svm_multi_clf_model = SVC(kernel='linear') # linear kernel
# svm_multi_clf_model = SVC(kernel='poly', degree=3) # polynomial kernel
# svm_multi_clf_model = SVC(kernel='rbf', C=1) # radial basis function kernel
svm_multi_clf_model.fit(X_train, Y_train)

# Accuracy = fraction of test samples whose prediction equals the true label.
result = svm_multi_clf_model.predict(X_test)
accuracy = float(np.mean(result == Y_test))
print('准确率：', accuracy)

# Raw string for the same reason as the CSV path above ("\s" and "\m" are not
# valid escapes); the resulting path is unchanged.
model_path = r'G:\rasa_demo\stack\model\svm_multi_clf_model.pkl'
# Save the trained model.
joblib.dump(svm_multi_clf_model, model_path)
print('模型保存成功！')
# Load the saved model back and run it on the test split.
model = joblib.load(model_path)
pred_y = model.predict(X_test)

# Classify one hand-written sample; the column names match the training frame.
pred_x = {'hua_se': 2, 'huaban_yeshu': 27}
tmp = pd.DataFrame(pred_x, index=[0])
print(tmp)
print(svm_multi_clf_model.predict(tmp))

盗图