参考:
scikit-learn (sklearn) 官方文档中文版
多分类问题OvO,OvR,MvM
在python sklearn使用 SVM做分类
Sklearn之支持向量机分类
SVM:利用sklearn 实现SVM分类 相关参数说明ING
- 样本数据集
hua_se huaban_yeshu huaban_type
101 1 3
102 1 3
103 2 3
104 1 3
105 3 3
106 1 3
107 3 3
109 4 3
110 2 3
101 27 3
102 28 3
103 28 3
104 29 3
105 2 3
106 27 3
107 29 3
109 30 3
110 30 3
101 4 3
102 4 3
103 3 3
104 2 3
105 1 3
106 1 3
107 2 3
109 2 3
110 4 3
101 29 3
102 30 3
103 30 3
104 29 3
105 27 3
106 28 3
107 29 3
109 29 3
110 30 3
1 1 1
2 2 1
3 3 1
4 1 1
5 4 1
6 3 1
7 4 1
9 2 1
8 3 1
10 1 1
6 2 1
7 3 1
3 1 1
5 3 1
5 4 1
2 3 1
3 2 1
2 3 1
2 2 1
10 1 1
9 3 1
7 1 1
9 4 1
4 3 1
6 3 1
3 1 1
7 1 1
1 2 1
8 4 1
10 4 1
10 27 2
9 29 2
8 29 2
7 30 2
5 29 2
6 27 2
4 27 2
3 28 2
1 29 2
2 29 2
10 29 2
9 29 2
8 30 2
7 30 2
5 27 2
6 28 2
4 28 2
3 29 2
1 27 2
2 30 2
10 30 2
9 29 2
8 30 2
7 28 2
5 29 2
6 30 2
4 30 2
3 30 2
1 27 2
2 29 2
二分类(只取类别 1 和类别 2 的样本,即代码中的 df[36:97]):
# Binary SVM classification demo: train an RBF-kernel SVC on two of the three
# flower classes, report hold-out accuracy, then classify one hand-made sample.
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23.  Prefer the standalone `joblib` package (installed as a scikit-learn
# dependency) and fall back to the bundled copy only on very old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Show every column when printing a DataFrame.
pd.set_option('display.max_columns', None)
# Show every row when printing a DataFrame.
pd.set_option('display.max_rows', None)
# Let printed frames use up to 10000 characters per line so rows do not wrap.
pd.set_option('display.width', 10000)
# Align columns correctly when cells contain East-Asian wide characters.
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
# Same idea for NumPy arrays: avoid wrapping long rows.
np.set_printoptions(linewidth=1000)

# Raw string: the original literal relied on invalid escapes (`\s`, `\d`),
# which are deprecated; the resulting path is unchanged.
df = pd.read_csv(r'G:\rasa_demo\stack\data\train.csv.txt', sep='\t', encoding='GBK', header=0)
# Keep rows 36 onward only.  NOTE(review): assumes the file is ordered like the
# sample table (first 36 rows are class 3), so this leaves classes 1 and 2.
df = df[36:97]
# print(df)
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Optional: fill missing values with the most frequent value of each column.
# imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent', copy=True)
# df = imp.fit_transform(df)
# df = pd.DataFrame(df)
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# # Check whether the data contains missing values (either form works).
# # False: the feature has no missing values
# # True: the feature has missing values
# print(df.isnull().any())
# print(np.isnan(df).any())
# print(np.isfinite(df).all())
# # # Inspect the records that contain missing values.
# # df_null = pd.isnull(df)
# # df_null = df[df_null == True]
# # print(df_null)
# # Handle missing values (either approach works).
# # Drop rows that contain missing values.
# df.dropna(inplace=True)
# # # Or fill missing values.
# # df.fillna('10.0')
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# print(df.columns)
# print(df[['hua_se','huaban_yeshu']])
# print(df.iloc[:,[0,1]])
print(df.iloc[0:3, [2]])

# First two columns are the features, third column is the class label.
X = df.iloc[:, [0, 1]]
Y = df.iloc[:, [2]]
print(X.shape, Y.shape)
# Y = Y.values.reshape(-1,1)
# scikit-learn expects a 1-D label vector, hence ravel().
print(Y.values.ravel())
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y.values.ravel(), test_size=0.2, random_state=35)
# print(type(X_test))

svm_double_clf_model = SVC(
    C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
svm_double_clf_model.fit(X_train, Y_train)
result = svm_double_clf_model.predict(X_test)

# Accuracy = fraction of test samples whose predicted label equals the truth.
accuracy = float(np.mean(result == Y_test))
print('准确率:', accuracy)

# # Save the trained model (.pkl is the conventional extension for joblib dumps).
# joblib.dump(svm_double_clf_model, r'G:\rasa_demo\stack\model\svm_double_clf_model.pkl')
# print('模型保存成功!')
# # Load the previously trained model.
# model = joblib.load(r'G:\rasa_demo\stack\model\svm_double_clf_model.pkl')
# pred_y = model.predict(X_test)

# Classify a single new sample; the column names must match the training frame.
pred_x = {'hua_se': 12, 'huaban_yeshu': 7}
tmp = pd.DataFrame(pred_x, index=[0])
print(tmp)
print(svm_double_clf_model.predict(tmp))
多分类(使用全部三个类别的样本):
# Multi-class SVM classification demo: train an RBF-kernel SVC on the full data
# set, report hold-out accuracy, persist the model with joblib, reload it, and
# classify one hand-made sample.
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23.  Prefer the standalone `joblib` package (installed as a scikit-learn
# dependency) and fall back to the bundled copy only on very old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Show every column when printing a DataFrame.
pd.set_option('display.max_columns', None)
# Show every row when printing a DataFrame.
pd.set_option('display.max_rows', None)
# Let printed frames use up to 10000 characters per line so rows do not wrap.
pd.set_option('display.width', 10000)
# Align columns correctly when cells contain East-Asian wide characters.
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
# Same idea for NumPy arrays: avoid wrapping long rows.
np.set_printoptions(linewidth=1000)

# Raw strings: the original literals relied on invalid escapes (`\s`, `\d`,
# `\m`), which are deprecated; the resulting paths are unchanged.
df = pd.read_csv(r'G:\rasa_demo\stack\data\train.csv.txt', sep='\t', encoding='GBK', header=0)
# print(df)
# print(df.columns)
# print(df[['hua_se','huaban_yeshu']])
# print(df.iloc[:,[0,1]])
print(df.iloc[0:3, [2]])

# First two columns are the features, third column is the class label.
X = df.iloc[:, [0, 1]]
Y = df.iloc[:, [2]]
print(X.shape, Y.shape)
# Y = Y.values.reshape(-1,1)
# scikit-learn expects a 1-D label vector, hence ravel().
print(Y.values.ravel())
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y.values.ravel(), test_size=0.2, random_state=35)
# print(type(X_test))

svm_multi_clf_model = SVC(
    C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
# Alternative kernels to experiment with:
# svm_multi_clf_model = SVC(kernel='linear')  # linear kernel
# svm_multi_clf_model = SVC(kernel='poly', degree=3)  # polynomial kernel
# svm_multi_clf_model = SVC(kernel='rbf', C=1)  # radial-basis-function kernel
svm_multi_clf_model.fit(X_train, Y_train)
result = svm_multi_clf_model.predict(X_test)

# Accuracy = fraction of test samples whose predicted label equals the truth.
accuracy = float(np.mean(result == Y_test))
print('准确率:', accuracy)

# Save the trained model (.pkl is the conventional extension for joblib dumps).
joblib.dump(svm_multi_clf_model, r'G:\rasa_demo\stack\model\svm_multi_clf_model.pkl')
print('模型保存成功!')

# Load the model back and run it on the test split to show the round trip;
# the predictions are not used further.
model = joblib.load(r'G:\rasa_demo\stack\model\svm_multi_clf_model.pkl')
pred_y = model.predict(X_test)

# Classify a single new sample; the column names must match the training frame.
pred_x = {'hua_se': 2, 'huaban_yeshu': 27}
tmp = pd.DataFrame(pred_x, index=[0])
print(tmp)
print(svm_multi_clf_model.predict(tmp))
盗图
网友评论