Hyperparameters: parameters that must be chosen before the algorithm runs;
model parameters: parameters learned by the algorithm during training.
The KNN algorithm has no model parameters;
the k in KNN is a typical hyperparameter.
"Tuning" a model means tuning its hyperparameters.
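A quick illustration in sklearn (a minimal sketch): k is fixed in the constructor before training ever happens.
from sklearn.neighbors import KNeighborsClassifier
# n_neighbors (k) and weights are hyperparameters: set before training
knn = KNeighborsClassifier(n_neighbors=5, weights="uniform")
print(knn.get_params())  # only hyperparameters; KNN learns no model parameters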
from sklearn import datasets
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
digits = datasets.load_digits()
print(digits.keys())
descr=digits.DESCR
print(descr)
x=digits.data
y=digits.target
# some_digit is a single sample of 64 values
some_digit = x[666]
some_digit_image = some_digit.reshape(8,8)
# when visualized, it shows the handwritten digit 0
plt.imshow(some_digit_image, cmap=matplotlib.cm.binary)
plt.show()
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)
from sklearn.neighbors import KNeighborsClassifier
best_score = 0.0
best_k = -1
best_method = ""
for method in ["uniform", "distance"]:
    for k in range(1, 11):
        knn = KNeighborsClassifier(n_neighbors=k, weights=method)
        knn.fit(x_train, y_train)
        # accuracy on the test set
        score = knn.score(x_test, y_test)
        if score > best_score:
            best_k = k
            best_score = score
            best_method = method
print("best_k =", best_k)
print("best_score =", best_score)
print("best_method =", best_method)
1 Common Distance Formulas and Finding the Best Hyperparameters
# Manhattan distance; x(a), x(b) denote the feature vectors of points a and b
(sum(abs(x(a) - x(b)) ** 1)) ** (1 / 1)
# Euclidean distance
(sum(abs(x(a) - x(b)) ** 2)) ** (1 / 2)
# Minkowski distance (p=1 gives Manhattan, p=2 gives Euclidean)
(sum(abs(x(a) - x(b)) ** p)) ** (1 / p)
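The same formulas as a runnable NumPy sketch (the helper name minkowski is my own):
import numpy as np

def minkowski(a, b, p):
    # (sum_i |a_i - b_i|^p)^(1/p): p=1 gives Manhattan, p=2 Euclidean
    return np.sum(np.abs(a - b) ** p) ** (1 / p)

a = np.array([1.0, 2.0, 3.0])
b = np.array([4.0, 0.0, 3.0])
print(minkowski(a, b, 1))  # Manhattan: 3 + 2 + 0 = 5.0
print(minkowski(a, b, 2))  # Euclidean: sqrt(9 + 4 + 0) ≈ 3.6056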
from sklearn import datasets
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
digits = datasets.load_digits()
print(digits.keys())
descr=digits.DESCR
print(descr)
x=digits.data
y=digits.target
# some_digit is a single sample of 64 values
some_digit = x[666]
some_digit_image = some_digit.reshape(8,8)
# when visualized, it shows the handwritten digit 0
plt.imshow(some_digit_image, cmap=matplotlib.cm.binary)
plt.show()
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)
from sklearn.neighbors import KNeighborsClassifier
best_score = 0.0
best_k = -1
best_p = -1
for k in range(1, 11):
    for p in range(1, 6):
        knn = KNeighborsClassifier(n_neighbors=k, weights="distance", p=p)
        knn.fit(x_train, y_train)
        # accuracy on the test set
        score = knn.score(x_test, y_test)
        if score > best_score:
            best_k = k
            best_score = score
            best_p = p
print("best_k =", best_k)
print("best_score =", best_score)
print("best_p =", best_p)
2 Grid Search
param_grid = [
    {
        "weights": ["uniform"],
        "n_neighbors": [i for i in range(1, 11)]
    },
    {
        "weights": ["distance"],
        "n_neighbors": [i for i in range(1, 11)],
        "p": [i for i in range(1, 6)]
    },
]
from sklearn import datasets
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
digits = datasets.load_digits()
print(digits.keys())
descr=digits.DESCR
print(descr)
x=digits.data
y=digits.target
# some_digit is a single sample of 64 values
some_digit = x[666]
some_digit_image = some_digit.reshape(8,8)
# when visualized, it shows the handwritten digit 0
plt.imshow(some_digit_image, cmap=matplotlib.cm.binary)
plt.show()
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(knn, param_grid)
grid_search.fit(x_train, y_train)
grid_search.best_estimator_
# best cross-validation accuracy
grid_search.best_score_
# parameter values that achieved the best accuracy
grid_search.best_params_
# get the classifier refit with the best parameters
knn = grid_search.best_estimator_
knn.predict(x_test)
knn.score(x_test, y_test)
# n_jobs sets how many CPU cores the search uses (-1 means all); verbose prints progress during the search
grid_search = GridSearchCV(knn, param_grid, n_jobs=-1,verbose=2)
grid_search.fit(x_train, y_train)
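To inspect every combination the search tried, not just the winner, the fitted search object exposes cv_results_; a small sketch (using pandas here is my own choice):
import pandas as pd

# one row per parameter combination tried during the search
results = pd.DataFrame(grid_search.cv_results_)
print(results[["params", "mean_test_score", "rank_test_score"]]
      .sort_values("rank_test_score").head())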
3 Data Normalization
Normalization: map all features onto the same scale.
Min-max scaling: map every value into [0, 1] via (x - x_min) / (x_max - x_min).
Mean-variance scaling (standardization): for data without clear boundaries, possibly containing extreme values; it maps all data to a distribution with mean 0 and variance 1 by subtracting the mean and dividing by the standard deviation: (x - x_mean) / S.
# min-max scaling
%matplotlib
import numpy as np
import matplotlib.pyplot as plt
x= np.random.randint(0,100,size=100)
(x-np.min(x))/(np.max(x)-np.min(x))
x= np.random.randint(0,100,size=(50,2))
x = np.array(x, dtype=float)
x[:,0]=(x[:,0]-np.min(x[:,0]))/(np.max(x[:,0])-np.min(x[:, 0]))
x[:,1]=(x[:,1]-np.min(x[:,1]))/(np.max(x[:,1])-np.min(x[:, 1]))
plt.scatter(x[:,0], x[:, 1])
np.mean(x[:, 0])
np.std(x[:, 0])
# mean-variance scaling
x2=np.random.randint(0, 100, (50,2))
x2 = np.array(x2,dtype=float)
x2[:,0]=(x2[:,0]-np.mean(x2[:,0]))/np.std(x2[:,0])
x2[:,1]=(x2[:,1]-np.mean(x2[:,1]))/np.std(x2[:,1])
plt.scatter(x2[:,0],x2[:,1])
np.mean(x2[:,0])
np.std(x2[:,0])
After mean-variance scaling has been fit on the training set, the test set must be normalized with the training set's mean and standard deviation, so both live on the same scale.
This is because the test data simulates the real environment: in production you usually cannot obtain the mean and standard deviation of all incoming data, so normalizing with the training statistics is part of the algorithm itself.
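A minimal NumPy sketch of that rule (toy data; variable names are illustrative):
import numpy as np

x_train = np.random.randint(0, 100, (50, 2)).astype(float)
x_test = np.random.randint(0, 100, (10, 2)).astype(float)

# the statistics come from the training set only
mean_train = np.mean(x_train, axis=0)
std_train = np.std(x_train, axis=0)

# both sets are scaled with the *training* mean and standard deviation
x_train_std = (x_train - mean_train) / std_train
x_test_std = (x_test - mean_train) / std_train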
In sklearn, normalization is done with Scaler classes.
Quantities an sklearn estimator learns from the data during fit are exposed as attributes ending in an underscore (just like the attributes you can inspect after fitting knn); after fitting a scaler, mean_ holds the per-feature means and scale_ the per-feature standard deviations.
Scalers in scikit-learn
3.1 Mean-Variance Scaling
import numpy as np
from sklearn import datasets
iris = datasets.load_iris()
x = iris.data
y = iris.target
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=666)
from sklearn.preprocessing import StandardScaler
standardScaler = StandardScaler()
standardScaler.fit(x_train)
standardScaler.mean_
standardScaler.scale_
x_train=standardScaler.transform(x_train)
x_test_standard=standardScaler.transform(x_test)
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(x_train,y_train)
knn_clf.score(x_test_standard, y_test)
3.2 Min-Max Scaling
import numpy as np
from sklearn import datasets
iris = datasets.load_iris()
x = iris.data
y = iris.target
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=666)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(x_train)
scaler.min_
scaler.scale_
x_train = scaler.transform(x_train)
x_test_scale = scaler.transform(x_test)
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(x_train,y_train)
knn_clf.score(x_test_scale, y_test)
4 Implementing Your Own Normalization Class
import numpy as np
class Standard:
    def __init__(self):
        self.mean_ = None
        self.scale_ = None

    def fit(self, x):
        assert x.ndim == 2, 'x must be 2-dimensional'
        # per-column mean and standard deviation, mirroring StandardScaler
        self.mean_ = np.array([np.mean(x[:, i]) for i in range(x.shape[1])])
        self.scale_ = np.array([np.std(x[:, i]) for i in range(x.shape[1])])
        return self

    def transform(self, x):
        assert x.ndim == 2, 'x must be 2-dimensional'
        assert self.mean_ is not None and self.scale_ is not None, 'must fit before transform'
        assert x.shape[1] == len(self.mean_), 'x must have the same number of columns as the fitted data'
        resX = np.empty(shape=x.shape, dtype=float)
        for col in range(x.shape[1]):
            resX[:, col] = (x[:, col] - self.mean_[col]) / self.scale_[col]
        return resX
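A quick check of the class above on toy data:
x_train = np.random.randint(0, 100, (50, 2)).astype(float)
x_test = np.random.randint(0, 100, (10, 2)).astype(float)

scaler = Standard()
scaler.fit(x_train)
x_train_std = scaler.transform(x_train)
x_test_std = scaler.transform(x_test)   # reuses the training mean_ and scale_

print(np.mean(x_train_std[:, 0]))  # close to 0
print(np.std(x_train_std[:, 0]))   # close to 1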
5 Strengths and Weaknesses of the KNN Algorithm
5.1 Strengths
KNN solves classification problems and naturally handles multi-class classification; the idea is simple yet powerful, and the results are strong. The same idea also solves regression problems.
sklearn's KNeighborsRegressor implements KNN regression; a minimal sketch follows.
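A minimal sketch, using the diabetes dataset purely as an illustrative regression target:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

data = datasets.load_diabetes()
x_train, x_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=666)

# the prediction is the distance-weighted average of the k nearest targets
reg = KNeighborsRegressor(n_neighbors=5, weights="distance")
reg.fit(x_train, y_train)
print(reg.score(x_test, y_test))  # R^2 on the test set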
5.2 Weaknesses
Biggest weakness: low efficiency; predicting one sample means comparing it against the entire training set. Tree structures can speed this up (e.g., KNeighborsClassifier(algorithm='kd_tree')).
Weakness 2: highly data-dependent; a few mislabeled nearby samples can flip a prediction.
Weakness 3: predictions are not interpretable.
Weakness 4: the curse of dimensionality; as dimensions increase, the distance between two "seemingly close" points keeps growing (see the sketch below). The usual remedy is dimensionality reduction.
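A small experiment illustrating weakness 4 (the setup is my own): the distance from the all-zeros point to the all-ones point grows as the square root of the dimension.
import numpy as np

# Euclidean distance from (0, ..., 0) to (1, ..., 1) in d dimensions is sqrt(d)
for d in [1, 2, 10, 100, 1000, 10000]:
    print(d, np.linalg.norm(np.ones(d) - np.zeros(d)))
# 1 -> 1.0, 100 -> 10.0, 10000 -> 100.0: "neighboring" corners drift apart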
6 Review of the Machine Learning Workflow
1. Split the dataset into a training set and a test set.
2. Normalize the data onto one scale (min-max or mean-variance scaling).
3. Train the model on the training set.
4. Predict on the test set.
5. Check the classification accuracy.
6. Use grid search to find the best hyperparameters (see the sketch below).
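A compact end-to-end sketch of steps 1-6, reusing the digits data from earlier; Pipeline is standard sklearn and ensures the scaler is fit on the training folds only:
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

digits = datasets.load_digits()
x_train, x_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.2, random_state=666)

# steps 2 and 3 chained: scaling happens inside each cross-validation split
pipe = Pipeline([("scaler", StandardScaler()),
                 ("knn", KNeighborsClassifier())])
param_grid = {"knn__n_neighbors": range(1, 11),
              "knn__weights": ["uniform", "distance"]}
grid = GridSearchCV(pipe, param_grid, n_jobs=-1)
grid.fit(x_train, y_train)                # step 6: grid search
print(grid.best_params_)
print(grid.score(x_test, y_test))         # step 5: test accuracy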