1knn算法

import numpy as np
import matplotlib.pyplot as plt

# Toy training set: ten 2-D samples, one [feature_0, feature_1] pair per row.
row_data_x = [
    [3.39, 2.33],
    [3.11, 1.78],
    [1.34, 3.37],
    [3.58, 4.68],
    [2.28, 2.87],
    [7.42, 4.69],
    [5.75, 3.53],
    [9.17, 2.51],
    [7.79, 3.42],
    [7.93, 0.79],
]

# Class label (0 or 1) for each row of row_data_x, in the same order.
row_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

x_train = np.array(row_data_x)
# Must be built from row_data_y, the label list defined just above.
y_train = np.array(row_data_y)

# The query point whose class we want to predict.
x = np.array([8.09, 3.37])

# Boolean masks on y_train pick out the rows of each class; column 0 is drawn
# on the horizontal axis and column 1 on the vertical axis.
plt.scatter(x_train[y_train == 0, 0], x_train[y_train == 0, 1], color='g', marker='o')
plt.scatter(x_train[y_train == 1, 0], x_train[y_train == 1, 1], color='r', marker='+')
# The query point is drawn in blue.
plt.scatter(x[0], x[1], color='b')
plt.show()

1.1knn的过程

import numpy as np
from math import sqrt
import matplotlib.pyplot as plt

from collections import Counter

# Toy training set: ten 2-D samples, one [feature_0, feature_1] pair per row.
row_data_x = [
    [3.39, 2.33],
    [3.11, 1.78],
    [1.34, 3.37],
    [3.58, 4.68],
    [2.28, 2.87],
    [7.42, 4.69],
    [5.75, 3.53],
    [9.17, 2.51],
    [7.79, 3.42],
    [7.93, 0.79],
]

# Class label (0 or 1) for each row of row_data_x, in the same order.
row_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

x_train = np.array(row_data_x)
# Must be built from row_data_y, the label list defined just above.
y_train = np.array(row_data_y)

# The query point whose class we want to predict.
x = np.array([8.09, 3.37])

# Step 1: Euclidean distance from the query point to every training sample.
distances = [sqrt(np.sum((sample - x) ** 2)) for sample in x_train]

# Step 2: training-sample indices ordered from nearest to farthest.
nearest = np.argsort(distances)

# Step 3: labels of the k nearest samples.
k = 6
topK_y = [y_train[i] for i in nearest[:k]]

# Step 4: majority vote among those labels.
votes = Counter(topK_y)
# The two most frequent labels, as (label, count) pairs.
tem = votes.most_common(2)
# The single most frequent label is the prediction.
predict_y = votes.most_common(1)[0][0]

1.2scikit-learn中的knn

from sklearn.neighbors import KNeighborsClassifier

# Build a k-nearest-neighbours classifier that votes among 6 neighbours and
# fit it on the training data prepared in the previous section.
Knn_classifier = KNeighborsClassifier(n_neighbors=6)
Knn_classifier.fit(x_train, y_train)

# predict() takes a 2-D array with one row per sample, so the single 1-D
# query vector is reshaped into a one-row matrix before being passed in.
x_predict = x.reshape(1, -1)
result = Knn_classifier.predict(x_predict)

1.3scikit-learn机器学习算法封装

import numpy as np
from math import sqrt
from collections import Counter

def KNN_classify(k, X_train, y_train, x):
    """Classify one sample by majority vote among its k nearest neighbours.

    Args:
        k: Number of neighbours that vote; must satisfy
            1 <= k <= X_train.shape[0].
        X_train: 2-D array of training samples, one sample per row.
        y_train: 1-D array of labels; y_train[i] is the label of X_train[i].
        x: 1-D feature vector of the sample to classify.

    Returns:
        The label that occurs most often among the k training samples closest
        to x (Euclidean distance). On a tie, Counter.most_common keeps the
        label that was encountered first, i.e. the one with the nearer sample.

    Raises:
        AssertionError: If k is out of range or X_train and y_train have a
            different number of rows.
    """
    assert 1 <= k <= X_train.shape[0], 'k must be valid'
    assert X_train.shape[0] == y_train.shape[0], '两个数据集尺寸需相同'

    # Euclidean distance from x to every training row in one vectorized pass.
    distances = np.sqrt(np.sum((X_train - x) ** 2, axis=1))
    # Row indices ordered from nearest to farthest.
    nearest = np.argsort(distances)
    topK_y = [y_train[i] for i in nearest[:k]]
    votes = Counter(topK_y)
    return votes.most_common(1)[0][0]

1.4封装自己的sklearn近邻算法

from collections import Counter
from math import sqrt

import numpy as np


class KNNClassifier:
    """Minimal k-nearest-neighbours classifier with a fit/predict interface."""

    def __init__(self, k):
        """Create an unfitted classifier that votes among k neighbours.

        Raises:
            AssertionError: If k is smaller than 1.
        """
        assert k >= 1, "k must be valid"
        self.k = k
        # Training data; both stay None until fit() is called.
        self._x_train = None
        self._y_train = None

    def __repr__(self):
        return "KNNClassifier(k=%r)" % (self.k,)

    def fit(self, x_train, y_train):
        """Store the training data and return self so calls can be chained.

        Args:
            x_train: 2-D array of training samples, one sample per row.
            y_train: 1-D array of labels; y_train[i] labels x_train[i].

        Raises:
            AssertionError: If the row counts differ or there are fewer than
                k training samples.
        """
        assert x_train.shape[0] == y_train.shape[0], "the size of x_train must be equal to the size of y_train"
        assert self.k <= x_train.shape[0], "the size of x_train must be at least k"
        # kNN has no training step beyond remembering the data.
        self._x_train = x_train
        self._y_train = y_train
        return self

    def predict(self, x_predict):
        """Predict a label for every row of x_predict.

        Args:
            x_predict: 2-D array of samples to classify, one sample per row,
                with the same number of columns as the training data.

        Returns:
            A NumPy array holding one predicted label per input row.

        Raises:
            AssertionError: If called before fit() or if the feature counts
                do not match.
        """
        assert self._x_train is not None and self._y_train is  not None, "must fit before predict"
        assert x_predict.shape[1] == self._x_train.shape[1],"the feature number of x_predict must be equal to x_train"
        y_predict = [self._predict(x) for x in x_predict]
        return np.array(y_predict)

    def _predict(self, x):
        """Return the majority label among the k training samples nearest to x."""
        assert x.shape[0] == self._x_train.shape[1],"the feature number of x must be equal to x_train"
        # Euclidean distance from x to every training row in one vectorized pass.
        distances = np.sqrt(np.sum((self._x_train - x) ** 2, axis=1))
        # Row indices ordered from nearest to farthest.
        nearest = np.argsort(distances)
        topK_y = [self._y_train[i] for i in nearest[:self.k]]
        votes = Counter(topK_y)
        return votes.most_common(1)[0][0]

2算法效率测试

import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier

iris = datasets.load_iris()
x = iris.data
y = iris.target

# Shuffle the row indices 0 .. len(x) - 1; the shuffled order is then used to
# index into the data set.
shuffle_indexes = np.random.permutation(len(x))
# Fraction of the data held out for testing.
test_ratio = 0.2
# Number of samples in the test set.
test_size = int(len(x) * test_ratio)
# The first test_size shuffled indices form the test set ...
test_indexes = shuffle_indexes[:test_size]
# ... and the remaining ones form the training set.
train_indexes = shuffle_indexes[test_size:]

# Slice features and labels with the same index arrays so rows stay aligned.
x_train = x[train_indexes]
y_train = y[train_indexes]
x_test = x[test_indexes]
y_test = y[test_indexes]

# Create a kNN classifier instance.
my_knn_clf = KNeighborsClassifier(n_neighbors=3)
# Fit it on the training set.
my_knn_clf.fit(x_train, y_train)
# Predict labels for the test set.
y_predict = my_knn_clf.predict(x_test)
# Accuracy: fraction of test samples whose prediction matches the true label.
percent = np.sum(y_predict == y_test) / len(y_test)

2数字识别

ipython中执行??func，可以查看函数func的源码

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

digits = datasets.load_digits()
print(digits.keys())
descr = digits.DESCR
print(descr)
x = digits.data
y = digits.target

# Take one sample (a flat vector of 64 values) and reshape it to 8x8 so it
# can be displayed as an image.
some_digit = x[666]
some_digit_image = some_digit.reshape(8, 8)
# According to the original note, this sample renders as a handwritten 0.
plt.imshow(some_digit_image, cmap=matplotlib.cm.binary)
plt.show()

# Hold out 20% of the samples for testing.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)
y_predict = knn.predict(x_test)

# Three ways of computing the accuracy on the test set. The values are
# printed so they are visible when this runs as a script, not only in an
# interactive session.
# 1) By hand: fraction of predictions equal to the true labels.
print("accuracy (manual):", sum(y_predict == y_test) / len(y_test))
# 2) From existing predictions, via sklearn.metrics.accuracy_score.
print("accuracy (accuracy_score):", accuracy_score(y_test, y_predict))
# 3) Straight from the fitted classifier, which predicts internally.
print("accuracy (knn.score):", knn.score(x_test, y_test))

3封装自己的训练测试分割集

import numpy as np
def train_test_split(x, y, test_radio=0.2, random_state=None):
    """Randomly split x and y into training and test subsets.

    Args:
        x: Array of samples, indexed along its first axis.
        y: Array of labels with the same number of rows as x.
        test_radio: Fraction of rows placed in the test set, between 0 and 1.
            (The parameter name is kept as spelled for backward
            compatibility.)
        random_state: Optional seed. When it is not None, NumPy's global
            random generator is seeded with it before shuffling, so the same
            seed always yields the same split. 0 is a valid seed.

    Returns:
        A tuple (x_train, x_test, y_train, y_test).

    Raises:
        AssertionError: If x and y have different row counts or test_radio
            is outside [0, 1].
    """
    assert x.shape[0] == y.shape[0],'x和y行数需相同'
    assert 0 <= test_radio <=1, '分割比例需在0-1之间'

    # Compare against None explicitly: a plain truthiness test would silently
    # ignore the perfectly valid seed 0.
    if random_state is not None:
        np.random.seed(random_state)

    shuffled_indexes = np.random.permutation(x.shape[0])
    test_size = int(x.shape[0] * test_radio)
    # The first test_size shuffled indices form the test set, the rest the
    # training set.
    test_indexes = shuffled_indexes[:test_size]
    train_indexes = shuffled_indexes[test_size:]

    # Index x and y with the same arrays so samples and labels stay aligned.
    x_train = x[train_indexes]
    y_train = y[train_indexes]
    x_test = x[test_indexes]
    y_test = y[test_indexes]
    return x_train, x_test, y_train, y_test