美文网首页
第六节分类算法

第六节分类算法

作者: barriers | 来源:发表于2020-02-18 11:01 被阅读0次

    1knn算法

    import numpy as np
    import matplotlib.pyplot as plt
    
    # Toy training set: each row is one sample described by two features.
    row_data_x = [
        [3.39, 2.33],
        [3.11, 1.78],
        [1.34, 3.37],
        [3.58, 4.68],
        [2.28, 2.87],
        [7.42, 4.69],
        [5.75, 3.53],
        [9.17, 2.51],
        [7.79, 3.42],
        [7.93, 0.79],
    ]

    # Class label (0 or 1) of each row in row_data_x, in the same order.
    row_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

    x_train = np.array(row_data_x)
    # Fixed: this line used to read the undefined name `raw_data_y`, which
    # raised NameError; the label list is called `row_data_y`.
    y_train = np.array(row_data_y)
    # The new, unlabelled sample that is to be classified.
    x = np.array([8.09, 3.37])

    # A boolean mask on the labels selects the rows of one class; column 0 is
    # plotted on the horizontal axis and column 1 on the vertical axis.
    plt.scatter(x_train[y_train == 0, 0], x_train[y_train == 0, 1], color='g', marker='o')
    plt.scatter(x_train[y_train == 1, 0], x_train[y_train == 1, 1], color='r', marker='+')
    # The sample to classify is drawn in blue.
    plt.scatter(x[0], x[1], color='b')
    plt.show()
    

    1.1knn的过程

    import numpy as np
    from math import sqrt
    import matplotlib.pyplot as plt
    
    from collections import Counter

    # Toy training set: each row is one sample described by two features.
    row_data_x = [
        [3.39, 2.33],
        [3.11, 1.78],
        [1.34, 3.37],
        [3.58, 4.68],
        [2.28, 2.87],
        [7.42, 4.69],
        [5.75, 3.53],
        [9.17, 2.51],
        [7.79, 3.42],
        [7.93, 0.79],
    ]

    # Class label (0 or 1) of each row in row_data_x, in the same order.
    row_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

    x_train = np.array(row_data_x)
    # Fixed: this line used to read the undefined name `raw_data_y` (NameError).
    y_train = np.array(row_data_y)
    # The new, unlabelled sample that is to be classified.
    x = np.array([8.09, 3.37])

    # Euclidean distance from x to every training sample.
    distances = [sqrt(np.sum((sample - x) ** 2)) for sample in x_train]
    # Indices of the training samples ordered from nearest to farthest.
    # Fixed: the original sorted the undefined name `distance` (NameError).
    nearest = np.argsort(distances)
    k = 6
    # Labels of the k nearest training samples.
    topK_y = [y_train[i] for i in nearest[:k]]
    votes = Counter(topK_y)
    # The two most frequent labels as (label, count) pairs.
    tem = votes.most_common(2)
    # The single most frequent label is the prediction for x.
    predict_y = votes.most_common(1)[0][0]
    

    1.2scikit-learn中的knn

    # Fixed: the original line read `from sklearn.neigbors imports ...`, which
    # is a SyntaxError (`imports`) and misspells the `sklearn.neighbors` module.
    from sklearn.neighbors import KNeighborsClassifier

    # NOTE: x_train, y_train and x are not defined in this snippet; they are
    # expected to exist already (the earlier snippets define such names).
    Knn_classifier = KNeighborsClassifier(n_neighbors=6)
    Knn_classifier.fit(x_train, y_train)
    # predict() expects a 2-D array with one row per sample. The original also
    # called predict(x) on the 1-D sample first; current scikit-learn releases
    # reject 1-D input, and that result was overwritten anyway, so only the
    # reshaped call is kept.
    x_predict = x.reshape(1, -1)
    result = Knn_classifier.predict(x_predict)
    

    1.3scikit-learn机器学习算法封装

    import numpy as np
    from math import sqrt
    from collections import Counter
    
    def KNN_classify(k, X_train, y_train, x):
        """Classify one sample by majority vote among its k nearest neighbours.

        Args:
            k: Number of neighbours that vote; must satisfy
                1 <= k <= X_train.shape[0].
            X_train: NumPy array of training samples, one row per sample.
            y_train: NumPy array of labels with one entry per row of X_train.
            x: The sample to classify.

        Returns:
            The label that occurs most often among the k training samples
            closest to x (Euclidean distance).

        Raises:
            AssertionError: If k is out of range or X_train and y_train do not
                have the same number of rows.
        """
        assert 1 <= k <= X_train.shape[0], 'k must be valid'
        assert X_train.shape[0] == y_train.shape[0], '两个数据集尺寸需相同'
        # Euclidean distance from x to every training sample.
        distances = [sqrt(np.sum((sample - x) ** 2)) for sample in X_train]
        # Fixed: the original sorted the undefined name `distance`, so every
        # call failed with NameError.
        nearest = np.argsort(distances)
        # Labels of the k closest samples, nearest first.
        topK_y = [y_train[i] for i in nearest[:k]]
        votes = Counter(topK_y)
        return votes.most_common(1)[0][0]
    

    1.4封装自己的sklearn近邻算法

    # Fixed: the original first line was `import numpyas np` (missing space),
    # a SyntaxError that prevented the whole snippet from being loaded.
    import numpy as np
    from math import sqrt
    from collections import Counter


    class KNNClassifier:
        """k-nearest-neighbours classifier with a fit/predict interface.

        fit() only stores the training data; the distance computation and the
        majority vote happen in predict().
        """

        def __init__(self, k):
            """Create a classifier in which the k nearest samples vote.

            Raises:
                AssertionError: If k is smaller than 1.
            """
            assert k >= 1, "k must be valid"
            self.k = k
            # Training data; both stay None until fit() is called.
            self._x_train = None
            self._y_train = None

        def fit(self, x_train, y_train):
            """Store the training samples and labels and return self.

            Args:
                x_train: NumPy array with one row per training sample.
                y_train: NumPy array with one label per row of x_train.

            Raises:
                AssertionError: If the row counts differ or there are fewer
                    than k training samples.
            """
            assert x_train.shape[0] == y_train.shape[0], "the size of x_train must be equal to the size of y_train"
            assert self.k <= x_train.shape[0], "the size of x_train must be at least k"
            self._x_train = x_train
            self._y_train = y_train
            return self

        def predict(self, x_predict):
            """Predict a label for every row of the 2-D array x_predict.

            Returns:
                A NumPy array holding one predicted label per row.

            Raises:
                AssertionError: If fit() has not been called, or the number of
                    columns differs from the training data.
            """
            assert self._x_train is not None and self._y_train is not None, "must fit before predict"
            assert x_predict.shape[1] == self._x_train.shape[1], "the feature number of x_predict must be equal to x_train"
            y_predict = [self._predict(x) for x in x_predict]
            return np.array(y_predict)

        def _predict(self, x):
            """Return the majority label among the k training samples nearest to x."""
            assert x.shape[0] == self._x_train.shape[1], "the feature number of x must be equal to x_train"
            # Euclidean distance from x to every stored training sample.
            distances = [sqrt(np.sum((row - x) ** 2)) for row in self._x_train]
            nearest = np.argsort(distances)
            # Labels of the k closest samples, nearest first.
            topK_y = [self._y_train[i] for i in nearest[:self.k]]
            votes = Counter(topK_y)
            return votes.most_common(1)[0][0]
    

    2算法效率测试

    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import datasets
    # Fixed: the original line read `from sklearn.neigbors imports ...`, which
    # is a SyntaxError (`imports`) and misspells the `sklearn.neighbors` module.
    from sklearn.neighbors import KNeighborsClassifier

    iris = datasets.load_iris()
    x = iris.data
    y = iris.target
    # Shuffle the row indices 0 .. len(x)-1; the shuffled order is used to
    # index the data set.
    shuffle_indexes = np.random.permutation(len(x))
    # Fraction of the data reserved for testing.
    test_ratio = 0.2
    # Number of samples in the test set.
    test_size = int(len(x) * test_ratio)
    # Indices of the test samples.
    test_indexes = shuffle_indexes[:test_size]
    # Indices of the training samples.
    train_indexes = shuffle_indexes[test_size:]
    # Build the training and test sets.
    x_train = x[train_indexes]
    # Fixed: the original indexed with `train+indexes`, an expression over two
    # undefined names (NameError); `train_indexes` was intended.
    y_train = y[train_indexes]
    x_test = x[test_indexes]
    y_test = y[test_indexes]
    # Create a kNN classifier instance.
    my_knn_clf = KNeighborsClassifier(n_neighbors=3)
    # Feed it the training set.
    my_knn_clf.fit(x_train, y_train)
    # Predict labels for the test set.
    y_predict = my_knn_clf.predict(x_test)
    # Accuracy: the fraction of test samples that were predicted correctly.
    percent = sum(y_predict == y_test)/len(y_test)
    

    2数字识别

    ipython中执行??func,可以查看函数func的源码

    from sklearn import datasets
    
    import numpy as np
    import matplotlib
    import matplotlib.pyplot as plt
    
    # Load scikit-learn's bundled digits data set and print its keys and its
    # textual description.
    digits = datasets.load_digits()
    print(digits.keys())
    descr=digits.DESCR
    print(descr)
    x=digits.data
    y=digits.target
    # some_digit is a single sample holding 64 values (reshaped to 8x8 below).
    some_digit = x[666]
    some_digit_image = some_digit.reshape(8,8)
    # According to the original author's note, the rendered image shows a
    # handwritten digit 0.
    plt.imshow(some_digit_image, cmap=matplotlib.cm.binary)
    plt.show()
    
    # Hold out 20% of the samples for testing and train a 3-nearest-neighbour
    # classifier on the remainder.
    from sklearn.model_selection import train_test_split
    x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)
    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(x_train, y_train)
    y_predict = knn.predict(x_test)
    # Fraction of correct predictions. This is a bare expression: its value is
    # only echoed in an interactive session and is discarded in a script.
    sum(y_predict==y_test)/len(y_test)
    
    from sklearn.metrics import accuracy_score
    # Compute the accuracy from the true and the predicted labels
    # (bare expression, see above).
    accuracy_score(y_test,y_predict)
    # Compute the accuracy directly from the fitted classifier and the test set
    # (bare expression, see above).
    knn.score(x_test, y_test)
    

    3封装自己的训练测试分割集

    import numpy as np
    def train_test_split(x, y, test_radio=0.2, random_state=None):
        """Randomly split x and y into a training part and a test part.

        Args:
            x: NumPy array of samples, one row per sample.
            y: NumPy array of labels with one entry per row of x.
            test_radio: Fraction (0 to 1) of the rows placed in the test part.
                The spelling of the parameter name is kept for compatibility
                with existing callers.
            random_state: Optional seed. When it is not None, NumPy's global
                random generator is seeded with it so the split is
                reproducible.

        Returns:
            The tuple (x_train, x_test, y_train, y_test).

        Raises:
            AssertionError: If x and y have different row counts or test_radio
                lies outside [0, 1].
        """
        assert x.shape[0] == y.shape[0], 'x和y行数需相同'
        assert 0 <= test_radio <= 1, '分割比例需在0-1之间'
        # Fixed: the original tested `if random_state:`, which treats the
        # perfectly valid seed 0 as "no seed" and silently skips seeding.
        if random_state is not None:
            np.random.seed(random_state)
        # A random permutation of the row indices; the first test_size entries
        # become the test rows and the remainder the training rows.
        shuffled_indexes = np.random.permutation(x.shape[0])
        test_size = int(x.shape[0] * test_radio)
        test_indexes = shuffled_indexes[:test_size]
        train_indexes = shuffled_indexes[test_size:]
        x_train = x[train_indexes]
        y_train = y[train_indexes]
        x_test = x[test_indexes]
        y_test = y[test_indexes]
        return x_train, x_test, y_train, y_test
    

    相关文章

      网友评论

          本文标题:第六节分类算法

          本文链接:https://www.haomeiwen.com/subject/wcomfhtx.html