美文网首页数据挖掘
第四章:分类

第四章:分类

作者: 无赖宵小 | 来源:发表于2018-10-24 21:54 被阅读1次

    特征值选取

    1、选择特征值

    2、对特征值相关度进行评分

    3、将数据进行标准化(最常用的方法是将所有数据都转化为 0 到 1 之间的值,或使用标准分 z-score —— 分值偏离均值的程度,公式如下)

    标准化:$z = \dfrac{x - \mu}{\sigma}$,其中 $\mu$ 为均值,$\sigma$ 为标准差

    4、最邻近分类算法

    修正的标准分

    标准分的问题在于它会受异常值的影响

    修正标准分计算方法:将标准分公式中的均值改为中位数,将标准差改为绝对偏差

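    修正的标准分:$\dfrac{x - \tilde{x}}{\mathrm{asd}}$,其中 $\tilde{x}$ 为中位数,$\mathrm{asd} = \dfrac{1}{n}\sum_{i=1}^{n}\lvert x_i - \tilde{x}\rvert$ 为绝对偏差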

    中位数指的是将所有数据进行排序,取中间的那个值。如果数据量是偶数,则取中间两个数值的均值。

    def getMedian(self, alist):
        """Return the median of ``alist``.

        The values are sorted first.  With an odd number of values the
        middle one is returned; with an even number the mean of the two
        middle values is returned as a float.  An empty list yields ``[]``.
        """
        if alist == []:
            return []
        ordered = sorted(alist)
        # ``middle`` is the upper-middle index; ``is_odd`` is 1 for odd lengths.
        middle, is_odd = divmod(len(alist), 2)
        if is_odd:
            return ordered[middle]
        return (ordered[middle] + ordered[middle - 1]) / 2.0
    
    def getAbsoluteStandardDeviation(self, alist, median):
        """Return the mean absolute distance of the values in ``alist`` from ``median``.

        Raises:
            ZeroDivisionError: if ``alist`` is empty.
        """
        deviations = [abs(value - median) for value in alist]
        # Accumulate left to right, starting from integer zero.
        total = 0
        for deviation in deviations:
            total += deviation
        return total / len(alist)
    
    def normalizeColumn(self, columnNumber):
        """Normalize column ``columnNumber`` of ``self.data`` in place.

        Each value becomes ``(value - median) / asd``, where ``asd`` is the
        absolute standard deviation of the column.  The ``(median, asd)`` pair
        is appended to ``self.medianAndDeviation``.

        Raises:
            ZeroDivisionError: if the absolute standard deviation is zero.
        """
        # Pull every value of this column into one list.
        col = [v[1][columnNumber] for v in self.data]
        median = self.getMedian(col)
        asd = self.getAbsoluteStandardDeviation(col, median)
        self.medianAndDeviation.append((median, asd))
        for v in self.data:
            v[1][columnNumber] = (v[1][columnNumber] - median) / asd
    

    最邻近分类算法

    def manhattan(vector1, vector2):
        """Return the Manhattan distance between two vectors.

        The distance is the sum of ``abs(vector1[i] - vector2[i])`` over every
        index of ``vector1``.  Extra trailing elements of ``vector2`` are
        ignored.

        Raises:
            IndexError: if ``vector2`` is shorter than ``vector1``.
        """
        distance = 0
        for i, value in enumerate(vector1):
            distance += abs(value - vector2[i])
        return distance
    
    def computeNearestNeighbor(itemName, itemVector, items):
        """Return ``(distance, name)`` pairs for every other item, nearest first.

        ``items`` maps item names to vectors.  The entry whose key equals
        ``itemName`` is skipped.  Distances are Manhattan distances from
        ``itemVector``; pairs are sorted ascending.
        """
        return sorted(
            (manhattan(itemVector, items[candidate]), candidate)
            for candidate in items
            if candidate != itemName
        )
    
    def classify(user, itemName, itemVector):
        """Return ``user``'s rating of the item nearest to ``itemVector``.

        NOTE(review): reads the module-level names ``items`` and ``users``,
        which are not defined in the visible source - presumably ``items``
        maps item names to vectors and ``users`` maps user names to
        ``{item: rating}`` dicts; confirm against the surrounding code.
        """
        neighbors = computeNearestNeighbor(itemName, itemVector, items)
        # The first pair is the closest one; keep only its name.
        _, closest = neighbors[0]
        return users[user][closest]
    

    关于标准化

    “正规化”:将值的范围缩小到 0 和 1 之间

    正规化:$x' = \dfrac{x - \min}{\max - \min}$

    “标准化”:将特征值转换为均值为 0 的一组数,其中每个数表示偏离均值的程度(即标准偏差或绝对偏差)


    # -*- coding:utf-8 -*-
    
    '''
    Created on 2018年11月27日
    
    @author: KingSley
    '''
    from tkinter.tix import COLUMN
    
    class Classifier:
        """Nearest-neighbour classifier over a tab-separated training file.

        The first line of the file gives the role of each column: ``num`` for
        a numeric feature, ``class`` for the label and ``comment`` for text
        that is kept but not used; any other marker is skipped.  Every feature
        column is normalized with the modified standard score
        ``(value - median) / absolute standard deviation``.

        Attributes set by ``__init__``:
            format: list of column role markers from the header line.
            data: list of ``(classification, vector, ignore)`` tuples whose
                vectors have been normalized in place.
            rawData: copy of the rows as read, before normalization.
            vlen: number of numeric features per row.
            medianAndDeviation: one ``(median, asd)`` pair per feature column.
        """

        def __init__(self, filename):
            """Load the training data from ``filename`` and normalize it.

            Raises:
                IndexError: if the file is empty or holds no data rows.
                ZeroDivisionError: if a feature column has zero absolute
                    standard deviation.
            """
            self.medianAndDeviation = []
            # ``with`` closes the file even when reading raises.
            with open(filename) as f:
                lines = f.readlines()
            self.format = lines[0].strip().split('\t')
            self.data = []
            for line in lines[1:]:
                stripped = line.strip()
                if not stripped:
                    # A blank line holds no instance; parsing it would add a
                    # row with an empty vector or fail on float('').
                    continue
                fields = stripped.split('\t')
                ignore = []
                vector = []
                for i, field in enumerate(fields):
                    kind = self.format[i]
                    if kind == 'num':
                        vector.append(float(field))
                    elif kind == 'comment':
                        ignore.append(field)
                    elif kind == 'class':
                        classification = field
                self.data.append((classification, vector, ignore))
            # Copy the inner lists: normalizeColumn mutates the vectors in
            # place, and a shallow list() copy would share them with rawData.
            self.rawData = [(label, list(vector), list(comments))
                            for label, vector, comments in self.data]
            # Length of an instance vector, taken from the first row.
            self.vlen = len(self.data[0][1])
            # Normalize every feature column.
            for i in range(self.vlen):
                self.normalizeColumn(i)

        def getMedian(self, alist):
            """Return the median of ``alist``; an empty list yields ``[]``.

            With an even number of values the mean of the two middle values
            is returned as a float.
            """
            if alist == []:
                return []
            blist = sorted(alist)
            middle, is_odd = divmod(len(alist), 2)
            if is_odd:
                return blist[middle]
            return (blist[middle] + blist[middle - 1]) / 2.0

        def getAbsoluteStandardDeviation(self, alist, median):
            """Return the mean absolute distance of ``alist`` values from ``median``.

            Raises:
                ZeroDivisionError: if ``alist`` is empty.
            """
            # Named ``total`` so the builtin ``sum`` is not shadowed.
            total = 0
            for item in alist:
                total += abs(item - median)
            return total / len(alist)

        def normalizeColumn(self, columnNumber):
            """Normalize column ``columnNumber`` of ``self.data`` in place.

            Appends the column's ``(median, asd)`` pair to
            ``self.medianAndDeviation`` so new vectors can be normalized the
            same way later.

            Raises:
                ZeroDivisionError: if the absolute standard deviation is zero.
            """
            col = [v[1][columnNumber] for v in self.data]
            median = self.getMedian(col)
            asd = self.getAbsoluteStandardDeviation(col, median)
            self.medianAndDeviation.append((median, asd))
            for v in self.data:
                v[1][columnNumber] = (v[1][columnNumber] - median) / asd

        def normalizeVector(self, v):
            """Return a normalized copy of ``v`` using the stored column statistics.

            ``v`` itself is not modified.
            """
            vector = list(v)
            for i, value in enumerate(vector):
                median, asd = self.medianAndDeviation[i]
                vector[i] = (value - median) / asd
            return vector

        def manhattan(self, vector1, vector2):
            """Return the Manhattan distance between two vectors.

            Pairs are formed up to the length of the shorter vector.
            """
            return sum(abs(v1 - v2) for v1, v2 in zip(vector1, vector2))

        def nearestNeighbor(self, itemVector):
            """Return ``(distance, row)`` for the training row nearest to ``itemVector``.

            ``itemVector`` must already be normalized.  Ties on distance are
            broken by comparing the row tuples themselves.

            Raises:
                ValueError: if there are no training rows.
            """
            return min((self.manhattan(itemVector, item[1]), item)
                       for item in self.data)

        def classify(self, itemVector):
            """Return the class of the training row nearest to the raw ``itemVector``."""
            return self.nearestNeighbor(self.normalizeVector(itemVector))[1][0]
        
    
    def unitTest():
        """Check Classifier against the athletes training set.

        Loads 'athletesTrainingSet.txt' and asserts, in order, that
        normalization, Manhattan distance, nearest-neighbour lookup and
        classification give the expected results for three athletes.
        """
        clf = Classifier('athletesTrainingSet.txt')
        raven = ('Basketball', [72, 162], ['Brittainey Raven'])
        komova = ('Gymnastics', [61, 76], ['Viktoria Komova'])
        langhorne = ("Basketball", [74, 190], ['Crystal Langhorne'])
        # Normalized vectors of the second and the last training rows.
        second_row = clf.data[1][1]
        last_row = clf.data[-1][1]

        # Normalizing the raw vectors must reproduce the stored rows.
        raven_norm = clf.normalizeVector(raven[1])
        komova_norm = clf.normalizeVector(komova[1])
        langhorne_norm = clf.normalizeVector(langhorne[1])
        assert raven_norm == second_row
        assert komova_norm == last_row
        print('normalizeVector fn OK')

        # Distance checks.
        assert round(clf.manhattan(langhorne_norm, second_row), 5) == 1.16823
        assert clf.manhattan(raven_norm, second_row) == 0
        assert clf.manhattan(komova_norm, last_row) == 0
        print('Manhattan distance fn OK')

        # Brittainey Raven and Viktoria Komova are each their own nearest neighbour.
        for athlete, normalized in ((raven, raven_norm), (komova, komova_norm)):
            found = clf.nearestNeighbor(normalized)
            assert found[1][2] == athlete[2]
        # Crystal Langhorne's nearest neighbour is Jennifer Lacy.
        assert clf.nearestNeighbor(langhorne_norm)[1][2][0] == "Jennifer Lacy"
        print("Nearest Neighbor fn OK")

        # Classification must return each athlete's own sport.
        for athlete in (raven, langhorne, komova):
            assert clf.classify(athlete[1]) == athlete[0]
        print('Classify fn OK')
        
    def test(training_filename, test_filename):
        """Train on ``training_filename`` and report accuracy on ``test_filename``.

        Each non-blank line of the test file is classified and printed with a
        ``+`` (correct) or ``-`` (wrong) prefix, followed by the percentage of
        correct predictions.  The test file has the same column layout as the
        training data but no header line.  Returns ``None``.
        """
        classifier = Classifier(training_filename)
        # ``with`` closes the file even when reading raises; blank lines are
        # dropped because they hold no instance and would fail to parse.
        with open(test_filename) as f:
            lines = [line for line in f if line.strip()]
        # Column roles depend only on the training header, so resolve them once.
        numColumns = [i for i, kind in enumerate(classifier.format)
                      if kind == 'num']
        classInColumn = -1
        for i, kind in enumerate(classifier.format):
            if kind == 'class':
                classInColumn = i
        numCorrect = 0.0
        for line in lines:
            data = line.strip().split('\t')
            vector = [float(data[i]) for i in numColumns]
            theClass = classifier.classify(vector)
            prefix = '-'
            if theClass == data[classInColumn]:
                # it is correct
                numCorrect += 1
                prefix = '+'
            print("%s  %12s  %s" % (prefix, theClass, line))
        # An empty test set reports 0% instead of dividing by zero.
        accuracy = numCorrect * 100 / len(lines) if lines else 0.0
        print("%4.2f%% correct" % accuracy)
            
    
    # Evaluate the classifier on each (training file, test file) pair in turn.
    for _pair in (
        ('athletesTrainingSet.txt', 'athletesTestSet.txt'),
        ("irisTrainingSet.data", "irisTestSet.data"),
        ("mpgTrainingSet.txt", "mpgTestSet.txt"),
    ):
        test(*_pair)
    

    参考原文作者:Ron Zacharski(CC BY-NC 3.0) https://github.com/egrcc/guidetodatamining

    参考原文:http://guidetodatamining.com/

    参考译文来自 @egrcc:https://github.com/egrcc/guidetodatamining
    0

    相关文章

      网友评论

        本文标题:第四章:分类

        本文链接:https://www.haomeiwen.com/subject/lflftqtx.html