    # 训练
    class Classifier:
        """Naive Bayes classifier for categorical attributes.

        Training happens in the constructor; afterwards ``prior`` maps each
        class label to p(h) and ``conditional[label][column][value]`` holds
        p(value | label). Columns are numbered from 1 in the order in which
        they were collected from each row.
        """

        def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
            """Build the classifier from the bucket files.

            Files named ``<bucketPrefix>-01`` .. ``<bucketPrefix>-10`` are
            read, except the one whose number equals testBucketNumber.

            dataFormat is a tab-separated string describing how to interpret
            each tab-separated line of the data files. For example, for the
            iHealth data the format is "attr<TAB>attr<TAB>attr<TAB>attr<TAB>class".
            Recognised column kinds:
              'attr'    - categorical attribute, kept as a string
              'num'     - numeric attribute, converted with float() and then
                          counted like any other attribute value
              'comment' - ignored
              'class'   - the class label of the row
            Columns of any other kind are ignored.

            Blank lines and rows without a class column are skipped. Fields
            beyond the end of the format are ignored.

            Raises OSError if a bucket file cannot be opened and ValueError
            if a 'num' field is not a valid float.
            """
            total = 0
            classes = {}    # class label -> number of training rows
            counts = {}     # class label -> column -> value -> count
            self.format = dataFormat.strip().split('\t')
            self.prior = {}
            self.conditional = {}
            # for each of the buckets numbered 1 through 10:
            for bucket in range(1, 11):
                # leave out the bucket reserved for testing
                if bucket == testBucketNumber:
                    continue
                filename = "%s-%02i" % (bucketPrefix, bucket)
                with open(filename) as f:
                    lines = f.readlines()
                for line in lines:
                    stripped = line.strip()
                    if not stripped:
                        continue
                    vector = []
                    category = None
                    # zip() stops at the shorter sequence, so a row that is
                    # longer than the format cannot index past its end
                    for kind, field in zip(self.format, stripped.split('\t')):
                        if kind == 'num':
                            vector.append(float(field))
                        elif kind == 'attr':
                            vector.append(field)
                        elif kind == 'class':
                            category = field
                    if category is None:
                        # an unlabeled row cannot be used for training
                        continue
                    # now process this instance
                    total += 1
                    classes[category] = classes.get(category, 0) + 1
                    columnCounts = counts.setdefault(category, {})
                    # now process each attribute of the instance
                    for col, columnValue in enumerate(vector, start=1):
                        valueCounts = columnCounts.setdefault(col, {})
                        valueCounts[columnValue] = (
                            valueCounts.get(columnValue, 0) + 1)
            # done counting; first the prior probabilities p(h)
            for category, count in classes.items():
                self.prior[category] = count / total
            # then the conditional probabilities p(D|h)
            for category, columns in counts.items():
                self.conditional[category] = {
                    col: {attrValue: count / classes[category]
                          for attrValue, count in valueCounts.items()}
                    for col, valueCounts in columns.items()
                }
            # raw counts, kept on the instance for inspection
            self.tmp = counts

        # Classification
        def classify(self, itemVector):
            """Return the class label with the highest probability.

            itemVector holds the attribute values of one item, in the same
            column order that was used for training. A value (or column)
            never seen together with a class gives that class probability 0.
            Ties are broken by the larger class label, because the
            (probability, label) tuples are compared as a whole.

            Returns None when the classifier was trained on no rows.
            """
            results = []
            for category, prior in self.prior.items():
                prob = prior
                columns = self.conditional.get(category, {})
                for col, attrValue in enumerate(itemVector, start=1):
                    # unseen value or column -> factor 0
                    prob *= columns.get(col, {}).get(attrValue, 0)
                results.append((prob, category))
            if not results:
                return None
            # return the category with the highest probability
            return max(results)[1]
    # Demo: build a classifier that holds out bucket 10, then classify one item
    c = Classifier(
        bucketPrefix="iHealth/i",
        testBucketNumber=10,
        dataFormat="attr\tattr\tattr\tattr\tclass",
    )
    print(c.classify(itemVector=['health', 'moderate', 'moderate', 'yes']))







    # pdf计算实现
    def pdf(mean, ssd, x):
        """Gaussian probability density function, computing P(x|y).

        mean and ssd are the mean and the sample standard deviation of all
        the items in y; x is the value whose density is wanted.

        Returns the density as a float. Raises ZeroDivisionError when ssd
        is 0.
        """
        # imported here because no module-level import of math is visible
        import math

        ePart = math.exp(-(x - mean) ** 2 / (2 * ssd ** 2))
        return (1.0 / (math.sqrt(2 * math.pi) * ssd)) * ePart
    # 连续数据的训练
    class Classifier:
        """Naive Bayes classifier for mixed categorical and numeric data.

        Categorical ('attr') columns are modelled with relative frequencies;
        numeric ('num') columns are modelled with a Gaussian whose mean and
        sample standard deviation are estimated per class. Categorical and
        numeric columns are numbered separately, each starting at 1.

        Attributes set by the constructor:
          prior[label]                    - p(h)
          conditional[label][col][value]  - p(value | label)
          means[label][col]               - mean of a numeric column
          ssd[label][col]                 - sample standard deviation
          totals[label][col]              - sum of a numeric column
          tmp                             - raw categorical counts
        """

        def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
            """Build the classifier from the bucket files.

            Files named ``<bucketPrefix>-01`` .. ``<bucketPrefix>-10`` are
            read, except the one whose number equals testBucketNumber.

            dataFormat is a tab-separated string describing how to interpret
            each tab-separated line of the data files, e.g.
            "attr<TAB>num<TAB>class". Recognised column kinds are 'attr',
            'num', 'comment' (ignored) and 'class'; columns of any other
            kind are ignored.

            Blank lines and rows without a class column are skipped. Fields
            beyond the end of the format are ignored.

            Raises OSError if a bucket file cannot be opened and ValueError
            if a 'num' field is not a valid float.
            """
            # imported here because no module-level import of math is visible
            import math

            total = 0
            classes = {}        # class label -> number of training rows
            # counts used for attributes that are not numeric
            counts = {}
            # totals and raw values of the numeric attributes; used to
            # compute the mean and sample standard deviation for each
            # attribute - class pair
            totals = {}
            numericValues = {}
            self.format = dataFormat.strip().split('\t')
            self.prior = {}
            self.conditional = {}
            # for each of the buckets numbered 1 through 10:
            for bucket in range(1, 11):
                # leave out the bucket reserved for testing
                if bucket == testBucketNumber:
                    continue
                filename = "%s-%02i" % (bucketPrefix, bucket)
                with open(filename) as f:
                    lines = f.readlines()
                for line in lines:
                    stripped = line.strip()
                    if not stripped:
                        continue
                    vector = []     # categorical values of this row
                    nums = []       # numeric values of this row
                    category = None
                    # zip() stops at the shorter sequence, so a row that is
                    # longer than the format cannot index past its end
                    for kind, field in zip(self.format, stripped.split('\t')):
                        if kind == 'num':
                            nums.append(float(field))
                        elif kind == 'attr':
                            vector.append(field)
                        elif kind == 'class':
                            category = field
                    if category is None:
                        # an unlabeled row cannot be used for training
                        continue
                    # now process this instance
                    total += 1
                    classes[category] = classes.get(category, 0) + 1
                    columnCounts = counts.setdefault(category, {})
                    columnTotals = totals.setdefault(category, {})
                    columnValues = numericValues.setdefault(category, {})
                    # non-numeric attributes: count each value
                    for col, columnValue in enumerate(vector, start=1):
                        valueCounts = columnCounts.setdefault(col, {})
                        valueCounts[columnValue] = (
                            valueCounts.get(columnValue, 0) + 1)
                    # numeric attributes: keep the running sum and the raw
                    # values (the latter are needed for the deviation)
                    for col, columnValue in enumerate(nums, start=1):
                        columnTotals[col] = (
                            columnTotals.get(col, 0) + columnValue)
                        columnValues.setdefault(col, []).append(columnValue)
            # done counting; first the prior probabilities p(h)
            for category, count in classes.items():
                self.prior[category] = count / total
            # then the conditional probabilities p(D|h)
            for category, columns in counts.items():
                self.conditional[category] = {
                    col: {attrValue: count / classes[category]
                          for attrValue, count in valueCounts.items()}
                    for col, valueCounts in columns.items()
                }
            self.tmp = counts
            # mean and sample standard deviation of every numeric column
            self.means = {}
            self.totals = totals
            self.ssd = {}
            for category, columns in numericValues.items():
                self.means[category] = {}
                self.ssd[category] = {}
                for col, values in columns.items():
                    n = len(values)
                    theMean = totals[category][col] / n
                    self.means[category][col] = theMean
                    sumOfSquareDifferences = sum(
                        (value - theMean) ** 2 for value in values)
                    # the sample deviation needs at least two observations;
                    # with a single one the spread is recorded as 0
                    if n > 1:
                        self.ssd[category][col] = math.sqrt(
                            sumOfSquareDifferences / (n - 1))
                    else:
                        self.ssd[category][col] = 0.0

        # Classification with numeric data
        def classify(self, itemVector, numVector):
            """Return the class label with the highest probability.

            itemVector holds the categorical values and numVector the
            numeric values of one item, each in the column order used for
            training. A categorical value (or column) never seen together
            with a class gives that class probability 0. A numeric column
            whose standard deviation is 0 is treated as a point mass: the
            factor is 1 when x equals the mean and 0 otherwise.

            Ties are broken by the larger class label, because the
            (probability, label) tuples are compared as a whole.

            Returns None when the classifier was trained on no rows.
            Raises KeyError if numVector has a column that was not present
            in the training data of some class.
            """
            # imported here because no module-level import of math is visible
            import math

            results = []
            sqrt2pi = math.sqrt(2 * math.pi)
            for category, prior in self.prior.items():
                prob = prior
                columns = self.conditional.get(category, {})
                for col, attrValue in enumerate(itemVector, start=1):
                    # unseen value or column -> factor 0
                    prob *= columns.get(col, {}).get(attrValue, 0)
                for col, x in enumerate(numVector, start=1):
                    mean = self.means[category][col]
                    ssd = self.ssd[category][col]
                    if ssd == 0:
                        # degenerate Gaussian; avoid dividing by zero
                        prob *= 1.0 if x == mean else 0.0
                    else:
                        ePart = math.exp(-(x - mean) ** 2 / (2 * ssd ** 2))
                        prob *= (1.0 / (sqrt2pi * ssd)) * ePart
                results.append((prob, category))
            if not results:
                return None
            # return the category with the highest probability
            return max(results)[1]


    • 贝叶斯优点:实现简单,和其他方法相比需要的训练数据更少
    • 贝叶斯缺点:不能学习到特征之间的相互作用。
    • kNN优点:实现简单,不用考虑数据特定的结构;训练集很大的时候是一个合理的选择。
    • kNN缺点:需要大量的内存来存储训练集。




