美文网首页机器学习实战
机器学习实战之python实现logistics算法

机器学习实战之python实现logistics算法

作者: 殇_边城 | 来源:发表于2017-12-29 14:58 被阅读9次
    图片.png
    logistics的理论知识,请参见下方博文:
    Logistic回归原理及公式推导
    import numpy as np
    import matplotlib.pyplot as plt
    
    def loadDataSet():#定义一个创建数据集函数
        dataMat = []
        labelMat = []
        fr = open('Ch05/testSet.txt')
        for line in fr.readlines():
            lineArr = line.strip().split()
            dataMat.append([1,float(lineArr[0]),float(lineArr[1])]) #这里因为原数据是二维的,只有x1和x2,为了方便后面的计算,引入一个x0,并全部赋值为1
            labelMat.append(int(lineArr[2]))#建立标签列
        fr.close()
        return dataMat,labelMat
    
    def sigmoid(inx):#定义sigmoid函数
        return 1/(1+np.exp(-inx))
    
    def gradAscent(dataMatIn,classLabels):
        dataMatrix = np.mat(dataMatIn)#100*3的矩阵
        labelMat = np.mat(classLabels).T#100*1的矩阵
        m,n = dataMatrix.shape
        alpha = 0.001
        maxCycles = 500
        weights = np.ones((n,1))#3*1,定义初始回归系数
        for k in range(maxCycles):#迭代调整回归系数
            h = sigmoid(dataMatrix*weights)#100*1的矩阵,利用回归系数计算样本预测值
            error = (labelMat - h)#100*1的矩阵,计算每个样本的误差(实际值-预测值)
            weights = weights + alpha*dataMatrix.T*error#3*1的矩阵,调整回归系数weights
            #dataMatrix.T是3*100的矩阵,error是100*1的矩阵,双方经过300次乘法相加,得到3*1的矩阵
        return weights#输出最终的回归系数
    
    def plotBsetFit(wei):
        weights = wei.getA()#将矩阵转化为np.ndarry对象
        dataMat,labelMat = loadDataSet()
        dataArr = np.array(dataMat)
        n = dataArr.shape[0]
        xcord1 = []; ycord1 = []
        xcord2 = []; ycord2 = []
        for i in range(n):
            if int(labelMat[i]) == 1:
                xcord1.append(dataArr[i,1])
                ycord1.append(dataArr[i,2])
            else:
                xcord2.append(dataArr[i,1])
                ycord2.append(dataArr[i,2])
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.scatter(xcord1,ycord1,s=30,c='red',marker='s')
        ax.scatter(xcord2,ycord2,s=30,c='green')
        x = np.arange(-3,3,0.1)
        y = (-weights[0]-weights[1]*x)/weights[2]
        #这里,令sigmoid函数等于0,求出x1和x2的关系,即为要画的分割线
        ax.plot(x,y)
        plt.xlabel('X1')
        plt.ylabel('X2')
        plt.show()
        
    #以上梯度算法中,每次迭代都要遍历所有的数据,每次都有300次的乘法,如果数据量大,很容易
    #累死,下面优化为随机梯度算法,即每次迭代只选择一个样本
        
    def stocGradAscent0(dataMatrix,classLabels):
        m,n = dataMatrix.shape
        alpha = 0.01
        weights = np.ones((n,1))
        for i in range(m):
            h = sigmoid(sum(dataMatrix[i]*weights))
            error = classLabels[i] - h
            weights = weights + alpha*dataMatrix[i]*error
        return weights #注意,这里返回的weights是1*3的np.ndarry形式,不是矩阵形式
        #因此在调用画图函数时,需进行转换和转置
    #调用画图函数可以看出,迭代效果没有全数据集迭代好,下面尝试对步长进行优化
    
    def stocGradAscent1(dataMatrix,classLabels,numIter=150):
        m,n = dataMatrix.shape
        weights = np.ones((n,1))
        for j in range(numIter):#重复迭代全集样本
            dataIndex = list(range(m))
            for i in range(m):#随机迭代每一个样本
                alpha = 4/(1+j+i)+0.01 #s随着迭代调整步长
                randIndex = int(np.random.uniform(0,len(dataIndex)))#随机选择一个样本
                h = sigmoid(np.sum(dataMatrix[randIndex]*weights))
                error = classLabels[randIndex] - h
                weights = weights +alpha*dataMatrix[randIndex]*error
                del dataIndex[randIndex] 
        return weights
    #画图,可以看出效果要比上个随机梯度好很多
    

    从疝气病症预测病马的死亡率

    def classifyVector(x,weights):#定义一个分类函数,利用sigmoid计算属于1的概率
        prob = sigmoid(np.sum(x*weights))
        if prob > 0.5:
            return 1
        else:
            return 0
        
    def colicTest():
        frTrain = open('Ch05/horseColicTraining.txt')
        frTest = open('Ch05/horseColicTest.txt')
        trainingSet = []; trainlabels = []
        for i in frTrain.readlines():#加载训练数据
            ilist = i.strip().split('\t')
            trainingSet.append([float(x) for x in ilist[:-1]])
            trainlabels.append(float(ilist[-1]))  
        frTrain.close()
        
        trainWeights = stocGradAscent1(np.array(trainingSet),trainlabels,1000)
        #利用训练数据生成一组最优回归系数
        
        error = 0; numTest = 0
        for i in frTest.readlines():
            numTest += 1 
            ilist = i.strip().split('\t')
            pard = classifyVector(np.array([float(x) for x in ilist[:-1]]),trainWeights)
            #利用测试数据和最优回归系数,判定分类
            if int(pard) != int(ilist[-1]):
                error += 1
        frTest.close()
        errorRate = float(error)/numTest
        print('the error rate of this test is:%f'%errorRate)
        return errorRate
    
    def multiTest():
        numTest = 10; errorSum = 0
        for i in range(numTest): #重复10次,求平均错误率
            errorSum += colicTest()
        print('after %d iterations the average error rate is:%f'%(numTest,errorSum/numTest))
    
    #程序运行可能会报以下警告
    RuntimeWarning: overflow encountered in exp python
    这是因为float存储的位数有限制,计算过程中对于超出位数限制的,python会进行截取。不影响运算。如果要想避免,可了解下python的bigfloat

    相关文章

      网友评论

        本文标题:机器学习实战之python实现logistics算法

        本文链接:https://www.haomeiwen.com/subject/ggbigxtx.html