Regression

Author: JasonChiu17 | Published 2018-12-01

    Linear Regression

    Principle:

    • Find the set of parameters that best fits the data

    Pros

    • Results are easy to interpret; computationally cheap

    Cons

    • Fits nonlinear data poorly

    Applicable data types

    • Numeric and nominal values
    Loading the data
    import numpy as np
    def loadDataSet(fileName):
        dataList = []; labelList = []
        fr = open(fileName)
        for line in fr.readlines():
            lineList = line.strip().split()
            curLine = []
            for i in range(len(lineList)-1): # all columns except the last are features
                curLine.append(float(lineList[i]))
            dataList.append(curLine)
            labelList.append(float(lineList[-1])) # last column is the target
        return dataList,labelList
    
    dataList,labelList = loadDataSet('../../Reference Code/Ch08/ex0.txt')
    

    Standard regression function

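    The "standard regression" here is ordinary least squares: the coefficient vector minimizing the squared error has the closed-form normal-equation solution

    $$\hat{w} = (X^T X)^{-1} X^T y$$

    which is exactly what standRegres computes below (hence the singularity check before inverting $X^T X$).
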
    def standRegres(xArr,yArr):
        xMat = np.mat(xArr)
        yMat = np.mat(yArr).T
        xTx = xMat.T*xMat # matrix product X^T X
        if np.linalg.det(xTx) == 0: # determinant is 0: singular matrix, no inverse
            print('This matrix is singular, cannot do inverse')
            return
        else:
            ws = xTx.I*(xMat.T*yMat) # normal equation
        return ws
    
    ws = standRegres(dataList,labelList)
    ws
    
    matrix([[3.00681047],
            [1.69667188]])
    
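    As a cross-check (not in the original code), np.linalg.lstsq solves the same least-squares problem without forming the inverse explicitly, which is numerically more stable; a minimal sketch:

    X = np.array(dataList); y = np.array(labelList)
    ws_lstsq, *_ = np.linalg.lstsq(X, y, rcond=None) # returns (solution, residuals, rank, singular values)
    print(ws_lstsq) # should closely match ws above: [3.0068..., 1.6966...]
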

    Plotting the fit

    import matplotlib.pyplot as plt
    def data2show(dataList,labelList,ws):
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.grid()
        # raw data: x and y as flat arrays
        x = np.mat(dataList)[:,1].T.A[0] # .A converts a matrix to an ndarray
        y = np.mat(labelList).A[0]
        ax.scatter(x,y,s=10,label='Raw Data') 
    
        # fitted line
        xCopy = np.mat(dataList).copy()
        xCopy.sort(0) # sort so the line is drawn left to right
        yHat = xCopy*ws
        ax.plot(xCopy[:,1],yHat,c='r',label='Linear Regression')
        
        ax.set_xlim((0,1))
        plt.legend()
        plt.show()
    
    data2show(dataList,labelList,ws)
    
    [Figure: raw data scatter with the fitted regression line]

    Measuring how well the predicted yHat matches the true y: the correlation coefficient

    xCopy = np.mat(dataList).copy()
    yHat = xCopy*ws
    y = np.mat(labelList)
    print('Correlation matrix of predictions and true values:\n' ,np.corrcoef(yHat.T,y))
    
    Correlation matrix of predictions and true values:
     [[1.        0.9863846]
     [0.9863846 1.       ]]
    

    Locally Weighted Linear Regression (LWLR)

    • Plain linear regression can underfit. LWLR assigns every training point a weight based on its distance to the query point, e.g. with a Gaussian kernel, so the weight decays exponentially as that distance grows (the weighting formula is given below).
    • Drawback: more computation, since every prediction uses the entire training set
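
    With a Gaussian kernel, the weight of training point $x_j$ for a query point $x$, and the resulting weighted normal equation (exactly what the code below computes), are

    $$w_{jj} = \exp\!\left(\frac{-\|x_j - x\|^2}{2k^2}\right), \qquad \hat{w} = (X^T W X)^{-1} X^T W y$$

    where $W$ is the diagonal weight matrix and $k$ controls how fast the weights decay.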
    LWLR for a single sample
    
    def lwlr(testPoint,xArr,yArr,k=1.0):
        xMat = np.mat(xArr)
        yMat = np.mat(yArr).T
        m = xMat.shape[0]
        weights = np.mat(np.eye(m)) # initialize every sample's weight to 1
        for j in range(m): # loop over all training samples
            diffMat = testPoint - xMat[j,:] 
            weights[j,j] = np.exp((diffMat*diffMat.T)/(-2*k**2)) # Gaussian kernel weight
        xTx = xMat.T*(weights*xMat) # matrix product X^T W X
        if np.linalg.det(xTx) == 0: # determinant is 0: singular matrix, no inverse
            print('This matrix is singular, cannot do inverse')
            return
        else:
            ws = xTx.I*(xMat.T*(weights*yMat))
        yHat = testPoint*ws
        return yHat
    
    yHat = lwlr(dataList[0],dataList,labelList,k=0.001)
    print('Predicted:',yHat.A[0,0])
    print('Actual:',labelList[0])
    
    Predicted: 3.772513220268465
    Actual: 3.816464
    
    Applying LWLR to every sample
    def lwlrTest(testArr,xArr,yArr,k=1.0):
        m = np.shape(testArr)[0]
        yHat = np.zeros(m)
        for i in range(m): # run lwlr on every test sample
            yHat[i] = lwlr(testArr[i],xArr,yArr,k)
        return yHat
    
    yHat_k1 = lwlrTest(dataList,dataList,labelList,k=1.0)
    yHat_k2 = lwlrTest(dataList,dataList,labelList,k=0.01)
    yHat_k3 = lwlrTest(dataList,dataList,labelList,k=0.003)
    
    # To draw the fitted curve, first sort by xArr[:,1], then let plot connect
    # (xArr[:,1], yHat) point by point
    import matplotlib.pyplot as plt
    xArr = np.array(dataList)
    yArr = np.array(labelList)
    sortIndex = xArr[:,1].argsort(0) # indices that sort column 1
    fig = plt.figure(figsize=(6,8))
    
    # k=1.0: a straight line, equivalent to plain linear regression
    ax = fig.add_subplot(311)
    ax.grid()
    ax.scatter(xArr[:,1],yArr,s=5,label='data')
    ax.plot(xArr[:,1][sortIndex],yHat_k1[sortIndex],c='r',label='prediction')
    ax.legend()
    ax.set_xlim((0,1))
    ax.set_title('k=1.0')
    
    # k=0.01
    ax = fig.add_subplot(312)
    ax.grid()
    ax.scatter(xArr[:,1],yArr,s=5,label='data')
    ax.plot(xArr[:,1][sortIndex],yHat_k2[sortIndex],c='r',label='prediction')
    ax.legend()
    ax.set_xlim((0,1))
    ax.set_title('k=0.01')
    
    # k=0.003
    ax = fig.add_subplot(313)
    ax.grid()
    ax.scatter(xArr[:,1],yArr,s=5,label='data')
    ax.plot(xArr[:,1][sortIndex],yHat_k3[sortIndex],c='r',label='prediction')
    ax.legend()
    ax.set_xlim((0,1))
    ax.set_title('k=0.003')
    
    plt.tight_layout()
    plt.show()
    
    [Figure: LWLR fits for k=1.0 (top), k=0.01 (middle), k=0.003 (bottom)]
    • k=1.0: all points get nearly equal weight, so the fit is essentially plain linear regression
    • k=0.01: fits the data well
    • k=0.003: only a handful of nearby samples drive each prediction, so the fit chases noise and overfits

    Predicting the age of abalone

    # error function: mean squared error
    def rssError(yArr,yHatArr):
        return ((yArr - yHatArr)**2).sum()/len(yHatArr)
    
    Training
    abX,abY = loadDataSet('../../Reference Code/Ch08/abalone.txt')
    yHat_01 = lwlrTest(abX[0:99],abX[0:99],abY[0:99],0.1)
    yHat_1 = lwlrTest(abX[0:99],abX[0:99],abY[0:99],1)
    yHat_10 = lwlrTest(abX[0:99],abX[0:99],abY[0:99],10)
    error_01 = rssError(abY[0:99],yHat_01)
    error_1 = rssError(abY[0:99],yHat_1)
    error_10 = rssError(abY[0:99],yHat_10)
    print('k=0.1, training set MSE:',error_01)
    print('k=1, training set MSE:',error_1)
    print('k=10, training set MSE:',error_10)
    
    k=0.1, training set MSE: 0.5336662333518674
    k=1, training set MSE: 3.9193270598039254
    k=10, training set MSE: 5.155636180982812
    
    Testing
    yHat_01 = lwlrTest(abX[100:199],abX[0:99],abY[0:99],0.1)
    yHat_1 = lwlrTest(abX[100:199],abX[0:99],abY[0:99],1)
    yHat_10 = lwlrTest(abX[100:199],abX[0:99],abY[0:99],10)
    error_01 = rssError(abY[100:199],yHat_01)
    error_1 = rssError(abY[100:199],yHat_1)
    error_10 = rssError(abY[100:199],yHat_10)
    print('k=0.1, test set MSE:',error_01)
    print('k=1, test set MSE:',error_1)
    print('k=10, test set MSE:',error_10)
    
    k=0.1, test set MSE: 253.07098763356757
    k=1, test set MSE: 6.096745426262262
    k=10, test set MSE: 5.6509772437177395
    
    Using plain linear regression
    ws = standRegres(abX[0:99],abY[0:99])
    xCopy = np.mat(abX[0:99]).copy()
    yHat = xCopy*ws
    error = rssError(abY[0:99],yHat.T.A[0])
    print('Training set MSE:',error)
    
    xCopy = np.mat(abX[100:199]).copy()
    yHat = xCopy*ws
    error = rssError(abY[100:199],yHat.T.A[0])
    print('Test set MSE:',error)
    
    Training set MSE: 5.175439324394303
    Test set MSE: 5.663372502296616
    
    • Plain linear regression matches the test error of locally weighted regression with k=10; a simple way to pick k is sketched below
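
    A simple holdout search over k (a minimal sketch reusing the functions above; not part of the book's code) makes the choice less ad hoc:

    # score a grid of kernel widths on the held-out slice 100:199
    for k in [0.1, 0.5, 1, 2, 5, 10]:
        yHat = lwlrTest(abX[100:199], abX[0:99], abY[0:99], k)
        print('k=%s, test set MSE: %.4f' % (k, rssError(abY[100:199], yHat)))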

    Ridge Regression

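    Ridge regression shrinks the coefficients with an L2 penalty; the closed-form solution implemented below is

    $$\hat{w} = (X^T X + \lambda I)^{-1} X^T y$$

    The $\lambda I$ term also guarantees the matrix is invertible even when $X^T X$ is singular (e.g. more features than samples).
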
    # ridge regression: closed-form solution for the coefficients
    def ridgeRegres(xMat,yMat,lam=0.2):
        xTx = xMat.T*xMat
        denom = xTx + np.eye(xMat.shape[1])*lam
        if np.linalg.det(denom) == 0:
            print('This matrix is singular, cannot do inverse')
            return
        ws = denom.I*(xMat.T*yMat)
        return ws
    
    # standardization
    def standar(xMat,yMat):
        yMean = np.mean(yMat,0) # column mean
        yMat = yMat - yMean # the book only centers y, without scaling
    #     yMat = (yMat - yMean)/np.var(yMat)
        xMean = np.mean(xMat,0) # column means
        xVar = np.var(xMat,0) # column variances
        xMat = (xMat - xMean)/xVar # the book divides by the variance, not the standard deviation
        return xMat,yMat
    
    # search over lambda values
    def searchLamda(xMat,yMat):
        numTestPts = 30
        n = xMat.shape[1]
        wMat = np.zeros((numTestPts,n))
        for i in range(numTestPts): # lambda on a log scale: exp(-10) ... exp(19)
            ws = ridgeRegres(xMat,yMat,np.exp(i-10))
            wMat[i,:] = ws.T
        return wMat
    
    Without standardization
    # load the data and convert to matrices
    abX,abY = loadDataSet('../../Reference Code/Ch08/abalone.txt')
    xMat = np.mat(abX);yMat = np.mat(abY).T
    # search over lambda
    wMat = searchLamda(xMat,yMat)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(wMat) # row index on the x-axis, one curve per coefficient
    ax.set_xlabel('log(lambda)')
    plt.show()
    
    [Figure: ridge coefficient paths vs. log(lambda), unstandardized data]
    With standardization
    # load the data and convert to matrices
    abX,abY = loadDataSet('../../Reference Code/Ch08/abalone.txt')
    xMat = np.mat(abX);yMat = np.mat(abY).T
    xMat,yMat = standar(xMat,yMat) # standardize
    # search over lambda
    wMat = searchLamda(xMat,yMat)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(wMat) # row index on the x-axis, one curve per coefficient
    ax.set_xlabel('log(lambda)')
    plt.show()
    
    [Figure: ridge coefficient paths vs. log(lambda), standardized data]
    • Ordinary linear regression does not require standardization
    • Ridge regression and the lasso do, because the penalty treats all coefficients on the same scale (a scikit-learn cross-check follows)
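
    As a sanity check (assumes scikit-learn is installed; not part of the book's code), sklearn's Ridge with fit_intercept=False minimizes the same penalized objective on the already-standardized data; a minimal sketch:

    from sklearn.linear_model import Ridge
    # alpha plays the role of lambda; exp(-10) matches the first grid point above
    reg = Ridge(alpha=np.exp(-10), fit_intercept=False)
    reg.fit(np.asarray(xMat), np.asarray(yMat).ravel())
    print(reg.coef_) # should be close to wMat[0,:]
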
    Forward stagewise regression
    def stageWise(xMat,yMat,eps=0.01,numIter=100):
        m,n = xMat.shape
        returnMat = np.zeros((numIter,n))
        ws = np.zeros((n,1));wsBest = ws.copy()
        for i in range(numIter): # one row of returnMat per iteration
    #         print(ws.T)
            lowestErr = np.inf # initialize the error to infinity
            for j in range(n): # loop over every feature
                for sign in [-1,1]: # try increasing and decreasing the feature's weight
                    wsTest = ws.copy()
                    wsTest[j] += eps*sign # nudge feature j by one step
                    
                    # compute the error
                    yTest = xMat * wsTest
                    rssE = rssError(yMat.A,yTest.A)
                    
                    # keep the ws with the lowest error so far
                    if rssE < lowestErr: 
                        lowestErr = rssE
                        wsBest = wsTest
                        
            # commit the best single-feature step found this iteration
            ws = wsBest.copy()
            returnMat[i,:] = ws.T
        return returnMat
    
    # load the data and convert to matrices
    abX,abY = loadDataSet('../../Reference Code/Ch08/abalone.txt')
    xMat = np.mat(abX);yMat = np.mat(abY).T
    xMat,yMat = standar(xMat,yMat) # standardize
    returnMat = stageWise(xMat,yMat,eps=0.001,numIter=5000)
    returnMat
    
    array([[ 0.   ,  0.   ,  0.   , ...,  0.   ,  0.   ,  0.   ],
           [ 0.   ,  0.   ,  0.   , ...,  0.   ,  0.   ,  0.   ],
           [ 0.   ,  0.   ,  0.   , ...,  0.   ,  0.   ,  0.   ],
           ...,
           [ 0.041, -0.01 ,  0.119, ..., -0.967, -0.106,  0.185],
           [ 0.042, -0.01 ,  0.119, ..., -0.967, -0.106,  0.185],
           [ 0.041, -0.01 ,  0.119, ..., -0.967, -0.106,  0.185]])
    
    Comparison with ordinary least squares
    dataList = xMat.tolist();labelList = yMat.T.tolist()
    ws = standRegres(dataList,labelList)
    ws.T
    
    matrix([[ 0.04166622, -0.02188497,  0.13114956,  0.02087776,  2.22254154,
             -0.99875234, -0.11703624,  0.16645722]])
    
    • Forward stagewise regression (numIter=5000, eps=0.001) gives results similar to ordinary least squares (a lasso cross-check is sketched below)
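
    Forward stagewise regression with a small step size is known to trace coefficient paths similar to the lasso. A rough cross-check (assumes scikit-learn; not part of the book's code, and alpha=0.001 is only an illustrative guess):

    from sklearn.linear_model import Lasso
    lasso = Lasso(alpha=0.001, fit_intercept=False)
    lasso.fit(np.asarray(xMat), np.asarray(yMat).ravel())
    print(lasso.coef_) # compare with the last row of returnMat
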
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(returnMat) # one curve per coefficient
    ax.set_xlabel('iteration') # x-axis is the iteration number, not log(lambda)
    plt.show()
    
    [Figure: stagewise coefficient paths over 5000 iterations]
