Regression: Handling Outliers

Author: 阿发贝塔伽马 | Published 2018-08-20 17:10

    Ridge regression

    The scikit-learn example below sweeps the regularization strength alpha across a log scale on the ill-conditioned 10x10 Hilbert matrix, then plots the R² score and the coefficient paths as functions of alpha.

    # Author: Fabian Pedregosa -- <fabian.pedregosa@inria.fr>
    # License: BSD 3 clause
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import linear_model
    
    # X is the 10x10 Hilbert matrix
    X = 1. / (np.arange(1, 11) + np.arange(0, 10)[:, np.newaxis])
    y = np.ones(10)
    y[0:5] = 0
    # #############################################################################
    # Compute paths
    
    n_alphas = 200
    alphas = np.logspace(-10, -2, n_alphas)
    coefs = []
    scores = []
    for a in alphas:
        ridge = linear_model.Ridge(alpha=a, fit_intercept=True)
        ridge.fit(X, y)
        scores.append(ridge.score(X,y))
        coefs.append(ridge.coef_)
    
    # #############################################################################
    # Display results
    fig, axes = plt.subplots(1, 2, figsize=(20, 8))
    ax0, ax1 = axes
    ax0.plot(alphas, scores)
    ax0.set_xscale('log')
    ax0.set_xlabel('alpha')
    ax0.set_ylabel('scores')
    
    ax1.plot(alphas, coefs)
    ax1.set_xscale('log')
    
    ax1.set_xlabel('alpha')
    ax1.set_ylabel('weights')
    
    ax1.set_title('Ridge coefficients as a function of the regularization')
    fig.tight_layout()
    
    plt.show()
    

    Huber regression
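    The Huber loss is quadratic for small residuals and linear beyond a threshold delta: L_delta(a) = a²/2 when |a| <= delta, and delta·|a| - delta²/2 otherwise. This is exactly what the huberloss helper below computes; HuberRegressor's epsilon parameter plays the role of delta (on standardized residuals).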

    import numpy as np
    import matplotlib.pyplot as plt
    
    from sklearn.datasets import make_regression
    from sklearn.linear_model import HuberRegressor, Ridge
    
    def huberloss(var, delta):
        # Huber loss: quadratic inside [-delta, delta], linear outside.
        if abs(var) > delta:
            return delta*abs(var) - 0.5*delta*delta
        else:
            return 0.5*var*var
        
    # Generate toy data.
    rng = np.random.RandomState(0)
    X, y = make_regression(n_samples=20, n_features=1, random_state=0, noise=4.0,
                           bias=100.0)
    
    # Add four strong outliers to the dataset.
    X_outliers = rng.normal(0, 0.5, size=(4, 1))
    y_outliers = rng.normal(0, 2.0, size=4)
    X_outliers[:2, :] += X.max() + X.mean() / 4.
    X_outliers[2:, :] += X.min() - X.mean() / 4.
    y_outliers[:2] += y.min() - y.mean() / 4.
    y_outliers[2:] += y.max() + y.mean() / 4.
    X = np.vstack((X, X_outliers))
    y = np.concatenate((y, y_outliers))
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    ax_loss,ax_score, ax1 = axes
    
    ax1.plot(X, y, 'b.')
    
    # Fit the huber regressor over a series of epsilon values.
    
    x = np.linspace(X.min(), X.max(), 7)
    epsilon_values = [1,1.2,1.35, 1.5, 1.75, 1.9]
    scores = []
    losses =[]
    
    for k, epsilon in enumerate(epsilon_values):
        huber = HuberRegressor(fit_intercept=True, alpha=0.0, max_iter=100,
                               epsilon=epsilon)
        huber.fit(X, y)
        # Total Huber loss of the residuals; builtin sum, since np.sum
        # over a generator is deprecated.
        losses.append(sum(huberloss(el, epsilon) for el in (huber.predict(X) - y)))
        scores.append(huber.score(X, y))
        coef_ = huber.coef_ * x + huber.intercept_
        ax1.plot(x, coef_, label="huber loss, %s" % epsilon)
    
    ax_loss.plot(epsilon_values, losses)
    ax_loss.set_xlabel('epsilon')
    ax_loss.set_ylabel('loss')
    
    ax_score.plot(epsilon_values, scores)
    ax_score.set_xlabel('epsilon')
    ax_score.set_ylabel('scores')
    
    # Fit a ridge regressor to compare it to huber regressor.
    # `normalize=True` in the original was removed in scikit-learn 1.2;
    # with alpha=0.0 this is effectively ordinary least squares.
    ridge = Ridge(fit_intercept=True, alpha=0.0, random_state=0)
    ridge.fit(X, y)
    coef_ridge = ridge.coef_
    coef_ = ridge.coef_ * x + ridge.intercept_
    ax1.plot(x, coef_, 'g-', label="ridge regression")
    ax1.set_title("Comparison of HuberRegressor vs Ridge")
    ax1.set_xlabel("X")
    ax1.set_ylabel("y")
    fig.tight_layout()
    ax1.legend(loc='lower center')
    plt.show()
    
    • The first panel evaluates each fit by the total Huber loss: epsilon=1 gives the smallest loss, and the third panel shows it is also the best-fitting line.
    • The second panel scores the fits with the regressor's R²; with outliers present, R² is not a suitable metric (see the sketch below).
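    Since R² is built from squared errors, a few outliers dominate it. A more robust metric (a minimal sketch reusing X, y, and epsilon_values from the block above; this is not in the original code) ranks the fits by the median residual instead:

    from sklearn.metrics import median_absolute_error
    
    # The median of |y - y_pred| is insensitive to a few extreme residuals.
    for epsilon in epsilon_values:
        huber = HuberRegressor(fit_intercept=True, alpha=0.0, max_iter=100,
                               epsilon=epsilon)
        huber.fit(X, y)
        print(epsilon, median_absolute_error(y, huber.predict(X)))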

    Log-cosh regression (a regression algorithm implemented with gradient descent)

    class SGDRegressor():
        def __init__(self, eta, X, Y, N, regular1=0, regular2=0):
            self.eta = eta                    # learning rate
            self.X = X
            self.Y = Y
            self.N = N                        # number of gradient steps
            self.w = np.zeros(len(X[0]))      # float weights (int dtype would truncate)
            self.w0 = 0.0                     # intercept
            self.m = len(X)                   # number of samples
            self.n = len(X[0])                # number of features
            self.regular2 = regular2          # L2 penalty strength
            self.regular1 = regular1          # L1 penalty strength
        def output_y(self, x):
            return np.dot(x, self.w) + self.w0
        def loss(self, value):
            # log-cosh loss: ~ value**2/2 near 0, ~ |value| far from 0
            return np.log(np.cosh(value))
        def derivative(self, value):
            # d/dv log(cosh(v)) = tanh(v)
            return np.tanh(value)
        def regular_fun(self):
            return self.regular2*np.dot(self.w, self.w) + self.regular1*np.abs(self.w).sum()
        def training(self):
            self.errors = []
            for times in range(self.N):       # range, not Python 2 xrange
                delta_y = self.Y - self.output_y(self.X)
                error = self.loss(delta_y).sum() + self.regular_fun()
                # dL/dw0 = -tanh(delta_y).sum(), so descent adds eta*tanh(...)
                self.w0 += self.eta*self.derivative(delta_y).sum()
                r1 = np.sign(self.w)          # elementwise L1 subgradient
                # The data term pushes w along tanh(delta_y)·X; both penalty
                # gradients are subtracted and scaled by eta (the original
                # added them with the wrong sign and no step size).
                self.w = self.w + self.eta*(np.dot(self.derivative(delta_y), self.X)
                                            - 2.0*self.regular2*self.w
                                            - self.regular1*r1)
                self.errors.append(error)
    
    # Note: with the y values from the Huber example above (around 100+),
    # cosh overflows -- scale y down first (see the remarks below).
    per = SGDRegressor(1e-2, X, y, 1000, regular1=0, regular2=0)
    per.training()
    
    plt.plot(range(per.N), per.errors)
    plt.xlabel('loop')
    plt.ylabel('errors')
    plt.show()
    
    Convergence curve
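    The training step leans entirely on tanh being the derivative of the loss. A quick central-difference check (a minimal sketch, reusing the np import from above) confirms it:

    v = np.linspace(-3, 3, 13)
    h = 1e-6
    # central difference of log(cosh(v)) should match tanh(v)
    numeric = (np.log(np.cosh(v + h)) - np.log(np.cosh(v - h))) / (2*h)
    print(np.allclose(numeric, np.tanh(v)))  # -> True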

    Fitted line

    plt.plot(X, y, 'b.')
    xs = np.array([-1, 3])
    plt.plot(xs, xs*per.w[0] + per.w0)   # fitted line over the data range
    plt.xlabel('X')
    plt.ylabel('y')
    plt.show()
    
    • log-cosh also fits data containing outliers well, and unlike the Huber loss there is no delta parameter to tune
    • the y values must be scaled down first: as soon as y gets even moderately large, cosh(y) heads toward ∞ (a sketch follows the figure below)


    (figure: the cosh(x) curve)
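    A minimal sketch of that scaling caveat (the factor 100 is an illustrative choice, not from the original): shrink y before training, then map the fit back to the original scale.

    # cosh overflows float64 once its argument passes ~710, hence the rescale.
    scale = 100.0                     # hypothetical factor; keep residuals small
    per_s = SGDRegressor(1e-2, X, y / scale, 1000)
    per_s.training()
    w, w0 = per_s.w * scale, per_s.w0 * scale   # undo the scale on the way out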

    Why does this damp the influence of outliers?

    Look at the derivative of the loss function, tanh(x): as x moves away from 0, tanh(x) saturates toward +1 or -1.

    (figure: the tanh(x) curve)

    In the training function above, look at the weight update ΔW: for outliers delta_y is large, yet the derivative values are still only close to +1 or -1, hardly different from those of ordinary points, so W changes smoothly.

    self.w = self.w + self.eta*(np.dot(self.derivative(delta_y), self.X)
                                - 2.0*self.regular2*self.w
                                - self.regular1*r1)
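    A quick numeric illustration (a sketch with arbitrary residual values): the squared-loss gradient grows linearly with the residual, while the log-cosh gradient saturates at ±1, so a single outlier cannot dominate the update.

    residuals = np.array([0.1, 1.0, 10.0, 100.0])
    print(residuals)           # squared loss: gradient d(v**2/2)/dv = v, unbounded
    print(np.tanh(residuals))  # log-cosh: gradient tanh(v), capped at +/-1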
    

    For regression problems with outliers, using the Huber or log-cosh loss function improves accuracy and reduces overfitting.
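    To make the comparison concrete (a minimal sketch; delta=1.35 is chosen to match HuberRegressor's default epsilon), the three losses can be plotted side by side: squared loss grows quadratically in the residual, while Huber and log-cosh grow only linearly.

    import numpy as np
    import matplotlib.pyplot as plt
    
    v = np.linspace(-5, 5, 201)
    delta = 1.35
    squared = 0.5*v**2
    huber_l = np.where(np.abs(v) <= delta,
                       0.5*v**2,
                       delta*np.abs(v) - 0.5*delta**2)
    logcosh = np.log(np.cosh(v))
    plt.plot(v, squared, label='squared')
    plt.plot(v, huber_l, label='huber (delta=1.35)')
    plt.plot(v, logcosh, label='log-cosh')
    plt.xlabel('residual')
    plt.ylabel('loss')
    plt.legend()
    plt.show()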
