Vanishing/Exploding Gradients in Deep Learning

Author: echo_ye4 | Published 2019-12-24 16:04

    Consider an L-layer network, where \delta denotes the activation function. In the forward pass, for layer l:
    z_l = w_la_{l-1} + b_l
    a_l = \delta (z_l)
    In the backward pass, for layer l:
    dz_l = w_{l+1}dz_{l+1} * \delta'(z_l)
    dw_l = a_{l-1}dz_l
    db_l = dz_l
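
    To make the per-layer formulas concrete, here is a minimal scalar sketch (one unit per layer, toy values chosen purely for illustration, ReLU as the activation \delta; the variable names exist only in this sketch):

    # minimal scalar sketch of the per-layer formulas (one unit per layer, toy values)
    a_prev, w_l, b_l = 0.5, 0.8, 0.0             # a_{l-1}, w_l, b_l
    
    # forward pass: z_l = w_l a_{l-1} + b_l,  a_l = delta(z_l)
    z_l = w_l * a_prev + b_l
    a_l = max(z_l, 0.0)                          # ReLU activation
    
    # backward pass, given w_{l+1} and dz_{l+1} from the layer above
    w_next, dz_next = 0.8, 0.1
    relu_prime = 1.0 if z_l > 0 else 0.0         # delta'(z_l)
    dz_l = w_next * dz_next * relu_prime         # dz_l = w_{l+1} dz_{l+1} * delta'(z_l)
    dw_l = a_prev * dz_l                         # dw_l = a_{l-1} dz_l
    db_l = dz_l                                  # db_l = dz_l
    print(dz_l, dw_l, db_l)                      # ≈ 0.08 0.04 0.08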

    ReLU activation

    With ReLU, assume the pre-activations z_j stay positive (so ReLU acts as the identity and \delta'(z_j) = 1) and the biases are close to 0; then, approximately:
    a_{l-1} \approx w_1w_2...w_{l-1}x
    dz_l \approx w_Lw_{L-1}...w_{l+1}dz_L
    dw_l \approx w_1w_2...w_{l-1}w_{l+1}...w_Lxdz_L
    db_l \approx w_{l+1}...w_Ldz_L
    If |w_j|<1, then as L increases, dz shrinks exponentially the closer the layer is to the input, and db shrinks exponentially with it; the magnitude of dw does not vary exponentially from layer to layer, but as L increases, dw at every layer shrinks exponentially.
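
    A quick numeric check of this decay (purely illustrative; assume every hidden-layer weight equals 0.8, as in the demos below):

    # with |w_j| = 0.8 < 1, dz at the first layer scales roughly as w_j^(L-1) relative to dz_L
    w_j = 0.8
    for L in (5, 10, 20, 40):
        print(L, w_j ** (L - 1))    # ≈ 0.41, 0.13, 0.014, 0.00017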

    Sigmoid activation

    With a sigmoid activation, the derivative factors \delta'(z_j) no longer drop out:
    a_{l-1} = \delta(w_{l-1} * ...\delta(w_2 * \delta(w_1x + b_1) + b_2) + b_{l-1})
    dz_l = \delta'(z_l)w_{l+1}\delta'(z_{l+1})w_{l+2}...\delta'(z_{L-1})w_Ldz_L
    dw_l = a_{l-1}dz_l
    db_l = dz_l
    Since a_{l-1} lies between 0 and 1, it contributes only a bounded factor and can be neglected. If |w_j\delta'(z_j)|<1, then as L increases, dz shrinks exponentially the closer the layer is to the input, and dw and db shrink exponentially with it.
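
    The logistic derivative \delta'(z) = \delta(z)(1 - \delta(z)) peaks at 0.25 (at z = 0), so each backward step is scaled by at most 0.25|w_j|. A rough sketch of the resulting decay (illustrative values only, again with w_j = 0.8):

    import numpy as np
    
    logistic = lambda z: 1 / (1 + np.exp(-z))
    logistic_grad = lambda z: logistic(z) * (1 - logistic(z))
    
    print(logistic_grad(0.0))             # 0.25, the maximum of the sigmoid derivative
    per_layer = 0.8 * 0.25                # upper bound on |w_j * delta'(z_j)|
    for L in (5, 10, 20):
        print(L, per_layer ** (L - 1))    # ≈ 1.6e-03, 5.1e-07, 5.2e-14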

    ReLU demo

    Assume a 10-layer network with 2 nodes per layer; the output layer uses sigmoid and every other layer uses ReLU.

    import numpy as np
    from numpy.linalg import cholesky
    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D
    
    def logistic(z):
        return 1 / (1 + np.exp(-z))
    
    def logistic_gradient(z):
        a = logistic(z)
        return a * (1 - a)
    
    def relu(z):
        return z * (z > 0)
    
    def relu_gradient(z):
        return np.ones(z.shape) * (z > 0)
    
    sampleNo = 10
    mu = np.array([[2, 3]])
    Sigma = np.array([[1, 0.5], [1.5, 3]])
    R = cholesky(Sigma)
    s = np.dot(np.random.randn(sampleNo, 2), R) + mu
    
    mu2 = np.array([[7, 6]])
    t = np.dot(np.random.randn(sampleNo, 2), R) + mu2
    
    plt.plot(s[:,0],s[:,1],'+')
    plt.plot(t[:,0],t[:,1],'*')
    plt.xlabel("x")
    plt.ylabel("y")
    plt.show()
    
    # build the training data
    x = np.concatenate((s, t)).T
    y1 = np.zeros(sampleNo).reshape(1,sampleNo)
    y2 = np.ones(sampleNo).reshape(1,sampleNo)
    y = np.concatenate((y1, y2), axis=1)
    print(x.shape, y.shape)
    
    # initialize the network
    layer = 10
    # use weights slightly smaller than 1
    w = {}
    for i in range(1, layer):
        w_tmp = np.array([0.8, 0, 0, 0.8]).reshape(2,2)
        w[i] = w_tmp
    w_out = np.ones(2).reshape(2,1)
    w[layer] = w_out
    # initialize b to 0
    b = {}
    for i in range(1, layer):
        b_tmp = np.zeros(2).reshape(1,2)
        b[i] = b_tmp
    b_out = np.zeros(1).reshape(1,1)
    b[layer] = b_out
    # initialize the activators
    act = {}
    for i in range(1, layer):
        act[i] = relu
    act[layer] = logistic
    
    iter = 1
    max_iter = 2
    m = sampleNo * 2
    alpha = 0.01
    while iter < max_iter:
        # forward pass
        a = x
        a_dict = {}
        z_dict = {}
        a_dict[0] = a
        for i in range(1, layer+1):
            z = np.dot(w[i].T, a) + b[i].T
            a = act[i](z)
            a_dict[i] = a
            z_dict[i] = z
        # backward pass
        dz = {}
        dw = {}
        db = {}  
        for i in range(layer, 0, -1):
            if i == layer:
                dz[i] = a_dict[i] - y
            else:
                dz[i] = np.dot(w[i+1], dz[i+1]) * relu_gradient(z_dict[i])
            dw[i] = np.dot(a_dict[i - 1], dz[i].T) / m
            db[i] = np.sum(dz[i].T, axis = 0) / m
        # update parameters
        for i in range(1, layer+1):
            w[i] = w[i] - alpha * dw[i]
            b[i] = b[i] - alpha * db[i]
    
        iter += 1
    
    print("backward pass")
    for i in range(layer, 0, -1):
        print("layer %d" % i)
        print("dz", dz[i][:,0])
        print("dw", dw[i])
        print("db", db[i])
    

    backward pass
    layer 10
    dz [0.6525866]
    dw [[0.00245394]
    [0.07067444]]
    db [0.25647431]
    layer 9
    dz [0.6525866 0.6525866]
    dw [[0.00306743 0.00306743]
    [0.08834304 0.08834304]]
    db [0.25647431 0.25647431]
    layer 8
    dz [0.52206928 0.52206928]
    dw [[0.00306743 0.00306743]
    [0.08834304 0.08834304]]
    db [0.20517945 0.20517945]
    layer 7
    dz [0.41765542 0.41765542]
    dw [[0.00306743 0.00306743]
    [0.08834304 0.08834304]]
    db [0.16414356 0.16414356]
    layer 6
    dz [0.33412434 0.33412434]
    dw [[0.00306743 0.00306743]
    [0.08834304 0.08834304]]
    db [0.13131485 0.13131485]
    layer 5
    dz [0.26729947 0.26729947]
    dw [[0.00306743 0.00306743]
    [0.08834304 0.08834304]]
    db [0.10505188 0.10505188]
    layer 4
    dz [0.21383958 0.21383958]
    dw [[0.00306743 0.00306743]
    [0.08834304 0.08834304]]
    db [0.0840415 0.0840415]
    layer 3
    dz [0.17107166 0.17107166]
    dw [[0.00306743 0.00306743]
    [0.08834304 0.08834304]]
    db [0.0672332 0.0672332]
    layer 2
    dz [0.13685733 0.13685733]
    dw [[0.00306743 0.00306743]
    [0.08834304 0.08834304]]
    db [0.05378656 0.05378656]
    layer 1
    dz [0.10948586 0.10948586]
    dw [[0.00306743 0.00306743]
    [0.08834304 0.08834304]]
    db [0.04302925 0.04302925]
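
    As a quick sanity check on this output (a small addition, reusing the dz and layer variables from the demo above): with the hidden-layer weights at 0.8 and the ReLU derivatives equal to 1, dz should shrink by a factor of 0.8 per hidden layer:

    for i in range(layer - 2, 0, -1):
        # ratio of dz between adjacent hidden layers: ≈ 0.8, the hidden-layer weight
        print(i, dz[i][0, 0] / dz[i + 1][0, 0])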

    Sigmoid demo

    Assume a 10-layer network with 2 nodes per layer; every layer uses sigmoid.

    # initialize the network
    layer = 10
    # use weights slightly smaller than 1
    w = {}
    for i in range(1, layer):
        w_tmp = np.array([0.8, 0, 0, 0.8]).reshape(2,2)
        w[i] = w_tmp
    w_out = np.ones(2).reshape(2,1)
    w[layer] = w_out
    # initialize b to 0
    b = {}
    for i in range(1, layer):
        b_tmp = np.zeros(2).reshape(1,2)
        b[i] = b_tmp
    b_out = np.zeros(1).reshape(1,1)
    b[layer] = b_out
    # initialize the activators
    act = {}
    for i in range(1, layer+1):
        act[i] = logistic
    
    iter = 1
    max_iter = 2
    m = sampleNo * 2
    alpha = 0.01
    while iter < max_iter:
        # forward pass
        a = x
        a_dict = {}
        z_dict = {}
        a_dict[0] = a
        for i in range(1, layer+1):
            z = np.dot(w[i].T, a) + b[i].T
            a = act[i](z)
            a_dict[i] = a
            z_dict[i] = z
        # backward pass
        dz = {}
        dw = {}
        db = {}  
        for i in range(layer, 0, -1):
            if i == layer:
                dz[i] = a_dict[i] - y
            else:
                dz[i] = np.dot(w[i+1], dz[i+1]) * logistic_gradient(z_dict[i])   # all layers here use sigmoid, so use the logistic derivative
            dw[i] = np.dot(a_dict[i - 1], dz[i].T) / m
            db[i] = np.sum(dz[i].T, axis = 0) / m
        # update parameters
        for i in range(1, layer+1):
            w[i] = w[i] - alpha * dw[i]
            b[i] = b[i] - alpha * db[i]
    
        iter += 1
    
    print("backward pass")
    for i in range(layer, 0, -1):
        print("layer %d" % i)
        print("dz", dz[i][:,0])
        print("dw", dw[i])
        print("db", db[i])
    

    backward pass
    layer 10
    dz [0.77621477]
    dw [[0.17176994]
    [0.17177003]]
    db [0.27621479]
    layer 9
    dz [0.77621477 0.77621477]
    dw [[0.17176998 0.17176998]
    [0.17177045 0.17177045]]
    db [0.27621479 0.27621479]
    layer 8
    dz [0.62097182 0.62097182]
    dw [[0.13741614 0.13741614]
    [0.13741813 0.13741813]]
    db [0.22097183 0.22097183]
    layer 7
    dz [0.49677746 0.49677746]
    dw [[0.10993356 0.10993356]
    [0.10994206 0.10994206]]
    db [0.17677747 0.17677747]
    layer 6
    dz [0.39742196 0.39742196]
    dw [[0.08794963 0.08794963]
    [0.08798579 0.08798579]]
    db [0.14142197 0.14142197]
    layer 5
    dz [0.31793757 0.31793757]
    dw [[0.07037154 0.07037154]
    [0.07052532 0.07052532]]
    db [0.11313758 0.11313758]
    layer 4
    dz [0.25435006 0.25435006]
    dw [[0.05634747 0.05634747]
    [0.05700205 0.05700205]]
    db [0.09051006 0.09051006]
    layer 3
    dz [0.20348005 0.20348005]
    dw [[0.04528946 0.04528946]
    [0.04808773 0.04808773]]
    db [0.07240805 0.07240805]
    layer 2
    dz [0.16278404 0.16278404]
    dw [[0.0370514 0.0370514 ]
    [0.04934335 0.04934335]]
    db [0.05792644 0.05792644]
    layer 1
    dz [0.13022723 0.13022723]
    dw [[-0.05082364 -0.05082364]
    [ 0.06565071 0.06565071]]
    db [0.04634115 0.04634115]

    Binary classification

    import numpy as np
    from numpy.linalg import cholesky
    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D
    
    sampleNo = 1000
    mu = np.array([[2, 3]])
    Sigma = np.array([[1, 0.5], [1.5, 3]])
    R = cholesky(Sigma)
    s = np.dot(np.random.randn(sampleNo, 2), R) + mu
    
    mu2 = np.array([[17, 10]])
    t = np.dot(np.random.randn(sampleNo, 2), R) + mu2
    
    plt.plot(s[:,0],s[:,1],'+')
    plt.plot(t[:,0],t[:,1],'*')
    plt.xlabel("x")
    plt.ylabel("y")
    plt.show()
    
    # build the training data
    x = np.concatenate((s, t)).T
    y1 = np.zeros(sampleNo).reshape(1,sampleNo)
    y2 = np.ones(sampleNo).reshape(1,sampleNo)
    y = np.concatenate((y1, y2), axis=1)
    
    def logistic(z):
        return 1 / (1 + np.exp(-z))
    
    def logistic_gradient(z):
        a = logistic(z)
        return a * (1 - a)
    
    def relu(z):
        return z * (z > 0)
    
    def relu_gradient(z):
        return np.ones(z.shape) * (z > 0)
    
    # initialize the network
    # use weights slightly larger than 1
    w = {}
    for i in range(1, 10):
        w_tmp = np.array([1.1, 0, 0, 1.1]).reshape(2,2)
        w[i] = w_tmp
    w_out = np.ones(2).reshape(2,1)
    w[10] = w_out
    # initialize b to 0
    b = {}
    for i in range(1, 10):
        b_tmp = np.zeros(2).reshape(1,2)
        b[i] = b_tmp
    b_out = np.zeros(1).reshape(1,1)
    b[10] = b_out
    # initialize the activators
    act = {}
    for i in range(1, 10):
        act[i] = relu
    act[10] = logistic
    
    # training
    iter = 1
    max_iter = 50000
    m = sampleNo * 2
    alpha = 0.01
    while iter < max_iter:
        # forward pass
        a = x
        a_dict = {}
        z_dict = {}
        a_dict[0] = a
        for i in range(1, 11):
            z = np.dot(w[i].T, a) + b[i].T
            a = act[i](z)
            a_dict[i] = a
            z_dict[i] = z
        # backward pass
        dz = {}
        dw = {}
        db = {}  
        for i in range(10, 0, -1):
            if i == 10:
                dz[i] = a_dict[i] - y
            else:
                dz[i] = np.dot(w[i+1], dz[i+1]) * relu_gradient(z_dict[i])
            dw[i] = np.dot(a_dict[i - 1], dz[i].T) / m
            db[i] = np.sum(dz[i].T, axis = 0) / m
        # update parameters
        for i in range(1, 11):
            w[i] = w[i] - alpha * dw[i]
            b[i] = b[i] - alpha * db[i]
    
        iter += 1
    
    # plot the result
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')
    ax.scatter(s[:,0], s[:,1], y[y==0])
    ax.scatter(t[:,0], t[:,1], y[y==1])
    
    def predict(x):
        a = x
        for i in range(1, 11):
            z = np.dot(w[i].T, a) + b[i].T
            a = act[i](z)
        return a
    
    x1_tmp = x2_tmp = np.linspace(-10, 30, 100)
    x1_tmp, x2_tmp = np.meshgrid(x1_tmp, x2_tmp)
    x_tmp = np.concatenate((x1_tmp.reshape(1, 10000), x2_tmp.reshape(1, 10000)))
    y_tmp = predict(x_tmp)
    y_tmp = y_tmp.reshape(100, 100)
    ax.plot_surface(x1_tmp, x2_tmp, y_tmp)
    ax.view_init(elev=30,azim=-120)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    plt.show()
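
    As a quick check of the trained network (a small addition, not in the original post), the training accuracy can be computed with the predict helper above:

    # training accuracy on the generated data (illustrative addition)
    pred = (predict(x) > 0.5).astype(float)
    print("training accuracy:", np.mean(pred == y))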
    

    For comparison, training a single-layer logistic regression model with the same hyperparameters gives the result below:


    (figure: training result of the single-layer logistic regression)
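
    For reference, a minimal sketch of such a single-layer logistic regression (not the author's original code; it reuses x, y, m, alpha, max_iter and the logistic helper defined above, and the names w_lr, b_lr exist only in this sketch) could look like this:

    # single-layer logistic regression trained with the same hyperparameters (sketch)
    w_lr = np.zeros((2, 1))
    b_lr = 0.0
    for _ in range(max_iter):
        z_lr = np.dot(w_lr.T, x) + b_lr    # (1, m) pre-activations
        a_lr = logistic(z_lr)
        dz_lr = a_lr - y                   # gradient of the cross-entropy loss w.r.t. z
        w_lr -= alpha * np.dot(x, dz_lr.T) / m
        b_lr -= alpha * np.sum(dz_lr) / m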
