
Multi-layer Neural Networks from Scratch (9): Optimization Functions

Author: 忆霜晨 | Published on 2018-07-13 21:30

    Commonly used optimization algorithms include stochastic gradient descent (SGD), SGD with momentum, AdaGrad, RMSProp, and Adam. Adam combines RMSProp with momentum and is a very robust method.
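
    For reference, the Adam update implemented in the code below can be written compactly. With the batch gradient g, decay rates ρ_1 and ρ_2, step size ε, stability constant δ, and time step t (the same notation as in the code comments), a parameter θ is updated as

    $$s \leftarrow \rho_1 s + (1-\rho_1)\,g, \qquad r \leftarrow \rho_2 r + (1-\rho_2)\,g \odot g,$$

    $$\hat{s} = \frac{s}{1-\rho_1^{\,t}}, \qquad \hat{r} = \frac{r}{1-\rho_2^{\,t}}, \qquad \theta \leftarrow \theta - \varepsilon\,\frac{\hat{s}}{\sqrt{\hat{r}} + \delta}.$$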

    1. The optimizer base class

    module mod_BaseGradientOptimizationMethod
    use mod_NNStructure
    implicit none
        
    !---------------------------------------------------
    ! Abstract class: gradient optimization algorithms |
    !---------------------------------------------------
    type, abstract, public :: BaseGradientOptimizationMethod
    
    !||||||||||||    
    contains   !|
    !||||||||||||
    
        !* Set the network structure
        procedure(abs_set_NN), deferred, public :: set_NN
        
        !* Set the iteration step, since the learning rate may depend on time
        procedure(abs_set_iterative_step), deferred, public :: set_iterative_step
        
        !* Update the neural network parameters
        procedure(abs_update_NN), deferred, public :: update_NN
        
        !* Pre-processing
        procedure(abs_pre_process), deferred, public :: pre_process
        
        !* Post-processing
        procedure(abs_post_process), deferred, public :: post_process
       
    
    end type BaseGradientOptimizationMethod
    !===================
        
    
    !--------------------------------
    ! Abstract procedure interfaces |
    !--------------------------------
    abstract interface   
    
        !* Set the network structure
        subroutine abs_set_NN( this, nn_structrue )
        import :: BaseGradientOptimizationMethod
        import :: NNStructure
        implicit none
            class(BaseGradientOptimizationMethod), intent(inout) :: this
            class(NNStructure), target, intent(in) :: nn_structrue
    
        end subroutine
        !====
        
        !* Update the neural network parameters
        subroutine abs_update_NN( this, bp_algorithm )
        import :: BaseGradientOptimizationMethod
        implicit none
            class(BaseGradientOptimizationMethod), intent(inout) :: this
            character(len=*), optional, intent(in) :: bp_algorithm
    
        end subroutine
        !====
        
        !* Set the iteration step
        subroutine abs_set_iterative_step( this, step )
        import :: BaseGradientOptimizationMethod
        implicit none
            class(BaseGradientOptimizationMethod), intent(inout) :: this
            integer, intent(in) :: step
    
        end subroutine
        !====
        
        !* Pre-processing
        subroutine abs_pre_process( this )
        import :: BaseGradientOptimizationMethod
        implicit none
            class(BaseGradientOptimizationMethod), intent(inout) :: this
    
        end subroutine
        !====
        
        !* Post-processing
        subroutine abs_post_process( this )
        import :: BaseGradientOptimizationMethod
        implicit none
            class(BaseGradientOptimizationMethod), intent(inout) :: this
    
        end subroutine
        !====
    
    end interface
    !===================
        
    end module
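
    The value of this abstract type is that the training code only depends on the five deferred bindings. Below is a minimal sketch (not part of the original code) of a driver written against the base class alone; the routine name run_optimizer_steps and the batch counting are hypothetical, and the forward/backward pass that produces the averaged gradients is assumed to be handled by the backpropagation code of this series.

    subroutine run_optimizer_steps( opt, num_batches )
    use mod_BaseGradientOptimizationMethod
    implicit none
        class(BaseGradientOptimizationMethod), intent(inout) :: opt
        integer, intent(in) :: num_batches

        integer :: t

        !* e.g. reset the optimizer's internal state before training
        call opt % pre_process()

        do t = 1, num_batches
            !* ... forward and backward pass over the current batch,
            !*     accumulating the averaged gradients, goes here ...

            !* the step counter drives time-dependent quantities,
            !* e.g. the bias-correction powers in Adam
            call opt % set_iterative_step( t )
            call opt % update_NN()
        end do

        call opt % post_process()

    end subroutine run_optimizer_steps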
    

    2. The Adam method

    !---------------------------------------------------------!
    !* From Paper:                                          *!
    !*   Author: Diederik P. Kingma, Jimmy Lei Ba.           *! 
    !*   Title:  ADAM: A METHOD FOR STOCHASTIC OPTIMIZATION. *!
    !*   Year:   2015.                                       *!
    !---------------------------------------------------------!
    module mod_OptimizationAdam
    use mod_Precision
    use mod_NNStructure
    use mod_BaseGradientOptimizationMethod
    use mod_NNParameter
    use mod_Log
    implicit none
    
    !------------------------------------------
    ! Concrete class: Adam optimization method |
    !------------------------------------------
    type, extends(BaseGradientOptimizationMethod), public :: OptimizationAdam
        !* Inherits from BaseGradientOptimizationMethod and implements its interface
        
        !---------------------------------------------------!
        !* The Adam parameters below follow the notation   *!
        !* of the book "Deep Learning" by Ian Goodfellow    *!
        !* et al.                                           *!
        !---------------------------------------------------!
        !* Step size (learning rate)
        real(PRECISION), private :: eps = 0.001
        !* Decay rates for the moment estimates
        real(PRECISION), private :: rho_1 = 0.9
        real(PRECISION), private :: rho_2 = 0.999
        !* Decay rates raised to the power of the time step
        real(PRECISION), private :: rho_1_t 
        real(PRECISION), private :: rho_2_t
        !* Small constant for numerical stability (avoids dividing by a near-zero value)
        real(PRECISION), private :: delta = 1.E-8       
        !* First-moment estimate s and
        !* second-moment estimate r of the weights
        type (Layer_Weight), dimension(:), pointer, public :: pt_W_ME_s 
        type (Layer_Weight), dimension(:), pointer, public :: pt_W_ME_r
        
        !* First-moment estimate s and
        !* second-moment estimate r of the thresholds
        type (Layer_Threshold), dimension(:), pointer, public :: pt_Theta_ME_r
        type (Layer_Threshold), dimension(:), pointer, public :: pt_Theta_ME_s  
        !---------------------------------------------!
        
        
        
        class(NNStructure), pointer, private :: my_NN
        
        !* Whether the NN has been set
        logical, private :: is_set_NN_done = .false.
            
        !* Whether memory has been allocated
        logical, private :: is_allocate_done = .false.
        
        !* Number of layers, excluding the input layer
        integer, private :: layers_count
        
        ! Array of node counts per layer:
        !     its size is the total number of layers (including the input layer)
        integer, dimension(:), allocatable, private :: layers_node_count
        
    !||||||||||||    
    contains   !|
    !||||||||||||
    
        !* Set the network structure
        procedure, public :: set_NN => m_set_NN
        
        !* Called before training to
        !* change the default Adam parameters
        procedure, public :: set_Adam_parameter => m_set_Adam_parameter 
        
        !* Must be called once per batch iteration
        procedure, public :: set_iterative_step => m_set_step
        
        !* Called after each batch iteration to
        !* update the neural network parameters
        procedure, public :: update_NN => m_update_NN
        !* Reset the first- and second-moment estimates of weights and thresholds to zero
        procedure, public :: set_ME_zero => m_set_ME_zero
        
        !* Pre-processing
        procedure, public :: pre_process => m_pre_process
        
        !* Post-processing
        procedure, public :: post_process => m_post_process
        
        
        procedure, private :: allocate_pointer   => m_allocate_pointer
        procedure, private :: allocate_memory    => m_allocate_memory
        procedure, private :: deallocate_pointer => m_deallocate_pointer
        procedure, private :: deallocate_memory  => m_deallocate_memory
        
        final :: OptimizationAdam_clean_space
        
    end type OptimizationAdam
    !===================
        
        !-------------------------
        private :: m_set_NN
        private :: m_update_NN
        private :: m_set_Adam_parameter
        private :: m_set_step
        
        private :: m_set_ME_zero
        
        private :: m_pre_process
        private :: m_post_process
        
        private :: m_allocate_pointer
        private :: m_allocate_memory
        private :: m_deallocate_pointer
        private :: m_deallocate_memory
        !-------------------------
        
    !||||||||||||    
    contains   !|
    !|||||||||||| 
        
        !* Update the neural network parameters
        subroutine m_update_NN( this, bp_algorithm )
        implicit none
            class(OptimizationAdam), intent(inout) :: this
            character(len=*), optional, intent(in) :: bp_algorithm
    
            integer :: layer_index, l_count
            logical :: use_standard
            
            l_count = this % layers_count
            
            !* Fortran does not short-circuit logical expressions, so the optional
            !* argument must be checked with PRESENT before it is referenced.
            use_standard = .false.
            if ( PRESENT(bp_algorithm) ) then
                if ( TRIM(ADJUSTL(bp_algorithm)) == 'standard' ) use_standard = .true.
            end if
            
            !* Assumption: one batch has completed a full backward pass and the
            !* averaged gradients avg_dW and avg_dTheta are available.
            do layer_index=1, l_count
                associate (                                                           &              
                    eps        => this % eps,                                         &
                    rho_1      => this % rho_1,                                       &
                    rho_2      => this % rho_2,                                       &
                    rho_1_t    => this % rho_1_t,                                     &
                    rho_2_t    => this % rho_2_t,                                     &
                    delta      => this % delta,                                       &
                    W_S        => this % pt_W_ME_s( layer_index ) % W,                &
                    W_R        => this % pt_W_ME_r( layer_index ) % W,                &
                    Theta_S    => this % pt_Theta_ME_s( layer_index ) % Theta,        &
                    Theta_R    => this % pt_Theta_ME_r( layer_index ) % Theta,        &
                    W          => this % my_NN % pt_W(layer_index) % W,               &
                    Theta      => this % my_NN % pt_Theta(layer_index) % Theta,       &
                    dW         => this % my_NN % pt_Layer( layer_index ) % dW,        &
                    dTheta     => this % my_NN % pt_Layer( layer_index ) % dTheta,    &
                    avg_dW     => this % my_NN % pt_Layer( layer_index ) % avg_dW,    &               
                    avg_dTheta => this % my_NN % pt_Layer( layer_index ) % avg_dTheta &
                )
            
                if ( use_standard ) then
                    !* s <-- ρ_1 * s + (1 - ρ_1) * g
                    !* r <-- ρ_2 * r + (1 - ρ_2) * g ⊙ g
                    
                    W_S = rho_1 * W_S + (1 - rho_1) * dW
                    W_R = rho_2 * W_R + (1 - rho_2) * dW * dW 
                
                    Theta_S = rho_1 * Theta_S + (1 - rho_1) * dTheta 
                    Theta_R = rho_2 * Theta_R + (1 - rho_2) * dTheta * dTheta
                
                    !* △θ = -ε * s_hat / (√(r_hat) + δ)
                    !* s_hat = s / (1 - ρ^t_1), r_hat = r / (1 - ρ^t_2)
                    dW = -eps * (W_S / (1 - rho_1_t)) / (SQRT(W_R / (1 - rho_2_t)) + delta)
                    W = W + dW
                
                    dTheta = -eps * (Theta_S / (1 - rho_1_t)) / &
                        (SQRT(Theta_R / (1 - rho_2_t)) + delta)
                    Theta = Theta + dTheta
                
                else
                    !* Default: update the weights and thresholds once per batch
                
                    !* s <-- ρ_1 * s + (1 - ρ_1) * g
                    !* r <-- ρ_2 * r + (1 - ρ_2) * g ⊙ g
                    !avg_dW     = avg_dW     + 1.E-4 * W
                    !avg_dTheta = avg_dTheta + 1.E-4 * Theta
                    
                    W_S = rho_1 * W_S + (1 - rho_1) * avg_dW
                    W_R = rho_2 * W_R + (1 - rho_2) * avg_dW * avg_dW 
                
                    Theta_S = rho_1 * Theta_S + (1 - rho_1) * avg_dTheta 
                    Theta_R = rho_2 * Theta_R + (1 - rho_2) * avg_dTheta * avg_dTheta
                
                    !* △θ = -ε * s_hat / (√(r_hat) + δ)
                    !* s_hat = s / (1 - ρ^t_1), r_hat = r / (1 - ρ^t_2)
                    dW = -eps * (W_S / (1 - rho_1_t)) / (SQRT(W_R / (1 - rho_2_t)) + delta)
                    W = W + dW
                
                    dTheta = -eps * (Theta_S / (1 - rho_1_t)) / &
                        (SQRT(Theta_R / (1 - rho_2_t)) + delta)
                    Theta = Theta + dTheta
                
                    avg_dW = 0
                    avg_dTheta = 0
                end if
        
                end associate
            end do 
            
            return
        end subroutine m_update_NN
        !====
        
        !* Change the default Adam parameters.
        !* To set only some of the later arguments, pass them as keyword arguments.
        subroutine m_set_Adam_parameter( this, eps, rho_1, rho_2, delta )
        implicit none
            class(OptimizationAdam), intent(inout) :: this
            real(PRECISION), optional, intent(in) :: eps, rho_1, rho_2, delta
    
            if (PRESENT(eps))  this % eps = eps
            
            if (PRESENT(rho_1))  this % rho_1 = rho_1
    
            if (PRESENT(rho_2))  this % rho_2 = rho_2
            
            if (PRESENT(delta))  this % delta = delta
            
            return
        end subroutine m_set_Adam_parameter
        !====
        
        !* Set the network structure
        subroutine m_set_NN( this, nn_structrue )
        implicit none
            class(OptimizationAdam), intent(inout) :: this
            class(NNStructure), target, intent(in) :: nn_structrue
    
            this % my_NN => nn_structrue
            
            this % is_set_NN_done = .true.
            
            call this % allocate_pointer()
            call this % allocate_memory()
            
            return
        end subroutine m_set_NN
        !====
        
        !* Set the iteration step and compute the decay-rate powers
        subroutine m_set_step( this, step )
        implicit none
            class(OptimizationAdam), intent(inout) :: this
            integer, intent(in) :: step 
    
            this % rho_1_t = (this % rho_1)**step
            this % rho_2_t = (this % rho_2)**step
            
            return
        end subroutine m_set_step
        !====
        
        !* Pre-processing
        subroutine m_pre_process( this )
        implicit none
            class(OptimizationAdam), intent(inout) :: this
    
            call this % set_ME_zero()
            
            return
        end subroutine m_pre_process
        !====
        
        !* Post-processing
        subroutine m_post_process( this )
        implicit none
            class(OptimizationAdam), intent(inout) :: this
    
            continue
            
            return
        end subroutine m_post_process
        !====
        
        !* Reset the first- and second-moment estimates of weights and thresholds to zero
        subroutine m_set_ME_zero( this )
        implicit none
            class(OptimizationAdam), intent(inout) :: this
    
            integer :: layer_index, l_count
            
            l_count = this % layers_count
            
            do layer_index=1, l_count
                this % pt_W_ME_s( layer_index ) % W = 0
                this % pt_W_ME_r( layer_index ) % W = 0
                this % pt_Theta_ME_s( layer_index ) % Theta = 0
                this % pt_Theta_ME_r( layer_index ) % Theta = 0
            end do 
            
            return
        end subroutine m_set_ME_zero
        !====
        
        
        !* Allocate the pointer arrays contained in OptimizationAdam
        subroutine m_allocate_pointer( this )
        implicit none
            class(OptimizationAdam), intent(inout) :: this
            
            integer :: l_count
            
            if ( .not. this % is_set_NN_done ) then
                call LogErr("mod_OptimizationAdam: SUBROUTINE m_allocate_pointer, &
                    is_set_NN_done is false.")          
                stop
            end if
            
            l_count = this % my_NN % layers_count
            this % layers_count = l_count
        
            allocate( this % pt_W_ME_s(l_count) )
            allocate( this % pt_W_ME_r(l_count) )
            allocate( this % pt_Theta_ME_s(l_count) )
            allocate( this % pt_Theta_ME_r(l_count) )
            
            allocate( this % layers_node_count(0:l_count) )
            
            this % layers_node_count = this % my_NN % layers_node_count
        
            call LogDebug("OptimizationAdam: SUBROUTINE m_allocate_pointer")
            
            return
        end subroutine m_allocate_pointer
        !====
        
        !* Allocate the memory needed by each layer
        subroutine m_allocate_memory( this )
        implicit none
            class(OptimizationAdam), intent(inout) :: this
            
            integer :: M, N, layer_index, l_count
            
            l_count = this % layers_count
            
            do layer_index=1, l_count
            
                M = this % layers_node_count(layer_index - 1)
                N = this % layers_node_count(layer_index)
                          
                !* TODO: use Fortran 2003 features to detect allocation errors
                !* Note: the matrix size is N×M, not M×N.
                allocate( this % pt_W_ME_s( layer_index ) % W(N,M) )
                allocate( this % pt_W_ME_r( layer_index ) % W(N,M) )
                allocate( this % pt_Theta_ME_s( layer_index ) % Theta(N) )
                allocate( this % pt_Theta_ME_r( layer_index ) % Theta(N) )
                
            end do
        
            this % is_allocate_done = .true.
        
            call LogDebug("OptimizationAdam: SUBROUTINE m_allocate_memory")
        
            return
        end subroutine m_allocate_memory
        !====
        
        !* Deallocate the pointer arrays
        subroutine m_deallocate_pointer( this )
        implicit none
            class(OptimizationAdam), intent(inout) :: this
            
            deallocate( this % layers_node_count )
            deallocate( this % pt_W_ME_s         )
            deallocate( this % pt_W_ME_r         )
            deallocate( this % pt_Theta_ME_s     )
            deallocate( this % pt_Theta_ME_r     )
        
            return
        end subroutine m_deallocate_pointer
        !====
        
        !* Deallocate the per-layer memory
        subroutine m_deallocate_memory( this )
        implicit none
            class(OptimizationAdam), intent(inout)  :: this
            
            integer :: layer_index
            
            do layer_index=1, this % layers_count
                
                deallocate( this % pt_W_ME_s( layer_index ) % W )
                deallocate( this % pt_W_ME_r( layer_index ) % W )
                deallocate( this % pt_Theta_ME_s( layer_index ) % Theta )
                deallocate( this % pt_Theta_ME_r( layer_index ) % Theta )
                
            end do
            
            call this % deallocate_pointer()
            
            this % is_allocate_done = .false.
        
            return
        end subroutine m_deallocate_memory 
        !====
        
        !* Finalizer: clean up the allocated memory
        subroutine OptimizationAdam_clean_space( this )
        implicit none
            type(OptimizationAdam), intent(inout) :: this
        
            !* Guard against finalizing an object whose work space was never allocated
            if ( this % is_allocate_done )  call this % deallocate_memory()
            
            call LogInfo("OptimizationAdam: SUBROUTINE clean_space.")
            
            return
        end subroutine OptimizationAdam_clean_space
        !====
        
        
    end module
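
    To tie the pieces together, here is a minimal setup sketch (not part of the original code; the variable names are hypothetical, and constructing the NNStructure object is assumed to be covered by the other parts of this series). After this setup, each batch iteration calls set_iterative_step with the cumulative batch counter t = 1, 2, 3, ... followed by update_NN, so that rho_1_t = rho_1**t and rho_2_t = rho_2**t give the bias correction of the Adam paper.

    subroutine setup_adam_example( nn, adam )
    use mod_Precision
    use mod_NNStructure
    use mod_OptimizationAdam
    implicit none
        class(NNStructure), target, intent(in) :: nn
        type(OptimizationAdam), intent(inout) :: adam

        !* attach the network; this also allocates the moment-estimate arrays
        call adam % set_NN( nn )

        !* optionally override the defaults; later arguments must be
        !* passed by keyword when set individually
        call adam % set_Adam_parameter( eps=0.0005_PRECISION, delta=1.0E-7_PRECISION )

        !* zero the first- and second-moment estimates before training
        call adam % pre_process()

    end subroutine setup_adam_example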
    

    Appendix

    Multi-layer Neural Networks from Scratch (1): Reading the MNIST dataset with Fortran
    Multi-layer Neural Networks from Scratch (2): Randomly generating "double moon" classification data with Fortran
    Multi-layer Neural Networks from Scratch (3): Detailed derivation of the BP neural network formulas
    Multi-layer Neural Networks from Scratch (4): The matrix form of the multi-layer BP network
    Multi-layer Neural Networks from Scratch (5): Defining the data structures
    Multi-layer Neural Networks from Scratch (6): Activation functions
    Multi-layer Neural Networks from Scratch (7): Loss functions
    Multi-layer Neural Networks from Scratch (8): Why cross-entropy is used as the loss function in classification
    Multi-layer Neural Networks from Scratch (9): Optimization functions
    Multi-layer Neural Networks from Scratch (10): Parameter initialization
    Multi-layer Neural Networks from Scratch (11): Implementing the training class
    Multi-layer Neural Networks from Scratch (12): Implementing the example class
    Multi-layer Neural Networks from Scratch (13): A brief discussion of parallel computing
