核心代码结构
本次实验采用 Java 语言实现:NeuralNetworkModel 为神经网络模型类,Layer 为神经网络的层类;激活函数接口为 ActivationFunction,本实例使用 Sigmoid 作为激活函数;代价函数接口为 CostFunction,本实例使用均方误差 MSECostFunction 作为代价函数。采用的梯度下降算法为 MSGD,即小批量随机梯度下降法(Mini-batch stochastic gradient descent)。
image.png
BP神经网络的实现步骤
初始化
- 初始化神经网络(包括层数、激活函数等)
// Network topology: the node count of every layer.
this.nodeNumbers = nodeNumbers;
// Cost function.
this.costFun = costFun;
// Activation function.
this.activationFun = activationFun;
// Learning rate.
this.learningRate = learningRate;
- 初始化各层结构(含各层节点数、权重和偏置)
// Create the network layers from the per-layer node counts. The first entry of nodeNumbers is the
// input-layer size, so one Layer is built for each pair of adjacent entries.
final int layerCount = nodeNumbers.size() - 1;
this.layers = new Layer[layerCount];
for (int idx = 0; idx < layerCount; idx++) {
    // Node count of the preceding layer.
    final int fanIn = nodeNumbers.get(idx);
    // Node count of the layer being created.
    final int width = nodeNumbers.get(idx + 1);
    this.layers[idx] = new Layer(width, fanIn, activationFun);
}
// Per-layer error terms: the first index selects the layer, the second the node inside it.
this.deltas = new double[layerCount][];
for (int idx = 0; idx < layerCount; idx++) {
    this.deltas[idx] = new double[this.layers[idx].getCurrNodeNum()];
}
按周期循环训练、单个周期按MSGD进行批量拆分和训练
/**
 * Trains the network with mini-batch stochastic gradient descent (MSGD).
 *
 * <p>In every epoch the sample indices are shuffled and the training set is cut into
 * {@code xArray.length / batchSize} consecutive mini-batches (a trailing remainder smaller than
 * {@code batchSize} is not used in that epoch). Each mini-batch is handed to
 * {@link #BGD(double[][], double[][], int, boolean)} for a single iteration with logging off.
 * After the epoch, the average cost over the first {@code batchSize} samples of the shuffled
 * order is printed to standard output.
 *
 * @param xArray    training inputs, one row per sample
 * @param yArray    training labels; row {@code i} is the label of {@code xArray[i]}
 * @param batchSize number of samples in each mini-batch
 * @param epoch     number of passes over the training set
 */
public void miniBatchSGD(double[][] xArray, double[][] yArray, int batchSize, int epoch) {
    int totalSamplesNum = xArray.length;
    // Reused holders for the rows of the current mini-batch.
    double[][] batchXArray = new double[batchSize][];
    double[][] batchYArray = new double[batchSize][];
    for (int i = 0; i < epoch; i++) {
        System.out.format("[开始第%d次迭代,共%d次迭代]" + df.format(new Date()) + " \n", i, epoch);
        ProcessBar.stageHolder.setCurrentStep(i);

        // Fresh random visiting order of the sample indices for this epoch.
        List<Integer> posList = new ArrayList<>(totalSamplesNum);
        for (int j = 0; j < totalSamplesNum; j++) {
            posList.add(j);
        }
        Collections.shuffle(posList);

        // Train on each full mini-batch in turn.
        for (int j = 0; j < totalSamplesNum / batchSize; j++) {
            for (int k = 0; k < batchSize; k++) {
                int pos = posList.get(j * batchSize + k);
                batchXArray[k] = xArray[pos];
                batchYArray[k] = yArray[pos];
            }
            BGD(batchXArray, batchYArray, 1, false);
        }

        // Report the average cost over the first mini-batch of the shuffled order.
        // The input and its label are both looked up through the same shuffled index, so every
        // prediction is scored against its own label. The count is capped at the training-set
        // size so a set smaller than batchSize does not index past the end of posList.
        int costSamples = Math.min(batchSize, totalSamplesNum);
        double cost = 0;
        for (int j = 0; j < costSamples; j++) {
            int pos = posList.get(j);
            double[][] x = MyUtils.toTwoDimension(xArray[pos]);
            double[][] y = MyUtils.toTwoDimension(yArray[pos]);
            feedforward(x);
            cost += computeCost(y);
        }
        System.out.format("[cost of %d-th epoch]: " + (cost / costSamples) + " " + df.format(new Date()) + "\n", i);
    }
}
对每个批量中每个样本进行处理
/**
 * Trains with batch gradient descent: every iteration runs all given samples through a forward
 * pass, cost computation and backward pass, then applies one update to all biases and weights.
 *
 * @param xArray      inputs, one row per sample
 * @param yArray      labels, one row per sample
 * @param epoch       number of iterations
 * @param isOutputLog when true, the mean cost is printed and, if a log is configured, the cost
 *                    and the accumulated gradients are written to it
 */
public void BGD(double[][] xArray, double[][] yArray, int epoch, boolean isOutputLog) {
    final int total = xArray.length;
    final int layerCount = layers.length;

    for (int round = 1; round <= epoch; round++) {
        // Gradient accumulators for this iteration.
        // gradB[layer][node][0]; gradW[layer][node][node of preceding layer].
        final double[][][] gradB = new double[layerCount][][];
        final double[][][] gradW = new double[layerCount][][];
        for (int l = 0; l < layerCount; l++) {
            final int nodes = layers[l].getCurrNodeNum();
            gradB[l] = new double[nodes][1];
            gradW[l] = new double[nodes][layers[l].getPreviousNodeNum()];
        }

        // Forward pass, cost and backward pass for every sample; gradients are summed up.
        double costSum = 0;
        for (int s = 0; s < total; s++) {
            final double[][] sampleX = MyUtils.toTwoDimension(xArray[s]);
            final double[][] sampleY = MyUtils.toTwoDimension(yArray[s]);
            feedforward(sampleX, false);
            costSum += computeCost(sampleY);
            backPropagation(sampleX, sampleY, gradB, gradW);
        }

        if (isOutputLog) {
            // The cost was measured with the parameters as they were before this iteration's update.
            final double meanCost = costSum / total;
            System.out.format("[cost before the %d-th epoch]: " + meanCost + "\n", round);
            if (log != null) {
                log.print(String.format("[cost before the %d-th epoch]: " + meanCost, round));
                // Dump the accumulated gradients, last layer first.
                log.print("===================== " + round + "-th epoch =====================");
                for (int l = layerCount - 1; l >= 0; l--) {
                    log.print(gradB[l], l + "-th layer / deltaB");
                    log.print(gradW[l], l + "-th layer / deltaW");
                }
                log.print("=====================================================");
            }
        }

        // Update the biases of every layer.
        // NOTE(review): bpUpdateValue is defined elsewhere; it presumably performs
        // value[k][l] -= learningRate / total * grad[k][l] — confirm against its body.
        for (int l = 0; l < layerCount; l++) {
            bpUpdateValue(layers[l].getBiases(), learningRate, total, gradB[l]);
        }
        // Update the weights of every layer.
        for (int l = 0; l < layerCount; l++) {
            bpUpdateValue(layers[l].getWeights(), learningRate, total, gradW[l]);
        }
    }
}
- 正向传播得到各层输出值
/**
 * Runs one sample forward through every layer, optionally logging the intermediate values.
 *
 * @param x           the sample input
 * @param isOutputLog when true and a log is configured, the input and each layer's weights,
 *                    biases, pre-activation values and outputs are written to the log
 */
public void feedforward(double[][] x, boolean isOutputLog) {
    // BGD only writes to the log when it is non-null; apply the same guard here so that
    // requesting log output without a configured log does not throw a NullPointerException.
    final boolean logEnabled = isOutputLog && log != null;

    double[][] tInput = x;
    if (logEnabled) {
        log.print(x, "x");
    }
    for (int i = 0; i < layers.length; i++) {
        Layer layer = layers[i];
        // Pre-activation values of this layer, computed from the previous layer's output.
        double[][] z = layer.computerMidValue(tInput);
        // Activated output values of this layer.
        double[][] a = layer.computerOutputValue();
        // This layer's output is the next layer's input.
        tInput = a;
        if (logEnabled) {
            log.print(layer.getWeights(), i + "-th layer / " + "W");
            log.print(layer.getBiases(), i + "-th layer / " + "B");
            log.print(z, i + "-th layer / " + "Z");
            log.print(a, i + "-th layer / " + "A");
        }
    }
}
- 反向传播计算各层误差
/**
 * Performs one back-propagation pass for a single sample and adds the resulting gradients to
 * the supplied accumulators.
 *
 * <p>Must be called after a forward pass for the same sample: it reads each layer's stored
 * pre-activation and output values. The per-layer error terms are written to {@code deltas}.
 *
 * @param x       the input of one sample
 * @param y       the label of that sample
 * @param deltasB bias-gradient accumulator for the whole network; incremented in place
 * @param deltasW weight-gradient accumulator for the whole network; incremented in place
 */
private void backPropagation(double[][] x, double[][] y, double[][][] deltasB, double[][][] deltasW) {
    // 1. Error of the last layer (BP1).
    int layerPos = layers.length - 1;
    double[][] a = layers[layerPos].getOutputValue();
    double[][] z = layers[layerPos].getMidValue();
    ActivationFunction aFunction = layers[layerPos].getActivationFun();
    // Flattened copy of z; only the softmax derivative takes it as a second argument.
    double[] zOneDimension = new double[z.length];
    MyUtils.toOneDimension(z, zOneDimension);
    boolean softmax = aFunction instanceof SoftmaxActivation;
    for (int i = 0; i < deltas[layerPos].length; i++) {
        double costDerivative = costFun.computeDerivative(a[i][0], y[i][0]);
        if (softmax) {
            deltas[layerPos][i] = costDerivative * aFunction.computeDerivative(z[i][0], zOneDimension);
        } else {
            deltas[layerPos][i] = costDerivative * aFunction.computeDerivative(z[i][0]);
        }
    }
    --layerPos;

    // 2. Errors of the remaining layers, from back to front (BP2).
    for (; layerPos >= 0; layerPos--) {
        // Transposed weights of the following layer.
        double[][] weights = layers[layerPos + 1].getWeights();
        double[][] weightsT = new double[weights[0].length][weights.length];
        MyUtils.matrixTranspose(weights, weightsT);
        // Error of the following layer as a column vector.
        double[] delta = deltas[layerPos + 1];
        double[][] deltaTwoDimension = new double[delta.length][1];
        MyUtils.toTwoDimension(delta, deltaTwoDimension);
        // tempDelta = weightsT * deltaTwoDimension
        double[][] tempDelta = new double[deltas[layerPos].length][1];
        MyUtils.matrixMul(weightsT, deltaTwoDimension, tempDelta);

        z = layers[layerPos].getMidValue();
        aFunction = layers[layerPos].getActivationFun();
        // Allocate the flattened buffer for THIS layer's size; reusing the output-layer buffer
        // is wrong whenever the layer has a different node count.
        zOneDimension = new double[z.length];
        MyUtils.toOneDimension(z, zOneDimension);
        softmax = aFunction instanceof SoftmaxActivation;
        for (int i = 0; i < tempDelta.length; i++) {
            // Multiply by the derivative of this layer's activation function.
            if (softmax) {
                tempDelta[i][0] *= aFunction.computeDerivative(z[i][0], zOneDimension);
            } else {
                tempDelta[i][0] *= aFunction.computeDerivative(z[i][0]);
            }
        }
        // Store the result as this layer's error.
        MyUtils.toOneDimension(tempDelta, deltas[layerPos]);
    }

    // 3. Accumulate the bias gradients (BP3): the gradient of a bias is the node's error.
    for (int i = 0; i < layers.length; i++) {
        for (int j = 0; j < layers[i].getCurrNodeNum(); j++) {
            deltasB[i][j][0] += deltas[i][j];
        }
    }

    // 4. Accumulate the weight gradients (BP4): input activation times the node's error.
    for (int i = 0; i < layers.length; i++) {
        if (i != 0) {
            a = layers[i - 1].getOutputValue();
        } else {
            // The first layer's input is the sample itself.
            a = x;
        }
        for (int j = 0; j < deltasW[i].length; j++) {
            for (int k = 0; k < deltasW[i][j].length; k++) {
                deltasW[i][j][k] += (a[k][0] * deltas[i][j]);
            }
        }
    }
}
- 更新各层权重和偏置
// 更新偏置
for (int j = 0; j < layers.length; j++) {
double[][] biases = layers[j].getBiases();
/* for (int k = 0; k < biases.length; k++) {
for (int l = 0; l < biases[k].length; l++) {
biases[k][l] -= (learningRate / total * deltasB[j][k][l]);
}
}*/
bpUpdateValue(biases, learningRate, total, deltasB[j]);
}
// 更新权重
for (int j = 0; j < layers.length; j++) {
double[][] weights = layers[j].getWeights();
/* for (int k = 0; k < weights.length; k++) {
for (int l = 0; l < weights[k].length; l++) {
weights[k][l] -= (learningRate / total * deltasW[j][k][l]);
}
}*/
bpUpdateValue(weights, learningRate, total, deltasW[j]);
}
核心算法部分源代码下载
百度网盘
提取码:hep1
上一篇 | JAVAEE与人工智能目录 | [下一篇] |
---|
网友评论