
PyTorch Study Notes 2 - Gradient Descent

Author: 小新_XX | Published 2019-04-08 21:50

1 Implementing Gradient Descent in NumPy and PyTorch (Linear Regression)

The general steps of gradient descent are:
(1) Set initial parameter values
(2) Compute the gradient of the loss
(3) Update the parameters along the negative gradient direction, as written out below
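For the mean-squared-error loss used in both implementations below, the loss, its gradient, and the update rule work out to (a standard derivation, stated here for reference):

L(w) = (1/N) * sum_i (x_i . w - y_i)^2
dL/dw = (2/N) * sum_i (x_i . w - y_i) * x_i
w <- w - lr * dL/dw

where lr is the learning rate. This is exactly the dw computation and the update line that appear in the code below.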

1.1 NumPy Version

Suppose the target function to fit is y = 2*x1 - 4*x2. This is a linear function of two variables, so the input x is a 2-dimensional vector. The code for finding the optimal parameters via gradient descent is as follows:

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
from matplotlib import style

# Create the data
N = 100
x1 = np.linspace(-10, 10, N)
x2 = np.linspace(-15, 5, N)

x = np.concatenate(([x1], [x2]), axis=0).T
w = np.array([2, -4])
y = np.dot(x, w)
fig = plt.figure()
ax1 = fig.add_subplot(111, projection='3d')

ax1.plot_wireframe(np.array([x1]),np.array([x2]),np.array([y]), rstride=5, cstride=5)
ax1.set_xlabel("x1")
ax1.set_ylabel("x2")
ax1.set_zlabel("y")

# Gradient descent
EPOCHS = 50 # total number of iterations
LOSS_MIN = 0.0001 # target minimum loss (intended early-stop threshold; this simple loop runs all EPOCHS iterations)
lr = 0.01 # learning rate
# w_GD = np.random.rand(2) # random initialization is also an option
w_GD = np.zeros(2) # weights updated by gradient descent (GD)

cost = [] # stores the loss at each GD step
w_all = [] # stores the weights at each GD step
for i in range(EPOCHS):
    w_all.append(w_GD.copy())
    y_predict = np.dot(x, w_GD) # prediction with the current w_GD
    loss = np.mean((y_predict-y)**2) # mean squared error
    cost.append(loss)
    dw = np.mean(2*(y_predict-y) * x.T, axis=1) # gradient of the loss w.r.t. w
    w_GD -= lr*dw # gradient descent update
    
print("loss:",loss)
print("w1:",w_GD[0],"w2",w_GD[1])

# Plot the gradient descent trajectory
w_all = np.array(w_all)
fig = plt.figure()
ax2 = fig.add_subplot(111, projection='3d')
ax2.plot_wireframe(np.array([w_all[:,0]]),np.array([w_all[:,1]]),np.array([cost]))
ax2.set_xlabel("w1")
ax2.set_ylabel("w2")
ax2.set_zlabel("loss")
fig = plt.figure()

# Plot the loss-iteration curve
plt.plot(range(len(cost)),cost)
plt.title('loss')
plt.xlabel('iteration')
plt.ylabel('loss')
plt.show()

Output:

1. w1, w2

loss: 2.565443781623136e-08
w1: 1.9999674457769208, w2 -3.999977280651687

2. Target function

[Figure 1: plot of y = 2*x1 - 4*x2]

Figure 1 shows the target function: a straight line in 3-dimensional space.

3. Gradient descent on the loss

[Figure 2: gradient descent trajectory]

4. Loss curve

[Figure 3: loss-iteration curve]

Figures 2 and 3 show that gradient descent finds the optimal w1 and w2, achieving the best fit to the target function.
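As a sanity check (not part of the original post), this problem also has a closed-form least-squares solution, so the gradient descent result can be verified directly with standard NumPy; x, y, and w_GD are the arrays defined above:

# Closed-form least-squares check: solves min_w ||x @ w - y||^2 directly
w_lstsq, *_ = np.linalg.lstsq(x, y, rcond=None)
print("lstsq solution:", w_lstsq) # expected: [ 2. -4.]
print("GD solution:", w_GD) # should agree to about 4 decimal places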

1.2 PyTorch Version

import torch

# Create the data (plain tensors track gradients; torch.autograd.Variable is deprecated)
N = 100
x = torch.randn(N, 2)
w = torch.tensor([2., -4.])
y = x @ w # matrix-vector product, matching np.dot in the NumPy version

EPOCHS = 5000

lr = 0.01
w_GD = torch.tensor([0., 0.], requires_grad=True)
cost = []
w_all = []
for i in range(EPOCHS):
    w_all.append(w_GD.data.clone()) # clone; otherwise every entry aliases the same tensor
    y_predict = x @ w_GD
    loss = torch.mean((y_predict-y)**2)

    cost.append(loss.item())
    loss.backward()
    # Parameter update
    w_GD.data -= lr*w_GD.grad.data
    w_GD.grad.data.zero_() # clear the gradient; backward() accumulates otherwise
print("loss:",loss)
print("w_GD:",w_GD)

Output:

loss: tensor(8.8394e-11, grad_fn=<MeanBackward1>)
w_GD: tensor([ 2.0000, -4.0000], requires_grad=True)
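The manual update above can equivalently be delegated to PyTorch's built-in optimizer; a minimal sketch, reusing the x, y, EPOCHS, and lr defined above:

import torch.optim as optim

w_GD = torch.tensor([0., 0.], requires_grad=True)
optimizer = optim.SGD([w_GD], lr=lr) # the optimizer owns the update step
for i in range(EPOCHS):
    optimizer.zero_grad() # clear accumulated gradients
    loss = torch.mean((x @ w_GD - y)**2)
    loss.backward() # compute d(loss)/d(w_GD)
    optimizer.step() # w_GD -= lr * grad
print("w_GD:", w_GD)

This is the same pattern the neural-network example in the next section uses.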

2 Implementing a Simple Neural Network with PyTorch

Here we take the LeNet-5 network from the official PyTorch tutorial as an example and build a simple convolutional neural network for recognizing handwritten digits.

import torch as t
import torch.nn as nn
import torch.nn.functional as F

# Build the network
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, 5) # 1 input channel, 6 output channels, 5x5 kernel
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16*5*5, 120) # 16 feature maps of size 5x5 after the conv/pool stages
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10) # 10 output classes (digits 0-9)
    def forward(self, x):
        x = F.max_pool2d(F.relu(self.conv1(x)), (2,2)) # conv -> ReLU -> 2x2 max pooling
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = x.view(-1, self.num_flat_features(x)) # flatten all dimensions except the batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    def num_flat_features(self, x):
        size = x.size()[1:] # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features
net = Net()
print(net)

# Forward pass
x = t.randn(1, 1, 32, 32) # LeNet-5 expects a 32x32 single-channel input
out = net(x)
print("out:", out)

# Loss function
target = t.randn(10) # a dummy target; t.randn because the module is imported as t
target = target.view(1, -1) # reshape to match the (1, 10) output
criterion = nn.MSELoss()
loss = criterion(out, target)
print("loss:", loss)

# Backward pass
net.zero_grad() # clear existing gradients; backward() accumulates otherwise
loss.backward()

# Update parameters (plain SGD by hand)
lr = 0.01
for f in net.parameters():
    f.data.sub_(f.grad.data * lr)

# Using an optimizer instead of the manual update
import torch.optim as optim
optimizer = optim.SGD(net.parameters(), lr=0.01)
optimizer.zero_grad()
output = net(x)
loss = criterion(output, target)
loss.backward()
optimizer.step()

Output:

Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)
out: tensor([[ 0.0253, -0.0078,  0.0713,  0.1756,  0.0836,  0.1335, -0.1235,  0.0425,
          0.0714,  0.1090]], grad_fn=<AddmmBackward>)
loss: tensor(1.0700, grad_fn=<MseLossBackward>)
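The example above only pushes random tensors through the network. To actually train it on handwritten digits, a minimal training-loop sketch follows; it assumes torchvision is installed, uses Pad(2) to bring MNIST's 28x28 images up to the 32x32 input LeNet-5 expects, and swaps MSE for cross-entropy, since digit recognition is a classification task:

import torch as t
import torch.nn as nn
import torch.optim as optim
import torchvision as tv
import torchvision.transforms as transforms

# Pad MNIST's 28x28 images to the 32x32 input size LeNet-5 expects
transform = transforms.Compose([transforms.Pad(2), transforms.ToTensor()])
trainset = tv.datasets.MNIST(root='./data', train=True, download=True,
                             transform=transform)
trainloader = t.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)

net = Net() # the LeNet-5 defined above
criterion = nn.CrossEntropyLoss() # classification loss instead of MSE
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)

for epoch in range(2): # a couple of epochs is enough to see the loss fall
    for images, labels in trainloader:
        optimizer.zero_grad()
        loss = criterion(net(images), labels)
        loss.backward()
        optimizer.step()
    print("epoch", epoch, "loss:", loss.item())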
