数据集-data.csv
使用文章 https://www.jianshu.com/p/38df71cad1f6 里的数据集(航班数变化趋势,按月统计),这里直接拷贝在下面:
Jan-49,112
Feb-49,118
Mar-49,132
Apr-49,129
May-49,121
Jun-49,135
Jul-49,148
Aug-49,148
Sep-49,136
Oct-49,119
Nov-49,104
Dec-49,118
Jan-50,115
Feb-50,126
Mar-50,141
Apr-50,135
May-50,125
Jun-50,149
Jul-50,170
Aug-50,170
Sep-50,158
Oct-50,133
Nov-50,114
Dec-50,140
Jan-51,145
Feb-51,150
Mar-51,178
Apr-51,163
May-51,172
Jun-51,178
Jul-51,199
Aug-51,199
Sep-51,184
Oct-51,162
Nov-51,146
Dec-51,166
Jan-52,171
Feb-52,180
Mar-52,193
Apr-52,181
May-52,183
Jun-52,218
Jul-52,230
Aug-52,242
Sep-52,209
Oct-52,191
Nov-52,172
Dec-52,194
Jan-53,196
Feb-53,196
Mar-53,236
Apr-53,235
May-53,229
Jun-53,243
Jul-53,264
Aug-53,272
Sep-53,237
Oct-53,211
Nov-53,180
Dec-53,201
Jan-54,204
Feb-54,188
Mar-54,235
Apr-54,227
May-54,234
Jun-54,264
Jul-54,302
Aug-54,293
Sep-54,259
Oct-54,229
Nov-54,203
Dec-54,229
Jan-55,242
Feb-55,233
Mar-55,267
Apr-55,269
May-55,270
Jun-55,315
Jul-55,364
Aug-55,347
Sep-55,312
Oct-55,274
Nov-55,237
Dec-55,278
Jan-56,284
Feb-56,277
Mar-56,317
Apr-56,313
May-56,318
Jun-56,374
Jul-56,413
Aug-56,405
Sep-56,355
Oct-56,306
Nov-56,271
Dec-56,306
Jan-57,315
Feb-57,301
Mar-57,356
Apr-57,348
May-57,355
Jun-57,422
Jul-57,465
Aug-57,467
Sep-57,404
Oct-57,347
Nov-57,305
Dec-57,336
Jan-58,340
Feb-58,318
Mar-58,362
Apr-58,348
May-58,363
Jun-58,435
Jul-58,491
Aug-58,505
Sep-58,404
Oct-58,359
Nov-58,310
Dec-58,337
Jan-59,360
Feb-59,342
Mar-59,406
Apr-59,396
May-59,420
Jun-59,472
Jul-59,548
Aug-59,559
Sep-59,463
Oct-59,407
Nov-59,362
Dec-59,405
Jan-60,417
Feb-60,391
Mar-60,419
Apr-60,461
May-60,472
Jun-60,535
Jul-60,622
Aug-60,606
Sep-60,508
Oct-60,461
Nov-60,390
Dec-60,432
导入模块
这个示例比较简单,导入几个常用的就够了:
import torch
from torch import nn
import numpy as np
import matplotlib.pyplot as plt
初始化定义
设置下gpu就行了:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
数据预处理
需要读入数据,然后按照序列长度 2 来构造数据集(用连续 2 个月的数据预测下一个月),代码如下:
# Load the series from data.csv; every non-empty line is "<Mon-YY>,<count>".
with open("data.csv", "r", encoding="utf-8") as f:
    # splitlines() plus the blank-line filter keeps a trailing newline (or an
    # empty line) from producing a [''] row, which would make each[1] raise
    # IndexError.
    data = [row.split(",") for row in f.read().splitlines() if row.strip()]
value = [int(each[1]) for each in data]
# The labels are month-year, so `value` holds one count per month.

seq = 2
# The dataset is small; a long input window would hurt the results.

# Sliding windows: the input is value[i : i+seq], the target is value[i+seq].
li_x = [value[i: i + seq] for i in range(len(value) - seq)]
li_y = [value[i + seq] for i in range(len(value) - seq)]

# Hold out the last 30 windows for testing and train on the rest.
# Counts are divided by 1000 so the tensors hold small fractional values.
# Inputs are shaped (batch, seq, 1); targets are shaped (batch, 1).
train_x = (torch.tensor(li_x[:-30]).float() / 1000.).reshape(-1, seq, 1).to(device)
train_y = (torch.tensor(li_y[:-30]).float() / 1000.).reshape(-1, 1).to(device)
test_x = (torch.tensor(li_x[-30:]).float() / 1000.).reshape(-1, seq, 1).to(device)
test_y = (torch.tensor(li_y[-30:]).float() / 1000.).reshape(-1, 1).to(device)
定义网络模型
网络模型就使用一个lstm+全连接实现,代码如下:
class Net(nn.Module):
    """Single-layer LSTM followed by a linear layer that predicts one value.

    Args:
        hidden_size: Number of features in the LSTM hidden state.
        seq_len: Number of time steps in every input window. When None, the
            module-level ``seq`` is used, so ``Net()`` behaves as before.
    """

    def __init__(self, hidden_size=32, seq_len=None):
        super().__init__()
        if seq_len is None:
            seq_len = seq  # fall back to the script-level window length
        self.hidden_size = hidden_size
        self.seq_len = seq_len
        # input_size=1: one scalar feature per time step.
        # num_layers=1: stacking more LSTM layers makes the model more complex,
        # which tends to hurt on a small dataset.
        # batch_first=True: inputs are laid out (batch, seq, feature) instead
        # of the default (seq, batch, feature).
        self.lstm = nn.LSTM(input_size=1, hidden_size=hidden_size, num_layers=1, batch_first=True)
        # The linear layer consumes the hidden states of all time steps.
        self.linear = nn.Linear(hidden_size * seq_len, 1)

    def forward(self, x):
        """Map a (batch, seq_len, 1) tensor to a (batch, 1) prediction."""
        x, _ = self.lstm(x)
        # Flatten per sample. Keeping the batch dimension fixed means a window
        # of the wrong length fails in the linear layer instead of being
        # silently folded into extra batch rows.
        x = x.reshape(x.size(0), -1)
        return self.linear(x)
定义损失函数和优化器
# Mean-squared error between the predicted and the true (scaled) value.
loss_fun = nn.MSELoss()
# Build the network, move it to the selected device, and let Adam update
# its parameters.
model = Net()
model = model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.003)
训练模型
# Full-batch training: every epoch is a single optimizer step over train_x.
model.train()
for epoch in range(300):
    output = model(train_x)
    loss = loss_fun(output, train_y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # Report progress every 20 epochs (epoch 0 is skipped).
    if epoch % 20 == 0 and epoch > 0:
        # The held-out loss is only reported, never back-propagated, so it is
        # computed without building an autograd graph.
        with torch.no_grad():
            test_loss = loss_fun(model(test_x), test_y)
        # .item() prints plain numbers instead of tensor reprs.
        print("epoch:{}, loss:{}, test_loss: {}".format(epoch, loss.item(), test_loss.item()))
测试模型
通过可视化查看预测趋势和实际趋势的差别:
model.eval()
# Predict on both splits without tracking gradients, undo the /1000 scaling,
# and convert to plain Python floats on the CPU so matplotlib can plot them
# even when the model runs on a GPU.
with torch.no_grad():
    train_pred = (model(train_x).reshape(-1) * 1000).cpu().tolist()
    test_pred = (model(test_x).reshape(-1) * 1000).cpu().tolist()
# The first prediction corresponds to value[seq], so the first `seq` real
# values are prepended to keep both curves aligned and equally long.
result = li_x[0][:seq] + train_pred + test_pred
# Real series.
plt.plot(value, label="real")
# Model predictions.
plt.plot(result, label="pred")
plt.legend(loc='best')
# Needed for the figure to appear when this runs as a plain script.
plt.show()

完整代码
import torch
from torch import nn
import numpy as np
import matplotlib.pyplot as plt
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Load the series from data.csv; every non-empty line is "<Mon-YY>,<count>".
with open("data.csv", "r", encoding="utf-8") as f:
    # splitlines() plus the blank-line filter keeps a trailing newline (or an
    # empty line) from producing a [''] row, which would make each[1] raise
    # IndexError.
    data = [row.split(",") for row in f.read().splitlines() if row.strip()]
value = [int(each[1]) for each in data]
# The labels are month-year, so `value` holds one count per month.

seq = 2
# The dataset is small; a long input window would hurt the results.

# Sliding windows: the input is value[i : i+seq], the target is value[i+seq].
li_x = [value[i: i + seq] for i in range(len(value) - seq)]
li_y = [value[i + seq] for i in range(len(value) - seq)]

# Hold out the last 30 windows for testing and train on the rest.
# Counts are divided by 1000 so the tensors hold small fractional values.
# Inputs are shaped (batch, seq, 1); targets are shaped (batch, 1).
train_x = (torch.tensor(li_x[:-30]).float() / 1000.).reshape(-1, seq, 1).to(device)
train_y = (torch.tensor(li_y[:-30]).float() / 1000.).reshape(-1, 1).to(device)
test_x = (torch.tensor(li_x[-30:]).float() / 1000.).reshape(-1, seq, 1).to(device)
test_y = (torch.tensor(li_y[-30:]).float() / 1000.).reshape(-1, 1).to(device)
class Net(nn.Module):
    """Single-layer LSTM followed by a linear layer that predicts one value.

    Args:
        hidden_size: Number of features in the LSTM hidden state.
        seq_len: Number of time steps in every input window. When None, the
            module-level ``seq`` is used, so ``Net()`` behaves as before.
    """

    def __init__(self, hidden_size=32, seq_len=None):
        super().__init__()
        if seq_len is None:
            seq_len = seq  # fall back to the script-level window length
        self.hidden_size = hidden_size
        self.seq_len = seq_len
        # input_size=1: one scalar feature per time step.
        # num_layers=1: for short sequences a deeper LSTM tends to do worse.
        # batch_first=True: inputs are laid out (batch, seq, feature) instead
        # of the default (seq, batch, feature).
        self.lstm = nn.LSTM(input_size=1, hidden_size=hidden_size, num_layers=1, batch_first=True)
        # The linear layer consumes the hidden states of all time steps.
        self.linear = nn.Linear(hidden_size * seq_len, 1)

    def forward(self, x):
        """Map a (batch, seq_len, 1) tensor to a (batch, 1) prediction."""
        x, _ = self.lstm(x)
        # Flatten per sample. Keeping the batch dimension fixed means a window
        # of the wrong length fails in the linear layer instead of being
        # silently folded into extra batch rows.
        x = x.reshape(x.size(0), -1)
        return self.linear(x)
# Mean-squared error between the predicted and the true (scaled) value.
loss_fun = nn.MSELoss()
# Build the network, move it to the selected device, and let Adam update
# its parameters.
model = Net()
model = model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.003)
# Full-batch training: every epoch is a single optimizer step over train_x.
model.train()
for epoch in range(300):
    output = model(train_x)
    loss = loss_fun(output, train_y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # Report progress every 20 epochs (epoch 0 is skipped).
    if epoch % 20 == 0 and epoch > 0:
        # The held-out loss is only reported, never back-propagated, so it is
        # computed without building an autograd graph.
        with torch.no_grad():
            test_loss = loss_fun(model(test_x), test_y)
        # .item() prints plain numbers instead of tensor reprs.
        print("epoch:{}, loss:{}, test_loss: {}".format(epoch, loss.item(), test_loss.item()))
model.eval()
# Predict on both splits without tracking gradients, undo the /1000 scaling,
# and convert to plain Python floats on the CPU so matplotlib can plot them
# even when the model runs on a GPU.
with torch.no_grad():
    train_pred = (model(train_x).reshape(-1) * 1000).cpu().tolist()
    test_pred = (model(test_x).reshape(-1) * 1000).cpu().tolist()
# The first prediction corresponds to value[seq], so the first `seq` real
# values are prepended to keep both curves aligned and equally long.
result = li_x[0][:seq] + train_pred + test_pred
# Real series.
plt.plot(value, label="real")
# Model predictions.
plt.plot(result, label="pred")
plt.legend(loc='best')
# Needed for the figure to appear when this runs as a plain script.
plt.show()
网友评论