基于LSTM的CDN网络流量预测(二)
4. 模型训练
4.1 导入相关的模块
import os
import pandas as pd
import numpy as np
import time
import torch
from torch.autograd import Variable
import logging
import copy
import argparse
# Select GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
4.2 设置训练参数
data_url = "./network_traffic_forecast/data/" # source directory of the training data
train_dir_url = "./network_traffic_forecast/train_output" # training output directory
model_path = os.path.join(train_dir_url, "time_series.pt") # where the model is saved
train_rs_path = os.path.join(train_dir_url, "train_rs.csv") # where validation results are saved
params = {
"data_dim":1, # dimensionality of the Value column to predict
"window":60, # sliding-window (history) length
"forecast_step":10, # forecast horizon: next 10 minutes, one step per minute
"hidden_rnn":100, # LSTM hidden size
"n_layers":2, # number of stacked LSTM layers
"dropout": 0.0,
"train_interval" : 10, # stride of the training sliding window
"batch_size" : 128, # samples per training batch
"epochs" : 30, # number of training epochs
'normalize_scale': 1, # normalisation scale; overwritten later from the data
}
4.3 定义读写数据函数
def getTrainData(GroupCsvPath):
    """Read every CSV file under *GroupCsvPath* and concatenate them.

    Each file must contain the columns TimePoint/att1/att2/Value.  The raw
    TimePoint column holds Unix timestamps; it is converted to a local-time
    string of the form "%Y-%m-%d %H:%M:%S".

    :param GroupCsvPath: directory containing the per-series CSV files
    :return: one DataFrame with all files concatenated (fresh index)
    :raises ValueError: if the directory contains no CSV files
    """
    cols = ['TimePoint', 'att1', 'att2', 'Value']
    file_list = sorted(os.listdir(GroupCsvPath))
    frame_list = []
    for file_name in file_list:
        # Skip anything that is not a CSV export ('.csv' instead of the
        # original 'csv' so e.g. a file named "foocsv" is not picked up).
        if not file_name.endswith('.csv'):
            continue
        # os.path.join works whether or not GroupCsvPath has a trailing slash.
        df = pd.read_csv(os.path.join(GroupCsvPath, file_name),
                         index_col=None, usecols=cols)
        print(len(df), file_name)
        # Unix epoch seconds -> human-readable local-time string.
        df["TimePoint"] = df["TimePoint"].apply(
            lambda x: time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(x)))
        frame_list.append(df)
    if not frame_list:
        raise ValueError("no csv files found in %s" % GroupCsvPath)
    return pd.concat(frame_list, ignore_index=True)
4.4 定义数据处理函数
进行数据拼接和缺失值填充
# Linearly fill missing minutes of one (att1, att2) series.
def interpolating(df, att1, att2):
    """Re-index *df* to a 1-minute frequency and fill the gaps.

    Missing Value entries are linearly interpolated; the constant id
    columns att1/att2 are rewritten so the inserted rows carry them too.

    :param df: DataFrame with TimePoint/att1/att2/Value for ONE route
    :param att1: route attribute 1 (constant for this slice)
    :param att2: route attribute 2 (constant for this slice)
    :return: gap-free copy of df, TimePoint restored as a normal column
    """
    df = df.copy()  # avoid SettingWithCopyWarning / mutating the caller's slice
    df["TimePoint"] = pd.to_datetime(df["TimePoint"])
    df = df.set_index(["TimePoint"]).asfreq("min")  # 1-minute grid ("T" alias is deprecated)
    df["Value"] = df["Value"].interpolate(method='linear')
    df['att1'] = att1
    df['att2'] = att2
    df = df.ffill()  # forward-fill any remaining non-Value columns
    df = df.reset_index()
    df = df.dropna()  # leading NaNs before the first observation cannot be interpolated
    return df
# Max-abs normalisation: divide every value by a pre-computed scale.
def normalize_data(data_values, normalize_scale, method="maxabs_scale"):
    """Scale *data_values* by 1/normalize_scale ("maxabs_scale" is the only method)."""
    if method != "maxabs_scale":
        raise ValueError("Not support this transform")
    return data_values / normalize_scale
# Sample splitting: slice a long series into (window -> forecast_step) pairs.
def split_data(data, data_int, window, forecast_step, data_dim, interval):
    """Build sliding-window samples from *data*.

    :param data: preprocessed DataFrame (TimePoint/att1/att2/Value)
    :param data_int: number of rows in *data*
    :param window: history length fed to the model
    :param forecast_step: number of future points to predict
    :param data_dim: dimensionality of the Value column
    :param interval: stride between consecutive windows
    :return: ([X, Y] tensors, id rows aligned with the targets)
    """
    sample_idx = range(window + forecast_step - 1, data_int)
    return _data_group(data, sample_idx, window, data_dim, forecast_step, interval)
def _data_group(data, idx_set, window, data_dim, forecast_step,interval):
n = len(idx_set)
X = torch.zeros((n, window, data_dim))
Y = torch.zeros((n, forecast_step, data_dim))
data_id_lists = [] # value的id列
data_group=data.groupby(by=["att1", "att2"])
i = 0
for _,item in data_group:
item_valus=np.c_[item["Value"].values]
len(item) - len(item) % (window + forecast_step)-window
j = 0
while j < len(item) - window - forecast_step:
start = j
end = start+window
X[i, :, :] = torch.from_numpy(item_valus[start:end, :])
Y[i, :] = torch.from_numpy(item_valus[end:end+forecast_step, :])
j = j + interval
i += 1
data_id_lists.extend(
item[["TimePoint", "att1", "att2"]][window:end+forecast_step].values)
X = X[:i, :, :]
Y = Y[:i, :, :]
train_g = [X, Y]
return train_g,data_id_lists
def data_preprocess(df):
    """Make every route gap-free and return (data, number of routes).

    A route is one (att1, att2) combination; the route list is built from
    the cross product of the unique attribute values so it adapts to the
    raw data.  Routes whose row count is not a multiple of 1440 (minutes
    per day) are repaired with interpolating().
    """
    routes = [(a, b) for a in df['att1'].unique() for b in df['att2'].unique()]
    new_data = pd.DataFrame([])
    for a, b in routes:
        item = df[(df['att1'] == a) & (df['att2'] == b)]
        if len(item) % 1440 != 0:  # incomplete day(s) -> fill the missing minutes
            item = interpolating(item, a, b)
        new_data = pd.concat([new_data, item])
    new_data = new_data.dropna(axis=0)
    new_data.sort_values(by=["TimePoint", "att1", "att2"], inplace=True)
    return new_data, len(routes)
4.5 定义滑动窗口划分函数
def read_df(train_df, valid_df, window, forecast_step, data_dim, train_interval, valid_interval, normalize_scale):
    """Normalise both splits and turn them into sliding-window sample sets.

    :param train_df: training rows (slice of the preprocessed data)
    :param valid_df: validation rows
    :param valid_interval: stride for the validation windows (usually equals
           forecast_step so targets tile the validation period without overlap)
    :return: (train samples, valid samples, scale tensor on `device`, valid id rows)
    """
    # Work on copies: the incoming frames are slices of the full dataset and
    # writing into them raised SettingWithCopyWarning / mutated the caller.
    train_df = train_df.copy()
    valid_df = valid_df.copy()
    train_df["Value"] = normalize_data(train_df["Value"], normalize_scale)
    train_v, train_data_id_lists = split_data(train_df, len(train_df), window, forecast_step, data_dim, train_interval)
    valid_df["Value"] = normalize_data(valid_df["Value"], normalize_scale)
    valid_v, valid_data_id_df = split_data(valid_df, len(valid_df), window, forecast_step, data_dim, valid_interval)
    # Keep the scale as a tensor on the training device for de-normalisation.
    scale = Variable(torch.from_numpy(np.array(normalize_scale)).float().to(device))
    return train_v, valid_v, scale, valid_data_id_df
4.6 准备训练集和测试集
时间序列一般按照时间列划分
数据集中包含多条线路的数据,采样间隔为1分钟,因此一条线路一天对应1440个数据点
将数据排序后 取各个线路的最后一天作为测试集 -1440 * route_counts
# Load and repair the raw data, then derive the normalisation scale from it.
data = getTrainData(data_url)
data, route_counts = data_preprocess(data)
normalize_scale = np.max(np.abs(data["Value"].values))
# Persist the scale so inference can de-normalise predictions later.
params['normalize_scale'] = normalize_scale
# Train/validation split: hold out the last day (1440 minutes) of every route.
train_data = data[:-1440 * route_counts]
valid_data = data[-1440 * route_counts:]
# Sliding-window both splits into model-ready sample sets.
train_v, valid_v, normalize_scale, valid_data_id_df = read_df(train_data, valid_data, params['window'], params['forecast_step'],
                                                              data_dim=params['data_dim'],
                                                              train_interval=params['train_interval'],
                                                              valid_interval=params['forecast_step'],
                                                              normalize_scale=params['normalize_scale'])
68601 time_series_1.csv
137202 time_series_2.csv
136655 time_series_3.csv
120217 time_series_4.csv
/home/ma-user/anaconda3/envs/Pytorch-1.0.0/lib/python3.6/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
This is separate from the ipykernel package so we can avoid doing imports until
/home/ma-user/anaconda3/envs/Pytorch-1.0.0/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/ma-user/anaconda3/envs/Pytorch-1.0.0/lib/python3.6/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
"""
4.7 定义模型结构
LSTM(Long Short Term Memory Network,长短时记忆网络,是一种改进之后的循环神经网络,可以解决RNN无法处理长距离的依赖的问题,目前广泛用于时序预测领域。参考链接,论文链接
import torch.nn as nn
class LstmModel(nn.Module):
    """Multi-step forecaster: stacked LSTM followed by one linear head.

    Input  : (batch, window, dim) history window.
    Output : (batch, forecast_step, dim) prediction.
    """
    def __init__(self, dim=1,
                 window=10,
                 forecast_step=1,
                 hidden_rnn=100,
                 n_layers=1,
                 dropout=0.1):
        super(LstmModel, self).__init__()
        self.dim = dim
        self.window = window
        self.forecast_step = forecast_step
        self.hidr = hidden_rnn
        self.nlayers = n_layers
        self.dropout = nn.Dropout(p=dropout)
        self.lstm = nn.LSTM(input_size=self.dim, hidden_size=self.hidr, num_layers=self.nlayers)
        self.fc = nn.Linear(in_features=self.hidr, out_features=self.forecast_step * self.dim)

    def forward(self, x, f=None):
        # nn.LSTM (batch_first=False) wants (seq, batch, feature).
        seq_first = x.permute(1, 0, 2).contiguous()
        rnn_out, _ = self.lstm(seq_first)
        # Keep only the hidden state of the last time step.
        last_hidden = self.dropout(rnn_out[-1, :, :])
        # Project onto all forecast steps at once, then reshape to 3-D.
        flat = self.fc(last_hidden.squeeze(0))
        return flat.view(-1, self.forecast_step, self.dim)
# Instantiate the LSTM forecaster with the hyper-parameters from `params`.
model = LstmModel(dim=params['data_dim'],
                  window=params['window'],
                  forecast_step=params['forecast_step'],
                  hidden_rnn=params['hidden_rnn'],
                  n_layers=params['n_layers'],
                  dropout=params['dropout'])
4.8 定义模型构建适配函数
训练,测试,保存
import torch
import torch.nn as nn
from torch import optim
from torch.autograd import Variable
import numpy as np
class TSEstimator(object):
    """Train, evaluate and persist a time-series model.

    window: number of time values to consider in each input
    model: the forecasting module (e.g. LstmModel)
    batch_size: mini-batch size
    device: cpu or gpu
    forecast_step: forecast step when forecasting
    lr: learning rate
    """
    def __init__(self,
                 window,
                 model,
                 batch_size,
                 device,
                 forecast_step=1,
                 lr=0.0001):
        self.model = model
        self.forecast_step = forecast_step
        self.window = window
        self.batch_size = batch_size
        self.device = device
        # reduction='sum' replaces the deprecated size_average=False
        # (see the UserWarning the original emitted at construction time).
        self.criterion = nn.L1Loss(reduction='sum').to(device)
        self.model = self.model.to(device)
        if device == torch.device("cuda"):
            # Single-process DDP wrapper (world_size=1), GPU path only.
            torch.distributed.init_process_group(backend='nccl',
                                                 init_method='tcp://localhost:12354',
                                                 rank=0,
                                                 world_size=1)
            self.model = nn.parallel.DistributedDataParallel(self.model)
        self.optimizer = optim.Adam(self.model.parameters(), lr)

    def _training(self, Y, output, scale, data_dim):
        """One optimisation step.

        De-normalises both tensors with *scale* so the accumulated loss is
        in the original traffic units, then backprops and updates.

        :param Y: the real values, shape (batch, forecast_step, dim)
        :param output: the model predictions, same shape
        """
        self.n_samples += output.size(0)
        output = output[:, :, 0:data_dim] * scale
        Y_scale = Y[:, :, 0:data_dim] * scale
        loss = self.criterion(output, Y_scale)
        loss.backward()
        self.optimizer.step()
        self.total_loss += loss.data

    def train_deep(self, train_data, scale, data_dim):
        """Run one training epoch over *train_data* = [X, Y].

        :return: mean per-sample L1 loss of the epoch
        """
        self.model.train()
        self.total_loss = 0
        self.n_samples = 0
        X = train_data[0]
        Y = train_data[1]
        for X, Y in self._get_batches(X, Y, self.batch_size, True):
            self.model.zero_grad()
            output = self.model(X)
            self._training(Y, output, scale, data_dim)
        return self.total_loss / self.n_samples

    def _testing(self, predict_v, output, real_v, Y):
        """Append one batch of predictions/targets to the running tensors."""
        if predict_v is None:
            if isinstance(output, list):
                predict_v = output[0]
            else:
                predict_v = output
            real_v = Y
        else:
            if isinstance(output, list):
                output = output[0]
            predict_v = torch.cat((predict_v, output))
            real_v = torch.cat((real_v, Y))
        return predict_v, output, real_v

    def test_deep(self, test_data, scale, data_dim):
        """Evaluate on *test_data* = [X, Y] without gradients.

        :return: (predictions, targets) as de-normalised numpy arrays
        """
        X = test_data[0]
        Y = test_data[1]
        with torch.no_grad():
            self.model.eval()
            predict_v = None
            real_v = None
            for X, Y in self._get_batches(X, Y, self.batch_size, False):
                output = self.model(X)
                predict_v, output, real_v = self._testing(predict_v, output, real_v, Y)
            predict_v = (predict_v[:, :, 0:data_dim] * scale).data.cpu().numpy()
            real_v = (real_v * scale).data.cpu().numpy()
            torch.cuda.empty_cache()
        return predict_v, real_v

    def _get_batches(self, inputs, targets, batch_size, shuffle=True):
        """Yield (X, Y) mini-batches on self.device, optionally shuffled."""
        length = len(inputs)
        if shuffle:
            index = torch.randperm(length)
        else:
            index = torch.LongTensor(range(length))
        start_idx = 0
        while start_idx < length:
            end_idx = min(length, start_idx + batch_size)
            excerpt = index[start_idx:end_idx]
            yield inputs[excerpt].to(self.device), targets[excerpt].to(self.device)
            start_idx += batch_size

    def save_model(self, best_model, model_path):
        """Persist the whole model object to *model_path*."""
        print("saved at: ", model_path)
        torch.save(best_model, model_path)
4.9 对应评价指标
预测相关评价指标 参考链接
from math import sqrt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
def check_error(real_v, pred_v, name_col='error', index_name='measure'):
    """Flatten both arrays to (samples, outputs) and return the metric table."""
    flat_real = real_v.reshape((real_v.shape[0], -1))
    flat_pred = pred_v.reshape((pred_v.shape[0], -1))
    return assessment_indexes(flat_real, flat_pred, name_col=name_col, index_name=index_name)
def assessment_indexes(real_v, pred_v, name_col='error', index_name='measure'):
    """Compute mse/rmse/mae/mape/r2/correlation for 2-D (samples, outputs) arrays.

    NOTE(review): mape divides by real_v, so any true value of 0 breaks it;
    correlation columns whose true std is 0 are masked out via *index*.
    """
    mse = mean_squared_error(real_v, pred_v)
    rmse = sqrt(mse)
    mae = mean_absolute_error(real_v, pred_v)
    mape = np.mean(np.abs((real_v - pred_v) / real_v)) * 100
    r_square = r2_score(real_v, pred_v)
    sigma_p = pred_v.std(axis=0)
    sigma_g = real_v.std(axis=0, dtype=np.float32)
    mean_p = pred_v.mean(axis=0)
    mean_g = real_v.mean(axis=0)
    index = (sigma_g != 0)
    # Guard against an all-zero std producing 0/0 in the correlation below.
    if not sigma_g.all():
        sigma_g = 1e-06
    if not sigma_p.all():
        sigma_p = 1e-06
    correlation = ((pred_v - mean_p) * (real_v - mean_g)).mean(axis=0) / (sigma_p * sigma_g)
    correlation = correlation[index].mean()
    rows = [mse, rmse, mae, mape, r_square, correlation]
    table = pd.DataFrame(rows,
                         index=['mse', 'rmse', 'mae', 'mape', 'r_square', 'correlation'],
                         columns=[name_col])
    table.index.name = index_name
    return table
4.10 定义测试结果保存函数
时序预测有时候因为数据量级的关系,根据评价指标往往不能很好得看出模型的预测效果,例如对于mae(Mean Absolute Error, 平均绝对误差)评价指标,
若数据本来就是0.01级别的,那么mae误差也就是0.01级别的,看起来mae很低,但实际效果并不一定好。
因此需要将真实值与预测值画图查看预测结果
# Persist the best validation predictions next to the ground truth.
def save_test_pred_csv(best_pre, val_real, valid_data_id_df):
    """Write TimePoint/att1/att2/Value/predValue rows to train_rs_path."""
    id_cols = ['TimePoint', 'att1', 'att2']
    id_len = len(id_cols)
    value_tag = "Value"
    columns = id_cols + [value_tag, "pred" + value_tag]
    rows = np.concatenate(
        (np.array(valid_data_id_df).reshape(-1, id_len),
         val_real.reshape(-1, params['data_dim']),
         best_pre.reshape(-1, params['data_dim'])),
        axis=1)
    pd.DataFrame(rows, columns=columns).to_csv(train_rs_path, index=False)
4.11 开始训练及查看精度指标
# Build the trainer; the L1Loss size_average warning below originates here.
estimator = TSEstimator(window=params['window'], model=model, batch_size=params['batch_size'], device= device, forecast_step=params['forecast_step'])
/home/ma-user/anaconda3/envs/Pytorch-1.0.0/lib/python3.6/site-packages/torch/nn/_reduction.py:49: UserWarning: size_average and reduce args will be deprecated, please use reduction='sum' instead.
warnings.warn(warning.format(ret))
start_time = time.time()
# Track the best model across ALL epochs.  The original re-initialised
# best_val/best_model/best_pre inside the loop, so every epoch compared
# against +inf and was saved unconditionally (visible in the log above).
best_val = float("inf")
best_model = None
best_pre = None
if not os.path.exists(train_dir_url):
    os.makedirs(train_dir_url)
for epoch in range(params['epochs']):
    s_t = time.time()
    train_loss = estimator.train_deep(train_v, normalize_scale, 1)
    val_pre, val_real = estimator.test_deep(valid_v, normalize_scale, 1)
    # check_error's signature is (real_v, pred_v); the original passed the
    # arguments swapped, so mape/r2 were computed against the predictions.
    error_val_series = check_error(val_real, val_pre)
    each_time = time.time() - s_t
    field_names = ["epoch", "each_time", "train_loss", "rmse", "mae", "mape"]
    tb_values = [epoch, each_time, train_loss.cpu().numpy().item(),
                 round(error_val_series.values[1][0], 4),
                 round(error_val_series.values[2][0], 4),
                 round(error_val_series.values[3][0], 4),
                 ]
    print("epoch: %d, each_time: %.4f, train_loss: %.4f, rmse: %.4f, mae: %.4f, mape: %.4f"
          % (epoch, each_time, float(train_loss.cpu().numpy()),
             error_val_series.values[1][0], error_val_series.values[2][0], error_val_series.values[3][0]))
    # Model selection metric: validation MAE (row index 2 of the table).
    val_loss = error_val_series.values[2][0]
    if val_loss < best_val:
        best_val = val_loss
        del best_model
        best_model = copy.deepcopy(model)
        del best_pre
        best_pre = val_pre
        # save model
        estimator.save_model(best_model, model_path)
        # save best pre value
        save_test_pred_csv(best_pre, val_real, valid_data_id_df)
print('total cost time: %.1f s' % (time.time() - start_time))
epoch: 0, each_time: 2.2887, train_loss: 732660224.0000, rmse: 18480941.3889, mae: 10499918.0000, mape: 20.5913
saved at: ./network_traffic_forecast/train_output/time_series.pt
/home/ma-user/anaconda3/envs/Pytorch-1.0.0/lib/python3.6/site-packages/torch/serialization.py:250: UserWarning: Couldn't retrieve source code for container of type LstmModel. It won't be checked for correctness upon loading.
"type " + obj.__name__ + ". It won't be checked "
epoch: 1, each_time: 2.2436, train_loss: 83064176.0000, rmse: 11715864.9902, mae: 5928672.5000, mape: 9.1319
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 2, each_time: 2.2947, train_loss: 67857488.0000, rmse: 10946500.4182, mae: 5639465.0000, mape: 9.0063
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 3, each_time: 2.4198, train_loss: 63916284.0000, rmse: 10221462.3373, mae: 5354190.0000, mape: 10.0744
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 4, each_time: 2.2429, train_loss: 59925384.0000, rmse: 9200152.1482, mae: 5040141.5000, mape: 20.1258
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 5, each_time: 2.2552, train_loss: 56456340.0000, rmse: 8593183.9666, mae: 4489315.0000, mape: 6.5258
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 6, each_time: 2.2758, train_loss: 52385124.0000, rmse: 8046146.3170, mae: 4178100.7500, mape: 6.2978
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 7, each_time: 2.2549, train_loss: 50858740.0000, rmse: 7622845.0057, mae: 4177319.2500, mape: 6.1059
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 8, each_time: 2.2831, train_loss: 47659808.0000, rmse: 7048211.2660, mae: 3981684.7500, mape: 8.9254
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 9, each_time: 2.2615, train_loss: 46268492.0000, rmse: 7059527.2964, mae: 3625710.0000, mape: 5.6797
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 10, each_time: 2.2460, train_loss: 44496600.0000, rmse: 6954840.9497, mae: 4355799.0000, mape: 10.9665
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 11, each_time: 2.2510, train_loss: 42034828.0000, rmse: 6776280.2120, mae: 3712848.0000, mape: 17.7975
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 12, each_time: 2.2451, train_loss: 41806208.0000, rmse: 6402058.4824, mae: 3948734.5000, mape: 44.5868
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 13, each_time: 2.3806, train_loss: 40632128.0000, rmse: 6418718.8704, mae: 3618424.0000, mape: 44.8111
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 14, each_time: 2.2790, train_loss: 39789056.0000, rmse: 5949154.1191, mae: 3137766.5000, mape: 4.8335
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 15, each_time: 2.2641, train_loss: 39603608.0000, rmse: 5897140.5197, mae: 3183744.2500, mape: 12.7539
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 16, each_time: 2.2520, train_loss: 38622912.0000, rmse: 6323395.2618, mae: 3952862.0000, mape: 10.4008
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 17, each_time: 2.2753, train_loss: 39800208.0000, rmse: 5900194.8760, mae: 3108835.2500, mape: 5.7347
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 18, each_time: 2.2649, train_loss: 38557772.0000, rmse: 6283288.0312, mae: 3462172.5000, mape: 449.1278
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 19, each_time: 2.2506, train_loss: 36756788.0000, rmse: 5714585.9282, mae: 2982806.0000, mape: 4.2068
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 20, each_time: 2.2519, train_loss: 37624304.0000, rmse: 6439459.4780, mae: 3508441.5000, mape: 7.2250
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 21, each_time: 2.2459, train_loss: 35486608.0000, rmse: 5551700.0263, mae: 2893324.5000, mape: 4.6178
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 22, each_time: 2.2928, train_loss: 35934976.0000, rmse: 5830216.4583, mae: 3010293.0000, mape: 5.4767
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 23, each_time: 2.2563, train_loss: 35198580.0000, rmse: 5931488.5109, mae: 3143048.0000, mape: 6.0568
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 24, each_time: 2.2823, train_loss: 34571464.0000, rmse: 5436309.0470, mae: 2839868.5000, mape: 9.7854
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 25, each_time: 2.3337, train_loss: 35991756.0000, rmse: 5442173.4094, mae: 2894197.2500, mape: 25.9536
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 26, each_time: 2.3083, train_loss: 34225712.0000, rmse: 5360906.3116, mae: 2823168.7500, mape: 16.2303
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 27, each_time: 2.3077, train_loss: 33879528.0000, rmse: 5430545.7099, mae: 3016221.7500, mape: 6.4068
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 28, each_time: 2.2823, train_loss: 33766292.0000, rmse: 5285943.5441, mae: 2792550.5000, mape: 15.8276
saved at: ./network_traffic_forecast/train_output/time_series.pt
epoch: 29, each_time: 2.2686, train_loss: 33408020.0000, rmse: 5261764.1908, mae: 2705327.7500, mape: 5.5690
saved at: ./network_traffic_forecast/train_output/time_series.pt
total cost time: 82.3 s
如上述日志所示,训练过程中会打印mape等评价指标;注意代码中模型保存实际依据的是验证集mae(error_val_series 的 mae 行)
- MAPE平均绝对百分比误差(Mean Absolute Percentage Error)
范围[0,+∞),MAPE 为0%表示完美模型,MAPE 大于 100 %则表示劣质模型。注意点:当真实值有数据等于0时,存在分母0除问题,该公式不可用!
4.12 查看预测结果
# Load the saved validation predictions for inspection.
train_rs = pd.read_csv(train_rs_path)
train_rs.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
TimePoint | att1 | att2 | Value | predValue | |
---|---|---|---|---|---|
0 | 2020-06-22 00:59:00 | 0 | 0 | 107992408.0 | 107767552.0 |
1 | 2020-06-22 01:00:00 | 0 | 0 | 105027000.0 | 106568848.0 |
2 | 2020-06-22 01:01:00 | 0 | 0 | 103297008.0 | 106277728.0 |
3 | 2020-06-22 01:02:00 | 0 | 0 | 100357024.0 | 106990616.0 |
4 | 2020-06-22 01:03:00 | 0 | 0 | 98986192.0 | 103577592.0 |
Value对应真实值,predValue对应预测值,TimePoint、att1、att2 分别为对应时间、区域及运营商属性值
4.13 查看每条线路预测效果对比图
定义画图函数
import matplotlib.pyplot as plt
# Plot the real and predicted series on one axis for visual comparison.
def plot(y1, y2, y1_label, y2_label, title):
    """Draw *y1* and *y2* as two labelled lines and display the figure."""
    plt.figure(figsize=(20, 8))
    plt.title(title, fontdict={'family': 'SimHei', 'weight': 'normal', 'size': 15})
    for series, label in ((y1, y1_label), (y2, y2_label)):
        plt.plot(series, label=label)
    plt.legend()  # show the legend
    plt.show()
# Route att1=0, att2=0: real vs. predicted traffic on the held-out day.
train_rs_0_0 = train_rs[(train_rs['att1'] == 0) & (train_rs['att2'] == 0)]
plot(train_rs_0_0['Value'].values,train_rs_0_0['predValue'].values,'Value','predValue','att1_0_att2_0')
# Route att1=1, att2=1: same comparison for a second route.
train_rs_1_1 = train_rs[(train_rs['att1'] == 1) & (train_rs['att2'] == 1)]
plot(train_rs_1_1['Value'].values,train_rs_1_1['predValue'].values,'Value','predValue','att1_1_att2_1')
预测效果分析:
- 如上2图分别是att1=0,att2=0 与att1=1,att2=1 两条线路在测试集上的效果对比,算法拟合良好,无明显滞后。
5 模型推理
推理采用time_series_0_0_forecast.csv数据集,为训练集中att1=0,att2=0 线路的原始数据 取一个window的长度(60个),时间戳从1592837940开始
# Load one inference window (60 raw minutes of route att1=0, att2=0).
infer_data_path = "./network_traffic_forecast/time_series_0_0_forecast.csv"
infer_data = pd.read_csv(infer_data_path)
print('行数:', infer_data.shape[0])
infer_data.head()
行数: 60
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
TimePoint | att1 | att2 | Value | |
---|---|---|---|---|
0 | 1592837940 | 0 | 0 | 340698113.0 |
1 | 1592838000 | 0 | 0 | 338812992.3 |
2 | 1592838060 | 0 | 0 | 331492000.2 |
3 | 1592838120 | 0 | 0 | 327298593.0 |
4 | 1592838180 | 0 | 0 | 325431680.5 |
# (the no-op `model_path = model_path` line was removed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize_scale = params['normalize_scale']
# Normalise the last `window` values exactly as during training.
infer_data = normalize_data(np.reshape(infer_data[-60:]["Value"].values, (-1, 1)), normalize_scale)
infer_tensor = torch.from_numpy(infer_data)
infer_tensor = infer_tensor.unsqueeze(0).float().to(device)  # -> (1, window, dim)
# map_location lets a GPU-trained checkpoint load on a CPU-only host.
model = torch.load(model_path, map_location=device).to(device)
predict = model(infer_tensor)
# tensor -> numpy; column 0 holds the predicted value
predict = predict.data.cpu().numpy()
predict = predict[:, :, 0].reshape(-1)
# Undo the max-abs normalisation.
predict = [i * normalize_scale for i in predict]
print(predict)
[0.18802738 0.18635544 0.18579344 0.1860577 0.18072018 0.18076988
0.17839946 0.18191667 0.17893821 0.1788864 ]
6. 改进模型的思路
如上内容是使用lstm构建网络流量预测模型的过程演示,模型精度93%,有如下几个思路可以提升模型的精度:
- 加入特征。当数据在峰值或其他地方波动较大的情况下,模型比较容易拟合不准确,该种情况下可以考虑在所选取的window长度的历史数据中,比如,加入mean、max、min等统计学特征;
- 模型调参。比如使用autoserch搜参;
至此,本案例完成。
- 点赞
- 收藏
- 关注作者
评论(0)