基于LSTM的CDN网络流量预测(二)

举报
HWCloudAI 发表于 2022/12/19 14:37:31 2022/12/19
【摘要】 4. 模型训练 4.1 导入相关的模块import osimport pandas as pdimport numpy as npimport timeimport torchfrom torch.autograd import Variableimport loggingimport copyimport argparsedevice = torch.device("cuda" if t...

4. 模型训练

4.1 导入相关的模块

import os
import pandas as pd
import numpy as np
import time
import torch
from torch.autograd import Variable
import logging
import copy
import argparse

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

4.2 设置训练参数

data_url = "./network_traffic_forecast/data/"  # training data source directory
train_dir_url = "./network_traffic_forecast/train_output"  # training output directory

model_path = os.path.join(train_dir_url, "time_series.pt")   # model save path
train_rs_path = os.path.join(train_dir_url, "train_rs.csv")  # prediction-result save path
# Hyper-parameters for window construction and the LSTM model
params = {
    "data_dim":1,  # dimensionality of the Value column being forecast
    "window":60,  # sliding-window length (history points fed to the model)
    "forecast_step":10,  # forecast horizon: this case predicts the next 10 minutes, so 10 steps
    "hidden_rnn":100,  # LSTM hidden size
    "n_layers":2,  # number of stacked LSTM layers
    "dropout": 0.0,  # dropout probability applied after the LSTM
    "train_interval" : 10,  # sliding step over the training set
    "batch_size" : 128,  # samples per training batch
    "epochs" : 30, # number of training epochs
    'normalize_scale': 1,  # normalization factor; overwritten later from the actual data
}

4.3 定义读写数据函数

def getTrainData(GroupCsvPath):
    """
    Read and concatenate every CSV file under *GroupCsvPath*.

    Only the columns TimePoint/att1/att2/Value are loaded; the epoch-seconds
    TimePoint column is converted to a local-time "%Y-%m-%d %H:%M:%S" string.

    :param GroupCsvPath: directory containing the training CSV files
    :return: one DataFrame with all files concatenated (fresh 0..n-1 index)
    :raises ValueError: if the directory contains no CSV files
    """
    cols = ['TimePoint', 'att1', 'att2', 'Value']
    file_list = os.listdir(GroupCsvPath)
    file_list.sort()  # deterministic file order
    frame_list = []
    for file_name in file_list:
        # match real .csv files only (the bare 'csv' suffix also matched e.g. 'xcsv')
        if not file_name.endswith('.csv'):
            continue
        # os.path.join works whether or not GroupCsvPath has a trailing separator
        with open(os.path.join(GroupCsvPath, file_name), "r") as f:
            df = pd.read_csv(f, index_col=None, usecols=cols)
            print(len(df), file_name)
            df["TimePoint"] = df["TimePoint"].apply(lambda x: time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(x)))
            frame_list.append(df)
    if not frame_list:
        # fail with a clear message instead of pd.concat's opaque error
        raise ValueError("no CSV files found in %s" % GroupCsvPath)
    return pd.concat(frame_list, ignore_index=True)

4.4 定义数据处理函数

进行数据拼接和缺失值填充

# 线性填充缺失值
def interpolating(df, att1, att2):
    """
    Re-index one route's frame to a 1-minute frequency and fill the gaps.

    Missing Value entries are linearly interpolated; the route identifiers
    att1/att2 are (re)stamped on every row, remaining NaNs are forward-filled
    and any rows still incomplete are dropped.

    :param df: one route's rows with TimePoint/att1/att2/Value columns
    :param att1, att2: identifiers of this route
    :return: gap-free frame with TimePoint back as a normal column
    """
    # work on a copy: the caller passes a slice of a bigger frame, and writing
    # into it raises SettingWithCopyWarning without reliably propagating
    df = df.copy()
    df["TimePoint"] = pd.to_datetime(df["TimePoint"])
    df = df.set_index(["TimePoint"]).asfreq("T")  # minute frequency
    df["Value"] = df["Value"].interpolate(method='linear')
    df['att1'] = att1
    df['att2'] = att2
    df = df.ffill()
    df = df.reset_index()
    df = df.dropna()
    return df

# 最大最小归一化
def normalize_data(data_values, normalize_scale, method="maxabs_scale"):
    """
    Normalize values by dividing by *normalize_scale* (max-abs scaling).

    :param data_values: scalar or array-like values to scale
    :param normalize_scale: divisor (typically max(|Value|) of the data)
    :param method: only "maxabs_scale" is supported
    :raises ValueError: for any unsupported method
    """
    if method != "maxabs_scale":
        raise ValueError("Not support this transform")
    return data_values / normalize_scale

# 样本划分
def split_data(data, data_int, window, forecast_step, data_dim, interval):
    """
    Build supervised (X, Y) samples with a sliding window per route.

    :param data: frame with TimePoint/att1/att2/Value, grouped by (att1, att2)
    :param data_int: total number of rows (used to bound the pre-allocation)
    :param window: history length fed to the model
    :param forecast_step: number of future steps in each target
    :param data_dim: dimensionality of Value (1 here)
    :param interval: sliding step between consecutive samples
    :return: ([X, Y] tensors, list of identifier rows for the target points)
    """
    # upper bound on the number of window positions over the whole frame
    idx_set = range(window + forecast_step - 1, data_int)
    group_data, data_id_lists = _data_group(data, idx_set, window, data_dim, forecast_step, interval)
    return group_data, data_id_lists


def _data_group(data, idx_set, window, data_dim, forecast_step, interval):
    """Slide the window over each (att1, att2) group and fill X/Y tensors."""
    n = len(idx_set)
    # pre-allocate to the upper bound; trimmed to the filled count below
    X = torch.zeros((n, window, data_dim))
    Y = torch.zeros((n, forecast_step, data_dim))
    data_id_lists = []  # identifier columns (TimePoint, att1, att2) of target rows
    data_group = data.groupby(by=["att1", "att2"])
    i = 0
    for _, item in data_group:
        item_valus = np.c_[item["Value"].values]  # (len, 1) column vector
        # (removed a leftover no-op expression statement that computed a value
        #  and discarded it)
        j = 0
        while j < len(item) - window - forecast_step:
            start = j
            end = start + window
            X[i, :, :] = torch.from_numpy(item_valus[start:end, :])
            Y[i, :] = torch.from_numpy(item_valus[end:end + forecast_step, :])

            j = j + interval
            i += 1
        # NOTE(review): `end` is the value left by the LAST loop iteration; the
        # slice assumes interval-aligned coverage — confirm for interval > 1
        data_id_lists.extend(
            item[["TimePoint", "att1", "att2"]][window:end + forecast_step].values)
    # trim the pre-allocated tensors to the samples actually filled
    X = X[:i, :, :]
    Y = Y[:i, :, :]
    train_g = [X, Y]
    return train_g, data_id_lists

def data_preprocess(df):
    """
    Fill gaps per route and return the cleaned data plus the route count.

    A "route" is an (att1, att2) pair. Routes whose row count is not a whole
    number of days (1440 minutes) are repaired with interpolating().

    :param df: concatenated raw data with TimePoint/att1/att2/Value
    :return: (cleaned frame sorted by TimePoint/att1/att2, number of routes)
    """
    # Use only (att1, att2) pairs that actually occur in the data. The previous
    # cartesian product of unique att1 x unique att2 counted nonexistent routes,
    # which inflated route_counts and corrupted the 1440*route_counts
    # train/validation split performed by the caller.
    routes = list(df[["att1", "att2"]].drop_duplicates().itertuples(index=False, name=None))
    new_data = pd.DataFrame([])
    for i in routes:
        item = df[(df['att1'] == i[0]) & (df['att2'] == i[1])]
        if len(item) % 1440 != 0:  # not a whole number of days -> has gaps
            item = interpolating(item, i[0], i[1])
        new_data = pd.concat([new_data, item])
    new_data = new_data.dropna(axis=0)
    new_data.sort_values(by=["TimePoint", "att1", "att2"], inplace=True)
    return new_data, len(routes)

4.5 定义滑动窗口划分函数

def read_df(train_df, valid_df, window, forecast_step, data_dim, train_interval, valid_interval, normalize_scale):
    """
    Normalize both splits and turn them into sliding-window sample sets.

    :param train_df, valid_df: training / validation frames (not mutated)
    :param window: history length per sample
    :param forecast_step: target horizon per sample
    :param data_dim: Value dimensionality
    :param train_interval, valid_interval: sliding steps for each split
    :param normalize_scale: max-abs normalization factor
    :return: (train samples, valid samples, scale tensor, valid identifier rows)
    """
    # Work on copies and assign with .loc: the caller passes slices of a larger
    # frame, and writing into them raised SettingWithCopyWarning (and mutated
    # the caller's data unpredictably).
    train_df = train_df.copy()
    valid_df = valid_df.copy()

    train_df.loc[:, "Value"] = normalize_data(train_df["Value"], normalize_scale)
    train_v, train_data_id_lists = split_data(train_df, len(train_df), window, forecast_step, data_dim, train_interval)

    valid_df.loc[:, "Value"] = normalize_data(valid_df["Value"], normalize_scale)
    valid_v, valid_data_id_df = split_data(valid_df, len(valid_df), window, forecast_step, data_dim, valid_interval)

    # scale as a float tensor on the target device, used to undo normalization
    scale = Variable(torch.from_numpy(np.array(normalize_scale)).float().to(device))

    return train_v, valid_v, scale, valid_data_id_df

4.6 准备训练集和测试集

时间序列一般按照时间列划分
考虑到整个数据涉及多条线路,且数据间隔为1分钟(min),因此一天对应1440个数据点
将数据排序后 取各个线路的最后一天作为测试集 -1440 * route_counts

data = getTrainData(data_url)

# fill gaps per route and count how many (att1, att2) routes exist
data, route_counts = data_preprocess(data)
# max-abs of the Value column is the normalization factor
normalize_scale = np.max(np.abs(data["Value"].values))

# persist the normalization factor alongside the other hyper-parameters
params['normalize_scale'] = normalize_scale

# train/validation split: hold out the last day (1440 minutes) of every route
train_data = data[:-1440 * route_counts]
valid_data = data[-1440 * route_counts:]
# sliding-window sample construction for training and validation
# NOTE: read_df rebinds normalize_scale to a torch scalar used by train/test below
train_v, valid_v, normalize_scale, valid_data_id_df = read_df(train_data, valid_data, params['window'], params['forecast_step'],
                                                              data_dim=params['data_dim'],
                                                              train_interval=params['train_interval'],
                                                              valid_interval=params['forecast_step'],
                                                              normalize_scale=params['normalize_scale'])
68601 time_series_1.csv

137202 time_series_2.csv

136655 time_series_3.csv

120217 time_series_4.csv


/home/ma-user/anaconda3/envs/Pytorch-1.0.0/lib/python3.6/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 

A value is trying to be set on a copy of a slice from a DataFrame.

Try using .loc[row_indexer,col_indexer] = value instead



See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

  This is separate from the ipykernel package so we can avoid doing imports until

/home/ma-user/anaconda3/envs/Pytorch-1.0.0/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 

A value is trying to be set on a copy of a slice from a DataFrame.

Try using .loc[row_indexer,col_indexer] = value instead



See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

  

/home/ma-user/anaconda3/envs/Pytorch-1.0.0/lib/python3.6/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: 

A value is trying to be set on a copy of a slice from a DataFrame.

Try using .loc[row_indexer,col_indexer] = value instead



See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

  """

4.7 定义模型结构

LSTM(Long Short Term Memory Network,长短时记忆网络)是一种改进之后的循环神经网络,可以解决RNN无法处理长距离依赖的问题,目前广泛用于时序预测领域。参考链接:论文链接

import torch.nn as nn


class LstmModel(nn.Module):
    """
    LSTM forecaster: reads a window of history and emits forecast_step values.

    Input  shape: (batch, window, dim)
    Output shape: (batch, forecast_step, dim)
    """
    def __init__(self, dim=1,
                 window=10,
                 forecast_step=1,
                 hidden_rnn=100,
                 n_layers=1,
                 dropout=0.1):
        super(LstmModel, self).__init__()
        # configuration
        self.dim = dim
        self.window = window
        self.forecast_step = forecast_step
        self.hidr = hidden_rnn
        self.nlayers = n_layers

        # layers (same creation order as before, so seeded init is identical)
        self.dropout = nn.Dropout(p=dropout)
        self.lstm = nn.LSTM(input_size=self.dim, hidden_size=self.hidr, num_layers=self.nlayers)
        self.fc = nn.Linear(in_features=self.hidr, out_features=self.forecast_step * self.dim)

    def forward(self, x, f=None):
        """Run the LSTM over the window and project its last hidden state."""
        # nn.LSTM expects (seq, batch, feature)
        seq = x.permute(1, 0, 2).contiguous()
        rnn_out, _ = self.lstm(seq)
        last = self.dropout(rnn_out[-1, :, :])  # hidden state of the final step

        # project to forecast_step * dim and reshape to (batch, step, dim)
        flat = last.squeeze(0)
        projected = self.fc(flat)
        return projected.view(-1, self.forecast_step, self.dim)
# instantiate the LSTM forecaster with the configured hyper-parameters
model = LstmModel(dim=params['data_dim'],
                       window=params['window'],
                       forecast_step=params['forecast_step'],
                       hidden_rnn=params['hidden_rnn'],
                       n_layers=params['n_layers'],
                       dropout=params['dropout'])

4.8 定义模型构建适配函数

训练,测试,保存

import torch
import torch.nn as nn
from torch import optim
from torch.autograd import Variable
import numpy as np


class TSEstimator(object):
    """
    Training/evaluation wrapper for a time-series forecasting model.

    window: number of time values in each input sample
    model: the forecasting model (e.g. LstmModel)
    batch_size: batch size
    device: cpu or gpu
    forecast_step: forecast horizon per sample
    lr: learning rate for Adam
    """
    def __init__(self,
                 window,
                 model,
                 batch_size,
                 device,
                 forecast_step=1,
                 lr=0.0001):

        self.model = model
        self.forecast_step = forecast_step
        self.window = window
        self.batch_size = batch_size
        self.device = device

        # summed (not averaged) L1 loss; reduction='sum' replaces the
        # deprecated size_average=False flagged by torch at runtime
        self.criterion = nn.L1Loss(reduction='sum').to(device)
        self.model = self.model.to(device)

        if device == torch.device("cuda"):
            # NOTE(review): single-process DDP on one GPU; assumes local port
            # 12354 is free and no process group exists yet — confirm
            torch.distributed.init_process_group(backend='nccl',
                                                 init_method='tcp://localhost:12354',
                                                 rank=0,
                                                 world_size=1)
            self.model = nn.parallel.DistributedDataParallel(self.model)

        self.optimizer = optim.Adam(self.model.parameters(), lr)

    def _training(self, Y, output, scale, data_dim):
        """
        One optimization step on a batch.

        Rescales predictions and targets back to original units, computes the
        summed L1 loss, backpropagates and steps the optimizer.
        :param Y: the real values (batch, forecast_step, >=data_dim)
        :param output: the predicted values (same layout)
        """
        self.n_samples += output.size(0)
        output = output[:, :, 0:data_dim] * scale
        Y_scale = Y[:, :, 0:data_dim] * scale
        loss = self.criterion(output, Y_scale)
        loss.backward()
        self.optimizer.step()
        self.total_loss += loss.data

    def train_deep(self, train_data, scale, data_dim):
        """
        Run one training epoch over shuffled mini-batches.

        :param train_data: [X, Y] sample tensors
        :param scale: normalization factor used to rescale the loss
        :param data_dim: number of value columns to include in the loss
        :return: mean per-sample loss for the epoch (tensor)
        """
        self.model.train()
        self.total_loss = 0
        self.n_samples = 0

        X = train_data[0]
        Y = train_data[1]
        for X, Y in self._get_batches(X, Y, self.batch_size, True):
            self.model.zero_grad()
            output = self.model(X)
            self._training(Y, output, scale, data_dim)

        return self.total_loss / self.n_samples

    def _testing(self, predict_v, output, real_v, Y):
        """Accumulate one batch of predictions/targets into the running tensors."""
        if predict_v is None:
            # first batch: initialize the accumulators
            if isinstance(output, list):
                predict_v = output[0]
            else:
                predict_v = output
            real_v = Y
        else:
            if isinstance(output, list):
                output = output[0]
            predict_v = torch.cat((predict_v, output))
            real_v = torch.cat((real_v, Y))
        return predict_v, output, real_v

    def test_deep(self, test_data, scale, data_dim):
        """
        Evaluate the model on the whole test set (no gradient, no shuffling).

        :param test_data: [X, Y] sample tensors
        :return: (predictions, ground truth) as rescaled numpy arrays
        """
        X = test_data[0]
        Y = test_data[1]

        with torch.no_grad():
            self.model.eval()
            predict_v = None
            real_v = None

            for X, Y in self._get_batches(X, Y, self.batch_size, False):
                output = self.model(X)
                predict_v, output, real_v, = self._testing(predict_v, output, real_v, Y)

            # undo normalization before returning to the caller
            predict_v = (predict_v[:, :, 0:data_dim] * scale).data.cpu().numpy()
            real_v = (real_v * scale).data.cpu().numpy()
            torch.cuda.empty_cache()

            return predict_v, real_v

    def _get_batches(self, inputs, targets, batch_size, shuffle=True):
        """Yield (X, Y) mini-batches, optionally in random order."""
        length = len(inputs)
        if shuffle:
            index = torch.randperm(length)
        else:
            index = torch.LongTensor(range(length))
        start_idx = 0
        while (start_idx < length):
            end_idx = min(length, start_idx + batch_size)
            excerpt = index[start_idx:end_idx]
            X = inputs[excerpt].to(self.device)
            Y = targets[excerpt].to(self.device)

            yield Variable(X), Variable(Y)
            start_idx += batch_size

    def save_model(self, best_model, model_path):
        """Save the full model object to *model_path*."""
        print("saved at: ", model_path)
        torch.save(best_model, model_path)

4.9 对应评价指标

预测相关评价指标 参考链接

from math import sqrt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


def check_error(real_v, pred_v, name_col='error', index_name='measure'):
    """
    Flatten each sample's outputs to 2-D and compute the error-metric table.

    :param real_v: ground truth, any shape with samples on axis 0
    :param pred_v: predictions, same leading dimension as real_v
    :return: one-column DataFrame of metrics from assessment_indexes()
    """
    flat_real = real_v.reshape((real_v.shape[0], -1))
    flat_pred = pred_v.reshape((pred_v.shape[0], -1))
    return assessment_indexes(flat_real, flat_pred, name_col=name_col, index_name=index_name)


def assessment_indexes(real_v, pred_v, name_col='error', index_name='measure'):
    """
    Compute regression error metrics between real and predicted 2-D arrays.

    :param real_v: ground-truth values, shape (n_samples, n_outputs)
    :param pred_v: predicted values, same shape as real_v
    :param name_col: column name of the resulting one-column DataFrame
    :param index_name: name assigned to the DataFrame index
    :return: DataFrame indexed by mse/rmse/mae/mape/r_square/correlation
    """
    mse = mean_squared_error(real_v, pred_v)
    rmse = sqrt(mean_squared_error(real_v, pred_v))
    mae = mean_absolute_error(real_v, pred_v)
    # NOTE(review): divides by real_v — yields inf/nan when any true value is 0
    mape = np.mean(np.abs((real_v - pred_v) / real_v)) * 100
    r_square = r2_score(real_v, pred_v)

    # per-column std/mean used for the Pearson-style correlation below
    sigma_p = (pred_v).std(axis=0)
    sigma_g = (real_v).std(axis=0, dtype=np.float32)
    mean_p = pred_v.mean(axis=0)
    mean_g = real_v.mean(axis=0)
    index = (sigma_g != 0)  # columns whose ground truth is not constant
    # NOTE(review): if ANY column std is 0, the WHOLE array is replaced by the
    # scalar 1e-06 (not only the zero entries) — confirm this is intended
    sigma_g = 1e-06 if not sigma_g.all() else sigma_g
    sigma_p = 1e-06 if not sigma_p.all() else sigma_p
    correlation = ((pred_v - mean_p) * (real_v - mean_g)).mean(axis=0) / (sigma_p * sigma_g)
    correlation = (correlation[index]).mean()

    error_group = [mse, rmse, mae, mape, r_square, correlation]
    error_series = pd.DataFrame(error_group,
                                index=['mse', 'rmse', 'mae', 'mape', 'r_square', 'correlation'],
                                columns=[name_col])
    error_series.index.name = index_name

    return error_series

4.10 定义测试结果保存函数

时序预测有时候因为数据量级的关系,根据评价指标往往不能很好得看出模型的预测效果,例如对于mae(Mean Absolute Error, 平均绝对误差)评价指标,
若数据本来就是0.01级别的,那么mae误差也就是0.01级别的,看起来mae很低,但实际效果并不一定好。

因此需要将真实值与预测值画图查看预测结果

# save best pre value
def save_test_pred_csv(best_pre, val_real, valid_data_id_df):
    """
    Write the best validation predictions next to the ground truth as CSV.

    Columns: TimePoint, att1, att2, Value (real), predValue (predicted);
    the file goes to the module-level train_rs_path.
    """
    rs_tag = ['TimePoint', 'att1', 'att2']
    id_len = len(rs_tag)  # number of identifier columns
    value_tag = "Value"
    rs_tag += [value_tag, "pred" + value_tag]

    ids = np.array(valid_data_id_df).reshape(-1, id_len)
    real_col = val_real.reshape(-1, params['data_dim'])
    pred_col = best_pre.reshape(-1, params['data_dim'])
    train_rs = np.concatenate((ids, real_col, pred_col), axis=1)

    df_result = pd.DataFrame(train_rs, columns=rs_tag)
    df_result.to_csv(train_rs_path, index=False)

4.11 开始训练及查看精度指标

estimator = TSEstimator(window=params['window'], model=model, batch_size=params['batch_size'], device= device, forecast_step=params['forecast_step'])
/home/ma-user/anaconda3/envs/Pytorch-1.0.0/lib/python3.6/site-packages/torch/nn/_reduction.py:49: UserWarning: size_average and reduce args will be deprecated, please use reduction='sum' instead.

  warnings.warn(warning.format(ret))
start_time = time.time()

# Track the best validation score across ALL epochs. These were previously
# (re)initialized inside the loop, which reset best_val every epoch and made
# the "best model" save unconditionally on every epoch.
best_val = float("inf")
best_model = None
best_pre = None

if not os.path.exists(train_dir_url):
    os.makedirs(train_dir_url)

for epoch in range(params['epochs']):
    s_t = time.time()
    train_loss = estimator.train_deep(train_v, normalize_scale, 1)

    val_pre, val_real = estimator.test_deep(valid_v, normalize_scale, 1)
    # check_error's signature is (real_v, pred_v) — pass ground truth first
    error_val_series = check_error(val_real, val_pre)

    each_time = time.time() - s_t

    print("epoch: %d, each_time: %.4f, train_loss: %.4f, rmse: %.4f, mae: %.4f, mape: %.4f"
          % (epoch, each_time, float(train_loss.cpu().numpy()),
             error_val_series.values[1][0], error_val_series.values[2][0], error_val_series.values[3][0]))

    # model selection uses validation MAE (row index 2 of the metrics table)
    val_loss = error_val_series.values[2][0]

    if val_loss < best_val:
        best_val = val_loss
        best_model = copy.deepcopy(model)
        best_pre = val_pre

        # save the best model and its validation predictions
        estimator.save_model(best_model, model_path)
        save_test_pred_csv(best_pre, val_real, valid_data_id_df)

print('total cost time: %.1f s' % (time.time() - start_time))
epoch: 0, each_time: 2.2887, train_loss: 732660224.0000, rmse: 18480941.3889, mae: 10499918.0000, mape: 20.5913

saved at:  ./network_traffic_forecast/train_output/time_series.pt


/home/ma-user/anaconda3/envs/Pytorch-1.0.0/lib/python3.6/site-packages/torch/serialization.py:250: UserWarning: Couldn't retrieve source code for container of type LstmModel. It won't be checked for correctness upon loading.

  "type " + obj.__name__ + ". It won't be checked "


epoch: 1, each_time: 2.2436, train_loss: 83064176.0000, rmse: 11715864.9902, mae: 5928672.5000, mape: 9.1319

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 2, each_time: 2.2947, train_loss: 67857488.0000, rmse: 10946500.4182, mae: 5639465.0000, mape: 9.0063

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 3, each_time: 2.4198, train_loss: 63916284.0000, rmse: 10221462.3373, mae: 5354190.0000, mape: 10.0744

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 4, each_time: 2.2429, train_loss: 59925384.0000, rmse: 9200152.1482, mae: 5040141.5000, mape: 20.1258

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 5, each_time: 2.2552, train_loss: 56456340.0000, rmse: 8593183.9666, mae: 4489315.0000, mape: 6.5258

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 6, each_time: 2.2758, train_loss: 52385124.0000, rmse: 8046146.3170, mae: 4178100.7500, mape: 6.2978

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 7, each_time: 2.2549, train_loss: 50858740.0000, rmse: 7622845.0057, mae: 4177319.2500, mape: 6.1059

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 8, each_time: 2.2831, train_loss: 47659808.0000, rmse: 7048211.2660, mae: 3981684.7500, mape: 8.9254

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 9, each_time: 2.2615, train_loss: 46268492.0000, rmse: 7059527.2964, mae: 3625710.0000, mape: 5.6797

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 10, each_time: 2.2460, train_loss: 44496600.0000, rmse: 6954840.9497, mae: 4355799.0000, mape: 10.9665

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 11, each_time: 2.2510, train_loss: 42034828.0000, rmse: 6776280.2120, mae: 3712848.0000, mape: 17.7975

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 12, each_time: 2.2451, train_loss: 41806208.0000, rmse: 6402058.4824, mae: 3948734.5000, mape: 44.5868

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 13, each_time: 2.3806, train_loss: 40632128.0000, rmse: 6418718.8704, mae: 3618424.0000, mape: 44.8111

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 14, each_time: 2.2790, train_loss: 39789056.0000, rmse: 5949154.1191, mae: 3137766.5000, mape: 4.8335

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 15, each_time: 2.2641, train_loss: 39603608.0000, rmse: 5897140.5197, mae: 3183744.2500, mape: 12.7539

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 16, each_time: 2.2520, train_loss: 38622912.0000, rmse: 6323395.2618, mae: 3952862.0000, mape: 10.4008

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 17, each_time: 2.2753, train_loss: 39800208.0000, rmse: 5900194.8760, mae: 3108835.2500, mape: 5.7347

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 18, each_time: 2.2649, train_loss: 38557772.0000, rmse: 6283288.0312, mae: 3462172.5000, mape: 449.1278

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 19, each_time: 2.2506, train_loss: 36756788.0000, rmse: 5714585.9282, mae: 2982806.0000, mape: 4.2068

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 20, each_time: 2.2519, train_loss: 37624304.0000, rmse: 6439459.4780, mae: 3508441.5000, mape: 7.2250

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 21, each_time: 2.2459, train_loss: 35486608.0000, rmse: 5551700.0263, mae: 2893324.5000, mape: 4.6178

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 22, each_time: 2.2928, train_loss: 35934976.0000, rmse: 5830216.4583, mae: 3010293.0000, mape: 5.4767

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 23, each_time: 2.2563, train_loss: 35198580.0000, rmse: 5931488.5109, mae: 3143048.0000, mape: 6.0568

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 24, each_time: 2.2823, train_loss: 34571464.0000, rmse: 5436309.0470, mae: 2839868.5000, mape: 9.7854

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 25, each_time: 2.3337, train_loss: 35991756.0000, rmse: 5442173.4094, mae: 2894197.2500, mape: 25.9536

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 26, each_time: 2.3083, train_loss: 34225712.0000, rmse: 5360906.3116, mae: 2823168.7500, mape: 16.2303

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 27, each_time: 2.3077, train_loss: 33879528.0000, rmse: 5430545.7099, mae: 3016221.7500, mape: 6.4068

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 28, each_time: 2.2823, train_loss: 33766292.0000, rmse: 5285943.5441, mae: 2792550.5000, mape: 15.8276

saved at:  ./network_traffic_forecast/train_output/time_series.pt

epoch: 29, each_time: 2.2686, train_loss: 33408020.0000, rmse: 5261764.1908, mae: 2705327.7500, mape: 5.5690

saved at:  ./network_traffic_forecast/train_output/time_series.pt

total cost time: 82.3 s

如上述日志所示,模型保存时,使用mape做评价指标

  • MAPE平均绝对百分比误差(Mean Absolute Percentage Error)

mape

范围[0,+∞),MAPE 为0%表示完美模型,MAPE 大于 100 %则表示劣质模型。注意点:当真实值有数据等于0时,存在分母0除问题,该公式不可用!

4.12 查看预测结果

# load the saved best-epoch validation predictions and inspect the first rows
train_rs = pd.read_csv(train_rs_path)
train_rs.head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
TimePoint att1 att2 Value predValue
0 2020-06-22 00:59:00 0 0 107992408.0 107767552.0
1 2020-06-22 01:00:00 0 0 105027000.0 106568848.0
2 2020-06-22 01:01:00 0 0 103297008.0 106277728.0
3 2020-06-22 01:02:00 0 0 100357024.0 106990616.0
4 2020-06-22 01:03:00 0 0 98986192.0 103577592.0

Value对应真实值,predValue对应预测值,TimePoint、att1、att2 分别为对应时间、区域及运营商属性值

4.13 查看每条线路预测效果对比图

定义画图函数

import matplotlib.pyplot as plt

# 双坐标画图
def plot(y1, y2, y1_label, y2_label, title):
    """Plot two series on one axis for visual real-vs-predicted comparison."""
    plt.figure(figsize=(20, 8))
    plt.title(title, fontdict={'family': 'SimHei', 'weight': 'normal', 'size': 15})
    for series, label in ((y1, y1_label), (y2, y2_label)):
        plt.plot(series, label=label)
    plt.legend()  # show the legend
    plt.show()
# real vs predicted for route att1=0, att2=0
train_rs_0_0 = train_rs[(train_rs['att1'] == 0) & (train_rs['att2'] == 0)]
plot(train_rs_0_0['Value'].values,train_rs_0_0['predValue'].values,'Value','predValue','att1_0_att2_0')

# real vs predicted for route att1=1, att2=1
train_rs_1_1 = train_rs[(train_rs['att1'] == 1) & (train_rs['att2'] == 1)]
plot(train_rs_1_1['Value'].values,train_rs_1_1['predValue'].values,'Value','predValue','att1_1_att2_1')

预测效果分析:

  • 如上2图分别是att1=0,att2=0 与att1=1,att2=1 两条线路在测试集上的效果对比,算法拟合良好,无明显滞后。

5 模型推理

推理采用time_series_0_0_forecast.csv数据集,为训练集中att1=0,att2=0 线路的原始数据 取一个window的长度(60个),时间戳从1592837940开始

# inference input: one window (60 minutes) of raw data for route att1=0, att2=0
infer_data_path = "./network_traffic_forecast/time_series_0_0_forecast.csv"
infer_data = pd.read_csv(infer_data_path)
print('行数:', infer_data.shape[0])
infer_data.head()
行数: 60
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
TimePoint att1 att2 Value
0 1592837940 0 0 340698113.0
1 1592838000 0 0 338812992.3
2 1592838060 0 0 331492000.2
3 1592838120 0 0 327298593.0
4 1592838180 0 0 325431680.5
# model_path and params['normalize_scale'] were set during the training section
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize_scale = params['normalize_scale']

# take the last window (60 points) of the Value column and normalize it
infer_data = normalize_data(np.reshape(infer_data[-60:]["Value"].values, (-1, 1)), normalize_scale)
infer_tensor = torch.from_numpy(infer_data)
infer_tensor = infer_tensor.unsqueeze(0).float().to(device)

model = torch.load(model_path).to(device)
model.eval()  # disable dropout for deterministic inference
with torch.no_grad():
    predict = model(infer_tensor)

# tensor -> numpy; column 0 holds the predicted value
predict = predict.data.cpu().numpy()
predict = predict[:, :, 0].reshape(-1)

# undo the normalization
predict = [i * normalize_scale for i in predict]
print(predict)
[0.18802738 0.18635544 0.18579344 0.1860577  0.18072018 0.18076988

 0.17839946 0.18191667 0.17893821 0.1788864 ]

6. 改进模型的思路

如上内容是使用lstm构建网络流量预测模型的过程演示,模型精度93%,有如下几个思路可以提升模型的精度:

  1. 加入特征。当数据在峰值或其他地方波动较大的情况下,模型比较容易拟合不准确,该种情况下可以考虑在所选取的window长度的历史数据中,比如,加入mean、max、min等统计学特征;
  2. 模型调参。比如使用autoserch搜参;

至此,本案例完成。

【版权声明】本文为华为云社区用户原创内容,转载时必须标注文章的来源(华为云社区)、文章链接、文章作者等基本信息, 否则作者和本社区有权追究责任。如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱: cloudbbs@huaweicloud.com
  • 点赞
  • 收藏
  • 关注作者

评论(0

0/1000
抱歉,系统识别当前为高风险访问,暂不支持该操作

全部回复

上滑加载中

设置昵称

在此一键设置昵称,即可参与社区互动!

*长度不超过10个汉字或20个英文字符,设置后3个月内不可修改。

*长度不超过10个汉字或20个英文字符,设置后3个月内不可修改。