%% Time-series anomaly detection fusing an LSTM autoencoder with KNN (MATLAB)
% Dataset: two features (liusu, liuliang) plus a timestamp column.
% Pipeline: standardize -> sliding windows -> train LSTM autoencoder ->
%           embeddings + reconstruction error -> KNN on normal embeddings ->
%           combined anomaly score -> threshold selection -> map back to the
%           time axis -> plots.
% Requires: Deep Learning Toolbox; Statistics and Machine Learning Toolbox (optional).
% NOTE: a stray '|' character (web-scrape artifact) preceded this header in the
% original and would have been a parse error; it has been removed.
clear; clc; rng(42); % fixed random seed for reproducibility
%% 1. Load data
filename = 'data.csv'; % change to the actual file path
data = readtable(filename);
% Timestamps are expected in the form 2021/03/05 14:30 — TODO confirm against the CSV
data.time = datetime(data.time, 'InputFormat', 'yyyy/MM/dd HH:mm');
X_raw = [data.liusu, data.liuliang]; % the two feature columns
T = height(data);
fprintf('原始数据行数:%d\n', T);
fprintf('特征数:%d\n', size(X_raw, 2));
%% 2. Train / val-test split with synthetic anomaly labels
% BUG FIX: the original sliced X_train_raw / X_valtest_raw BEFORE injecting
% anomalies into X_raw, so the injected anomalies never appeared in the data
% actually used for detection. Inject first, then slice.
split_ratio = 0.7;
split_train = round(T * split_ratio);
T_valtest = T - split_train;                 % length of the val-test segment
% Synthetic anomalies: contiguous segments of Gaussian noise added to X_raw,
% injected only after split_train so the training data stays clean.
anomaly_ratio = 0.2;
seg_len = 50;                                % length of each injected segment
labels = zeros(T, 1);                        % 0 = normal, 1 = anomalous
rng(42);
num_anom = round(T_valtest * anomaly_ratio / seg_len);
for i = 1:num_anom
    start_idx = split_train + randi(T_valtest - seg_len);
    end_idx = start_idx + seg_len - 1;
    if end_idx <= T
        labels(start_idx:end_idx) = 1;
        X_raw(start_idx:end_idx, :) = X_raw(start_idx:end_idx, :) + 2 * randn(seg_len, 2);
    end
end
% Slice AFTER injection so X_valtest_raw contains the anomalies.
X_train_raw = X_raw(1:split_train, :);
X_valtest_raw = X_raw(split_train+1:end, :);
fprintf('异常点比例:%.2f%%\n', 100 * mean(labels(split_train+1:end)));
%% 3. Standardization (statistics computed on the training segment only,
% to avoid leaking val-test information into the model)
mu = mean(X_train_raw);
sigma = std(X_train_raw);
X_train = (X_train_raw - mu) ./ sigma;
X_valtest = (X_valtest_raw - mu) ./ sigma;
X = (X_raw - mu) ./ sigma; % full standardized series, used only for plotting
%% 4. Hyperparameters
window_len = 50;        % sliding-window length (time steps per sample)
step = 1;               % window stride
hidden_dim = 64;        % LSTM hidden units
code_dim = 32;          % embedding (bottleneck) dimension
num_layers = 1;         % number of LSTM layers
batch_size = 128;       % mini-batch size for training
epochs = 30;            % training epochs
lr = 1e-3;              % Adam learning rate
alpha = 0.65;           % weight of KNN distance in the combined score
k_knn = 15;             % number of nearest neighbours
percentile_thresh = 95; % percentile threshold when no labels are available
%% 5. Build sliding windows
train_windows = generate_windows(X_train, window_len, step);
valtest_windows = generate_windows(X_valtest, window_len, step);
% Global start/end indices of each val-test window, for mapping back to time.
valtest_map = generate_window_map(split_train+1, T, window_len, step);
fprintf('训练窗口数:%d\n', size(train_windows, 1));
fprintf('验证测试窗口数:%d\n', size(valtest_windows, 1));
%% 6. Initialize LSTM autoencoder parameters
params = initialize_lstm_ae(2, hidden_dim, code_dim, num_layers);
params = dlupdate(@(x) dlarray(x), params); % wrap all weights as dlarray for autodiff
averageGrad = [];    % Adam first-moment state
averageSqGrad = [];  % Adam second-moment state
% NOTE: the original defined a script-local anonymous `sigmoid` here. Local
% functions cannot see the script workspace, so it was dead code — the LSTM
% cell resolves sigmoid independently. It has been removed.
%% 7. Training loop
train_data = train_windows;
num_samples = size(train_data, 1);
num_batches = ceil(num_samples / batch_size);
% BUG FIX: adamupdate's iteration argument must be a global step counter.
% The original passed `epoch`, repeating the same iteration number for every
% batch in an epoch and skewing Adam's bias correction.
iteration = 0;
fprintf('开始训练...\n');
for epoch = 1:epochs
    % Reshuffle samples each epoch.
    idx = randperm(num_samples);
    train_data_shuffled = train_data(idx, :, :);
    epoch_loss = 0;
    for b = 1:num_batches
        batch_idx = (b-1)*batch_size+1 : min(b*batch_size, num_samples);
        batch_data = train_data_shuffled(batch_idx, :, :);
        XBatch = dlarray(permute(batch_data, [2, 3, 1])); % [seq_len, features, batch]
        [loss, grads] = dlfeval(@model_loss, XBatch, params);
        iteration = iteration + 1;
        [params, averageGrad, averageSqGrad] = adamupdate(params, grads, ...
            averageGrad, averageSqGrad, iteration, lr);
        % Accumulate sample-weighted loss so the epoch average is exact even
        % when the last batch is smaller.
        epoch_loss = epoch_loss + extractdata(loss) * numel(batch_idx);
    end
    avg_loss = epoch_loss / num_samples;
    if mod(epoch, 5) == 0
        fprintf('Epoch %d/%d - Train Recon MSE: %.6f\n', epoch, epochs, avg_loss);
    end
end
%% 8. Embeddings and reconstruction errors for every window
[z_train, mse_train] = get_embeddings_and_errors(params, train_windows);
[z_valtest, mse_valtest] = get_embeddings_and_errors(params, valtest_windows);
%% 9. Mean distance to the k nearest training (normal) embeddings
[~, Dist] = knnsearch(z_train, z_valtest, 'K', k_knn);
dk_valtest = mean(Dist, 2);
%% 10. Combined anomaly score: weighted sum of z-scored KNN distance and MSE
znorm = @(v) (v - mean(v)) ./ (std(v) + eps); % avoids shadowing toolbox zscore
score_valtest = alpha * znorm(dk_valtest) + (1 - alpha) * znorm(mse_valtest);
%% 11. Threshold selection (label-based when perfcurve is available)
% A window counts as anomalous if it overlaps any labeled anomalous step.
valtest_y = false(size(valtest_windows, 1), 1);
for w = 1:size(valtest_map, 1)
    valtest_y(w) = any(labels(valtest_map(w, 1):valtest_map(w, 2)) == 1);
end
if exist('perfcurve', 'file')
    % Choose the threshold maximizing Youden's J = TPR - FPR on the ROC.
    [X_roc, Y_roc, T_roc, AUC] = perfcurve(valtest_y, score_valtest, true);
    [~, best_idx] = max(Y_roc - X_roc);
    best_thr = T_roc(best_idx);
    fprintf('Validation ROC AUC = %.3f, Best threshold = %.4f\n', AUC, best_thr);
    pred = score_valtest >= best_thr;
    % Confusion counts and derived metrics (eps guards divide-by-zero).
    tp = sum(pred & valtest_y);
    fp = sum(pred & ~valtest_y);
    fn = sum(~pred & valtest_y);
    tn = sum(~pred & ~valtest_y);
    precision = tp / (tp + fp + eps);
    recall = tp / (tp + fn + eps);
    f1 = 2 * precision * recall / (precision + recall + eps);
    fprintf('Precision = %.3f, Recall = %.3f, F1 = %.3f\n', precision, recall, f1);
else
    % No Statistics Toolbox: fall back to a fixed percentile threshold.
    best_thr = prctile(score_valtest, percentile_thresh);
    fprintf('使用分位数阈值(%d%%):%.4f\n', percentile_thresh, best_thr);
    pred = score_valtest >= best_thr;
end
%% 12. Map window scores back onto the time axis (val-test segment)
% Each time step receives the MAXIMUM score over all windows covering it.
% (The original also accumulated a count_time coverage counter that was
% never used anywhere; it has been removed.)
score_time = zeros(T_valtest, 1);
for w = 1:size(valtest_map, 1)
    s_local = valtest_map(w, 1) - split_train; % window start, local index
    e_local = valtest_map(w, 2) - split_train; % window end, local index
    score_time(s_local:e_local) = max(score_time(s_local:e_local), score_valtest(w));
end
%% 13. Plots
figure('Position', [100, 100, 1400, 600]);
% Subplot 1: standardized series with true anomaly regions shaded.
subplot(2,1,1);
plot(1:T, X(:,1), 'b-', 'LineWidth', 1); hold on;
plot(1:T, X(:,2), 'r-', 'LineWidth', 1);
% BUG FIX: the legend listed three labels but only two objects were drawn,
% and shade_anomalies was defined yet never called. Shade the labeled
% anomaly regions so the third legend entry has a graphics handle.
shade_anomalies(gca, labels, [1, 0.8, 0.8], 0.4);
xlabel('Time'); ylabel('Normalized Value');
title('Multivariate Time Series with Highlighted Anomalies');
legend('Feature 1 (liusu)', 'Feature 2 (liuliang)', 'Anomaly', 'Location', 'best');
grid on;
% Subplot 2: anomaly score over the val-test segment with the threshold.
subplot(2,1,2);
t_val = (split_train+1:T)';
plot(t_val, score_time, 'k-', 'LineWidth', 1.5); hold on;
yline(best_thr, 'r--', 'Threshold', 'LineWidth', 2);
xlabel('Time'); ylabel('Anomaly Score');
title('Anomaly Scores on Validation-Test Segment');
legend('Score', 'Threshold', 'Location', 'best');
grid on;
%% Fig 4: 2-D PCA projection of the LSTM embedding vectors
% Uses z_valtest (N x D embeddings) and valtest_y (logical anomaly flags);
% pca requires the Statistics and Machine Learning Toolbox.
[~, pc_scores] = pca(z_valtest);
z2 = pc_scores(:, 1:2);        % first two principal-component scores
is_anom = logical(valtest_y);
% RGB approximations of matplotlib's 'deepskyblue' and 'crimson'.
clr_normal = [0, 0.749, 1];
clr_anom = [0.8627, 0.0784, 0.2353];
figure('Name', 'PCA Projection of Embeddings', 'NumberTitle', 'off', ...
    'Position', [100, 100, 800, 800]);
hold on;
% Normal windows (sky blue), then anomalous windows (crimson).
scatter(z2(~is_anom, 1), z2(~is_anom, 2), 10, clr_normal, 'filled', ...
    'MarkerFaceAlpha', 0.7, 'MarkerEdgeAlpha', 0.7);
scatter(z2(is_anom, 1), z2(is_anom, 2), 10, clr_anom, 'filled', ...
    'MarkerFaceAlpha', 0.7, 'MarkerEdgeAlpha', 0.7);
xlabel('PC1');
ylabel('PC2');
title('PCA Projection of LSTM Embeddings', 'FontSize', 14);
legend('Normal', 'Anomaly', 'Location', 'best');
grid on; grid minor;
hold off;
%% Fig 5: ROC curve of the combined anomaly score
% X_roc / Y_roc / AUC / best_idx only exist when the perfcurve branch ran in
% section 11. The original referenced them unconditionally and errored when
% perfcurve was unavailable; guard the whole figure instead.
if exist('X_roc', 'var') && exist('Y_roc', 'var')
    magenta = [1, 0, 1];                    % ROC curve color
    limegreen = [0.1961, 0.8039, 0.1961];   % chance-diagonal color
    gold = [1, 0.84, 0];                    % best-threshold marker color
    figure('Name', 'ROC Curve', 'NumberTitle', 'off', 'Position', [100, 100, 800, 600]);
    hold on;
    plot(X_roc, Y_roc, 'Color', magenta, 'LineWidth', 2, ...
        'DisplayName', sprintf('ROC (AUC = %.3f)', AUC));
    % Chance-level diagonal for reference.
    plot([0, 1], [0, 1], 'Color', limegreen, 'LineStyle', '--', 'LineWidth', 1.5, ...
        'DisplayName', 'Random');
    % Operating point chosen by Youden's J in section 11.
    scatter(X_roc(best_idx), Y_roc(best_idx), 80, gold, 'filled', ...
        'MarkerEdgeColor', 'k', 'LineWidth', 1, ...
        'DisplayName', 'Best threshold');
    xlabel('False Positive Rate');
    ylabel('True Positive Rate');
    title('ROC Curve of Combined Anomaly Score', 'FontSize', 14);
    legend('Location', 'southeast');
    grid on; grid minor;
    hold off;
else
    fprintf('Skipping ROC plot: perfcurve output not available.\n');
end
%% ==================== 辅助函数定义 ====================
function windows = generate_windows(data, win_len, step)
% GENERATE_WINDOWS Slice a [T x F] series into overlapping windows.
%   data    : [T x F] matrix (rows = time steps, columns = features)
%   win_len : window length in time steps
%   step    : stride between consecutive window starts
% Returns windows as a [num_wins x win_len x F] array.
[num_rows, num_feats] = size(data);
num_wins = floor((num_rows - win_len) / step) + 1;
windows = zeros(num_wins, win_len, num_feats);
for w = 1:num_wins
    first = (w - 1) * step + 1;
    last = first + win_len - 1;
    windows(w, :, :) = data(first:last, :);
end
end
function map = generate_window_map(start_global, end_global, win_len, step)
% GENERATE_WINDOW_MAP Global [start, end] index pairs for each sliding window
% over the segment start_global..end_global (inclusive).
% Returns map as a [num_wins x 2] matrix, one row per window.
T_total = end_global - start_global + 1;
num_wins = floor((T_total - win_len) / step) + 1;
starts = start_global + (0:num_wins-1)' * step;
map = [starts, starts + win_len - 1];
end
function params = initialize_lstm_ae(input_dim, hidden_dim, code_dim, num_layers) %#ok<INUSD>
% INITIALIZE_LSTM_AE Random parameters for a single-layer LSTM autoencoder.
% Weights are small Gaussian values, biases zero. Gate weights are stacked
% four-high ([input; forget; cell; output] blocks of hidden_dim rows each).
% NOTE(review): num_layers is accepted for interface compatibility but only a
% single LSTM layer is ever built — confirm whether multi-layer was intended.
scale = 0.01;
% Encoder: LSTM over the input sequence, then a linear map to the code.
params.encoder.lstm.inputWeights     = scale * randn(4*hidden_dim, input_dim);
params.encoder.lstm.recurrentWeights = scale * randn(4*hidden_dim, hidden_dim);
params.encoder.lstm.bias             = zeros(4*hidden_dim, 1);
params.encoder.fc.weight             = scale * randn(code_dim, hidden_dim);
params.encoder.fc.bias               = zeros(code_dim, 1);
% Decoder: LSTM fed the code at every step, then a linear map back to inputs.
params.decoder.lstm.inputWeights     = scale * randn(4*hidden_dim, code_dim);
params.decoder.lstm.recurrentWeights = scale * randn(4*hidden_dim, hidden_dim);
params.decoder.lstm.bias             = zeros(4*hidden_dim, 1);
params.decoder.fc.weight             = scale * randn(input_dim, hidden_dim);
params.decoder.fc.bias               = zeros(input_dim, 1);
end
function [h, c] = lstm_forward(x, h_prev, c_prev, W_i, W_h, b)
% LSTM_FORWARD One LSTM cell step for a whole batch.
%   x      : [input_dim x batch] input at this time step
%   h_prev : [hidden_dim x batch] previous hidden state
%   c_prev : [hidden_dim x batch] previous cell state
%   W_i    : [4*hidden_dim x input_dim] stacked input weights
%   W_h    : [4*hidden_dim x hidden_dim] stacked recurrent weights
%   b      : [4*hidden_dim x 1] stacked bias
% Gate row order in the stacked matrices: [input; forget; cell; output].
% The logistic is inlined so this cell is self-contained: the original
% relied on an external `sigmoid`, which the script's anonymous definition
% could never supply to a local function.
hidden_dim = size(h_prev, 1);
gates = W_i * x + W_h * h_prev + b; % [4*hidden_dim x batch]
sig = @(v) 1 ./ (1 + exp(-v));
i_g = sig(gates(1:hidden_dim, :));                  % input gate
f_g = sig(gates(hidden_dim+1:2*hidden_dim, :));     % forget gate
g_g = tanh(gates(2*hidden_dim+1:3*hidden_dim, :));  % candidate cell
o_g = sig(gates(3*hidden_dim+1:4*hidden_dim, :));   % output gate
c = f_g .* c_prev + i_g .* g_g;
h = o_g .* tanh(c);
end
function z = encode(x, params)
% ENCODE Run the encoder LSTM over a sequence batch and project the final
% hidden state to the embedding space.
%   x      : [seq_len x input_dim x batch] dlarray
%   params : parameter struct from initialize_lstm_ae (dlarray fields)
% Returns z : [batch x code_dim] embeddings.
[seq_len, input_dim, batch] = size(x);
hidden_dim = size(params.encoder.lstm.recurrentWeights, 2);
% Zero initial states, one column per batch element.
h = dlarray(zeros(hidden_dim, batch));
c = dlarray(zeros(hidden_dim, batch));
W_i = params.encoder.lstm.inputWeights;
W_h = params.encoder.lstm.recurrentWeights;
b = params.encoder.lstm.bias;
for t = 1:seq_len
% Extract the time-t slice as [input_dim x batch]. permute is used instead
% of squeeze because squeeze would collapse the wrong dimension when
% batch == 1 (the single-window case in get_embeddings_and_errors).
xt = x(t, :, :); % [1, input_dim, batch]
xt = permute(xt, [2, 3, 1]); % [input_dim, batch, 1]
xt = xt(:, :, 1); % [input_dim, batch]
[h, c] = lstm_forward(xt, h, c, W_i, W_h, b);
end
% Linear projection of the LAST hidden state only (sequence summary).
z = params.encoder.fc.weight * h + params.encoder.fc.bias; % [code_dim, batch]
z = z'; % [batch, code_dim]
end
function recon = decode(z, seq_len, params)
% DECODE Reconstruct a [seq_len x input_dim x batch] sequence from embeddings.
%   z       : [batch x code_dim] embeddings; the same code is fed to the
%             decoder LSTM at every time step.
%   seq_len : number of time steps to generate
%   params  : parameter struct from initialize_lstm_ae (dlarray fields)
% BUG FIX: the original concatenated hidden states along dim 2 (columns
% ordered batch-fastest) and then reshaped to [hidden, seq_len, batch] —
% which assumes time-fastest column order and therefore scrambled time and
% batch whenever batch > 1 (i.e. throughout training). Hidden states are now
% collected along dim 3 and permuted, which is layout-correct for any batch.
batch = size(z, 1);
hidden_dim = size(params.decoder.lstm.recurrentWeights, 2);
z = z'; % [code_dim, batch] — column-per-sample form for the LSTM cell
h = dlarray(zeros(hidden_dim, batch));
c = dlarray(zeros(hidden_dim, batch));
W_i = params.decoder.lstm.inputWeights;
W_h = params.decoder.lstm.recurrentWeights;
b = params.decoder.lstm.bias;
outputs = [];
for t = 1:seq_len
    % The decoder receives the (constant) code at every step.
    [h, c] = lstm_forward(z, h, c, W_i, W_h, b);
    outputs = cat(3, outputs, h); % [hidden_dim, batch, t]
end
outputs = permute(outputs, [1, 3, 2]); % [hidden_dim, seq_len, batch]
% Batched linear projection to input space: [input_dim, seq_len, batch].
recon = pagemtimes(params.decoder.fc.weight, outputs);
recon = recon + params.decoder.fc.bias; % bias broadcasts across pages
% Match the encoder's input layout.
recon = permute(recon, [2, 1, 3]); % [seq_len, input_dim, batch]
end
function [loss, grads] = model_loss(x, params)
% MODEL_LOSS Reconstruction MSE and parameter gradients for the autoencoder.
% Must be called through dlfeval so dlgradient can trace the computation.
%   x : [seq_len x input_dim x batch] dlarray
code = encode(x, params);
x_hat = decode(code, size(x, 1), params);
loss = mean((x_hat - x).^2, 'all');
grads = dlgradient(loss, params);
end
function [z_all, mse_all] = get_embeddings_and_errors(params, windows)
% GET_EMBEDDINGS_AND_ERRORS Encode every window and measure its
% reconstruction error, one window at a time.
%   windows : [num_wins x seq_len x input_dim]
% Returns z_all   : [num_wins x code_dim] embedding matrix
%         mse_all : [num_wins x 1] per-window reconstruction MSE
[num_wins, seq_len, ~] = size(windows);
code_dim = size(params.encoder.fc.bias, 1);
z_all = zeros(num_wins, code_dim);
mse_all = zeros(num_wins, 1);
for w = 1:num_wins
    % Reorder a single window to the model's [seq_len, input_dim, 1] layout.
    xw = dlarray(permute(windows(w, :, :), [2, 3, 1]));
    emb = encode(xw, params);
    rec = decode(emb, seq_len, params);
    mse_all(w) = extractdata(mean((rec - xw).^2, 'all'));
    z_all(w, :) = extractdata(emb);
end
end
function shade_anomalies(ax, labels, color, alpha)
% SHADE_ANOMALIES Draw translucent patches over contiguous runs of
% labels == 1, spanning the current vertical extent of axes ax.
run_start = 0; % 0 => currently outside an anomalous run
n = length(labels);
for i = 1:n
    if labels(i) == 1
        if run_start == 0
            run_start = i; % a new anomalous run begins here
        end
    elseif run_start ~= 0
        % Run ended just before i: shade [run_start, i].
        draw_region(ax, run_start, i, color, alpha);
        run_start = 0;
    end
end
% Close out a run that extends to the final sample.
if run_start ~= 0
    draw_region(ax, run_start, n, color, alpha);
end
end
function draw_region(ax, x0, x1, color, alpha)
% One translucent rectangle from x0 to x1 over the axes' full y-range.
yl = ax.YLim;
patch(ax, [x0, x1, x1, x0], yl([1, 1, 2, 2]), color, ...
    'FaceAlpha', alpha, 'EdgeColor', 'none');
end
% --- Web-scrape footer preserved as comments (was bare text, which would be
% --- a MATLAB parse error after the local functions above) ---
% Archiver|手机版|科学网 ( 京ICP备07017567号-12 )
% GMT+8, 2026-4-2 10:40
% Powered by ScienceNet.cn
% Copyright © 2007- 中国科学报社