diff --git a/.gitignore b/.gitignore index 6eab8ea..093dfa7 100644 --- a/.gitignore +++ b/.gitignore @@ -3,5 +3,6 @@ .idea/ */.idea/ /Data/ +.pyc */__pycache__/ \ No newline at end of file diff --git a/HDFS_drain3_state.bin b/HDFS_drain3_state.bin new file mode 100644 index 0000000..081c98d --- /dev/null +++ b/HDFS_drain3_state.bin @@ -0,0 +1 @@ +eJztWm1v2zYQ/iuFsQ9tUagk9W5sA5qkAYJ18RAH24AiMBSJTtToxaPotFnR/z5SlC1SFv0WxfEQf5F0InnkHZ873pH83ps8vM+vv+CQ9vqvekl+MwlIEWc3RkSCOBNPsyJO+LP37lUvwhN6y+oj9l3E6agkgGExMg2+jcLbOIkIzthPCAD7SfKcjrI8wuzP9016POdtpA45szv8wBu+vhgMLt/0xI8RzUWvdS8QPbYzOO8MIm03P7/99bH9oFoozk3X1Q9WECbTgmJSMPrzRn1+ym+ORVvOn9UdUZxOkoBi1tEdzkqOPeBBCHxeoxpI9To7Px2UIx4XxklAAy7BT/zj7xDH95j0eeEF5gTrnxPXSR7eSRwKEvYlMsIFndNXtWCjOOKyfAAAwLJZ/C+X3XVcEzH5n1vkIy6VkLMS+uO3kM1inJdmIYyDiPKjmQbGOWnTyJfgPjDi3Pg4OK2ZtKoC2ZIqnlwNCALXBvwLItPdQBUQGMgGBjQN6Hl9m80g6Cvq+Upi2oTHdXI3Mj3b8xD0geX5FvIt0ykHQnP+TGNChAJL/tAwfQM6QPDX6MuV9AV3oq8SrJCJoNXXH0F4h+kFLiZ5FlUaa/xboQ+wgLcZhM4G898l3yPCB8q/JvEEa5TkK0q6+tF0L1f8h9+dD/UPLnQGitPheZDi4oHxSsWEfRoc//aWf/GCYVlgBAkzEsa9tLR+g2O7z0Syo7Bdz9uRz4RmG+7LgQ/DIMsqvP+JSTyOmUwVfotpGGIc4Uhyk1rhoCUJZ7q+pcMshB0u/PCA2m1dWcVCs/zpQWzKC7/l2uYeLPzNWKeqO8TkXqC3KZ5YvPRCyt7XAu4eiLi4oCsBHWNPH8olpZxp3cy2m64sretpDdfs0HDNg+GuZbjVjLeCOB/P502yZ5Kny7EtO2pmwJapnXCnwwl3DhO+YXwRRUOaExzVAUYJgN+DCf+eTiLWdSS7u7jgT9ZOwKX2cQ2UtMPCVmDhQbCXTu/xCV0zGj/OWQAUzlgSXAjveS28KWbStAfoQNKXrTUhu0MTsg8mtHLlX+Yvl21ytLrTdkNxpIn3nSfPX4FpI5G/+rIWVN2oL8Xyl72qnN2FBvTNZTk78CShd7XVowq8wnMGxWJ8R/Ak4fkMXihhvjPgkH5dvGlopT1Cko39yadcK/4M9UqMSwMy27ahtwQHku+nJMiKMW51hwsYaRccPoPgm9g737SqXX/VkkhOAK+zI3OcT5OydpZTwUCocRZMFZT9STUqkrezkOlolwLQ4VIADktBKzQuK8TLFrJp7ifPp+f/7zGvt2054rP2QM7NNi7QQji4/pY9lNfwZzy5OB1y4VnEWcp6ghO8sAM/S+viZHlYAuUV2tMHpFaHXsh6oV5ot4FJ+3TLe8neHlhvtzFJu8zy1qO3j/uOXUUj22WkcqymdwBuhw7AfaEOoHr99eHifA0PsHxT5wJH0ywKsnKO1XIx/f9MWaLaBJJ0HMNe9Ur/rs5gHRcCz3MsDVpkBwJ3bEyy4lbGb4t+oy2FuRGRe8c2Zco2hXQ2hTq0KfTCberpIsMMU2OY80aXcYrzKVURAhGL/cs8P42TROylUlGxdO23VQT2NZhfk6iMMLzlZ6hJDclrLEAWRA9SNU4boj5/9uuBMbxWPIpqhMeC/BwK2AqbX5WsI3MtsHYYAaKXGgE2wLrGVtvcMWWsLzKdsDlt+qhZkZjtJYA7G9ROXwLftmCqX63IT/CYGhrEKYf+WsR1eOKPDif+6xyT8Cm/aaaSsxwyH4+r5U9seS/kmtWuF1hYZFNMg3ZOK0J3JCfc2qshqMMDR/TiDhy3vhm4xZnakntw++bo+C2t5Jf3jeuPniVusBKc5hRXxertxatysWbC+b6zlXOUd2S0V/hgh3f44OES365y9CYDAYz7IImjd/wryEpe/HadSExETMg7p+LuJ801uDHlwyZtGNfhaYL2MOHpp7JdAWgdw1kYWNneniWyJQVdhURqqamSFlRJXyEdUyFttTJSO7IbldWO3AaptoWNftXKTqMjlXTUtkglLXVUrlrqqR35amVfrdwYMmi0VUshbNANUmXmNXgjbgL/Adcvbuk= \ No newline at end of file diff --git a/anomalydetection/att_all_you_need/__init__.py b/anomalydetection/att_all_you_need/__init__.py new file mode 100644 index 0000000..9764abf --- /dev/null +++ b/anomalydetection/att_all_you_need/__init__.py @@ -0,0 +1 @@ +# -*- coding: UTF-8 -*- \ No newline at end of file diff --git a/anomalydetection/att_all_you_need/__pycache__/__init__.cpython-36.pyc b/anomalydetection/att_all_you_need/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..f6c90fd Binary files /dev/null and b/anomalydetection/att_all_you_need/__pycache__/__init__.cpython-36.pyc differ diff --git a/anomalydetection/att_all_you_need/__pycache__/encoder_self_att_predict.cpython-36.pyc b/anomalydetection/att_all_you_need/__pycache__/encoder_self_att_predict.cpython-36.pyc new file mode 100644 index 0000000..b78c8c0 Binary files /dev/null and b/anomalydetection/att_all_you_need/__pycache__/encoder_self_att_predict.cpython-36.pyc differ diff --git a/anomalydetection/att_all_you_need/__pycache__/encoder_self_att_train.cpython-36.pyc 
b/anomalydetection/att_all_you_need/__pycache__/encoder_self_att_train.cpython-36.pyc new file mode 100644 index 0000000..2d3109c Binary files /dev/null and b/anomalydetection/att_all_you_need/__pycache__/encoder_self_att_train.cpython-36.pyc differ diff --git a/anomalydetection/att_all_you_need/encoder_self_att_predict.py b/anomalydetection/att_all_you_need/encoder_self_att_predict.py new file mode 100644 index 0000000..77a9994 --- /dev/null +++ b/anomalydetection/att_all_you_need/encoder_self_att_predict.py @@ -0,0 +1,141 @@ +# -*- coding: UTF-8 -*- + +# -*- coding: UTF-8 -*- +# -*- coding: UTF-8 -*- +import torch +import json +import pandas as pd +import numpy as np +import os +import torch.nn as nn +import time +import random +from torch.utils.data import TensorDataset, DataLoader +from anomalydetection.att_all_you_need.encoder_self_att_train import Encoder + +# use cuda if available otherwise use cpu +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# len(line) < window_length + + +def make_src_mask(src, src_pad_idx): + # src = [batch size, src len] + + src_mask = (src != src_pad_idx) # + + # src_mask = [batch size, src len] # + + return src_mask.clone().detach().numpy().tolist() + + +def load_sequential_model(input_size, hidden_size, num_layers, num_classes, model_path, dropout, num_of_heads, pf_dim): + + model1 = Encoder(input_size, num_classes, hidden_size, num_layers, num_of_heads, pf_dim, dropout, device).to(device) + model1.load_state_dict(torch.load(model_path, map_location='cpu')) + model1.eval() + print('model_path: {}'.format(model_path)) + return model1 + +def filter_small_top_k(predicted, output): + filter = [] + for p in predicted: + if output[0][p] > 0.001: + filter.append(p) + return filter + + +def generate_robust_seq_label(file_path, sequence_length): + num_of_sessions = 0 + input_data, output_data, mask_data = [], [], [] + train_file = pd.read_csv(file_path) + i = 0 + while i < len(train_file): + num_of_sessions += 1 + line = [int(id) for id in train_file["Sequence"][i].split(' ')] + line = line[0:sequence_length] + if len(line) < sequence_length: + line.extend(list([0]) * (sequence_length - len(line))) + input_data.append(line) + output_data.append(int(train_file["label"][i])) + i += 1 + data_set = TensorDataset(torch.tensor(input_data), torch.tensor(output_data)) + return data_set + + +def get_batch_semantic_with_mask(seq, pattern_vec_file): + with open(pattern_vec_file, 'r') as pattern_file: + class_type_to_vec = json.load(pattern_file) + print(seq.shape) + batch_data = [] + mask_data = [] + for s in seq: + semantic_line = [] + for event in s.numpy().tolist(): + if event == 0: + semantic_line.append([-1] * 300) + else: + semantic_line.append(class_type_to_vec[str(event)]) + batch_data.append(semantic_line) + mask = make_src_mask(s, 0) + mask_data.append(mask) + return batch_data, mask_data + + +def do_predict(input_size, hidden_size, num_layers, num_classes, sequence_length, model_path, test_file_path, batch_size, pattern_vec_json, dropout, num_of_heads, pf_dim): + + sequential_model = load_sequential_model(input_size, hidden_size, num_layers, num_classes, model_path, dropout, num_of_heads, pf_dim) + + start_time = time.time() + TP = 0 + FP = 0 + TN = 0 + FN = 0 + + # create data set + sequence_data_set = generate_robust_seq_label(test_file_path, sequence_length) + # create data_loader + data_loader = DataLoader(dataset=sequence_data_set, batch_size=batch_size, shuffle=False, pin_memory=False) + + print('predict start') + with 
torch.no_grad(): + for step, (seq, label) in enumerate(data_loader): + # first traverse [0, window_size) + batch_data, mask_data = get_batch_semantic_with_mask(seq, pattern_vec_json) + seq = torch.tensor(batch_data) + mask_data = torch.tensor(mask_data) + seq = seq.view(-1, sequence_length, input_size).to(device) + #label = torch.tensor(label).view(-1).to(device) + output = sequential_model(seq, mask_data)[:, 0].clone().detach().cpu().numpy() + predicted = (output > 0.5).astype(int) + label = np.array([y for y in label]) + TP += ((predicted == 1) * (label == 1)).sum() + FP += ((predicted == 1) * (label == 0)).sum() + FN += ((predicted == 0) * (label == 1)).sum() + TN += ((predicted == 0) * (label == 0)).sum() + ALL = TP + TN + FP + FN + # Compute precision, recall and F1-measure + if TP + FP == 0: + P = 0 + else: + P = 100 * TP / (TP + FP) + + if TP + FN == 0: + R = 0 + else: + R = 100 * TP / (TP + FN) + + if P + R == 0: + F1 = 0 + else: + F1 = 2 * P * R / (P + R) + + Acc = (TP + TN) * 100 / ALL + + print('FP: {}, FN: {}, TP: {}, TN: {}'.format(FP, FN, TP, TN)) + print('Acc: {:.3f}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(Acc, P, R, F1)) + print('Finished Predicting') + elapsed_time = time.time() - start_time + print('elapsed_time: {}'.format(elapsed_time)) + + #draw_evaluation("Evaluations", ['Acc', 'Precision', 'Recall', 'F1-measure'], [Acc, P, R, F1], 'evaluations', '%') \ No newline at end of file diff --git a/anomalydetection/att_all_you_need/encoder_self_att_train.py b/anomalydetection/att_all_you_need/encoder_self_att_train.py new file mode 100644 index 0000000..c9e532e --- /dev/null +++ b/anomalydetection/att_all_you_need/encoder_self_att_train.py @@ -0,0 +1,296 @@ +# -*- coding: UTF-8 -*- +import torch +import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F +import torch.optim as optim +from torch.utils.data import TensorDataset, DataLoader + +import pandas as pd + + +import numpy as np + +import random +import math +import time +import json +import os + +# use cuda if available otherwise use cpu +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +def make_src_mask(src, src_pad_idx): + # src = [batch size, src len] + + src_mask = (src != src_pad_idx) # + + # src_mask = [batch size, src len] # + + return src_mask.clone().detach().numpy().tolist() + + +class Encoder(nn.Module): + def __init__(self, + input_dim, + output_dim, # + hid_dim, + n_layers, + n_heads, + pf_dim, + dropout, + device, + max_length=100): + super().__init__() + + self.device = device + + self.tok_embedding = nn.Linear(input_dim, hid_dim) # + self.pos_embedding = nn.Embedding(max_length, hid_dim) + + self.layers = nn.ModuleList([EncoderLayer(hid_dim, + n_heads, + pf_dim, + dropout, + device) + for _ in range(n_layers)]) + + self.dropout = nn.Dropout(dropout) + + self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device) + + self.output = nn.Linear(hid_dim, output_dim) # + + def forward(self, src, src_mask): + # src = [batch size, src len, input_dim] # + # src_mask = [batch size,1, 1, src len] # + + + batch_size = src.shape[0] + src_len = src.shape[1] + + pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device) + + # pos = [batch size, src len] + + src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos)) + + # src = [batch size, src len, hid dim] + + for layer in self.layers: + src = layer(src, src_mask) + + # src = [batch size, src len, hid dim] + output = self.output(src) # + 
output = torch.sigmoid(output[:, -1, :]) # + return output + + +class EncoderLayer(nn.Module): + def __init__(self, + hid_dim, + n_heads, + pf_dim, + dropout, + device): + super().__init__() + + self.self_attn_layer_norm = nn.LayerNorm(hid_dim) + self.ff_layer_norm = nn.LayerNorm(hid_dim) + self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device) + self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, + pf_dim, + dropout) + self.dropout = nn.Dropout(dropout) + + def forward(self, src, src_mask): + # src = [batch size, src len, hid dim] + # src_mask = [batch size, src len] + + # self attention + _src, _ = self.self_attention(src, src, src, src_mask) + + # dropout, residual connection and layer norm + src = self.self_attn_layer_norm(src + self.dropout(_src)) + + # src = [batch size, src len, hid dim] + + # positionwise feedforward + _src = self.positionwise_feedforward(src) + + # dropout, residual and layer norm + src = self.ff_layer_norm(src + self.dropout(_src)) + + # src = [batch size, src len, hid dim] + + return src + + +class MultiHeadAttentionLayer(nn.Module): + def __init__(self, hid_dim, n_heads, dropout, device): + super().__init__() + + assert hid_dim % n_heads == 0 + + self.hid_dim = hid_dim + self.n_heads = n_heads + self.head_dim = hid_dim // n_heads + + self.fc_q = nn.Linear(hid_dim, hid_dim) + self.fc_k = nn.Linear(hid_dim, hid_dim) + self.fc_v = nn.Linear(hid_dim, hid_dim) + + self.fc_o = nn.Linear(hid_dim, hid_dim) + + self.dropout = nn.Dropout(dropout) + + self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device) + + def forward(self, query, key, value, mask=None): + batch_size = query.shape[0] + + # query = [batch size, query len, hid dim] + # key = [batch size, key len, hid dim] + # value = [batch size, value len, hid dim] + + Q = self.fc_q(query) + K = self.fc_k(key) + V = self.fc_v(value) + + # Q = [batch size, query len, hid dim] + # K = [batch size, key len, hid dim] + # V = [batch size, value len, hid dim] + + Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3) + K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3) + V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3) + + # Q = [batch size, n heads, query len, head dim] + # K = [batch size, n heads, key len, head dim] + # V = [batch size, n heads, value len, head dim] + + energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale + + # energy = [batch size, n heads, query len, key len] + + if mask is not None: + mask = mask.view(batch_size, 1, 1, -1).to(device) + energy = energy.masked_fill(mask == 0, -1e10) + + attention = torch.softmax(energy, dim=-1) + + # attention = [batch size, n heads, query len, key len] + + x = torch.matmul(self.dropout(attention), V) + + # x = [batch size, n heads, query len, head dim] + + x = x.permute(0, 2, 1, 3).contiguous() + + # x = [batch size, query len, n heads, head dim] + + x = x.view(batch_size, -1, self.hid_dim) + + # x = [batch size, query len, hid dim] + + x = self.fc_o(x) + + # x = [batch size, query len, hid dim] + + return x, attention + + +class PositionwiseFeedforwardLayer(nn.Module): + def __init__(self, hid_dim, pf_dim, dropout): + super().__init__() + + self.fc_1 = nn.Linear(hid_dim, pf_dim) + self.fc_2 = nn.Linear(pf_dim, hid_dim) + + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + # x = [batch size, seq len, hid dim] + + x = self.dropout(torch.relu(self.fc_1(x))) + + # x = [batch size, seq len, pf dim] + + x = self.fc_2(x) + + 
# x = [batch size, seq len, hid dim] + + return x + + +def generate_robust_seq_label(file_path, sequence_length, pattern_vec_file): + num_of_sessions = 0 + input_data, output_data, mask_data = [], [], [] + train_file = pd.read_csv(file_path) + for i in range(len(train_file)): + num_of_sessions += 1 + line = [int(id) for id in train_file["Sequence"][i].split(' ')] + line = line[0:sequence_length] + if len(line) < sequence_length: + line.extend(list([0]) * (sequence_length - len(line))) + input_data.append(line) + output_data.append(int(train_file["label"][i])) + data_set = TensorDataset(torch.tensor(input_data), torch.tensor(output_data)) + return data_set + + +def get_batch_semantic_with_mask(seq, pattern_vec_file): + with open(pattern_vec_file, 'r') as pattern_file: + class_type_to_vec = json.load(pattern_file) + batch_data = [] + mask_data = [] + for s in seq: + semantic_line = [] + for event in s.numpy().tolist(): + if event == 0: + semantic_line.append([-1] * 300) + else: + semantic_line.append(class_type_to_vec[str(event)]) + batch_data.append(semantic_line) + mask = make_src_mask(s, 0) + mask_data.append(mask) + return batch_data, mask_data + + +def train_model(sequence_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, root_path, model_output_directory, data_file, pattern_vec_file, dropout, num_of_heads, pf_dim): + print("Train num_classes: ", num_of_classes) + model = Encoder(input_size, num_of_classes, hidden_size, num_of_layers, num_of_heads, pf_dim, dropout, device).to(device) + # create data set + sequence_data_set = generate_robust_seq_label(data_file, sequence_length, pattern_vec_file) + # create data_loader + data_loader = DataLoader(dataset=sequence_data_set, batch_size=batch_size, shuffle=True, pin_memory=False) + + # Loss and optimizer classify job + criterion = nn.BCELoss() + optimizer = optim.Adam(model.parameters()) + + # Training + for epoch in range(num_epochs): + train_loss = 0 + for step, (seq, label) in enumerate(data_loader): + batch_data, mask_data = get_batch_semantic_with_mask(seq, pattern_vec_file) + seq = torch.tensor(batch_data) + #print(seq.shape) + seq = seq.clone().detach().view(-1, sequence_length, input_size).to(device) + #print(seq.shape) + output = model(seq, torch.tensor(mask_data)) + + loss = criterion(output.squeeze(-1), label.float().to(device)) + + # Backward and optimize + optimizer.zero_grad() + loss.backward() + train_loss += loss.item() + optimizer.step() + print('Epoch [{}/{}], training_loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss / len(data_loader.dataset))) + if (epoch + 1) % num_epochs == 0: + if not os.path.isdir(model_output_directory): + os.makedirs(model_output_directory) + e_log = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(epoch+1) + torch.save(model.state_dict(), model_output_directory + '/' + e_log + '.pt') + print('Training finished') \ No newline at end of file diff --git a/anomalydetection/bi_lstm_only/__init__.py b/anomalydetection/bi_lstm_only/__init__.py new file mode 100644 index 0000000..9764abf --- /dev/null +++ b/anomalydetection/bi_lstm_only/__init__.py @@ -0,0 +1 @@ +# -*- coding: UTF-8 -*- \ No newline at end of file diff --git a/anomalydetection/bi_lstm_only/__pycache__/__init__.cpython-36.pyc b/anomalydetection/bi_lstm_only/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..5d1f1ba Binary files /dev/null and b/anomalydetection/bi_lstm_only/__pycache__/__init__.cpython-36.pyc differ diff --git 
a/anomalydetection/bi_lstm_only/__pycache__/bi_lstm_predict.cpython-36.pyc b/anomalydetection/bi_lstm_only/__pycache__/bi_lstm_predict.cpython-36.pyc new file mode 100644 index 0000000..58a695c Binary files /dev/null and b/anomalydetection/bi_lstm_only/__pycache__/bi_lstm_predict.cpython-36.pyc differ diff --git a/anomalydetection/bi_lstm_only/__pycache__/bi_lstm_train.cpython-36.pyc b/anomalydetection/bi_lstm_only/__pycache__/bi_lstm_train.cpython-36.pyc new file mode 100644 index 0000000..558b707 Binary files /dev/null and b/anomalydetection/bi_lstm_only/__pycache__/bi_lstm_train.cpython-36.pyc differ diff --git a/anomalydetection/bi_lstm_only/bi_lstm_predict.py b/anomalydetection/bi_lstm_only/bi_lstm_predict.py new file mode 100644 index 0000000..50a5357 --- /dev/null +++ b/anomalydetection/bi_lstm_only/bi_lstm_predict.py @@ -0,0 +1,127 @@ +# -*- coding: UTF-8 -*- +# -*- coding: UTF-8 -*- +# -*- coding: UTF-8 -*- +import torch +import os +import torch.nn as nn +import time +from anomalydetection.bi_lstm_only.bi_lstm_train import Model + +# use cuda if available otherwise use cpu +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# len(line) < window_length + +def generate(name, window_length): + log_keys_sequences = list() + with open(name, 'r') as f: + for line in f.readlines(): + line = tuple(map(lambda n: tuple(map(float, n.strip().split())), [x for x in line.strip().split(',') if len(x) > 0])) + # for i in range(len(line) - window_size): + # inputs.add(tuple(line[i:i+window_size])) + log_keys_sequences.append(tuple(line)) + return log_keys_sequences + + + +def load_sequential_model(input_size, hidden_size, num_layers, num_classes, model_path): + + model1 = Model(input_size, hidden_size, num_layers, num_classes, if_bidirectional=True, batch_size=0).to(device) + model1.load_state_dict(torch.load(model_path, map_location='cpu')) + model1.eval() + print('model_path: {}'.format(model_path)) + return model1 + + +def filter_small_top_k(predicted, output): + filter = [] + for p in predicted: + if output[0][p] > 0.001: + filter.append(p) + return filter + +def do_predict(input_size, hidden_size, num_layers, num_classes, window_length, model_path, anomaly_test_line_path, test_file_path, num_candidates, pattern_vec_file): + vec_to_class_type = {} + with open(pattern_vec_file, 'r') as pattern_file: + i = 0 + for line in pattern_file.readlines(): + pattern, vec = line.split('[:]') + pattern_vector = tuple(map(float, vec.strip().split(' '))) + vec_to_class_type[pattern_vector] = i + i = i + 1 + + sequential_model = load_sequential_model(input_size, hidden_size, num_layers, num_classes, model_path) + + start_time = time.time() + TP = 0 + FP = 0 + TN = 0 + FN = 0 + ALL = 0 + abnormal_loader = generate(test_file_path, window_length) + abnormal_label = [] + with open(anomaly_test_line_path) as f: + abnormal_label = [int(x) for x in f.readline().strip().split()] + print('predict start') + with torch.no_grad(): + count_num = 0 + current_file_line = 0 + for line in abnormal_loader: + i = 0 + # first traverse [0, window_size) + while i < len(line) - window_length: + lineNum = current_file_line * 200 + i + window_length + 1 + count_num += 1 + seq = line[i:i + window_length] + label = line[i + window_length] + for n in range(len(seq)): + if current_file_line * 200 + i + n + 1 in abnormal_label: + i = i + n + 1 + continue + seq = torch.tensor(seq, dtype=torch.float).view(-1, window_length, input_size).to(device) + #label = torch.tensor(label).view(-1).to(device) + output = 
sequential_model(seq) + predicted = torch.argsort(output, 1)[0][-num_candidates:] + predicted = filter_small_top_k(predicted, output) + print('{} - predict result: {}, true label: {}'.format(count_num, predicted, vec_to_class_type[tuple(label)])) + if lineNum in abnormal_label: ## 若出现异常日志,则接下来的预测跳过异常日志,保证进行预测的日志均为正常日志 + i += window_length + 1 + else: + i += 1 + ALL += 1 + if vec_to_class_type[tuple(label)] not in predicted: + if lineNum in abnormal_label: + TP += 1 + else: + FP += 1 + else: + if lineNum in abnormal_label: + FN += 1 + else: + TN += 1 + current_file_line += 1 + # Compute precision, recall and F1-measure + if TP + FP == 0: + P = 0 + else: + P = 100 * TP / (TP + FP) + + if TP + FN == 0: + R = 0 + else: + R = 100 * TP / (TP + FN) + + if P + R == 0: + F1 = 0 + else: + F1 = 2 * P * R / (P + R) + + Acc = (TP + TN) * 100 / ALL + + print('FP: {}, FN: {}, TP: {}, TN: {}'.format(FP, FN, TP, TN)) + print('Acc: {:.3f}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(Acc, P, R, F1)) + print('Finished Predicting') + elapsed_time = time.time() - start_time + print('elapsed_time: {}'.format(elapsed_time)) + + #draw_evaluation("Evaluations", ['Acc', 'Precision', 'Recall', 'F1-measure'], [Acc, P, R, F1], 'evaluations', '%') \ No newline at end of file diff --git a/anomalydetection/bi_lstm_only/bi_lstm_train.py b/anomalydetection/bi_lstm_only/bi_lstm_train.py new file mode 100644 index 0000000..c242a7b --- /dev/null +++ b/anomalydetection/bi_lstm_only/bi_lstm_train.py @@ -0,0 +1,116 @@ +# -*- coding: UTF-8 -*- +# -*- coding: UTF-8 -*- +import torch +import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F +import os +from tensorboardX import SummaryWriter +from torch.utils.data import TensorDataset, DataLoader + +# use cuda if available otherwise use cpu +from torch.autograd import Variable + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +class Model(nn.Module): + def __init__(self, input_size, hidden_size, num_of_layers, out_size, if_bidirectional, batch_size): + super(Model, self).__init__() + self.hidden_size = hidden_size + self.num_of_layers = num_of_layers + self.lstm = nn.LSTM(input_size, hidden_size, num_of_layers, batch_first=True, bidirectional=if_bidirectional) + self.fc = nn.Linear(hidden_size*2, out_size) + self.batch_size = batch_size + if if_bidirectional: + self.num_of_directions = 2 + else: + self.num_of_directions = 1 + + + # self.out = nn.Linear(in_features=in_features, out_features=out_features) + + + def init_hidden(self, size): + # size self.batch_size same + h0 = torch.zeros(self.num_of_layers*self.num_of_directions, size, self.hidden_size).to(device) + c0 = torch.zeros(self.num_of_layers*self.num_of_directions, size, self.hidden_size).to(device) + return (h0, c0) + + def forward(self, input): + # h_n: hidden state h of last time step + # c_n: hidden state c of last time step + out, _ = self.lstm(input, self.init_hidden(input.size(0))) + # out shape [batch, seqlen, numdirec*hidden] + out = out[:, -1, :] + # tmp1, tmp2 = out.split(self.hidden_size, 1) + out = self.fc(out) + # print('out[:, -1, :]:') + # print(out) + return out + + +def generate_seq_label(file_path, window_length, pattern_vec_file): + vec_to_class_type = {} + with open(pattern_vec_file, 'r') as pattern_file: + i = 0 + for line in pattern_file.readlines(): + pattern, vec = line.split('[:]') + pattern_vector = tuple(map(float, vec.strip().split(' '))) + vec_to_class_type[pattern_vector] = i + i = i + 1 + num_of_sessions = 0 + input_data, 
output_data = [], [] + with open(file_path, 'r') as file: + for line in file.readlines(): + num_of_sessions += 1 + line = tuple(map(lambda n: tuple(map(float, n.strip().split())), [x for x in line.strip().split(',') if len(x) > 0])) + if len(line) < window_length + 1: + #print(line) + continue + for i in range(len(line) - window_length): + input_data.append(line[i:i + window_length]) + # line[i] is a list need to read file form a dic{vec:log_key} to get log key + output_data.append(vec_to_class_type[line[i + window_length]]) + data_set = TensorDataset(torch.tensor(input_data, dtype=torch.float), torch.tensor(output_data)) + return data_set + + +def train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, root_path, model_output_directory, data_file, pattern_vec_file): + # log setting + log_directory = root_path + 'log_out/' + log_template = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) + + print("Train num_classes: ", num_of_classes) + model = Model(input_size, hidden_size, num_of_layers, num_of_classes, True, batch_size).to(device) + # create data set + sequence_data_set = generate_seq_label(data_file, window_length, pattern_vec_file) + # create data_loader + data_loader = DataLoader(dataset=sequence_data_set, batch_size=batch_size, shuffle=True, pin_memory=False) + writer = SummaryWriter(logdir=log_directory + log_template) + + # Loss and optimizer classify job + criterion = nn.CrossEntropyLoss() + optimizer = optim.Adam(model.parameters()) + + # Training + for epoch in range(num_epochs): + train_loss = 0 + for step, (seq, label) in enumerate(data_loader): + seq = seq.clone().detach().view(-1, window_length, input_size).to(device) + output = model(seq) + + loss = criterion(output, label.to(device)) + + # Backward and optimize + optimizer.zero_grad() + loss.backward() + train_loss += loss.item() + optimizer.step() + print('Epoch [{}/{}], training_loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss / len(data_loader.dataset))) + if (epoch + 1) % num_epochs == 0: + if not os.path.isdir(model_output_directory): + os.makedirs(model_output_directory) + e_log = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(epoch+1) + torch.save(model.state_dict(), model_output_directory + '/' + e_log + '.pt') + writer.close() + print('Training finished') \ No newline at end of file diff --git a/anomalydetection/deeplog/Model1/__pycache__/__init__.cpython-36.pyc b/anomalydetection/deeplog/Model1/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..185d304 Binary files /dev/null and b/anomalydetection/deeplog/Model1/__pycache__/__init__.cpython-36.pyc differ diff --git a/anomalydetection/deeplog/Model1/__pycache__/__init__.cpython-37.pyc b/anomalydetection/deeplog/Model1/__pycache__/__init__.cpython-37.pyc index 444811b..956f3f9 100644 Binary files a/anomalydetection/deeplog/Model1/__pycache__/__init__.cpython-37.pyc and b/anomalydetection/deeplog/Model1/__pycache__/__init__.cpython-37.pyc differ diff --git a/anomalydetection/deeplog/Model1/__pycache__/log_key_LSTM_train.cpython-36.pyc b/anomalydetection/deeplog/Model1/__pycache__/log_key_LSTM_train.cpython-36.pyc new file mode 100644 index 0000000..72f5b58 Binary files /dev/null and b/anomalydetection/deeplog/Model1/__pycache__/log_key_LSTM_train.cpython-36.pyc differ diff --git a/anomalydetection/deeplog/Model1/__pycache__/log_key_LSTM_train.cpython-37.pyc b/anomalydetection/deeplog/Model1/__pycache__/log_key_LSTM_train.cpython-37.pyc index 692a864..7a80e57 
100644 Binary files a/anomalydetection/deeplog/Model1/__pycache__/log_key_LSTM_train.cpython-37.pyc and b/anomalydetection/deeplog/Model1/__pycache__/log_key_LSTM_train.cpython-37.pyc differ diff --git a/anomalydetection/deeplog/Model1/log_key_LSTM_train.py b/anomalydetection/deeplog/Model1/log_key_LSTM_train.py index 0f222bc..05188e0 100644 --- a/anomalydetection/deeplog/Model1/log_key_LSTM_train.py +++ b/anomalydetection/deeplog/Model1/log_key_LSTM_train.py @@ -32,25 +32,33 @@ def generate_seq_label(file_path,window_length): with open(file_path, 'r') as file: for line in file.readlines(): num_of_sessions += 1 - line = tuple(map(lambda n: n, map(int, line.strip().split()))) + line = list(map(lambda n: n, map(int, line.strip().split()))) + if(len(line) log_vector4 # so each element of inputs is a sequence,and each element of that sequence is a sequence too # nn's output is the prediction of parameter value vector - if len(x) < 2*num_of_layers: - flag = 1 - for i in range(len(vectors) - window_length): - inputs.append(vectors[i: i + window_length]) - outputs.append(vectors[i + window_length]) - # print(inputs) - # print(inputs[0]) + + # if len(x) < 2*num_of_layers: + # flag = 1 + data_set = TensorDataset(torch.tensor(inputs, dtype=torch.float), torch.tensor(outputs)) - if len(vectors) > 0 and flag==0: - return data_set, len(vectors[0]) + + if len(inputs) > 0 and flag == 0: + return data_set, 10 else: return None, 0 def train_model2(model_dir,log_preprocessor_dir,num_epochs,batch_size,window_length,num_of_layers,learning_rate,hidden_size): - log_value_folder = log_preprocessor_dir + 'logvalue_train/' + log_value_folder = log_preprocessor_dir + '/train/logvalue/normal/' model_output_directory = model_dir + 'model2/' log_template = 'model2_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) file_names = os.listdir(log_value_folder) for i in range(len(file_names)): print(i) - file_name = str(i+1) + ".txt" + file_name = str(i+1) train_data_set_name = log_value_folder + file_name validation_data_set_name = train_data_set_name diff --git a/anomalydetection/deeplog/__pycache__/__init__.cpython-36.pyc b/anomalydetection/deeplog/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..de760d8 Binary files /dev/null and b/anomalydetection/deeplog/__pycache__/__init__.cpython-36.pyc differ diff --git a/anomalydetection/deeplog/__pycache__/__init__.cpython-37.pyc b/anomalydetection/deeplog/__pycache__/__init__.cpython-37.pyc index 5a7bc22..2bec9da 100644 Binary files a/anomalydetection/deeplog/__pycache__/__init__.cpython-37.pyc and b/anomalydetection/deeplog/__pycache__/__init__.cpython-37.pyc differ diff --git a/anomalydetection/deeplog/__pycache__/log_predict.cpython-36.pyc b/anomalydetection/deeplog/__pycache__/log_predict.cpython-36.pyc new file mode 100644 index 0000000..abe032d Binary files /dev/null and b/anomalydetection/deeplog/__pycache__/log_predict.cpython-36.pyc differ diff --git a/anomalydetection/deeplog/__pycache__/log_predict.cpython-37.pyc b/anomalydetection/deeplog/__pycache__/log_predict.cpython-37.pyc index 97d6fc2..f109c7d 100644 Binary files a/anomalydetection/deeplog/__pycache__/log_predict.cpython-37.pyc and b/anomalydetection/deeplog/__pycache__/log_predict.cpython-37.pyc differ diff --git a/anomalydetection/deeplog/log_predict.py b/anomalydetection/deeplog/log_predict.py index f9c348a..59c5215 100644 --- a/anomalydetection/deeplog/log_predict.py +++ b/anomalydetection/deeplog/log_predict.py @@ -8,11 +8,12 @@ import torch.nn as nn import os import 
matplotlib.pyplot as plt +from collections import Counter # use cuda if available otherwise use cpu device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - -pattern2value = [] +# record the length of the value vector for each log key +value_length_of_key = [] # inherits from the Enum class class LineNumber(Enum): @@ -20,35 +21,44 @@ class LineNumber(Enum): NUMBERS_LINE = 3 + def generate(name,window_length): - log_keys_sequences = list() + log_keys_sequences=list() + length=0 with open(name, 'r') as f: for line in f.readlines(): line = list(map(lambda n: n, map(int, line.strip().split()))) line = line + [-1] * (window_length + 1 - len(line)) # for i in range(len(line) - window_size): # inputs.add(tuple(line[i:i+window_size])) + # log_keys_sequences[tuple(line)] = log_keys_sequences.get(tuple(line), 0) + 1 log_keys_sequences.append(tuple(line)) - return log_keys_sequences + length+=1 + return log_keys_sequences,length -def value_log_cluster(log_preprocessor_dir): - log_value_folder_cluster = log_preprocessor_dir + 'logvalue_test/' - file_names = os.listdir(log_value_folder_cluster) - pattern2value.append([]) - for i in range(len(file_names)): - pattern2value.append([]) - with open(log_value_folder_cluster + str(i+1) + ".txt", 'r') as in_text: - for line in in_text.readlines(): - line = list(map(lambda n: n, map(float, line.strip().split()))) - pattern2value[i+1].append(line) +def get_value_length(log_preprocessor_dir,log_fttree_out_dir): + global value_length_of_key + value_length_of_key = [10]*(len(os.listdir(log_fttree_out_dir)) + 1) + log_value_folder = log_preprocessor_dir + '/train/logvalue/normal/' + file_names = os.listdir(log_value_folder) + # for i in range(len(file_names)): + # with open(log_value_folder + str(i+1), 'r') as f: + # x = f.readlines() + # if len(x) == 0 or x[0].strip('\n') == '-1': + # value_length_of_key.append(0) + # else: + # line = x[0].strip('\n') + # key_values = line.split(' ') + # value_length_of_key[i+1] = len(key_values[0].split(',')) -def load_model1(model_dir,input_size, hidden_size, num_layers): - num_classes = len(pattern2value) + 1 +def load_model1(model_dir,model_name,input_size, hidden_size, num_layers): + num_classes = len(value_length_of_key) + # num_classes = 28 print("Model1 num_classes: ", num_classes) model1_dir = model_dir + 'model1/' - model_path = model1_dir + 'Adam_batch_size=200;epoch=300.pt' + model_path = model1_dir + model_name model1 = Model1(input_size, hidden_size, num_layers, num_classes).to(device) model1.load_state_dict(torch.load(model_path, map_location='cpu')) model1.eval() @@ -56,16 +66,16 @@ def load_model1(model_dir,input_size, hidden_size, num_layers): return model1 -def load_model2(model_dir,input_size, hidden_size, num_layers): +def load_model2(model_dir,epoch,input_size, hidden_size, num_layers): model2_dir = model_dir+ 'model2/' model2 = [] - for i in range(len(pattern2value)): - if len(pattern2value[i]) == 0: + for i in range(len(value_length_of_key)): + if value_length_of_key[i] == 0: model2.append(None) continue - input_size = len(pattern2value[i][0]) + input_size = value_length_of_key[i] out_size = input_size - model_name = str(i+1) + '_epoch=50.pt' + model_name = str(i+1) + '_epoch=' + str(epoch)+ '.pt' model_path = model2_dir + str(i+1) + '/' + model_name if not os.path.exists(model_path): model2.append(None) @@ -90,18 +100,21 @@ def draw_evaluation(title, indexs, values, xlabel, ylabel): plt.show() -def do_predict(log_preprocessor_dir,model_dir,window_length,input_size, hidden_size, num_layers,num_candidates,mse_threshold): - abnormal_label_file =
log_preprocessor_dir + 'HDFS_abnormal_label.txt' +def do_predict(log_preprocessor_dir,log_fttree_out_dir,model_dir,model1_name,model2_num_epochs,window_length,input_size, hidden_size, num_layers,num_candidates,mse_threshold,use_model2): + # abnormal_label_file = log_preprocessor_dir + 'HDFS_abnormal_label.txt' + + get_value_length(log_preprocessor_dir,log_fttree_out_dir) - value_log_cluster(log_preprocessor_dir) - model1 = load_model1(model_dir,input_size, hidden_size, num_layers) - model2 = load_model2(model_dir,input_size, hidden_size, num_layers) + model1 = load_model1(model_dir, model1_name, input_size, hidden_size, num_layers) + + model2 = load_model2(model_dir,model2_num_epochs,10, hidden_size, num_layers) # for Model2's prediction, store which log currently predicts for each log_key. # When model one predicts normal, model2 makes predictions. # At this time, the forward few logs with the same log_key are needed to be predicted # so the pattern_index is used to record the log_key to be predicted. - pattern_index = [0]*len(pattern2value) + #pattern_index = [0]*len(pattern2value) + #pattern_index = [0] * 63 start_time = time.time() criterion = nn.MSELoss() TP = 0 @@ -109,111 +122,176 @@ def do_predict(log_preprocessor_dir,model_dir,window_length,input_size, hidden_s TN = 0 FN = 0 ALL = 0 - abnormal_loader = generate(log_preprocessor_dir+ 'logkey/logkey_test',window_length) - abnormal_label = [] - with open(abnormal_label_file) as f: - abnormal_label = [int(x) for x in f.readline().strip().split()] + test_normal_loader, test_normal_length = generate(log_preprocessor_dir+ '/test/logkey/normal',window_length) + test_abnormal_loader, test_abnormal_length=generate(log_preprocessor_dir+'/test/logkey/abnormal',window_length) + + print('predict start') + + #normal test with torch.no_grad(): - count_num = 0 - current_file_line = 0 - for line in abnormal_loader: - i = 0 - # first traverse [0, window_size) - for ii in range(window_length): - if ii < len(line): - pattern_index[line[ii]] += 1 - while i < len(line) - window_length: - lineNum = current_file_line * 10 + i + window_length + 1 - count_num += 1 - seq = line[i:i + window_length] + count = 1 + for line_num,line in enumerate(test_normal_loader): + model1_success=False + for i in range(len(line) - window_length-1): + seq0 = line[i:i + window_length] label = line[i + window_length] - seq = torch.tensor(seq, dtype=torch.float).view(-1, window_length, input_size).to(device) + + + seq0 = torch.tensor(seq0, dtype=torch.float).view( + -1,window_length,input_size).to(device) label = torch.tensor(label).view(-1).to(device) - output = model1(seq) - predicted = torch.argsort(output, 1)[0][-num_candidates:] - print('{} - predict result: {}, true label: {}'.format(count_num, predicted, label)) - now_pattern_index = pattern_index[label] - if lineNum in abnormal_label: ## if an abnormal log appears, skip it in the following predictions so that only normal logs are used for prediction - for j in range(window_length + 1): - if i + window_length + j < len(line) and line[i + window_length + j] < len(pattern_index): - pattern_index[line[i + window_length + j]] += 1 - else: + output = model1(seq0) + predicted = torch.argsort(output, + 1)[0][-num_candidates:] + if label not in predicted: + FP += 1 + model1_success=True + break + if(model1_success): + continue + + + #if model 2 predicts normal then TN+1, otherwise FP+1 + + #there are now 63 normal value prediction files; for each line, take the corresponding row under value/normal and predict on it + + # When model one predicts normal, model2 makes predictions.
+ # values:all log's value vector belongs to log_key(whose id is pattern_id) + # whether to use model 2 + if use_model2: + + seq=[] #build this window's seq from the 63 normal value prediction files + for i in range(31): + with open(log_preprocessor_dir+'/test/logvalue/normal/'+str(i+1),'r')as f: + key_values=f.readlines() + key_values=key_values[line_num].strip('\n') + if(key_values=='-1'): + continue + seq.append(key_values.split(' ')) + #convert the strings to numbers + for k1 in range(len(seq)): + for k2 in range(len(seq[k1])): + seq[k1][k2]=seq[k1][k2].strip('\n') + seq[k1][k2]=seq[k1][k2].split(',') + for k3 in range(len(seq[k1][k2])): + if(seq[k1][k2][k3]!=''): + seq[k1][k2][k3]=float(seq[k1][k2][k3]) + + #pad to full length + for i in range(len(seq)): + if(len(seq[i]) mse_threshold: + FP+=1 + model2_success=True break - i += window_length + 1 - else: - pattern_index[label] += 1 - i += 1 - ALL += 1 + if(model2_success): + break + + + #abnormal test + with torch.no_grad(): + for line_num,line in enumerate(test_abnormal_loader): + model1_success=False + for i in range(len(line) - window_length): + seq0 = line[i:i + window_length] + label = line[i + window_length] + + seq0 = torch.tensor(seq0, dtype=torch.float).view( + -1, window_length, input_size).to(device) + + label = torch.tensor(label,).view(-1).to(device) + output = model1(seq0) + predicted = torch.argsort(output, + 1)[0][-num_candidates:] if label not in predicted: - if lineNum in abnormal_label: - TN += 1 - else: - FN += 1 - # else: - # if lineNum in abnormal_label: - # FP += 1 - # else: - # TP += 1 - else: - # When model one predicts normal, model2 makes predictions. - # values:all log's value vector belongs to log_key(whose id is pattern_id) - values = pattern2value[label] - vi = now_pattern_index - if vi >= window_length and vi < len(values): - # Model2 testing - seq2 = values[vi - window_length:vi] - label2 = values[vi] - seq2 = torch.tensor(seq2, dtype=torch.float).view(-1, window_length, len(seq2[0])).to(device) - label2 = torch.tensor(label2).view(-1).to(device) - mse = 0 - if label < len(model2) and model2[label] != None: - output = model2[label](seq2) - # Calculate the MSE of the prediction result and the original result.
- # If the MSE is within the confidence interval of the Gaussian distribution, the log is a normal log - mse = criterion(output[0], label2.to(device)) - - if mse < mse_threshold: - print(mse, mse_threshold) - if lineNum in abnormal_label: - FP += 1 - else: - TP += 1 - else: - if lineNum in abnormal_label: - TN += 1 - else: - FN += 1 - else: - if lineNum in abnormal_label: - FP += 1 - else: - TP += 1 - current_file_line += 1 - # Compute precision, recall and F1-measure - if TP + FP == 0: - P = 0 - else: - P = 100 * TP / (TP + FP) + TP += 1 + model1_success=True + break + if(model1_success): + continue + + # whether to use model 2 + if use_model2: + seq=[] #build this window's seq from the 63 normal value prediction files + for i in range(31): + with open(log_preprocessor_dir+'/test/logvalue/abnormal/'+str(i+1),'r')as f: + key_values=f.readlines() + key_values=key_values[line_num].strip('\n') + if(key_values=='-1'): + continue + seq.append(key_values.split(' ')) + #convert the strings to numbers + for k1 in range(len(seq)): + for k2 in range(len(seq[k1])): + seq[k1][k2]=seq[k1][k2].strip('\n') + seq[k1][k2]=seq[k1][k2].split(',') + for k3 in range(len(seq[k1][k2])): + if(seq[k1][k2][k3]!=''): + seq[k1][k2][k3]=float(seq[k1][k2][k3]) + + #pad to full length + for i in range(len(seq)): + if(len(seq[i]) mse_threshold: + TP += 1 + model2_success = True + break + if (model2_success): + break - if P + R == 0: - F1 = 0 - else: - F1 = 2 * P * R / (P + R) + #there are now 63 normal value prediction files; for each line, take the corresponding row under value/normal and predict on it - Acc = (TP + TN) * 100 / ALL + # Compute precision, recall and F1-measure + FN = test_abnormal_length - TP + TN=test_normal_length-FP + print('FP: {}, FN: {}, TP: {}, TN: {}'.format(FP, FN, TP, TN)) - print('Acc: {:.3f}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(Acc, P, R, F1)) + Acc = (TP + TN) * 100 /(TP+TN+FP+FN) + P = 100 * TP / (TP + FP) + R = 100 * TP / (TP + FN) + F1 = 2 * P * R / (P + R) print('Finished Predicting') elapsed_time = time.time() - start_time print('elapsed_time: {}'.format(elapsed_time)) - draw_evaluation("Evaluations", ['Acc', 'Precision', 'Recall', 'F1-measure'],[Acc, P, R, F1], 'evaluations', '%') + + + + + diff --git a/anomalydetection/loganomaly/__pycache__/__init__.cpython-36.pyc b/anomalydetection/loganomaly/__pycache__/__init__.cpython-36.pyc index a94fb9b..0f6b81b 100644 Binary files a/anomalydetection/loganomaly/__pycache__/__init__.cpython-36.pyc and b/anomalydetection/loganomaly/__pycache__/__init__.cpython-36.pyc differ diff --git a/anomalydetection/loganomaly/__pycache__/__init__.cpython-37.pyc b/anomalydetection/loganomaly/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..c0f594c Binary files /dev/null and b/anomalydetection/loganomaly/__pycache__/__init__.cpython-37.pyc differ diff --git a/anomalydetection/loganomaly/__pycache__/log_anomaly_sequential_predict.cpython-36.pyc b/anomalydetection/loganomaly/__pycache__/log_anomaly_sequential_predict.cpython-36.pyc index 68a42af..4c2ee03 100644 Binary files a/anomalydetection/loganomaly/__pycache__/log_anomaly_sequential_predict.cpython-36.pyc and b/anomalydetection/loganomaly/__pycache__/log_anomaly_sequential_predict.cpython-36.pyc differ diff --git a/anomalydetection/loganomaly/__pycache__/log_anomaly_sequential_train.cpython-36.pyc b/anomalydetection/loganomaly/__pycache__/log_anomaly_sequential_train.cpython-36.pyc index 8a6f414..acfa2de 100644 Binary files a/anomalydetection/loganomaly/__pycache__/log_anomaly_sequential_train.cpython-36.pyc and b/anomalydetection/loganomaly/__pycache__/log_anomaly_sequential_train.cpython-36.pyc differ diff
--git a/anomalydetection/loganomaly/__pycache__/log_anomaly_sequential_train.cpython-37.pyc b/anomalydetection/loganomaly/__pycache__/log_anomaly_sequential_train.cpython-37.pyc new file mode 100644 index 0000000..4d0f551 Binary files /dev/null and b/anomalydetection/loganomaly/__pycache__/log_anomaly_sequential_train.cpython-37.pyc differ diff --git a/anomalydetection/loganomaly/log_anomaly_sequential_predict.py b/anomalydetection/loganomaly/log_anomaly_sequential_predict.py index 61c0f64..a35446b 100644 --- a/anomalydetection/loganomaly/log_anomaly_sequential_predict.py +++ b/anomalydetection/loganomaly/log_anomaly_sequential_predict.py @@ -31,6 +31,14 @@ def load_sequential_model(input_size, hidden_size, num_layers, num_classes, mode return model1 +def filter_small_top_k(predicted, output): + filter = [] + for p in predicted: + if output[0][p] > 0.001: + filter.append(p) + return filter + + def do_predict(input_size, hidden_size, num_layers, num_classes, window_length, model_path, anomaly_test_line_path, test_file_path, num_candidates, pattern_vec_file): vec_to_class_type = {} with open(pattern_vec_file, 'r') as pattern_file: @@ -49,8 +57,8 @@ def do_predict(input_size, hidden_size, num_layers, num_classes, window_length, TN = 0 FN = 0 ALL = 0 + skip_count = 0 abnormal_loader = generate(test_file_path, window_length) - abnormal_label = [] with open(anomaly_test_line_path) as f: abnormal_label = [int(x) for x in f.readline().strip().split()] print('predict start') @@ -61,30 +69,37 @@ def do_predict(input_size, hidden_size, num_layers, num_classes, window_length, i = 0 # first traverse [0, window_size) while i < len(line) - window_length: - lineNum = current_file_line * 10 + i + window_length + 1 + lineNum = current_file_line * 200 + i + window_length + 1 count_num += 1 seq = line[i:i + window_length] label = line[i + window_length] + for n in range(len(seq)): + if current_file_line * 200 + i + n + 1 in abnormal_label: + i = i + n + 1 + continue seq = torch.tensor(seq, dtype=torch.float).view(-1, window_length, input_size).to(device) #label = torch.tensor(label).view(-1).to(device) output = sequential_model(seq) predicted = torch.argsort(output, 1)[0][-num_candidates:] - print('{} - predict result: {}, true label: {}'.format(count_num, predicted, vec_to_class_type[tuple(label)])) + predicted = filter_small_top_k(predicted, output) + #print(output) + #print('{} - predict result: {}, true label: {}'.format(count_num, predicted, vec_to_class_type[tuple(label)])) if lineNum in abnormal_label: ## 若出现异常日志,则接下来的预测跳过异常日志,保证进行预测的日志均为正常日志 i += window_length + 1 + skip_count += 1 else: i += 1 ALL += 1 if vec_to_class_type[tuple(label)] not in predicted: if lineNum in abnormal_label: - TN += 1 + TP += 1 else: - FN += 1 + FP += 1 else: if lineNum in abnormal_label: - FP += 1 + FN += 1 else: - TP += 1 + TN += 1 current_file_line += 1 # Compute precision, recall and F1-measure if TP + FP == 0: @@ -109,5 +124,5 @@ def do_predict(input_size, hidden_size, num_layers, num_classes, window_length, print('Finished Predicting') elapsed_time = time.time() - start_time print('elapsed_time: {}'.format(elapsed_time)) - + print('skip_count: {}'.format(skip_count)) #draw_evaluation("Evaluations", ['Acc', 'Precision', 'Recall', 'F1-measure'], [Acc, P, R, F1], 'evaluations', '%') \ No newline at end of file diff --git a/anomalydetection/loganomaly/log_anomaly_sequential_train.py b/anomalydetection/loganomaly/log_anomaly_sequential_train.py index b27c607..fe7f7d7 100644 --- 
a/anomalydetection/loganomaly/log_anomaly_sequential_train.py +++ b/anomalydetection/loganomaly/log_anomaly_sequential_train.py @@ -27,8 +27,9 @@ def generate_seq_label(file_path, window_length, pattern_vec_file): for line in file.readlines(): num_of_sessions += 1 line = tuple(map(lambda n: tuple(map(float, n.strip().split())), [x for x in line.strip().split(',') if len(x) > 0])) - if len(line) < 10: - print(line) + if len(line) < window_length: + #print(line) + continue for i in range(len(line) - window_length): input_data.append(line[i:i + window_length]) # line[i] is a list need to read file form a dic{vec:log_key} to get log key @@ -69,7 +70,7 @@ def train_model(window_length, input_size, hidden_size, num_of_layers, num_of_cl train_loss += loss.item() optimizer.step() print('Epoch [{}/{}], training_loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss / len(data_loader.dataset))) - if (epoch + 1) % 100 == 0: + if (epoch + 1) % num_epochs == 0: if not os.path.isdir(model_output_directory): os.makedirs(model_output_directory) e_log = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(epoch+1) @@ -83,7 +84,7 @@ def __init__(self, input_size, hidden_size, num_of_layers, out_size): super(Model, self).__init__() self.hidden_size = hidden_size self.num_of_layers = num_of_layers - self.lstm = nn.LSTM(input_size, hidden_size, num_of_layers, batch_first=True) + self.lstm = nn.LSTM(input_size, hidden_size, num_of_layers, batch_first=True, dropout=0.5) self.fc = nn.Linear(hidden_size, out_size) # self.out = nn.Linear(in_features=in_features, out_features=out_features) diff --git a/anomalydetection/robust/__pycache__/__init__.cpython-36.pyc b/anomalydetection/robust/__pycache__/__init__.cpython-36.pyc index 9ec1bca..96ca0bf 100644 Binary files a/anomalydetection/robust/__pycache__/__init__.cpython-36.pyc and b/anomalydetection/robust/__pycache__/__init__.cpython-36.pyc differ diff --git a/anomalydetection/robust/__pycache__/bi_lstm_att_predict.cpython-36.pyc b/anomalydetection/robust/__pycache__/bi_lstm_att_predict.cpython-36.pyc index 7fca626..b34a4ba 100644 Binary files a/anomalydetection/robust/__pycache__/bi_lstm_att_predict.cpython-36.pyc and b/anomalydetection/robust/__pycache__/bi_lstm_att_predict.cpython-36.pyc differ diff --git a/anomalydetection/robust/__pycache__/bi_lstm_att_train.cpython-36.pyc b/anomalydetection/robust/__pycache__/bi_lstm_att_train.cpython-36.pyc index 22214b0..54870d8 100644 Binary files a/anomalydetection/robust/__pycache__/bi_lstm_att_train.cpython-36.pyc and b/anomalydetection/robust/__pycache__/bi_lstm_att_train.cpython-36.pyc differ diff --git a/anomalydetection/robust/bi_lstm_att_predict.py b/anomalydetection/robust/bi_lstm_att_predict.py index 2175159..02b203d 100644 --- a/anomalydetection/robust/bi_lstm_att_predict.py +++ b/anomalydetection/robust/bi_lstm_att_predict.py @@ -1,9 +1,14 @@ # -*- coding: UTF-8 -*- # -*- coding: UTF-8 -*- import torch +import json +import pandas as pd +import numpy as np import os import torch.nn as nn import time +import random +from torch.utils.data import TensorDataset, DataLoader from anomalydetection.robust.bi_lstm_att_train import Model # use cuda if available otherwise use cpu @@ -31,16 +36,48 @@ def load_sequential_model(input_size, hidden_size, num_layers, num_classes, mode print('model_path: {}'.format(model_path)) return model1 - -def do_predict(input_size, hidden_size, num_layers, num_classes, window_length, model_path, anomaly_test_line_path, test_file_path, num_candidates, pattern_vec_file): - vec_to_class_type = 
{} +def filter_small_top_k(predicted, output): + filter = [] + for p in predicted: + if output[0][p] > 0.001: + filter.append(p) + return filter + + +def generate_robust_seq_label(file_path, sequence_length): + num_of_sessions = 0 + input_data, output_data, mask_data = [], [], [] + train_file = pd.read_csv(file_path) + i = 0 + while i < len(train_file): + num_of_sessions += 1 + line = [int(id) for id in train_file["Sequence"][i].split(' ')] + line = line[0:sequence_length] + if len(line) < sequence_length: + line.extend(list([0]) * (sequence_length - len(line))) + input_data.append(line) + output_data.append(int(train_file["label"][i])) + i += 1 + data_set = TensorDataset(torch.tensor(input_data), torch.tensor(output_data)) + return data_set + + +def get_batch_semantic(seq, pattern_vec_file): with open(pattern_vec_file, 'r') as pattern_file: - i = 0 - for line in pattern_file.readlines(): - pattern, vec = line.split('[:]') - pattern_vector = tuple(map(float, vec.strip().split(' '))) - vec_to_class_type[pattern_vector] = i - i = i + 1 + class_type_to_vec = json.load(pattern_file) + batch_data = [] + for s in seq: + semantic_line = [] + for event in s.numpy().tolist(): + if event == 0: + semantic_line.append([-1] * 300) + else: + semantic_line.append(class_type_to_vec[str(event)]) + batch_data.append(semantic_line) + return batch_data + + +def do_predict(input_size, hidden_size, num_layers, num_classes, sequence_length, model_path, test_file_path, batch_size, pattern_vec_json): sequential_model = load_sequential_model(input_size, hidden_size, num_layers, num_classes, model_path) @@ -49,44 +86,30 @@ def do_predict(input_size, hidden_size, num_layers, num_classes, window_length, FP = 0 TN = 0 FN = 0 - ALL = 0 - abnormal_loader = generate(test_file_path, window_length) - abnormal_label = [] - with open(anomaly_test_line_path) as f: - abnormal_label = [int(x) for x in f.readline().strip().split()] + + # create data set + sequence_data_set = generate_robust_seq_label(test_file_path, sequence_length) + # create data_loader + data_loader = DataLoader(dataset=sequence_data_set, batch_size=batch_size, shuffle=True, pin_memory=False) + print('predict start') with torch.no_grad(): - count_num = 0 - current_file_line = 0 - for line in abnormal_loader: - i = 0 - # first traverse [0, window_size) - while i < len(line) - window_length: - lineNum = current_file_line * 10 + i + window_length + 1 - count_num += 1 - seq = line[i:i + window_length] - label = line[i + window_length] - seq = torch.tensor(seq, dtype=torch.float).view(-1, window_length, input_size).to(device) - #label = torch.tensor(label).view(-1).to(device) - output = sequential_model(seq) - predicted = torch.argsort(output, 1)[0][-num_candidates:] - print('{} - predict result: {}, true label: {}'.format(count_num, predicted, vec_to_class_type[tuple(label)])) - if lineNum in abnormal_label: ## 若出现异常日志,则接下来的预测跳过异常日志,保证进行预测的日志均为正常日志 - i += window_length + 1 - else: - i += 1 - ALL += 1 - if vec_to_class_type[tuple(label)] not in predicted: - if lineNum in abnormal_label: - TN += 1 - else: - FN += 1 - else: - if lineNum in abnormal_label: - FP += 1 - else: - TP += 1 - current_file_line += 1 + count = 0 + for step, (seq, label) in enumerate(data_loader): + batch_data = get_batch_semantic(seq, pattern_vec_json) + seq = torch.tensor(batch_data) + seq = seq.view(-1, sequence_length, input_size).to(device) + output = sequential_model(seq)[:, 0].cpu().clone().detach().numpy() + predicted = (output > 0.2).astype(int) + label = np.array([y for y in label]) + 
TP += ((predicted == 1) * (label == 1)).sum() + FP += ((predicted == 1) * (label == 0)).sum() + FN += ((predicted == 0) * (label == 1)).sum() + TN += ((predicted == 0) * (label == 0)).sum() + count += 1 + if count > 100000: + break + ALL = TP + TN + FP + FN # Compute precision, recall and F1-measure if TP + FP == 0: P = 0 diff --git a/anomalydetection/robust/bi_lstm_att_train.py b/anomalydetection/robust/bi_lstm_att_train.py index 0416371..75509f1 100644 --- a/anomalydetection/robust/bi_lstm_att_train.py +++ b/anomalydetection/robust/bi_lstm_att_train.py @@ -1,5 +1,7 @@ # -*- coding: UTF-8 -*- +import json import torch +import pandas as pd import torch.nn as nn import torch.optim as optim import torch.nn.functional as F @@ -17,25 +19,32 @@ def __init__(self, input_size, hidden_size, num_of_layers, out_size, if_bidirect super(Model, self).__init__() self.hidden_size = hidden_size self.num_of_layers = num_of_layers - self.lstm = nn.LSTM(input_size, hidden_size, num_of_layers, batch_first=True, bidirectional=if_bidirectional) - self.fc = nn.Linear(hidden_size*2, out_size) - self.batch_size = batch_size + self.lstm = nn.LSTM(input_size, hidden_size, num_of_layers, batch_first=True, bidirectional=if_bidirectional, dropout=0.5) if if_bidirectional: self.num_of_directions = 2 else: self.num_of_directions = 1 + self.fc = nn.Linear(hidden_size*self.num_of_directions, out_size) + self.batch_size = batch_size self.att_weight = nn.Parameter(torch.randn(1, 1, self.hidden_size*self.num_of_directions)) # self.out = nn.Linear(in_features=in_features, out_features=out_features) +# att BiLSTM paper actually H is different from the paper in paper H = hf + hb def attention_net(self, H): - # print(lstm_output.size()) = (squence_length, batch_size, hidden_size*layer_size) + # print(H.size()) = [batch, numdirec*hidden, seqlen] M = F.tanh(H) a = F.softmax(torch.matmul(self.att_weight, M), 2) a = torch.transpose(a, 1, 2) return torch.bmm(H, a) + def robust_attention_net(self, H): + # print(H.size()) = [batch, numdirec*hidden, seqlen] + M = torch.matmul(self.att_weight, H) + a = torch.tanh(M) + a = torch.transpose(a, 1, 2) + return torch.bmm(H, a) def init_hidden(self, size): # size self.batch_size same @@ -52,12 +61,12 @@ def forward(self, input): # out shape [batch, seqlen, numdirec*hidden] out = torch.transpose(out, 1, 2) # out shape [batch, numdirec*hidden, seqlen] - att_out = self.attention_net(out) + att_out = self.robust_attention_net(out) out = self.fc(att_out[:, :, 0]) - # print('out[:, -1, :]:') - # print(out) - return out + # out shape[batch, num_of_class = 1] + # add sigmoid + return torch.sigmoid(out) def generate_seq_label(file_path, window_length, pattern_vec_file): @@ -75,8 +84,9 @@ def generate_seq_label(file_path, window_length, pattern_vec_file): for line in file.readlines(): num_of_sessions += 1 line = tuple(map(lambda n: tuple(map(float, n.strip().split())), [x for x in line.strip().split(',') if len(x) > 0])) - if len(line) < 10: - print(line) + if len(line) < window_length + 1: + # print(line) + continue for i in range(len(line) - window_length): input_data.append(line[i:i + window_length]) # line[i] is a list need to read file form a dic{vec:log_key} to get log key @@ -85,7 +95,31 @@ def generate_seq_label(file_path, window_length, pattern_vec_file): return data_set -def train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, root_path, model_output_directory, data_file, pattern_vec_file): +def generate_robust_seq_label(file_path, 
sequence_length, pattern_vec_file): + with open(pattern_vec_file, 'r') as pattern_file: + class_type_to_vec = json.load(pattern_file) + num_of_sessions = 0 + input_data, output_data = [], [] + train_file = pd.read_csv(file_path) + for i in range(len(train_file)): + num_of_sessions += 1 + line = [int(id) for id in train_file["Sequence"][i].split(' ')] + line = line[0:sequence_length] + if len(line) < sequence_length: + line.extend(list([0]) * (sequence_length - len(line))) + semantic_line = [] + for event in line: + if event == 0: + semantic_line.append([-1] * 300) + else: + semantic_line.append(class_type_to_vec[str(event)]) + input_data.append(semantic_line) + output_data.append(int(train_file["label"][i])) + data_set = TensorDataset(torch.tensor(input_data, dtype=torch.float), torch.tensor(output_data)) + return data_set + + +def train_model(sequence_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, root_path, model_output_directory, data_file, pattern_vec_file): # log setting log_directory = root_path + 'log_out/' log_template = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) @@ -93,23 +127,23 @@ def train_model(window_length, input_size, hidden_size, num_of_layers, num_of_cl print("Train num_classes: ", num_of_classes) model = Model(input_size, hidden_size, num_of_layers, num_of_classes, True, batch_size).to(device) # create data set - sequence_data_set = generate_seq_label(data_file, window_length, pattern_vec_file) + sequence_data_set = generate_robust_seq_label(data_file, sequence_length, pattern_vec_file) # create data_loader data_loader = DataLoader(dataset=sequence_data_set, batch_size=batch_size, shuffle=True, pin_memory=False) writer = SummaryWriter(logdir=log_directory + log_template) # Loss and optimizer classify job - criterion = nn.CrossEntropyLoss() + criterion = nn.BCELoss() optimizer = optim.Adam(model.parameters()) # Training for epoch in range(num_epochs): train_loss = 0 for step, (seq, label) in enumerate(data_loader): - seq = seq.clone().detach().view(-1, window_length, input_size).to(device) + seq = seq.clone().detach().view(-1, sequence_length, input_size).to(device) output = model(seq) - loss = criterion(output, label.to(device)) + loss = criterion(output.squeeze(-1), label.float().to(device)) # Backward and optimize optimizer.zero_grad() @@ -117,7 +151,7 @@ def train_model(window_length, input_size, hidden_size, num_of_layers, num_of_cl train_loss += loss.item() optimizer.step() print('Epoch [{}/{}], training_loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss / len(data_loader.dataset))) - if (epoch + 1) % 100 == 0: + if (epoch + 1) % num_epochs == 0: if not os.path.isdir(model_output_directory): os.makedirs(model_output_directory) e_log = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(epoch+1) diff --git a/anomalydetection/self_att_lstm/__init__.py b/anomalydetection/self_att_lstm/__init__.py new file mode 100644 index 0000000..9764abf --- /dev/null +++ b/anomalydetection/self_att_lstm/__init__.py @@ -0,0 +1 @@ +# -*- coding: UTF-8 -*- \ No newline at end of file diff --git a/anomalydetection/self_att_lstm/__pycache__/__init__.cpython-36.pyc b/anomalydetection/self_att_lstm/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..6692742 Binary files /dev/null and b/anomalydetection/self_att_lstm/__pycache__/__init__.cpython-36.pyc differ diff --git a/anomalydetection/self_att_lstm/__pycache__/self_att_lstm_predict.cpython-36.pyc 
b/anomalydetection/self_att_lstm/__pycache__/self_att_lstm_predict.cpython-36.pyc new file mode 100644 index 0000000..e405fa1 Binary files /dev/null and b/anomalydetection/self_att_lstm/__pycache__/self_att_lstm_predict.cpython-36.pyc differ diff --git a/anomalydetection/self_att_lstm/__pycache__/self_att_lstm_train.cpython-36.pyc b/anomalydetection/self_att_lstm/__pycache__/self_att_lstm_train.cpython-36.pyc new file mode 100644 index 0000000..253937c Binary files /dev/null and b/anomalydetection/self_att_lstm/__pycache__/self_att_lstm_train.cpython-36.pyc differ diff --git a/anomalydetection/self_att_lstm/self_att_lstm_predict.py b/anomalydetection/self_att_lstm/self_att_lstm_predict.py new file mode 100644 index 0000000..b62d7ed --- /dev/null +++ b/anomalydetection/self_att_lstm/self_att_lstm_predict.py @@ -0,0 +1,246 @@ +# -*- coding: UTF-8 -*- +# -*- coding: UTF-8 -*- +import torch +import os +import torch.nn as nn +import time +from anomalydetection.self_att_lstm.self_att_lstm_train import Model +import torch.nn.functional as F + +# use cuda if available otherwise use cpu +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# len(line) < window_length + +def generate(name, window_length): + log_keys_sequences = list() + with open(name, 'r') as f: + for line in f.readlines(): + line = tuple(map(lambda n: tuple(map(float, n.strip().split())), [x for x in line.strip().split(',') if len(x) > 0])) + # for i in range(len(line) - window_size): + # inputs.add(tuple(line[i:i+window_size])) + log_keys_sequences.append(tuple(line)) + return log_keys_sequences + +def generate_log_deep(name, window_length): + log_keys_sequences = {} + with open(name, 'r') as f: + for line in f.readlines(): + if len(line) < window_length + 1: + continue + ln = list(map(lambda n: n-1, map(int, line.strip().split()))) + # for i in range(len(line) - window_size): + # inputs.add(tuple(line[i:i+window_size])) + log_keys_sequences[tuple(ln)] = log_keys_sequences.get(tuple(ln), 0) + 1 + return log_keys_sequences + + +def load_sequential_model(input_size, hidden_size, num_layers, num_classes, model_path, window_size): + + model1 = Model(input_size, hidden_size, num_layers, num_classes, if_bidirectional=False, sequen_len=window_size).to(device) + model1.load_state_dict(torch.load(model_path, map_location='cpu')) + model1.eval() + print('model_path: {}'.format(model_path)) + return model1 + +def filter_small_top_k(predicted, output): + filter = [] + for p in predicted: + if output[0][p] > 0.001: + filter.append(p) + return filter + + +def do_predict(input_size, hidden_size, num_layers, num_classes, window_length, model_path, anomaly_test_line_path, test_file_path, num_candidates, pattern_vec_file): + vec_to_class_type = {} + with open(pattern_vec_file, 'r') as pattern_file: + i = 0 + for line in pattern_file.readlines(): + pattern, vec = line.split('[:]') + pattern_vector = tuple(map(float, vec.strip().split(' '))) + vec_to_class_type[pattern_vector] = i + i = i + 1 + + sequential_model = load_sequential_model(input_size, hidden_size, num_layers, num_classes, model_path, window_length) + + start_time = time.time() + TP = 0 + FP = 0 + TN = 0 + FN = 0 + ALL = 0 + abnormal_loader = generate(test_file_path, window_length) + with open(anomaly_test_line_path) as f: + abnormal_label = [int(x) for x in f.readline().strip().split()] + # for testing model using train set + # abnormal_label = [] + print('predict start') + with torch.no_grad(): + count_num = 0 + current_file_line = 0 + for line in abnormal_loader: 
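+ # Reading aid (descriptive comment only): the loop below slides a window of
+ # `window_length` keys over each session, takes the `num_candidates` most
+ # likely next keys after softmax plus the small-probability filter, and
+ # treats a step as anomalous when the true next key is missing, roughly:
+ #   seq, label = line[i:i + window_length], line[i + window_length]
+ #   output = F.softmax(sequential_model(<seq as a [1, window_length, input_size] tensor>), 1)
+ #   top_k = filter_small_top_k(torch.argsort(output, 1)[0][-num_candidates:], output)
+ #   anomalous = vec_to_class_type[tuple(label)] not in top_k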
+ i = 0 + # first traverse [0, window_size) + while i < len(line) - window_length: + lineNum = current_file_line * 200 + i + window_length + 1 + input_abnormal = False + count_num += 1 + seq = line[i:i + window_length] + origin_seq = seq + label = line[i + window_length] + for n in range(len(seq)): + if current_file_line * 200 + i + n + 1 in abnormal_label: + input_abnormal = True + continue + seq = torch.tensor(seq, dtype=torch.float).view(-1, window_length, input_size).to(device) + #label = torch.tensor(label).view(-1).to(device) + output = sequential_model(seq) + output = F.softmax(output, 1) + # print(torch.sort(output, 1)) + predicted = torch.argsort(output, 1)[0][-num_candidates:] + predicted = filter_small_top_k(predicted, output) + # print(predicted) + # print('Fp {} - predict result: {}, true label: {}'.format(lineNum, predicted, vec_to_class_type[tuple(label)])) + '''if lineNum in abnormal_label or in: # 若出现异常日志,则接下来的预测跳过异常日志,保证进行预测的日志均为正常日志 + i += window_length + 1 + else: + i += 1''' + i += 1 + ALL += 1 + if vec_to_class_type[tuple(label)] not in predicted: + if lineNum in abnormal_label or input_abnormal: + TP += 1 + else: + FP += 1 + + else: + if lineNum in abnormal_label or input_abnormal: + print('FN {} - predict result: {}, true label: {}'.format(lineNum, predicted, vec_to_class_type[tuple(label)])) + print(torch.sort(output, 1)) + for l in origin_seq: + print(str(vec_to_class_type[tuple(l)]), end='') + print(',', end='') + print(str(vec_to_class_type[tuple(label)])) + FN += 1 + else: + TN += 1 + current_file_line += 1 + # Compute precision, recall and F1-measure + if TP + FP == 0: + P = 0 + else: + P = 100 * TP / (TP + FP) + + if TP + FN == 0: + R = 0 + else: + R = 100 * TP / (TP + FN) + + if P + R == 0: + F1 = 0 + else: + F1 = 2 * P * R / (P + R) + + Acc = (TP + TN) * 100 / ALL + FAR = FP * 100 / (FP+TN) + print('FP: {}, FN: {}, TP: {}, TN: {}'.format(FP, FN, TP, TN)) + print('Acc: {:.3f}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%, FAR: {:.3f}%'.format(Acc, P, R, F1, FAR)) + print('Finished Predicting') + elapsed_time = time.time() - start_time + print('elapsed_time: {}'.format(elapsed_time)) + + #draw_evaluation("Evaluations", ['Acc', 'Precision', 'Recall', 'F1-measure'], [Acc, P, R, F1], 'evaluations', '%') + + +def do_log_deep_predict(input_size, hidden_size, num_layers, num_classes, window_length, model_path, test_normal_file_path, test_abnormal_file_path, num_candidates, pattern_vec_file): + + sequential_model = load_sequential_model(input_size, hidden_size, num_layers, num_classes, model_path, window_length) + + start_time = time.time() + TP = 0 + FP = 0 + TN = 0 + FN = 0 + ALL = 0 + normal_loader = generate_log_deep(test_normal_file_path, window_length) + abnormal_loader = generate_log_deep(test_abnormal_file_path, window_length) + # for testing model using train set + # abnormal_label = [] + print('predict start') + with torch.no_grad(): + count_num = 0 + current_file_line = 0 + for line in normal_loader.keys(): + count_num += 1 + print(count_num) + if count_num > 6000: + break + i = 0 + # first traverse [0, window_size) + while i < len(line) - window_length: + seq = line[i:i + window_length] + label = line[i + window_length] + seq = torch.tensor(seq, dtype=torch.float).view(-1, window_length, input_size).to(device) + #label = torch.tensor(label).view(-1).to(device) + output = sequential_model(seq) + output = F.softmax(output, 1) + # print(torch.sort(output, 1)) + predicted = torch.argsort(output, 1)[0][-num_candidates:] + predicted = 
filter_small_top_k(predicted, output) + # print(predicted) + # print('Fp {} - predict result: {}, true label: {}'.format(lineNum, predicted, vec_to_class_type[tuple(label)])) + if label in predicted: + TN += normal_loader[line] + else: + FP += normal_loader[line] + i += 1 + with torch.no_grad(): + count_num = 0 + current_file_line = 0 + for line in abnormal_loader.keys(): + count_num += 1 + i = 0 + # first traverse [0, window_size) + while i < len(line) - window_length: + seq = line[i:i + window_length] + label = line[i + window_length] + seq = torch.tensor(seq, dtype=torch.float).view(-1, window_length, input_size).to(device) + #label = torch.tensor(label).view(-1).to(device) + output = sequential_model(seq) + output = F.softmax(output, 1) + # print(torch.sort(output, 1)) + predicted = torch.argsort(output, 1)[0][-num_candidates:] + predicted = filter_small_top_k(predicted, output) + # print(predicted) + # print('Fp {} - predict result: {}, true label: {}'.format(lineNum, predicted, vec_to_class_type[tuple(label)])) + if label in predicted: + FN += abnormal_loader[line] + else: + TP += abnormal_loader[line] + i += 1 + print(count_num) + + # Compute precision, recall and F1-measure + if TP + FP == 0: + P = 0 + else: + P = 100 * TP / (TP + FP) + + if TP + FN == 0: + R = 0 + else: + R = 100 * TP / (TP + FN) + + if P + R == 0: + F1 = 0 + else: + F1 = 2 * P * R / (P + R) + + Acc = (TP + TN) * 100 /(TP + TN + FN + FP) + print('FP: {}, FN: {}, TP: {}, TN: {}'.format(FP, FN, TP, TN)) + print('Acc: {:.3f}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(Acc, P, R, F1)) + print('Finished Predicting') + elapsed_time = time.time() - start_time + print('elapsed_time: {}'.format(elapsed_time)) + + #draw_evaluation("Evaluations", ['Acc', 'Precision', 'Recall', 'F1-measure'], [Acc, P, R, F1], 'evaluations', '%') \ No newline at end of file diff --git a/anomalydetection/self_att_lstm/self_att_lstm_train.py b/anomalydetection/self_att_lstm/self_att_lstm_train.py new file mode 100644 index 0000000..b90dfb5 --- /dev/null +++ b/anomalydetection/self_att_lstm/self_att_lstm_train.py @@ -0,0 +1,140 @@ +# -*- coding: UTF-8 -*- +# regularization waiting for heliren sparse +import torch +import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F +import os +from tensorboardX import SummaryWriter +from torch.utils.data import TensorDataset, DataLoader + +# use cuda if available otherwise use cpu +from torch.autograd import Variable + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +class Model(nn.Module): + def __init__(self, input_size, hidden_size, num_of_layers, out_size, if_bidirectional, sequence_len): + super(Model, self).__init__() + self.hidden_size = hidden_size + self.num_of_layers = num_of_layers + self.lstm = nn.LSTM(input_size, hidden_size, num_of_layers, batch_first=True, bidirectional=if_bidirectional, dropout=0.5) + if if_bidirectional: + self.num_of_directions = 2 + else: + self.num_of_directions = 1 + self.fc = nn.Linear(hidden_size*self.num_of_directions, out_size) + + self.att_weight = nn.Parameter(torch.randn(1, 1, self.hidden_size*self.num_of_directions)) + self.att_bias = nn.Parameter(torch.randn(1, 1, sequence_len)) + + # self.out = nn.Linear(in_features=in_features, out_features=out_features) + +# l1 regularization will add later + def attention_net(self, H): + # print(H.size()) = [batch, numdirec*hidden, seqlen] + a = F.softmax(torch.matmul(self.att_weight, H) + self.att_bias, 2) + a = torch.transpose(a, 1, 2) + return 
torch.bmm(H, a) + + def init_hidden(self, size): + # size self.batch_size same + h0 = torch.zeros(self.num_of_layers*self.num_of_directions, size, self.hidden_size).to(device) + c0 = torch.zeros(self.num_of_layers*self.num_of_directions, size, self.hidden_size).to(device) + return (h0, c0) + + def forward(self, input): + # h_n: hidden state h of last time step + # c_n: hidden state c of last time step + out, _ = self.lstm(input, self.init_hidden(input.size(0))) + + # out = torch.transpose(out, 0, 1) + # out shape [batch, seqlen, numdirec*hidden] + out = torch.transpose(out, 1, 2) + # out shape [batch, numdirec*hidden, seqlen] + att_out = self.attention_net(out) + # att_out shape[batch, num_direc*hidden_size, 1] + # att_out[:, :, 0] shape[batch, num_direc*hidden_size] + out = self.fc(att_out[:, :, 0]) + # out shape[batch, num_of_class] + return out + + +def generate_seq_label(file_path, window_length, pattern_vec_file): + vec_to_class_type = {} + with open(pattern_vec_file, 'r') as pattern_file: + i = 0 + for line in pattern_file.readlines(): + pattern, vec = line.split('[:]') + pattern_vector = tuple(map(float, vec.strip().split(' '))) + vec_to_class_type[pattern_vector] = i + i = i + 1 + num_of_sessions = 0 + input_data, output_data = [], [] + with open(file_path, 'r') as file: + for line in file.readlines(): + num_of_sessions += 1 + line = tuple(map(lambda n: tuple(map(float, n.strip().split())), [x for x in line.strip().split(',') if len(x) > 0])) + if len(line) < window_length + 1: + continue + for i in range(len(line) - window_length): + input_data.append(line[i:i + window_length]) + # line[i] is a list need to read file form a dic{vec:log_key} to get log key + output_data.append(vec_to_class_type[line[i + window_length]]) + data_set = TensorDataset(torch.tensor(input_data, dtype=torch.float), torch.tensor(output_data)) + return data_set + +def generate_logdeep_seq_label(file_path, window_length): + input_data, output_data = [], [] + with open(file_path, 'r') as file: + for line in file.readlines(): + line = tuple(map(lambda n: n-1, map(int, line.strip().split()))) + if len(line) < window_length + 1: + continue + for i in range(len(line) - window_length): + input_data.append(line[i:i + window_length]) + # line[i] is a list need to read file form a dic{vec:log_key} to get log key + output_data.append(line[i + window_length]) + data_set = TensorDataset(torch.tensor(input_data, dtype=torch.float), torch.tensor(output_data)) + return data_set + + +def train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, root_path, model_output_directory, data_file, pattern_vec_file): + # log setting + log_directory = root_path + 'log_out/' + log_template = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) + + print("Train num_classes: ", num_of_classes) + model = Model(input_size, hidden_size, num_of_layers, num_of_classes, False, window_length).to(device) + # create data set + sequence_data_set = generate_seq_label(data_file, window_length, pattern_vec_file) + # create data_loader + data_loader = DataLoader(dataset=sequence_data_set, batch_size=batch_size, shuffle=True, pin_memory=False) + writer = SummaryWriter(logdir=log_directory + log_template) + + # Loss and optimizer classify job + criterion = nn.CrossEntropyLoss() + optimizer = optim.Adam(model.parameters(), weight_decay=0.0001) + + # Training + for epoch in range(num_epochs): + train_loss = 0 + for step, (seq, label) in enumerate(data_loader): + seq = 
seq.clone().detach().view(-1, window_length, input_size).to(device) + output = model(seq) + + loss = criterion(output, label.to(device)) + + # Backward and optimize + optimizer.zero_grad() + loss.backward() + train_loss += loss.item() + optimizer.step() + print('Epoch [{}/{}], training_loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss / len(data_loader.dataset))) + if (epoch + 1) % num_epochs == 0: + if not os.path.isdir(model_output_directory): + os.makedirs(model_output_directory) + e_log = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(epoch+1) + torch.save(model.state_dict(), model_output_directory + '/' + e_log + '.pt') + writer.close() + print('Training finished') \ No newline at end of file diff --git a/anomalydetection/self_att_lstm_supervised/__init__.py b/anomalydetection/self_att_lstm_supervised/__init__.py new file mode 100644 index 0000000..9764abf --- /dev/null +++ b/anomalydetection/self_att_lstm_supervised/__init__.py @@ -0,0 +1 @@ +# -*- coding: UTF-8 -*- \ No newline at end of file diff --git a/anomalydetection/self_att_lstm_supervised/__pycache__/__init__.cpython-36.pyc b/anomalydetection/self_att_lstm_supervised/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..66c3c53 Binary files /dev/null and b/anomalydetection/self_att_lstm_supervised/__pycache__/__init__.cpython-36.pyc differ diff --git a/anomalydetection/self_att_lstm_supervised/__pycache__/self_att_lstm_supervised_predict.cpython-36.pyc b/anomalydetection/self_att_lstm_supervised/__pycache__/self_att_lstm_supervised_predict.cpython-36.pyc new file mode 100644 index 0000000..0d3c6e2 Binary files /dev/null and b/anomalydetection/self_att_lstm_supervised/__pycache__/self_att_lstm_supervised_predict.cpython-36.pyc differ diff --git a/anomalydetection/self_att_lstm_supervised/__pycache__/self_att_lstm_supervised_train.cpython-36.pyc b/anomalydetection/self_att_lstm_supervised/__pycache__/self_att_lstm_supervised_train.cpython-36.pyc new file mode 100644 index 0000000..e014999 Binary files /dev/null and b/anomalydetection/self_att_lstm_supervised/__pycache__/self_att_lstm_supervised_train.cpython-36.pyc differ diff --git a/anomalydetection/self_att_lstm_supervised/self_att_lstm_supervised_predict.py b/anomalydetection/self_att_lstm_supervised/self_att_lstm_supervised_predict.py new file mode 100644 index 0000000..7414db8 --- /dev/null +++ b/anomalydetection/self_att_lstm_supervised/self_att_lstm_supervised_predict.py @@ -0,0 +1,131 @@ +# -*- coding: UTF-8 -*- +# -*- coding: UTF-8 -*- +import torch +import json +import pandas as pd +import numpy as np +import os +import torch.nn as nn +import time +import random +from torch.utils.data import TensorDataset, DataLoader +from anomalydetection.self_att_lstm_supervised.self_att_lstm_supervised_train import Model + +# use cuda if available otherwise use cpu +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# len(line) < window_length + +def generate(name, window_length): + log_keys_sequences = list() + with open(name, 'r') as f: + for line in f.readlines(): + line = tuple(map(lambda n: tuple(map(float, n.strip().split())), [x for x in line.strip().split(',') if len(x) > 0])) + # for i in range(len(line) - window_size): + # inputs.add(tuple(line[i:i+window_size])) + log_keys_sequences.append(tuple(line)) + return log_keys_sequences + + + +def load_sequential_model(input_size, hidden_size, num_layers, num_classes, model_path, batch_size, sequence_length): + + model1 = Model(input_size, hidden_size, num_layers, 
num_classes, if_bidirectional=False, batch_size=0, sequence_len=sequence_length).to(device) + model1.load_state_dict(torch.load(model_path, map_location='cpu')) + model1.eval() + print('model_path: {}'.format(model_path)) + return model1 + + +def filter_small_top_k(predicted, output): + filter = [] + for p in predicted: + if output[0][p] > 0.001: + filter.append(p) + return filter + + +def generate_robust_seq_label(file_path, sequence_length, pattern_vec_file): + with open(pattern_vec_file, 'r') as pattern_file: + class_type_to_vec = json.load(pattern_file) + num_of_sessions = 0 + input_data, output_data = [], [] + train_file = pd.read_csv(file_path) + i = 0 + while i < len(train_file): + num_of_sessions += 1 + line = [int(id) for id in train_file["Sequence"][i].split(' ')] + line = line[0:sequence_length] + if len(line) < sequence_length: + line.extend(list([0]) * (sequence_length - len(line))) + semantic_line = [] + for event in line: + if event == 0: + semantic_line.append([-1] * 300) + else: + semantic_line.append(class_type_to_vec[str(event - 1)]) + input_data.append(semantic_line) + output_data.append(int(train_file["label"][i])) + i += random.randint(6, 8) + data_set = TensorDataset(torch.tensor(input_data, dtype=torch.float), torch.tensor(output_data)) + return data_set + + +def do_predict(input_size, hidden_size, num_layers, num_classes, sequence_length, model_path, test_file_path, batch_size, pattern_vec_json): + + sequential_model = load_sequential_model(input_size, hidden_size, num_layers, num_classes, model_path, batch_size, sequence_length) + + start_time = time.time() + TP = 0 + FP = 0 + TN = 0 + FN = 0 + + # create data set + sequence_data_set = generate_robust_seq_label(test_file_path, sequence_length, pattern_vec_json) + # create data_loader + data_loader = DataLoader(dataset=sequence_data_set, batch_size=batch_size, shuffle=True, pin_memory=False) + + print('predict start') + with torch.no_grad(): + count = 0 + for step, (seq, label) in enumerate(data_loader): + # first traverse [0, window_size) + seq = seq.view(-1, sequence_length, input_size).to(device) + #label = torch.tensor(label).view(-1).to(device) + output = sequential_model(seq)[:, 0].clone().detach().numpy() + predicted = (output > 0.2).astype(int) + label = np.array([y for y in label]) + TP += ((predicted == 1) * (label == 1)).sum() + FP += ((predicted == 1) * (label == 0)).sum() + FN += ((predicted == 0) * (label == 1)).sum() + TN += ((predicted == 0) * (label == 0)).sum() + count += 1 + if count > 100000: + break + ALL = TP + TN + FP + FN + # Compute precision, recall and F1-measure + if TP + FP == 0: + P = 0 + else: + P = 100 * TP / (TP + FP) + + if TP + FN == 0: + R = 0 + else: + R = 100 * TP / (TP + FN) + + if P + R == 0: + F1 = 0 + else: + F1 = 2 * P * R / (P + R) + + Acc = (TP + TN) * 100 / ALL + + print('FP: {}, FN: {}, TP: {}, TN: {}'.format(FP, FN, TP, TN)) + print('Acc: {:.3f}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(Acc, P, R, F1)) + print('Finished Predicting') + elapsed_time = time.time() - start_time + print('elapsed_time: {}'.format(elapsed_time)) + + #draw_evaluation("Evaluations", ['Acc', 'Precision', 'Recall', 'F1-measure'], [Acc, P, R, F1], 'evaluations', '%') \ No newline at end of file diff --git a/anomalydetection/self_att_lstm_supervised/self_att_lstm_supervised_train.py b/anomalydetection/self_att_lstm_supervised/self_att_lstm_supervised_train.py new file mode 100644 index 0000000..219e7a1 --- /dev/null +++ 
b/anomalydetection/self_att_lstm_supervised/self_att_lstm_supervised_train.py @@ -0,0 +1,154 @@ +# -*- coding: UTF-8 -*- +# -*- coding: UTF-8 -*- +import json +import torch +import pandas as pd +import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F +import os +from tensorboardX import SummaryWriter +from torch.utils.data import TensorDataset, DataLoader + +# use cuda if available otherwise use cpu +from torch.autograd import Variable + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +class Model(nn.Module): + def __init__(self, input_size, hidden_size, num_of_layers, out_size, if_bidirectional, batch_size, sequence_len): + super(Model, self).__init__() + self.hidden_size = hidden_size + self.num_of_layers = num_of_layers + self.lstm = nn.LSTM(input_size, hidden_size, num_of_layers, batch_first=True, bidirectional=if_bidirectional, dropout=0.5) + if if_bidirectional: + self.num_of_directions = 2 + else: + self.num_of_directions = 1 + self.fc = nn.Linear(hidden_size*self.num_of_directions, out_size) + self.batch_size = batch_size + + self.att_weight = nn.Parameter(torch.randn(1, 1, self.hidden_size*self.num_of_directions)) + self.att_bias = nn.Parameter(torch.randn(1, 1, sequence_len)) + # self.out = nn.Linear(in_features=in_features, out_features=out_features) + + # l1 regularization will add later + def attention_net(self, H): + # print(H.size()) = [batch, numdirec*hidden, seqlen] + a = F.softmax(torch.matmul(self.att_weight, H) + self.att_bias, 2) + a = torch.transpose(a, 1, 2) + return torch.bmm(H, a) + + + def init_hidden(self, size): + # size self.batch_size same + h0 = torch.zeros(self.num_of_layers*self.num_of_directions, size, self.hidden_size).to(device) + c0 = torch.zeros(self.num_of_layers*self.num_of_directions, size, self.hidden_size).to(device) + return (h0, c0) + + def forward(self, input): + # h_n: hidden state h of last time step + # c_n: hidden state c of last time step + out, _ = self.lstm(input, self.init_hidden(input.size(0))) + + # out = torch.transpose(out, 0, 1) + # out shape [batch, seqlen, numdirec*hidden] + out = torch.transpose(out, 1, 2) + # out shape [batch, numdirec*hidden, seqlen] + att_out = self.attention_net(out) + + out = self.fc(att_out[:, :, 0]) + # out shape[batch, num_of_class = 1] + # add sigmoid + return torch.sigmoid(out) + + +def generate_seq_label(file_path, window_length, pattern_vec_file): + vec_to_class_type = {} + with open(pattern_vec_file, 'r') as pattern_file: + i = 0 + for line in pattern_file.readlines(): + pattern, vec = line.split('[:]') + pattern_vector = tuple(map(float, vec.strip().split(' '))) + vec_to_class_type[pattern_vector] = i + i = i + 1 + num_of_sessions = 0 + input_data, output_data = [], [] + with open(file_path, 'r') as file: + for line in file.readlines(): + num_of_sessions += 1 + line = tuple(map(lambda n: tuple(map(float, n.strip().split())), [x for x in line.strip().split(',') if len(x) > 0])) + if len(line) < window_length + 1: + # print(line) + continue + for i in range(len(line) - window_length): + input_data.append(line[i:i + window_length]) + # line[i] is a list need to read file form a dic{vec:log_key} to get log key + output_data.append(vec_to_class_type[line[i + window_length]]) + data_set = TensorDataset(torch.tensor(input_data, dtype=torch.float), torch.tensor(output_data)) + return data_set + + +def generate_robust_seq_label(file_path, sequence_length, pattern_vec_file): + with open(pattern_vec_file, 'r') as pattern_file: + class_type_to_vec = 
json.load(pattern_file) + num_of_sessions = 0 + input_data, output_data = [], [] + train_file = pd.read_csv(file_path) + for i in range(len(train_file)): + num_of_sessions += 1 + line = [int(id) for id in train_file["Sequence"][i].split(' ')] + line = line[0:sequence_length] + if len(line) < sequence_length: + line.extend(list([0]) * (sequence_length - len(line))) + semantic_line = [] + for event in line: + if event == 0: + semantic_line.append([-1] * 300) + else: + semantic_line.append(class_type_to_vec[str(event - 1)]) + input_data.append(semantic_line) + output_data.append(int(train_file["label"][i])) + data_set = TensorDataset(torch.tensor(input_data, dtype=torch.float), torch.tensor(output_data)) + return data_set + + +def train_model(sequence_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, root_path, model_output_directory, data_file, pattern_vec_file): + # log setting + log_directory = root_path + 'log_out/' + log_template = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) + + print("Train num_classes: ", num_of_classes) + model = Model(input_size, hidden_size, num_of_layers, num_of_classes, False, batch_size, sequence_length).to(device) + # create data set + sequence_data_set = generate_robust_seq_label(data_file, sequence_length, pattern_vec_file) + # create data_loader + data_loader = DataLoader(dataset=sequence_data_set, batch_size=batch_size, shuffle=True, pin_memory=False) + writer = SummaryWriter(logdir=log_directory + log_template) + + # Loss and optimizer classify job + criterion = nn.BCELoss() + optimizer = optim.Adam(model.parameters(), weight_decay=0.001) + + # Training + for epoch in range(num_epochs): + train_loss = 0 + for step, (seq, label) in enumerate(data_loader): + seq = seq.clone().detach().view(-1, sequence_length, input_size).to(device) + output = model(seq) + + loss = criterion(output.squeeze(-1), label.float().to(device)) + + # Backward and optimize + optimizer.zero_grad() + loss.backward() + train_loss += loss.item() + optimizer.step() + print('Epoch [{}/{}], training_loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss / len(data_loader.dataset))) + if (epoch + 1) % num_epochs == 0: + if not os.path.isdir(model_output_directory): + os.makedirs(model_output_directory) + e_log = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(epoch+1) + torch.save(model.state_dict(), model_output_directory + '/' + e_log + '.pt') + writer.close() + print('Training finished') \ No newline at end of file diff --git a/deeplog_detection.py b/deeplog_detection.py index 4341b7b..508cd1c 100644 --- a/deeplog_detection.py +++ b/deeplog_detection.py @@ -1,75 +1,62 @@ import os -from logparsing.fttree import fttree -from extractfeature import hdfs_fs_deeplog_preprocessor +import sys +sys.path.append('./') +from logparsing.drain.HDFS_drain import get_hdfs_drain_clusters +from extractfeature.hdfs_deeplog_preprocessor import hdfs_preprocessor from anomalydetection.deeplog.Model1 import log_key_LSTM_train from anomalydetection.deeplog.Model2 import variable_LSTM_train from anomalydetection.deeplog import log_predict -# 原始日志文件 -log_file_dir = './Data/log/hdfs/' -log_file_name = 'HDFS_split' -log_file_abnormal_label = 'HDFS_split_anomaly' -# FT-tree -log_result = './Data/FTTreeResult-HDFS/' -log_fttree_out_dir = log_result+'clusters/' + # log_train,log_test,logkey,logvalue -log_preprocessor_dir = log_result+'deeplog_files/' -# model -model_dir = log_result+'deeplog_model_train/' +log = './Data/log/hdfs/HDFS_40w' +drain_out = 
'./Data/Drain_HDFS/clusters/' +bin_dir = './HDFS_drain3_state.bin' +log_preprocessor_dir = './Data/Drain_HDFS/log_preprocessor' +model_dir = './Data/Drain_HDFS/deeplog_model_train/' + # train parameters window_length = 4 input_size = 1 hidden_size = 20 num_of_layers = 3 -model1_num_epochs = 300 +model1_num_epochs = 100 model1_batch_size = 200 model2_num_epochs = 50 model2_batch_size = 20 learning_rate = 0.01 num_candidates = 3 mse_threshold = 0.1 +# 是否使用模型二 +use_model2 = False -if not os.path.exists(log_result): - os.makedirs(log_result) -if not os.path.exists(log_fttree_out_dir): - os.makedirs(log_fttree_out_dir) -if not os.path.exists(log_preprocessor_dir): - os.makedirs(log_preprocessor_dir) if not os.path.exists(model_dir): os.makedirs(model_dir) -# FT-tree -def pattern_extract(): - fttree.pattern_extract(log_file_dir, log_file_name, log_fttree_out_dir, 5, 4, 2) - -# 将原日志文件分成训练集和测试集两部分 -def log_split(): - hdfs_fs_deeplog_preprocessor.log_split(log_file_dir,log_file_name,log_file_abnormal_label,log_preprocessor_dir) +def drain(): + get_hdfs_drain_clusters(log,drain_out,bin_dir) -# 生成log_key -def generate_log_key(): - hdfs_fs_deeplog_preprocessor.generate_log_key(log_file_dir,log_file_abnormal_label,log_preprocessor_dir,log_fttree_out_dir) - -# 提取并处理log_value -def generate_log_value(): - hdfs_fs_deeplog_preprocessor.generate_log_value(log_file_dir,log_file_name,log_file_abnormal_label,log_preprocessor_dir,log_fttree_out_dir) +def generate_logkey_and_value(): + hdfs_preprocessor() # 训练 +def train_model(): + train_model1() + if use_model2: + train_model2() + def train_model1(): - log_key_LSTM_train.train_model1(model_dir,log_preprocessor_dir,log_fttree_out_dir,model1_num_epochs,model1_batch_size,window_length,input_size,hidden_size,num_of_layers) + log_key_LSTM_train.train_model1(model_dir,log_preprocessor_dir,drain_out,model1_num_epochs,model1_batch_size,window_length,input_size,hidden_size,num_of_layers) def train_model2(): variable_LSTM_train.train_model2(model_dir,log_preprocessor_dir,model2_num_epochs,model2_batch_size,window_length,num_of_layers,learning_rate,hidden_size) # 测试 def test_model(): - log_predict.do_predict(log_preprocessor_dir,model_dir,window_length,input_size, hidden_size, num_of_layers,num_candidates,mse_threshold) - + model1_name = 'Adam_batch_size=' + str(model1_batch_size) + ';epoch=' + str(model1_num_epochs) + '.pt' + log_predict.do_predict(log_preprocessor_dir,drain_out,model_dir,model1_name,model2_num_epochs,window_length, input_size, hidden_size, num_of_layers, num_candidates, mse_threshold, use_model2) -# pattern_extract() -# log_split() -# generate_log_key() -# generate_log_value() -# train_model1() -# train_model2() -test_model() \ No newline at end of file +#drain() +generate_logkey_and_value() +# train_model() +#test_model() \ No newline at end of file diff --git a/ecoder_anomaly_detection.py b/ecoder_anomaly_detection.py new file mode 100644 index 0000000..78f9038 --- /dev/null +++ b/ecoder_anomaly_detection.py @@ -0,0 +1,71 @@ +# -*- coding: UTF-8 -*- +# -*- coding: UTF-8 -*- +# -*- coding: UTF-8 -*- + +import os +from logparsing.fttree import fttree +from extractfeature import hdfs_ft_preprocessor +from anomalydetection.loganomaly import log_anomaly_sequential_train +from anomalydetection.loganomaly import log_anomaly_sequential_predict +from anomalydetection.att_all_you_need import encoder_self_att_train +from anomalydetection.att_all_you_need import encoder_self_att_predict + +# parameters for early prepare +logparser_structed_file = 
'./Data/logparser_result/Drain/HDFS.log_structured.csv' +logparser_event_file = './Data/logparser_result/Drain/HDFS.log_templates.csv' +anomaly_label_file = './Data/log/hdfs/anomaly_label.csv' +sequential_directory = './Data/DrainResult-HDFS/sequential_files/' +train_file_name = 'robust_train_file' +test_file_name = 'robust_test_file' +valid_file_name = 'robust_valid_file' +wordvec_file_path = './Data/pretrainedwordvec/crawl-300d-2M.vec(0.1M)' +pattern_vec_out_path = './Data/DrainResult-HDFS/pattern_vec' +variable_symbol = '<*> ' + +# my encoder +sequence_length = 50 +input_size = 300 +hidden_size = 256 +num_of_layers = 4 +# 1 using sigmoid, 2 using softmax +num_of_classes = 1 +num_epochs = 100 +batch_size = 1000 +# for robust attention bi +train_root_path = './Data/DrainResult-HDFS/att_all_you_need/' +model_out_path = train_root_path + 'model_out/' +train_file = sequential_directory + train_file_name +pattern_vec_json = pattern_vec_out_path +dropout = 0.5 +num_of_heads = 8 +pf_dim = 512 + + +# predict parameters +# log anomaly sequential model parameters + +if not os.path.exists(train_root_path): + os.makedirs(train_root_path) + + +def train_model(): + encoder_self_att_train.train_model(sequence_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, model_out_path, train_file, pattern_vec_json, dropout, num_of_heads, pf_dim) + + +def test_model(): + # do something + encoder_self_att_predict.do_predict(input_size, hidden_size, num_of_layers, num_of_classes, sequence_length, model_out_path + 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) + '.pt', sequential_directory + valid_file_name, batch_size, pattern_vec_json, dropout, num_of_heads, pf_dim) + +#pattern_extract() +#extract_feature() +#train_model() +#train_model() +test_model() + +# deep log +# log_preprocessor.execute_process() +# value_extract.get_value() +# value_extract.value_deal() +# value_extract.value_extract() +# train predict + diff --git a/extractfeature/hdfs_deeplog_preprocessor.py b/extractfeature/hdfs_deeplog_preprocessor.py new file mode 100644 index 0000000..ed89416 --- /dev/null +++ b/extractfeature/hdfs_deeplog_preprocessor.py @@ -0,0 +1,261 @@ +import csv +import os +import random + +class hdfs_deeplog_preprocessor: + # 日志变量设置 + LOG_LINE = 400000 + NUM_OF_LOGKEY = 31 + VECTOR_DIMENSION = 10 + NORMAL_STAGE_TO_STAGE_SIZE = [2000, 1000, 1000] + ABNORMAL_STAGE_TO_STAGE_SIZE = [800, 200, 200] + + # 读入数据部分 + ANOMALY_LABEL = './Data/log/hdfs/anomaly_label.csv' + LOG_FILE = './Data/log/hdfs/HDFS_40w' + MOFIFIED_LOG_FILE = './Data/log/hdfs/modified_HDFS_40w' + WORD_VECTOR_FILE = './Data/log/hdfs/word2vec_HDFS_40w' + LOGKEY_DIR = './Data/Drain_HDFS/clusters/' + is_block_normal = {} + block_to_lines = {} + line_to_logkey = [] + word_to_vector = {} + modified_logs = [] + + # 输出数据部分 + OUTPUT_DIR_PREFIX = './Data/Drain_HDFS/log_preprocessor/' + STAGE_TO_OUTPUT_DIR_INFIX = ['train/','validate/','test/'] + normal_blocks = [] + abnormal_blocks = [] + normal_block_index_to_stage = [] + abnormal_block_index_to_stage = [] + + + + ''' + ----------------------------------------------- + 以下是load_data部分 + ----------------------------------------------- + ''' + + def load_normal_info(self): + NORMAL_WORD = 'Normal' + FIRST_LINE_BLOCK_NAME = 'BlockId' + + with open(self.ANOMALY_LABEL,'r') as f: + lines = csv.reader(f) + for line in lines: + block = line[0] + normal_word = line[1] + if normal_word == NORMAL_WORD: + normal_info = True + else: + normal_info = False + if block 
!= FIRST_LINE_BLOCK_NAME: + self.is_block_normal[block] = normal_info + + def load_line_info(self): + with open(self.LOG_FILE,'r') as f: + for line_index in range(self.LOG_LINE): + line = f.readline() + block = self.get_blockid(line) + if block not in self.block_to_lines.keys(): + self.block_to_lines[block] = [] + self.block_to_lines[block].append(line_index) + # print(self.block_to_lines['blk_-1608999687919862906']) + + def load_logkey_info(self): + self.line_to_logkey = [0 for i in range(self.LOG_LINE)] + for logkey in range(1,self.NUM_OF_LOGKEY+1): + with open(self.LOGKEY_DIR+str(logkey),'r') as f: + print(self.LOGKEY_DIR+str(logkey)) + lines = f.readline().strip().split(' ') + for line in lines: + line_index = int(line) + if line_index>=self.LOG_LINE: + print('cluster文件中某行的行数过大') + print(line) + exit(2) + self.line_to_logkey[line_index] = logkey + + def load_word_vector(self): + with open(self.WORD_VECTOR_FILE, 'r') as r: + for line in r.readlines(): + list_line = line.split(' ') + value = list(map(float, list_line[1:])) + key = list_line[0] + self.word_to_vector[key] = value + + def load_modified_log(self): + with open(self.MOFIFIED_LOG_FILE, 'r') as file: + content_list = file.readlines() + self.modified_logs = [x.strip() for x in content_list] + + def generate_block_list(self): + for block in self.block_to_lines.keys(): + if self.is_block_normal[block]: + self.normal_blocks.append(block) + else: + self.abnormal_blocks.append(block) + + ''' + ----------------------------------------------- + 以下是一些辅助函数 + ----------------------------------------------- + ''' + + def get_blockid(self, line): + words = line.strip().split(' ') + for word in words: + if len(word)>4 and word[:4] == 'blk_': + return word + print('无法找到block_id') + print(line) + exit(1) + + + def get_sentence_vector(self, sentence): + words = sentence.split(' ') + old_vector = [0.0 for i in range(self.VECTOR_DIMENSION)] + for word in words: + # print(word) + if word not in self.word_to_vector.keys(): + another_vector = [0.0 for i in range(self.VECTOR_DIMENSION)] + else: + another_vector = self.word_to_vector[word] + new_vector = [] + for i, j in zip(old_vector, another_vector): + new_vector.append(i + j) + old_vector = new_vector + + word_count = len(words) + for idx, value in enumerate(old_vector): + old_vector[idx] = value / word_count + vector_str = list(map(str, old_vector)) + sentence_vector = ','.join(vector_str) + return sentence_vector + + def get_logkey_and_logvalue_for_session(self, lines): + logkeys = [] + logkey_to_logvalues = [[] for i in range(self.NUM_OF_LOGKEY+1)] + for line in lines: + logkey = self.line_to_logkey[line] + logkeys.append(logkey) + log = self.modified_logs[line] + vector = self.get_sentence_vector(log) + logkey_to_logvalues[logkey].append(vector) + return logkeys,logkey_to_logvalues + ''' + ----------------------------------------------- + 以下是output_logkey_and_logvalue部分 + ----------------------------------------------- + ''' + + def get_block_stage_info(self,total_length,stage_to_length): + if sum(stage_to_length) > total_length: + print('要输出的条目太大,大于数据集中存在的条目。') + print(total_length) + print(stage_to_length) + exit(3) + block_index_list = [i for i in range(total_length)] + random.shuffle(block_index_list) + table = [-1 for i in range(total_length)] + + used_block_count = 0 + for stage in range(len(stage_to_length)): + block_index_start = used_block_count + block_index_end = used_block_count + stage_to_length[stage] + for block_index in block_index_list[block_index_start:block_index_end]: + 
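+ # Illustrative example (hypothetical values): with total_length=5 and
+ # stage_to_length=[2, 1, 1], two shuffled block indices map to stage 0
+ # (train), one to stage 1 (validate), one to stage 2 (test), and the
+ # leftover entry keeps the initial -1 (unused), i.e.
+ #   >>> sorted(self.get_block_stage_info(5, [2, 1, 1]))
+ #   [-1, 0, 0, 1, 2]
+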
table[block_index] = stage + used_block_count = block_index_end + return table + + def output(self,stage,output_normal): + if output_normal: + OUTPUT_DIR_SUFFIXES = ['logkey/','logvalue/normal/'] + LOGKEY_FILE = 'normal' + blocks = self.normal_blocks + block_index_to_stage = self.normal_block_index_to_stage + else: + OUTPUT_DIR_SUFFIXES = ['logkey/', 'logvalue/abnormal/'] + LOGKEY_FILE = 'abnormal' + blocks = self.abnormal_blocks + block_index_to_stage = self.abnormal_block_index_to_stage + + LOGKEY_OUTPUT_DIR = self.OUTPUT_DIR_PREFIX + \ + self.STAGE_TO_OUTPUT_DIR_INFIX[stage] + OUTPUT_DIR_SUFFIXES[0] + LOGVALUE_OUTPUT_DIR = self.OUTPUT_DIR_PREFIX + \ + self.STAGE_TO_OUTPUT_DIR_INFIX[stage] + OUTPUT_DIR_SUFFIXES[1] + if not os.path.exists(LOGKEY_OUTPUT_DIR): + os.makedirs(LOGKEY_OUTPUT_DIR) + if not os.path.exists(LOGVALUE_OUTPUT_DIR): + os.makedirs(LOGVALUE_OUTPUT_DIR) + logkey_writelist = [] + logkey_to_logvalue_writelist = [[] for i in range(self.NUM_OF_LOGKEY+1)] + + for block_index,block in enumerate(blocks): + if block_index_to_stage[block_index] == stage: + lines = self.block_to_lines[block] + logkeys, logkey_to_logvalues = \ + self.get_logkey_and_logvalue_for_session(lines) + logkey_line = ' '.join(str(logkey) for logkey in logkeys) + logkey_writelist.append(logkey_line+'\n') + for logkey in range(1,self.NUM_OF_LOGKEY+1): + if len(logkey_to_logvalues[logkey]) == 0: + logvalue_line = '-1' + else: + logvalue_line = ' '.join(logkey_to_logvalues[logkey]) + logkey_to_logvalue_writelist[logkey].append(logvalue_line+'\n') + + with open(LOGKEY_OUTPUT_DIR + LOGKEY_FILE,'w') as f: + f.writelines(logkey_writelist) + for logkey in range(1,self.NUM_OF_LOGKEY+1): + LOGVALUE_FILE = str(logkey) + with open(LOGVALUE_OUTPUT_DIR + LOGVALUE_FILE,'w') as f: + f.writelines(logkey_to_logvalue_writelist[logkey]) + + + ''' + ----------------------------------------------- + 以下是main函数部分 + ----------------------------------------------- + ''' + + + def load_data(self): + self.load_normal_info() + print('正常/异常标签加载成功') + self.load_line_info() + print('数据集block信息加载成功') + self.load_logkey_info() + print('从clusters取出logkey信息成功') + self.load_word_vector() + print('读入word vector信息成功') + self.load_modified_log() + print('读入log信息成功') + self.generate_block_list() + print('将block划分为正常/异常成功') + + def output_logkey_and_logvalue(self): + self.abnormal_block_index_to_stage = self.get_block_stage_info \ + (len(self.abnormal_blocks),self.ABNORMAL_STAGE_TO_STAGE_SIZE) + print('给异常block选择train validate test数据成功') + self.normal_block_index_to_stage = self.get_block_stage_info \ + (len(self.normal_blocks), self.NORMAL_STAGE_TO_STAGE_SIZE) + print('给正常block选择train validate test数据成功') + for stage in range(len(self.STAGE_TO_OUTPUT_DIR_INFIX)): + self.output(stage, output_normal=True) + print('给阶段' + str(stage) + '输出正常logkey和logvalue成功') + self.output(stage, output_normal=False) + print('给阶段' + str(stage) + '输出异常logkey和logvalue成功') + + def __init__(self): + self.load_data() + print('数据加载成功') + print('正常的session数:' + str(len(self.normal_blocks))) + print('异常的session数:' + str(len(self.abnormal_blocks))) + self.output_logkey_and_logvalue() + print('数据生成成功') + +def hdfs_preprocessor(): + hdfs_deeplog_preprocessor() diff --git a/extractfeature/hdfs_fs_deeplog_preprocessor.py b/extractfeature/hdfs_fs_deeplog_preprocessor.py index 0d94ff8..0044e0e 100644 --- a/extractfeature/hdfs_fs_deeplog_preprocessor.py +++ b/extractfeature/hdfs_fs_deeplog_preprocessor.py @@ -71,8 +71,18 @@ def 
generate_log_key(log_file_dir,log_file_abnormal_label,log_preprocessor_dir,l # 提取并处理log_value def generate_log_value(log_file_dir,log_file_name,log_file_abnormal_label,log_preprocessor_dir,log_fttree_out_dir): - log = log_file_dir+log_file_name + N_CLUSTER = 21 + WORD2VEC_FILE = 'word2vec' + STRING_VECTOR_FILE = 'string_vector' + + log_list = [] + word_vector = {} + + # log = log_file_dir+log_file_name + word2vec = log_file_dir+WORD2VEC_FILE + string_vector = log_file_dir+STRING_VECTOR_FILE in_abnormal = log_file_dir+log_file_abnormal_label + log_value_dir = ['logvalue_train/', 'logvalue_test/'] log_value_train_directory = log_preprocessor_dir+log_value_dir[0] log_value_test_directory = log_preprocessor_dir +log_value_dir[1] @@ -83,53 +93,58 @@ def generate_log_value(log_file_dir,log_file_name,log_file_abnormal_label,log_pr if not os.path.exists(log_value_test_directory): os.makedirs(log_value_test_directory) - log_list = [] - with open(log, 'r') as file: + with open(string_vector, 'r') as file: content_list = file.readlines() log_list = [x.strip() for x in content_list] + with open(word2vec, 'r') as r: + for line in r.readlines(): + list_line = line.split(' ') + value = list(map(float, list_line[1:])) + key = list_line[0] + word_vector[key] = value + abnormal = get_abnormal(in_abnormal) clusters = get_logkey(log_fttree_out_dir)[0] num = [0, 170000, 199999] - for i in range(0, 2): - for j in range(1, 62): + for i in range(len(log_value_dir)): + for j in range(N_CLUSTER): print("process:", i, j) - para1 = [] - para2 = [] - para3 = [] - out_path = log_preprocessor_dir + log_value_dir[i] + str(j) + ".txt" - for t in clusters[j - 1]: + out_path = log_preprocessor_dir + log_value_dir[i] + str(j+1) + ".txt" + write_list = [] + for t in clusters[j]: s = int(t) - if (i != 1 and s not in abnormal and s >= num[i] and s < num[i + 1]) or ( - i == 1 and s >= num[i] and s < num[i + 1]): - templog = [] - for word in log_list[s].split(' '): - templog.append(word) - para1.append(int(templog[0])) - para2.append(int(templog[1])) - para3.append(int(templog[2])) + if (i != 1 and s not in abnormal and num[i] <= s < num[i + 1]) or ( + i == 1 and num[i] <= s < num[i + 1]): + output = calc_sentence_vector(log_list[s],word_vector) + write_list.append(output) elif s >= num[i + 1]: - break; - if len(para1) > 0: - para1 = preprocessing.scale(para1) - if len(para2) > 0: - para2 = preprocessing.scale(para2) - if len(para3) > 0: - para3 = preprocessing.scale(para3) + break with open(out_path, mode='w', encoding='utf-8') as f: - for w in range(0, len(para1)): - print(para1[w], file=f, end='') - print(' ', file=f, end='') - print(para2[w], file=f, end='') - print(' ', file=f, end='') - print(para3[w], file=f, end='') - print(' ', file=f, end='') - print(' ', file=f) - - - - - + f.write('\n'.join(write_list)) + +def calc_sentence_vector(sentence,word_vector): + VECTOR_DIMENSION = 10 + + words = sentence.split(' ') + old_vector = [0.0 for i in range(VECTOR_DIMENSION)] + for word in words: + # print(word) + if word not in word_vector.keys(): + another_vector = [0.0 for i in range(VECTOR_DIMENSION)] + else: + another_vector = word_vector[word] + new_vector = [] + for i,j in zip(old_vector,another_vector): + new_vector.append(i+j) + old_vector = new_vector + + word_count = len(words) + for idx,value in enumerate(old_vector): + old_vector[idx] = value/word_count + vector_str = list(map(str, old_vector)) + output = ','.join(vector_str) + return output diff --git a/extractfeature/hdfs_ft_preprocessor.py 
b/extractfeature/hdfs_ft_preprocessor.py index 52df723..981614f 100644 --- a/extractfeature/hdfs_ft_preprocessor.py +++ b/extractfeature/hdfs_ft_preprocessor.py @@ -124,7 +124,7 @@ def preprocessor_hdfs_ft(cluster_directory, anomaly_file_path, wordvec_path, out for f in log_cluster[i]: train_file_obj.write(str(f)) train_file_obj.write(' ') - if count % 10 == 0: + if count % 200 == 0: train_file_obj.write('\n') else: train_file_obj.write(', ') @@ -138,8 +138,60 @@ def preprocessor_hdfs_ft(cluster_directory, anomaly_file_path, wordvec_path, out for f in log_cluster[i]: test_file_obj.write(str(f)) test_file_obj.write(' ') - if count % 10 == 0: + if count % 200 == 0: test_file_obj.write('\n') else: test_file_obj.write(', ') - count = count + 1 \ No newline at end of file + count = count + 1 + + +def preprocessor_hdfs_ft_split_abnormal(cluster_directory, anomaly_file_path, wordvec_path, out_dic, train_out_file_name, + test_out_file_name, label_out_file_name, pattern_vec_out_path, degree, num_of_lines): + anomaly_log_lines = set() + with open(anomaly_file_path, 'r') as anomaly_file: + line = anomaly_file.readline() + lines_str = line.split(' ') + anomaly_log_lines.update([int(x) for x in lines_str if len(x) > 0]) + + pattern_vec = pattern_to_vec(cluster_directory, wordvec_path, pattern_vec_out_path) + + log_cluster = {} + file_names = os.listdir(cluster_directory) + for file_name in file_names: + with open(cluster_directory + file_name, 'r') as cluster: + lines = cluster.readlines() + line_numbers = [int(x) for x in lines[1].split(' ') if len(x) > 0] + for number in line_numbers: + if not (number in anomaly_log_lines and number < int(degree * num_of_lines)): + log_cluster[number] = pattern_vec[lines[0].strip()] + + with open(out_dic + train_out_file_name, 'w+') as train_file_obj, open(out_dic + test_out_file_name, + 'w+') as test_file_obj, open( + out_dic + label_out_file_name, 'w+') as label_file_obj: + count = 1 + last_i = 0 + for i in sorted(log_cluster): + if i < int(degree * num_of_lines): + if i - last_i > 1: + train_file_obj.write('\n') + else: + train_file_obj.write(', ') + for f in log_cluster[i]: + train_file_obj.write(str(f)) + train_file_obj.write(' ') + count = count + 1 + else: + if i == int(degree * num_of_lines): + count = 1 + if i in anomaly_log_lines: + label_file_obj.write(str(count)) + label_file_obj.write(' ') + for f in log_cluster[i]: + test_file_obj.write(str(f)) + test_file_obj.write(' ') + if count % 200 == 0: + test_file_obj.write('\n') + else: + test_file_obj.write(', ') + count = count + 1 + last_i = i \ No newline at end of file diff --git a/extractfeature/hdfs_robust_preprocessor.py b/extractfeature/hdfs_robust_preprocessor.py new file mode 100644 index 0000000..e6760a5 --- /dev/null +++ b/extractfeature/hdfs_robust_preprocessor.py @@ -0,0 +1,166 @@ +# -*- coding: UTF-8 -*- +import os +import io +import re +import random +import math +import json +import pandas as pd +import numpy as np +block_id_regex = r'blk_(|-)[0-9]+' +special_patterns = {'dfs.FSNamesystem:': ['dfs', 'FS', 'Name', 'system'], 'dfs.FSDataset:': ['dfs', 'FS', 'dataset']} + + +def get_anomaly_block_id_set(anomaly_label_file): + datafile = open(anomaly_label_file, 'r', encoding='UTF-8') + data = pd.read_csv(datafile) + + data = data[data['Label'].isin(['Anomaly'])] + # 16838 anomaly block right with the log anomaly paper + anomaly_block_set = set(data['BlockId']) + return anomaly_block_set + + +def get_log_template_dic(logparser_event_file): + dic = {} + datafile = open(logparser_event_file, 'r', 
encoding='UTF-8') + data = pd.read_csv(datafile) + for _, row in data.iterrows(): + dic[row['EventId']] = row['numberID'] + return dic + + +# log parser_file should be structed.csv +def generate_train_and_test_file(logparser_structed_file, logparser_event_file, anomaly_label_file, out_dic, train_out_file_name, validation_out_file_name, test_out_file_name, wordvec_path, pattern_vec_out_path, variable_symbol): + anomaly_block_set = get_anomaly_block_id_set(anomaly_label_file) + log_template_dic = get_log_template_dic(logparser_event_file) + session_dic = {} + logparser_result = pd.read_csv(logparser_structed_file, header=0) + normal_block_ids = set() + abnormal_block_ids = set() + for _, row in logparser_result.iterrows(): + key = row['EventTemplate'] + content = row['Content'] + block_id = re.search(block_id_regex, content).group() + session_dic.setdefault(block_id, []).append(log_template_dic[row['EventId']]) + if block_id in anomaly_block_set: + abnormal_block_ids.add(block_id) + else: + normal_block_ids.add(block_id) + abnormal_block_ids = list(abnormal_block_ids) + normal_block_ids = list(normal_block_ids) + random.shuffle(abnormal_block_ids) + random.shuffle(normal_block_ids) + with open(out_dic + train_out_file_name, 'w+') as train_file_obj, open(out_dic + test_out_file_name, + 'w+') as test_file_obj, open( + out_dic + validation_out_file_name, 'w+') as validation_file_obj: + train_file_obj.write('Sequence,label\n') + test_file_obj.write('Sequence,label\n') + validation_file_obj.write('Sequence,label\n') + for i in range(len(normal_block_ids)): + if i < 6000: + train_file_obj.write(' '.join([str(num_id) for num_id in session_dic[normal_block_ids[i]]])) + train_file_obj.write(', 0\n') + elif i < 6000 + 50000: + validation_file_obj.write(' '.join([str(num_id) for num_id in session_dic[normal_block_ids[i]]])) + validation_file_obj.write(', 0\n') + else: + test_file_obj.write(' '.join([str(num_id) for num_id in session_dic[normal_block_ids[i]]])) + test_file_obj.write(', 0\n') + + for i in range(len(abnormal_block_ids)): + if i < 6000: + train_file_obj.write(' '.join([str(num_id) for num_id in session_dic[abnormal_block_ids[i]]])) + train_file_obj.write(', 1\n') + elif i < 6000 + 1000: + validation_file_obj.write(' '.join([str(num_id) for num_id in session_dic[abnormal_block_ids[i]]])) + validation_file_obj.write(', 1\n') + else: + test_file_obj.write(' '.join([str(num_id) for num_id in session_dic[abnormal_block_ids[i]]])) + test_file_obj.write(', 1\n') + + pattern_to_vec(logparser_event_file, wordvec_path, pattern_vec_out_path, variable_symbol) + + +def load_vectors(fname): + fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') + data = {} + for line in fin: + tokens = line.rstrip().split(' ') + data[tokens[0]] = list(map(float, tokens[1:])) + return data + + +def get_lower_case_name(text): + word_list = [] + if text in special_patterns: + return + for index, char in enumerate(text): + if not char.isupper(): + break + else: + if index == len(text) - 1: + return [text] + lst = [] + for index, char in enumerate(text): + if char.isupper() and index != 0: + word_list.append("".join(lst)) + lst = [] + lst.append(char) + word_list.append("".join(lst)) + return word_list + + +def preprocess_pattern(log_pattern): + special_list = [] + if log_pattern.split(' ')[0] in special_patterns.keys(): + special_list = special_patterns[log_pattern.split(' ')[0]] + log_pattern = log_pattern[len(log_pattern.split(' ')[0]):] + pattern = 
r'\*|,|\.|/|;|\'|`|\[|\]|<|>|\?|:|"|\{|\}|\~|!|@|#|\$|%|\^|&|\(|\)|-|=|\_|\+|,|。|、|;|‘|’|【|】|·|!| |…|(|)' + result_list = [x for x in re.split(pattern, log_pattern) if len(x) > 0] + final_list = list(map(get_lower_case_name, result_list)) + final_list.append(special_list) + return [x for x in re.split(pattern, final_list.__str__()) if len(x) > 0] + + +def pattern_to_vec(logparser_event_file, wordvec_path, pattern_vec_out_path, variable_symbol): + data = load_vectors(wordvec_path) + pattern_to_words = {} + pattern_to_vectors = {} + datafile = open(logparser_event_file, 'r', encoding='UTF-8') + df = pd.read_csv(datafile) + pattern_num = len(df) + for _, row in df.iterrows(): + wd_list = preprocess_pattern(row['EventTemplate'].replace(variable_symbol, '').strip()) + pattern_to_words[row['EventTemplate'].replace(variable_symbol, '').strip()] = wd_list + print(pattern_to_words) + IDF = {} + for key in pattern_to_words.keys(): + wd_list = pattern_to_words[key] + pattern_vector = np.array([0.0 for _ in range(300)]) + word_used = 0 + for word in wd_list: + if not word in data.keys(): + print('out of 0.1m words', ' ', word) + else: + word_used = word_used + 1 + weight = wd_list.count(word)/1.0/len(pattern_to_words[key]) + if word in IDF.keys(): + pattern_vector = pattern_vector + weight * IDF[word] * np.array(data[word]) + else: + pattern_occur_num = 0 + for k in pattern_to_words.keys(): + if word in pattern_to_words[k]: + pattern_occur_num = pattern_occur_num + 1 + IDF[word] = math.log10(pattern_num/1.0/pattern_occur_num) + #print('tf', weight, 'idf', IDF[word], word) + #print(data[word]) + pattern_vector = pattern_vector + weight * IDF[word] * np.array(data[word]) + pattern_to_vectors[key] = pattern_vector / word_used + numberid2vec = {} + for _, row in df.iterrows(): + numberid2vec[row['numberID']] = pattern_to_vectors[row['EventTemplate'].replace(variable_symbol, '').strip()].tolist() + json_str = json.dumps(numberid2vec) + with open(pattern_vec_out_path, 'w+') as file_obj: + file_obj.write(json_str) + return pattern_to_vectors \ No newline at end of file diff --git a/extractfeature/k8s/__pycache__/__init__.cpython-37.pyc b/extractfeature/k8s/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..5a8ef95 Binary files /dev/null and b/extractfeature/k8s/__pycache__/__init__.cpython-37.pyc differ diff --git a/extractfeature/k8s/__pycache__/log_preprocessor.cpython-36.pyc b/extractfeature/k8s/__pycache__/log_preprocessor.cpython-36.pyc index c4f5232..f9df3d5 100644 Binary files a/extractfeature/k8s/__pycache__/log_preprocessor.cpython-36.pyc and b/extractfeature/k8s/__pycache__/log_preprocessor.cpython-36.pyc differ diff --git a/extractfeature/k8s/__pycache__/log_preprocessor.cpython-37.pyc b/extractfeature/k8s/__pycache__/log_preprocessor.cpython-37.pyc new file mode 100644 index 0000000..9a0756f Binary files /dev/null and b/extractfeature/k8s/__pycache__/log_preprocessor.cpython-37.pyc differ diff --git a/java/deeplog_java.py b/java/deeplog_java.py new file mode 100644 index 0000000..8b8a481 --- /dev/null +++ b/java/deeplog_java.py @@ -0,0 +1,266 @@ +import os +import linecache +import sys +import time +import torch +import torch.nn as nn +path = "C:\\study\\code\\LogAnalysis\\" +sys.path.append(path) +from logparsing.drain.HDFS_drain import get_hdfs_drain_clusters +from anomalydetection.deeplog.Model1.log_key_LSTM_train import Model as Model1 +from anomalydetection.deeplog.Model2.variable_LSTM_train import Model as Model2 +import sys +import shutil + +log_detect_name = 
sys.argv[1] +use_model2 = sys.argv[2] + +# log_detect_name = 'detect.log' +# use_model2 = '1' + +log_file_dir = path+'/Data/log/hdfs/' +log_file_name = 'HDFS_split' +base_dir = path+'/java/' +# log_detect +log_detect_dir = base_dir+'/detect_log/' +# Drian +drain_out = log_detect_dir + 'clusters/' +bin_dir = path + 'HDFS_drain3_state.bin' + +WORD_VECTOR_FILE = path + '/Data/log/hdfs/word2vec_HDFS_40w' +# model +model_dir = path+'Data/Drain_HDFS/'+'deeplog_model_train/' +N_Clusters = 31 +window_length = 4 +input_size = 1 +hidden_size = 20 +num_layers = 3 +model1_num_epochs = 100 +model1_batch_size = 200 +model2_num_epochs = 50 +model2_batch_size = 20 +learning_rate = 0.01 +num_candidates = 3 +mse_threshold = 0.1 +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +shutil.rmtree(drain_out) +os.makedirs(drain_out) +shutil.rmtree(log_detect_dir+'logvalue/') +os.makedirs(log_detect_dir+'logvalue/') + + +def load_word_vector(): + word_to_vector = {} + with open(WORD_VECTOR_FILE, 'r') as r: + for line in r.readlines(): + list_line = line.split(' ') + value = list(map(float, list_line[1:])) + key = list_line[0] + word_to_vector[key] = value + return word_to_vector + +def get_sentence_vector(word_to_vector, sentence): + words = sentence.split(' ') + old_vector = [0.0 for i in range(10)] + for word in words: + if word not in word_to_vector.keys(): + another_vector = [0.0 for i in range(10)] + else: + another_vector = word_to_vector[word] + new_vector = [] + for i, j in zip(old_vector, another_vector): + new_vector.append(i + j) + old_vector = new_vector + + word_count = len(words) + for idx, value in enumerate(old_vector): + old_vector[idx] = value / word_count + vector_str = list(map(str, old_vector)) + sentence_vector = ','.join(vector_str) + return sentence_vector + +def generate(name,window_length): + log_keys_sequences=list() + length=0 + with open(name, 'r') as f: + for line in f.readlines(): + line = list(map(lambda n: n, map(int, line.strip().split()))) + line = line + [-1] * (window_length + 1 - len(line)) + # for i in range(len(line) - window_size): + # inputs.add(tuple(line[i:i+window_size])) + # log_keys_sequences[tuple(line)] = log_keys_sequences.get(tuple(line), 0) + 1 + log_keys_sequences.append(tuple(line)) + length+=1 + return log_keys_sequences,length + +def load_model1(model_dir,model_name,input_size, hidden_size, num_layers): + value_length_of_key = [10] * (31 + 1) + num_classes = len(value_length_of_key) + print("Model1 num_classes: ", num_classes) + model1_dir = model_dir + 'model1/' + model_path = model1_dir + model_name + model1 = Model1(input_size, hidden_size, num_layers, num_classes).to(device) + model1.load_state_dict(torch.load(model_path, map_location='cpu')) + model1.eval() + print('model_path: {}'.format(model_path)) + return model1 + + +def load_model2(model_dir,epoch,input_size, hidden_size, num_layers): + model2_dir = model_dir+ 'model2/' + model2 = [] + value_length_of_key = [10] * (31 + 1) + for i in range(len(value_length_of_key)): + if value_length_of_key[i] == 0: + model2.append(None) + continue + input_size = value_length_of_key[i] + out_size = input_size + model_name = str(i+1) + '_epoch=' + str(epoch)+ '.pt' + model_path = model2_dir + str(i+1) + '/' + model_name + if not os.path.exists(model_path): + model2.append(None) + continue + model = Model2(input_size, hidden_size, num_layers, out_size).to(device) + model.load_state_dict(torch.load(model_path, map_location='cpu')) + model.eval() + print('model_path: {}'.format(model_path)) + 
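+        # Descriptive note: model2 is kept as a list aligned with log-key ids; index i holds the
+        # value-prediction LSTM trained for key i+1, or None when no model file exists for that key,
+        # so the prediction loop can simply skip keys without a trained Model2.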
model2.append(model) + return model2 + +def generate_log_key_and_value(): + print("generating log key...") + get_hdfs_drain_clusters(log_detect_dir+log_detect_name, drain_out,bin_dir) + log_to_key = {} + for i in range(0,N_Clusters): + if os.path.exists(drain_out+str(i+1)): + with open(drain_out+str(i+1),'r') as file: + for line in (file.readline().split()): + log_to_key[line] = i+1 + print(log_to_key) + # with open(log_detect_dir+'logkey.txt','w') as file: + # for i in range(0,len(log_to_key)): + # file.write(str(log_to_key[str(i)])) + # file.write(" ") + + print("generating log value...") + word_to_vector = load_word_vector() + logkey_to_logvalues = [[] for i in range(N_Clusters + 1)] + logkeys = [] + with open(log_detect_dir+log_detect_name,'r') as file: + lines = file.readlines() + for i in range(0,len(lines)): + logkey = log_to_key[str(i)] + logkeys.append(logkey) + vector = get_sentence_vector(word_to_vector,lines[i]) + logkey_to_logvalues[logkey].append(vector) + logkey_line = ' '.join(str(logkey) for logkey in logkeys) + logkey_writelist = [] + logkey_to_logvalue_writelist = [[] for i in range(N_Clusters + 1)] + logkey_writelist.append(logkey_line + '\n') + for logkey in range(1, N_Clusters + 1): + if len(logkey_to_logvalues[logkey]) == 0: + logvalue_line = '-1' + else: + logvalue_line = ' '.join(logkey_to_logvalues[logkey]) + logkey_to_logvalue_writelist[logkey].append(logvalue_line + '\n') + print(logkey_writelist) + with open(log_detect_dir+'logkey.txt', 'w') as f: + f.writelines(logkey_writelist) + os.makedirs(log_detect_dir+'logvalue/',exist_ok=True) + for logkey in range(1, N_Clusters + 1): + LOGVALUE_FILE = str(logkey) + with open(log_detect_dir+'logvalue/' + LOGVALUE_FILE, 'w') as f: + f.writelines(logkey_to_logvalue_writelist[logkey]) + +def log_predict(use_model2): + model1_name = 'Adam_batch_size=' + str(model1_batch_size) + ';epoch=' + str(model1_num_epochs) + '.pt' + model1 = load_model1(model_dir, model1_name, input_size, hidden_size, num_layers) + model2 = load_model2(model_dir, model2_num_epochs, 10, hidden_size, num_layers) + start_time = time.time() + criterion = nn.MSELoss() + test_normal_loader, test_normal_length = generate(log_detect_dir + 'logkey.txt', window_length) + print('predict start') + FP=0 + with torch.no_grad(): + for line_num, line in enumerate(test_normal_loader): + model1_success = False + for i in range(len(line) - window_length - 1): + seq0 = line[i:i + window_length] + label = line[i + window_length] + seq0 = torch.tensor(seq0, dtype=torch.float).view( + -1, window_length, input_size).to(device) + label = torch.tensor(label).view(-1).to(device) + output = model1(seq0) + predicted = torch.argsort(output,1)[0][-num_candidates:] + if label not in predicted: + FP+=1 + print(FP) + model1_success = True + break + if (model1_success): + continue + + if use_model2=='1': + seq = [] + for i in range(31): + with open(log_detect_dir + '/logvalue/' + str(i + 1), 'r')as f: + key_values = f.readlines() + key_values = key_values[line_num].strip('\n') + if (key_values == '-1'): + continue + seq.append(key_values.split(' ')) + # 将字符串转为数字 + for k1 in range(len(seq)): + for k2 in range(len(seq[k1])): + seq[k1][k2] = seq[k1][k2].strip('\n') + seq[k1][k2] = seq[k1][k2].split(',') + for k3 in range(len(seq[k1][k2])): + if (seq[k1][k2][k3] != ''): + seq[k1][k2][k3] = float(seq[k1][k2][k3]) + + # 补全 + for i in range(len(seq)): + if (len(seq[i]) < window_length + 1): + for j in range(window_length + 1 - len(seq[i])): + seq[i].append([0.0] * 10) + model2_success = False 
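+                # Model2 check (descriptive note): each log key with a trained value model has its
+                # parameter-value vectors scored window by window with criterion = nn.MSELoss();
+                # a window whose error exceeds mse_threshold marks the whole session as abnormal.
+                # Illustrative example (made-up numbers, 10-dim vectors, mse_threshold = 0.1):
+                #   criterion(torch.zeros(10), torch.full((10,), 0.05))  -> 0.0025  (normal)
+                #   criterion(torch.zeros(10), torch.ones(10))           -> 1.0     (abnormal)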
+                # Model2 prediction
+                for i in range(len(seq)):
+                    if (model2[i] is None):
+                        continue
+                    for j in range(len(seq[i]) - window_length):
+                        seq2 = seq[i][j:j + window_length]
+                        label2 = seq[i][j + window_length]
+
+                        seq2 = torch.tensor(seq2, dtype=torch.float).view(
+                            -1, window_length, 10).to(device)
+                        # compare against the actual next value vector (label2), not Model1's key label
+                        label2 = torch.tensor(label2, dtype=torch.float).view(-1).to(device)
+                        output = model2[i](seq2)
+                        mse = criterion(output[0], label2.to(device))
+                        if mse > mse_threshold:
+                            FP += 1
+                            model2_success = True
+                            break
+                    if (model2_success):
+                        break
+    if (FP >= 1):
+        print("predict result: abnormal")
+    else:
+        print("predict result: normal")
+
+
+
+
+
+generate_log_key_and_value()
+log_predict(use_model2)
+
+
+
+
+
+
+
+
 diff --git a/java/detect_log/clusters/1 b/java/detect_log/clusters/1 new file mode 100644 index 0000000..f647c99 --- /dev/null +++ b/java/detect_log/clusters/1 @@ -0,0 +1 @@ +0 2 3 \ No newline at end of file diff --git a/java/detect_log/clusters/2 b/java/detect_log/clusters/2 new file mode 100644 index 0000000..7b57bd1 --- /dev/null +++ b/java/detect_log/clusters/2 @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/java/detect_log/clusters/3 b/java/detect_log/clusters/3 new file mode 100644 index 0000000..b4fe77f --- /dev/null +++ b/java/detect_log/clusters/3 @@ -0,0 +1 @@ +4 5 8 \ No newline at end of file diff --git a/java/detect_log/clusters/4 b/java/detect_log/clusters/4 new file mode 100644 index 0000000..cea0e89 --- /dev/null +++ b/java/detect_log/clusters/4 @@ -0,0 +1 @@ +6 7 9 \ No newline at end of file diff --git a/java/detect_log/detect.log b/java/detect_log/detect.log new file mode 100644 index 0000000..2cefae8 --- /dev/null +++ b/java/detect_log/detect.log @@ -0,0 +1,10 @@ +081109 203518 143 INFO dfs.DataNode$DataXceiver: Receiving block blk_-1608999687919862906 src: /10.250.19.102:54106 dest: /10.250.19.102:50010 +081109 203518 35 INFO dfs.FSNamesystem: BLOCK* NameSystem.allocateBlock: /mnt/hadoop/mapred/system/job_200811092030_0001/job.jar.
blk_-1608999687919862906 +081109 203519 143 INFO dfs.DataNode$DataXceiver: Receiving block blk_-1608999687919862906 src: /10.250.10.6:40524 dest: /10.250.10.6:50010 +081109 203519 145 INFO dfs.DataNode$DataXceiver: Receiving block blk_-1608999687919862906 src: /10.250.14.224:42420 dest: /10.250.14.224:50010 +081109 203519 145 INFO dfs.DataNode$PacketResponder: PacketResponder 1 for block blk_-1608999687919862906 terminating +081109 203519 145 INFO dfs.DataNode$PacketResponder: PacketResponder 2 for block blk_-1608999687919862906 terminating +081109 203519 145 INFO dfs.DataNode$PacketResponder: Received block blk_-1608999687919862906 of size 91178 from /10.250.10.6 +081109 203519 145 INFO dfs.DataNode$PacketResponder: Received block blk_-1608999687919862906 of size 91178 from /10.250.19.102 +081109 203519 147 INFO dfs.DataNode$PacketResponder: PacketResponder 0 for block blk_-1608999687919862906 terminating +081109 203519 147 INFO dfs.DataNode$PacketResponder: Received block blk_-1608999687919862906 of size 91178 from /10.250.14.224 \ No newline at end of file diff --git a/java/detect_log/logkey.txt b/java/detect_log/logkey.txt new file mode 100644 index 0000000..4b52ee1 --- /dev/null +++ b/java/detect_log/logkey.txt @@ -0,0 +1 @@ +1 2 1 1 3 3 4 4 3 4 diff --git a/java/detect_log/logvalue/1 b/java/detect_log/logvalue/1 new file mode 100644 index 0000000..f60f1ab --- /dev/null +++ b/java/detect_log/logvalue/1 @@ -0,0 +1 @@ +-0.27948891666666664,0.39378741666666667,0.4394363333333333,0.4158543333333334,0.9347174166666669,-0.08590000000000002,-0.5342015,-0.36786,-1.1335827499999998,-0.6025183333333334 -0.34474441666666666,0.46685858333333335,0.5180023333333333,0.5142760000000001,1.0520995,-0.20836200000000002,-0.4979293333333333,-0.19980916666666668,-1.1568605833333332,-0.6416392500000001 -0.5807055833333334,0.34877700000000006,0.5318064166666666,0.6473328333333334,1.0815334166666668,-0.37340983333333333,-0.2918375,-0.3758109166666667,-1.3126602499999998,-0.6551390833333334 diff --git a/java/detect_log/logvalue/10 b/java/detect_log/logvalue/10 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/10 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/11 b/java/detect_log/logvalue/11 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/11 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/12 b/java/detect_log/logvalue/12 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/12 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/13 b/java/detect_log/logvalue/13 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/13 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/14 b/java/detect_log/logvalue/14 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/14 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/15 b/java/detect_log/logvalue/15 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/15 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/16 b/java/detect_log/logvalue/16 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/16 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/17 b/java/detect_log/logvalue/17 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/17 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/18 b/java/detect_log/logvalue/18 new file mode 100644 index 
0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/18 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/19 b/java/detect_log/logvalue/19 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/19 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/2 b/java/detect_log/logvalue/2 new file mode 100644 index 0000000..760d7e6 --- /dev/null +++ b/java/detect_log/logvalue/2 @@ -0,0 +1 @@ +0.7032797777777778,-0.2436938888888889,-0.16089766666666666,-0.024760222222222225,0.4287812222222222,0.505934,0.17868633333333334,0.4231786666666667,0.08776533333333335,-0.18805511111111112 diff --git a/java/detect_log/logvalue/20 b/java/detect_log/logvalue/20 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/20 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/21 b/java/detect_log/logvalue/21 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/21 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/22 b/java/detect_log/logvalue/22 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/22 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/23 b/java/detect_log/logvalue/23 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/23 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/24 b/java/detect_log/logvalue/24 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/24 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/25 b/java/detect_log/logvalue/25 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/25 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/26 b/java/detect_log/logvalue/26 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/26 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/27 b/java/detect_log/logvalue/27 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/27 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/28 b/java/detect_log/logvalue/28 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/28 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/29 b/java/detect_log/logvalue/29 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/29 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/3 b/java/detect_log/logvalue/3 new file mode 100644 index 0000000..79646df --- /dev/null +++ b/java/detect_log/logvalue/3 @@ -0,0 +1 @@ +0.0664019999999999,0.18280827272727274,0.28105936363636363,0.2908666363636364,1.4609825454545453,0.28252763636363637,0.3609693636363636,-0.17393918181818188,-1.6194566363636362,-0.6568330909090908 0.08106327272727269,0.17677836363636365,0.27736190909090913,0.28871454545454545,1.4653256363636364,0.29304009090909094,0.36668563636363627,-0.1619914545454546,-1.6175321818181816,-0.6542397272727273 -0.10534100000000005,0.09833072727272728,0.2820625454545454,0.3601499090909091,1.5713586363636365,0.22264509090909093,0.4250600909090909,-0.34029572727272733,-1.7507773636363633,-0.7001486363636363 diff --git a/java/detect_log/logvalue/30 b/java/detect_log/logvalue/30 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/30 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/31 b/java/detect_log/logvalue/31 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/31 @@ 
-0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/4 b/java/detect_log/logvalue/4 new file mode 100644 index 0000000..2f33070 --- /dev/null +++ b/java/detect_log/logvalue/4 @@ -0,0 +1 @@ +-0.7186064615384615,0.4540127692307693,0.4793014615384616,0.5282222307692307,1.2471863846153846,-0.4967562307692308,-0.2032558461538462,-0.3366557692307693,-1.765158,-0.28286999999999995 -0.7186064615384615,0.4540127692307693,0.4793014615384616,0.5282222307692307,1.2471863846153846,-0.4967562307692308,-0.2032558461538462,-0.3366557692307693,-1.765158,-0.28286999999999995 -0.485389923076923,0.6849405384615385,0.4014096153846154,0.28411176923076914,1.465224923076923,-0.21241961538461537,-0.00538892307692312,-0.26430853846153846,-2.068485,-0.2633912307692307 diff --git a/java/detect_log/logvalue/5 b/java/detect_log/logvalue/5 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/5 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/6 b/java/detect_log/logvalue/6 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/6 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/7 b/java/detect_log/logvalue/7 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/7 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/8 b/java/detect_log/logvalue/8 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/8 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/9 b/java/detect_log/logvalue/9 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/9 @@ -0,0 +1 @@ +-1 diff --git a/java/java.iml b/java/java.iml new file mode 100644 index 0000000..c90834f --- /dev/null +++ b/java/java.iml @@ -0,0 +1,11 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/java/out/production/java/deeplog.class b/java/out/production/java/deeplog.class new file mode 100644 index 0000000..954dd5a Binary files /dev/null and b/java/out/production/java/deeplog.class differ diff --git a/java/src/deeplog.java b/java/src/deeplog.java new file mode 100644 index 0000000..3e496a9 --- /dev/null +++ b/java/src/deeplog.java @@ -0,0 +1,29 @@ +import java.io.BufferedInputStream; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.util.Scanner; +import java.io.IOException; + +public class deeplog { + public static void main(String[] args) throws Exception { + String detect_file = "detect.log"; + String use_model2 = "1"; + System.out.println("\nExecuting python script file now."); + try { + String cmds = String.format("python C:\\study\\code\\LogAnalysis\\java\\deeplog_java.py %s %s", + detect_file,use_model2); + Process proc = Runtime.getRuntime().exec(cmds); + BufferedReader in = new BufferedReader(new InputStreamReader(proc.getInputStream())); + String line = null; + while ((line = in.readLine()) != null) { + System.out.println(line); + } + in.close(); + proc.waitFor(); + } catch (IOException e) { + e.printStackTrace(); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } +} diff --git a/l_a_d_bi_lstm.py b/l_a_d_bi_lstm.py new file mode 100644 index 0000000..8ba6690 --- /dev/null +++ b/l_a_d_bi_lstm.py @@ -0,0 +1,91 @@ +# -*- coding: UTF-8 -*- +from extractfeature.k8s import log_preprocessor +from extractfeature.k8s import value_extract +import os +from logparsing.fttree import fttree +from extractfeature import hdfs_ft_preprocessor +from anomalydetection.loganomaly import log_anomaly_sequential_train +from 
anomalydetection.loganomaly import log_anomaly_sequential_predict +from anomalydetection.bi_lstm_only import bi_lstm_train +from anomalydetection.bi_lstm_only import bi_lstm_predict + +# parameters for early prepare +log_file_dir = './Data/log/hdfs/' +log_file_name = 'HDFS_split' +log_fttree_out_directory = './Data/FTTreeResult-HDFS/clusters/' +# anomaly file name used which is also used in ./Data/log/file_split +anomaly_line_file = './Data/log/hdfs/HDFs_split_anomaly' +wordvec_file_path = './Data/pretrainedwordvec/crawl-300d-2M.vec(0.1M)' +sequential_directory = './Data/FTTreeResult-HDFS/sequential_files/' +train_file_name = 'train_file' +test_file_name = 'test_file' +label_file_name = 'label_file' +pattern_vec_out_path = './Data/FTTreeResult-HDFS/pattern_vec' +split_degree = 0.2 +# log file line used which is also used in ./Data/log/file_split +log_line_num = 200000 + +# bi lstm only model parameters +window_length = 20 +input_size = 300 +hidden_size = 128 +num_of_layers = 2 +num_of_classes = 26 +num_epochs = 10 +batch_size = 1000 +# for bi lstm only +train_root_path = './Data/FTTreeResult-HDFS/bi_model_train/' +model_out_path = train_root_path + 'bi_model_out/' +data_file = sequential_directory + train_file_name +pattern_vec_file = pattern_vec_out_path + +# predict parameters + +# log anomaly sequential model parameters + +if not os.path.exists(log_fttree_out_directory): + os.makedirs(log_fttree_out_directory) +if not os.path.exists(sequential_directory): + os.makedirs(sequential_directory) +if not os.path.exists(train_root_path): + os.makedirs(train_root_path) + + +def pattern_extract(): + fttree.pattern_extract(log_file_dir, log_file_name, log_fttree_out_directory, 5, 4, 2) + +# 同时生成train file 和 test file好点 +def extract_feature(): + hdfs_ft_preprocessor.preprocessor_hdfs_ft(log_fttree_out_directory, anomaly_line_file, wordvec_file_path, sequential_directory, train_file_name, test_file_name, label_file_name, pattern_vec_out_path, split_degree, log_line_num) + + +def pattern_extract_test(): + fttree.pattern_extract(log_file_dir, log_file_name, log_fttree_out_directory, 5, 4, 2) + + +def extract_feature_test(): + hdfs_ft_preprocessor.preprocessor_hdfs_ft(log_fttree_out_directory, anomaly_line_file, wordvec_file_path, sequential_directory, 'train_file') + + +def train_model(): + #log_anomaly_sequential_train.train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, model_out_path, data_file, pattern_vec_file) + bi_lstm_train.train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, model_out_path, data_file, pattern_vec_file) + + +def test_model(): + # do something + #log_anomaly_sequential_predict.do_predict(input_size, hidden_size, num_of_layers, num_of_classes, window_length, model_out_path + 'Adam_batch_size=200;epoch=200.pt', sequential_directory + label_file_name, sequential_directory + test_file_name, 3, pattern_vec_file) + bi_lstm_predict.do_predict(input_size, hidden_size, num_of_layers, num_of_classes, window_length, model_out_path + 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) + '.pt', sequential_directory + label_file_name, sequential_directory + test_file_name, 10, pattern_vec_file) + + +#extract_feature() +#train_model() +test_model() + +# deep log +# log_preprocessor.execute_process() +# value_extract.get_value() +# value_extract.value_deal() +# value_extract.value_extract() +# train predict + diff --git 
a/log_anomaly_detection.py b/log_anomaly_detection.py index ddc34dd..6f710cf 100644 --- a/log_anomaly_detection.py +++ b/log_anomaly_detection.py @@ -19,18 +19,18 @@ test_file_name = 'test_file' label_file_name = 'label_file' pattern_vec_out_path = './Data/FTTreeResult-HDFS/pattern_vec' -split_degree = 0.2 +split_degree = 0.8 # log file line used which is also used in ./Data/log/file_split log_line_num = 200000 # log anomaly sequential model parameters some parameter maybe changed to train similar models -window_length = 4 +window_length = 20 input_size = 300 -hidden_size = 30 +hidden_size = 128 num_of_layers = 2 -num_of_classes = 61 -num_epochs = 200 -batch_size = 200 +num_of_classes = 26 +num_epochs = 10 +batch_size = 1000 # for log anomaly train_root_path = './Data/FTTreeResult-HDFS/model_train/' model_out_path = train_root_path + 'model_out/' @@ -41,7 +41,7 @@ pattern_vec_file = pattern_vec_out_path # predict parameters - +num_of_candidates = 10 # log anomaly sequential model parameters if not os.path.exists(log_fttree_out_directory): @@ -69,17 +69,17 @@ def extract_feature_test(): def train_model(): - #log_anomaly_sequential_train.train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, model_out_path, data_file, pattern_vec_file) - bi_lstm_att_train.train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, model_out_path, data_file, pattern_vec_file) + log_anomaly_sequential_train.train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, model_out_path, data_file, pattern_vec_file) + #bi_lstm_att_train.train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, model_out_path, data_file, pattern_vec_file) def test_model(): # do something - log_anomaly_sequential_predict.do_predict(input_size, hidden_size, num_of_layers, num_of_classes, window_length, model_out_path + 'Adam_batch_size=200;epoch=200.pt', sequential_directory + label_file_name, sequential_directory + test_file_name, 3, pattern_vec_file) - #bi_lstm_att_predict.do_predict(input_size, hidden_size, num_of_layers, num_of_classes, window_length, model_out_path + 'Adam_batch_size=200;epoch=200.pt', sequential_directory + label_file_name, sequential_directory + test_file_name, 3, pattern_vec_file) + log_anomaly_sequential_predict.do_predict(input_size, hidden_size, num_of_layers, num_of_classes, window_length, model_out_path + 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) + '.pt', sequential_directory + label_file_name, sequential_directory + test_file_name, 10, pattern_vec_file) + #bi_lstm_att_predict.do_predict(input_size, hidden_size, num_of_layers, num_of_classes, window_length, model_out_path + 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) + '.pt', sequential_directory + label_file_name, sequential_directory + train_file_name, num_of_candidates, pattern_vec_file) -pattern_extract() -extract_feature() +#pattern_extract() +#extract_feature() train_model() test_model() diff --git a/log_deep_data_anomaly.py b/log_deep_data_anomaly.py new file mode 100644 index 0000000..9db9090 --- /dev/null +++ b/log_deep_data_anomaly.py @@ -0,0 +1,69 @@ +# -*- coding: UTF-8 -*- +# -*- coding: UTF-8 -*- +from extractfeature.k8s import log_preprocessor +from extractfeature.k8s import value_extract +import os +import torch +from torch.utils.data 
import TensorDataset, DataLoader +from logparsing.fttree import fttree +from extractfeature import hdfs_ft_preprocessor +from anomalydetection.self_att_lstm import self_att_lstm_train +from anomalydetection.self_att_lstm import self_att_lstm_predict + +sequential_directory = './Data/logdeepdata/' +train_file_name = 'hdfs_train' +test_abnormal_name = 'hdfs_test_abnormal' +test_normal_name = 'hdfs_test_normal' +pattern_vec_out_path = './Data/FTTreeResult-HDFS/pattern_vec' + + +# lstm att model parameters +window_length = 10 +input_size = 1 +hidden_size = 128 +num_of_layers = 2 +num_of_classes = 28 +num_epochs = 20 +batch_size = 2000 +# for self att lstm +train_root_path = './Data/Logdeep_Result/self_att_lstm_model_train/' +model_out_path = train_root_path + 'sa_lstm_model_out/' +data_file = sequential_directory + train_file_name +pattern_vec_file = pattern_vec_out_path + +# predict parameters +num_of_candidates = 8 +# log anomaly sequential model parameters + +if not os.path.exists(sequential_directory): + os.makedirs(sequential_directory) +if not os.path.exists(train_root_path): + os.makedirs(train_root_path) + + + +def train_model(): + #log_anomaly_sequential_train.train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, model_out_path, data_file, pattern_vec_file) + self_att_lstm_train.train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, model_out_path, data_file, pattern_vec_file) + + +def test_model(): + # do something + #log_anomaly_sequential_predict.do_predict(input_size, hidden_size, num_of_layers, num_of_classes, window_length, model_out_path + 'Adam_batch_size=200;epoch=200.pt', sequential_directory + label_file_name, sequential_directory + test_file_name, 3, pattern_vec_file) + self_att_lstm_predict.do_log_deep_predict(input_size, hidden_size, num_of_layers, num_of_classes, window_length, model_out_path + 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) + '.pt', sequential_directory + test_normal_name, sequential_directory + test_abnormal_name, num_of_candidates, pattern_vec_file) + + +#pattern_extract() +#extract_feature_spilt_abnormal() +#train_model() +#get_label_sequentials('./Data/FTTreeResult-HDFS/pattern_sequntials') +test_model() + +# deep log +# log_preprocessor.execute_process() +# value_extract.get_value() +# value_extract.value_deal() +# value_extract.value_extract() +# train predict + +# -*- coding: UTF-8 -*- \ No newline at end of file diff --git a/log_predict.py b/log_predict.py new file mode 100644 index 0000000..8b5268d --- /dev/null +++ b/log_predict.py @@ -0,0 +1,305 @@ +#!/usr/bin/python +# -*- coding:utf-8 -*- +import torch +import time +from enum import Enum +from anomalydetection.deeplog.Model1.log_key_LSTM_train import Model as Model1 +from anomalydetection.deeplog.Model2.variable_LSTM_train import Model as Model2 +import torch.nn as nn +import os +import matplotlib.pyplot as plt +from collections import Counter + +# use cuda if available otherwise use cpu +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# 记录每个 key 对应的 value 的长度 +value_length_of_key = [] + +# 继承枚举类 +class LineNumber(Enum): + PATTERN_LINE = 0 + NUMBERS_LINE = 3 + + + +def generate(name,window_length): + log_keys_sequences=list() + length=0 + with open(name, 'r') as f: + for line in f.readlines(): + line = list(map(lambda n: n, map(int, line.strip().split()))) + line = line + [-1] * (window_length + 1 - len(line)) + 
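+            # Padding note: sessions shorter than window_length + 1 are right-padded with the
+            # sentinel key -1 so that at least one (window, next-key) pair can be sliced from
+            # every session; e.g. with window_length = 4, [3, 1] becomes (3, 1, -1, -1, -1).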
# for i in range(len(line) - window_size): + # inputs.add(tuple(line[i:i+window_size])) + # log_keys_sequences[tuple(line)] = log_keys_sequences.get(tuple(line), 0) + 1 + log_keys_sequences.append(tuple(line)) + length+=1 + return log_keys_sequences,length + + +def get_value_length(log_preprocessor_dir,log_fttree_out_dir): + global value_length_of_key + value_length_of_key = [10]*(len(os.listdir(log_fttree_out_dir)) + 1) + log_value_folder = log_preprocessor_dir + 'logvalue_train/' + file_names = os.listdir(log_value_folder) + # for i in range(len(file_names)): + # with open(log_value_folder + str(i+1), 'r') as f: + # x = f.readlines() + # if len(x) == 0 or x[0].strip('\n') == '-1': + # value_length_of_key.append(0) + # else: + # line = x[0].strip('\n') + # key_values = line.split(' ') + # value_length_of_key[i+1] = len(key_values[0].split(',')) + + +def load_model1(model_dir,model_name,input_size, hidden_size, num_layers): + num_classes = len(value_length_of_key) + # num_classes = 28 + print("Model1 num_classes: ", num_classes) + model1_dir = model_dir + 'model1/' + model_path = model1_dir + model_name + model1 = Model1(input_size, hidden_size, num_layers, num_classes).to(device) + model1.load_state_dict(torch.load(model_path, map_location='cpu')) + model1.eval() + print('model_path: {}'.format(model_path)) + return model1 + + +def load_model2(model_dir,epoch,input_size, hidden_size, num_layers): + model2_dir = model_dir+ 'model2/' + model2 = [] + for i in range(len(value_length_of_key)): + if value_length_of_key[i] == 0: + model2.append(None) + continue + input_size = value_length_of_key[i] + out_size = input_size + model_name = str(i+1) + '_epoch=' + str(epoch)+ '.pt' + model_path = model2_dir + str(i+1) + '/' + model_name + if not os.path.exists(model_path): + model2.append(None) + continue + model = Model2(input_size, hidden_size, num_layers, out_size).to(device) + model.load_state_dict(torch.load(model_path, map_location='cpu')) + model.eval() + print('model_path: {}'.format(model_path)) + model2.append(model) + return model2 + + +def draw_evaluation(title, indexs, values, xlabel, ylabel): + fig = plt.figure(figsize=(15,10)) + x = indexs + y = values + plt.bar(x, y, align='center', alpha=0.5, width=0.4) + plt.xticks(x, x) + plt.ylabel(ylabel) + plt.xlabel(xlabel) + plt.title(title) + plt.show() + + +def do_predict(log_preprocessor_dir,log_fttree_out_dir,model_dir,model1_name,model2_num_epochs,window_length,input_size, hidden_size, num_layers,num_candidates,mse_threshold,use_model2): + # abnormal_label_file = log_preprocessor_dir + 'HDFS_abnormal_label.txt' + + get_value_length(log_preprocessor_dir,log_fttree_out_dir) + + model1 = load_model1(model_dir, model1_name, input_size, hidden_size, num_layers) + + model2 = load_model2(model_dir,model2_num_epochs,10, hidden_size, num_layers) + + # for Model2's prediction, store which log currently predicts for each log_key. + # When model one predicts normal, model2 makes predictions. + # At this time, the forward few logs with the same log_key are needed to be predicted + # so the pattern_index is used to record the log_key to be predicted. 
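+    # Decision rule used below: a session is flagged as abnormal as soon as either
+    #   (1) Model1's true next log key is not among the top `num_candidates` predicted keys, or
+    #   (2) Model2's MSE for some parameter-value window exceeds `mse_threshold`;
+    # otherwise the session is counted as normal.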
+    # pattern_index = [0] * len(pattern2value)
+    # pattern_index = [0] * 63
+    start_time = time.time()
+    criterion = nn.MSELoss()
+    TP = 0
+    FP = 0
+    TN = 0
+    FN = 0
+    ALL = 0
+    test_normal_loader, test_normal_length = generate(log_preprocessor_dir + 'logkey/logkey_normal', window_length)
+    test_abnormal_loader, test_abnormal_length = generate(log_preprocessor_dir + 'logkey/logkey_abnormal', window_length)
+
+
+    print('predict start')
+
+    # normal test
+    with torch.no_grad():
+        for line_num, line in enumerate(test_normal_loader):
+            model1_success = False
+            for i in range(len(line) - window_length - 1):
+                seq0 = line[i:i + window_length]
+                label = line[i + window_length]
+
+                seq0 = torch.tensor(seq0, dtype=torch.float).view(
+                    -1, window_length, input_size).to(device)
+                label = torch.tensor(label).view(-1).to(device)
+                output = model1(seq0)
+                predicted = torch.argsort(output, 1)[0][-num_candidates:]
+                if label not in predicted:
+                    FP += 1
+                    model1_success = True
+                    break
+            if (model1_success):
+                continue
+
+            # if Model2 also predicts normal, count TN + 1, otherwise FP + 1
+
+            # there is one value file per log key for the normal set; for this session (line),
+            # fetch the corresponding row of each value file and predict on it
+
+            # When model one predicts normal, model2 makes predictions.
+            # values: all value vectors of the logs that belong to log_key (whose id is pattern_id)
+
+            # whether to use Model2
+            if use_model2:
+
+                seq = []  # collect this session's value sequences from the normal value files
+                for i in range(26):
+                    with open(log_preprocessor_dir + '/logvalue_normal/' + str(i + 1), 'r') as f:
+                        key_values = f.readlines()
+                        key_values = key_values[line_num].strip('\n')
+                        if (key_values == '-1'):
+                            continue
+                        seq.append(key_values.split(' '))
+                # convert the strings to floats
+                for k1 in range(len(seq)):
+                    for k2 in range(len(seq[k1])):
+                        seq[k1][k2] = seq[k1][k2].strip('\n')
+                        seq[k1][k2] = seq[k1][k2].split(',')
+                        for k3 in range(len(seq[k1][k2])):
+                            if (seq[k1][k2][k3] != ''):
+                                seq[k1][k2][k3] = float(seq[k1][k2][k3])
+
+                # pad short value sequences with zero vectors (mirrors the Model2 check in java/deeplog_java.py)
+                for i in range(len(seq)):
+                    if (len(seq[i]) < window_length + 1):
+                        for j in range(window_length + 1 - len(seq[i])):
+                            seq[i].append([0.0] * 10)
+                model2_success = False
+                # Model2 prediction
+                for i in range(len(seq)):
+                    if (model2[i] is None):
+                        continue
+                    for j in range(len(seq[i]) - window_length):
+                        seq2 = seq[i][j:j + window_length]
+                        label2 = seq[i][j + window_length]
+                        seq2 = torch.tensor(seq2, dtype=torch.float).view(
+                            -1, window_length, 10).to(device)
+                        label2 = torch.tensor(label2, dtype=torch.float).view(-1).to(device)
+                        output = model2[i](seq2)
+                        mse = criterion(output[0], label2.to(device))
+                        if mse > mse_threshold:
+                            FP += 1
+                            model2_success = True
+                            break
+                    if (model2_success):
+                        break
+
+
+    # abnormal test
+    with torch.no_grad():
+        for line_num, line in enumerate(test_abnormal_loader):
+            model1_success = False
+            for i in range(len(line) - window_length):
+                seq0 = line[i:i + window_length]
+                label = line[i + window_length]
+
+                seq0 = torch.tensor(seq0, dtype=torch.float).view(
+                    -1, window_length, input_size).to(device)
+
+                label = torch.tensor(label).view(-1).to(device)
+                output = model1(seq0)
+                predicted = torch.argsort(output, 1)[0][-num_candidates:]
+                if label not in predicted:
+                    TP += 1
+                    model1_success = True
+                    break
+            if (model1_success):
+                continue
+
+            # whether to use Model2
+            if use_model2:
+                seq = []  # collect this session's value sequences from the abnormal value files
+                for i in range(26):
+                    with open(log_preprocessor_dir + '/logvalue_abnormal/' + str(i + 1), 'r') as f:
+                        key_values = f.readlines()
+                        key_values = key_values[line_num].strip('\n')
+                        if (key_values == '-1'):
+                            continue
+                        seq.append(key_values.split(' '))
+                # convert the strings to floats
+                for k1 in range(len(seq)):
+                    for k2 in range(len(seq[k1])):
+                        seq[k1][k2] = seq[k1][k2].strip('\n')
+                        seq[k1][k2] = seq[k1][k2].split(',')
+                        for k3 in range(len(seq[k1][k2])):
+                            if (seq[k1][k2][k3] != ''):
+                                seq[k1][k2][k3] = float(seq[k1][k2][k3])
+
+                # pad short value sequences with zero vectors
+                for i in range(len(seq)):
+                    if (len(seq[i]) < window_length + 1):
+                        for j in range(window_length + 1 - len(seq[i])):
+                            seq[i].append([0.0] * 10)
+                model2_success = False
+                # Model2 prediction
+                for i in range(len(seq)):
+                    if (model2[i] is None):
+                        continue
+                    for j in range(len(seq[i]) - window_length):
+                        seq2 = seq[i][j:j + window_length]
+                        label2 = seq[i][j + window_length]
+                        seq2 = torch.tensor(seq2, dtype=torch.float).view(
+                            -1, window_length, 10).to(device)
+                        label2 = torch.tensor(label2, dtype=torch.float).view(-1).to(device)
+                        output = model2[i](seq2)
+                        mse = criterion(output[0], label2.to(device))
+                        if mse > mse_threshold:
+                            TP += 1
+                            model2_success = True
+                            break
+                    if (model2_success):
+                        break
+
+    # Compute precision, recall and F1-measure
+    FN = test_abnormal_length - TP
+    TN = test_normal_length - FP
+
+    print('FP: {}, FN: {}, TP: {}, TN: {}'.format(FP, FN, TP, TN))
+    Acc = (TP + TN) * 100 / (TP + TN + FP + FN)
+    P = 100 * TP / (TP + FP)
+    R = 100 * TP / (TP + FN)
+    F1 = 2 * P * R / (P + R)
+
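+    # Metric sanity check (illustrative numbers only, not from a real run): with
+    # TP = 90, FP = 10, FN = 30, TN = 870 the formulas above give
+    # Acc = 96%, P = 90%, R = 75% and F1 = 2 * 90 * 75 / (90 + 75) ≈ 81.8%.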
print('Finished Predicting') + elapsed_time = time.time() - start_time + print('elapsed_time: {}'.format(elapsed_time)) + + + print('FP: {}, FN: {}, TP: {}, TN: {}'.format(FP, FN, TP, TN)) + # print('Acc: {:.3f}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(Acc, P, R, F1)) + print('Finished Predicting') + elapsed_time = time.time() - start_time + print('elapsed_time: {}'.format(elapsed_time)) + + #draw_evaluation("Evaluations", ['Acc', 'Precision', 'Recall', 'F1-measure'],[Acc, P, R, F1], 'evaluations', '%') + + + + + + + diff --git a/logparsing/converter/__init__.py b/logparsing/converter/__init__.py new file mode 100644 index 0000000..9764abf --- /dev/null +++ b/logparsing/converter/__init__.py @@ -0,0 +1 @@ +# -*- coding: UTF-8 -*- \ No newline at end of file diff --git a/logparsing/converter/__pycache__/__init__.cpython-36.pyc b/logparsing/converter/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..36fcdb4 Binary files /dev/null and b/logparsing/converter/__pycache__/__init__.cpython-36.pyc differ diff --git a/logparsing/converter/__pycache__/eventid2number.cpython-36.pyc b/logparsing/converter/__pycache__/eventid2number.cpython-36.pyc new file mode 100644 index 0000000..d1f0a5f Binary files /dev/null and b/logparsing/converter/__pycache__/eventid2number.cpython-36.pyc differ diff --git a/logparsing/converter/eventid2number.py b/logparsing/converter/eventid2number.py new file mode 100644 index 0000000..ceba5e0 --- /dev/null +++ b/logparsing/converter/eventid2number.py @@ -0,0 +1,8 @@ +import pandas as pd + +def add_numberid(logparser_templates_file): + df = pd.read_csv(logparser_templates_file, header=0) + df['numberID'] = range(1, len(df) + 1) + print(df) + + df.to_csv(logparser_templates_file, columns=df.columns, index=0, header=1) \ No newline at end of file diff --git a/logparsing/converter/logparser2cluster.py b/logparsing/converter/logparser2cluster.py new file mode 100644 index 0000000..48b95ba --- /dev/null +++ b/logparsing/converter/logparser2cluster.py @@ -0,0 +1,25 @@ +# coding:utf-8 +import pandas as pd +import os + +# log parser_file should be structed.csv output should be './Data/FTTreeResult-HDFS/clusters/' +def logparser2cluster(logparser_file, output_dir): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + logparser_result = pd.read_csv(logparser_file, header=0) + key_dict = {} + value_dict = {} + for _, row in logparser_result.iterrows(): + key = row['EventTemplate'] + if not key in key_dict: + key_dict[key] = [] + key_dict[key].append(str(row['LineId'])) + key_num = 1 + for key, lines in key_dict.items(): + with open(output_dir + "/" + str(key_num), 'w') as f: + f.write(key + "\n") + f.write(" ".join(lines)) + key_num += 1 + +if __name__ == "__main__": + logparser2cluster("Drain_result/HDFS.log_structured.csv", "clusters") diff --git a/logparsing/drain/.gitignore b/logparsing/drain/.gitignore new file mode 100644 index 0000000..546f7e3 --- /dev/null +++ b/logparsing/drain/.gitignore @@ -0,0 +1,9 @@ +**/__pycache__/* +MANIFEST +dist/* +venv/* +.idea/* +drain3.egg-info/* +snapshot.txt +examples/snapshot.txt +*.bin diff --git a/logparsing/drain/CONTRIBUTING.md b/logparsing/drain/CONTRIBUTING.md new file mode 100644 index 0000000..b54d7be --- /dev/null +++ b/logparsing/drain/CONTRIBUTING.md @@ -0,0 +1,48 @@ +All contributors must agree to the Developer Certificate of Origin Version 1.1. 
(DCO 1.1) by signing their commits with: + +``` +Signed-off-by: [NAME] <[EMAIL]> +``` + +This can be simply achieved with `git commit -s` when formatting your commit message. + +The full text of the DCO 1.1 is as follows: + +``` +Developer Certificate of Origin +Version 1.1 + +Copyright (C) 2004, 2006 The Linux Foundation and its contributors. +660 York Street, Suite 102, +San Francisco, CA 94110 USA + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. + + +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I +have the right to submit it under the open source license +indicated in the file; or + +(b) The contribution is based upon previous work that, to the best +of my knowledge, is covered under an appropriate open source +license and I have the right under that license to submit that +work with modifications, whether created in whole or in part +by me, under the same open source license (unless I am +permitted to submit under a different license), as indicated +in the file; or + +(c) The contribution was provided directly to me by some other +person who certified (a), (b) or (c) and I have not modified +it. + +(d) I understand and agree that this project and the contribution +are public and that a record of the contribution (including all +personal information I submit with it, including my sign-off) is +maintained indefinitely and may be redistributed consistent with +this project or the open source license(s) involved. +``` diff --git a/logparsing/drain/HDFS_drain.py b/logparsing/drain/HDFS_drain.py new file mode 100644 index 0000000..b14e226 --- /dev/null +++ b/logparsing/drain/HDFS_drain.py @@ -0,0 +1,34 @@ +import configparser +import json +import logging +import sys +import os +import shutil + +from logparsing.drain.drain3.template_miner import TemplateMiner +from logparsing.drain.drain3.file_persistence import FilePersistence + + +def get_hdfs_drain_clusters(log,drain_out,bin_dir): + persistence_type = "FILE" + config = configparser.ConfigParser() + config.read('drain3.ini') + logger = logging.getLogger(__name__) + logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(message)s') + persistence = FilePersistence(bin_dir) + template_miner = TemplateMiner(persistence) + shutil.rmtree(drain_out) + os.makedirs(drain_out,exist_ok=True) + with open(log,'r') as file: + lineNum = 0 + for line in file.readlines(): + print(lineNum) + result = template_miner.add_log_message(line) + cluster_id = json.dumps(result["cluster_id"]) + cluster_id = int(cluster_id[2:-1]) + with open(drain_out+str(cluster_id),'a') as outfile: + outfile.write(str(lineNum) + " ") + lineNum += 1 + # print("Clusters:") + #for cluster in template_miner.drain.clusters: + #print(cluster) diff --git a/logparsing/drain/LICENSE.txt b/logparsing/drain/LICENSE.txt new file mode 100644 index 0000000..d152f60 --- /dev/null +++ b/logparsing/drain/LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 International Business Machines + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, 
subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/logparsing/drain/README.md b/logparsing/drain/README.md new file mode 100644 index 0000000..ec7d78a --- /dev/null +++ b/logparsing/drain/README.md @@ -0,0 +1,169 @@
+# Drain3
+## Introduction
+
+Drain3 is an online log template miner that can extract templates (clusters) from a stream of log messages
+in a timely manner. It employs a parse tree with fixed depth to guide the log group search process,
+which effectively avoids constructing a very deep and unbalanced tree.
+
+Drain3 continuously learns on-the-fly and automatically extracts "log templates" from raw log entries.
+
+#### Example:
+
+For the input:
+
+```
+connected to 10.0.0.1
+connected to 10.0.0.2
+connected to 10.0.0.3
+Hex number 0xDEADBEAF
+Hex number 0x10000
+user davidoh logged in
+user eranr logged in
+```
+
+Drain3 extracts the following templates:
+
+```
+A0001 (size 3): connected to <IP>
+A0002 (size 2): Hex number <HEX>
+A0003 (size 2): user <*> logged in
+```
+
+This project is an upgrade of the original [Drain](https://github.com/logpai/logparser/blob/master/logparser/Drain)
+project by LogPAI from Python 2.7 to Python 3.6 or later with some bug-fixes and additional features.
+
+Read more information about Drain from the following paper:
+
+- Pinjia He, Jieming Zhu, Zibin Zheng, and Michael R. Lyu. [Drain: An Online Log Parsing Approach with Fixed Depth Tree](http://jmzhu.logpai.com/pub/pjhe_icws2017.pdf), Proceedings of the 24th International Conference on Web Services (ICWS), 2017.
+
+A possible Drain3 use case in this blog post: [Use open source Drain3 log-template mining project to monitor for network outages](https://developer.ibm.com/blogs/how-mining-log-templates-can-help-ai-ops-in-cloud-scale-data-centers).
+
+
+#### New features
+
+- **Persistence**. Save and load Drain state into an [Apache Kafka](https://kafka.apache.org) topic or a file.
+- **Streaming**. Support feeding Drain with messages one-by-one.
+- **Masking**. Replace some message parts (e.g. numbers, IPs, emails) with wildcards. This improves the accuracy of template mining.
+- **Packaging**. As a pip package.
+
+#### Expected Input and Output
+
+The input for Drain3 is the unstructured free-text portion of log messages. It is recommended to extract
+structured headers like timestamp, hostname, severity, etc. from log messages before passing them to Drain3,
+in order to improve mining accuracy.
+
+The output is a dictionary with the following fields:
+- `change_type`: indicates whether a new template was identified, an existing template was changed, or the message was added to an existing cluster.
+- `cluster_id`: Sequential ID of the cluster that the log belongs to, for example, `A0008`
+- `cluster_size`: The size (message count) of the cluster that the log belongs to
+- `cluster_count`: Count of clusters seen so far
+- `template_mined`: the last template of the above cluster_id
+
+Templates may change over time based on input, for example:
+
+```
+aa aa aa
+{"change_type": "cluster_created", "cluster_id": "A0001", "cluster_size": 1, "template_mined": "aa aa aa", "cluster_count": 1}
+
+aa aa ab
+{"change_type": "cluster_template_changed", "cluster_id": "A0001", "cluster_size": 2, "template_mined": "aa aa <*>", "cluster_count": 1}
+```
+
+**Explanation:** *Drain3 learned that the third token is a parameter*
+
+## Configuration
+
+Drain3 is configured using [configparser](https://docs.python.org/3.4/library/configparser.html) with the file `drain3.ini`; available parameters are:
+- `[DEFAULT]/snapshot_poll_timeout_sec` - maximum timeout for restoring a snapshot from Kafka (default 60)
+- `[DEFAULT]/sim_th` - recognition threshold (default 0.4)
+- `[DEFAULT]/masking` - parameter masking - in json format (default "")
+- `[DEFAULT]/snapshot_interval_minutes` - interval for new snapshots (default 1)
+- `[DEFAULT]/compress_state` - whether to compress the state before saving it. This can be useful when using Kafka persistence.
+
+## Masking
+
+This feature allows masking of specific parameters in log messages to specific keywords. Use a list of regular expression
+dictionaries in the configuration file with the format {'regex_pattern', 'mask_with'} to set custom masking.
+
+In order to mask an IP address, create the file `drain3.ini`:
+
+```
+[DEFAULT]
+masking = [
+ {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})((?=[^A-Za-z0-9])|$)", "mask_with": "IP"},
+ ]
+```
+
+Now, Drain3 recognizes IP addresses in templates, for example with input such as:
+```
+IP is 12.12.12.12
+{"change_type": "cluster_created", "cluster_id": "A0013", "cluster_size": 1, "template_mined": "IP is <IP>", "cluster_count": 13}
+```
+
+Note: template parameters that do not match custom masking are output as <*>
+
+## Persistence
+The persistence feature saves and loads a snapshot of Drain3 state in (compressed) json format. This feature adds restart resiliency
+to Drain3, allowing continuation of activity and knowledge across restarts.
+
+Drain3 state includes the search tree and all the clusters that were identified up until snapshot time.
+
+The snapshot also persists the number of occurrences per cluster, and the cluster_id.
+
+An example of a snapshot:
+```
+{"clusters": [{"cluster_id": "A0001", "log_template_tokens": `["aa", "aa", "<\*>"]`, "py/object": "drain3_core.LogCluster", "size": 2}, {"cluster_id": "A0002", "log_template_tokens": `["My", "IP", "is", "<IP>"]`, "py/object": "drain3_core.LogCluster", "size": 1}]...
+```
+
+This example snapshot persists two clusters with the templates:
+
+> `["aa", "aa", "<\*>"]` - occurs twice
+>
+> `["My", "IP", "is", "<IP>"]` - occurs once
+
+Snapshots are created in the following events:
+
+- `cluster_created` - on any new template
+- `cluster_template_changed` - on any update of a template
+- `periodic` - after n minutes from the last snapshot. This is intended to save cluster sizes even if no new template was identified.
+
+Drain3 currently supports 3 persistence modes:
+
+- **Kafka** - The snapshot is saved in a dedicated topic used only for snapshots - the last message in this topic
+is the last snapshot that will be loaded after restart.
+For Kafka persistence, you need to provide: `topic_name` and `server_name`. + +- **File** - The snapshot is saved to a file. + +- **None** - No persistence. + +Drain3 persistence modes can be easily extended to another medium / database by +inheriting the [PersistenceHandler](drain3/persistence_handler.py) class. + + +## Installation + +Drain3 is available from [PyPI](https://pypi.org/project/drain3). To install use `pip`: + +```pip3 install drain3``` + + +## Examples + +Run [examples/drain_stdin_demo.py](examples/drain_stdin_demo.py) from the root folder of the repository by: + +``` +python -m examples.drain_stdin_demo +``` + +Use Drain3 with input from stdin and persist to either Kafka / file / no persistence. + +Enter several log lines using the command line. Press `q` to end execution. + +Change `persistence_type` variable in the example to change persistence mode. + +An example drain3.ini file with masking instructions exists in the `examples` folder. + +## Contributing + +Our project welcomes external contributions. Please refer to [CONTRIBUTING.md](CONTRIBUTING.md) for further details. diff --git a/logparsing/drain/__init__.py b/logparsing/drain/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/logparsing/drain/drain3/__init__.py b/logparsing/drain/drain3/__init__.py new file mode 100644 index 0000000..3113a50 --- /dev/null +++ b/logparsing/drain/drain3/__init__.py @@ -0,0 +1,2 @@ +from logparsing.drain.drain3.template_miner import TemplateMiner + diff --git a/logparsing/drain/drain3/drain.py b/logparsing/drain/drain3/drain.py new file mode 100644 index 0000000..9b961d4 --- /dev/null +++ b/logparsing/drain/drain3/drain.py @@ -0,0 +1,258 @@ +""" +Description : This file implements the Drain algorithm for log parsing +Author : LogPAI team +Modified by : david.ohana@ibm.com, moshikh@il.ibm.com +License : MIT +""" + +param_str = '<*>' + + +class LogCluster: + def __init__(self, log_template_tokens: list, cluster_id): + self.log_template_tokens = log_template_tokens + self.cluster_id = cluster_id + self.size = 1 + + def get_template(self): + return ' '.join(self.log_template_tokens) + + def __str__(self): + return f"{self.cluster_id} (size {self.size}): {self.get_template()}" + + +class Node: + def __init__(self, key, depth): + self.depth = depth + self.key = key + self.key_to_child_node = {} + self.clusters = [] + + +class Drain: + + def __init__(self, depth=4, sim_th=0.4, max_children=100): + """ + Attributes + ---------- + depth : depth of all leaf nodes + sim_th : similarity threshold + max_children : max number of children of an internal node + """ + self.depth = depth - 2 + self.sim_th = sim_th + self.max_children = max_children + self.root_node = Node("(ROOT)", 0) + self.clusters = [] + + @staticmethod + def has_numbers(s): + return any(char.isdigit() for char in s) + + def tree_search(self, root_node: Node, tokens): + + token_count = len(tokens) + parent_node = root_node.key_to_child_node.get(token_count) + + # no template with same token count yet + if parent_node is None: + return None + + # handle case of empty log string + if token_count == 0: + return parent_node.clusters[0] + + cluster = None + current_depth = 1 + for token in tokens: + at_max_depth = current_depth == self.depth + is_last_token = current_depth == token_count + + if at_max_depth or is_last_token: + break + + key_to_child_node = parent_node.key_to_child_node + if token in key_to_child_node: + parent_node = key_to_child_node[token] + elif param_str in key_to_child_node: + parent_node = 
key_to_child_node[param_str] + else: + return cluster + current_depth += 1 + + cluster = self.fast_match(parent_node.clusters, tokens) + + return cluster + + def add_seq_to_prefix_tree(self, root_node, cluster: LogCluster): + token_count = len(cluster.log_template_tokens) + if token_count not in root_node.key_to_child_node: + first_layer_node = Node(key=token_count, depth=1) + root_node.key_to_child_node[token_count] = first_layer_node + else: + first_layer_node = root_node.key_to_child_node[token_count] + + parent_node = first_layer_node + + # handle case of empty log string + if len(cluster.log_template_tokens) == 0: + parent_node.clusters.append(cluster) + return + + current_depth = 1 + for token in cluster.log_template_tokens: + + # Add current log cluster to the leaf node + at_max_depth = current_depth == self.depth + is_last_token = current_depth == token_count + if at_max_depth or is_last_token: + parent_node.clusters.append(cluster) + break + + # If token not matched in this layer of existing tree. + if token not in parent_node.key_to_child_node: + if not self.has_numbers(token): + if param_str in parent_node.key_to_child_node: + if len(parent_node.key_to_child_node) < self.max_children: + new_node = Node(key=token, depth=current_depth + 1) + parent_node.key_to_child_node[token] = new_node + parent_node = new_node + else: + parent_node = parent_node.key_to_child_node[param_str] + else: + if len(parent_node.key_to_child_node) + 1 < self.max_children: + new_node = Node(key=token, depth=current_depth + 1) + parent_node.key_to_child_node[token] = new_node + parent_node = new_node + elif len(parent_node.key_to_child_node) + 1 == self.max_children: + new_node = Node(key=param_str, depth=current_depth + 1) + parent_node.key_to_child_node[param_str] = new_node + parent_node = new_node + else: + parent_node = parent_node.key_to_child_node[param_str] + + else: + if param_str not in parent_node.key_to_child_node: + new_node = Node(key=param_str, depth=current_depth + 1) + parent_node.key_to_child_node[param_str] = new_node + parent_node = new_node + else: + parent_node = parent_node.key_to_child_node[param_str] + + # If the token is matched + else: + parent_node = parent_node.key_to_child_node[token] + + current_depth += 1 + + # seq1 is template + @staticmethod + def get_seq_distance(seq1, seq2): + assert len(seq1) == len(seq2) + sim_tokens = 0 + param_count = 0 + + for token1, token2 in zip(seq1, seq2): + if token1 == param_str: + param_count += 1 + continue + if token1 == token2: + sim_tokens += 1 + + ret_val = float(sim_tokens) / len(seq1) + + return ret_val, param_count + + def fast_match(self, cluster_list: list, tokens): + match_cluster = None + + max_sim = -1 + max_param_count = -1 + max_cluster = None + + for cluster in cluster_list: + cur_sim, param_count = self.get_seq_distance(cluster.log_template_tokens, tokens) + if cur_sim > max_sim or (cur_sim == max_sim and param_count > max_param_count): + max_sim = cur_sim + max_param_count = param_count + max_cluster = cluster + + if max_sim >= self.sim_th: + match_cluster = max_cluster + + return match_cluster + + @staticmethod + def get_template(seq1, seq2): + assert len(seq1) == len(seq2) + ret_val = [] + + i = 0 + for word in seq1: + if word == seq2[i]: + ret_val.append(word) + else: + ret_val.append(param_str) + + i += 1 + + return ret_val + + def print_tree(self): + self.print_node(self.root_node, 0) + + def print_node(self, node, depth): + out_str = '' + for i in range(depth): + out_str += '\t' + + if node.depth == 0: + out_str += 
'Root' + elif node.depth == 1: + out_str += '<' + str(node.key) + '>' + else: + out_str += node.key + + print(out_str) + + if node.depth == self.depth: + return 1 + for child in node.key_to_child_node: + self.print_node(node.key_to_child_node[child], depth + 1) + + @staticmethod + def num_to_cluster_id(num): + cluster_id = "A{:04d}".format(num) + return cluster_id + + def add_log_message(self, content: str): + content = content.strip() + content_tokens = content.split() + match_cluster = self.tree_search(self.root_node, content_tokens) + + # Match no existing log cluster + if match_cluster is None: + cluster_num = len(self.clusters) + 1 + cluster_id = self.num_to_cluster_id(cluster_num) + match_cluster = LogCluster(content_tokens, cluster_id) + self.clusters.append(match_cluster) + self.add_seq_to_prefix_tree(self.root_node, match_cluster) + update_type = "cluster_created" + + # Add the new log message to the existing cluster + else: + new_template_tokens = self.get_template(content_tokens, match_cluster.log_template_tokens) + if ' '.join(new_template_tokens) != ' '.join(match_cluster.log_template_tokens): + match_cluster.log_template_tokens = new_template_tokens + update_type = "cluster_template_changed" + else: + update_type = "none" + match_cluster.size += 1 + + return match_cluster, update_type + + def get_total_cluster_size(self): + size = 0 + for c in self.clusters: + size += c.size + return size diff --git a/logparsing/drain/drain3/file_persistence.py b/logparsing/drain/drain3/file_persistence.py new file mode 100644 index 0000000..26faf66 --- /dev/null +++ b/logparsing/drain/drain3/file_persistence.py @@ -0,0 +1,25 @@ +""" +Description : This file implements the persist/restore from file +Author : Moshik Hershcovitch +Author_email: moshikh@il.ibm.com +License : MIT +""" + +import os +import pathlib + +from logparsing.drain.drain3.persistence_handler import PersistenceHandler + + +class FilePersistence(PersistenceHandler): + def __init__(self, file_path): + self.file_path = file_path + + def save_state(self, state): + pathlib.Path(self.file_path).write_bytes(state) + + def load_state(self): + if not os.path.exists(self.file_path): + return None + + return pathlib.Path(self.file_path).read_bytes() diff --git a/logparsing/drain/drain3/kafka_persistence.py b/logparsing/drain/drain3/kafka_persistence.py new file mode 100644 index 0000000..c5a05d4 --- /dev/null +++ b/logparsing/drain/drain3/kafka_persistence.py @@ -0,0 +1,45 @@ +""" +Author : Moshik Hershcovitch +Author : David Ohana, Moshik Hershcovitch, Eran Raichstein +Author_email: david.ohana@ibm.com, moshikh@il.ibm.com, eranra@il.ibm.com +License : MIT +""" +import configparser + +import kafka + +# logger = logging.getLogger(__name__) +from logparsing.drain.drain3.persistence_handler import PersistenceHandler + +config = configparser.ConfigParser() +config.read('drain3.ini') + + +class KafkaPersistence(PersistenceHandler): + def __init__(self, server_list, topic): + self.server_list = server_list + self.topic = topic + self.producer = kafka.KafkaProducer(bootstrap_servers=server_list) + + def save_state(self, state): + self.producer.send(self.topic, value=state) + + def load_state(self): + consumer = kafka.KafkaConsumer(bootstrap_servers=self.server_list) + partition = kafka.TopicPartition(self.topic, 0) + consumer.assign([partition]) + end_offsets = consumer.end_offsets([partition]) + end_offset = list(end_offsets.values())[0] + if end_offset > 0: + consumer.seek(partition, end_offset - 1) + snapshot_poll_timeout_ms = 
int(config.get('DEFAULT', 'snapshot_poll_timeout_sec', fallback=60)) * 1000 + records = consumer.poll(snapshot_poll_timeout_ms) + if not records: + raise RuntimeError(f"No message received from Kafka during restore even though end_offset>0") + last_msg = records[partition][0] + state = last_msg.value + else: + state = None + + consumer.close() + return state diff --git a/logparsing/drain/drain3/masking.py b/logparsing/drain/drain3/masking.py new file mode 100644 index 0000000..a57b9bb --- /dev/null +++ b/logparsing/drain/drain3/masking.py @@ -0,0 +1,65 @@ +""" +Description : This file implements the persist/restore from Kafka +Author : Moshik Hershcovitch +Author_email: moshikh@il.ibm.com +License : MIT +""" +import configparser +import json +import logging +import re +from typing import List + +logger = logging.getLogger(__name__) +config = configparser.ConfigParser() +config.read('drain3.ini') + + +class MaskingInstruction: + def __init__(self, regex_pattern: str, mask_with: str): + self.regex_pattern = regex_pattern + self.mask_with = mask_with + self.regex = re.compile(regex_pattern) + self.mask_with_wrapped = "<" + mask_with + ">" + + +class RegexMasker: + def __init__(self, masking_instructions: List[MaskingInstruction]): + self.masking_instructions = masking_instructions + + def mask(self, content: str): + for mi in self.masking_instructions: + content = re.sub(mi.regex, mi.mask_with_wrapped, content) + return content + + +# Some masking examples +# --------------------- +# +# masking_instances = [ +# MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)(([0-9a-f]{2,}:){3,}([0-9a-f]{2,}))((?=[^A-Za-z0-9])|$)', "ID"), +# MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})((?=[^A-Za-z0-9])|$)', "IP"), +# MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)([0-9a-f]{6,} ?){3,}((?=[^A-Za-z0-9])|$)', "SEQ"), +# MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)([0-9A-F]{4} ?){4,}((?=[^A-Za-z0-9])|$)', "SEQ"), +# +# MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)(0x[a-f0-9A-F]+)((?=[^A-Za-z0-9])|$)', "HEX"), +# MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)([\-\+]?\d+)((?=[^A-Za-z0-9])|$)', "NUM"), +# MaskingInstruction(r'(?<=executed cmd )(".+?")', "CMD"), +# ] + + +class LogMasker: + def __init__(self): + masking_instances = [] + self.masker = None + m = json.loads(config.get('DEFAULT', 'masking', fallback="[]")) + for i in range(len(m)): + logger.info("Adding custom mask {0} --> {1}".format(str(m[i]['mask_with']), str(m[i]['regex_pattern']))) + masking_instances.append(MaskingInstruction(m[i]['regex_pattern'], m[i]['mask_with'])) + self.masker = RegexMasker(masking_instances) + + def mask(self, content: str): + if self.masker is not None: + return self.masker.mask(content) + else: + return content diff --git a/logparsing/drain/drain3/persistence_handler.py b/logparsing/drain/drain3/persistence_handler.py new file mode 100644 index 0000000..a1e5076 --- /dev/null +++ b/logparsing/drain/drain3/persistence_handler.py @@ -0,0 +1,18 @@ +""" +Description : This file implements an abstract class for implementing a Drain3 persistence handler +Author : David Ohana +Author_email: david.ohana@ibm.com +License : MIT +""" +from abc import ABC, abstractmethod + + +class PersistenceHandler(ABC): + + @abstractmethod + def save_state(self, state): + pass + + @abstractmethod + def load_state(self): + pass diff --git a/logparsing/drain/drain3/template_miner.py b/logparsing/drain/drain3/template_miner.py new file mode 100644 index 0000000..aeb4b79 --- /dev/null +++ b/logparsing/drain/drain3/template_miner.py @@ -0,0 
+1,98 @@ +""" +Description : This file implements wrapper of the Drain core algorithm - add persistent and recovery +Author : David Ohana, Moshik Hershcovitch, Eran Raichstein +Author_email: david.ohana@ibm.com, moshikh@il.ibm.com, eranra@il.ibm.com +License : MIT +""" +import base64 +import configparser +import logging +import time +import zlib + +import jsonpickle + +from logparsing.drain.drain3.drain import Drain +from logparsing.drain.drain3.masking import LogMasker +from logparsing.drain.drain3.persistence_handler import PersistenceHandler + +logger = logging.getLogger(__name__) +config = configparser.ConfigParser() +config.read('drain3.ini') + + +class TemplateMiner: + + def __init__(self, persistence_handler: PersistenceHandler): + logger.info("Starting Drain3 template miner") + self.compress_state = config.get('DEFAULT', 'compress_state', fallback=True) + self.persistence_handler = persistence_handler + self.snapshot_interval_seconds = int(config.get('DEFAULT', 'snapshot_interval_minutes', fallback=5)) * 60 + self.drain = Drain(sim_th=float(config.get('DEFAULT', 'sim_th', fallback=0.4))) + self.masker = LogMasker() + self.last_save_time = time.time() + if persistence_handler is not None: + self.load_state() + + def load_state(self): + logger.info("Checking for saved state") + + state = self.persistence_handler.load_state() + if state is None: + logger.info("Saved state not found") + return + + if self.compress_state: + state = zlib.decompress(base64.b64decode(state)) + + drain: Drain = jsonpickle.loads(state) + + # After loading, the keys of "parser.root_node.key_to_child" are string instead of int, + # so we have to cast them to int + keys = [] + for i in drain.root_node.key_to_child_node.keys(): + keys.append(i) + for key in keys: + drain.root_node.key_to_child_node[int(key)] = drain.root_node.key_to_child_node.pop(key) + + self.drain = drain + logger.info("Restored {0} clusters with {1} messages".format( + len(drain.clusters), drain.get_total_cluster_size())) + + def save_state(self, snapshot_reason): + state = jsonpickle.dumps(self.drain).encode('utf-8') + if self.compress_state: + state = base64.b64encode(zlib.compress(state)) + + logger.info(f"Saving state of {len(self.drain.clusters)} clusters " + f"with {self.drain.get_total_cluster_size()} messages, {len(state)} bytes, " + f"reason: {snapshot_reason}") + self.persistence_handler.save_state(state) + + def get_snapshot_reason(self, change_type): + if change_type != "none": + return change_type + + diff_time_sec = time.time() - self.last_save_time + if diff_time_sec >= self.snapshot_interval_seconds: + return "periodic" + + return None + + def add_log_message(self, log_message: str): + masked_content = self.masker.mask(log_message) + cluster, change_type = self.drain.add_log_message(masked_content) + result = { + "change_type": change_type, + "cluster_id": cluster.cluster_id, + "cluster_size": cluster.size, + "template_mined": cluster.get_template(), + "cluster_count": len(self.drain.clusters) + } + + if self.persistence_handler is not None: + snapshot_reason = self.get_snapshot_reason(change_type) + if snapshot_reason: + self.save_state(snapshot_reason) + self.last_save_time = time.time() + return result diff --git a/logparsing/drain/examples/drain3.ini b/logparsing/drain/examples/drain3.ini new file mode 100644 index 0000000..8cd0ec8 --- /dev/null +++ b/logparsing/drain/examples/drain3.ini @@ -0,0 +1,14 @@ +[DEFAULT] +sim_th = 0.4 +snapshot_interval_minutes = 10 +snapshot_poll_timeout_sec = 60 +masking = [ + 
{"regex_pattern":"((?<=[^A-Za-z0-9])|^)(([0-9a-f]{2,}:){3,}([0-9a-f]{2,}))((?=[^A-Za-z0-9])|$)", "mask_with": "ID"}, + {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})((?=[^A-Za-z0-9])|$)", "mask_with": "IP"}, + {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([0-9a-f]{6,} ?){3,}((?=[^A-Za-z0-9])|$)", "mask_with": "SEQ"}, + {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([0-9A-F]{4} ?){4,}((?=[^A-Za-z0-9])|$)", "mask_with": "SEQ"}, + {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(0x[a-f0-9A-F]+)((?=[^A-Za-z0-9])|$)", "mask_with": "HEX"}, + {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "mask_with": "NUM"} + ] + + diff --git a/logparsing/drain/examples/drain_stdin_demo.py b/logparsing/drain/examples/drain_stdin_demo.py new file mode 100644 index 0000000..00b45de --- /dev/null +++ b/logparsing/drain/examples/drain_stdin_demo.py @@ -0,0 +1,36 @@ +""" +Description : Example of using Drain3 with Kafka persistence +Author : David Ohana, Moshik Hershcovitch, Eran Raichstein +Author_email: david.ohana@ibm.com, moshikh@il.ibm.com, eranra@il.ibm.com +License : MIT +""" +import configparser +import json +import logging +import sys +sys.path.append('../') + +from logparsing.drain.drain3.template_miner import TemplateMiner +from logparsing.drain.drain3.file_persistence import FilePersistence + +persistence_type = "FILE" + +config = configparser.ConfigParser() +config.read('drain3.ini') + +logger = logging.getLogger(__name__) +logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(message)s') +persistence = FilePersistence("drain3_state.bin") +template_miner = TemplateMiner(persistence) +print(f"Drain3 started with '{persistence_type}' persistence, reading from std-in (input 'q' to finish)") +while True: + log_line = input() + if log_line == 'q': + break + result = template_miner.add_log_message(log_line) + result_json = json.dumps(result) + print(result_json) + +print("Clusters:") +for cluster in template_miner.drain.clusters: + print(cluster) diff --git a/logparsing/drain/requirements.txt b/logparsing/drain/requirements.txt new file mode 100644 index 0000000..6fa3443 --- /dev/null +++ b/logparsing/drain/requirements.txt @@ -0,0 +1,5 @@ +jsonpickle==1.3 +kafka==1.3.5 + + + diff --git a/logparsing/drain/setup.cfg b/logparsing/drain/setup.cfg new file mode 100644 index 0000000..b88034e --- /dev/null +++ b/logparsing/drain/setup.cfg @@ -0,0 +1,2 @@ +[metadata] +description-file = README.md diff --git a/logparsing/drain/setup.py b/logparsing/drain/setup.py new file mode 100644 index 0000000..cfb897e --- /dev/null +++ b/logparsing/drain/setup.py @@ -0,0 +1,32 @@ +from setuptools import setup +from os import path + +this_directory = path.abspath(path.dirname(__file__)) +with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f: + long_description = f.read() + +setup( + name='drain3', + packages= ['drain3'], + version="0.7.2", + license='MIT', + description="persistent log parser", + long_description=long_description, + long_description_content_type="text/markdown", + author="IBM Research Haifa", + author_email="drain3@il.ibm.com", + url="https://github.com/IBM/Drain3", + download_url = 'https://github.com/IBM/Drain3/archive/v_01.tar.gz', + keywords = ['drain', 'log', 'parser', 'IBM'], + install_requires=[ + 'jsonpickle==1.3', + 'kafka==1.3.5' + ], + classifiers=[ + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Topic 
:: Software Development :: Libraries", + ], +) diff --git a/logparsing/fttree/__pycache__/__init__.cpython-36.pyc b/logparsing/fttree/__pycache__/__init__.cpython-36.pyc index aac5a61..43201da 100644 Binary files a/logparsing/fttree/__pycache__/__init__.cpython-36.pyc and b/logparsing/fttree/__pycache__/__init__.cpython-36.pyc differ diff --git a/logparsing/fttree/__pycache__/__init__.cpython-37.pyc b/logparsing/fttree/__pycache__/__init__.cpython-37.pyc index 3f45930..eedcba7 100644 Binary files a/logparsing/fttree/__pycache__/__init__.cpython-37.pyc and b/logparsing/fttree/__pycache__/__init__.cpython-37.pyc differ diff --git a/logparsing/fttree/__pycache__/fttree.cpython-36.pyc b/logparsing/fttree/__pycache__/fttree.cpython-36.pyc index bc40bad..2e2cbbf 100644 Binary files a/logparsing/fttree/__pycache__/fttree.cpython-36.pyc and b/logparsing/fttree/__pycache__/fttree.cpython-36.pyc differ diff --git a/logparsing/fttree/__pycache__/fttree.cpython-37.pyc b/logparsing/fttree/__pycache__/fttree.cpython-37.pyc index ab64d4f..693b5b2 100644 Binary files a/logparsing/fttree/__pycache__/fttree.cpython-37.pyc and b/logparsing/fttree/__pycache__/fttree.cpython-37.pyc differ diff --git a/logparsing/fttree/fttree.py b/logparsing/fttree/fttree.py index 2eac6ef..a842d32 100644 --- a/logparsing/fttree/fttree.py +++ b/logparsing/fttree/fttree.py @@ -23,10 +23,10 @@ def pattern_extract(log_file_dir, log_file_name, log_fttree_out_directory, detai if message_type_number == -1: log_type.append('NO_TYPE') log_type_index.append([]) - for i in range(1, len(log_list)): + for i in range(0, len(log_list)): log_type_index[0].append(i) else: - for i in range(1, len(log_list)): + for i in range(0, len(log_list)): log = [] for word in log_list[i].split(' '): log.append(word) @@ -79,7 +79,7 @@ def pattern_extract(log_file_dir, log_file_name, log_fttree_out_directory, detai FT_tree.append(log_type[i]) for j in range(0, len(log_type_index[i])): sub_word_support = {} - for word in log_message[log_type_index[i][j] - 1]: + for word in log_message[log_type_index[i][j]]: support = word_support[word] sub_word_support[word] = support sub_word_list = sorted(sub_word_support, key=sub_word_support.__getitem__, reverse=True) @@ -123,7 +123,7 @@ def pattern_extract(log_file_dir, log_file_name, log_fttree_out_directory, detai for i in range(0, len(log_type)): for j in range(0, len(log_type_index[i])): sub_word_support = {} - for word in log_message[log_type_index[i][j] - 1]: + for word in log_message[log_type_index[i][j]]: support = word_support[word] sub_word_support[word] = support sub_word_list = sorted(sub_word_support, key=sub_word_support.__getitem__, reverse=True) @@ -145,7 +145,7 @@ def pattern_extract(log_file_dir, log_file_name, log_fttree_out_directory, detai file_obj.write(pattern + '\n') file_obj.write(' '.join(str(x) for x in log_cluster[pattern])) i = i + 1 - # print(log_cluster) + if __name__ == '__main__': diff --git a/robust_anomaly_detection.py b/robust_anomaly_detection.py new file mode 100644 index 0000000..cd30373 --- /dev/null +++ b/robust_anomaly_detection.py @@ -0,0 +1,101 @@ +# -*- coding: UTF-8 -*- + +import os +from logparsing.fttree import fttree +from extractfeature import hdfs_robust_preprocessor +from anomalydetection.loganomaly import log_anomaly_sequential_train +from anomalydetection.loganomaly import log_anomaly_sequential_predict +from anomalydetection.robust import bi_lstm_att_train +from anomalydetection.robust import bi_lstm_att_predict +from logparsing.converter import eventid2number + +# 
parameters for early prepare +logparser_structed_file = './Data/logparser_result/Drain/HDFS.log_structured.csv' +logparser_event_file = './Data/logparser_result/Drain/HDFS.log_templates.csv' +anomaly_label_file = './Data/log/hdfs/anomaly_label.csv' +sequential_directory = './Data/DrainResult-HDFS/sequential_files/' +train_file_name = 'robust_train_file' +test_file_name = 'robust_test_file' +valid_file_name = 'robust_valid_file' +wordvec_file_path = './Data/pretrainedwordvec/crawl-300d-2M.vec(0.1M)' +pattern_vec_out_path = './Data/DrainResult-HDFS/pattern_vec' +variable_symbol = '<*> ' +'''log_file_dir = './Data/log/hdfs/' +log_file_name = 'HDFS_split' +log_fttree_out_directory = './Data/FTTreeResult-HDFS/clusters/' +# anomaly file name used which is also used in ./Data/log/file_split +anomaly_line_file = './Data/log/hdfs/HDFs_split_anomaly' +sequential_directory = './Data/FTTreeResult-HDFS/sequential_files/' + +pattern_vec_out_path = './Data/FTTreeResult-HDFS/pattern_vec''' + + + +# log anomaly sequential model parameters some parameter maybe changed to train similar models +sequence_length = 50 +input_size = 300 +hidden_size = 128 +num_of_layers = 2 +# 1 using sigmoid, 2 using softmax +num_of_classes = 1 +num_epochs = 200 +batch_size = 1000 +# for robust attention bi +train_root_path = './Data/DrainResult-HDFS/robust_att_bi_model_train/' +model_out_path = train_root_path + 'model_out/' +train_file = sequential_directory + train_file_name +pattern_vec_json = pattern_vec_out_path + + +# predict parameters +# log anomaly sequential model parameters + +'''if not os.path.exists(log_fttree_out_directory): + os.makedirs(log_fttree_out_directory)''' +if not os.path.exists(sequential_directory): + os.makedirs(sequential_directory) +if not os.path.exists(train_root_path): + os.makedirs(train_root_path) + + +'''def pattern_extract(): + fttree.pattern_extract(log_file_dir, log_file_name, log_fttree_out_directory, 5, 4, 2) + + 同时生成train file 和 test file好点 +def extract_feature(): + hdfs_ft_preprocessor.preprocessor_hdfs_ft(log_fttree_out_directory, anomaly_line_file, wordvec_file_path, sequential_directory, train_file_name, test_file_name, label_file_name, pattern_vec_out_path, split_degree, log_line_num) + + +def pattern_extract_test(): + fttree.pattern_extract(log_file_dir, log_file_name, log_fttree_out_directory, 5, 4, 2) + + +def extract_feature_test(): + hdfs_ft_preprocessor.preprocessor_hdfs_ft(log_fttree_out_directory, anomaly_line_file, wordvec_file_path, sequential_directory, 'train_file') +''' +def extract_feature(): + hdfs_robust_preprocessor.generate_train_and_test_file(logparser_structed_file, logparser_event_file, anomaly_label_file, sequential_directory, train_file_name, valid_file_name, test_file_name, wordvec_file_path, pattern_vec_out_path, variable_symbol) + + +def train_model(): + bi_lstm_att_train.train_model(sequence_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, model_out_path, train_file, pattern_vec_json) + + +def test_model(): + # do something + bi_lstm_att_predict.do_predict(input_size, hidden_size, num_of_layers, num_of_classes, sequence_length, model_out_path + 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) + '.pt', sequential_directory + test_file_name, batch_size, pattern_vec_json) + + +#eventid2number.add_numberid(logparser_event_file) +#pattern_extract() +#extract_feature() +#train_model() +test_model() + +# deep log +# log_preprocessor.execute_process() +# value_extract.get_value() +# 
value_extract.value_deal() +# value_extract.value_extract() +# train predict + diff --git a/self_att_lstm_anomaly_detection.py b/self_att_lstm_anomaly_detection.py new file mode 100644 index 0000000..2b47e4e --- /dev/null +++ b/self_att_lstm_anomaly_detection.py @@ -0,0 +1,142 @@ +# -*- coding: UTF-8 -*- +from extractfeature.k8s import log_preprocessor +from extractfeature.k8s import value_extract +import os +import torch +from torch.utils.data import TensorDataset, DataLoader +from logparsing.fttree import fttree +from extractfeature import hdfs_ft_preprocessor +from anomalydetection.self_att_lstm import self_att_lstm_train +from anomalydetection.self_att_lstm import self_att_lstm_predict + +# parameters for early prepare +log_file_dir = './Data/log/hdfs/' +# log file name used which is also used in ./Data/log/file_split +log_file_name = 'HDFS_split_40w' +log_fttree_out_directory = './Data/FTTreeResult-HDFS/clusters/' +# anomaly file name used which is also used in ./Data/log/file_split +anomaly_line_file = './Data/log/hdfs/HDFs_split_anomaly_40w' +wordvec_file_path = './Data/pretrainedwordvec/crawl-300d-2M.vec(0.1M)' +sequential_directory = './Data/FTTreeResult-HDFS/sequential_files/' +train_file_name = 'train_file' +test_file_name = 'test_file' +label_file_name = 'label_file' +pattern_vec_out_path = './Data/FTTreeResult-HDFS/pattern_vec' +split_degree = 0.9 +# log file line used which is also used in ./Data/log/file_split +log_line_num = 400000 + +# bi lstm only model parameters +window_length = 20 +input_size = 300 +hidden_size = 128 +num_of_layers = 2 +num_of_classes = 26 +num_epochs = 10 +batch_size = 1000 +# for self att lstm +train_root_path = './Data/FTTreeResult-HDFS/self_att_lstm_model_train/' +model_out_path = train_root_path + 'sa_lstm_model_out/' +data_file = sequential_directory + train_file_name +pattern_vec_file = pattern_vec_out_path + +# predict parameters +num_of_candidates = 8 +# log anomaly sequential model parameters + +if not os.path.exists(log_fttree_out_directory): + os.makedirs(log_fttree_out_directory) +if not os.path.exists(sequential_directory): + os.makedirs(sequential_directory) +if not os.path.exists(train_root_path): + os.makedirs(train_root_path) + + +def pattern_extract(): + fttree.pattern_extract(log_file_dir, log_file_name, log_fttree_out_directory, 5, 4, 2) + +# 同时生成train file 和 test file好点 +def extract_feature(): + hdfs_ft_preprocessor.preprocessor_hdfs_ft(log_fttree_out_directory, anomaly_line_file, wordvec_file_path, sequential_directory, train_file_name, test_file_name, label_file_name, pattern_vec_out_path, split_degree, log_line_num) + +def extract_feature_spilt_abnormal(): + hdfs_ft_preprocessor.preprocessor_hdfs_ft_split_abnormal(log_fttree_out_directory, anomaly_line_file, wordvec_file_path, sequential_directory, train_file_name, test_file_name, label_file_name, pattern_vec_out_path, split_degree, log_line_num) + + +def pattern_extract_test(): + fttree.pattern_extract(log_file_dir, log_file_name, log_fttree_out_directory, 5, 4, 2) + + +def extract_feature_test(): + hdfs_ft_preprocessor.preprocessor_hdfs_ft(log_fttree_out_directory, anomaly_line_file, wordvec_file_path, sequential_directory, 'train_file') + + +def train_model(): + #log_anomaly_sequential_train.train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, model_out_path, data_file, pattern_vec_file) + self_att_lstm_train.train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, 
batch_size, train_root_path, model_out_path, data_file, pattern_vec_file) + + +def test_model(): + # do something + #log_anomaly_sequential_predict.do_predict(input_size, hidden_size, num_of_layers, num_of_classes, window_length, model_out_path + 'Adam_batch_size=200;epoch=200.pt', sequential_directory + label_file_name, sequential_directory + test_file_name, 3, pattern_vec_file) + self_att_lstm_predict.do_predict(input_size, hidden_size, num_of_layers, num_of_classes, window_length, model_out_path + 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) + '.pt', sequential_directory + label_file_name, sequential_directory + test_file_name, num_of_candidates, pattern_vec_file) + +def generate_seq_label(file_path, window_length, pattern_vec_file): + vec_to_class_type = {} + with open(pattern_vec_file, 'r') as pattern_file: + i = 0 + for line in pattern_file.readlines(): + pattern, vec = line.split('[:]') + pattern_vector = tuple(map(float, vec.strip().split(' '))) + vec_to_class_type[pattern_vector] = i + i = i + 1 + num_of_sessions = 0 + input_data, output_data = [], [] + with open(file_path, 'r') as file: + for line in file.readlines(): + num_of_sessions += 1 + line = tuple( + map(lambda n: tuple(map(float, n.strip().split())), [x for x in line.strip().split(',') if len(x) > 0])) + if len(line) < window_length: + continue + for i in range(len(line) - window_length): + label_line = [] + for j in range(window_length): + label_line.append(vec_to_class_type[line[i+j]]) + label_line.append(vec_to_class_type[line[i + window_length]]) + input_data.append(label_line) + return input_data + + +def get_label_sequentials(sequential_out_file): + vec_to_class_type = {} + with open(pattern_vec_file, 'r') as pattern_file: + i = 0 + for line in pattern_file.readlines(): + pattern, vec = line.split('[:]') + pattern_vector = tuple(map(float, vec.strip().split(' '))) + vec_to_class_type[pattern_vector] = i + i = i + 1 + with open(sequential_out_file, 'w+') as file: + sequence_data_set = generate_seq_label(data_file, window_length, pattern_vec_file) + for line in sequence_data_set: + for label in line: + file.write(str(label)) + file.write(',') + file.write('\n') + + +#pattern_extract() +#extract_feature_spilt_abnormal() +#train_model() +#get_label_sequentials('./Data/FTTreeResult-HDFS/pattern_sequntials') +test_model() + +# deep log +# log_preprocessor.execute_process() +# value_extract.get_value() +# value_extract.value_deal() +# value_extract.value_extract() +# train predict + +# -*- coding: UTF-8 -*- \ No newline at end of file diff --git a/self_att_supervised_detection.py b/self_att_supervised_detection.py new file mode 100644 index 0000000..df27674 --- /dev/null +++ b/self_att_supervised_detection.py @@ -0,0 +1,62 @@ +# -*- coding: UTF-8 -*- +# -*- coding: UTF-8 -*- + +import os +from logparsing.fttree import fttree +from extractfeature import hdfs_ft_preprocessor +from anomalydetection.loganomaly import log_anomaly_sequential_train +from anomalydetection.loganomaly import log_anomaly_sequential_predict +from anomalydetection.self_att_lstm_supervised import self_att_lstm_supervised_train +from anomalydetection.self_att_lstm_supervised import self_att_lstm_supervised_predict + +# parameters for early prepare + +temp_directory = './Data/logdeepdata/' +train_file_name = 'robust_log_train.csv' +test_file_name = 'robust_log_test.csv' +valid_file_name = 'robust_log_valid.csv' + +# log anomaly sequential model parameters some parameter maybe changed to train similar models +sequence_length = 50 
+input_size = 300 +hidden_size = 128 +num_of_layers = 2 +# 1 using sigmoid, 2 using softmax +num_of_classes = 1 +num_epochs = 20 +batch_size = 1000 +# for robust attention bi +train_root_path = './Data/FTTreeResult-HDFS/self_att_supervised_model_train/' +model_out_path = train_root_path + 'model_out/' +train_file = temp_directory + train_file_name +pattern_vec_json = './Data/logdeepdata/event2semantic_vec.json' + + +# predict parameters +# log anomaly sequential model parameters + +if not os.path.exists(train_root_path): + os.makedirs(train_root_path) + + +def train_model(): + self_att_lstm_supervised_train.train_model(sequence_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, model_out_path, train_file, pattern_vec_json) + + +def test_model(): + # do something + self_att_lstm_supervised_predict.do_predict(input_size, hidden_size, num_of_layers, num_of_classes, sequence_length, model_out_path + 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) + '.pt', temp_directory + test_file_name, batch_size, pattern_vec_json) + +#pattern_extract() +#extract_feature() +#train_model() +#train_model() +test_model() + +# deep log +# log_preprocessor.execute_process() +# value_extract.get_value() +# value_extract.value_deal() +# value_extract.value_extract() +# train predict +
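
As the README above notes, persistence can be extended to other media by inheriting `PersistenceHandler`. The sketch below is not part of this change set; it assumes the `logparsing.drain.drain3` package layout introduced in this diff and is run from the repository root. The `MemoryPersistence` class name and the sample log lines are illustrative only.

```
# A minimal, hypothetical in-memory persistence handler for Drain3.
# It follows the PersistenceHandler contract added in this diff:
# save_state() receives the serialized snapshot, load_state() returns
# the last saved snapshot, or None when nothing has been saved yet.
from logparsing.drain.drain3.persistence_handler import PersistenceHandler
from logparsing.drain.drain3.template_miner import TemplateMiner


class MemoryPersistence(PersistenceHandler):
    def __init__(self):
        self.state = None

    def save_state(self, state):
        # 'state' is the (zlib-compressed, base64-encoded) jsonpickle dump
        # produced by TemplateMiner.save_state()
        self.state = state

    def load_state(self):
        return self.state


if __name__ == '__main__':
    miner = TemplateMiner(MemoryPersistence())
    for line in ("connected to 10.0.0.1", "connected to 10.0.0.2"):
        print(miner.add_log_message(line))
```

Passing the handler to `TemplateMiner` is all that is needed: per `template_miner.py` above, snapshots are taken whenever a cluster is created or its template changes, and otherwise on the periodic interval configured in `drain3.ini`.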
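
The masking entries in `examples/drain3.ini` map one-to-one onto the `MaskingInstruction` objects that `LogMasker` builds at startup. For completeness, a small standalone sketch (again not part of this change set) showing the masking classes applied directly; the two regexes are copied from the example config, and the sample line and expected output are illustrative.

```
# Hypothetical standalone use of the masking classes added in this diff.
from logparsing.drain.drain3.masking import MaskingInstruction, RegexMasker

masker = RegexMasker([
    MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})((?=[^A-Za-z0-9])|$)', "IP"),
    MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)([\-\+]?\d+)((?=[^A-Za-z0-9])|$)', "NUM"),
])

# Should print something like: connected to <IP> on port <NUM>
print(masker.mask("connected to 10.0.0.1 on port 8080"))
```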