From c8f2227cdb9b627c7c2243ca7382e7e98741450e Mon Sep 17 00:00:00 2001 From: cainiao66 <17717084193@163.com> Date: Thu, 30 Apr 2020 09:51:14 +0800 Subject: [PATCH 1/4] add loganomaly_quantitive_train/predict --- .../log_anomaly_quantitive_predict.py | 122 ++++++++++++++++++ .../log_anomaly_quantitive_train.py | 99 ++++++++++++++ 2 files changed, 221 insertions(+) create mode 100644 anomalydetection/loganomaly/log_anomaly_quantitive_predict.py create mode 100644 anomalydetection/loganomaly/log_anomaly_quantitive_train.py diff --git a/anomalydetection/loganomaly/log_anomaly_quantitive_predict.py b/anomalydetection/loganomaly/log_anomaly_quantitive_predict.py new file mode 100644 index 0000000..5aad10d --- /dev/null +++ b/anomalydetection/loganomaly/log_anomaly_quantitive_predict.py @@ -0,0 +1,122 @@ +import torch +import os +import torch.nn as nn +import time +import numpy as np +from anomalydetection.loganomaly.log_anomaly_quantitive_train import Model +from anomalydetection.loganomaly.log_anomaly_quantitive_train import train_model + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +def generate_test_label(logkey_path, window_length): + f = open(logkey_path,'r') + keys = f.readline().split() + keys = list(map(int, keys)) + print(keys) + length = len(keys) + input = np.zeros((length -window_length,num_of_classes)) + output = np.zeros(length -window_length,dtype=np.int) + for i in range(0,length -window_length): + for j in range(i,i+window_length): + input[i][keys[j]-1] += 1 + output[i] = keys[i+window_length]-1 + new_input = np.zeros((length -2*window_length+1,window_length,num_of_classes)) + for i in range(0,length -2*window_length+1): + for j in range(i,i+window_length): + new_input[i][j-i] = input[j] + new_output = output[window_length-1:] + print(new_input.shape) + print(new_output.shape) + print(new_input[0]) + print(new_output[0]) + return length,new_input,new_output + +def load_quantitive_model(input_size, hidden_size, num_layers, num_classes, model_path): + model2 = Model(input_size, hidden_size, num_layers, num_classes).to(device) + model2.load_state_dict(torch.load(model_path, map_location='cpu')) + model2.eval() + print('model_path: {}'.format(model_path)) + return model2 + +def do_predict(input_size, hidden_size, num_layers, num_classes, window_length, model_path, anomaly_test_line_path, num_candidates, logkey_path): + quantitive_model = load_quantitive_model(input_size, hidden_size, num_layers, num_classes, model_path) + start_time = time.time() + TP = 0 + FP = 0 + TN = 0 + FN = 0 + ALL = 0 + length,input,output = generate_test_label(logkey_path, window_length) + abnormal_label = [] + with open(anomaly_test_line_path) as f: + abnormal_label = [int(x) for x in f.readline().strip().split()] + print('predict start') + with torch.no_grad(): + count_num = 0 + current_file_line = 0 + for i in range(0,length-2*window_length+1): + lineNum = i + 2*window_length + quan = input[i] + label = output[i] + quan = torch.tensor(quan, dtype=torch.float).view(-1, window_length, input_size).to(device) + test_output = quantitive_model(quan) + predicted = torch.argsort(test_output , 1)[0][-num_candidates:] + print('{} - predict result: {}, true label: {}'.format(lineNum, predicted,label)) + if lineNum in abnormal_label: ## 若出现异常日志,则接下来的预测跳过异常日志,保证进行预测的日志均为正常日志 + i += 2*window_length + 1 + else: + i += 1 + ALL += 1 + if label not in predicted: + if lineNum in abnormal_label: + TN += 1 + else: + FN += 1 + else: + if lineNum in abnormal_label: + FP += 1 + else: + TP += 1 + # 
Compute precision, recall and F1-measure + if TP + FP == 0: + P = 0 + else: + P = 100 * TP / (TP + FP) + + if TP + FN == 0: + R = 0 + else: + R = 100 * TP / (TP + FN) + + if P + R == 0: + F1 = 0 + else: + F1 = 2 * P * R / (P + R) + + Acc = (TP + TN) * 100 / ALL + print('FP: {}, FN: {}, TP: {}, TN: {}'.format(FP, FN, TP, TN)) + print('Acc: {:.3f}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(Acc, P, R, F1)) + print('Finished Predicting') + elapsed_time = time.time() - start_time + print('elapsed_time: {}'.format(elapsed_time)) + +input_size = 61 +hidden_size = 30 +num_of_layers = 2 +num_of_classes = 61 +num_epochs = 100 +batch_size = 200 +window_length = 5 +train_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_train' +test_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_test' +train_root_path = '../../Data/FTTreeResult-HDFS/model_train/' +label_file_name = '../../Data/FTTreeResult-HDFS/deeplog_files/HDFS_abnormal_label.txt' +model_out_path = train_root_path + 'quantitive_model_out/' + +# train_model(window_length, input_size, hidden_size, +# num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, +# model_out_path,train_logkey_path) + +do_predict(input_size, hidden_size, num_of_layers, num_of_classes, window_length, + model_out_path + 'Adam_batch_size=200;epoch=100.pt', label_file_name, 3, test_logkey_path) + diff --git a/anomalydetection/loganomaly/log_anomaly_quantitive_train.py b/anomalydetection/loganomaly/log_anomaly_quantitive_train.py new file mode 100644 index 0000000..e81b506 --- /dev/null +++ b/anomalydetection/loganomaly/log_anomaly_quantitive_train.py @@ -0,0 +1,99 @@ +import torch +import torch.nn as nn +import torch.optim as optim +from tensorboardX import SummaryWriter +from torch.utils.data import TensorDataset, DataLoader +import numpy as np +import argparse +import os + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +class Model(nn.Module): + def __init__(self, input_size, hidden_size, num_of_layers, out_size): + super(Model, self).__init__() + self.hidden_size = hidden_size + self.num_of_layers = num_of_layers + self.lstm = nn.LSTM(input_size, hidden_size, num_of_layers, batch_first=True) + self.fc = nn.Linear(hidden_size, out_size) + + def init_hidden(self, size): + h0 = torch.zeros(self.num_of_layers, size, self.hidden_size).to(device) + c0 = torch.zeros(self.num_of_layers, size, self.hidden_size).to(device) + return (h0, c0) + + def forward(self, input): + out, _ = self.lstm(input, self.init_hidden(input.size(0))) + out = self.fc(out[:, -1, :]) + return out + +def generate_quantitive_label(logkey_path, window_length,num_of_classes): + f = open(logkey_path,'r') + keys = f.readline().split() + keys = list(map(int, keys)) + print(keys) + length = len(keys) + input = np.zeros((length -window_length,num_of_classes)) + output = np.zeros(length -window_length,dtype=np.int) + for i in range(0,length -window_length): + for j in range(i,i+window_length): + input[i][keys[j]-1] += 1 + output[i] = keys[i+window_length]-1 + new_input = np.zeros((length -2*window_length+1,window_length,num_of_classes)) + for i in range(0,length -2*window_length+1): + for j in range(i,i+window_length): + new_input[i][j-i] = input[j] + new_output = output[window_length-1:] + print(new_input.shape) + print(new_output.shape) + print(new_input[0]) + print(new_output[0]) + dataset = TensorDataset(torch.tensor(new_input,dtype=torch.float),torch.tensor(new_output,dtype=torch.long)) + return dataset + 
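+# The dataset built above is the "quantitative" view of the log-key stream:
+# each position gets a count vector over the num_of_classes key types for the
+# preceding window, and window_length consecutive count vectors form one LSTM
+# sample whose target is the (0-based) id of the key that follows the window.
+# Illustrative example with hypothetical values (not taken from the data):
+#   window_length = 3, num_of_classes = 4, keys = [1, 3, 1, 2, ...]
+#   input[0]  = counts of keys[0:3] = [2, 0, 1, 0]
+#   output[0] = keys[3] - 1         = 1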
+def train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, root_path, model_output_directory,logkey_path): + # log setting + log_directory = root_path + 'quantitive_log_out/' + log_template = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) + + model = Model(input_size, hidden_size, num_of_layers, num_of_classes).to(device) + # create data set + quantitive_data_set = generate_quantitive_label(logkey_path, window_length,num_of_classes) + # create data_loader + data_loader = DataLoader(dataset=quantitive_data_set, batch_size=batch_size, shuffle=True, pin_memory=False) + writer = SummaryWriter(logdir=log_directory + log_template) + + # Loss and optimizer classify job + criterion = nn.CrossEntropyLoss() + optimizer = optim.Adam(model.parameters()) + + # Training + for epoch in range(num_epochs): + train_loss = 0 + for step, (quan, label) in enumerate(data_loader): + quan = quan.clone().detach().view(-1, window_length, input_size).to(device) + output = model(quan) + + loss = criterion(output, label.to(device)) + + # Backward and optimize + optimizer.zero_grad() + loss.backward() + train_loss += loss.item() + optimizer.step() + print('Epoch [{}/{}], training_loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss / len(data_loader.dataset))) + if (epoch + 1) % 100 == 0: + if not os.path.isdir(model_output_directory): + os.makedirs(model_output_directory) + e_log = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(epoch+1) + torch.save(model.state_dict(), model_output_directory + '/' + e_log + '.pt') + writer.close() + print('Training finished') + + + + + + + + From 68408608baf203735c5486bde01c73926c7a8638 Mon Sep 17 00:00:00 2001 From: cainiao66 <17717084193@163.com> Date: Thu, 7 May 2020 12:40:03 +0800 Subject: [PATCH 2/4] fix log_anomaly --- .../__pycache__/__init__.cpython-37.pyc | Bin 0 -> 198 bytes ...og_anomaly_quantitive_train.cpython-37.pyc | Bin 0 -> 3412 bytes ..._anomaly_sequential_predict.cpython-37.pyc | Bin 0 -> 3287 bytes ...og_anomaly_sequential_train.cpython-37.pyc | Bin 0 -> 3832 bytes .../log_anomaly_train.cpython-37.pyc | Bin 0 -> 4309 bytes .../loganomaly/log_anomaly_predict.py | 122 ++++++++++++++++++ .../log_anomaly_quantitive_predict.py | 6 +- .../log_anomaly_quantitive_train.py | 4 +- .../log_anomaly_sequential_predict.py | 3 + .../log_anomaly_sequential_train.py | 2 + .../loganomaly/log_anomaly_train.py | 121 +++++++++++++++++ 11 files changed, 252 insertions(+), 6 deletions(-) create mode 100644 anomalydetection/loganomaly/__pycache__/__init__.cpython-37.pyc create mode 100644 anomalydetection/loganomaly/__pycache__/log_anomaly_quantitive_train.cpython-37.pyc create mode 100644 anomalydetection/loganomaly/__pycache__/log_anomaly_sequential_predict.cpython-37.pyc create mode 100644 anomalydetection/loganomaly/__pycache__/log_anomaly_sequential_train.cpython-37.pyc create mode 100644 anomalydetection/loganomaly/__pycache__/log_anomaly_train.cpython-37.pyc create mode 100644 anomalydetection/loganomaly/log_anomaly_predict.py create mode 100644 anomalydetection/loganomaly/log_anomaly_train.py diff --git a/anomalydetection/loganomaly/__pycache__/__init__.cpython-37.pyc b/anomalydetection/loganomaly/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..17b247bb22f2c009cd882030eab5a6488a96c89d GIT binary patch literal 198 zcmZ?b<>g`kg1wVk<3RLd5CH>>K!yVl7qb9~6oz01O-8?!3`HPe1o1To$aS@fdAelz 
[GIT binary patch payloads omitted: base85 blobs for the committed __pycache__ bytecode (__init__, log_anomaly_quantitive_train, log_anomaly_sequential_predict, log_anomaly_sequential_train, log_anomaly_train .cpython-37.pyc)]

diff --git a/anomalydetection/loganomaly/log_anomaly_predict.py b/anomalydetection/loganomaly/log_anomaly_predict.py
new file mode 100644
index 0000000..106120f
--- /dev/null
+++ b/anomalydetection/loganomaly/log_anomaly_predict.py
@@ -0,0 +1,122 @@
+import torch
+import os
+import torch.nn as nn
+import time
+import numpy as np
+from anomalydetection.loganomaly.log_anomaly_train import Model
+from anomalydetection.loganomaly.log_anomaly_train import train_model
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+def generate_test_label(logkey_path, window_length,num_of_classes):
+    f = open(logkey_path,'r')
+    keys = f.readline().split()
+    keys = list(map(int, keys))
+    print(keys)
+    length = len(keys)
+    input_1 = np.zeros((length -window_length,1))
+    output_1 = np.zeros(length -window_length,dtype=np.int)
+    input_2 = np.zeros((length -window_length,num_of_classes))
+    output = np.zeros(length -window_length,dtype=np.int)
+    for i in range(0,length -window_length):
+        for j in range(i,i+window_length):
+            input_1[i][0] = keys[j]
+            input_2[i][keys[j]-1] += 1
+        output[i] = keys[i+window_length]-1
+    new_input_1 = np.zeros((length -2*window_length+1,window_length,1))
+    new_input_2 = np.zeros((length - 2 * window_length + 1, window_length, num_of_classes))
+    for i in range(0,length -2*window_length+1):
+        for j in range(i,i+window_length):
+            new_input_1[i][j - i] = input_1[j]
+            new_input_2[i][j-i] = input_2[j]
+    new_output = output[window_length-1:]
+    return length,new_input_1,new_input_2,new_output
+
+def load_model(input_size_1,input_size_2, hidden_size, num_layers, num_classes, model_path):
+    model = Model(input_size_1,input_size_2,hidden_size, num_layers,
num_classes).to(device) + model.load_state_dict(torch.load(model_path, map_location='cpu')) + model.eval() + print('model_path: {}'.format(model_path)) + return model + +def do_predict(input_size_1,input_size_2, hidden_size, num_layers, num_classes, window_length, model_path, anomaly_test_line_path, num_candidates, logkey_path): + model = load_model(input_size_1,input_size_2 ,hidden_size, num_layers, num_classes, model_path) + start_time = time.time() + TP = 0 + FP = 0 + TN = 0 + FN = 0 + ALL = 0 + length,input_1,input_2,output = generate_test_label(logkey_path, window_length,num_classes) + abnormal_label = [] + with open(anomaly_test_line_path) as f: + abnormal_label = [int(x) for x in f.readline().strip().split()] + print('predict start') + with torch.no_grad(): + count_num = 0 + current_file_line = 0 + for i in range(0,length-2*window_length+1): + lineNum = i + 2*window_length + seq = input_1[i] + quan = input_2[i] + label = output[i] + seq = torch.tensor(seq, dtype=torch.float).view(-1, window_length, input_size_1).to(device) + quan = torch.tensor(quan, dtype=torch.float).view(-1, window_length, input_size_2).to(device) + test_output = model(seq,quan) + predicted = torch.argsort(test_output , 1)[0][-num_candidates:] + print('{} - predict result: {}, true label: {}'.format(lineNum, predicted,label)) + if lineNum in abnormal_label: ## 若出现异常日志,则接下来的预测跳过异常日志,保证进行预测的日志均为正常日志 + i += 2*window_length + 1 + else: + i += 1 + ALL += 1 + if label not in predicted: + if lineNum in abnormal_label: + TP += 1 + else: + FP += 1 + else: + if lineNum in abnormal_label: + FN += 1 + else: + TN += 1 + # Compute precision, recall and F1-measure + if TP + FP == 0: + P = 0 + else: + P = 100 * TP / (TP + FP) + + if TP + FN == 0: + R = 0 + else: + R = 100 * TP / (TP + FN) + + if P + R == 0: + F1 = 0 + else: + F1 = 2 * P * R / (P + R) + + Acc = (TP + TN) * 100 / ALL + print('FP: {}, FN: {}, TP: {}, TN: {}'.format(FP, FN, TP, TN)) + print('Acc: {:.3f}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(Acc, P, R, F1)) + print('Finished Predicting') + elapsed_time = time.time() - start_time + print('elapsed_time: {}'.format(elapsed_time)) + +if __name__=='__main__': + input_size_1 = 1 + input_size_2 = 61 + hidden_size = 30 + num_of_layers = 2 + num_of_classes = 61 + num_epochs = 100 + batch_size = 200 + window_length = 5 + train_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_train' + test_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_test' + train_root_path = '../../Data/FTTreeResult-HDFS/model_train/' + label_file_name = '../../Data/FTTreeResult-HDFS/deeplog_files/HDFS_abnormal_label.txt' + model_out_path = train_root_path + 'model_out/' + + do_predict(input_size_1,input_size_2, hidden_size, num_of_layers, num_of_classes, window_length, + model_out_path + 'Adam_batch_size=200;epoch=100.pt', label_file_name, 9, test_logkey_path) \ No newline at end of file diff --git a/anomalydetection/loganomaly/log_anomaly_quantitive_predict.py b/anomalydetection/loganomaly/log_anomaly_quantitive_predict.py index 5aad10d..2aa0d55 100644 --- a/anomalydetection/loganomaly/log_anomaly_quantitive_predict.py +++ b/anomalydetection/loganomaly/log_anomaly_quantitive_predict.py @@ -113,9 +113,9 @@ def do_predict(input_size, hidden_size, num_layers, num_classes, window_length, label_file_name = '../../Data/FTTreeResult-HDFS/deeplog_files/HDFS_abnormal_label.txt' model_out_path = train_root_path + 'quantitive_model_out/' -# train_model(window_length, input_size, hidden_size, 
-# num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, -# model_out_path,train_logkey_path) +train_model(window_length, input_size, hidden_size, + num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, + model_out_path,train_logkey_path) do_predict(input_size, hidden_size, num_of_layers, num_of_classes, window_length, model_out_path + 'Adam_batch_size=200;epoch=100.pt', label_file_name, 3, test_logkey_path) diff --git a/anomalydetection/loganomaly/log_anomaly_quantitive_train.py b/anomalydetection/loganomaly/log_anomaly_quantitive_train.py index e81b506..4426687 100644 --- a/anomalydetection/loganomaly/log_anomaly_quantitive_train.py +++ b/anomalydetection/loganomaly/log_anomaly_quantitive_train.py @@ -44,11 +44,9 @@ def generate_quantitive_label(logkey_path, window_length,num_of_classes): for j in range(i,i+window_length): new_input[i][j-i] = input[j] new_output = output[window_length-1:] + dataset = TensorDataset(torch.tensor(new_input,dtype=torch.float),torch.tensor(new_output,dtype=torch.long)) print(new_input.shape) print(new_output.shape) - print(new_input[0]) - print(new_output[0]) - dataset = TensorDataset(torch.tensor(new_input,dtype=torch.float),torch.tensor(new_output,dtype=torch.long)) return dataset def train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, root_path, model_output_directory,logkey_path): diff --git a/anomalydetection/loganomaly/log_anomaly_sequential_predict.py b/anomalydetection/loganomaly/log_anomaly_sequential_predict.py index 61c0f64..ee13038 100644 --- a/anomalydetection/loganomaly/log_anomaly_sequential_predict.py +++ b/anomalydetection/loganomaly/log_anomaly_sequential_predict.py @@ -65,9 +65,12 @@ def do_predict(input_size, hidden_size, num_layers, num_classes, window_length, count_num += 1 seq = line[i:i + window_length] label = line[i + window_length] + print(label) seq = torch.tensor(seq, dtype=torch.float).view(-1, window_length, input_size).to(device) + print(seq.shape) #label = torch.tensor(label).view(-1).to(device) output = sequential_model(seq) + print(output) predicted = torch.argsort(output, 1)[0][-num_candidates:] print('{} - predict result: {}, true label: {}'.format(count_num, predicted, vec_to_class_type[tuple(label)])) if lineNum in abnormal_label: ## 若出现异常日志,则接下来的预测跳过异常日志,保证进行预测的日志均为正常日志 diff --git a/anomalydetection/loganomaly/log_anomaly_sequential_train.py b/anomalydetection/loganomaly/log_anomaly_sequential_train.py index b27c607..77e32de 100644 --- a/anomalydetection/loganomaly/log_anomaly_sequential_train.py +++ b/anomalydetection/loganomaly/log_anomaly_sequential_train.py @@ -34,6 +34,7 @@ def generate_seq_label(file_path, window_length, pattern_vec_file): # line[i] is a list need to read file form a dic{vec:log_key} to get log key output_data.append(vec_to_class_type[line[i + window_length]]) data_set = TensorDataset(torch.tensor(input_data, dtype=torch.float), torch.tensor(output_data)) + print(torch.tensor(input_data).shape) return data_set @@ -85,6 +86,7 @@ def __init__(self, input_size, hidden_size, num_of_layers, out_size): self.num_of_layers = num_of_layers self.lstm = nn.LSTM(input_size, hidden_size, num_of_layers, batch_first=True) self.fc = nn.Linear(hidden_size, out_size) + # self.out = nn.Linear(in_features=in_features, out_features=out_features) def init_hidden(self, size): diff --git a/anomalydetection/loganomaly/log_anomaly_train.py b/anomalydetection/loganomaly/log_anomaly_train.py new file mode 100644 index 0000000..9202c1b 
--- /dev/null +++ b/anomalydetection/loganomaly/log_anomaly_train.py @@ -0,0 +1,121 @@ +import torch +import torch.nn as nn +import torch.optim as optim +from tensorboardX import SummaryWriter +from torch.utils.data import TensorDataset, DataLoader +import numpy as np +import argparse +import os +from . import * + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def generate_label(logkey_path, window_length,num_of_classes): + f = open(logkey_path,'r') + keys = f.readline().split() + keys = list(map(int, keys)) + print(keys) + length = len(keys) + input_1 = np.zeros((length -window_length,1)) + output_1 = np.zeros(length -window_length,dtype=np.int) + input_2 = np.zeros((length -window_length,num_of_classes)) + output = np.zeros(length -window_length,dtype=np.int) + for i in range(0,length -window_length): + for j in range(i,i+window_length): + input_1[i][0] = keys[j] + input_2[i][keys[j]-1] += 1 + output[i] = keys[i+window_length]-1 + new_input_1 = np.zeros((length -2*window_length+1,window_length,1)) + new_input_2 = np.zeros((length - 2 * window_length + 1, window_length, num_of_classes)) + for i in range(0,length -2*window_length+1): + for j in range(i,i+window_length): + new_input_1[i][j - i] = input_1[j] + new_input_2[i][j-i] = input_2[j] + new_output = output[window_length-1:] + print(new_input_1.shape) + print(new_input_2.shape) + print(new_output.shape) + dataset = TensorDataset(torch.tensor(new_input_1,dtype=torch.float), + torch.tensor(new_input_2,dtype=torch.float),torch.tensor(new_output,dtype=torch.long)) + return dataset + +class Model(nn.Module): + def __init__(self, input_size_0,input_size_1, hidden_size, num_of_layers, out_size): + super(Model, self).__init__() + self.hidden_size = hidden_size + self.num_of_layers = num_of_layers + self.lstm0 = nn.LSTM(input_size_0, hidden_size, num_of_layers, batch_first=True) + self.lstm1 = nn.LSTM(input_size_1, hidden_size, num_of_layers, batch_first=True) + self.fc = nn.Linear(2*hidden_size, out_size) + + + def forward(self, input_0,input_1): + h0_0 = torch.zeros(self.num_of_layers, input_0.size(0), self.hidden_size).to(device) + c0_0 = torch.zeros(self.num_of_layers, input_0.size(0), self.hidden_size).to(device) + out_0, _ = self.lstm0(input_0, (h0_0, c0_0)) + h0_1 = torch.zeros(self.num_of_layers, input_1.size(0), self.hidden_size).to(device) + c0_1 = torch.zeros(self.num_of_layers, input_1.size(0), self.hidden_size).to(device) + out_1, _ = self.lstm1(input_1, (h0_1, c0_1)) + multi_out = torch.cat((out_0[:, -1, :], out_1[:, -1, :]), -1) + out = self.fc(multi_out) + return out + +def train_model(window_length, input_size_0,input_size_1, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, root_path, model_output_directory,logkey_path): + # log setting + log_directory = root_path + 'log_out/' + log_template = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) + + print("Train num_classes: ", num_of_classes) + model = Model(input_size_0,input_size_1, hidden_size, num_of_layers, num_of_classes).to(device) + # create data set + data_set = generate_label(logkey_path, window_length,num_of_classes) + # create data_loader + data_loader = DataLoader(dataset=data_set, batch_size=batch_size, shuffle=True, pin_memory=False) + writer = SummaryWriter(logdir=log_directory + log_template) + + # Loss and optimizer classify job + criterion = nn.CrossEntropyLoss() + optimizer = optim.Adam(model.parameters()) + + # Training + for epoch in range(num_epochs): + train_loss = 0 + for step, (seq, quan, 
label) in enumerate(data_loader): + seq = seq.clone().detach().view(-1, window_length, input_size_0).to(device) + quan = quan.clone().detach().view(-1, window_length, input_size_1).to(device) + output = model(seq,quan) + + loss = criterion(output, label.to(device)) + + # Backward and optimize + optimizer.zero_grad() + loss.backward() + train_loss += loss.item() + optimizer.step() + print('Epoch [{}/{}], training_loss: {:.6f}'.format(epoch + 1, num_epochs, train_loss / len(data_loader.dataset))) + if (epoch + 1) % 100 == 0: + if not os.path.isdir(model_output_directory): + os.makedirs(model_output_directory) + e_log = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(epoch+1) + torch.save(model.state_dict(), model_output_directory + '/' + e_log + '.pt') + writer.close() + print('Training finished') + +if __name__=='__main__': + input_size_0 = 1 + input_size_1 = 61 + hidden_size = 30 + num_of_layers = 2 + num_of_classes = 61 + num_epochs = 100 + batch_size = 200 + window_length = 5 + train_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_train' + test_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_test' + train_root_path = '../../Data/FTTreeResult-HDFS/model_train/' + label_file_name = '../../Data/FTTreeResult-HDFS/deeplog_files/HDFS_abnormal_label.txt' + model_out_path = train_root_path + 'model_out/' + train_model(window_length, input_size_0,input_size_1, hidden_size, + num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, + model_out_path, train_logkey_path) \ No newline at end of file From 0c67da83a5422541cdb4b55efc5d2e5765dbb5ee Mon Sep 17 00:00:00 2001 From: cainiao66 <17717084193@163.com> Date: Thu, 7 May 2020 17:41:21 +0800 Subject: [PATCH 3/4] fix log_anomaly --- ...og_anomaly_quantitive_train.cpython-37.pyc | Bin 3412 -> 3412 bytes .../log_anomaly_sequence_train.cpython-37.pyc | Bin 0 -> 3901 bytes ..._anomaly_sequential_predict.cpython-37.pyc | Bin 3287 -> 3600 bytes ...og_anomaly_sequential_train.cpython-37.pyc | Bin 3832 -> 3815 bytes .../log_anomaly_train.cpython-37.pyc | Bin 4309 -> 4334 bytes .../loganomaly/log_anomaly_predict.py | 19 ++- .../log_anomaly_quantitive_predict.py | 52 +++++--- .../log_anomaly_quantitive_train.py | 1 + .../log_anomaly_sequence_predict.py | 123 ++++++++++++++++++ .../loganomaly/log_anomaly_sequence_train.py | 107 +++++++++++++++ .../log_anomaly_sequential_predict.py | 32 +++-- .../log_anomaly_sequential_train.py | 13 +- .../loganomaly/log_anomaly_train.py | 11 +- 13 files changed, 310 insertions(+), 48 deletions(-) create mode 100644 anomalydetection/loganomaly/__pycache__/log_anomaly_sequence_train.cpython-37.pyc create mode 100644 anomalydetection/loganomaly/log_anomaly_sequence_predict.py create mode 100644 anomalydetection/loganomaly/log_anomaly_sequence_train.py diff --git a/anomalydetection/loganomaly/__pycache__/log_anomaly_quantitive_train.cpython-37.pyc b/anomalydetection/loganomaly/__pycache__/log_anomaly_quantitive_train.cpython-37.pyc index 37c9d88256cd4afef5d5dc2e2bae3a62944f8c06..2a26ce1e6db7da3dd9dccbb2dfd70625e2666b40 100644 GIT binary patch delta 40 ucmca2bw!HTiI_Q@=aCYw!pWEh!+IK%+fN(o{B diff --git a/anomalydetection/loganomaly/__pycache__/log_anomaly_sequence_train.cpython-37.pyc b/anomalydetection/loganomaly/__pycache__/log_anomaly_sequence_train.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f8318b26445d6b774b9fa69de8510172cece2f5 GIT binary patch literal 3901 
[GIT binary patch payloads omitted: base85 blobs for the updated __pycache__ bytecode (log_anomaly_quantitive_train, log_anomaly_sequence_train, log_anomaly_sequential_predict, log_anomaly_sequential_train, log_anomaly_train .cpython-37.pyc)]

diff --git a/anomalydetection/loganomaly/log_anomaly_predict.py b/anomalydetection/loganomaly/log_anomaly_predict.py
index 106120f..637bbd8 100644
--- a/anomalydetection/loganomaly/log_anomaly_predict.py
+++ b/anomalydetection/loganomaly/log_anomaly_predict.py
@@ -14,16 +14,17 @@ def generate_test_label(logkey_path, window_length,num_of_classes):
     keys = list(map(int, keys))
     print(keys)
     length = len(keys)
-    input_1 = np.zeros((length -window_length,1))
+    input_1 = np.zeros((length -window_length,num_of_classes))
     output_1 = np.zeros(length -window_length,dtype=np.int)
     input_2 = np.zeros((length -window_length,num_of_classes))
     output = np.zeros(length -window_length,dtype=np.int)
     for i in range(0,length -window_length):
+        for t in range(0,num_of_classes):
+            input_1[i][t] = keys[i]
         for j in range(i,i+window_length):
-            input_1[i][0] = keys[j]
             input_2[i][keys[j]-1] += 1
         output[i] = keys[i+window_length]-1
-    new_input_1 = np.zeros((length -2*window_length+1,window_length,1))
+    new_input_1 = np.zeros((length -2*window_length+1,window_length,num_of_classes))
     new_input_2 = np.zeros((length - 2 * window_length + 1, window_length, num_of_classes))
     for i in range(0,length -2*window_length+1):
         for j in range(i,i+window_length):
@@ -39,6 +40,13 @@ def load_model(input_size_1,input_size_2, hidden_size, num_layers, num_classes,
     print('model_path: {}'.format(model_path))
     return model
+def filter_small_top_k(predicted,
output): + filter = [] + for p in predicted: + if output[0][p] > 0.001: + filter.append(p) + return filter + def do_predict(input_size_1,input_size_2, hidden_size, num_layers, num_classes, window_length, model_path, anomaly_test_line_path, num_candidates, logkey_path): model = load_model(input_size_1,input_size_2 ,hidden_size, num_layers, num_classes, model_path) start_time = time.time() @@ -64,6 +72,7 @@ def do_predict(input_size_1,input_size_2, hidden_size, num_layers, num_classes, quan = torch.tensor(quan, dtype=torch.float).view(-1, window_length, input_size_2).to(device) test_output = model(seq,quan) predicted = torch.argsort(test_output , 1)[0][-num_candidates:] + predicted = filter_small_top_k(predicted, test_output) print('{} - predict result: {}, true label: {}'.format(lineNum, predicted,label)) if lineNum in abnormal_label: ## 若出现异常日志,则接下来的预测跳过异常日志,保证进行预测的日志均为正常日志 i += 2*window_length + 1 @@ -104,7 +113,7 @@ def do_predict(input_size_1,input_size_2, hidden_size, num_layers, num_classes, print('elapsed_time: {}'.format(elapsed_time)) if __name__=='__main__': - input_size_1 = 1 + input_size_1 = 61 input_size_2 = 61 hidden_size = 30 num_of_layers = 2 @@ -119,4 +128,4 @@ def do_predict(input_size_1,input_size_2, hidden_size, num_layers, num_classes, model_out_path = train_root_path + 'model_out/' do_predict(input_size_1,input_size_2, hidden_size, num_of_layers, num_of_classes, window_length, - model_out_path + 'Adam_batch_size=200;epoch=100.pt', label_file_name, 9, test_logkey_path) \ No newline at end of file + model_out_path + 'Adam_batch_size=200;epoch=100.pt', label_file_name, 5, test_logkey_path) \ No newline at end of file diff --git a/anomalydetection/loganomaly/log_anomaly_quantitive_predict.py b/anomalydetection/loganomaly/log_anomaly_quantitive_predict.py index 2aa0d55..6286fce 100644 --- a/anomalydetection/loganomaly/log_anomaly_quantitive_predict.py +++ b/anomalydetection/loganomaly/log_anomaly_quantitive_predict.py @@ -38,6 +38,13 @@ def load_quantitive_model(input_size, hidden_size, num_layers, num_classes, mode print('model_path: {}'.format(model_path)) return model2 +def filter_small_top_k(predicted, output): + filter = [] + for p in predicted: + if output[0][p] > 0.001: + filter.append(p) + return filter + def do_predict(input_size, hidden_size, num_layers, num_classes, window_length, model_path, anomaly_test_line_path, num_candidates, logkey_path): quantitive_model = load_quantitive_model(input_size, hidden_size, num_layers, num_classes, model_path) start_time = time.time() @@ -61,6 +68,7 @@ def do_predict(input_size, hidden_size, num_layers, num_classes, window_length, quan = torch.tensor(quan, dtype=torch.float).view(-1, window_length, input_size).to(device) test_output = quantitive_model(quan) predicted = torch.argsort(test_output , 1)[0][-num_candidates:] + predicted = filter_small_top_k(predicted, test_output) print('{} - predict result: {}, true label: {}'.format(lineNum, predicted,label)) if lineNum in abnormal_label: ## 若出现异常日志,则接下来的预测跳过异常日志,保证进行预测的日志均为正常日志 i += 2*window_length + 1 @@ -69,14 +77,14 @@ def do_predict(input_size, hidden_size, num_layers, num_classes, window_length, ALL += 1 if label not in predicted: if lineNum in abnormal_label: - TN += 1 + TP += 1 else: - FN += 1 + FP += 1 else: if lineNum in abnormal_label: - FP += 1 + FN += 1 else: - TP += 1 + TN += 1 # Compute precision, recall and F1-measure if TP + FP == 0: P = 0 @@ -100,23 +108,25 @@ def do_predict(input_size, hidden_size, num_layers, num_classes, window_length, elapsed_time = 
time.time() - start_time print('elapsed_time: {}'.format(elapsed_time)) -input_size = 61 -hidden_size = 30 -num_of_layers = 2 -num_of_classes = 61 -num_epochs = 100 -batch_size = 200 -window_length = 5 -train_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_train' -test_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_test' -train_root_path = '../../Data/FTTreeResult-HDFS/model_train/' -label_file_name = '../../Data/FTTreeResult-HDFS/deeplog_files/HDFS_abnormal_label.txt' -model_out_path = train_root_path + 'quantitive_model_out/' -train_model(window_length, input_size, hidden_size, - num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, - model_out_path,train_logkey_path) +if __name__ == '__main__': + input_size = 61 + hidden_size = 30 + num_of_layers = 2 + num_of_classes = 61 + num_epochs = 100 + batch_size = 200 + window_length = 5 + train_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_train' + test_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_test' + train_root_path = '../../Data/FTTreeResult-HDFS/model_train/' + label_file_name = '../../Data/FTTreeResult-HDFS/deeplog_files/HDFS_abnormal_label.txt' + model_out_path = train_root_path + 'quantitive_model_out/' + + train_model(window_length, input_size, hidden_size, + num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, + model_out_path, train_logkey_path) -do_predict(input_size, hidden_size, num_of_layers, num_of_classes, window_length, - model_out_path + 'Adam_batch_size=200;epoch=100.pt', label_file_name, 3, test_logkey_path) + do_predict(input_size, hidden_size, num_of_layers, num_of_classes, window_length, + model_out_path + 'Adam_batch_size=200;epoch=100.pt', label_file_name, 3, test_logkey_path) diff --git a/anomalydetection/loganomaly/log_anomaly_quantitive_train.py b/anomalydetection/loganomaly/log_anomaly_quantitive_train.py index 4426687..b4b3b7f 100644 --- a/anomalydetection/loganomaly/log_anomaly_quantitive_train.py +++ b/anomalydetection/loganomaly/log_anomaly_quantitive_train.py @@ -27,6 +27,7 @@ def forward(self, input): out = self.fc(out[:, -1, :]) return out + def generate_quantitive_label(logkey_path, window_length,num_of_classes): f = open(logkey_path,'r') keys = f.readline().split() diff --git a/anomalydetection/loganomaly/log_anomaly_sequence_predict.py b/anomalydetection/loganomaly/log_anomaly_sequence_predict.py new file mode 100644 index 0000000..5542c3a --- /dev/null +++ b/anomalydetection/loganomaly/log_anomaly_sequence_predict.py @@ -0,0 +1,123 @@ +import torch +import os +import torch.nn as nn +import time +import numpy as np +from anomalydetection.loganomaly.log_anomaly_sequence_train import Model +from anomalydetection.loganomaly.log_anomaly_sequence_train import train_model + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +def generate_test_label(logkey_path, window_length,num_of_classes): + f = open(logkey_path,'r') + keys = f.readline().split() + keys = list(map(int, keys)) + print(keys) + length = len(keys) + input_1 = np.zeros((length -window_length,1)) + output = np.zeros(length -window_length,dtype=np.int) + for i in range(0,length -window_length): + for j in range(i,i+window_length): + input_1[i][0] = keys[j] + output[i] = keys[i+window_length]-1 + new_input_1 = np.zeros((length -2*window_length+1,window_length,1)) + for i in range(0,length -2*window_length+1): + for j in range(i,i+window_length): + new_input_1[i][j - i] = input_1[j] + 
new_output = output[window_length-1:] + return length,new_input_1,new_output + +def load_model(input_size_1,input_size_2, hidden_size, num_layers, num_classes, model_path): + model = Model(input_size_1,input_size_2,hidden_size, num_layers, num_classes).to(device) + model.load_state_dict(torch.load(model_path, map_location='cpu')) + model.eval() + print('model_path: {}'.format(model_path)) + return model + +def filter_small_top_k(predicted, output): + filter = [] + for p in predicted: + if output[0][p] > 0.001: + filter.append(p) + return filter + +def do_predict(input_size_1,input_size_2, hidden_size, num_layers, num_classes, window_length, model_path, anomaly_test_line_path, num_candidates, logkey_path): + model = load_model(input_size_1,input_size_2 ,hidden_size, num_layers, num_classes, model_path) + start_time = time.time() + TP = 0 + FP = 0 + TN = 0 + FN = 0 + ALL = 0 + length,input_1,output = generate_test_label(logkey_path, window_length,num_classes) + abnormal_label = [] + with open(anomaly_test_line_path) as f: + abnormal_label = [int(x) for x in f.readline().strip().split()] + print('predict start') + with torch.no_grad(): + count_num = 0 + current_file_line = 0 + for i in range(0,length-2*window_length+1): + lineNum = i + 2*window_length + seq = input_1[i] + label = output[i] + seq = torch.tensor(seq, dtype=torch.float).view(-1, window_length, input_size_1).to(device) + test_output = model(seq) + predicted = torch.argsort(test_output , 1)[0][-num_candidates:] + predicted = filter_small_top_k(predicted, test_output) + print('{} - predict result: {}, true label: {}'.format(lineNum, predicted,label)) + if lineNum in abnormal_label: ## 若出现异常日志,则接下来的预测跳过异常日志,保证进行预测的日志均为正常日志 + i += 2*window_length + 1 + else: + i += 1 + ALL += 1 + if label not in predicted: + if lineNum in abnormal_label: + TP += 1 + else: + FP += 1 + else: + if lineNum in abnormal_label: + FN += 1 + else: + TN += 1 + # Compute precision, recall and F1-measure + if TP + FP == 0: + P = 0 + else: + P = 100 * TP / (TP + FP) + + if TP + FN == 0: + R = 0 + else: + R = 100 * TP / (TP + FN) + + if P + R == 0: + F1 = 0 + else: + F1 = 2 * P * R / (P + R) + + Acc = (TP + TN) * 100 / ALL + print('FP: {}, FN: {}, TP: {}, TN: {}'.format(FP, FN, TP, TN)) + print('Acc: {:.3f}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(Acc, P, R, F1)) + print('Finished Predicting') + elapsed_time = time.time() - start_time + print('elapsed_time: {}'.format(elapsed_time)) + +if __name__=='__main__': + input_size_1 = 1 + input_size_2 = 61 + hidden_size = 30 + num_of_layers = 2 + num_of_classes = 61 + num_epochs = 100 + batch_size = 200 + window_length = 5 + train_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_train' + test_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_test' + train_root_path = '../../Data/FTTreeResult-HDFS/model_train/' + label_file_name = '../../Data/FTTreeResult-HDFS/deeplog_files/HDFS_abnormal_label.txt' + model_out_path = train_root_path + 'sequence_model_out/' + + do_predict(input_size_1,input_size_2, hidden_size, num_of_layers, num_of_classes, window_length, + model_out_path + 'Adam_batch_size=200;epoch=100.pt', label_file_name, 3, test_logkey_path) \ No newline at end of file diff --git a/anomalydetection/loganomaly/log_anomaly_sequence_train.py b/anomalydetection/loganomaly/log_anomaly_sequence_train.py new file mode 100644 index 0000000..dab9ed3 --- /dev/null +++ b/anomalydetection/loganomaly/log_anomaly_sequence_train.py @@ -0,0 +1,107 @@ +import torch 
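+# Trains the sequential half of LogAnomaly on its own: each sample is a window
+# of raw log-key ids with shape [window_length, 1], and the target is the
+# (key id - 1) of the log key that follows the window. The quantitative
+# counterpart is log_anomaly_quantitive_train.py; log_anomaly_train.py feeds
+# both views into a single model with two LSTMs.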
+import torch.nn as nn +import torch.optim as optim +from tensorboardX import SummaryWriter +from torch.utils.data import TensorDataset, DataLoader +import numpy as np +import argparse +import os +from . import * + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def generate_label(logkey_path, window_length,num_of_classes): + f = open(logkey_path,'r') + keys = f.readline().split() + keys = list(map(int, keys)) + print(keys) + length = len(keys) + input_1 = np.zeros((length -window_length,1)) + output = np.zeros(length -window_length,dtype=np.int) + for i in range(0,length -window_length): + for j in range(i,i+window_length): + input_1[i][0] = keys[j] + output[i] = keys[i+window_length]-1 + new_input_1 = np.zeros((length -2*window_length+1,window_length,1)) + for i in range(0,length -2*window_length+1): + for j in range(i,i+window_length): + new_input_1[i][j - i] = input_1[j] + new_output = output[window_length-1:] + print(new_input_1.shape) + print(new_output.shape) + dataset = TensorDataset(torch.tensor(new_input_1,dtype=torch.float),torch.tensor(new_output,dtype=torch.long)) + return dataset + +class Model(nn.Module): + def __init__(self, input_size_0,input_size_1, hidden_size, num_of_layers, out_size): + super(Model, self).__init__() + self.hidden_size = hidden_size + self.num_of_layers = num_of_layers + self.lstm0 = nn.LSTM(input_size_0, hidden_size, num_of_layers, batch_first=True) + self.fc = nn.Linear(hidden_size, out_size) + + def forward(self, input_0): + h0_0 = torch.zeros(self.num_of_layers, input_0.size(0), self.hidden_size).to(device) + c0_0 = torch.zeros(self.num_of_layers, input_0.size(0), self.hidden_size).to(device) + out_0, _ = self.lstm0(input_0, (h0_0, c0_0)) + out = self.fc(out_0[:, -1, :]) + return out + +def train_model(window_length, input_size_0,input_size_1, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, root_path, model_output_directory,logkey_path): + # log setting + log_directory = root_path + 'sequence_log_out/' + log_template = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) + + print("Train num_classes: ", num_of_classes) + model = Model(input_size_0,input_size_1, hidden_size, num_of_layers, num_of_classes).to(device) + # create data set + data_set = generate_label(logkey_path, window_length,num_of_classes) + # create data_loader + data_loader = DataLoader(dataset=data_set, batch_size=batch_size, shuffle=True, pin_memory=False) + writer = SummaryWriter(logdir=log_directory + log_template) + + # Loss and optimizer classify job + criterion = nn.CrossEntropyLoss() + optimizer = optim.Adam(model.parameters()) + + # Training + for epoch in range(num_epochs): + train_loss = 0 + for step, (seq, label) in enumerate(data_loader): + seq = seq.clone().detach().view(-1, window_length, input_size_0).to(device) + output = model(seq) + + loss = criterion(output, label.to(device)) + + # Backward and optimize + optimizer.zero_grad() + loss.backward() + train_loss += loss.item() + optimizer.step() + print('Epoch [{}/{}], training_loss: {:.6f}'.format(epoch + 1, num_epochs, train_loss / len(data_loader.dataset))) + if (epoch + 1) % 100 == 0: + if not os.path.isdir(model_output_directory): + os.makedirs(model_output_directory) + e_log = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(epoch+1) + torch.save(model.state_dict(), model_output_directory + '/' + e_log + '.pt') + writer.close() + print('Training finished') + +if __name__=='__main__': + input_size_0 = 1 + input_size_1 = 61 + hidden_size = 30 + 
num_of_layers = 2 + num_of_classes = 61 + num_epochs = 100 + batch_size = 200 + window_length = 5 + train_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_train' + test_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_test' + train_root_path = '../../Data/FTTreeResult-HDFS/model_train/' + label_file_name = '../../Data/FTTreeResult-HDFS/deeplog_files/HDFS_abnormal_label.txt' + model_out_path = train_root_path + 'sequence_model_out/' + train_model(window_length, input_size_0,input_size_1, hidden_size, + num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, + model_out_path, train_logkey_path) \ No newline at end of file diff --git a/anomalydetection/loganomaly/log_anomaly_sequential_predict.py b/anomalydetection/loganomaly/log_anomaly_sequential_predict.py index ee13038..7c010b7 100644 --- a/anomalydetection/loganomaly/log_anomaly_sequential_predict.py +++ b/anomalydetection/loganomaly/log_anomaly_sequential_predict.py @@ -31,6 +31,14 @@ def load_sequential_model(input_size, hidden_size, num_layers, num_classes, mode return model1 +def filter_small_top_k(predicted, output): + filter = [] + for p in predicted: + if output[0][p] > 0.001: + filter.append(p) + return filter + + def do_predict(input_size, hidden_size, num_layers, num_classes, window_length, model_path, anomaly_test_line_path, test_file_path, num_candidates, pattern_vec_file): vec_to_class_type = {} with open(pattern_vec_file, 'r') as pattern_file: @@ -49,8 +57,8 @@ def do_predict(input_size, hidden_size, num_layers, num_classes, window_length, TN = 0 FN = 0 ALL = 0 + skip_count = 0 abnormal_loader = generate(test_file_path, window_length) - abnormal_label = [] with open(anomaly_test_line_path) as f: abnormal_label = [int(x) for x in f.readline().strip().split()] print('predict start') @@ -61,33 +69,37 @@ def do_predict(input_size, hidden_size, num_layers, num_classes, window_length, i = 0 # first traverse [0, window_size) while i < len(line) - window_length: - lineNum = current_file_line * 10 + i + window_length + 1 + lineNum = current_file_line * 200 + i + window_length + 1 count_num += 1 seq = line[i:i + window_length] label = line[i + window_length] - print(label) + for n in range(len(seq)): + if current_file_line * 200 + i + n + 1 in abnormal_label: + i = i + n + 1 + continue seq = torch.tensor(seq, dtype=torch.float).view(-1, window_length, input_size).to(device) - print(seq.shape) #label = torch.tensor(label).view(-1).to(device) output = sequential_model(seq) - print(output) predicted = torch.argsort(output, 1)[0][-num_candidates:] + predicted = filter_small_top_k(predicted, output) + #print(output) print('{} - predict result: {}, true label: {}'.format(count_num, predicted, vec_to_class_type[tuple(label)])) if lineNum in abnormal_label: ## 若出现异常日志,则接下来的预测跳过异常日志,保证进行预测的日志均为正常日志 i += window_length + 1 + skip_count += 1 else: i += 1 ALL += 1 if vec_to_class_type[tuple(label)] not in predicted: if lineNum in abnormal_label: - TN += 1 + TP += 1 else: - FN += 1 + FP += 1 else: if lineNum in abnormal_label: - FP += 1 + FN += 1 else: - TP += 1 + TN += 1 current_file_line += 1 # Compute precision, recall and F1-measure if TP + FP == 0: @@ -112,5 +124,5 @@ def do_predict(input_size, hidden_size, num_layers, num_classes, window_length, print('Finished Predicting') elapsed_time = time.time() - start_time print('elapsed_time: {}'.format(elapsed_time)) - + print('skip_count: {}'.format(skip_count)) #draw_evaluation("Evaluations", ['Acc', 'Precision', 'Recall', 
'F1-measure'], [Acc, P, R, F1], 'evaluations', '%') \ No newline at end of file diff --git a/anomalydetection/loganomaly/log_anomaly_sequential_train.py b/anomalydetection/loganomaly/log_anomaly_sequential_train.py index 77e32de..3d87fc2 100644 --- a/anomalydetection/loganomaly/log_anomaly_sequential_train.py +++ b/anomalydetection/loganomaly/log_anomaly_sequential_train.py @@ -27,20 +27,20 @@ def generate_seq_label(file_path, window_length, pattern_vec_file): for line in file.readlines(): num_of_sessions += 1 line = tuple(map(lambda n: tuple(map(float, n.strip().split())), [x for x in line.strip().split(',') if len(x) > 0])) - if len(line) < 10: - print(line) + if len(line) < window_length: + #print(line) + continue for i in range(len(line) - window_length): input_data.append(line[i:i + window_length]) # line[i] is a list need to read file form a dic{vec:log_key} to get log key output_data.append(vec_to_class_type[line[i + window_length]]) data_set = TensorDataset(torch.tensor(input_data, dtype=torch.float), torch.tensor(output_data)) - print(torch.tensor(input_data).shape) return data_set def train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, root_path, model_output_directory, data_file, pattern_vec_file): # log setting - log_directory = root_path + 'log_out/' + log_directory = root_path + 'sequence_log_out/' log_template = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) print("Train num_classes: ", num_of_classes) @@ -70,7 +70,7 @@ def train_model(window_length, input_size, hidden_size, num_of_layers, num_of_cl train_loss += loss.item() optimizer.step() print('Epoch [{}/{}], training_loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss / len(data_loader.dataset))) - if (epoch + 1) % 100 == 0: + if (epoch + 1) % num_epochs == 0: if not os.path.isdir(model_output_directory): os.makedirs(model_output_directory) e_log = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(epoch+1) @@ -84,9 +84,8 @@ def __init__(self, input_size, hidden_size, num_of_layers, out_size): super(Model, self).__init__() self.hidden_size = hidden_size self.num_of_layers = num_of_layers - self.lstm = nn.LSTM(input_size, hidden_size, num_of_layers, batch_first=True) + self.lstm = nn.LSTM(input_size, hidden_size, num_of_layers, batch_first=True, dropout=0.5) self.fc = nn.Linear(hidden_size, out_size) - # self.out = nn.Linear(in_features=in_features, out_features=out_features) def init_hidden(self, size): diff --git a/anomalydetection/loganomaly/log_anomaly_train.py b/anomalydetection/loganomaly/log_anomaly_train.py index 9202c1b..b515ed5 100644 --- a/anomalydetection/loganomaly/log_anomaly_train.py +++ b/anomalydetection/loganomaly/log_anomaly_train.py @@ -17,16 +17,17 @@ def generate_label(logkey_path, window_length,num_of_classes): keys = list(map(int, keys)) print(keys) length = len(keys) - input_1 = np.zeros((length -window_length,1)) + input_1 = np.zeros((length -window_length,num_of_classes)) output_1 = np.zeros(length -window_length,dtype=np.int) input_2 = np.zeros((length -window_length,num_of_classes)) output = np.zeros(length -window_length,dtype=np.int) for i in range(0,length -window_length): + for t in range(0,num_of_classes): + input_1[i][t] = keys[i] for j in range(i,i+window_length): - input_1[i][0] = keys[j] input_2[i][keys[j]-1] += 1 output[i] = keys[i+window_length]-1 - new_input_1 = np.zeros((length -2*window_length+1,window_length,1)) + new_input_1 = np.zeros((length -2*window_length+1,window_length,num_of_classes)) 
new_input_2 = np.zeros((length - 2 * window_length + 1, window_length, num_of_classes)) for i in range(0,length -2*window_length+1): for j in range(i,i+window_length): @@ -103,14 +104,14 @@ def train_model(window_length, input_size_0,input_size_1, hidden_size, num_of_la print('Training finished') if __name__=='__main__': - input_size_0 = 1 + input_size_0 = 61 input_size_1 = 61 hidden_size = 30 num_of_layers = 2 num_of_classes = 61 num_epochs = 100 batch_size = 200 - window_length = 5 + window_length = 10 train_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_train' test_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_test' train_root_path = '../../Data/FTTreeResult-HDFS/model_train/' From 4f26ca9d8396a5e08f76366e51aa373b9160810e Mon Sep 17 00:00:00 2001 From: cainiao66 <17717084193@163.com> Date: Thu, 18 Jun 2020 14:31:57 +0800 Subject: [PATCH 4/4] add add --- .gitignore | 1 + HDFS_drain3_state.bin | 1 + anomalydetection/att_all_you_need/__init__.py | 1 + .../__pycache__/__init__.cpython-36.pyc | Bin 0 -> 184 bytes .../encoder_self_att_predict.cpython-36.pyc | Bin 0 -> 4003 bytes .../encoder_self_att_train.cpython-36.pyc | Bin 0 -> 7139 bytes .../encoder_self_att_predict.py | 141 ++++++++ .../encoder_self_att_train.py | 296 ++++++++++++++++ anomalydetection/bi_lstm_only/__init__.py | 1 + .../__pycache__/__init__.cpython-36.pyc | Bin 0 -> 180 bytes .../bi_lstm_predict.cpython-36.pyc | Bin 0 -> 3244 bytes .../__pycache__/bi_lstm_train.cpython-36.pyc | Bin 0 -> 3931 bytes .../bi_lstm_only/bi_lstm_predict.py | 127 +++++++ .../bi_lstm_only/bi_lstm_train.py | 116 +++++++ .../__pycache__/__init__.cpython-36.pyc | Bin 0 -> 182 bytes .../__pycache__/__init__.cpython-37.pyc | Bin 202 -> 160 bytes .../log_key_LSTM_train.cpython-36.pyc | Bin 0 -> 3357 bytes .../log_key_LSTM_train.cpython-37.pyc | Bin 3376 -> 3380 bytes .../deeplog/Model1/log_key_LSTM_train.py | 16 +- .../__pycache__/__init__.cpython-36.pyc | Bin 0 -> 182 bytes .../__pycache__/__init__.cpython-37.pyc | Bin 202 -> 160 bytes .../variable_LSTM_train.cpython-36.pyc | Bin 0 -> 3900 bytes .../variable_LSTM_train.cpython-37.pyc | Bin 3916 -> 3933 bytes .../deeplog/Model2/variable_LSTM_train.py | 50 ++- .../__pycache__/__init__.cpython-36.pyc | Bin 0 -> 175 bytes .../__pycache__/__init__.cpython-37.pyc | Bin 195 -> 153 bytes .../__pycache__/log_predict.cpython-36.pyc | Bin 0 -> 5415 bytes .../__pycache__/log_predict.cpython-37.pyc | Bin 5742 -> 6208 bytes anomalydetection/deeplog/log_predict.py | 316 +++++++++++------- .../__pycache__/__init__.cpython-36.pyc | Bin 178 -> 178 bytes .../__pycache__/__init__.cpython-37.pyc | Bin 198 -> 182 bytes ...og_anomaly_quantitive_train.cpython-37.pyc | Bin 3412 -> 0 bytes .../log_anomaly_sequence_train.cpython-37.pyc | Bin 3901 -> 0 bytes ..._anomaly_sequential_predict.cpython-36.pyc | Bin 3226 -> 3509 bytes ..._anomaly_sequential_predict.cpython-37.pyc | Bin 3600 -> 0 bytes ...og_anomaly_sequential_train.cpython-36.pyc | Bin 3790 -> 3789 bytes ...og_anomaly_sequential_train.cpython-37.pyc | Bin 3815 -> 3790 bytes .../log_anomaly_train.cpython-37.pyc | Bin 4334 -> 0 bytes .../loganomaly/log_anomaly_predict.py | 131 -------- .../log_anomaly_quantitive_predict.py | 132 -------- .../log_anomaly_quantitive_train.py | 98 ------ .../log_anomaly_sequence_predict.py | 123 ------- .../loganomaly/log_anomaly_sequence_train.py | 107 ------ .../log_anomaly_sequential_predict.py | 2 +- .../log_anomaly_sequential_train.py | 2 +- .../loganomaly/log_anomaly_train.py | 122 
------- .../__pycache__/__init__.cpython-36.pyc | Bin 174 -> 174 bytes .../bi_lstm_att_predict.cpython-36.pyc | Bin 3240 -> 4306 bytes .../bi_lstm_att_train.cpython-36.pyc | Bin 4311 -> 5474 bytes .../robust/bi_lstm_att_predict.py | 113 ++++--- anomalydetection/robust/bi_lstm_att_train.py | 66 +++- anomalydetection/self_att_lstm/__init__.py | 1 + .../__pycache__/__init__.cpython-36.pyc | Bin 0 -> 181 bytes .../self_att_lstm_predict.cpython-36.pyc | Bin 0 -> 5806 bytes .../self_att_lstm_train.cpython-36.pyc | Bin 0 -> 4953 bytes .../self_att_lstm/self_att_lstm_predict.py | 246 ++++++++++++++ .../self_att_lstm/self_att_lstm_train.py | 140 ++++++++ .../self_att_lstm_supervised/__init__.py | 1 + .../__pycache__/__init__.cpython-36.pyc | Bin 0 -> 192 bytes ...att_lstm_supervised_predict.cpython-36.pyc | Bin 0 -> 4198 bytes ...f_att_lstm_supervised_train.cpython-36.pyc | Bin 0 -> 5338 bytes .../self_att_lstm_supervised_predict.py | 131 ++++++++ .../self_att_lstm_supervised_train.py | 154 +++++++++ deeplog_detection.py | 73 ++-- ecoder_anomaly_detection.py | 71 ++++ extractfeature/hdfs_deeplog_preprocessor.py | 261 +++++++++++++++ .../hdfs_fs_deeplog_preprocessor.py | 91 ++--- extractfeature/hdfs_ft_preprocessor.py | 58 +++- extractfeature/hdfs_robust_preprocessor.py | 166 +++++++++ .../k8s/__pycache__/__init__.cpython-37.pyc | Bin 0 -> 173 bytes .../log_preprocessor.cpython-36.pyc | Bin 7946 -> 7946 bytes .../log_preprocessor.cpython-37.pyc | Bin 0 -> 7916 bytes java/deeplog_java.py | 266 +++++++++++++++ java/detect_log/clusters/1 | 1 + java/detect_log/clusters/2 | 1 + java/detect_log/clusters/3 | 1 + java/detect_log/clusters/4 | 1 + java/detect_log/detect.log | 10 + java/detect_log/logkey.txt | 1 + java/detect_log/logvalue/1 | 1 + java/detect_log/logvalue/10 | 1 + java/detect_log/logvalue/11 | 1 + java/detect_log/logvalue/12 | 1 + java/detect_log/logvalue/13 | 1 + java/detect_log/logvalue/14 | 1 + java/detect_log/logvalue/15 | 1 + java/detect_log/logvalue/16 | 1 + java/detect_log/logvalue/17 | 1 + java/detect_log/logvalue/18 | 1 + java/detect_log/logvalue/19 | 1 + java/detect_log/logvalue/2 | 1 + java/detect_log/logvalue/20 | 1 + java/detect_log/logvalue/21 | 1 + java/detect_log/logvalue/22 | 1 + java/detect_log/logvalue/23 | 1 + java/detect_log/logvalue/24 | 1 + java/detect_log/logvalue/25 | 1 + java/detect_log/logvalue/26 | 1 + java/detect_log/logvalue/27 | 1 + java/detect_log/logvalue/28 | 1 + java/detect_log/logvalue/29 | 1 + java/detect_log/logvalue/3 | 1 + java/detect_log/logvalue/30 | 1 + java/detect_log/logvalue/31 | 1 + java/detect_log/logvalue/4 | 1 + java/detect_log/logvalue/5 | 1 + java/detect_log/logvalue/6 | 1 + java/detect_log/logvalue/7 | 1 + java/detect_log/logvalue/8 | 1 + java/detect_log/logvalue/9 | 1 + java/java.iml | 11 + java/out/production/java/deeplog.class | Bin 0 -> 1831 bytes java/src/deeplog.java | 29 ++ l_a_d_bi_lstm.py | 91 +++++ log_anomaly_detection.py | 26 +- log_deep_data_anomaly.py | 69 ++++ log_predict.py | 305 +++++++++++++++++ logparsing/converter/__init__.py | 1 + .../__pycache__/__init__.cpython-36.pyc | Bin 0 -> 171 bytes .../__pycache__/eventid2number.cpython-36.pyc | Bin 0 -> 496 bytes logparsing/converter/eventid2number.py | 8 + logparsing/converter/logparser2cluster.py | 25 ++ logparsing/drain/.gitignore | 9 + logparsing/drain/CONTRIBUTING.md | 48 +++ logparsing/drain/HDFS_drain.py | 34 ++ logparsing/drain/LICENSE.txt | 21 ++ logparsing/drain/README.md | 169 ++++++++++ logparsing/drain/__init__.py | 0 logparsing/drain/drain3/__init__.py | 2 + 
logparsing/drain/drain3/drain.py | 258 ++++++++++++++ logparsing/drain/drain3/file_persistence.py | 25 ++ logparsing/drain/drain3/kafka_persistence.py | 45 +++ logparsing/drain/drain3/masking.py | 65 ++++ .../drain/drain3/persistence_handler.py | 18 + logparsing/drain/drain3/template_miner.py | 98 ++++++ logparsing/drain/examples/drain3.ini | 14 + logparsing/drain/examples/drain_stdin_demo.py | 36 ++ logparsing/drain/requirements.txt | 5 + logparsing/drain/setup.cfg | 2 + logparsing/drain/setup.py | 32 ++ .../__pycache__/__init__.cpython-36.pyc | Bin 168 -> 168 bytes .../__pycache__/__init__.cpython-37.pyc | Bin 188 -> 146 bytes .../fttree/__pycache__/fttree.cpython-36.pyc | Bin 3218 -> 3210 bytes .../fttree/__pycache__/fttree.cpython-37.pyc | Bin 3238 -> 3188 bytes logparsing/fttree/fttree.py | 10 +- robust_anomaly_detection.py | 101 ++++++ self_att_lstm_anomaly_detection.py | 142 ++++++++ self_att_supervised_detection.py | 62 ++++ 148 files changed, 4413 insertions(+), 1015 deletions(-) create mode 100644 HDFS_drain3_state.bin create mode 100644 anomalydetection/att_all_you_need/__init__.py create mode 100644 anomalydetection/att_all_you_need/__pycache__/__init__.cpython-36.pyc create mode 100644 anomalydetection/att_all_you_need/__pycache__/encoder_self_att_predict.cpython-36.pyc create mode 100644 anomalydetection/att_all_you_need/__pycache__/encoder_self_att_train.cpython-36.pyc create mode 100644 anomalydetection/att_all_you_need/encoder_self_att_predict.py create mode 100644 anomalydetection/att_all_you_need/encoder_self_att_train.py create mode 100644 anomalydetection/bi_lstm_only/__init__.py create mode 100644 anomalydetection/bi_lstm_only/__pycache__/__init__.cpython-36.pyc create mode 100644 anomalydetection/bi_lstm_only/__pycache__/bi_lstm_predict.cpython-36.pyc create mode 100644 anomalydetection/bi_lstm_only/__pycache__/bi_lstm_train.cpython-36.pyc create mode 100644 anomalydetection/bi_lstm_only/bi_lstm_predict.py create mode 100644 anomalydetection/bi_lstm_only/bi_lstm_train.py create mode 100644 anomalydetection/deeplog/Model1/__pycache__/__init__.cpython-36.pyc create mode 100644 anomalydetection/deeplog/Model1/__pycache__/log_key_LSTM_train.cpython-36.pyc create mode 100644 anomalydetection/deeplog/Model2/__pycache__/__init__.cpython-36.pyc create mode 100644 anomalydetection/deeplog/Model2/__pycache__/variable_LSTM_train.cpython-36.pyc create mode 100644 anomalydetection/deeplog/__pycache__/__init__.cpython-36.pyc create mode 100644 anomalydetection/deeplog/__pycache__/log_predict.cpython-36.pyc delete mode 100644 anomalydetection/loganomaly/__pycache__/log_anomaly_quantitive_train.cpython-37.pyc delete mode 100644 anomalydetection/loganomaly/__pycache__/log_anomaly_sequence_train.cpython-37.pyc delete mode 100644 anomalydetection/loganomaly/__pycache__/log_anomaly_sequential_predict.cpython-37.pyc delete mode 100644 anomalydetection/loganomaly/__pycache__/log_anomaly_train.cpython-37.pyc delete mode 100644 anomalydetection/loganomaly/log_anomaly_predict.py delete mode 100644 anomalydetection/loganomaly/log_anomaly_quantitive_predict.py delete mode 100644 anomalydetection/loganomaly/log_anomaly_quantitive_train.py delete mode 100644 anomalydetection/loganomaly/log_anomaly_sequence_predict.py delete mode 100644 anomalydetection/loganomaly/log_anomaly_sequence_train.py delete mode 100644 anomalydetection/loganomaly/log_anomaly_train.py create mode 100644 anomalydetection/self_att_lstm/__init__.py create mode 100644 
anomalydetection/self_att_lstm/__pycache__/__init__.cpython-36.pyc create mode 100644 anomalydetection/self_att_lstm/__pycache__/self_att_lstm_predict.cpython-36.pyc create mode 100644 anomalydetection/self_att_lstm/__pycache__/self_att_lstm_train.cpython-36.pyc create mode 100644 anomalydetection/self_att_lstm/self_att_lstm_predict.py create mode 100644 anomalydetection/self_att_lstm/self_att_lstm_train.py create mode 100644 anomalydetection/self_att_lstm_supervised/__init__.py create mode 100644 anomalydetection/self_att_lstm_supervised/__pycache__/__init__.cpython-36.pyc create mode 100644 anomalydetection/self_att_lstm_supervised/__pycache__/self_att_lstm_supervised_predict.cpython-36.pyc create mode 100644 anomalydetection/self_att_lstm_supervised/__pycache__/self_att_lstm_supervised_train.cpython-36.pyc create mode 100644 anomalydetection/self_att_lstm_supervised/self_att_lstm_supervised_predict.py create mode 100644 anomalydetection/self_att_lstm_supervised/self_att_lstm_supervised_train.py create mode 100644 ecoder_anomaly_detection.py create mode 100644 extractfeature/hdfs_deeplog_preprocessor.py create mode 100644 extractfeature/hdfs_robust_preprocessor.py create mode 100644 extractfeature/k8s/__pycache__/__init__.cpython-37.pyc create mode 100644 extractfeature/k8s/__pycache__/log_preprocessor.cpython-37.pyc create mode 100644 java/deeplog_java.py create mode 100644 java/detect_log/clusters/1 create mode 100644 java/detect_log/clusters/2 create mode 100644 java/detect_log/clusters/3 create mode 100644 java/detect_log/clusters/4 create mode 100644 java/detect_log/detect.log create mode 100644 java/detect_log/logkey.txt create mode 100644 java/detect_log/logvalue/1 create mode 100644 java/detect_log/logvalue/10 create mode 100644 java/detect_log/logvalue/11 create mode 100644 java/detect_log/logvalue/12 create mode 100644 java/detect_log/logvalue/13 create mode 100644 java/detect_log/logvalue/14 create mode 100644 java/detect_log/logvalue/15 create mode 100644 java/detect_log/logvalue/16 create mode 100644 java/detect_log/logvalue/17 create mode 100644 java/detect_log/logvalue/18 create mode 100644 java/detect_log/logvalue/19 create mode 100644 java/detect_log/logvalue/2 create mode 100644 java/detect_log/logvalue/20 create mode 100644 java/detect_log/logvalue/21 create mode 100644 java/detect_log/logvalue/22 create mode 100644 java/detect_log/logvalue/23 create mode 100644 java/detect_log/logvalue/24 create mode 100644 java/detect_log/logvalue/25 create mode 100644 java/detect_log/logvalue/26 create mode 100644 java/detect_log/logvalue/27 create mode 100644 java/detect_log/logvalue/28 create mode 100644 java/detect_log/logvalue/29 create mode 100644 java/detect_log/logvalue/3 create mode 100644 java/detect_log/logvalue/30 create mode 100644 java/detect_log/logvalue/31 create mode 100644 java/detect_log/logvalue/4 create mode 100644 java/detect_log/logvalue/5 create mode 100644 java/detect_log/logvalue/6 create mode 100644 java/detect_log/logvalue/7 create mode 100644 java/detect_log/logvalue/8 create mode 100644 java/detect_log/logvalue/9 create mode 100644 java/java.iml create mode 100644 java/out/production/java/deeplog.class create mode 100644 java/src/deeplog.java create mode 100644 l_a_d_bi_lstm.py create mode 100644 log_deep_data_anomaly.py create mode 100644 log_predict.py create mode 100644 logparsing/converter/__init__.py create mode 100644 logparsing/converter/__pycache__/__init__.cpython-36.pyc create mode 100644 
logparsing/converter/__pycache__/eventid2number.cpython-36.pyc create mode 100644 logparsing/converter/eventid2number.py create mode 100644 logparsing/converter/logparser2cluster.py create mode 100644 logparsing/drain/.gitignore create mode 100644 logparsing/drain/CONTRIBUTING.md create mode 100644 logparsing/drain/HDFS_drain.py create mode 100644 logparsing/drain/LICENSE.txt create mode 100644 logparsing/drain/README.md delete mode 100644 logparsing/drain/__init__.py create mode 100644 logparsing/drain/drain3/__init__.py create mode 100644 logparsing/drain/drain3/drain.py create mode 100644 logparsing/drain/drain3/file_persistence.py create mode 100644 logparsing/drain/drain3/kafka_persistence.py create mode 100644 logparsing/drain/drain3/masking.py create mode 100644 logparsing/drain/drain3/persistence_handler.py create mode 100644 logparsing/drain/drain3/template_miner.py create mode 100644 logparsing/drain/examples/drain3.ini create mode 100644 logparsing/drain/examples/drain_stdin_demo.py create mode 100644 logparsing/drain/requirements.txt create mode 100644 logparsing/drain/setup.cfg create mode 100644 logparsing/drain/setup.py create mode 100644 robust_anomaly_detection.py create mode 100644 self_att_lstm_anomaly_detection.py create mode 100644 self_att_supervised_detection.py diff --git a/.gitignore b/.gitignore index 6eab8ea..093dfa7 100644 --- a/.gitignore +++ b/.gitignore @@ -3,5 +3,6 @@ .idea/ */.idea/ /Data/ +.pyc */__pycache__/ \ No newline at end of file diff --git a/HDFS_drain3_state.bin b/HDFS_drain3_state.bin new file mode 100644 index 0000000..081c98d --- /dev/null +++ b/HDFS_drain3_state.bin @@ -0,0 +1 @@ +eJztWm1v2zYQ/iuFsQ9tUagk9W5sA5qkAYJ18RAH24AiMBSJTtToxaPotFnR/z5SlC1SFv0WxfEQf5F0InnkHZ873pH83ps8vM+vv+CQ9vqvekl+MwlIEWc3RkSCOBNPsyJO+LP37lUvwhN6y+oj9l3E6agkgGExMg2+jcLbOIkIzthPCAD7SfKcjrI8wuzP9016POdtpA45szv8wBu+vhgMLt/0xI8RzUWvdS8QPbYzOO8MIm03P7/99bH9oFoozk3X1Q9WECbTgmJSMPrzRn1+ym+ORVvOn9UdUZxOkoBi1tEdzkqOPeBBCHxeoxpI9To7Px2UIx4XxklAAy7BT/zj7xDH95j0eeEF5gTrnxPXSR7eSRwKEvYlMsIFndNXtWCjOOKyfAAAwLJZ/C+X3XVcEzH5n1vkIy6VkLMS+uO3kM1inJdmIYyDiPKjmQbGOWnTyJfgPjDi3Pg4OK2ZtKoC2ZIqnlwNCALXBvwLItPdQBUQGMgGBjQN6Hl9m80g6Cvq+Upi2oTHdXI3Mj3b8xD0geX5FvIt0ykHQnP+TGNChAJL/tAwfQM6QPDX6MuV9AV3oq8SrJCJoNXXH0F4h+kFLiZ5FlUaa/xboQ+wgLcZhM4G898l3yPCB8q/JvEEa5TkK0q6+tF0L1f8h9+dD/UPLnQGitPheZDi4oHxSsWEfRoc//aWf/GCYVlgBAkzEsa9tLR+g2O7z0Syo7Bdz9uRz4RmG+7LgQ/DIMsqvP+JSTyOmUwVfotpGGIc4Uhyk1rhoCUJZ7q+pcMshB0u/PCA2m1dWcVCs/zpQWzKC7/l2uYeLPzNWKeqO8TkXqC3KZ5YvPRCyt7XAu4eiLi4oCsBHWNPH8olpZxp3cy2m64sretpDdfs0HDNg+GuZbjVjLeCOB/P502yZ5Kny7EtO2pmwJapnXCnwwl3DhO+YXwRRUOaExzVAUYJgN+DCf+eTiLWdSS7u7jgT9ZOwKX2cQ2UtMPCVmDhQbCXTu/xCV0zGj/OWQAUzlgSXAjveS28KWbStAfoQNKXrTUhu0MTsg8mtHLlX+Yvl21ytLrTdkNxpIn3nSfPX4FpI5G/+rIWVN2oL8Xyl72qnN2FBvTNZTk78CShd7XVowq8wnMGxWJ8R/Ak4fkMXihhvjPgkH5dvGlopT1Cko39yadcK/4M9UqMSwMy27ahtwQHku+nJMiKMW51hwsYaRccPoPgm9g737SqXX/VkkhOAK+zI3OcT5OydpZTwUCocRZMFZT9STUqkrezkOlolwLQ4VIADktBKzQuK8TLFrJp7ifPp+f/7zGvt2054rP2QM7NNi7QQji4/pY9lNfwZzy5OB1y4VnEWcp6ghO8sAM/S+viZHlYAuUV2tMHpFaHXsh6oV5ot4FJ+3TLe8neHlhvtzFJu8zy1qO3j/uOXUUj22WkcqymdwBuhw7AfaEOoHr99eHifA0PsHxT5wJH0ywKsnKO1XIx/f9MWaLaBJJ0HMNe9Ur/rs5gHRcCz3MsDVpkBwJ3bEyy4lbGb4t+oy2FuRGRe8c2Zco2hXQ2hTq0KfTCberpIsMMU2OY80aXcYrzKVURAhGL/cs8P42TROylUlGxdO23VQT2NZhfk6iMMLzlZ6hJDclrLEAWRA9SNU4boj5/9uuBMbxWPIpqhMeC/BwK2AqbX5WsI3MtsHYYAaKXGgE2wLrGVtvcMWWsLzKdsDlt+qhZkZjtJYA7G9ROXwLftmCqX63IT/CYGhrEKYf+WsR1eOKPDif+6xyT8Cm/aaaSsxwyH4+r5U9seS/kmtWuF1hYZFNMg3ZOK0J3JCfc2qshqMMDR/TiDhy3vhm4xZnakntw++bo+C2t5Jf3jeuPniVusBKc5hRXxertxatysWbC+b6zlXOUd2S0V/hgh3f44OES
365y9CYDAYz7IImjd/wryEpe/HadSExETMg7p+LuJ801uDHlwyZtGNfhaYL2MOHpp7JdAWgdw1kYWNneniWyJQVdhURqqamSFlRJXyEdUyFttTJSO7IbldWO3AaptoWNftXKTqMjlXTUtkglLXVUrlrqqR35amVfrdwYMmi0VUshbNANUmXmNXgjbgL/Adcvbuk= \ No newline at end of file diff --git a/anomalydetection/att_all_you_need/__init__.py b/anomalydetection/att_all_you_need/__init__.py new file mode 100644 index 0000000..9764abf --- /dev/null +++ b/anomalydetection/att_all_you_need/__init__.py @@ -0,0 +1 @@ +# -*- coding: UTF-8 -*- \ No newline at end of file diff --git a/anomalydetection/att_all_you_need/__pycache__/__init__.cpython-36.pyc b/anomalydetection/att_all_you_need/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f6c90fd720361723419c0fcde2a51ccddbc90b2b GIT binary patch literal 184 zcmXr!<>gvawmD9mfq~&M5W@i@kmUfx#VkM~g&~+hlhJP_LlH4|xXIhDnk#W6nl>5edVVqShOP&g&EBsIAt zGe0jTv7{tEF()U!GQTuFFEuqKCO$qhFS8^*Uaz3?7Kcr4eoARhsvXGsVjyM!0G*mP A0ssI2 literal 0 HcmV?d00001 diff --git a/anomalydetection/att_all_you_need/__pycache__/encoder_self_att_predict.cpython-36.pyc b/anomalydetection/att_all_you_need/__pycache__/encoder_self_att_predict.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b78c8c0473a30d3a8ac483c60ae4a626baf868b3 GIT binary patch literal 4003 zcmZ`+&2JmW6`$E%l1qwT`m_|wvVtZ?*gAsU91<9YlQfA71diaw4qUKux9gn|wbXK# znjI>Vy1bML&`Xf?pD2)`r~Vl|VQzr&ei!!FwVaAC&OSa_);Dl^{iimIr6#`=rG z7j^VYq9NwcFN>yVpeP(|Rn^JX7z7I?Sau4H`Vi zLLpU;o7x9ylwoMRXd*)6M&n6DSEiqfM&tL;@^z4c6@1^FI)yW2GdARC4w^e-cp=~J zd8QI2St?CcNF7FdrjkxZ<6~3RSrX^E=NdOxk(mPt#-Rveab$d&jKX~XYo^+G$G4OB z?(BSYyc;JnN1w=0rExmg86AI`seCUR?dc={QSw zLal=^NrK~S5~Na!9a-Kr$Yrt{(9^M!B98PH&e$4-`!YB?%nDuE17WW3sV@4+rx||r z(`L2uAtsvi;ugGcifU0SDw@yyp##dNe&G*Y?F}o1C%ACFaAuq{@TxP$nXURxL?~4!v9+R31!Z zs^c&Ttk@KA+sW>|xN z?R%<05G`C%>_OuRD;y}--Bz5|{qyU;|LLP=58sBCtKoPoQvu-CPoW@%sp3k|3e*%Q z8lJ(&m#?r;J_&N*EeUis4)$Ne&gJ>J#~Zv$=kI}cetx=J1=WIs@EdfNMEy z>Q23BMZ>8I2H;fjw087WQPTj`jPEn`hr$ugo4B)Zh4&dNs-lAX*4E&D0ut}jD!@(f zf`9I;v8j)h{tPp${WT!x>5AY3K&{~AlU25aH2|JFtdb-~-mt1`*crRh&Zoev+4t5C zUU|Z_uj|7G>DBfGU1wS^>d>xVYWF#*I3hwniXLRRM`-yH$N;|bv^nMQ864#|&Vl!+ zbbA=X1vU|`oUL2NiNz(If7`fAAtcI1<9lz?ZnSlkoxDDfsZ<1dl|7r}z_L65aG%L! 
z>n^!ZlHbGNY^CuZS=>ePTfS*3R<1W^oUt&z0(gTcKQwM4Q&Y*uNem3gBMpGSi?D@y z1@BZpAVP0f%OHqtHoEp2rcN=-`l#vP6o7Xm1MD@>d!}>hA#kA_4h=slJx?GOo4FEj zft}PT7M8cBcq#LnHwp&V7Om_7nCo`zh0I!G=HtC(rcIDo11M zoqN+NX0@Ytit4a}bb;UppS@+#>g+ozDQd%-s0_I9N<y2RnHgeBOA&JWCbDd9?}x zgYSi7sVKTD@>OeWK?s{=cbj*QHbyt1=gq)d5BP;H+TWpio2(7&SrY_cGeKY3~BcHiJ?Aa*Ik)FB-+%%%3)4WMueh3!$iu z)~cJvQ~ltl_qPye>#OqbA> zrz`Fe2$a9XybRg6}P3s9nZx=^&rvraVEFMR_fcD5XJ}^$V?$);W#9kE5YGiQ7z4afkMGo zm&iCaFNoDggGmYk9ceUZo}zfhGgorZRA~YZXo(D@m2f3 z!MosBA@hPRVYXxo-X>QX`}NkWCDo5HMkyVpBFxnfF)35iFNp1Ax$$jzzm=viQmTGt zr4^*Jk?EF?COS^?Ez(L^t`JoEZ(aUM+PeIewDn(KNwy5JUmPWDsEjquI1j?ZFh&iP zNSh)pHf_;cLX+IqUg?4!=?hQU&j9}}>i$V0@6m4gCWw!eRsI{c@A9U*TJak`{TBSi F{{Z?f1Ni^| literal 0 HcmV?d00001 diff --git a/anomalydetection/att_all_you_need/__pycache__/encoder_self_att_train.cpython-36.pyc b/anomalydetection/att_all_you_need/__pycache__/encoder_self_att_train.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d3109c620c61e435a3a8d7bbd0590da24102242 GIT binary patch literal 7139 zcmbVR%WovdeeUYl^z=M9BxlJbmy&2%ay*JmDRQ2&q-dqp+HqD3+Y+K=yz55qR1ewB zndxCw4VR>P0z@NYiNH=|*tzJG6k_)YYc!;X5 z_w%do_uw}hjqvaO<==m_@f())pVpqCJwFkkl+N}=5Zpg}>wQk+CXmp$AyV6}T-&VI}zN_6;^Ihw%neTdcJ+ZPj!nJH; zW{ID0XJm7C#=33pasN}RyZLR4S9tKL#e>AouFstA7D}o-L`gU=*+xl?*HKcRm)t-} zgEvvqoR{qI72d*l&+t{ghVQd{owxCQj@3ztLx!(D!TGsIs0RdUcGQW7=3*NI&1lt zh}8UKnRSlyj%)>#7VO-f*~odw`9*b}s~{IVvL3NV_9N$!+xIx;|1?n50vNkx&De~g zPl#5v^DF(QzgV+#x5uy|`VkBQgmnZ1g?%jU zetXK4b#9+K=k7VfK;F;nBlm;y9y?6e=R3X=pC$t6v-e)rEb4d!kK?qL=n5D4IG-q8 zIY@cL)1j`6qJt#nQv2gR6&Of}24?*te1kC5%ZKB)zDcW~CHdaj3-d|#q9`4uDvI{r z805WpAm2h_(V?@hJvkWz=E0(wXrsTjM4LZm9e)N`?;-&H*@GeQ%j;jB|OZ zCVW5VlRyE^rnRxnGWAPTpBj7M00`_2~iuM`!)*}W?W z&W}y>zxZ{UxPyZjuy_&W+K55GEhk437f%r`-q}MHKR1-dkRqr5@DMqaj&)}Z416*m(+^FeaqMcaKKwek+xp){0 z{x(X=yYWI6+RwYYZk;uqknOOumBro6+jf{}p?SwAsE^`d5=Adk`03Q}J|Fyt77&1Z_O^HtzIYw2#J4DVgCgQPgL?5MrOe4uv^YvbiUi6( zLnyJ%>a7x{C|UlNSpOP|z7*@eQ3Tj!ML^s{;~pI`{T4_!X#WKYF3^r6v^PMehql~S z_9y1RBc~uefR5MR#??S0@IjrZGjcn)lB-_Hg;#PlVQPJ5nA-L#50Jr+wVn3CO$Orv5E5wm>Uij zbdUnNyj4;zaqI#Y%keKH-r~NlLjMzVlGNG)SGH`Q`S?PfL83Jw&(3zA@^qJP;qLNw zXy6Khc?>T>a4f|ei7YyG27yG+z&2uro&e5t8WcXBfX7Lwl|s(VD$vd#_Mrc}=uFN`W}l&S7$@z2kT=NwiBtiQl1G_I=I%ShGLr`pGB}`={M1 z42-!9U?&$Gb3tH#gKm=4F5uLMTt~di@aI4425f`1?54Zsw%wK+x@XTl4YLc51GWBo zG)HXo7YGZQTQB(uCClHE=KdcdonB1Ooex1@8ia!UT|^foSBmku*-;90droE^>ijuH zL9Z+2M{vPF;wroRmSL3FE*PcbUJ{&3Rr$*~4FbsvzjDb7r2Kj?p?&=;%}1LISUnI_veX(#$EB5+?f)|poK2m5y!L%u zx(yp(>P>w`lxg(wlbw1zLRp!UQ$`9;}8^JA@wZZCORIh2!F|5XhJXtyN(~ zupg66QAO`62#OtA;v9qqF~}K-u7Z^{*R3}CFbfn|p_WK_87YPpUMWYWxtVqJ@<*1c zsxYgOz%yT19iY9i2!XiZ5{X?ghUpkSK#*ic8e_Ol#xULORU_C*M+z57mX)?s{^UJw zq5QvS90HJUoqc1U=p$B1B=QFnso<+Pg5LQc8GM;ttAh^`u+pfPe8G0fl0dsoUE5>c z4Fn0{o;(&rG1?(}!;|Ads$X6i^k$QBRzi-g@d8{RfsWNwc@0?Y2OvYtVRip#9bnhin_PDvnXRe72-N zlD21X`}auD6~acf|GG7`&GRL}?tBER^83~gtUtw7PhAA>WL#K4?!nYYu5$7=1U}qg znLDG>SApk%+lRJz01@qRC{^4o@pyELN(u)iOE80-Y%q8WcC3q1ya#5fO5u}r47__N zGuS=aqZWl{1gn2nGhs|wBU{s#p_dp**$K3p<1V$E$s95K4O|eZi7G`YA~@ibFdi-V z8ywuG7*c6{agB6^-PO`>piak0q;k62h51T1B95pBlih|)hB3q?Y+1T4FFA&|6Ccr3 zSIJ86>?bOk8^DV`4^wqu{C@w94#uTq;g;ICHy1bMnwwj*H%Kf(brU3_dgO_tA8C}d zC39<^WBUqxgZFFW^uPc(DcB`Ny=W9QWzT}lLCmI21=R;G%n%C8it>sTI%ndWpSsf) zDaTN3R*m}8%)%Kl&e~_}(eb#sf+6vW24w0l?iGRR3uWEKoQP-MH@x4*f1@(4CibYtHpM<&YR$|&8&?c zX63-bjbbgkHnYHDH^`NYZyTx2nFY>lqwYGD9i4#7;84!CigoU9g7J#B+AdnzjTyUP z&7hc~-@FEfg$I0cwUa%QJxf^pjL9D$_Z+3+@D9{&0p{I=J9^VX+VrFRbO$T`O3}=o zpMjxyhLD&i{y;H9*lMr#TpO_-{FW|Fi-p65G=%?I$dYxd2{vu$+yJ<=ama8LBrFA) z7UHfMZ{czB`s5|l)w4w8(Y}be4j#n4L-GP?H&qFyk}4T%AL1P9tw?ByPiP1mz9iSM 
zw3kYr!qrJWZ6u`lnCe{_ALDGLBH_Z*p6Wa!22?R8Xycly!4b=oQR$~@%&FWssHASr zv-3ODA?3~4%JKrMBF|MRC^q14N(Rw_?o$DmJg!q$4W|>!7~(0l?)rRSNzq5ncE~s1Q8F&*+#6ih30NIYq={rP~fn zNmrQwO8ZZ8SFIFHW<3t5M!tq_&=T1WY{`}rzz??uy4nCqh4v2Y&25xHLBiiIL=Z|L zdUl!kNA&9Kma;W4e&M~*s0353%$f~|bF}NOFB)>@Ja)>uxF^n&nY5BMvz*g>#2<5<2Z#n zfqNp-)&Bd&e^|Z>@{AZO!RUZSpqB8aJzhG$rM9YrH*kJ%nZHH)g?tH7V2HhrxMc@U bVAGkIKj*p~gloUQ_Vu;zu5AXbVD 0.001: + filter.append(p) + return filter + + +def generate_robust_seq_label(file_path, sequence_length): + num_of_sessions = 0 + input_data, output_data, mask_data = [], [], [] + train_file = pd.read_csv(file_path) + i = 0 + while i < len(train_file): + num_of_sessions += 1 + line = [int(id) for id in train_file["Sequence"][i].split(' ')] + line = line[0:sequence_length] + if len(line) < sequence_length: + line.extend(list([0]) * (sequence_length - len(line))) + input_data.append(line) + output_data.append(int(train_file["label"][i])) + i += 1 + data_set = TensorDataset(torch.tensor(input_data), torch.tensor(output_data)) + return data_set + + +def get_batch_semantic_with_mask(seq, pattern_vec_file): + with open(pattern_vec_file, 'r') as pattern_file: + class_type_to_vec = json.load(pattern_file) + print(seq.shape) + batch_data = [] + mask_data = [] + for s in seq: + semantic_line = [] + for event in s.numpy().tolist(): + if event == 0: + semantic_line.append([-1] * 300) + else: + semantic_line.append(class_type_to_vec[str(event)]) + batch_data.append(semantic_line) + mask = make_src_mask(s, 0) + mask_data.append(mask) + return batch_data, mask_data + + +def do_predict(input_size, hidden_size, num_layers, num_classes, sequence_length, model_path, test_file_path, batch_size, pattern_vec_json, dropout, num_of_heads, pf_dim): + + sequential_model = load_sequential_model(input_size, hidden_size, num_layers, num_classes, model_path, dropout, num_of_heads, pf_dim) + + start_time = time.time() + TP = 0 + FP = 0 + TN = 0 + FN = 0 + + # create data set + sequence_data_set = generate_robust_seq_label(test_file_path, sequence_length) + # create data_loader + data_loader = DataLoader(dataset=sequence_data_set, batch_size=batch_size, shuffle=False, pin_memory=False) + + print('predict start') + with torch.no_grad(): + for step, (seq, label) in enumerate(data_loader): + # first traverse [0, window_size) + batch_data, mask_data = get_batch_semantic_with_mask(seq, pattern_vec_json) + seq = torch.tensor(batch_data) + mask_data = torch.tensor(mask_data) + seq = seq.view(-1, sequence_length, input_size).to(device) + #label = torch.tensor(label).view(-1).to(device) + output = sequential_model(seq, mask_data)[:, 0].clone().detach().cpu().numpy() + predicted = (output > 0.5).astype(int) + label = np.array([y for y in label]) + TP += ((predicted == 1) * (label == 1)).sum() + FP += ((predicted == 1) * (label == 0)).sum() + FN += ((predicted == 0) * (label == 1)).sum() + TN += ((predicted == 0) * (label == 0)).sum() + ALL = TP + TN + FP + FN + # Compute precision, recall and F1-measure + if TP + FP == 0: + P = 0 + else: + P = 100 * TP / (TP + FP) + + if TP + FN == 0: + R = 0 + else: + R = 100 * TP / (TP + FN) + + if P + R == 0: + F1 = 0 + else: + F1 = 2 * P * R / (P + R) + + Acc = (TP + TN) * 100 / ALL + + print('FP: {}, FN: {}, TP: {}, TN: {}'.format(FP, FN, TP, TN)) + print('Acc: {:.3f}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(Acc, P, R, F1)) + print('Finished Predicting') + elapsed_time = time.time() - start_time + print('elapsed_time: {}'.format(elapsed_time)) + + #draw_evaluation("Evaluations", 
['Acc', 'Precision', 'Recall', 'F1-measure'], [Acc, P, R, F1], 'evaluations', '%') \ No newline at end of file diff --git a/anomalydetection/att_all_you_need/encoder_self_att_train.py b/anomalydetection/att_all_you_need/encoder_self_att_train.py new file mode 100644 index 0000000..c9e532e --- /dev/null +++ b/anomalydetection/att_all_you_need/encoder_self_att_train.py @@ -0,0 +1,296 @@ +# -*- coding: UTF-8 -*- +import torch +import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F +import torch.optim as optim +from torch.utils.data import TensorDataset, DataLoader + +import pandas as pd + + +import numpy as np + +import random +import math +import time +import json +import os + +# use cuda if available otherwise use cpu +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +def make_src_mask(src, src_pad_idx): + # src = [batch size, src len] + + src_mask = (src != src_pad_idx) # + + # src_mask = [batch size, src len] # + + return src_mask.clone().detach().numpy().tolist() + + +class Encoder(nn.Module): + def __init__(self, + input_dim, + output_dim, # + hid_dim, + n_layers, + n_heads, + pf_dim, + dropout, + device, + max_length=100): + super().__init__() + + self.device = device + + self.tok_embedding = nn.Linear(input_dim, hid_dim) # + self.pos_embedding = nn.Embedding(max_length, hid_dim) + + self.layers = nn.ModuleList([EncoderLayer(hid_dim, + n_heads, + pf_dim, + dropout, + device) + for _ in range(n_layers)]) + + self.dropout = nn.Dropout(dropout) + + self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device) + + self.output = nn.Linear(hid_dim, output_dim) # + + def forward(self, src, src_mask): + # src = [batch size, src len, input_dim] # + # src_mask = [batch size,1, 1, src len] # + + + batch_size = src.shape[0] + src_len = src.shape[1] + + pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device) + + # pos = [batch size, src len] + + src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos)) + + # src = [batch size, src len, hid dim] + + for layer in self.layers: + src = layer(src, src_mask) + + # src = [batch size, src len, hid dim] + output = self.output(src) # + output = torch.sigmoid(output[:, -1, :]) # + return output + + +class EncoderLayer(nn.Module): + def __init__(self, + hid_dim, + n_heads, + pf_dim, + dropout, + device): + super().__init__() + + self.self_attn_layer_norm = nn.LayerNorm(hid_dim) + self.ff_layer_norm = nn.LayerNorm(hid_dim) + self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device) + self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, + pf_dim, + dropout) + self.dropout = nn.Dropout(dropout) + + def forward(self, src, src_mask): + # src = [batch size, src len, hid dim] + # src_mask = [batch size, src len] + + # self attention + _src, _ = self.self_attention(src, src, src, src_mask) + + # dropout, residual connection and layer norm + src = self.self_attn_layer_norm(src + self.dropout(_src)) + + # src = [batch size, src len, hid dim] + + # positionwise feedforward + _src = self.positionwise_feedforward(src) + + # dropout, residual and layer norm + src = self.ff_layer_norm(src + self.dropout(_src)) + + # src = [batch size, src len, hid dim] + + return src + + +class MultiHeadAttentionLayer(nn.Module): + def __init__(self, hid_dim, n_heads, dropout, device): + super().__init__() + + assert hid_dim % n_heads == 0 + + self.hid_dim = hid_dim + self.n_heads = n_heads + self.head_dim = hid_dim // n_heads + + self.fc_q = 
nn.Linear(hid_dim, hid_dim) + self.fc_k = nn.Linear(hid_dim, hid_dim) + self.fc_v = nn.Linear(hid_dim, hid_dim) + + self.fc_o = nn.Linear(hid_dim, hid_dim) + + self.dropout = nn.Dropout(dropout) + + self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device) + + def forward(self, query, key, value, mask=None): + batch_size = query.shape[0] + + # query = [batch size, query len, hid dim] + # key = [batch size, key len, hid dim] + # value = [batch size, value len, hid dim] + + Q = self.fc_q(query) + K = self.fc_k(key) + V = self.fc_v(value) + + # Q = [batch size, query len, hid dim] + # K = [batch size, key len, hid dim] + # V = [batch size, value len, hid dim] + + Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3) + K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3) + V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3) + + # Q = [batch size, n heads, query len, head dim] + # K = [batch size, n heads, key len, head dim] + # V = [batch size, n heads, value len, head dim] + + energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale + + # energy = [batch size, n heads, query len, key len] + + if mask is not None: + mask = mask.view(batch_size, 1, 1, -1).to(device) + energy = energy.masked_fill(mask == 0, -1e10) + + attention = torch.softmax(energy, dim=-1) + + # attention = [batch size, n heads, query len, key len] + + x = torch.matmul(self.dropout(attention), V) + + # x = [batch size, n heads, query len, head dim] + + x = x.permute(0, 2, 1, 3).contiguous() + + # x = [batch size, query len, n heads, head dim] + + x = x.view(batch_size, -1, self.hid_dim) + + # x = [batch size, query len, hid dim] + + x = self.fc_o(x) + + # x = [batch size, query len, hid dim] + + return x, attention + + +class PositionwiseFeedforwardLayer(nn.Module): + def __init__(self, hid_dim, pf_dim, dropout): + super().__init__() + + self.fc_1 = nn.Linear(hid_dim, pf_dim) + self.fc_2 = nn.Linear(pf_dim, hid_dim) + + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + # x = [batch size, seq len, hid dim] + + x = self.dropout(torch.relu(self.fc_1(x))) + + # x = [batch size, seq len, pf dim] + + x = self.fc_2(x) + + # x = [batch size, seq len, hid dim] + + return x + + +def generate_robust_seq_label(file_path, sequence_length, pattern_vec_file): + num_of_sessions = 0 + input_data, output_data, mask_data = [], [], [] + train_file = pd.read_csv(file_path) + for i in range(len(train_file)): + num_of_sessions += 1 + line = [int(id) for id in train_file["Sequence"][i].split(' ')] + line = line[0:sequence_length] + if len(line) < sequence_length: + line.extend(list([0]) * (sequence_length - len(line))) + input_data.append(line) + output_data.append(int(train_file["label"][i])) + data_set = TensorDataset(torch.tensor(input_data), torch.tensor(output_data)) + return data_set + + +def get_batch_semantic_with_mask(seq, pattern_vec_file): + with open(pattern_vec_file, 'r') as pattern_file: + class_type_to_vec = json.load(pattern_file) + batch_data = [] + mask_data = [] + for s in seq: + semantic_line = [] + for event in s.numpy().tolist(): + if event == 0: + semantic_line.append([-1] * 300) + else: + semantic_line.append(class_type_to_vec[str(event)]) + batch_data.append(semantic_line) + mask = make_src_mask(s, 0) + mask_data.append(mask) + return batch_data, mask_data + + +def train_model(sequence_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, root_path, model_output_directory, data_file, 
pattern_vec_file, dropout, num_of_heads, pf_dim): + print("Train num_classes: ", num_of_classes) + model = Encoder(input_size, num_of_classes, hidden_size, num_of_layers, num_of_heads, pf_dim, dropout, device).to(device) + # create data set + sequence_data_set = generate_robust_seq_label(data_file, sequence_length, pattern_vec_file) + # create data_loader + data_loader = DataLoader(dataset=sequence_data_set, batch_size=batch_size, shuffle=True, pin_memory=False) + + # Loss and optimizer classify job + criterion = nn.BCELoss() + optimizer = optim.Adam(model.parameters()) + + # Training + for epoch in range(num_epochs): + train_loss = 0 + for step, (seq, label) in enumerate(data_loader): + batch_data, mask_data = get_batch_semantic_with_mask(seq, pattern_vec_file) + seq = torch.tensor(batch_data) + #print(seq.shape) + seq = seq.clone().detach().view(-1, sequence_length, input_size).to(device) + #print(seq.shape) + output = model(seq, torch.tensor(mask_data)) + + loss = criterion(output.squeeze(-1), label.float().to(device)) + + # Backward and optimize + optimizer.zero_grad() + loss.backward() + train_loss += loss.item() + optimizer.step() + print('Epoch [{}/{}], training_loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss / len(data_loader.dataset))) + if (epoch + 1) % num_epochs == 0: + if not os.path.isdir(model_output_directory): + os.makedirs(model_output_directory) + e_log = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(epoch+1) + torch.save(model.state_dict(), model_output_directory + '/' + e_log + '.pt') + print('Training finished') \ No newline at end of file diff --git a/anomalydetection/bi_lstm_only/__init__.py b/anomalydetection/bi_lstm_only/__init__.py new file mode 100644 index 0000000..9764abf --- /dev/null +++ b/anomalydetection/bi_lstm_only/__init__.py @@ -0,0 +1 @@ +# -*- coding: UTF-8 -*- \ No newline at end of file diff --git a/anomalydetection/bi_lstm_only/__pycache__/__init__.cpython-36.pyc b/anomalydetection/bi_lstm_only/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d1f1ba59dfc697e4409764c20b836abfe53b45b GIT binary patch literal 180 zcmXr!<>g{qH7!n@fq~&M5W@i@kmUfx#VkM~g&~+hlhJP_LlH4|xXIhDnk#W6nl>5edVVqShOP&g&EBsIAt wGe0jTDKkE&xFk0|KQE^;CO$qhFS8^*Uaz3?7Kcr4eoARhsvXGcVjyM!06elYq5uE@ literal 0 HcmV?d00001 diff --git a/anomalydetection/bi_lstm_only/__pycache__/bi_lstm_predict.cpython-36.pyc b/anomalydetection/bi_lstm_only/__pycache__/bi_lstm_predict.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..58a695c7b45a8649409368a439d9223c6bdbb144 GIT binary patch literal 3244 zcmZuzOK%&=5$>J`Um__=qF$D)rQc!JR$^>`Yz)I#V;vy~yn^F3uz@0HFq&>jLk(xB zo~CWlJtsOx!%qG}PWcl#=a_R}bJ8`)$%iCgH$_P>c8Gpdb#+&DRq?C-s?n&u{pVl* zn9LFKFLLQ~F#iTE-$Ta;qY2rWjC6-u+UdDFuAO;19!4ke`=y-{C7kriXq8@dOvE?L zO&#Wq=}wvXtn`lT)L5BSFwU?lt6{7&`aS8+oI(%T?o#c4moc7bFB-B?yU}168C%Tf zqczdKMa%cmNirsdBdLOYW9P%u!hQ2Zxjko0o{*Pc7oK#b#~kLqBg|v|JEFXvKPJ5r zD~;(BGRB@UEj;D)%F=%7Ja=ZB0c)q0oG0^w7RX%a_ne zBB^u~>A5)D8xwOXcG2ywuYEZjBwV}waG?GDBnzeXb17m_`5=iU?n~3Z5mCoEpANr$ zu>0M~R-Eu0<5M1rG)@n8`zNn6ksoG*-6T5*(=a*7<9zpNcJO#Q57VpWFmBa zbrXp5tFMDNAd0+f7gQ1j=P`2R2~W{KJI9`JfH1DY4c&yNT}L~cKiBk)z3QZEVEI$+ zVG`#u%KC$eF-Av;jMjh5oT=a^dbeBAp4q6qY`{}p6+C1~obp_haDXVAu4=j(I$a6} zkYWgkmxg_=8?WP(Wv>Gmb|4S+JRBPw@sm8r`SFma5zjTr`9?9hpQmpE0F^W)t=|n7~i)lya2Is7cKCr5;iTN$4xThh**f$}_3@ z9z06GD~*wFvdl`o$}~ZhZ&UKc#Ji4e#rwft%wjQ#eVFLlUMQo(Adg2}m~bHkbtGV5 z;%d@k)a;wTSTG3X;e*cGqFWWl>)Mf-E;0Tpj<}vhuCgGPNQl5>4Kg_s(II>SzV`Sl z=;tW51Y_m0krPH4kkIXl=Abv?a7Uss`x& 
zVoX=D-{$;PwR%+gNX6fcc4HH)sizg=Z8KBLc3i&Lr>-59A^%@B-T5E2U}YAtt8HiP zbp34A5%X%v@|Li=XlIKb=`kQ5-Y^>0kN0&9u!L>N?K3fo4}BcN1qVLaq+pGU!|IR1=H!7CmGMoiC{nan0YADH;*`c%jY&{&ukgf5?PwTm6PIh6r{f}>lLf_#Ddx@J?u zI#oL_o@r+b?Zr06ZS6jO`c%(`d#Nc%3F=+QxX{fDhDi}&N{6mS*)Wv>?xE+Rp%5H| zy`Qx>WkZSbrmJTu$eE_kG<^=$_n|t9bp3M3Kd)o8i`+Ny%SY%4 z_3(7?P`sLS1x0MhX*m_p(_am<8oh}XBW2_~XUUek7C0TqUcs+Mo0wJTJZ3cqKlDwU z)ubhychR>&we9XY6)&N^+nA)1ljXW%>-%P!Y9~#_*I3>8^7D7)=H++gIUz+Dr}hP5 zQ%<1P5?zn;Abb_Za9uJ^wxU=2WhwGv# zT2I*Uv}lVC+JWG=*iQEucw>7zTz7BHCX-Me-;uG3@At4jX!4siEB)xrowHV|fL&ROrg$GgOKxUb2OiSPRBkX_YLnOB=PL+{7z6W^BPr zS6EM7%r-FVmzJ>CS?MfjceC_Rw@O#u7IdzidBUMqFficm-G=z>{ZNe#gK;birMvsF zh-EZVah`?gGYd>StuBwfpj<#VP7$SJ-HNm6Oqtu|3Mk9I zA)hbjhW&UPe1~N@Aa1#C2Eix|iy{dA&Yq7yxpeQw@wGUO3bfb3Ll$QT_a?`Wa#4#Z9!L*1}t&)5DaX_L3d6pi(7$_OW+3xgM_CR|TrC3A7 zc#nJd27C>*!+mSsGxpv+QQnBuOFw!*R6ao|uAovZf#DOFFd<`7-&xp+tKjmL^}VY$ zImOt5dHEi>n9AkoP`mR;=7rIo+yDWiOl_%Ldm?%mkD?t%Zekv4Cqr6?7qvCID9_PW zQ@ufpZDaJ>;%{Q2@K7=9JU`FptF>&e?D&p&E;btRK12uN)I#wR2qE}_KI@)USrym^ z3a~%Ii|cl5Z?VcKqO?f(l0ryqwxqK8#@nf7Q$i10;&Kg)Pd)}>OTCPKz@cS&BMDY7o5KMA0OR3)pf}>fO)+@#d zYCe97t8{nih@&V?gI8$paCsK%k`!wm6?wK%sLwln-!pTg6LcZpCy9&-KZfivt84{G zfby(A->e*ARqo?!rIYXlyTa~Ys=U&JYV8GwS{s$GYz5FL{lr_quY^MihdP_33qWaJ zW&ja)gH`U)`8(_m4DdU4lYI(ku387|fLW|+R~^s;xP#tbtM21(%I=cZm!Q=W?yp$Y zA}YcY4T59Qe8Rv(O|P}!SJ;xKry3>vxa36f0h9kY`ttD=<)6%MVjm>9Sh;08Apmpn z@eR1d7FGa*9n9Kh9OZT7iZR1ZP@+|sMORw@4N;LKFDXjpekqaYXk{Hemq#)GNyXRYXOBz*wAw zC4o#qw}C1Yqs+iD*ZS0l0o7}|)ocH+L=wIrq+o~<9h4rvP3|LZIJ||K9{Lw}-?A_> zuzteNad{KFM{PQTAE5}LmVt(E;k0E#;9*p4fj~hIZ39`=2AsEasgGyOkpmz<;0(@e za)=Vi33hr3o-aV53!rG0tZXgps;9iNS9+AZh{jW=TER1ImmS>cKjl?l`AKuZs#VoW z+5m(O9tE$~zUGg8)g^jY*w^g-^XimnR_l+as#mU<$1MO#r%?vgMmb1U@SOT6t6~LI zIRahH$Dr(%1KhV+u9lnSMm>w?yH&1H76>TVLdHk!I(q911_)Zm+G(0Q`t}ZK z@|0+8K>BiBZImm?=7OJP3jia|EmsCiG#^;2))H_O)ZE`=8%|yke?c3LO9Ni!fBL^c~(v9M9HXg&_ThllT zCeb99$9RPvNFb+r87k**U8Q@s?|xpr^Lcgu?QN5}AQcMIyuj6;U)p_dTs`OS%3^SH54(Nuc~*^^KiP?3b++R6*<;8`OTiVG-3`;+iNgicWdj8lXU zp}#^A^on={_y*-7+BqvJA(LBFz05#+OGH>&MYm{dj8@bpY05lTHB1eVC`2jQ+Q5WK zuVEnFG$=uUA+HjPUGj)J2d3o?4I$Dr1xwmBm*`bQg;|u1z~d5Pbldb0aqx#ldymY& z8r>Y37GR~DM%BpbU@^m?ZW*&1o6CqO+qhM`CfI1N<`cMR>=a?!!Rs*At}iew?csvr zm#CQK^G&OT(y{uyXZt8?z_0-j%eVR%BeB7^&9Mq(JBOan~?o1i3?!-k9J`7`M2#?IWVG_UE@r}Ea46o?uWy0)VhK`UVen64T W 0])) + # for i in range(len(line) - window_size): + # inputs.add(tuple(line[i:i+window_size])) + log_keys_sequences.append(tuple(line)) + return log_keys_sequences + + + +def load_sequential_model(input_size, hidden_size, num_layers, num_classes, model_path): + + model1 = Model(input_size, hidden_size, num_layers, num_classes, if_bidirectional=True, batch_size=0).to(device) + model1.load_state_dict(torch.load(model_path, map_location='cpu')) + model1.eval() + print('model_path: {}'.format(model_path)) + return model1 + + +def filter_small_top_k(predicted, output): + filter = [] + for p in predicted: + if output[0][p] > 0.001: + filter.append(p) + return filter + +def do_predict(input_size, hidden_size, num_layers, num_classes, window_length, model_path, anomaly_test_line_path, test_file_path, num_candidates, pattern_vec_file): + vec_to_class_type = {} + with open(pattern_vec_file, 'r') as pattern_file: + i = 0 + for line in pattern_file.readlines(): + pattern, vec = line.split('[:]') + pattern_vector = tuple(map(float, vec.strip().split(' '))) + vec_to_class_type[pattern_vector] = i + i = i + 1 + + sequential_model = load_sequential_model(input_size, hidden_size, num_layers, num_classes, model_path) + + start_time = time.time() + TP = 0 + FP = 0 + TN = 0 + FN = 0 + 
ALL = 0 + abnormal_loader = generate(test_file_path, window_length) + abnormal_label = [] + with open(anomaly_test_line_path) as f: + abnormal_label = [int(x) for x in f.readline().strip().split()] + print('predict start') + with torch.no_grad(): + count_num = 0 + current_file_line = 0 + for line in abnormal_loader: + i = 0 + # first traverse [0, window_size) + while i < len(line) - window_length: + lineNum = current_file_line * 200 + i + window_length + 1 + count_num += 1 + seq = line[i:i + window_length] + label = line[i + window_length] + for n in range(len(seq)): + if current_file_line * 200 + i + n + 1 in abnormal_label: + i = i + n + 1 + continue + seq = torch.tensor(seq, dtype=torch.float).view(-1, window_length, input_size).to(device) + #label = torch.tensor(label).view(-1).to(device) + output = sequential_model(seq) + predicted = torch.argsort(output, 1)[0][-num_candidates:] + predicted = filter_small_top_k(predicted, output) + print('{} - predict result: {}, true label: {}'.format(count_num, predicted, vec_to_class_type[tuple(label)])) + if lineNum in abnormal_label: ## if an abnormal log appears, skip the abnormal logs in the following predictions so that only normal logs are used for prediction + i += window_length + 1 + else: + i += 1 + ALL += 1 + if vec_to_class_type[tuple(label)] not in predicted: + if lineNum in abnormal_label: + TP += 1 + else: + FP += 1 + else: + if lineNum in abnormal_label: + FN += 1 + else: + TN += 1 + current_file_line += 1 + # Compute precision, recall and F1-measure + if TP + FP == 0: + P = 0 + else: + P = 100 * TP / (TP + FP) + + if TP + FN == 0: + R = 0 + else: + R = 100 * TP / (TP + FN) + + if P + R == 0: + F1 = 0 + else: + F1 = 2 * P * R / (P + R) + + Acc = (TP + TN) * 100 / ALL + + print('FP: {}, FN: {}, TP: {}, TN: {}'.format(FP, FN, TP, TN)) + print('Acc: {:.3f}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(Acc, P, R, F1)) + print('Finished Predicting') + elapsed_time = time.time() - start_time + print('elapsed_time: {}'.format(elapsed_time)) + + #draw_evaluation("Evaluations", ['Acc', 'Precision', 'Recall', 'F1-measure'], [Acc, P, R, F1], 'evaluations', '%') \ No newline at end of file diff --git a/anomalydetection/bi_lstm_only/bi_lstm_train.py b/anomalydetection/bi_lstm_only/bi_lstm_train.py new file mode 100644 index 0000000..c242a7b --- /dev/null +++ b/anomalydetection/bi_lstm_only/bi_lstm_train.py @@ -0,0 +1,116 @@ +# -*- coding: UTF-8 -*- +# -*- coding: UTF-8 -*- +import torch +import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F +import os +from tensorboardX import SummaryWriter +from torch.utils.data import TensorDataset, DataLoader + +# use cuda if available otherwise use cpu +from torch.autograd import Variable + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +class Model(nn.Module): + def __init__(self, input_size, hidden_size, num_of_layers, out_size, if_bidirectional, batch_size): + super(Model, self).__init__() + self.hidden_size = hidden_size + self.num_of_layers = num_of_layers + self.lstm = nn.LSTM(input_size, hidden_size, num_of_layers, batch_first=True, bidirectional=if_bidirectional) + self.fc = nn.Linear(hidden_size*2, out_size) + self.batch_size = batch_size + if if_bidirectional: + self.num_of_directions = 2 + else: + self.num_of_directions = 1 + + + # self.out = nn.Linear(in_features=in_features, out_features=out_features) + + + def init_hidden(self, size): + # size self.batch_size same + h0 = torch.zeros(self.num_of_layers*self.num_of_directions, size, self.hidden_size).to(device) + c0 = 
torch.zeros(self.num_of_layers*self.num_of_directions, size, self.hidden_size).to(device) + return (h0, c0) + + def forward(self, input): + # h_n: hidden state h of last time step + # c_n: hidden state c of last time step + out, _ = self.lstm(input, self.init_hidden(input.size(0))) + # out shape [batch, seqlen, numdirec*hidden] + out = out[:, -1, :] + # tmp1, tmp2 = out.split(self.hidden_size, 1) + out = self.fc(out) + # print('out[:, -1, :]:') + # print(out) + return out + + +def generate_seq_label(file_path, window_length, pattern_vec_file): + vec_to_class_type = {} + with open(pattern_vec_file, 'r') as pattern_file: + i = 0 + for line in pattern_file.readlines(): + pattern, vec = line.split('[:]') + pattern_vector = tuple(map(float, vec.strip().split(' '))) + vec_to_class_type[pattern_vector] = i + i = i + 1 + num_of_sessions = 0 + input_data, output_data = [], [] + with open(file_path, 'r') as file: + for line in file.readlines(): + num_of_sessions += 1 + line = tuple(map(lambda n: tuple(map(float, n.strip().split())), [x for x in line.strip().split(',') if len(x) > 0])) + if len(line) < window_length + 1: + #print(line) + continue + for i in range(len(line) - window_length): + input_data.append(line[i:i + window_length]) + # line[i] is a list need to read file form a dic{vec:log_key} to get log key + output_data.append(vec_to_class_type[line[i + window_length]]) + data_set = TensorDataset(torch.tensor(input_data, dtype=torch.float), torch.tensor(output_data)) + return data_set + + +def train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, root_path, model_output_directory, data_file, pattern_vec_file): + # log setting + log_directory = root_path + 'log_out/' + log_template = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) + + print("Train num_classes: ", num_of_classes) + model = Model(input_size, hidden_size, num_of_layers, num_of_classes, True, batch_size).to(device) + # create data set + sequence_data_set = generate_seq_label(data_file, window_length, pattern_vec_file) + # create data_loader + data_loader = DataLoader(dataset=sequence_data_set, batch_size=batch_size, shuffle=True, pin_memory=False) + writer = SummaryWriter(logdir=log_directory + log_template) + + # Loss and optimizer classify job + criterion = nn.CrossEntropyLoss() + optimizer = optim.Adam(model.parameters()) + + # Training + for epoch in range(num_epochs): + train_loss = 0 + for step, (seq, label) in enumerate(data_loader): + seq = seq.clone().detach().view(-1, window_length, input_size).to(device) + output = model(seq) + + loss = criterion(output, label.to(device)) + + # Backward and optimize + optimizer.zero_grad() + loss.backward() + train_loss += loss.item() + optimizer.step() + print('Epoch [{}/{}], training_loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss / len(data_loader.dataset))) + if (epoch + 1) % num_epochs == 0: + if not os.path.isdir(model_output_directory): + os.makedirs(model_output_directory) + e_log = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(epoch+1) + torch.save(model.state_dict(), model_output_directory + '/' + e_log + '.pt') + writer.close() + print('Training finished') \ No newline at end of file diff --git a/anomalydetection/deeplog/Model1/__pycache__/__init__.cpython-36.pyc b/anomalydetection/deeplog/Model1/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..185d304783c47d18f0183c2924f193e1688837a2 GIT binary patch literal 182 
zcmXr!<>lHt%Q=n#2p)q77+?f49Dul(1xTbY1T$zd`mJOr0tq9CU&+o^F}{^lIYq;;_lhPbtkwwF6mR48#lo$OSYs literal 0 HcmV?d00001 diff --git a/anomalydetection/deeplog/Model1/__pycache__/__init__.cpython-37.pyc b/anomalydetection/deeplog/Model1/__pycache__/__init__.cpython-37.pyc index 444811b310f9bc3af53773a8b790a6dfaab45484..956f3f953ba3843c46234b7294ee6f9eb7bc5146 100644 GIT binary patch delta 60 zcmX@bxPXz{iIEq%@^6COJPPHO41D-7zmQr?NP+I6g16 Md}6m1>t~R106v@($N&HU delta 102 zcmZ3$c#4tRiI3$JYBN<>Dpx=>dB7ICp)@hOch@2UH@eF rwr5+rU#y=RG`wAV`F15tdsT4C@TZXN*fUtL12>`gOmk{RmW_rnX0k7>FI8& zY5<#_T%?Ki3Wcj&MIK>qlCP>w77;1ks=rfJr%s)p@6^wS!(Q;W zKYv@T67o;-!RMiW3q}4O6(fx1Bx+llIgw*2ZstWEdQR@ogD9Y8-pxDnFbe10sB6bG z??pXMvOdZn8#Y9I&Ah^4enX@FZwL!m=QUv+?qn;C8?CaCbzhTc#Coicc8$>oWHNXM z8OZ$!)x!si`8*LPk3_1tm^ga)kQcHPcN3LJu5{0I4$6dap}pyXC6PO=7E`nR{pX%_ zyiq@oY zGN&Pw)J{!ew{}r`%FlwDg5nzXZ#>XCpoO4#jak=a1V^~{A5Nf;CyAOK$Fo#OHR)+z zE-EgBIZ20coEE8yW8FVa8RJDP(^Ia8#bO?pvp7#qxRBZ@ipV>7@bI4Ya;fG~aF7-} z5!#teaSKoKe5QM8Q7x3+F1lbXdZ?oQ8KbBCL_*KqIG*N-lyUr5^27Yj^}~B7zfN;5 z(H?+dkrqdX^OF~)kjG_pn3qRMk>n>bm4^rA(e3yBq$uYgGp_hlrDbu*IIpny(5myA z*%#x$*hm~Jk)*|bbs|E@GeD6>FhW;ofPaIYuGn4s?-en1%rriuA5-q2$O@Gr8LW{x zHO-*)#`L|$&3qL!bm{!xS=cP=wDy`gjdSQMy`>LYSQ|ele{|p+ww<}#1l+vZgY~@o zg5unFw)!`|Nmu)-6w_nvpK?)3t7GUNCbK%%jw*G)`15qig}JL3qM8I^1&#Ktg|u^g zRXfwG+ASAK(^#yTb+#ItRh?NWUL=C8W6-L~xlAw94QGd*4qH{7t(XMHeTrnx;Xb`M zo|kNqoBkkv{Si4L4q3WOPmvn9(Kvtl!_t3wTKgGo$X)WdSOzt8`~`uX9ECt+{%fLK z<<&vvHzW%hL>Yd%3|`(+9TnEFX-&_=7TEpSvQxWRw+`wKe!2yF87mcdwJx|o6X3=bd?#M3m5LPNy$)W_-=|L@%|K`6P2icpXC=m01X(p|cVe~<1^@e9mM zjfuUdkDd1W#^s;MQ3~i6Oa5tE8i!g5s@KC#^zwFnP`Dh3AU^!G&Ihk9- zwHnF-P*;@4I`vB9P@;V6QXxiO7N`!~ZxyaLtbI7gNOkKG;2|`r>y5WuQ@y&!dRwIK z2` zR=tyLHRKDj+``;>Lwoj*M+Em>$+qiFHiYbMy=COfE;RHaX&@VdXq;VSs~^(awT+88 zyNiuB2^&0h#P`tUPIjs8HuNo(f5GUo>El80u_f$2YmH0llWZ5-`AI|SuGQZq=x=Qc z0T21Ue`L=8sgV_Ojx8?}W*hgT0C>jpYmtX^a_w|H0&He0Ggz+qB2!_1%XPSNDG8qdW3=F`MN)>Q!kG&-uI*CqK|D zh{*s9St@J}u+VV2_mw$u_jj+BS6(e2f3|CL8**lxmlCXBUElwFw$wdKwzPN5h~e&6 z>U7P@g2lTtWPEwdnb^SOvfkyBNmmkKg~;oF>;nl zkk-8CDSr`lO>&EmM8b6VB$+-n=}3Ecnaz>w6t8rE3^Y%a?wHU=o@yL_Ob7)#BgNQ7h0>2=DK*ETnS!@atbsmg{6)S zb*7ZyJO&&YTGTV9wQeBAb)%2ZO@%iv`QQz-*>ZjlVX45Y1xAjBh9S6|4F_h3J61V_ zi&k4BTDUZcH0oNIGvoGAb>>jox7?#O;QN9XQ(Wr|4x_&90eH=;HF#m4wo!L#TOh3N zS|gdaL)25HO7hq!H}dU$#7%RF161%2{!IHecj%zK1<)5}#!`6M(Z{$uXhm&>^Zr}c323XcY< zjFaa{id!Ss1o!OIri}#~0Y;P$?)~HQDZGJu;~8|*q;6?mG$AfvQq72l?s;cC4p;sI DiU5WE literal 0 HcmV?d00001 diff --git a/anomalydetection/deeplog/Model1/__pycache__/log_key_LSTM_train.cpython-37.pyc b/anomalydetection/deeplog/Model1/__pycache__/log_key_LSTM_train.cpython-37.pyc index 692a8640be5778d0dd2052caf387a37c293da659..7a80e57ec597d9be72e267a7604d493cc7030848 100644 GIT binary patch delta 858 zcmYjOOKTKC5bl~+b|#yLNzCfT#V7$;A8d3*i81)TMLdWK?P}<3x0^AuGn1ZPb!CEw zJu66q2T!{Q9z2Mmc<|sK@TjK&e}JGTJ$Vpoj0bz@ZZs-nJnReoC@Jv%bu5jM9?v@(7J)ron+rN-JkHJbBOzcbrV z)^pYs_@?h$<*}EW_Pyt+-G>1=Pi{Qb&#e7N)KKd5AO&ug$U2d$8*{0{oz_2QE4qVB zkrLT-?wwWzHKadT3-Cyf*p=-=4-GZkFQ(Q2dd6lUEnFtmf<9$mg&+Efz5jun!~v>6 zZLYepsE%e<6pJu(rC>bpLy=@w;3q0G+bqsZKUA5WDCx)eivu6|0M)i+VM}BVi!ql= z6HCe|1nMXeCXvkSTS3HB)urw&KM*T1Q)}7CT|eY{(NTMMp_G6H9u#2|3NQ=fU_cqB z@tcEPAnR{FIt^HKN+dPYl7e*G?Q#!h$iVCYkaVIy!^i&T(`k;{fr+udFFvkU3-wdT z^#EO}sNMb9bgaK;KvI``ShZ3jd>dvMKsMfA%Pa035FpCp6L|+t(6veZxp1H~Ra27r zVJ(PS>!Mo=BiUwwoYmXieYo5ucPHXC_qf*X;pzb#D>I@b8_F%@KKc+#v`{)2kkdJo za;WR??%^fs#Z0ocP(mj1V4muVu+tWjDM9UK5QXS47m794sCCB|cd2_{$Y`Y{8K?TX zSHaF*Zyxc5*Km&Jg=@Mv93!zgys$*wC{}*k#~kXZNU>lgQ6i?ch@H{o+12sUH{X n^9!il8qmfUE&9m{yTg2}90=-MjCdyyTUOL_<+A=+T!e`~5-;pi delta 758 
zcmYjOO-K|`9H0N}&bTu>?)tH=vbv@s$SUHBNUS23WDw{nDv+&p+`;iOmvjpkqE2Pe2T=rFy9Ckg(NV*W9r~a{^oBb09`E=6{&@fYv()2MY^JeM75HiE z+r5&EKZ&KqbKf?(JUfv;y7%Dd;r)OdzPxq#awk9DbNuGu@b!z&&v%Xw_VROX>9nI; zRp0dg=ZF2W>7asv*6CUCH!wOInoyWn&N8u>W%OpqhHm;jbOGMdxo|3eeM{QdtV#P& z7tRQm9a;$w${VLh1BuWN;T$Z{sFZG#cSRE2*J|N9Y>QiwTI9TtX`oZmMeyluX>c_I zOfpE-Lzx&0yB>0wf{|`myb+&Cq~ckKg>2npA=4pD@(DIQPCU!x64sp(H(NYP);%6< z;1HhxPHORK0l@|P3{)=lS&NV}DRPS)U;E6d9kT2MXHDcHz%arGCB@X^pk z<)noBdCT;UG74GxPDw}GxrjRg$kDIL=oC{u9qTqC2>W=1>zRU_ijA;N5R;0Q>mVi@ zi0H*K3$L5#>VmR~uv;i$-C(q!rg@z$HOujlnwKYo#N)In>T$)jXl{xru18GU zafxmfe1iBAkmnZ5cslT(52}WIw8kV$UqTj>-3sw4gwY?-3=Gj;tu+cQeCP@Ont)Dg p!>~wgZ2)HJo;D`W1W7vRdo7t@^1N$QEQIAimkVH#{?sNR_6M}n@8|#k diff --git a/anomalydetection/deeplog/Model1/log_key_LSTM_train.py b/anomalydetection/deeplog/Model1/log_key_LSTM_train.py index 0f222bc..05188e0 100644 --- a/anomalydetection/deeplog/Model1/log_key_LSTM_train.py +++ b/anomalydetection/deeplog/Model1/log_key_LSTM_train.py @@ -32,25 +32,33 @@ def generate_seq_label(file_path,window_length): with open(file_path, 'r') as file: for line in file.readlines(): num_of_sessions += 1 - line = tuple(map(lambda n: n, map(int, line.strip().split()))) + line = list(map(lambda n: n, map(int, line.strip().split()))) + if(len(line)lHp%Q=n#2p)q77+?f49Dul(1xTbY1T$zd`mJOr0tq9CU&+o^F}{^lIYq;;_lhPbtkwwF6mR48#lo$k#MB literal 0 HcmV?d00001 diff --git a/anomalydetection/deeplog/Model2/__pycache__/__init__.cpython-37.pyc b/anomalydetection/deeplog/Model2/__pycache__/__init__.cpython-37.pyc index d68d822ad074bb8f4c305f7492c43f1bfda4f958..20ebbfa38ac7a99167c63e980a86fd4fa6a68666 100644 GIT binary patch delta 60 zcmX@bxPXz{iIEq%@^6COJPPHO41D-7zmQr?NP+I6g16 Md}6m1>t~R106v@($N&HU delta 102 zcmZ3$c#4tRiI3$JYBN<>Dpx=>dB7ICp)@hOch@2UH@eF rwr5+rU#y=R@o zyz{IRbYhlv(Ryir%H$Wq$t~eddC>ig2~V`1G0}>xbY^M?v%(kcXDpZ#9nr-&FZedw z?7f5x?A|8V{q2L%D3ZqyWTIlZY3cs`I4=u%J5o^@E8U^Vogxyk)Xv~QMA{yV4+gaT z)z3belpC1DEZ~9#)>mxE1sB#cKd^-@931U@#T*v6INsIn-9p6Kz&+VYqlZSU%3J6Z zQ@mnRE48MKv&yPiXje9RN4cq2aZqex|J(ts1)2|vGo^KH!El6q?>=tV-Hp^>KO82q zRGS^`mIvcl$`%gLei$ZsqQX#j_Y)!FJS>w*to!-FC@h9y79Gd3)K;Er=g#*1yV}W0 zHPYUlB#$Ght>FN-aLPCv>Q0i650u$1yI?JQhJSHFgBa~F9Ar^hhT&h>xB166cJ3a3 znq+Z_@ea7TN7-?ilsk8dy!N? 
log_vector4
     # so each element of inputs is a sequence,and each element of that sequence is a sequence too
     # nn's output is the prediction of parameter value vector
-        if len(x) < 2*num_of_layers:
-            flag = 1
-    for i in range(len(vectors) - window_length):
-        inputs.append(vectors[i: i + window_length])
-        outputs.append(vectors[i + window_length])
-    # print(inputs)
-    # print(inputs[0])
+
+        # if len(x) < 2*num_of_layers:
+        #     flag = 1
+
     data_set = TensorDataset(torch.tensor(inputs, dtype=torch.float), torch.tensor(outputs))
-    if len(vectors) > 0 and flag==0:
-        return data_set, len(vectors[0])
+
+    if len(inputs) > 0 and flag == 0:
+        return data_set, 10
     else:
         return None, 0
 
 def train_model2(model_dir,log_preprocessor_dir,num_epochs,batch_size,window_length,num_of_layers,learning_rate,hidden_size):
-    log_value_folder = log_preprocessor_dir + 'logvalue_train/'
+    log_value_folder = log_preprocessor_dir + '/train/logvalue/normal/'
     model_output_directory = model_dir + 'model2/'
     log_template = 'model2_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs)
     file_names = os.listdir(log_value_folder)
     for i in range(len(file_names)):
         print(i)
-        file_name = str(i+1) + ".txt"
+        file_name = str(i+1)
         train_data_set_name = log_value_folder + file_name
         validation_data_set_name = train_data_set_name
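The hunk above stops inferring the per-key value-vector length from the data and hard-codes it to 10, and it repoints training at the new /train/logvalue/normal/ layout. For reference, a minimal sketch of the window/next-vector pairing this function performs before handing a TensorDataset to Model2; the helper name, the `vectors` argument, and the fixed `value_dim=10` are illustrative assumptions, not the project's actual API:

import torch
from torch.utils.data import TensorDataset

def make_value_windows(vectors, window_length, value_dim=10):
    # Slide a window over the per-key parameter value vectors and pair each
    # window with the vector that immediately follows it (Model2's target).
    inputs, outputs = [], []
    for i in range(len(vectors) - window_length):
        inputs.append(vectors[i:i + window_length])
        outputs.append(vectors[i + window_length])
    if not inputs:
        return None, 0
    dataset = TensorDataset(torch.tensor(inputs, dtype=torch.float),
                            torch.tensor(outputs, dtype=torch.float))
    # value_dim mirrors the hard-coded length 10 returned by the patched hunk (assumption).
    return dataset, value_dim

Keeping the dimension as an explicit argument avoids silently mismatching the LSTM input size when a key's value vectors are not 10-dimensional.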
diff --git a/anomalydetection/deeplog/log_predict.py b/anomalydetection/deeplog/log_predict.py
index f9c348a..59c5215 100644
--- a/anomalydetection/deeplog/log_predict.py
+++ b/anomalydetection/deeplog/log_predict.py
@@ -8,11 +8,12 @@ import torch.nn as nn
 import os
 import matplotlib.pyplot as plt
+from collections import Counter
 
 # use cuda if available otherwise use cpu
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-pattern2value = []
+# record the length of the value vector for each log key
+value_length_of_key = []
 
 # subclass of the Enum class
 class LineNumber(Enum):
@@ -20,35 +21,44 @@ class LineNumber(Enum):
     NUMBERS_LINE = 3
 
 
+
 def generate(name,window_length):
-    log_keys_sequences = list()
+    log_keys_sequences=list()
+    length=0
     with open(name, 'r') as f:
         for line in f.readlines():
             line = list(map(lambda n: n, map(int, line.strip().split())))
             line = line + [-1] * (window_length + 1 - len(line))
             # for i in range(len(line) - window_size):
             #     inputs.add(tuple(line[i:i+window_size]))
+            # log_keys_sequences[tuple(line)] = log_keys_sequences.get(tuple(line), 0) + 1
             log_keys_sequences.append(tuple(line))
-    return log_keys_sequences
+            length+=1
+    return log_keys_sequences,length
 
-def value_log_cluster(log_preprocessor_dir):
-    log_value_folder_cluster = log_preprocessor_dir + 'logvalue_test/'
-    file_names = os.listdir(log_value_folder_cluster)
-    pattern2value.append([])
-    for i in range(len(file_names)):
-        pattern2value.append([])
-        with open(log_value_folder_cluster + str(i+1) + ".txt", 'r') as in_text:
-            for line in in_text.readlines():
-                line = list(map(lambda n: n, map(float, line.strip().split())))
-                pattern2value[i+1].append(line)
+def get_value_length(log_preprocessor_dir,log_fttree_out_dir):
+    global value_length_of_key
+    value_length_of_key = [10]*(len(os.listdir(log_fttree_out_dir)) + 1)
+
log_value_folder = log_preprocessor_dir + '/train/logvalue/normal/' + file_names = os.listdir(log_value_folder) + # for i in range(len(file_names)): + # with open(log_value_folder + str(i+1), 'r') as f: + # x = f.readlines() + # if len(x) == 0 or x[0].strip('\n') == '-1': + # value_length_of_key.append(0) + # else: + # line = x[0].strip('\n') + # key_values = line.split(' ') + # value_length_of_key[i+1] = len(key_values[0].split(',')) -def load_model1(model_dir,input_size, hidden_size, num_layers): - num_classes = len(pattern2value) + 1 +def load_model1(model_dir,model_name,input_size, hidden_size, num_layers): + num_classes = len(value_length_of_key) + # num_classes = 28 print("Model1 num_classes: ", num_classes) model1_dir = model_dir + 'model1/' - model_path = model1_dir + 'Adam_batch_size=200;epoch=300.pt' + model_path = model1_dir + model_name model1 = Model1(input_size, hidden_size, num_layers, num_classes).to(device) model1.load_state_dict(torch.load(model_path, map_location='cpu')) model1.eval() @@ -56,16 +66,16 @@ def load_model1(model_dir,input_size, hidden_size, num_layers): return model1 -def load_model2(model_dir,input_size, hidden_size, num_layers): +def load_model2(model_dir,epoch,input_size, hidden_size, num_layers): model2_dir = model_dir+ 'model2/' model2 = [] - for i in range(len(pattern2value)): - if len(pattern2value[i]) == 0: + for i in range(len(value_length_of_key)): + if value_length_of_key[i] == 0: model2.append(None) continue - input_size = len(pattern2value[i][0]) + input_size = value_length_of_key[i] out_size = input_size - model_name = str(i+1) + '_epoch=50.pt' + model_name = str(i+1) + '_epoch=' + str(epoch)+ '.pt' model_path = model2_dir + str(i+1) + '/' + model_name if not os.path.exists(model_path): model2.append(None) @@ -90,18 +100,21 @@ def draw_evaluation(title, indexs, values, xlabel, ylabel): plt.show() -def do_predict(log_preprocessor_dir,model_dir,window_length,input_size, hidden_size, num_layers,num_candidates,mse_threshold): - abnormal_label_file = log_preprocessor_dir + 'HDFS_abnormal_label.txt' +def do_predict(log_preprocessor_dir,log_fttree_out_dir,model_dir,model1_name,model2_num_epochs,window_length,input_size, hidden_size, num_layers,num_candidates,mse_threshold,use_model2): + # abnormal_label_file = log_preprocessor_dir + 'HDFS_abnormal_label.txt' + + get_value_length(log_preprocessor_dir,log_fttree_out_dir) - value_log_cluster(log_preprocessor_dir) - model1 = load_model1(model_dir,input_size, hidden_size, num_layers) - model2 = load_model2(model_dir,input_size, hidden_size, num_layers) + model1 = load_model1(model_dir, model1_name, input_size, hidden_size, num_layers) + + model2 = load_model2(model_dir,model2_num_epochs,10, hidden_size, num_layers) # for Model2's prediction, store which log currently predicts for each log_key. # When model one predicts normal, model2 makes predictions. # At this time, the forward few logs with the same log_key are needed to be predicted # so the pattern_index is used to record the log_key to be predicted. 
- pattern_index = [0]*len(pattern2value) + #pattern_index = [0]*len(pattern2value) + #pattern_index = [0] * 63 start_time = time.time() criterion = nn.MSELoss() TP = 0 @@ -109,111 +122,176 @@ def do_predict(log_preprocessor_dir,model_dir,window_length,input_size, hidden_s TN = 0 FN = 0 ALL = 0 - abnormal_loader = generate(log_preprocessor_dir+ 'logkey/logkey_test',window_length) - abnormal_label = [] - with open(abnormal_label_file) as f: - abnormal_label = [int(x) for x in f.readline().strip().split()] + test_normal_loader, test_normal_length = generate(log_preprocessor_dir+ '/test/logkey/normal',window_length) + test_abnormal_loader, test_abnormal_length=generate(log_preprocessor_dir+'/test/logkey/abnormal',window_length) + + print('predict start') + + #normal test with torch.no_grad(): - count_num = 0 - current_file_line = 0 - for line in abnormal_loader: - i = 0 - # first traverse [0, window_size) - for ii in range(window_length): - if ii < len(line): - pattern_index[line[ii]] += 1 - while i < len(line) - window_length: - lineNum = current_file_line * 10 + i + window_length + 1 - count_num += 1 - seq = line[i:i + window_length] + count = 1 + for line_num,line in enumerate(test_normal_loader): + model1_success=False + for i in range(len(line) - window_length-1): + seq0 = line[i:i + window_length] label = line[i + window_length] - seq = torch.tensor(seq, dtype=torch.float).view(-1, window_length, input_size).to(device) + + + seq0 = torch.tensor(seq0, dtype=torch.float).view( + -1,window_length,input_size).to(device) label = torch.tensor(label).view(-1).to(device) - output = model1(seq) - predicted = torch.argsort(output, 1)[0][-num_candidates:] - print('{} - predict result: {}, true label: {}'.format(count_num, predicted, label)) - now_pattern_index = pattern_index[label] - if lineNum in abnormal_label: ## 若出现异常日志,则接下来的预测跳过异常日志,保证进行预测的日志均为正常日志 - for j in range(window_length + 1): - if i + window_length + j < len(line) and line[i + window_length + j] < len(pattern_index): - pattern_index[line[i + window_length + j]] += 1 - else: + output = model1(seq0) + predicted = torch.argsort(output, + 1)[0][-num_candidates:] + if label not in predicted: + FP += 1 + model1_success=True + break + if(model1_success): + continue + + + #如果模型二预测normal TN+1 否则FP+1 + + #现在有63个预测normal value 文件 对一个line 找对应的 value normal下的行 进行预测 + + # When model one predicts normal, model2 makes predictions. 
+ # values:all log's value vector belongs to log_key(whose id is pattern_id) + # 是否使用模型二 + if use_model2: + + seq=[] #得到63个normal预测文件下的这个window的seq + for i in range(31): + with open(log_preprocessor_dir+'/test/logvalue/normal/'+str(i+1),'r')as f: + key_values=f.readlines() + key_values=key_values[line_num].strip('\n') + if(key_values=='-1'): + continue + seq.append(key_values.split(' ')) + #将字符串转为数字 + for k1 in range(len(seq)): + for k2 in range(len(seq[k1])): + seq[k1][k2]=seq[k1][k2].strip('\n') + seq[k1][k2]=seq[k1][k2].split(',') + for k3 in range(len(seq[k1][k2])): + if(seq[k1][k2][k3]!=''): + seq[k1][k2][k3]=float(seq[k1][k2][k3]) + + #补全 + for i in range(len(seq)): + if(len(seq[i]) mse_threshold: + FP+=1 + model2_success=True break - i += window_length + 1 - else: - pattern_index[label] += 1 - i += 1 - ALL += 1 + if(model2_success): + break + + + #abnormal test + with torch.no_grad(): + for line_num,line in enumerate(test_abnormal_loader): + model1_success=False + for i in range(len(line) - window_length): + seq0 = line[i:i + window_length] + label = line[i + window_length] + + seq0 = torch.tensor(seq0, dtype=torch.float).view( + -1, window_length, input_size).to(device) + + label = torch.tensor(label,).view(-1).to(device) + output = model1(seq0) + predicted = torch.argsort(output, + 1)[0][-num_candidates:] if label not in predicted: - if lineNum in abnormal_label: - TN += 1 - else: - FN += 1 - # else: - # if lineNum in abnormal_label: - # FP += 1 - # else: - # TP += 1 - else: - # When model one predicts normal, model2 makes predictions. - # values:all log's value vector belongs to log_key(whose id is pattern_id) - values = pattern2value[label] - vi = now_pattern_index - if vi >= window_length and vi < len(values): - # Model2 testing - seq2 = values[vi - window_length:vi] - label2 = values[vi] - seq2 = torch.tensor(seq2, dtype=torch.float).view(-1, window_length, len(seq2[0])).to(device) - label2 = torch.tensor(label2).view(-1).to(device) - mse = 0 - if label < len(model2) and model2[label] != None: - output = model2[label](seq2) - # Calculate the MSE of the prediction result and the original result. 
- # If the MSE is within the confidence interval of the Gaussian distribution, the log is a normal log - mse = criterion(output[0], label2.to(device)) - - if mse < mse_threshold: - print(mse, mse_threshold) - if lineNum in abnormal_label: - FP += 1 - else: - TP += 1 - else: - if lineNum in abnormal_label: - TN += 1 - else: - FN += 1 - else: - if lineNum in abnormal_label: - FP += 1 - else: - TP += 1 - current_file_line += 1 - # Compute precision, recall and F1-measure - if TP + FP == 0: - P = 0 - else: - P = 100 * TP / (TP + FP) + TP += 1 + model1_success=True + break + if(model1_success): + continue + + # 是否使用模型二 + if use_model2: + seq=[] #得到63个normal预测文件下的这个window的seq + for i in range(31): + with open(log_preprocessor_dir+'/test/logvalue/abnormal/'+str(i+1),'r')as f: + key_values=f.readlines() + key_values=key_values[line_num].strip('\n') + if(key_values=='-1'): + continue + seq.append(key_values.split(' ')) + #将字符串转为数字 + for k1 in range(len(seq)): + for k2 in range(len(seq[k1])): + seq[k1][k2]=seq[k1][k2].strip('\n') + seq[k1][k2]=seq[k1][k2].split(',') + for k3 in range(len(seq[k1][k2])): + if(seq[k1][k2][k3]!=''): + seq[k1][k2][k3]=float(seq[k1][k2][k3]) + + #补全 + for i in range(len(seq)): + if(len(seq[i]) mse_threshold: + TP += 1 + model2_success = True + break + if (model2_success): + break - if P + R == 0: - F1 = 0 - else: - F1 = 2 * P * R / (P + R) + #现在有63个预测normal value 文件 对一个line 找对应的 value normal下的行 进行预测 - Acc = (TP + TN) * 100 / ALL + # Compute precision, recall and F1-measure + FN = test_abnormal_length - TP + TN=test_normal_length-FP + print('FP: {}, FN: {}, TP: {}, TN: {}'.format(FP, FN, TP, TN)) - print('Acc: {:.3f}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(Acc, P, R, F1)) + Acc = (TP + TN) * 100 /(TP+TN+FP+FN) + P = 100 * TP / (TP + FP) + R = 100 * TP / (TP + FN) + F1 = 2 * P * R / (P + R) print('Finished Predicting') elapsed_time = time.time() - start_time print('elapsed_time: {}'.format(elapsed_time)) - draw_evaluation("Evaluations", ['Acc', 'Precision', 'Recall', 'F1-measure'],[Acc, P, R, F1], 'evaluations', '%') + + + + + diff --git a/anomalydetection/loganomaly/__pycache__/__init__.cpython-36.pyc b/anomalydetection/loganomaly/__pycache__/__init__.cpython-36.pyc index a94fb9be6d5715d56c5725858ab5bcbeb5023745..0f6b81b9067572473c60dd66a47af2c4f77a9d42 100644 GIT binary patch delta 26 hcmdnQxQUU~n3tF9)I?SX&a}*&)Z!T5%AARQDF9^72v7h3 delta 26 hcmdnQxQUU~n3tF9#6(sHPPfdQ)Z!T5N}q{-DF9-u2o3-M diff --git a/anomalydetection/loganomaly/__pycache__/__init__.cpython-37.pyc b/anomalydetection/loganomaly/__pycache__/__init__.cpython-37.pyc index 17b247bb22f2c009cd882030eab5a6488a96c89d..c0f594c244bd083119da13aa9ac3f110861cfa13 100644 GIT binary patch delta 86 zcmX@cxQ&t9iISW#*;F6xlgU$lI=Qb(?hp#<-kpl!f^`c;p8Ig5(5rcB?P_0l_+z`)o^54 zGPf`w7eNsiFbwp*3luGSC~_zg1StBC%(YGtUwZ2$m-hD!*QsitBslZ&<8bDC{JzJ# z?RJCVdiATne7Ziz*x%{p#pQswj;DBvK`_Bn7F07IT7ktY+8#QAV`wXNhhE?r+D>aj zKk$e3pgvpI2%2V(R?sqYJ7~vj(uvvr+@w2Yav+?{67G}-oe!DtMC}n1wb4alE=4YqGH*zbo*AE6}fyrQc)2r-5_Es zk3=lB(?1lEw)>;QKK=gUvS&nb2}H~SE?8iF#s*w)VLkE#TiC+E(T;G1huQs%IV|uX zMoqgna}lR~ceVx19v<4VxQ;Vn}2d;1o&Y~G1fe=i&)vQWK-c8kMNEM*M`Xg>^-EKy;on|p~6aTXTINvzx1 z;V{exVHzFBvd~tRX=m%BTQ{|n7HX)ytt5*hsjWf3=W3^j(}8Xz+2~N2-(?e9%a++c zpYb3@Jq-J4R1{(OC-$$0_c1vez^aB-`q(rsto*kGMnNEAd=2->e5J~n75}4+kA+5?YamKCVWbIk;uA~o zDc#nNHMYh8-2~Dj0NHOWF`EbWIwNgM(#YPH9D6+fH!etxoWSHo8ZOWBu}T+7vtbc zidV6nZqE?dkJB_fb8?T%Ws;JtY2t#gYYz8$tLD3g*Qc%_bQ#G2$uJR}K(k=NK!g^rAik|ZKXB<#2g4!C_?tiKp z!miexTJN(@w#NSbE2>fYxcMql19%(bdg+z^IxC&BzRrZR29VPHCNjx2cJS-l^u?^N 
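The reworked do_predict above derives FN and TN from the sizes of the normal and abnormal test sets and recomputes Acc, Precision, Recall and F1 inline, dropping the zero-division guards the old version carried. A small sketch of the same bookkeeping with the guards kept; the helper name and its signature are illustrative, not part of the patch:

def session_metrics(TP, FP, test_normal_length, test_abnormal_length):
    # Every abnormal session that was not flagged counts as FN,
    # every normal session that was not flagged counts as TN (as in the hunk above).
    FN = test_abnormal_length - TP
    TN = test_normal_length - FP
    total = TP + TN + FP + FN
    Acc = 100 * (TP + TN) / total if total else 0
    P = 100 * TP / (TP + FP) if TP + FP else 0
    R = 100 * TP / (TP + FN) if TP + FN else 0
    F1 = 2 * P * R / (P + R) if P + R else 0
    return Acc, P, R, F1

With no sessions flagged (TP + FP == 0) the guarded version reports 0% precision instead of raising ZeroDivisionError.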
zzLvGp2Y>%ItE3Q+V~MTdk5%%&HLxL1r|+w9<2oldF|w*IMcq2BatFYAm5GBL?Lx0h0^vg8fiPply*mwCOgGq zD;r4yoOWfD?Z%Q^E!T+QDN_)%J4o}0Hp#PH?6?<=;$B-fkzww}$Kfbad%As;WFkKb zvEeRWJF^^Wa$a0$KF|)<7TT*8$PKeUq5yeQ^F;G~UC-hpBxGVVh@5?_iK-sG94qU) zaTd!+#b>iy8r_N0SFm^S6AX+ucng`$$Gw2uM)w911I5%5zsS4%M_AF|@&l~xn>+~O z9G*HJ1M#OYZdnt6+qfZ++Dh*fw6IqR_%6|KCsGC_H-OujSc(ADQh;L#=$+c*HUgnt zdcqXJ$IiGz&MlkRrS_PQ=TvRtBV@X&K53M#NpnhRW&R1j@2M7@0FZw-MlByN-XE!U z*)be}#I%HGdAw3CPdW&@IXqoKXL6+CN{Z#ORW2joR?BX=TCP-Fz-z6XpUj)8x;9xb z^J2L+S)8&r*?1A(mWX%o6d|%K8k3cBK|opr;-XqDJNs*={5+f51kUPYw_W4(3tgULnc{mPV;zG&l|by%je%BJX*4jv+S z<|Z2?4`i96Gfi#pZl5fkDQZ;f0GG=r-FHMZ46Bk)RpafG+FS7`@9({R>jR|V8*r;` z{P(l26?=z+K^p7ED9OTMJj~^Bj|Uz$6p1w9B?(R^KfXb`Z2aoM`0|7CCzm!%8BMa? zFwKkN>c)eso39VX@~2q(TwwT)Up}!nN9tt$7S&rU+!$ag_F~cdp|%TEwIGtPCePDA z5R_D7B}KBNJgd9!p*9vbG9~lT@fJw!<|CC1wL@w`=18Ilpm{6g8??S2BkW8}Xt$4G zjj58V2xZo}m&8Z9PQ^Cdm66c?ov448N{x2V{b9#K#UpuzcJWYVhLMs#CpnD8th~^U z33E3oU>EHVqq{MPqRJCa5#56c3q{c=l0NbajxXZgIr$cCvPi@CeUYScix5YbWn5Fp z55WlPGS8JsCcz3STAa=j05+89Zr~VsmDaV%+28Ym7QI4)QHsO0YyJ$*VuR90h<+BG zG*QT8)X;iIrt1yreKVtX(sg57fJSQ92o^MqmyL_%$0W2vnA5JwGV;3NkpBZW#-iPq zGgPzs-H1f~DPG6GET6AgOIFM3@{aA}>2im!AS3x!7w;&#xHpiSI><~HK>1uI_^-EI z8QV9F(`J@c7-?3oMs~T4bwR7j+IMnz_hW-5-K`cKsw6Eo$qZGJC%9=BjzBv|vS~k( zy8u-Y%K)zxFsbL$t)vK}dr<<-5iH(K({Y&2paL!lct(BGqnY;1zXtWuo&1uDR6+bo R!4?!GWqoVST3GZ~{sYKCkGB8- diff --git a/anomalydetection/loganomaly/__pycache__/log_anomaly_sequence_train.cpython-37.pyc b/anomalydetection/loganomaly/__pycache__/log_anomaly_sequence_train.cpython-37.pyc deleted file mode 100644 index 4f8318b26445d6b774b9fa69de8510172cece2f5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3901 zcmbVO%a0tz8L#TcOwV&?AKr&wtbs%ff!!q!NDL-CO@xdOcqOBbNvAzkvpegao^f@p z?Hzi!ED?&p3YT2bj1(oxA>t4vLW*)u{)Rdx5aDa2aNv^szUtZKp@c)ar>efHuBz{w z@AdnhYc#41&oBP+`=4A~VC?TSIeO|Ko@6O;;gYC}2ELq-%c6P1QU4TPxFY7lRnZF9#C&)X^eM3ro)(MY z8L`#B#VUR>E^)HN0P*6Kg9hd_bO%XXT1qm22{(JS9)dGxD5#K;Cy)gVg*A zYoEM_|IRMAxoKS4@AqSMa8>oRRBgvJwq=?P)C;kWGpS9Lh)aW5NOh0n436-Jjn~;1 zv2i=Y{fH^gT2`bC||gDopJfjtGX(@V&9?j0(W^8o96S6RVXfcFw3?kg5f;|~=LZi3BDIyC-BszGLa6{kCNXg9{aWM$*)f$Hp< zlC~aV{BAOc>Apd_({7krGT6Bx529hLcTMAVFBOB^5iZ@qw3+Vrqd_<7BypC>%Ql`&<@O6 z{pRD}{hEf4KK%JdAHH+#d+VS4`Gb%C^v8ev;hj%D_~o@rgPmv7I626A*%#?Yvt1}C z67tryI34t{K}an-x;IF#;a|T`*l*ya>GVRDntGJhL3=d3mN2XH6T1H8HT_#OP zls$xw@hY$LCJ*o~@TJ+i%1`56LTllBmEX_Rx3ITE2acYj(8?afiew=dEOg#xT`u54 zZv~+%T;U;KcyBY0g+5%uH~uRFA(IY8B0V%xqpPxK&}pW5jtHS>TYb`@T-4<}KNfPo1jSgfa` z$kcXwLdZ1AdLwBXGZ#qW1F152T57yYSGHdSsTZcF z^U>CqW{;^Y@KTRihVTi?19DZO!pEY>`h|L!c5R~j23lr80j4P7UE-sr)z!w4yd8=g zddjo*JjQH@PO}M=w&&!0;tD<`+MBu)Uzd;`oUi=~1y2z2xs$s{XwEff?2Ub_1yHk1 z0>UZAuDix4;pHW(rR#CHQTxBh`0l!{g}MW%UGx za28Zsp|`sgq3=-dleV@DkBked&=Q4^Rh21s2kLgL1X*&XuCx5C_0v(KP}gh+j08)k zabLpH)hOx@#C}3_J&JDb$4T+U>RvsE+e~9-nvP78Bl8gBHdjv`@wXmC-ky2r0$wlM zl}-&adQPEL(5z+NN3zQUfrLM<30NU7AA%O{3PDea=y6?h0*XVRx`y@03UGG+(4PQ? 
zpzQ{rsG2jZxZ@^LaWgLqUz9}ot~Z_oDAn>hZVc}7af<)~05-3ylUm-G)Td;vh4=WK zl5P+;z|d#oMNt_q-5Kg;K4(`1?0o`ldAyP@Pv!uEEwp(-JUKk5kYYJ+b1R-lT&$R%HQMJWxX~zO*CqQuWHfWXf=6KpP8J+|GsYu zNE1!)ISI|ot*|L{d-GiYTWhjT@<5gr@wCZe{_<%4I8##kgv)J?TF;8O9~BvdT=~MN z{G=QXI=dG}i`$e=*C|)bu;Tps_QkfRD3hA%=Vweg+uiSW6KSf$UK;gff1nQ9JS>5W z=qVdM>M0x=J@_JBz5e4j#+z@9Uw>rXChT6i6D5NzJHP(M`HjcBV+*7A1onT%H%IQq zP><%VDA>5(MLx@RrD&fsZl;w&MaisRs;7ziA$63z6*;YJXQ!FlwFxxru`L(|7{_}_#L3Cq|nweAlJLqlebNUkU82>u`t2) zxO0O7)p)4eaAT(B(3B9H{aC9-62WTB0g`(bk^No zIw%LgIg{;E>I_}-ed>Nd-EqEEXE8NZGLbb}re4BASW$z4w%{KwBNxbI2GzJ#0ZL&V zb(;1y$PJFy!a7ay4~7XOFupw*288Z_SdHSEDAXCKKTYYjEmFfu$BsZ}Q?UxAbZC4l zO<1+|wZ>Jik)$ph5FkUCtRA2R6f9X5`oQVI&R6qatvGGP_zpH_luwxx@KvYEmz}!P z;!QU|YjKY+1HA*Mg*h@6-gRJk6Igy4RKQh_m?1~@;n7ia?*gXxNOy{|nR=O~f2EEp zxY5%a8=Lr1wY_<9ds|8Qs?7Ei{pgQgxOin#NI4`$b$f}-HbJ3ynhiw(GP>~ppO={) zwZHawVNa`?&CzpTyY<3m!}8S-^Ji6gB~VpQidhJCCmL$T~cdyc4gp>ITaT QbHPIaJomr9fvrUU14gJd!vFvP diff --git a/anomalydetection/loganomaly/__pycache__/log_anomaly_sequential_predict.cpython-36.pyc b/anomalydetection/loganomaly/__pycache__/log_anomaly_sequential_predict.cpython-36.pyc index 68a42afcbb5feb42ebfc2c398b9f88475623ff8a..4c2ee03fc6ef9e2513f88f36f4efe4a885450075 100644 GIT binary patch delta 1255 zcmYjQ&2Jl35PxsI>+GlF*p5HyeAQpA5ujE*H1yI+gj*9qsYqlIjFY#mV!O7p-biVg zRSsJ?R6;UZAtA&Si5wB?{{RvfxZy)3)DuTIAyf%|Z;jlpHRGAz{NB8o_uhP6dRnet za~yN?`nMl#Wf^$HJ$ID7~Hsb)OW`pZa?h*@C`2{ z&LmxaDkK4U8#2BEb)N;wX@l4n+g}S3#04)gN|cyTda7rV(uEgt&W_DpMhem=c04ZE zy*U5#*X@IMZ#KDCfZtT7#>jKU(V_2(X>xetk52q)5@q^YN$kAt?;piY(fvisW=_!@A*o5y70tbibb!bhPpDW!Zks*~>nCNEwY{;?Ql1v{|z{pWWXrUrd2I&?Pvx2Jr zGF7P(DnXUju-BQ~>~C4I>;oOxRE#?*4s^0(%ces7Um(*@&_*clz1myANo>X`lCa*S zY|P1GRW7!2gOz#oHsxb1kLYr2Eu$TVcY^-qv8^8ewF_6U_t+h_$0$oVoQBd^$yybY+61?Kf6AW{a z-SIR$&3HH-Mb8%Q@te^*`VHqt`EL1C6aK@I8$H#(Dm+S!dq<=8;qYYO-`Myh_~T!E z%|dk7_#mlyoxy!KIyWkoeED8ou5L!Z7|*YoFITk|I|qZIhskO8haKU1-UaYlsh4Qy z!0kuBnQOJyLjP#s?wmZBCdTgZRO`!#XUW`eNcB2Yv~F!CUxP9JEZVoK*W~b-yvB8I zV-Qu0p{nG04t^R)hF^d-`G%syG83lsqI2u2fpOhNmS!sY-7508qJJzm8F#TAeQY=K T8N_rXF=gUJQPpkT(DnZSLpUKL delta 1011 zcmYjQOHb5L6u!6fpzYia3^UAw;n8^jA@LC+kpvQ>(GW3e5EC&Il?-#MFbo~&bP|D6 zXVJJa>PTavuJs2Pp7wsszsZ%(XqvkIeeGT$ zLC809?2+(%9%pOyMQJrcZ<5K_`+bEf^Xm=>6_N{P+tnoGL$`64A;a`HHby7b{W$X-J9 z?zE#J!VuAc*>TWZ3gIl-AYI8Cfg!G>2(S!+aE9uK(j4l=3&AktHlkf>eHQ6OQCJQ$WnjD`Dr1ldD$E=Sp|H_|g5QVIUoa4G z22hp`PHA)Kz$;n(kP5IAdXvE^*`?cLnP5LehN11B%V_=YG$bK~-H|q=4mRxywzFvV zaRy0jUEsA4o5#D|8@kbO%{ZYS`W%@_5+1i4QqG)Ouj-5C{54PYs25hi)iD&M!{H(`E zYTwe9m3^1W1yTfo-2%N9BoMU7p~#_V5wz%~r=EMoIwm>gNf%coFC<#VL%)Xu3 z+1Z)d*_nB-R4Oq1uKf8AKYf3Sv40XW{CSvM0VELy!32+3eemUVZgHpO)?GfJd36so zC(5+4^(*WIWINuxTGx?ryV@G&>UeAe)$UbNFg2;(HzN2D96!0yIQBlOVB>0D{ zR(g#S*?Nte%#F4PBI7l?!Z&WS(``DmgkJ_Q0(c*gJj9^bE=wK7H8kCIUR+Pz?Q7cY zIem7G-G3+bl&d`92=_S?p2$3B+UsTdtd|wpKEK9xv1ga3o_2aU=K>E>_H2Dya%wSMVaph>%b0OR7Gm?s3e7%-28P)AXfpJ@Y z$7CKxZC{y8qGSjx=|rJ|y*&P$$s*+Z=Vur`xOnfY&wlsS?|)05S6}|()t8^#`+oJC zzkc!RFMs;_kDq+=#jo#OZ*N?U{peW|CWq(el1E`2G(_;^o*%bcU=V={nksC^_oDX3 z9*w4ry?G-Ewz@&ALO*JBWFW$(TJ1bD`Ad=CdLaDE6J#NP?JR5raUgvatX_)RO+QL5 zV=`dc_Y2(;#gFyNA|@M5v-_2lE2jA_qG!{Sp60)FwmkBVBtOD@HwxmKW1Q9VhHqOx z42BYx-xMxIVWOJtR%f6Losnfbi{GZskns&aTgw}dHX5(p31TxM17AdNP9n1qAagWO zAW3=&ChK>=C2(6_?6(3_dK$)}{j>qaHq@i~BzbHzc$VzBs~IGQKcoZT61ukoeNBF- z7-^r-BI8+JyOs*Cdh3#|0B$WJ;u>^UbpJc$7QLxP$5)RoRzFH>Ba%F6 z9Mv{i1Wecr%ou`GG!lhNZcrp3blbA|2zG*Qyx@r+nM_B9u`=0*ZQ1fwEo%y4-07-D z67B?M^ie285D$<-+-)@?|5+dtGfMMj6e zRCtBw;Zylt(Dt(f8BSpRc^;Du-bv%5#qw_&6y~DmJY_bOwOtgVg3S7P7Q}fEclf?N&M}oV2*Trbm0+Cbx@eNtb#^ zZGC>vNz3UNr8~05 
z+04G=@-AfTQAmrey99Tj_2Y4U3}Z$VK6QE%VpMyY_1)g2n!+PJrYA^xdIHu}^a+va zI|5qh{3yzZ)#s#|)+hA|`0XS-QS8m=Mc@S#o$4-vvhUglykxa-Yx}}E_ex#=?&lGl5?5q+m?yo-7GnO`k)oDAMhMg#S@C%)A zbnD-TV?fKrzNX659F}`Zl>23^0>qZJ z;;^g@%9%|8RLqbbYmOWzkyJh3N)2)nw08jsNkt)B67BuUd#JJwOW96oFYi>5m!*3B zckIF`8KItag4h4UZ|C$!Y4u!n$R>lN8!1ZhXRAtfgKFeI2qMB?3+#PO$L=hx-5hYN ztq;Dp_wd^m-}&Hbvx$X^tKWMFF*jw<3=WEprUkKjm?+oKv6J4|`V0|d3YK7Od3 zvnA6MY}i;|*PYup>&_bB_By`nb@%G^>-EBc^Xp^&gP7`igtoyKfs_kmW%+4lIu~>~!$QSccp(&qDUWSOGTZJ*cCTH=s+cJP0CcI{0zdC6glKM+mliQicTJJ Z(2au2^zIW%K4fN*z@78*#r#M<|9{V+qFw+1 diff --git a/anomalydetection/loganomaly/__pycache__/log_anomaly_sequential_train.cpython-36.pyc b/anomalydetection/loganomaly/__pycache__/log_anomaly_sequential_train.cpython-36.pyc index 8a6f41477ea4e8bf540fa102a2fa4e235f1e1f04..acfa2de52410a244210f58f0fe732a559fa55a6e 100644 GIT binary patch delta 828 zcmX9+TTj$L6rNMI-Ch?Iv#Z?1c!7EYmc(dK5hM`_mjD`M1=q5z&}E^^bRb|#B-!8t zk&rlV#5Z1i&3_c+J$wI2 zI|Xo;x4};R;kLE;HnNIPAwy*THU9`VZbq<7=;yV)%^{|;WTaFeg43h|SkS#KM@CVL zlt{)28L}&#!3>bdENc9m)Y$uThOF()kQw5VNis#A5T-MuXhsI2>>B%0m6;W&NZY9> zHBD@jFG{bp7k%z3fpcY-2T__g8(E&(lGq^uB;BUp6in17cYd`Jr(!J&Jr9Bh(QHq(H%$ z=06PoeD%QcOOFO&&7t~QHt9T-EG!_40En3us~K`sympVnNQR(V)!y y723;BriQb9sK>STsJPCY>4$k|!|_Yb0)C1Pl<0`4QSe__N2QCuN%hw?srG-k;>$?RuQ|59H^=5!z-CyE%`;&MyX=G#)xh3Y8qBUkUD}%yW=@Y7Ir0B;3ipJAy?!mY%8CtGhhOOR_Qcg)Vm!gTHdQi0X4(lEf>3Z zH_ak7TMde**a(4l+&h& zZ>{r*Y2rD`n!IBME@%`TcxaW*rbvj??IN$%niU$`u9C`!2)sDY;Ujwx7Vx#b+k3ew z-IL4n5*|pHX{tF)JN@9}qLZ;?$L1f4_`yj{%u3!aL0Xp^YVsa*uytK+XkLO#FAKV; z^1||^N`((gYBgzfO_ZMU5o~uS^-*~+LhS&#RCEy&a$ATDMD5*&;?92~4gATdp` zB9IAD93}Zh$r*Zic}2z`E;odmUr>^nTVyg>o6p0@1}J_DXef{>Vh5?=$t;dfEKAJH bNlXHIL<1!54N}AAo1an&6fu~*l+OtOz@crA delta 465 zcmX>n`&^dSiI5}D7*DeE5Pj+-Z+0h+is_==Xwr+7oW?pK1N^03;DP}Lm;>oGZ{}_cgr?BWU zGOBN$#JZP}ORzY#urxINA0Q)0G^~uH@wv5t~3psqvPSPBwL zGH$VE7N=wu-Qvhi%uWR|ii>1H@+`%PWvNkI#U+U)K-)5tOK!0y=j0csYDyQ$PiEot zVAP!K%c;hDi!&v$Br!fMGbgo3VR8d!C8Oo!Uz}$d^(UX;>XXt3nZywvpP83g5+7gW z1hU;3L>Nsj;8x*w1Ti6kljn0UkpWxGT9sOqUkp(OmNNjUGMpUGvz1YMvMBEXTac+i zKwQiQBsds(I5=2~Kw_F=V5diMl;jsBXXxeS6&V3Vqqrg5{DP9q+#=)2LVO-Z)} z%>_9_J8{122Y$B@6sGHnLCNqa2W8V%f(m!J7gsl#VB;CVc5CC=36njZ>)AX%v4R=y z^THh#%>97zA}`%xyd>=1`9mkD^D?hsWJ3}|gU3r_O+ z;1sV1r};wg1YZnR`BHF(H-aa{8NR&4f~UlSI4zb$LoACGaYCFFr^FfYlz8H>1gWhn zthsU@@6IkaEnT^?*X>5~;HvB>A)B_YY>Hmemlq-xB|_;U5f}Rr7xKP^J=kJ>N4u>( z9%-jF*lRh&3BOsCJj$CWiET5cSc=0A?U9o@V{5|3_JoaG($A#2Otp-c&wmq>%Ipyz8M?iVpzH5*2Ut8sW!>Ypm&s32+8Mj0G0=qUC75ASs zW47tnN2SzFi_0v{nY#1}yE(qf_McP5lpbq|nRTn9a_XBEZdzVu+-)$AQTq~e81{U2 zl@T9WpUtL)w21M0S6L=0#~amI13Q*jYT?ZsyjkTjbJhLpnd}elG_t&dcX;g)`PLv` zjwDMfM#^$pY(Or^Ok>pt$L?7-cdjzjn~=GXTJ$cQ{oGz=qZ!;qWw&H*BbQc>shKHN zR+ZgqT0uQSw_tkJhk7&U)oAX}8P(DnV3xbAEeXVUTpbLAB>YGZ)qTK&HD|TkABdjz zrHFXk=?U#6gSex#8+Q_=oo+PHPN%1|H;~jgF&5jsf%b+%_7m;NsJBg}O$Am8Z?_EdOQ`~5wYQPZL4bS@jxR!3V(Tf4f{6Su>o4G(ml(?yz@F3zPk-_Gryp*;u=?enKKcBQfB5_FKm78OUv6FO zZ@=D);)A4_CSh7q_;eUcZY4Tqw~}oqlf%FFqY3GMv((l2NC(HaO7LZP61l zQX-6_>msf~kmP%)m{qjOR@L(H%vp6)>Q)U@-CDI8)=BeZy&BfiQ^Sm0$KDn(IQkvA ziR5XF2o_kJ1@;H5ZE=g+cl^NN4tL@E+z*(`0uSKvw0Eh`Mcm3wc~DD*SS9bGQcPJX zY?M5U#Zo6_p#w`~Aw0}(4wi-o>MZppHdYlm!_n^LO`x`P zJyNZmu-%b~k|mtiECFnLz?$4ZPx~RRxTC^Qmv%awi(Z&?hC)}SNQ$EaArk<$r`?NJ zHZN%}PE>azYtHK2Mch^-wcTztbJ|Tr+}7m`GQ=j_c&K+)K93{i3+7zF#~cM23t@)X zFf@1W$&S%Pp3%9rIkq0+^#LxZ)e)-4F`rzlMcA8K9DZhf@hp>Q%3+Kh{ ztZh4RcZIKywXtGikBZ;}w+{JBX$hC?Ts98r{(tammSh8OEf=UVN_OcIjMBBO`Z@=* z>a>I;qmbW6h2R${`7)?xS-yfMn0kW^`6Tt+osDoKa9dP+bYUCX3eD(QGkP`~JsT9d zd$H<}OVAFg>>BCz%w6hyyDx7?lD~+NWDXUxPUERh>*rx`!%B9m)0IFIklBm60>~_e 
zVYkosVxr4ocylj`vl#;t`40H$$`nH_5ywY3A_1C~d<{*8DMI`+D2a>8x5@=$11&NO z`kh26pcnxC8IC^nIpX1{#BJmO`9sj$Spuxm9MPk)LhMPsL!?^@Aj9>N<$8zS82N#T zjvgW^tP-)LipW$=^W5V(p1ihRpHJzccYj*(5g)L{aI9eJt(#7!%GL#z1ET?lVfIX9BFdDGU@m2rK-USgv<<`!s-vJL2=G+s>S zIo=0Rysnngncc>rb%ISCipALsuPicNKuY+LTHak5pGXT6>z( zhM7Mk!#F?8zbpp**3S9i{3ZpFRdRaM*m-Vs^X;Z92`#$#)gGNscJ|usSm@%Q(+j(z z+m{DTE6Cv#-jP{$@H#dPpMHy2um0ry(fa$NYtO8jFxBa8hjBki&aJ+GZtcbPC`%+b z*8kgDA3AFTHJmk~VBu;Tjyc&8ym>}DaNd#(#{gCmK;;FhC`3s@v!rZ9l7Z>kn@B8@ zw|YwU2L~5HYOg;~ovwCC0T4Wruq!0WiNt3G(*=R|Hi1)nEo2#j9EFOIg1EOj;_S0)(>7U3jB zJk3kMRU&V|?MO1~ph7S>9t+CU#TyJ_NT5BlGw>;u3Gu;z&mp3acj1dc=Wd(NilERk zEm9F(FbXB-sy!o3P&D^zE?2%sk~)dFsa^Q09)cz1Bl24`j5IDu0uMP#fBR84S;}}A zVF*K6_G0^1!!BBjcG<33RmVq}v)pOkQnYIrfh)taimasyS9%hZZ^;222PfJEvcvE` zsq{YS&}0GfN7Vg{D#~Studc1F<4@m7>u+yvN+EtM68Q9I-?{Mim31z}fE3p5#3ES- zg`{%YWa-!N{QtjSBx=}v?Dp(ljdIq9Z#;JC!;+HlVRXGm@Qkypa82#2VeuF#>&>S# zMEWM_UhDNT8!Ba8bIo!S8Vbr;I)1$m^uKGYP1mwTd#V#BYlIk+zZ-MZ9)LPHkacX; zek8YHbcv8ZrS8;p={pnYQ4&VCq7D!e3uDbXcge)M%+l!+B6X1@&8)mXa$F`$q3`ETZo2aX+$0{<$^#D}A3hVwWScI_rDt!|r=TK!p bECYxHN<4PM@lVe!_)Q;>{pQCv3HN^iWRRTu diff --git a/anomalydetection/loganomaly/log_anomaly_predict.py b/anomalydetection/loganomaly/log_anomaly_predict.py deleted file mode 100644 index 637bbd8..0000000 --- a/anomalydetection/loganomaly/log_anomaly_predict.py +++ /dev/null @@ -1,131 +0,0 @@ -import torch -import os -import torch.nn as nn -import time -import numpy as np -from anomalydetection.loganomaly.log_anomaly_train import Model -from anomalydetection.loganomaly.log_anomaly_train import train_model - -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - -def generate_test_label(logkey_path, window_length,num_of_classes): - f = open(logkey_path,'r') - keys = f.readline().split() - keys = list(map(int, keys)) - print(keys) - length = len(keys) - input_1 = np.zeros((length -window_length,num_of_classes)) - output_1 = np.zeros(length -window_length,dtype=np.int) - input_2 = np.zeros((length -window_length,num_of_classes)) - output = np.zeros(length -window_length,dtype=np.int) - for i in range(0,length -window_length): - for t in range(0,num_of_classes): - input_1[i][t] = keys[i] - for j in range(i,i+window_length): - input_2[i][keys[j]-1] += 1 - output[i] = keys[i+window_length]-1 - new_input_1 = np.zeros((length -2*window_length+1,window_length,num_of_classes)) - new_input_2 = np.zeros((length - 2 * window_length + 1, window_length, num_of_classes)) - for i in range(0,length -2*window_length+1): - for j in range(i,i+window_length): - new_input_1[i][j - i] = input_1[j] - new_input_2[i][j-i] = input_2[j] - new_output = output[window_length-1:] - return length,new_input_1,new_input_2,new_output - -def load_model(input_size_1,input_size_2, hidden_size, num_layers, num_classes, model_path): - model = Model(input_size_1,input_size_2,hidden_size, num_layers, num_classes).to(device) - model.load_state_dict(torch.load(model_path, map_location='cpu')) - model.eval() - print('model_path: {}'.format(model_path)) - return model - -def filter_small_top_k(predicted, output): - filter = [] - for p in predicted: - if output[0][p] > 0.001: - filter.append(p) - return filter - -def do_predict(input_size_1,input_size_2, hidden_size, num_layers, num_classes, window_length, model_path, anomaly_test_line_path, num_candidates, logkey_path): - model = load_model(input_size_1,input_size_2 ,hidden_size, num_layers, num_classes, model_path) - start_time = time.time() - TP = 0 - FP = 0 - TN = 0 - FN = 0 - ALL = 0 - length,input_1,input_2,output = 
generate_test_label(logkey_path, window_length,num_classes) - abnormal_label = [] - with open(anomaly_test_line_path) as f: - abnormal_label = [int(x) for x in f.readline().strip().split()] - print('predict start') - with torch.no_grad(): - count_num = 0 - current_file_line = 0 - for i in range(0,length-2*window_length+1): - lineNum = i + 2*window_length - seq = input_1[i] - quan = input_2[i] - label = output[i] - seq = torch.tensor(seq, dtype=torch.float).view(-1, window_length, input_size_1).to(device) - quan = torch.tensor(quan, dtype=torch.float).view(-1, window_length, input_size_2).to(device) - test_output = model(seq,quan) - predicted = torch.argsort(test_output , 1)[0][-num_candidates:] - predicted = filter_small_top_k(predicted, test_output) - print('{} - predict result: {}, true label: {}'.format(lineNum, predicted,label)) - if lineNum in abnormal_label: ## 若出现异常日志,则接下来的预测跳过异常日志,保证进行预测的日志均为正常日志 - i += 2*window_length + 1 - else: - i += 1 - ALL += 1 - if label not in predicted: - if lineNum in abnormal_label: - TP += 1 - else: - FP += 1 - else: - if lineNum in abnormal_label: - FN += 1 - else: - TN += 1 - # Compute precision, recall and F1-measure - if TP + FP == 0: - P = 0 - else: - P = 100 * TP / (TP + FP) - - if TP + FN == 0: - R = 0 - else: - R = 100 * TP / (TP + FN) - - if P + R == 0: - F1 = 0 - else: - F1 = 2 * P * R / (P + R) - - Acc = (TP + TN) * 100 / ALL - print('FP: {}, FN: {}, TP: {}, TN: {}'.format(FP, FN, TP, TN)) - print('Acc: {:.3f}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(Acc, P, R, F1)) - print('Finished Predicting') - elapsed_time = time.time() - start_time - print('elapsed_time: {}'.format(elapsed_time)) - -if __name__=='__main__': - input_size_1 = 61 - input_size_2 = 61 - hidden_size = 30 - num_of_layers = 2 - num_of_classes = 61 - num_epochs = 100 - batch_size = 200 - window_length = 5 - train_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_train' - test_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_test' - train_root_path = '../../Data/FTTreeResult-HDFS/model_train/' - label_file_name = '../../Data/FTTreeResult-HDFS/deeplog_files/HDFS_abnormal_label.txt' - model_out_path = train_root_path + 'model_out/' - - do_predict(input_size_1,input_size_2, hidden_size, num_of_layers, num_of_classes, window_length, - model_out_path + 'Adam_batch_size=200;epoch=100.pt', label_file_name, 5, test_logkey_path) \ No newline at end of file diff --git a/anomalydetection/loganomaly/log_anomaly_quantitive_predict.py b/anomalydetection/loganomaly/log_anomaly_quantitive_predict.py deleted file mode 100644 index 6286fce..0000000 --- a/anomalydetection/loganomaly/log_anomaly_quantitive_predict.py +++ /dev/null @@ -1,132 +0,0 @@ -import torch -import os -import torch.nn as nn -import time -import numpy as np -from anomalydetection.loganomaly.log_anomaly_quantitive_train import Model -from anomalydetection.loganomaly.log_anomaly_quantitive_train import train_model - -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - -def generate_test_label(logkey_path, window_length): - f = open(logkey_path,'r') - keys = f.readline().split() - keys = list(map(int, keys)) - print(keys) - length = len(keys) - input = np.zeros((length -window_length,num_of_classes)) - output = np.zeros(length -window_length,dtype=np.int) - for i in range(0,length -window_length): - for j in range(i,i+window_length): - input[i][keys[j]-1] += 1 - output[i] = keys[i+window_length]-1 - new_input = np.zeros((length 
-2*window_length+1,window_length,num_of_classes)) - for i in range(0,length -2*window_length+1): - for j in range(i,i+window_length): - new_input[i][j-i] = input[j] - new_output = output[window_length-1:] - print(new_input.shape) - print(new_output.shape) - print(new_input[0]) - print(new_output[0]) - return length,new_input,new_output - -def load_quantitive_model(input_size, hidden_size, num_layers, num_classes, model_path): - model2 = Model(input_size, hidden_size, num_layers, num_classes).to(device) - model2.load_state_dict(torch.load(model_path, map_location='cpu')) - model2.eval() - print('model_path: {}'.format(model_path)) - return model2 - -def filter_small_top_k(predicted, output): - filter = [] - for p in predicted: - if output[0][p] > 0.001: - filter.append(p) - return filter - -def do_predict(input_size, hidden_size, num_layers, num_classes, window_length, model_path, anomaly_test_line_path, num_candidates, logkey_path): - quantitive_model = load_quantitive_model(input_size, hidden_size, num_layers, num_classes, model_path) - start_time = time.time() - TP = 0 - FP = 0 - TN = 0 - FN = 0 - ALL = 0 - length,input,output = generate_test_label(logkey_path, window_length) - abnormal_label = [] - with open(anomaly_test_line_path) as f: - abnormal_label = [int(x) for x in f.readline().strip().split()] - print('predict start') - with torch.no_grad(): - count_num = 0 - current_file_line = 0 - for i in range(0,length-2*window_length+1): - lineNum = i + 2*window_length - quan = input[i] - label = output[i] - quan = torch.tensor(quan, dtype=torch.float).view(-1, window_length, input_size).to(device) - test_output = quantitive_model(quan) - predicted = torch.argsort(test_output , 1)[0][-num_candidates:] - predicted = filter_small_top_k(predicted, test_output) - print('{} - predict result: {}, true label: {}'.format(lineNum, predicted,label)) - if lineNum in abnormal_label: ## 若出现异常日志,则接下来的预测跳过异常日志,保证进行预测的日志均为正常日志 - i += 2*window_length + 1 - else: - i += 1 - ALL += 1 - if label not in predicted: - if lineNum in abnormal_label: - TP += 1 - else: - FP += 1 - else: - if lineNum in abnormal_label: - FN += 1 - else: - TN += 1 - # Compute precision, recall and F1-measure - if TP + FP == 0: - P = 0 - else: - P = 100 * TP / (TP + FP) - - if TP + FN == 0: - R = 0 - else: - R = 100 * TP / (TP + FN) - - if P + R == 0: - F1 = 0 - else: - F1 = 2 * P * R / (P + R) - - Acc = (TP + TN) * 100 / ALL - print('FP: {}, FN: {}, TP: {}, TN: {}'.format(FP, FN, TP, TN)) - print('Acc: {:.3f}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(Acc, P, R, F1)) - print('Finished Predicting') - elapsed_time = time.time() - start_time - print('elapsed_time: {}'.format(elapsed_time)) - - -if __name__ == '__main__': - input_size = 61 - hidden_size = 30 - num_of_layers = 2 - num_of_classes = 61 - num_epochs = 100 - batch_size = 200 - window_length = 5 - train_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_train' - test_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_test' - train_root_path = '../../Data/FTTreeResult-HDFS/model_train/' - label_file_name = '../../Data/FTTreeResult-HDFS/deeplog_files/HDFS_abnormal_label.txt' - model_out_path = train_root_path + 'quantitive_model_out/' - - train_model(window_length, input_size, hidden_size, - num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, - model_out_path, train_logkey_path) - - do_predict(input_size, hidden_size, num_of_layers, num_of_classes, window_length, - model_out_path + 
'Adam_batch_size=200;epoch=100.pt', label_file_name, 3, test_logkey_path) - diff --git a/anomalydetection/loganomaly/log_anomaly_quantitive_train.py b/anomalydetection/loganomaly/log_anomaly_quantitive_train.py deleted file mode 100644 index b4b3b7f..0000000 --- a/anomalydetection/loganomaly/log_anomaly_quantitive_train.py +++ /dev/null @@ -1,98 +0,0 @@ -import torch -import torch.nn as nn -import torch.optim as optim -from tensorboardX import SummaryWriter -from torch.utils.data import TensorDataset, DataLoader -import numpy as np -import argparse -import os - -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - -class Model(nn.Module): - def __init__(self, input_size, hidden_size, num_of_layers, out_size): - super(Model, self).__init__() - self.hidden_size = hidden_size - self.num_of_layers = num_of_layers - self.lstm = nn.LSTM(input_size, hidden_size, num_of_layers, batch_first=True) - self.fc = nn.Linear(hidden_size, out_size) - - def init_hidden(self, size): - h0 = torch.zeros(self.num_of_layers, size, self.hidden_size).to(device) - c0 = torch.zeros(self.num_of_layers, size, self.hidden_size).to(device) - return (h0, c0) - - def forward(self, input): - out, _ = self.lstm(input, self.init_hidden(input.size(0))) - out = self.fc(out[:, -1, :]) - return out - - -def generate_quantitive_label(logkey_path, window_length,num_of_classes): - f = open(logkey_path,'r') - keys = f.readline().split() - keys = list(map(int, keys)) - print(keys) - length = len(keys) - input = np.zeros((length -window_length,num_of_classes)) - output = np.zeros(length -window_length,dtype=np.int) - for i in range(0,length -window_length): - for j in range(i,i+window_length): - input[i][keys[j]-1] += 1 - output[i] = keys[i+window_length]-1 - new_input = np.zeros((length -2*window_length+1,window_length,num_of_classes)) - for i in range(0,length -2*window_length+1): - for j in range(i,i+window_length): - new_input[i][j-i] = input[j] - new_output = output[window_length-1:] - dataset = TensorDataset(torch.tensor(new_input,dtype=torch.float),torch.tensor(new_output,dtype=torch.long)) - print(new_input.shape) - print(new_output.shape) - return dataset - -def train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, root_path, model_output_directory,logkey_path): - # log setting - log_directory = root_path + 'quantitive_log_out/' - log_template = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) - - model = Model(input_size, hidden_size, num_of_layers, num_of_classes).to(device) - # create data set - quantitive_data_set = generate_quantitive_label(logkey_path, window_length,num_of_classes) - # create data_loader - data_loader = DataLoader(dataset=quantitive_data_set, batch_size=batch_size, shuffle=True, pin_memory=False) - writer = SummaryWriter(logdir=log_directory + log_template) - - # Loss and optimizer classify job - criterion = nn.CrossEntropyLoss() - optimizer = optim.Adam(model.parameters()) - - # Training - for epoch in range(num_epochs): - train_loss = 0 - for step, (quan, label) in enumerate(data_loader): - quan = quan.clone().detach().view(-1, window_length, input_size).to(device) - output = model(quan) - - loss = criterion(output, label.to(device)) - - # Backward and optimize - optimizer.zero_grad() - loss.backward() - train_loss += loss.item() - optimizer.step() - print('Epoch [{}/{}], training_loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss / len(data_loader.dataset))) - if (epoch + 1) % 100 == 0: - if not 
os.path.isdir(model_output_directory): - os.makedirs(model_output_directory) - e_log = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(epoch+1) - torch.save(model.state_dict(), model_output_directory + '/' + e_log + '.pt') - writer.close() - print('Training finished') - - - - - - - - diff --git a/anomalydetection/loganomaly/log_anomaly_sequence_predict.py b/anomalydetection/loganomaly/log_anomaly_sequence_predict.py deleted file mode 100644 index 5542c3a..0000000 --- a/anomalydetection/loganomaly/log_anomaly_sequence_predict.py +++ /dev/null @@ -1,123 +0,0 @@ -import torch -import os -import torch.nn as nn -import time -import numpy as np -from anomalydetection.loganomaly.log_anomaly_sequence_train import Model -from anomalydetection.loganomaly.log_anomaly_sequence_train import train_model - -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - -def generate_test_label(logkey_path, window_length,num_of_classes): - f = open(logkey_path,'r') - keys = f.readline().split() - keys = list(map(int, keys)) - print(keys) - length = len(keys) - input_1 = np.zeros((length -window_length,1)) - output = np.zeros(length -window_length,dtype=np.int) - for i in range(0,length -window_length): - for j in range(i,i+window_length): - input_1[i][0] = keys[j] - output[i] = keys[i+window_length]-1 - new_input_1 = np.zeros((length -2*window_length+1,window_length,1)) - for i in range(0,length -2*window_length+1): - for j in range(i,i+window_length): - new_input_1[i][j - i] = input_1[j] - new_output = output[window_length-1:] - return length,new_input_1,new_output - -def load_model(input_size_1,input_size_2, hidden_size, num_layers, num_classes, model_path): - model = Model(input_size_1,input_size_2,hidden_size, num_layers, num_classes).to(device) - model.load_state_dict(torch.load(model_path, map_location='cpu')) - model.eval() - print('model_path: {}'.format(model_path)) - return model - -def filter_small_top_k(predicted, output): - filter = [] - for p in predicted: - if output[0][p] > 0.001: - filter.append(p) - return filter - -def do_predict(input_size_1,input_size_2, hidden_size, num_layers, num_classes, window_length, model_path, anomaly_test_line_path, num_candidates, logkey_path): - model = load_model(input_size_1,input_size_2 ,hidden_size, num_layers, num_classes, model_path) - start_time = time.time() - TP = 0 - FP = 0 - TN = 0 - FN = 0 - ALL = 0 - length,input_1,output = generate_test_label(logkey_path, window_length,num_classes) - abnormal_label = [] - with open(anomaly_test_line_path) as f: - abnormal_label = [int(x) for x in f.readline().strip().split()] - print('predict start') - with torch.no_grad(): - count_num = 0 - current_file_line = 0 - for i in range(0,length-2*window_length+1): - lineNum = i + 2*window_length - seq = input_1[i] - label = output[i] - seq = torch.tensor(seq, dtype=torch.float).view(-1, window_length, input_size_1).to(device) - test_output = model(seq) - predicted = torch.argsort(test_output , 1)[0][-num_candidates:] - predicted = filter_small_top_k(predicted, test_output) - print('{} - predict result: {}, true label: {}'.format(lineNum, predicted,label)) - if lineNum in abnormal_label: ## 若出现异常日志,则接下来的预测跳过异常日志,保证进行预测的日志均为正常日志 - i += 2*window_length + 1 - else: - i += 1 - ALL += 1 - if label not in predicted: - if lineNum in abnormal_label: - TP += 1 - else: - FP += 1 - else: - if lineNum in abnormal_label: - FN += 1 - else: - TN += 1 - # Compute precision, recall and F1-measure - if TP + FP == 0: - P = 0 - else: - P = 100 * TP / (TP + FP) - - if TP 
+ FN == 0: - R = 0 - else: - R = 100 * TP / (TP + FN) - - if P + R == 0: - F1 = 0 - else: - F1 = 2 * P * R / (P + R) - - Acc = (TP + TN) * 100 / ALL - print('FP: {}, FN: {}, TP: {}, TN: {}'.format(FP, FN, TP, TN)) - print('Acc: {:.3f}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(Acc, P, R, F1)) - print('Finished Predicting') - elapsed_time = time.time() - start_time - print('elapsed_time: {}'.format(elapsed_time)) - -if __name__=='__main__': - input_size_1 = 1 - input_size_2 = 61 - hidden_size = 30 - num_of_layers = 2 - num_of_classes = 61 - num_epochs = 100 - batch_size = 200 - window_length = 5 - train_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_train' - test_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_test' - train_root_path = '../../Data/FTTreeResult-HDFS/model_train/' - label_file_name = '../../Data/FTTreeResult-HDFS/deeplog_files/HDFS_abnormal_label.txt' - model_out_path = train_root_path + 'sequence_model_out/' - - do_predict(input_size_1,input_size_2, hidden_size, num_of_layers, num_of_classes, window_length, - model_out_path + 'Adam_batch_size=200;epoch=100.pt', label_file_name, 3, test_logkey_path) \ No newline at end of file diff --git a/anomalydetection/loganomaly/log_anomaly_sequence_train.py b/anomalydetection/loganomaly/log_anomaly_sequence_train.py deleted file mode 100644 index dab9ed3..0000000 --- a/anomalydetection/loganomaly/log_anomaly_sequence_train.py +++ /dev/null @@ -1,107 +0,0 @@ -import torch -import torch.nn as nn -import torch.optim as optim -from tensorboardX import SummaryWriter -from torch.utils.data import TensorDataset, DataLoader -import numpy as np -import argparse -import os -from . import * - -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - -def generate_label(logkey_path, window_length,num_of_classes): - f = open(logkey_path,'r') - keys = f.readline().split() - keys = list(map(int, keys)) - print(keys) - length = len(keys) - input_1 = np.zeros((length -window_length,1)) - output = np.zeros(length -window_length,dtype=np.int) - for i in range(0,length -window_length): - for j in range(i,i+window_length): - input_1[i][0] = keys[j] - output[i] = keys[i+window_length]-1 - new_input_1 = np.zeros((length -2*window_length+1,window_length,1)) - for i in range(0,length -2*window_length+1): - for j in range(i,i+window_length): - new_input_1[i][j - i] = input_1[j] - new_output = output[window_length-1:] - print(new_input_1.shape) - print(new_output.shape) - dataset = TensorDataset(torch.tensor(new_input_1,dtype=torch.float),torch.tensor(new_output,dtype=torch.long)) - return dataset - -class Model(nn.Module): - def __init__(self, input_size_0,input_size_1, hidden_size, num_of_layers, out_size): - super(Model, self).__init__() - self.hidden_size = hidden_size - self.num_of_layers = num_of_layers - self.lstm0 = nn.LSTM(input_size_0, hidden_size, num_of_layers, batch_first=True) - self.fc = nn.Linear(hidden_size, out_size) - - def forward(self, input_0): - h0_0 = torch.zeros(self.num_of_layers, input_0.size(0), self.hidden_size).to(device) - c0_0 = torch.zeros(self.num_of_layers, input_0.size(0), self.hidden_size).to(device) - out_0, _ = self.lstm0(input_0, (h0_0, c0_0)) - out = self.fc(out_0[:, -1, :]) - return out - -def train_model(window_length, input_size_0,input_size_1, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, root_path, model_output_directory,logkey_path): - # log setting - log_directory = root_path + 'sequence_log_out/' - 
log_template = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) - - print("Train num_classes: ", num_of_classes) - model = Model(input_size_0,input_size_1, hidden_size, num_of_layers, num_of_classes).to(device) - # create data set - data_set = generate_label(logkey_path, window_length,num_of_classes) - # create data_loader - data_loader = DataLoader(dataset=data_set, batch_size=batch_size, shuffle=True, pin_memory=False) - writer = SummaryWriter(logdir=log_directory + log_template) - - # Loss and optimizer classify job - criterion = nn.CrossEntropyLoss() - optimizer = optim.Adam(model.parameters()) - - # Training - for epoch in range(num_epochs): - train_loss = 0 - for step, (seq, label) in enumerate(data_loader): - seq = seq.clone().detach().view(-1, window_length, input_size_0).to(device) - output = model(seq) - - loss = criterion(output, label.to(device)) - - # Backward and optimize - optimizer.zero_grad() - loss.backward() - train_loss += loss.item() - optimizer.step() - print('Epoch [{}/{}], training_loss: {:.6f}'.format(epoch + 1, num_epochs, train_loss / len(data_loader.dataset))) - if (epoch + 1) % 100 == 0: - if not os.path.isdir(model_output_directory): - os.makedirs(model_output_directory) - e_log = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(epoch+1) - torch.save(model.state_dict(), model_output_directory + '/' + e_log + '.pt') - writer.close() - print('Training finished') - -if __name__=='__main__': - input_size_0 = 1 - input_size_1 = 61 - hidden_size = 30 - num_of_layers = 2 - num_of_classes = 61 - num_epochs = 100 - batch_size = 200 - window_length = 5 - train_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_train' - test_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_test' - train_root_path = '../../Data/FTTreeResult-HDFS/model_train/' - label_file_name = '../../Data/FTTreeResult-HDFS/deeplog_files/HDFS_abnormal_label.txt' - model_out_path = train_root_path + 'sequence_model_out/' - train_model(window_length, input_size_0,input_size_1, hidden_size, - num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, - model_out_path, train_logkey_path) \ No newline at end of file diff --git a/anomalydetection/loganomaly/log_anomaly_sequential_predict.py b/anomalydetection/loganomaly/log_anomaly_sequential_predict.py index 7c010b7..a35446b 100644 --- a/anomalydetection/loganomaly/log_anomaly_sequential_predict.py +++ b/anomalydetection/loganomaly/log_anomaly_sequential_predict.py @@ -83,7 +83,7 @@ def do_predict(input_size, hidden_size, num_layers, num_classes, window_length, predicted = torch.argsort(output, 1)[0][-num_candidates:] predicted = filter_small_top_k(predicted, output) #print(output) - print('{} - predict result: {}, true label: {}'.format(count_num, predicted, vec_to_class_type[tuple(label)])) + #print('{} - predict result: {}, true label: {}'.format(count_num, predicted, vec_to_class_type[tuple(label)])) if lineNum in abnormal_label: ## 若出现异常日志,则接下来的预测跳过异常日志,保证进行预测的日志均为正常日志 i += window_length + 1 skip_count += 1 diff --git a/anomalydetection/loganomaly/log_anomaly_sequential_train.py b/anomalydetection/loganomaly/log_anomaly_sequential_train.py index 3d87fc2..fe7f7d7 100644 --- a/anomalydetection/loganomaly/log_anomaly_sequential_train.py +++ b/anomalydetection/loganomaly/log_anomaly_sequential_train.py @@ -40,7 +40,7 @@ def generate_seq_label(file_path, window_length, pattern_vec_file): def train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, 
num_epochs, batch_size, root_path, model_output_directory, data_file, pattern_vec_file): # log setting - log_directory = root_path + 'sequence_log_out/' + log_directory = root_path + 'log_out/' log_template = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) print("Train num_classes: ", num_of_classes) diff --git a/anomalydetection/loganomaly/log_anomaly_train.py b/anomalydetection/loganomaly/log_anomaly_train.py deleted file mode 100644 index b515ed5..0000000 --- a/anomalydetection/loganomaly/log_anomaly_train.py +++ /dev/null @@ -1,122 +0,0 @@ -import torch -import torch.nn as nn -import torch.optim as optim -from tensorboardX import SummaryWriter -from torch.utils.data import TensorDataset, DataLoader -import numpy as np -import argparse -import os -from . import * - -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - -def generate_label(logkey_path, window_length,num_of_classes): - f = open(logkey_path,'r') - keys = f.readline().split() - keys = list(map(int, keys)) - print(keys) - length = len(keys) - input_1 = np.zeros((length -window_length,num_of_classes)) - output_1 = np.zeros(length -window_length,dtype=np.int) - input_2 = np.zeros((length -window_length,num_of_classes)) - output = np.zeros(length -window_length,dtype=np.int) - for i in range(0,length -window_length): - for t in range(0,num_of_classes): - input_1[i][t] = keys[i] - for j in range(i,i+window_length): - input_2[i][keys[j]-1] += 1 - output[i] = keys[i+window_length]-1 - new_input_1 = np.zeros((length -2*window_length+1,window_length,num_of_classes)) - new_input_2 = np.zeros((length - 2 * window_length + 1, window_length, num_of_classes)) - for i in range(0,length -2*window_length+1): - for j in range(i,i+window_length): - new_input_1[i][j - i] = input_1[j] - new_input_2[i][j-i] = input_2[j] - new_output = output[window_length-1:] - print(new_input_1.shape) - print(new_input_2.shape) - print(new_output.shape) - dataset = TensorDataset(torch.tensor(new_input_1,dtype=torch.float), - torch.tensor(new_input_2,dtype=torch.float),torch.tensor(new_output,dtype=torch.long)) - return dataset - -class Model(nn.Module): - def __init__(self, input_size_0,input_size_1, hidden_size, num_of_layers, out_size): - super(Model, self).__init__() - self.hidden_size = hidden_size - self.num_of_layers = num_of_layers - self.lstm0 = nn.LSTM(input_size_0, hidden_size, num_of_layers, batch_first=True) - self.lstm1 = nn.LSTM(input_size_1, hidden_size, num_of_layers, batch_first=True) - self.fc = nn.Linear(2*hidden_size, out_size) - - - def forward(self, input_0,input_1): - h0_0 = torch.zeros(self.num_of_layers, input_0.size(0), self.hidden_size).to(device) - c0_0 = torch.zeros(self.num_of_layers, input_0.size(0), self.hidden_size).to(device) - out_0, _ = self.lstm0(input_0, (h0_0, c0_0)) - h0_1 = torch.zeros(self.num_of_layers, input_1.size(0), self.hidden_size).to(device) - c0_1 = torch.zeros(self.num_of_layers, input_1.size(0), self.hidden_size).to(device) - out_1, _ = self.lstm1(input_1, (h0_1, c0_1)) - multi_out = torch.cat((out_0[:, -1, :], out_1[:, -1, :]), -1) - out = self.fc(multi_out) - return out - -def train_model(window_length, input_size_0,input_size_1, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, root_path, model_output_directory,logkey_path): - # log setting - log_directory = root_path + 'log_out/' - log_template = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) - - print("Train num_classes: ", num_of_classes) - model = 
Model(input_size_0,input_size_1, hidden_size, num_of_layers, num_of_classes).to(device) - # create data set - data_set = generate_label(logkey_path, window_length,num_of_classes) - # create data_loader - data_loader = DataLoader(dataset=data_set, batch_size=batch_size, shuffle=True, pin_memory=False) - writer = SummaryWriter(logdir=log_directory + log_template) - - # Loss and optimizer classify job - criterion = nn.CrossEntropyLoss() - optimizer = optim.Adam(model.parameters()) - - # Training - for epoch in range(num_epochs): - train_loss = 0 - for step, (seq, quan, label) in enumerate(data_loader): - seq = seq.clone().detach().view(-1, window_length, input_size_0).to(device) - quan = quan.clone().detach().view(-1, window_length, input_size_1).to(device) - output = model(seq,quan) - - loss = criterion(output, label.to(device)) - - # Backward and optimize - optimizer.zero_grad() - loss.backward() - train_loss += loss.item() - optimizer.step() - print('Epoch [{}/{}], training_loss: {:.6f}'.format(epoch + 1, num_epochs, train_loss / len(data_loader.dataset))) - if (epoch + 1) % 100 == 0: - if not os.path.isdir(model_output_directory): - os.makedirs(model_output_directory) - e_log = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(epoch+1) - torch.save(model.state_dict(), model_output_directory + '/' + e_log + '.pt') - writer.close() - print('Training finished') - -if __name__=='__main__': - input_size_0 = 61 - input_size_1 = 61 - hidden_size = 30 - num_of_layers = 2 - num_of_classes = 61 - num_epochs = 100 - batch_size = 200 - window_length = 10 - train_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_train' - test_logkey_path = '../../Data/FTTreeResult-HDFS/deeplog_files/logkey/logkey_test' - train_root_path = '../../Data/FTTreeResult-HDFS/model_train/' - label_file_name = '../../Data/FTTreeResult-HDFS/deeplog_files/HDFS_abnormal_label.txt' - model_out_path = train_root_path + 'model_out/' - train_model(window_length, input_size_0,input_size_1, hidden_size, - num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, - model_out_path, train_logkey_path) \ No newline at end of file diff --git a/anomalydetection/robust/__pycache__/__init__.cpython-36.pyc b/anomalydetection/robust/__pycache__/__init__.cpython-36.pyc index 9ec1bcab631d6f9267c860993ef65bb36cc9f272..96ca0bf2b56b979fa771187f7bc7de2942910feb 100644 GIT binary patch delta 26 hcmZ3-xQ>z4n3tC;Vz4n3tC;Z6d1!r(0%DYH^HjrO!ma1OQuJ2cQ4| diff --git a/anomalydetection/robust/__pycache__/bi_lstm_att_predict.cpython-36.pyc b/anomalydetection/robust/__pycache__/bi_lstm_att_predict.cpython-36.pyc index 7fca6266b86e95905ad8d42b8cf807aaa6eb4d4d..b34a4baf061898e9124ed2557a487656f86f9311 100644 GIT binary patch literal 4306 zcmZ`+&2J<}6|d^A>G`t9_N+g4on;XrXu`&1iA17kNp{Jy;$zpCjip5+C!_XMd)(um z?%C=ZuWh<7OcHU45{_`(W8ec5Km(#`EK$x?YQP(c8YJKvcu#fKL_RyNL zd+fb83P;(>5tgt&V8RjZ2P}7n?vxEZ;Z6BHHpM$rUO2fmtSCEoFy@>+S%1tP!w5fR zkJ!&xZb9+NrslHG@j-g1bx__#$(B(mrns_lHng#~Gi9_Z-r}1bSG#I5jzevag0Xh@ z;xtg&&6JEV%Eob|a9*B##v~o<`QAHk?mRr+i{dat8;5~RqGYf$I)0kUY(E|E#OWYN zg7`R#vYiL%;O?v+B;wG zmG#7-<4ZCW&S%GAf;+c1Ue)}_9JH+FvHZMtJB~8dPeGs!mk{mG*)$qy(Q~7CxDTNk7aq-=hz}By=(7`f7is*b?S5+MRKaH~1WX$zJF3D;W0)ru29yq>h2> z572>~V5!{7ZNUkaAA3Up2M}~~cL=Bs-P|FdevCQJFy~EyHdPVc(4WCGcMTx7OY2&? 
diff --git a/anomalydetection/robust/__pycache__/bi_lstm_att_train.cpython-36.pyc b/anomalydetection/robust/__pycache__/bi_lstm_att_train.cpython-36.pyc index 22214b0493bcb0a57a8dbe6b3d15daccd19b4a02..54870d8617ed723d1e27985122e17b443cb10d6d 100644 GIT binary patch
zym~UW{b<#6AmxhiwrCZacODY21rl)&?6(_hXK`dCx(mC*a4mF&H8u^G@YP~%TV^vJ z 0.001: + filter.append(p) + return filter + + +def generate_robust_seq_label(file_path, sequence_length): + num_of_sessions = 0 + input_data, output_data, mask_data = [], [], [] + train_file = pd.read_csv(file_path) + i = 0 + while i < len(train_file): + num_of_sessions += 1 + line = [int(id) for id in train_file["Sequence"][i].split(' ')] + line = line[0:sequence_length] + if len(line) < sequence_length: + line.extend(list([0]) * (sequence_length - len(line))) + input_data.append(line) + output_data.append(int(train_file["label"][i])) + i += 1 + data_set = TensorDataset(torch.tensor(input_data), torch.tensor(output_data)) + return data_set + + +def get_batch_semantic(seq, pattern_vec_file): with open(pattern_vec_file, 'r') as pattern_file: - i = 0 - for line in pattern_file.readlines(): - pattern, vec = line.split('[:]') - pattern_vector = tuple(map(float, vec.strip().split(' '))) - vec_to_class_type[pattern_vector] = i - i = i + 1 + class_type_to_vec = json.load(pattern_file) + batch_data = [] + for s in seq: + semantic_line = [] + for event in s.numpy().tolist(): + if event == 0: + semantic_line.append([-1] * 300) + else: + semantic_line.append(class_type_to_vec[str(event)]) + batch_data.append(semantic_line) + return batch_data + + +def do_predict(input_size, hidden_size, num_layers, num_classes, sequence_length, model_path, test_file_path, batch_size, pattern_vec_json): sequential_model = load_sequential_model(input_size, hidden_size, num_layers, num_classes, model_path) @@ -49,44 +86,30 @@ def do_predict(input_size, hidden_size, num_layers, num_classes, window_length, FP = 0 TN = 0 FN = 0 - ALL = 0 - abnormal_loader = generate(test_file_path, window_length) - abnormal_label = [] - with open(anomaly_test_line_path) as f: - abnormal_label = [int(x) for x in f.readline().strip().split()] + + # create data set + sequence_data_set = generate_robust_seq_label(test_file_path, sequence_length) + # create data_loader + data_loader = DataLoader(dataset=sequence_data_set, batch_size=batch_size, shuffle=True, pin_memory=False) + print('predict start') with torch.no_grad(): - count_num = 0 - current_file_line = 0 - for line in abnormal_loader: - i = 0 - # first traverse [0, window_size) - while i < len(line) - window_length: - lineNum = current_file_line * 10 + i + window_length + 1 - count_num += 1 - seq = line[i:i + window_length] - label = line[i + window_length] - seq = torch.tensor(seq, dtype=torch.float).view(-1, window_length, input_size).to(device) - #label = torch.tensor(label).view(-1).to(device) - output = sequential_model(seq) - predicted = torch.argsort(output, 1)[0][-num_candidates:] - print('{} - predict result: {}, true label: {}'.format(count_num, predicted, vec_to_class_type[tuple(label)])) - if lineNum in abnormal_label: ## 若出现异常日志,则接下来的预测跳过异常日志,保证进行预测的日志均为正常日志 - i += window_length + 1 - else: - i += 1 - ALL += 1 - if vec_to_class_type[tuple(label)] not in predicted: - if lineNum in abnormal_label: - TN += 1 - else: - FN += 1 - else: - if lineNum in abnormal_label: - FP += 1 - else: - TP += 1 - current_file_line += 1 + count = 0 + for step, (seq, label) in enumerate(data_loader): + batch_data = get_batch_semantic(seq, pattern_vec_json) + seq = torch.tensor(batch_data) + seq = seq.view(-1, sequence_length, input_size).to(device) + output = sequential_model(seq)[:, 0].cpu().clone().detach().numpy() + predicted = (output > 0.2).astype(int) + label = np.array([y for y in label]) + TP += ((predicted == 1) * (label 
== 1)).sum() + FP += ((predicted == 1) * (label == 0)).sum() + FN += ((predicted == 0) * (label == 1)).sum() + TN += ((predicted == 0) * (label == 0)).sum() + count += 1 + if count > 100000: + break + ALL = TP + TN + FP + FN # Compute precision, recall and F1-measure if TP + FP == 0: P = 0 diff --git a/anomalydetection/robust/bi_lstm_att_train.py b/anomalydetection/robust/bi_lstm_att_train.py index 0416371..75509f1 100644 --- a/anomalydetection/robust/bi_lstm_att_train.py +++ b/anomalydetection/robust/bi_lstm_att_train.py @@ -1,5 +1,7 @@ # -*- coding: UTF-8 -*- +import json import torch +import pandas as pd import torch.nn as nn import torch.optim as optim import torch.nn.functional as F @@ -17,25 +19,32 @@ def __init__(self, input_size, hidden_size, num_of_layers, out_size, if_bidirect super(Model, self).__init__() self.hidden_size = hidden_size self.num_of_layers = num_of_layers - self.lstm = nn.LSTM(input_size, hidden_size, num_of_layers, batch_first=True, bidirectional=if_bidirectional) - self.fc = nn.Linear(hidden_size*2, out_size) - self.batch_size = batch_size + self.lstm = nn.LSTM(input_size, hidden_size, num_of_layers, batch_first=True, bidirectional=if_bidirectional, dropout=0.5) if if_bidirectional: self.num_of_directions = 2 else: self.num_of_directions = 1 + self.fc = nn.Linear(hidden_size*self.num_of_directions, out_size) + self.batch_size = batch_size self.att_weight = nn.Parameter(torch.randn(1, 1, self.hidden_size*self.num_of_directions)) # self.out = nn.Linear(in_features=in_features, out_features=out_features) +# att BiLSTM paper actually H is different from the paper in paper H = hf + hb def attention_net(self, H): - # print(lstm_output.size()) = (squence_length, batch_size, hidden_size*layer_size) + # print(H.size()) = [batch, numdirec*hidden, seqlen] M = F.tanh(H) a = F.softmax(torch.matmul(self.att_weight, M), 2) a = torch.transpose(a, 1, 2) return torch.bmm(H, a) + def robust_attention_net(self, H): + # print(H.size()) = [batch, numdirec*hidden, seqlen] + M = torch.matmul(self.att_weight, H) + a = torch.tanh(M) + a = torch.transpose(a, 1, 2) + return torch.bmm(H, a) def init_hidden(self, size): # size self.batch_size same @@ -52,12 +61,12 @@ def forward(self, input): # out shape [batch, seqlen, numdirec*hidden] out = torch.transpose(out, 1, 2) # out shape [batch, numdirec*hidden, seqlen] - att_out = self.attention_net(out) + att_out = self.robust_attention_net(out) out = self.fc(att_out[:, :, 0]) - # print('out[:, -1, :]:') - # print(out) - return out + # out shape[batch, num_of_class = 1] + # add sigmoid + return torch.sigmoid(out) def generate_seq_label(file_path, window_length, pattern_vec_file): @@ -75,8 +84,9 @@ def generate_seq_label(file_path, window_length, pattern_vec_file): for line in file.readlines(): num_of_sessions += 1 line = tuple(map(lambda n: tuple(map(float, n.strip().split())), [x for x in line.strip().split(',') if len(x) > 0])) - if len(line) < 10: - print(line) + if len(line) < window_length + 1: + # print(line) + continue for i in range(len(line) - window_length): input_data.append(line[i:i + window_length]) # line[i] is a list need to read file form a dic{vec:log_key} to get log key @@ -85,7 +95,31 @@ def generate_seq_label(file_path, window_length, pattern_vec_file): return data_set -def train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, root_path, model_output_directory, data_file, pattern_vec_file): +def generate_robust_seq_label(file_path, sequence_length, pattern_vec_file): + with 
open(pattern_vec_file, 'r') as pattern_file: + class_type_to_vec = json.load(pattern_file) + num_of_sessions = 0 + input_data, output_data = [], [] + train_file = pd.read_csv(file_path) + for i in range(len(train_file)): + num_of_sessions += 1 + line = [int(id) for id in train_file["Sequence"][i].split(' ')] + line = line[0:sequence_length] + if len(line) < sequence_length: + line.extend(list([0]) * (sequence_length - len(line))) + semantic_line = [] + for event in line: + if event == 0: + semantic_line.append([-1] * 300) + else: + semantic_line.append(class_type_to_vec[str(event)]) + input_data.append(semantic_line) + output_data.append(int(train_file["label"][i])) + data_set = TensorDataset(torch.tensor(input_data, dtype=torch.float), torch.tensor(output_data)) + return data_set + + +def train_model(sequence_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, root_path, model_output_directory, data_file, pattern_vec_file): # log setting log_directory = root_path + 'log_out/' log_template = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) @@ -93,23 +127,23 @@ def train_model(window_length, input_size, hidden_size, num_of_layers, num_of_cl print("Train num_classes: ", num_of_classes) model = Model(input_size, hidden_size, num_of_layers, num_of_classes, True, batch_size).to(device) # create data set - sequence_data_set = generate_seq_label(data_file, window_length, pattern_vec_file) + sequence_data_set = generate_robust_seq_label(data_file, sequence_length, pattern_vec_file) # create data_loader data_loader = DataLoader(dataset=sequence_data_set, batch_size=batch_size, shuffle=True, pin_memory=False) writer = SummaryWriter(logdir=log_directory + log_template) # Loss and optimizer classify job - criterion = nn.CrossEntropyLoss() + criterion = nn.BCELoss() optimizer = optim.Adam(model.parameters()) # Training for epoch in range(num_epochs): train_loss = 0 for step, (seq, label) in enumerate(data_loader): - seq = seq.clone().detach().view(-1, window_length, input_size).to(device) + seq = seq.clone().detach().view(-1, sequence_length, input_size).to(device) output = model(seq) - loss = criterion(output, label.to(device)) + loss = criterion(output.squeeze(-1), label.float().to(device)) # Backward and optimize optimizer.zero_grad() @@ -117,7 +151,7 @@ def train_model(window_length, input_size, hidden_size, num_of_layers, num_of_cl train_loss += loss.item() optimizer.step() print('Epoch [{}/{}], training_loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss / len(data_loader.dataset))) - if (epoch + 1) % 100 == 0: + if (epoch + 1) % num_epochs == 0: if not os.path.isdir(model_output_directory): os.makedirs(model_output_directory) e_log = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(epoch+1) diff --git a/anomalydetection/self_att_lstm/__init__.py b/anomalydetection/self_att_lstm/__init__.py new file mode 100644 index 0000000..9764abf --- /dev/null +++ b/anomalydetection/self_att_lstm/__init__.py @@ -0,0 +1 @@ +# -*- coding: UTF-8 -*- \ No newline at end of file diff --git a/anomalydetection/self_att_lstm/__pycache__/__init__.cpython-36.pyc b/anomalydetection/self_att_lstm/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..66927421169bd3547ca887f2883d4c72d610ebc2 GIT binary patch literal 181 zcmXr!<>g{qH7!n@fq~&M5W@i@kmUfx#VkM~g&~+hlhJP_LlH4|xXIhDnk#W6nl>5edVVqShOP&g&EBsIAt xGe0k;I5j6NKCz@EKBu@OHzqzlGcU6wK3=b&@)n0pZhlH>PO2Tq?qVQj004BZGzkCz literal 0 HcmV?d00001 diff --git 
a/anomalydetection/self_att_lstm/__pycache__/self_att_lstm_predict.cpython-36.pyc b/anomalydetection/self_att_lstm/__pycache__/self_att_lstm_predict.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e405fa198859bb7c5166c5ce3250dc8e98e632c0 GIT binary patch
z^R2fJ+fkN}z!nhS?BZ>bEhJ${_3yuhN=8YxKHise5S7F&7C{*Aa1Xx~?r`7o_@Z^1 zA9M`N)(`i)PZptza7Mm-33u@th+;#oEQ1H9yA1!HS`!O=ue5+PKDEtTjqo?M zfC_sQ z&eR^d3ZVr`{n%HVoNYkr@{L}tQp`PwWnLJt$m5vaU`Jaj*Pe*)CH=VPREjwS?ZDbT zr>*|EXBBK3H>iBzWy~^D?=*+$<}3A{>T#r;i>`DV>)k*D%2X=x5^)%IqMvmewwu@r z@z_|;ChMW63)}GtA_c5Uahm;!;x{X{@HLO+ppk3|2tOwwQ zxRJrd5jPrPILgItO6^t{?(9Zs^~Lx^C5;i2LbPtre6$~@X&62cp~vMKzDq)uiA)p# z3JOs^Z_WFj?-_2NyoE0EH!`-__jj;43B&#(I+LG#{^7(C*2LYrQaVFEWtZ8V^AoT1 zU=KuB*rGP^m8}rJOMmE10Y%}Em9XFS(nV~qUtoym@Vbe+bNUv$1)F@pK4c#uYfc(l zY>QcJ(wwxgp7G1iznZl7{#v$YYkh{b8p8brn>1(@;fWe$N>P8nAValYW6CeXzGh8N z)k;9W&XqmSb$uQ}Z9iXhWp}90bJFq`Kf9Ek@Gl1(8^ys1|Ir4Sf z#S#c*MrGU>d+1D&Bd3(@;hHW2|Lz!h+a5(@?GDmBQph_>CU`5xX`&FXdG^Hq#_-y| zkVd0T5na4SOWV7051=S-FDadWXM0XY^ z9HxRJ3U=3#2&#Xvb{rDaC<{q!N+6cD*3W6a_l!`F8yC`~Q2l%~uEe4>Ja+KXj|+A< zk1ze%_$8oQ%oqQZ;x-R2Lq)3>Bd=sjUJta2%;^-M*h>BEL`Ra0{9a;qKtK!hODkb_1La+Ar}gx4>DlUgoP@{sFy@LNW!74sH_~{)xuS zVJQF~I_#e(&fbHg;IKDwm8EPd;2tn?+0q@lfHYhckfyNU!WmuTqK%ZJZy6Ug z+?k7>f#X|nMi6xe z9r+&3{%ayfDAsn8sS3pg-UMa&8zR3Ya)e@em0k(*JztW`%MBvGCvua>Eg~Nfd7sEf zL`bQMynvQI`Ts?y(I!r@8E)2&Q1c?8hV<_@LFhV#n>Nv)=4J5P)V5W`qQZHHAK(g;Buk4f_QUOjN51q*z z@YF0@nA3mACtc+a>r*zFSB+s4X{0q}bewustu)qCH$Hj|s>Od@riya&EXX<)_Sn1>&}I2O-0kw^-R7 zu8};DrEBIQtS@XF_y8>xG-nUGZ;NOYR_7rq_TD(Cy&jMA{`MOO$8J!mzD9w|91`N< z{MwZpJqJgm!=vYuaVVX)cLxIm#Kt(u!cjcR<$jL`9=S&%w_drl`k#NjcJfU`_EvSA z2}RtG_McQ~2QU7L)?NGflgZgnCU?%PnUexeAYqypSo)Lm>#q(bkNMdHdwr}9=8bgt zyf#3US!~DR(Ra9d&ube=Q@dkH9!2<+XNbH)grc6LY$quU$ORDHy@Xo0_!T zYBwLNWTYM18mM!4GAoQj>UxYFG5J-y{WM49N71Yz9NCrTtNKH%6nVQkPRi_pq^rqNFa8 zJ_-Y@T*SRomHVG2WvKkC&VQX*u9;a)H|X6MwUBqvE2zsnS0(`lOQ=NSl(1kDkU0p$ za=LEv52YgcI~vj^#8d}Kd=6Ul3YEqwMAELAB$%(xmVL-LOI|_Kv{1|7-wEyAH-BLS z^}cx`4eGktYTynQGZF?3V|HV6ImXH`XS}0bQ^shok`oyhb_(1W-L{?$Z_R1~Bhmm9 zm=hwie7<5eaJQ^3@7O+HuosXWmk`x_tBZFO&G=Eg>mc^6qUCe>0ONZrl@WY%E?>{G z3O3E^)yOYto#5DEhxNg3cKCNf<4Ifo6uqRRuC1yT+022Bj|>)dw;H{xlC)T-U8{mV zfi>t<-=bXwbt56^_*4PfLrb@kB8=`u3HAv7=Up>riwgP%@D#O;Do<^d$q6PM_!qEM gIE-ARBvar&Gn^k9Cj8TubnTY4VlABYSNysE0$z>yyZ`_I literal 0 HcmV?d00001 diff --git a/anomalydetection/self_att_lstm/self_att_lstm_predict.py b/anomalydetection/self_att_lstm/self_att_lstm_predict.py new file mode 100644 index 0000000..b62d7ed --- /dev/null +++ b/anomalydetection/self_att_lstm/self_att_lstm_predict.py @@ -0,0 +1,246 @@ +# -*- coding: UTF-8 -*- +# -*- coding: UTF-8 -*- +import torch +import os +import torch.nn as nn +import time +from anomalydetection.self_att_lstm.self_att_lstm_train import Model +import torch.nn.functional as F + +# use cuda if available otherwise use cpu +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# len(line) < window_length + +def generate(name, window_length): + log_keys_sequences = list() + with open(name, 'r') as f: + for line in f.readlines(): + line = tuple(map(lambda n: tuple(map(float, n.strip().split())), [x for x in line.strip().split(',') if len(x) > 0])) + # for i in range(len(line) - window_size): + # inputs.add(tuple(line[i:i+window_size])) + log_keys_sequences.append(tuple(line)) + return log_keys_sequences + +def generate_log_deep(name, window_length): + log_keys_sequences = {} + with open(name, 'r') as f: + for line in f.readlines(): + if len(line) < window_length + 1: + continue + ln = list(map(lambda n: n-1, map(int, line.strip().split()))) + # for i in range(len(line) - window_size): + # inputs.add(tuple(line[i:i+window_size])) + log_keys_sequences[tuple(ln)] = log_keys_sequences.get(tuple(ln), 0) + 1 + return log_keys_sequences + + +def load_sequential_model(input_size, hidden_size, 
num_layers, num_classes, model_path, window_size): + + model1 = Model(input_size, hidden_size, num_layers, num_classes, if_bidirectional=False, sequen_len=window_size).to(device) + model1.load_state_dict(torch.load(model_path, map_location='cpu')) + model1.eval() + print('model_path: {}'.format(model_path)) + return model1 + +def filter_small_top_k(predicted, output): + filter = [] + for p in predicted: + if output[0][p] > 0.001: + filter.append(p) + return filter + + +def do_predict(input_size, hidden_size, num_layers, num_classes, window_length, model_path, anomaly_test_line_path, test_file_path, num_candidates, pattern_vec_file): + vec_to_class_type = {} + with open(pattern_vec_file, 'r') as pattern_file: + i = 0 + for line in pattern_file.readlines(): + pattern, vec = line.split('[:]') + pattern_vector = tuple(map(float, vec.strip().split(' '))) + vec_to_class_type[pattern_vector] = i + i = i + 1 + + sequential_model = load_sequential_model(input_size, hidden_size, num_layers, num_classes, model_path, window_length) + + start_time = time.time() + TP = 0 + FP = 0 + TN = 0 + FN = 0 + ALL = 0 + abnormal_loader = generate(test_file_path, window_length) + with open(anomaly_test_line_path) as f: + abnormal_label = [int(x) for x in f.readline().strip().split()] + # for testing model using train set + # abnormal_label = [] + print('predict start') + with torch.no_grad(): + count_num = 0 + current_file_line = 0 + for line in abnormal_loader: + i = 0 + # first traverse [0, window_size) + while i < len(line) - window_length: + lineNum = current_file_line * 200 + i + window_length + 1 + input_abnormal = False + count_num += 1 + seq = line[i:i + window_length] + origin_seq = seq + label = line[i + window_length] + for n in range(len(seq)): + if current_file_line * 200 + i + n + 1 in abnormal_label: + input_abnormal = True + continue + seq = torch.tensor(seq, dtype=torch.float).view(-1, window_length, input_size).to(device) + #label = torch.tensor(label).view(-1).to(device) + output = sequential_model(seq) + output = F.softmax(output, 1) + # print(torch.sort(output, 1)) + predicted = torch.argsort(output, 1)[0][-num_candidates:] + predicted = filter_small_top_k(predicted, output) + # print(predicted) + # print('Fp {} - predict result: {}, true label: {}'.format(lineNum, predicted, vec_to_class_type[tuple(label)])) + '''if lineNum in abnormal_label or in: # 若出现异常日志,则接下来的预测跳过异常日志,保证进行预测的日志均为正常日志 + i += window_length + 1 + else: + i += 1''' + i += 1 + ALL += 1 + if vec_to_class_type[tuple(label)] not in predicted: + if lineNum in abnormal_label or input_abnormal: + TP += 1 + else: + FP += 1 + + else: + if lineNum in abnormal_label or input_abnormal: + print('FN {} - predict result: {}, true label: {}'.format(lineNum, predicted, vec_to_class_type[tuple(label)])) + print(torch.sort(output, 1)) + for l in origin_seq: + print(str(vec_to_class_type[tuple(l)]), end='') + print(',', end='') + print(str(vec_to_class_type[tuple(label)])) + FN += 1 + else: + TN += 1 + current_file_line += 1 + # Compute precision, recall and F1-measure + if TP + FP == 0: + P = 0 + else: + P = 100 * TP / (TP + FP) + + if TP + FN == 0: + R = 0 + else: + R = 100 * TP / (TP + FN) + + if P + R == 0: + F1 = 0 + else: + F1 = 2 * P * R / (P + R) + + Acc = (TP + TN) * 100 / ALL + FAR = FP * 100 / (FP+TN) + print('FP: {}, FN: {}, TP: {}, TN: {}'.format(FP, FN, TP, TN)) + print('Acc: {:.3f}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%, FAR: {:.3f}%'.format(Acc, P, R, F1, FAR)) + print('Finished Predicting') + elapsed_time = 
time.time() - start_time + print('elapsed_time: {}'.format(elapsed_time)) + + #draw_evaluation("Evaluations", ['Acc', 'Precision', 'Recall', 'F1-measure'], [Acc, P, R, F1], 'evaluations', '%') + + +def do_log_deep_predict(input_size, hidden_size, num_layers, num_classes, window_length, model_path, test_normal_file_path, test_abnormal_file_path, num_candidates, pattern_vec_file): + + sequential_model = load_sequential_model(input_size, hidden_size, num_layers, num_classes, model_path, window_length) + + start_time = time.time() + TP = 0 + FP = 0 + TN = 0 + FN = 0 + ALL = 0 + normal_loader = generate_log_deep(test_normal_file_path, window_length) + abnormal_loader = generate_log_deep(test_abnormal_file_path, window_length) + # for testing model using train set + # abnormal_label = [] + print('predict start') + with torch.no_grad(): + count_num = 0 + current_file_line = 0 + for line in normal_loader.keys(): + count_num += 1 + print(count_num) + if count_num > 6000: + break + i = 0 + # first traverse [0, window_size) + while i < len(line) - window_length: + seq = line[i:i + window_length] + label = line[i + window_length] + seq = torch.tensor(seq, dtype=torch.float).view(-1, window_length, input_size).to(device) + #label = torch.tensor(label).view(-1).to(device) + output = sequential_model(seq) + output = F.softmax(output, 1) + # print(torch.sort(output, 1)) + predicted = torch.argsort(output, 1)[0][-num_candidates:] + predicted = filter_small_top_k(predicted, output) + # print(predicted) + # print('Fp {} - predict result: {}, true label: {}'.format(lineNum, predicted, vec_to_class_type[tuple(label)])) + if label in predicted: + TN += normal_loader[line] + else: + FP += normal_loader[line] + i += 1 + with torch.no_grad(): + count_num = 0 + current_file_line = 0 + for line in abnormal_loader.keys(): + count_num += 1 + i = 0 + # first traverse [0, window_size) + while i < len(line) - window_length: + seq = line[i:i + window_length] + label = line[i + window_length] + seq = torch.tensor(seq, dtype=torch.float).view(-1, window_length, input_size).to(device) + #label = torch.tensor(label).view(-1).to(device) + output = sequential_model(seq) + output = F.softmax(output, 1) + # print(torch.sort(output, 1)) + predicted = torch.argsort(output, 1)[0][-num_candidates:] + predicted = filter_small_top_k(predicted, output) + # print(predicted) + # print('Fp {} - predict result: {}, true label: {}'.format(lineNum, predicted, vec_to_class_type[tuple(label)])) + if label in predicted: + FN += abnormal_loader[line] + else: + TP += abnormal_loader[line] + i += 1 + print(count_num) + + # Compute precision, recall and F1-measure + if TP + FP == 0: + P = 0 + else: + P = 100 * TP / (TP + FP) + + if TP + FN == 0: + R = 0 + else: + R = 100 * TP / (TP + FN) + + if P + R == 0: + F1 = 0 + else: + F1 = 2 * P * R / (P + R) + + Acc = (TP + TN) * 100 /(TP + TN + FN + FP) + print('FP: {}, FN: {}, TP: {}, TN: {}'.format(FP, FN, TP, TN)) + print('Acc: {:.3f}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(Acc, P, R, F1)) + print('Finished Predicting') + elapsed_time = time.time() - start_time + print('elapsed_time: {}'.format(elapsed_time)) + + #draw_evaluation("Evaluations", ['Acc', 'Precision', 'Recall', 'F1-measure'], [Acc, P, R, F1], 'evaluations', '%') \ No newline at end of file diff --git a/anomalydetection/self_att_lstm/self_att_lstm_train.py b/anomalydetection/self_att_lstm/self_att_lstm_train.py new file mode 100644 index 0000000..b90dfb5 --- /dev/null +++ 
b/anomalydetection/self_att_lstm/self_att_lstm_train.py @@ -0,0 +1,140 @@ +# -*- coding: UTF-8 -*- +# regularization waiting for heliren sparse +import torch +import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F +import os +from tensorboardX import SummaryWriter +from torch.utils.data import TensorDataset, DataLoader + +# use cuda if available otherwise use cpu +from torch.autograd import Variable + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +class Model(nn.Module): + def __init__(self, input_size, hidden_size, num_of_layers, out_size, if_bidirectional, sequence_len): + super(Model, self).__init__() + self.hidden_size = hidden_size + self.num_of_layers = num_of_layers + self.lstm = nn.LSTM(input_size, hidden_size, num_of_layers, batch_first=True, bidirectional=if_bidirectional, dropout=0.5) + if if_bidirectional: + self.num_of_directions = 2 + else: + self.num_of_directions = 1 + self.fc = nn.Linear(hidden_size*self.num_of_directions, out_size) + + self.att_weight = nn.Parameter(torch.randn(1, 1, self.hidden_size*self.num_of_directions)) + self.att_bias = nn.Parameter(torch.randn(1, 1, sequence_len)) + + # self.out = nn.Linear(in_features=in_features, out_features=out_features) + +# l1 regularization will add later + def attention_net(self, H): + # print(H.size()) = [batch, numdirec*hidden, seqlen] + a = F.softmax(torch.matmul(self.att_weight, H) + self.att_bias, 2) + a = torch.transpose(a, 1, 2) + return torch.bmm(H, a) + + def init_hidden(self, size): + # size self.batch_size same + h0 = torch.zeros(self.num_of_layers*self.num_of_directions, size, self.hidden_size).to(device) + c0 = torch.zeros(self.num_of_layers*self.num_of_directions, size, self.hidden_size).to(device) + return (h0, c0) + + def forward(self, input): + # h_n: hidden state h of last time step + # c_n: hidden state c of last time step + out, _ = self.lstm(input, self.init_hidden(input.size(0))) + + # out = torch.transpose(out, 0, 1) + # out shape [batch, seqlen, numdirec*hidden] + out = torch.transpose(out, 1, 2) + # out shape [batch, numdirec*hidden, seqlen] + att_out = self.attention_net(out) + # att_out shape[batch, num_direc*hidden_size, 1] + # att_out[:, :, 0] shape[batch, num_direc*hidden_size] + out = self.fc(att_out[:, :, 0]) + # out shape[batch, num_of_class] + return out + + +def generate_seq_label(file_path, window_length, pattern_vec_file): + vec_to_class_type = {} + with open(pattern_vec_file, 'r') as pattern_file: + i = 0 + for line in pattern_file.readlines(): + pattern, vec = line.split('[:]') + pattern_vector = tuple(map(float, vec.strip().split(' '))) + vec_to_class_type[pattern_vector] = i + i = i + 1 + num_of_sessions = 0 + input_data, output_data = [], [] + with open(file_path, 'r') as file: + for line in file.readlines(): + num_of_sessions += 1 + line = tuple(map(lambda n: tuple(map(float, n.strip().split())), [x for x in line.strip().split(',') if len(x) > 0])) + if len(line) < window_length + 1: + continue + for i in range(len(line) - window_length): + input_data.append(line[i:i + window_length]) + # line[i] is a list need to read file form a dic{vec:log_key} to get log key + output_data.append(vec_to_class_type[line[i + window_length]]) + data_set = TensorDataset(torch.tensor(input_data, dtype=torch.float), torch.tensor(output_data)) + return data_set + +def generate_logdeep_seq_label(file_path, window_length): + input_data, output_data = [], [] + with open(file_path, 'r') as file: + for line in file.readlines(): + line = 
tuple(map(lambda n: n-1, map(int, line.strip().split()))) + if len(line) < window_length + 1: + continue + for i in range(len(line) - window_length): + input_data.append(line[i:i + window_length]) + # line[i] is a list need to read file form a dic{vec:log_key} to get log key + output_data.append(line[i + window_length]) + data_set = TensorDataset(torch.tensor(input_data, dtype=torch.float), torch.tensor(output_data)) + return data_set + + +def train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, root_path, model_output_directory, data_file, pattern_vec_file): + # log setting + log_directory = root_path + 'log_out/' + log_template = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) + + print("Train num_classes: ", num_of_classes) + model = Model(input_size, hidden_size, num_of_layers, num_of_classes, False, window_length).to(device) + # create data set + sequence_data_set = generate_seq_label(data_file, window_length, pattern_vec_file) + # create data_loader + data_loader = DataLoader(dataset=sequence_data_set, batch_size=batch_size, shuffle=True, pin_memory=False) + writer = SummaryWriter(logdir=log_directory + log_template) + + # Loss and optimizer classify job + criterion = nn.CrossEntropyLoss() + optimizer = optim.Adam(model.parameters(), weight_decay=0.0001) + + # Training + for epoch in range(num_epochs): + train_loss = 0 + for step, (seq, label) in enumerate(data_loader): + seq = seq.clone().detach().view(-1, window_length, input_size).to(device) + output = model(seq) + + loss = criterion(output, label.to(device)) + + # Backward and optimize + optimizer.zero_grad() + loss.backward() + train_loss += loss.item() + optimizer.step() + print('Epoch [{}/{}], training_loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss / len(data_loader.dataset))) + if (epoch + 1) % num_epochs == 0: + if not os.path.isdir(model_output_directory): + os.makedirs(model_output_directory) + e_log = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(epoch+1) + torch.save(model.state_dict(), model_output_directory + '/' + e_log + '.pt') + writer.close() + print('Training finished') \ No newline at end of file diff --git a/anomalydetection/self_att_lstm_supervised/__init__.py b/anomalydetection/self_att_lstm_supervised/__init__.py new file mode 100644 index 0000000..9764abf --- /dev/null +++ b/anomalydetection/self_att_lstm_supervised/__init__.py @@ -0,0 +1 @@ +# -*- coding: UTF-8 -*- \ No newline at end of file diff --git a/anomalydetection/self_att_lstm_supervised/__pycache__/__init__.cpython-36.pyc b/anomalydetection/self_att_lstm_supervised/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..66c3c532c0df3162809b72e358dbdb0b4afc30da GIT binary patch literal 192 zcmXr!<>hLfwu4|xXIhDnk#W6nl>5edVVqShOP&g&EBsIAt zGe0k;I5j6NKCz@EKBu@OH@>*EAhoD0vp6*+CO$qhFS8^*Uaz3?7Kcr4eoARhsvXD~ I#X!se0F042d;kCd literal 0 HcmV?d00001 diff --git a/anomalydetection/self_att_lstm_supervised/__pycache__/self_att_lstm_supervised_predict.cpython-36.pyc b/anomalydetection/self_att_lstm_supervised/__pycache__/self_att_lstm_supervised_predict.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d3c6e27348cadffa6222b8938a94c881529eac2 GIT binary patch literal 4198 zcmZ`+&5s;M6|d^<`Re)D+1;_d>uijXNXUY-#7YpsvK&IpDnj6xvrS??~@r%AY30d61;2yw|J-1r}mI3Z3Y4oI9<;sQqt34X77zp{$ERbQ`ORlj;4 zzk2;R=Y=#lWnYew*63M`B>eJb(~9WL+acc$3m)A 
z7$UkIH0cfePT?Zwt1VwFRBIOa9ERV>BZb9PE6fp~Axvu^MvQ2nty74O7fAXfQ^<@XHR*f2l8d-+DsM%q~=^;J?<%MYf*A)H8K z5~cn)9B1MPV0W0eZM^WetG-( zU2Xix-TWlWvF7{d_g);9kJ#CnwKtKoWi1xEcZZ0>^MjC!AuLiCy|0Kd>dKi2aE*=( z#aZe+M+I53peR5P!w8BbRqeIczjp=8C?}i9Xsm3~2exvY*5;ZIR6T?mbhx3ML6X4{ zBBhW4@?HC06y8T@jl?uW{y`L=>XeB1BLQyl_JhG4ik_8?4ypzj{1iZ#Yel zzuGh&*Tu6m@lR>4Ru#DIXRwubw1KK__4ZUoNxnx~to#jGaKBUa1yczH1YA(+Sy?${ zhYi!inB;j PU8`konk(nLZEy2GGSGA{ literal 0 HcmV?d00001 diff --git a/anomalydetection/self_att_lstm_supervised/self_att_lstm_supervised_predict.py b/anomalydetection/self_att_lstm_supervised/self_att_lstm_supervised_predict.py new file mode 100644 index 0000000..7414db8 --- /dev/null +++ b/anomalydetection/self_att_lstm_supervised/self_att_lstm_supervised_predict.py @@ -0,0 +1,131 @@ +# -*- coding: UTF-8 -*- +# -*- coding: UTF-8 -*- +import torch +import json +import pandas as pd +import numpy as np +import os +import torch.nn as nn +import time +import random +from torch.utils.data import TensorDataset, DataLoader +from anomalydetection.self_att_lstm_supervised.self_att_lstm_supervised_train import Model + +# use cuda if available otherwise use cpu +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# len(line) < window_length + +def generate(name, window_length): + log_keys_sequences = list() + with open(name, 'r') as f: + for line in f.readlines(): + line = tuple(map(lambda n: tuple(map(float, n.strip().split())), [x for x in line.strip().split(',') if len(x) > 0])) + # for i in range(len(line) - window_size): + # inputs.add(tuple(line[i:i+window_size])) + log_keys_sequences.append(tuple(line)) + return log_keys_sequences + + + +def load_sequential_model(input_size, hidden_size, num_layers, num_classes, model_path, batch_size, sequence_length): + + model1 = Model(input_size, hidden_size, num_layers, num_classes, if_bidirectional=False, batch_size=0, sequence_len=sequence_length).to(device) + model1.load_state_dict(torch.load(model_path, map_location='cpu')) + model1.eval() + print('model_path: {}'.format(model_path)) + return model1 + + +def filter_small_top_k(predicted, output): + filter = [] + for p in predicted: + if output[0][p] > 0.001: + filter.append(p) + return filter + + +def generate_robust_seq_label(file_path, sequence_length, pattern_vec_file): + with open(pattern_vec_file, 'r') as pattern_file: + class_type_to_vec = json.load(pattern_file) + num_of_sessions = 0 + input_data, output_data = [], [] + train_file = pd.read_csv(file_path) + i = 0 + while i < len(train_file): + num_of_sessions += 1 + line = [int(id) for id in train_file["Sequence"][i].split(' ')] + line = line[0:sequence_length] + if len(line) < sequence_length: + line.extend(list([0]) * (sequence_length - len(line))) + semantic_line = [] + for event in line: + if event == 0: + semantic_line.append([-1] * 300) + else: + semantic_line.append(class_type_to_vec[str(event - 1)]) + input_data.append(semantic_line) + output_data.append(int(train_file["label"][i])) + i += random.randint(6, 8) + data_set = TensorDataset(torch.tensor(input_data, dtype=torch.float), torch.tensor(output_data)) + return data_set + + +def do_predict(input_size, hidden_size, num_layers, num_classes, sequence_length, model_path, test_file_path, batch_size, pattern_vec_json): + + sequential_model = load_sequential_model(input_size, hidden_size, num_layers, num_classes, model_path, batch_size, sequence_length) + + start_time = time.time() + TP = 0 + FP = 0 + TN = 0 + FN = 0 + + # create data set + sequence_data_set = 
generate_robust_seq_label(test_file_path, sequence_length, pattern_vec_json) + # create data_loader + data_loader = DataLoader(dataset=sequence_data_set, batch_size=batch_size, shuffle=True, pin_memory=False) + + print('predict start') + with torch.no_grad(): + count = 0 + for step, (seq, label) in enumerate(data_loader): + # first traverse [0, window_size) + seq = seq.view(-1, sequence_length, input_size).to(device) + #label = torch.tensor(label).view(-1).to(device) + output = sequential_model(seq)[:, 0].clone().detach().numpy() + predicted = (output > 0.2).astype(int) + label = np.array([y for y in label]) + TP += ((predicted == 1) * (label == 1)).sum() + FP += ((predicted == 1) * (label == 0)).sum() + FN += ((predicted == 0) * (label == 1)).sum() + TN += ((predicted == 0) * (label == 0)).sum() + count += 1 + if count > 100000: + break + ALL = TP + TN + FP + FN + # Compute precision, recall and F1-measure + if TP + FP == 0: + P = 0 + else: + P = 100 * TP / (TP + FP) + + if TP + FN == 0: + R = 0 + else: + R = 100 * TP / (TP + FN) + + if P + R == 0: + F1 = 0 + else: + F1 = 2 * P * R / (P + R) + + Acc = (TP + TN) * 100 / ALL + + print('FP: {}, FN: {}, TP: {}, TN: {}'.format(FP, FN, TP, TN)) + print('Acc: {:.3f}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(Acc, P, R, F1)) + print('Finished Predicting') + elapsed_time = time.time() - start_time + print('elapsed_time: {}'.format(elapsed_time)) + + #draw_evaluation("Evaluations", ['Acc', 'Precision', 'Recall', 'F1-measure'], [Acc, P, R, F1], 'evaluations', '%') \ No newline at end of file diff --git a/anomalydetection/self_att_lstm_supervised/self_att_lstm_supervised_train.py b/anomalydetection/self_att_lstm_supervised/self_att_lstm_supervised_train.py new file mode 100644 index 0000000..219e7a1 --- /dev/null +++ b/anomalydetection/self_att_lstm_supervised/self_att_lstm_supervised_train.py @@ -0,0 +1,154 @@ +# -*- coding: UTF-8 -*- +# -*- coding: UTF-8 -*- +import json +import torch +import pandas as pd +import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F +import os +from tensorboardX import SummaryWriter +from torch.utils.data import TensorDataset, DataLoader + +# use cuda if available otherwise use cpu +from torch.autograd import Variable + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +class Model(nn.Module): + def __init__(self, input_size, hidden_size, num_of_layers, out_size, if_bidirectional, batch_size, sequence_len): + super(Model, self).__init__() + self.hidden_size = hidden_size + self.num_of_layers = num_of_layers + self.lstm = nn.LSTM(input_size, hidden_size, num_of_layers, batch_first=True, bidirectional=if_bidirectional, dropout=0.5) + if if_bidirectional: + self.num_of_directions = 2 + else: + self.num_of_directions = 1 + self.fc = nn.Linear(hidden_size*self.num_of_directions, out_size) + self.batch_size = batch_size + + self.att_weight = nn.Parameter(torch.randn(1, 1, self.hidden_size*self.num_of_directions)) + self.att_bias = nn.Parameter(torch.randn(1, 1, sequence_len)) + # self.out = nn.Linear(in_features=in_features, out_features=out_features) + + # l1 regularization will add later + def attention_net(self, H): + # print(H.size()) = [batch, numdirec*hidden, seqlen] + a = F.softmax(torch.matmul(self.att_weight, H) + self.att_bias, 2) + a = torch.transpose(a, 1, 2) + return torch.bmm(H, a) + + + def init_hidden(self, size): + # size self.batch_size same + h0 = torch.zeros(self.num_of_layers*self.num_of_directions, size, 
self.hidden_size).to(device) + c0 = torch.zeros(self.num_of_layers*self.num_of_directions, size, self.hidden_size).to(device) + return (h0, c0) + + def forward(self, input): + # h_n: hidden state h of last time step + # c_n: hidden state c of last time step + out, _ = self.lstm(input, self.init_hidden(input.size(0))) + + # out = torch.transpose(out, 0, 1) + # out shape [batch, seqlen, numdirec*hidden] + out = torch.transpose(out, 1, 2) + # out shape [batch, numdirec*hidden, seqlen] + att_out = self.attention_net(out) + + out = self.fc(att_out[:, :, 0]) + # out shape[batch, num_of_class = 1] + # add sigmoid + return torch.sigmoid(out) + + +def generate_seq_label(file_path, window_length, pattern_vec_file): + vec_to_class_type = {} + with open(pattern_vec_file, 'r') as pattern_file: + i = 0 + for line in pattern_file.readlines(): + pattern, vec = line.split('[:]') + pattern_vector = tuple(map(float, vec.strip().split(' '))) + vec_to_class_type[pattern_vector] = i + i = i + 1 + num_of_sessions = 0 + input_data, output_data = [], [] + with open(file_path, 'r') as file: + for line in file.readlines(): + num_of_sessions += 1 + line = tuple(map(lambda n: tuple(map(float, n.strip().split())), [x for x in line.strip().split(',') if len(x) > 0])) + if len(line) < window_length + 1: + # print(line) + continue + for i in range(len(line) - window_length): + input_data.append(line[i:i + window_length]) + # line[i] is a list need to read file form a dic{vec:log_key} to get log key + output_data.append(vec_to_class_type[line[i + window_length]]) + data_set = TensorDataset(torch.tensor(input_data, dtype=torch.float), torch.tensor(output_data)) + return data_set + + +def generate_robust_seq_label(file_path, sequence_length, pattern_vec_file): + with open(pattern_vec_file, 'r') as pattern_file: + class_type_to_vec = json.load(pattern_file) + num_of_sessions = 0 + input_data, output_data = [], [] + train_file = pd.read_csv(file_path) + for i in range(len(train_file)): + num_of_sessions += 1 + line = [int(id) for id in train_file["Sequence"][i].split(' ')] + line = line[0:sequence_length] + if len(line) < sequence_length: + line.extend(list([0]) * (sequence_length - len(line))) + semantic_line = [] + for event in line: + if event == 0: + semantic_line.append([-1] * 300) + else: + semantic_line.append(class_type_to_vec[str(event - 1)]) + input_data.append(semantic_line) + output_data.append(int(train_file["label"][i])) + data_set = TensorDataset(torch.tensor(input_data, dtype=torch.float), torch.tensor(output_data)) + return data_set + + +def train_model(sequence_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, root_path, model_output_directory, data_file, pattern_vec_file): + # log setting + log_directory = root_path + 'log_out/' + log_template = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) + + print("Train num_classes: ", num_of_classes) + model = Model(input_size, hidden_size, num_of_layers, num_of_classes, False, batch_size, sequence_length).to(device) + # create data set + sequence_data_set = generate_robust_seq_label(data_file, sequence_length, pattern_vec_file) + # create data_loader + data_loader = DataLoader(dataset=sequence_data_set, batch_size=batch_size, shuffle=True, pin_memory=False) + writer = SummaryWriter(logdir=log_directory + log_template) + + # Loss and optimizer classify job + criterion = nn.BCELoss() + optimizer = optim.Adam(model.parameters(), weight_decay=0.001) + + # Training + for epoch in range(num_epochs): + train_loss = 0 
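Note: attention_net above scores each time step with a learned vector, softmaxes the scores over the sequence dimension, and pools the LSTM outputs with a batched matrix product. A standalone shape check of that computation with toy sizes (not the sizes configured in the drivers below):

import torch
import torch.nn.functional as F

batch, hidden, seq_len = 2, 4, 3
H = torch.randn(batch, hidden, seq_len)               # LSTM output after transpose(1, 2)
att_weight = torch.randn(1, 1, hidden)                # same shape as self.att_weight
att_bias = torch.randn(1, 1, seq_len)                 # same shape as self.att_bias

a = F.softmax(torch.matmul(att_weight, H) + att_bias, dim=2)   # [batch, 1, seq_len], sums to 1 over time
a = torch.transpose(a, 1, 2)                                   # [batch, seq_len, 1]
context = torch.bmm(H, a)                                      # [batch, hidden, 1], weighted sum over time steps

print(context.shape)                                  # torch.Size([2, 4, 1]); context[:, :, 0] is what feeds the fc layer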
+ for step, (seq, label) in enumerate(data_loader): + seq = seq.clone().detach().view(-1, sequence_length, input_size).to(device) + output = model(seq) + + loss = criterion(output.squeeze(-1), label.float().to(device)) + + # Backward and optimize + optimizer.zero_grad() + loss.backward() + train_loss += loss.item() + optimizer.step() + print('Epoch [{}/{}], training_loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss / len(data_loader.dataset))) + if (epoch + 1) % num_epochs == 0: + if not os.path.isdir(model_output_directory): + os.makedirs(model_output_directory) + e_log = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(epoch+1) + torch.save(model.state_dict(), model_output_directory + '/' + e_log + '.pt') + writer.close() + print('Training finished') \ No newline at end of file diff --git a/deeplog_detection.py b/deeplog_detection.py index 4341b7b..508cd1c 100644 --- a/deeplog_detection.py +++ b/deeplog_detection.py @@ -1,75 +1,62 @@ import os -from logparsing.fttree import fttree -from extractfeature import hdfs_fs_deeplog_preprocessor +import sys +sys.path.append('./') +from logparsing.drain.HDFS_drain import get_hdfs_drain_clusters +from extractfeature.hdfs_deeplog_preprocessor import hdfs_preprocessor from anomalydetection.deeplog.Model1 import log_key_LSTM_train from anomalydetection.deeplog.Model2 import variable_LSTM_train from anomalydetection.deeplog import log_predict -# 原始日志文件 -log_file_dir = './Data/log/hdfs/' -log_file_name = 'HDFS_split' -log_file_abnormal_label = 'HDFS_split_anomaly' -# FT-tree -log_result = './Data/FTTreeResult-HDFS/' -log_fttree_out_dir = log_result+'clusters/' + # log_train,log_test,logkey,logvalue -log_preprocessor_dir = log_result+'deeplog_files/' -# model -model_dir = log_result+'deeplog_model_train/' +log = './Data/log/hdfs/HDFS_40w' +drain_out = './Data/Drain_HDFS/clusters/' +bin_dir = './HDFS_drain3_state.bin' +log_preprocessor_dir = './Data/Drain_HDFS/log_preprocessor' +model_dir = './Data/Drain_HDFS/deeplog_model_train/' + # train parameters window_length = 4 input_size = 1 hidden_size = 20 num_of_layers = 3 -model1_num_epochs = 300 +model1_num_epochs = 100 model1_batch_size = 200 model2_num_epochs = 50 model2_batch_size = 20 learning_rate = 0.01 num_candidates = 3 mse_threshold = 0.1 +# 是否使用模型二 +use_model2 = False -if not os.path.exists(log_result): - os.makedirs(log_result) -if not os.path.exists(log_fttree_out_dir): - os.makedirs(log_fttree_out_dir) -if not os.path.exists(log_preprocessor_dir): - os.makedirs(log_preprocessor_dir) if not os.path.exists(model_dir): os.makedirs(model_dir) -# FT-tree -def pattern_extract(): - fttree.pattern_extract(log_file_dir, log_file_name, log_fttree_out_dir, 5, 4, 2) - -# 将原日志文件分成训练集和测试集两部分 -def log_split(): - hdfs_fs_deeplog_preprocessor.log_split(log_file_dir,log_file_name,log_file_abnormal_label,log_preprocessor_dir) +def drain(): + get_hdfs_drain_clusters(log,drain_out,bin_dir) -# 生成log_key -def generate_log_key(): - hdfs_fs_deeplog_preprocessor.generate_log_key(log_file_dir,log_file_abnormal_label,log_preprocessor_dir,log_fttree_out_dir) - -# 提取并处理log_value -def generate_log_value(): - hdfs_fs_deeplog_preprocessor.generate_log_value(log_file_dir,log_file_name,log_file_abnormal_label,log_preprocessor_dir,log_fttree_out_dir) +def generate_logkey_and_value(): + hdfs_preprocessor() # 训练 +def train_model(): + train_model1() + if use_model2: + train_model2() + def train_model1(): - 
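Note: the loss line above needs squeeze(-1) and the float() cast because nn.BCELoss expects predictions and targets of the same shape and dtype. A toy single training step on random data; the two-layer Sequential model here is a stand-in for illustration, not the attention model defined in this file:

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(8, 1), nn.Sigmoid())  # stand-in producing [batch, 1] probabilities
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), weight_decay=0.001)

seq = torch.randn(16, 8)                               # dummy batch
label = torch.randint(0, 2, (16,))                     # integer 0/1 labels, as loaded from the CSV

output = model(seq)                                    # shape [16, 1]
loss = criterion(output.squeeze(-1), label.float())    # both sides become shape [16], float

optimizer.zero_grad()
loss.backward()
optimizer.step()
print(float(loss))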
log_key_LSTM_train.train_model1(model_dir,log_preprocessor_dir,log_fttree_out_dir,model1_num_epochs,model1_batch_size,window_length,input_size,hidden_size,num_of_layers) + log_key_LSTM_train.train_model1(model_dir,log_preprocessor_dir,drain_out,model1_num_epochs,model1_batch_size,window_length,input_size,hidden_size,num_of_layers) def train_model2(): variable_LSTM_train.train_model2(model_dir,log_preprocessor_dir,model2_num_epochs,model2_batch_size,window_length,num_of_layers,learning_rate,hidden_size) # 测试 def test_model(): - log_predict.do_predict(log_preprocessor_dir,model_dir,window_length,input_size, hidden_size, num_of_layers,num_candidates,mse_threshold) - + model1_name = 'Adam_batch_size=' + str(model1_batch_size) + ';epoch=' + str(model1_num_epochs) + '.pt' + log_predict.do_predict(log_preprocessor_dir,drain_out,model_dir,model1_name,model2_num_epochs,window_length, input_size, hidden_size, num_of_layers, num_candidates, mse_threshold, use_model2) -# pattern_extract() -# log_split() -# generate_log_key() -# generate_log_value() -# train_model1() -# train_model2() -test_model() \ No newline at end of file +#drain() +generate_logkey_and_value() +# train_model() +#test_model() \ No newline at end of file diff --git a/ecoder_anomaly_detection.py b/ecoder_anomaly_detection.py new file mode 100644 index 0000000..78f9038 --- /dev/null +++ b/ecoder_anomaly_detection.py @@ -0,0 +1,71 @@ +# -*- coding: UTF-8 -*- +# -*- coding: UTF-8 -*- +# -*- coding: UTF-8 -*- + +import os +from logparsing.fttree import fttree +from extractfeature import hdfs_ft_preprocessor +from anomalydetection.loganomaly import log_anomaly_sequential_train +from anomalydetection.loganomaly import log_anomaly_sequential_predict +from anomalydetection.att_all_you_need import encoder_self_att_train +from anomalydetection.att_all_you_need import encoder_self_att_predict + +# parameters for early prepare +logparser_structed_file = './Data/logparser_result/Drain/HDFS.log_structured.csv' +logparser_event_file = './Data/logparser_result/Drain/HDFS.log_templates.csv' +anomaly_label_file = './Data/log/hdfs/anomaly_label.csv' +sequential_directory = './Data/DrainResult-HDFS/sequential_files/' +train_file_name = 'robust_train_file' +test_file_name = 'robust_test_file' +valid_file_name = 'robust_valid_file' +wordvec_file_path = './Data/pretrainedwordvec/crawl-300d-2M.vec(0.1M)' +pattern_vec_out_path = './Data/DrainResult-HDFS/pattern_vec' +variable_symbol = '<*> ' + +# my encoder +sequence_length = 50 +input_size = 300 +hidden_size = 256 +num_of_layers = 4 +# 1 using sigmoid, 2 using softmax +num_of_classes = 1 +num_epochs = 100 +batch_size = 1000 +# for robust attention bi +train_root_path = './Data/DrainResult-HDFS/att_all_you_need/' +model_out_path = train_root_path + 'model_out/' +train_file = sequential_directory + train_file_name +pattern_vec_json = pattern_vec_out_path +dropout = 0.5 +num_of_heads = 8 +pf_dim = 512 + + +# predict parameters +# log anomaly sequential model parameters + +if not os.path.exists(train_root_path): + os.makedirs(train_root_path) + + +def train_model(): + encoder_self_att_train.train_model(sequence_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, model_out_path, train_file, pattern_vec_json, dropout, num_of_heads, pf_dim) + + +def test_model(): + # do something + encoder_self_att_predict.do_predict(input_size, hidden_size, num_of_layers, num_of_classes, sequence_length, model_out_path + 'Adam_batch_size=' + str(batch_size) + ';epoch=' + 
str(num_epochs) + '.pt', sequential_directory + valid_file_name, batch_size, pattern_vec_json, dropout, num_of_heads, pf_dim) + +#pattern_extract() +#extract_feature() +#train_model() +#train_model() +test_model() + +# deep log +# log_preprocessor.execute_process() +# value_extract.get_value() +# value_extract.value_deal() +# value_extract.value_extract() +# train predict + diff --git a/extractfeature/hdfs_deeplog_preprocessor.py b/extractfeature/hdfs_deeplog_preprocessor.py new file mode 100644 index 0000000..ed89416 --- /dev/null +++ b/extractfeature/hdfs_deeplog_preprocessor.py @@ -0,0 +1,261 @@ +import csv +import os +import random + +class hdfs_deeplog_preprocessor: + # 日志变量设置 + LOG_LINE = 400000 + NUM_OF_LOGKEY = 31 + VECTOR_DIMENSION = 10 + NORMAL_STAGE_TO_STAGE_SIZE = [2000, 1000, 1000] + ABNORMAL_STAGE_TO_STAGE_SIZE = [800, 200, 200] + + # 读入数据部分 + ANOMALY_LABEL = './Data/log/hdfs/anomaly_label.csv' + LOG_FILE = './Data/log/hdfs/HDFS_40w' + MOFIFIED_LOG_FILE = './Data/log/hdfs/modified_HDFS_40w' + WORD_VECTOR_FILE = './Data/log/hdfs/word2vec_HDFS_40w' + LOGKEY_DIR = './Data/Drain_HDFS/clusters/' + is_block_normal = {} + block_to_lines = {} + line_to_logkey = [] + word_to_vector = {} + modified_logs = [] + + # 输出数据部分 + OUTPUT_DIR_PREFIX = './Data/Drain_HDFS/log_preprocessor/' + STAGE_TO_OUTPUT_DIR_INFIX = ['train/','validate/','test/'] + normal_blocks = [] + abnormal_blocks = [] + normal_block_index_to_stage = [] + abnormal_block_index_to_stage = [] + + + + ''' + ----------------------------------------------- + 以下是load_data部分 + ----------------------------------------------- + ''' + + def load_normal_info(self): + NORMAL_WORD = 'Normal' + FIRST_LINE_BLOCK_NAME = 'BlockId' + + with open(self.ANOMALY_LABEL,'r') as f: + lines = csv.reader(f) + for line in lines: + block = line[0] + normal_word = line[1] + if normal_word == NORMAL_WORD: + normal_info = True + else: + normal_info = False + if block != FIRST_LINE_BLOCK_NAME: + self.is_block_normal[block] = normal_info + + def load_line_info(self): + with open(self.LOG_FILE,'r') as f: + for line_index in range(self.LOG_LINE): + line = f.readline() + block = self.get_blockid(line) + if block not in self.block_to_lines.keys(): + self.block_to_lines[block] = [] + self.block_to_lines[block].append(line_index) + # print(self.block_to_lines['blk_-1608999687919862906']) + + def load_logkey_info(self): + self.line_to_logkey = [0 for i in range(self.LOG_LINE)] + for logkey in range(1,self.NUM_OF_LOGKEY+1): + with open(self.LOGKEY_DIR+str(logkey),'r') as f: + print(self.LOGKEY_DIR+str(logkey)) + lines = f.readline().strip().split(' ') + for line in lines: + line_index = int(line) + if line_index>=self.LOG_LINE: + print('cluster文件中某行的行数过大') + print(line) + exit(2) + self.line_to_logkey[line_index] = logkey + + def load_word_vector(self): + with open(self.WORD_VECTOR_FILE, 'r') as r: + for line in r.readlines(): + list_line = line.split(' ') + value = list(map(float, list_line[1:])) + key = list_line[0] + self.word_to_vector[key] = value + + def load_modified_log(self): + with open(self.MOFIFIED_LOG_FILE, 'r') as file: + content_list = file.readlines() + self.modified_logs = [x.strip() for x in content_list] + + def generate_block_list(self): + for block in self.block_to_lines.keys(): + if self.is_block_normal[block]: + self.normal_blocks.append(block) + else: + self.abnormal_blocks.append(block) + + ''' + ----------------------------------------------- + 以下是一些辅助函数 + ----------------------------------------------- + ''' + + def get_blockid(self, 
line): + words = line.strip().split(' ') + for word in words: + if len(word)>4 and word[:4] == 'blk_': + return word + print('无法找到block_id') + print(line) + exit(1) + + + def get_sentence_vector(self, sentence): + words = sentence.split(' ') + old_vector = [0.0 for i in range(self.VECTOR_DIMENSION)] + for word in words: + # print(word) + if word not in self.word_to_vector.keys(): + another_vector = [0.0 for i in range(self.VECTOR_DIMENSION)] + else: + another_vector = self.word_to_vector[word] + new_vector = [] + for i, j in zip(old_vector, another_vector): + new_vector.append(i + j) + old_vector = new_vector + + word_count = len(words) + for idx, value in enumerate(old_vector): + old_vector[idx] = value / word_count + vector_str = list(map(str, old_vector)) + sentence_vector = ','.join(vector_str) + return sentence_vector + + def get_logkey_and_logvalue_for_session(self, lines): + logkeys = [] + logkey_to_logvalues = [[] for i in range(self.NUM_OF_LOGKEY+1)] + for line in lines: + logkey = self.line_to_logkey[line] + logkeys.append(logkey) + log = self.modified_logs[line] + vector = self.get_sentence_vector(log) + logkey_to_logvalues[logkey].append(vector) + return logkeys,logkey_to_logvalues + ''' + ----------------------------------------------- + 以下是output_logkey_and_logvalue部分 + ----------------------------------------------- + ''' + + def get_block_stage_info(self,total_length,stage_to_length): + if sum(stage_to_length) > total_length: + print('要输出的条目太大,大于数据集中存在的条目。') + print(total_length) + print(stage_to_length) + exit(3) + block_index_list = [i for i in range(total_length)] + random.shuffle(block_index_list) + table = [-1 for i in range(total_length)] + + used_block_count = 0 + for stage in range(len(stage_to_length)): + block_index_start = used_block_count + block_index_end = used_block_count + stage_to_length[stage] + for block_index in block_index_list[block_index_start:block_index_end]: + table[block_index] = stage + used_block_count = block_index_end + return table + + def output(self,stage,output_normal): + if output_normal: + OUTPUT_DIR_SUFFIXES = ['logkey/','logvalue/normal/'] + LOGKEY_FILE = 'normal' + blocks = self.normal_blocks + block_index_to_stage = self.normal_block_index_to_stage + else: + OUTPUT_DIR_SUFFIXES = ['logkey/', 'logvalue/abnormal/'] + LOGKEY_FILE = 'abnormal' + blocks = self.abnormal_blocks + block_index_to_stage = self.abnormal_block_index_to_stage + + LOGKEY_OUTPUT_DIR = self.OUTPUT_DIR_PREFIX + \ + self.STAGE_TO_OUTPUT_DIR_INFIX[stage] + OUTPUT_DIR_SUFFIXES[0] + LOGVALUE_OUTPUT_DIR = self.OUTPUT_DIR_PREFIX + \ + self.STAGE_TO_OUTPUT_DIR_INFIX[stage] + OUTPUT_DIR_SUFFIXES[1] + if not os.path.exists(LOGKEY_OUTPUT_DIR): + os.makedirs(LOGKEY_OUTPUT_DIR) + if not os.path.exists(LOGVALUE_OUTPUT_DIR): + os.makedirs(LOGVALUE_OUTPUT_DIR) + logkey_writelist = [] + logkey_to_logvalue_writelist = [[] for i in range(self.NUM_OF_LOGKEY+1)] + + for block_index,block in enumerate(blocks): + if block_index_to_stage[block_index] == stage: + lines = self.block_to_lines[block] + logkeys, logkey_to_logvalues = \ + self.get_logkey_and_logvalue_for_session(lines) + logkey_line = ' '.join(str(logkey) for logkey in logkeys) + logkey_writelist.append(logkey_line+'\n') + for logkey in range(1,self.NUM_OF_LOGKEY+1): + if len(logkey_to_logvalues[logkey]) == 0: + logvalue_line = '-1' + else: + logvalue_line = ' '.join(logkey_to_logvalues[logkey]) + logkey_to_logvalue_writelist[logkey].append(logvalue_line+'\n') + + with open(LOGKEY_OUTPUT_DIR + LOGKEY_FILE,'w') as f: + 
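Note: get_sentence_vector above is a plain average of per-word vectors, with an all-zero vector substituted for out-of-vocabulary words. A compact equivalent using toy 3-dimensional vectors (the real table is VECTOR_DIMENSION = 10 and is read from the word2vec file):

word_to_vector = {                                     # hypothetical tiny word-vector table
    "Receiving": [0.1, 0.2, 0.3],
    "block": [0.4, 0.5, 0.6],
}
dim = 3

def sentence_vector(sentence):
    words = sentence.split(' ')
    total = [0.0] * dim
    for word in words:
        vec = word_to_vector.get(word, [0.0] * dim)    # OOV words contribute zeros but still count in the average
        total = [t + v for t, v in zip(total, vec)]
    return [t / len(words) for t in total]

print(sentence_vector("Receiving block blk_123"))      # ~[0.167, 0.233, 0.3]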
f.writelines(logkey_writelist) + for logkey in range(1,self.NUM_OF_LOGKEY+1): + LOGVALUE_FILE = str(logkey) + with open(LOGVALUE_OUTPUT_DIR + LOGVALUE_FILE,'w') as f: + f.writelines(logkey_to_logvalue_writelist[logkey]) + + + ''' + ----------------------------------------------- + 以下是main函数部分 + ----------------------------------------------- + ''' + + + def load_data(self): + self.load_normal_info() + print('正常/异常标签加载成功') + self.load_line_info() + print('数据集block信息加载成功') + self.load_logkey_info() + print('从clusters取出logkey信息成功') + self.load_word_vector() + print('读入word vector信息成功') + self.load_modified_log() + print('读入log信息成功') + self.generate_block_list() + print('将block划分为正常/异常成功') + + def output_logkey_and_logvalue(self): + self.abnormal_block_index_to_stage = self.get_block_stage_info \ + (len(self.abnormal_blocks),self.ABNORMAL_STAGE_TO_STAGE_SIZE) + print('给异常block选择train validate test数据成功') + self.normal_block_index_to_stage = self.get_block_stage_info \ + (len(self.normal_blocks), self.NORMAL_STAGE_TO_STAGE_SIZE) + print('给正常block选择train validate test数据成功') + for stage in range(len(self.STAGE_TO_OUTPUT_DIR_INFIX)): + self.output(stage, output_normal=True) + print('给阶段' + str(stage) + '输出正常logkey和logvalue成功') + self.output(stage, output_normal=False) + print('给阶段' + str(stage) + '输出异常logkey和logvalue成功') + + def __init__(self): + self.load_data() + print('数据加载成功') + print('正常的session数:' + str(len(self.normal_blocks))) + print('异常的session数:' + str(len(self.abnormal_blocks))) + self.output_logkey_and_logvalue() + print('数据生成成功') + +def hdfs_preprocessor(): + hdfs_deeplog_preprocessor() diff --git a/extractfeature/hdfs_fs_deeplog_preprocessor.py b/extractfeature/hdfs_fs_deeplog_preprocessor.py index 0d94ff8..0044e0e 100644 --- a/extractfeature/hdfs_fs_deeplog_preprocessor.py +++ b/extractfeature/hdfs_fs_deeplog_preprocessor.py @@ -71,8 +71,18 @@ def generate_log_key(log_file_dir,log_file_abnormal_label,log_preprocessor_dir,l # 提取并处理log_value def generate_log_value(log_file_dir,log_file_name,log_file_abnormal_label,log_preprocessor_dir,log_fttree_out_dir): - log = log_file_dir+log_file_name + N_CLUSTER = 21 + WORD2VEC_FILE = 'word2vec' + STRING_VECTOR_FILE = 'string_vector' + + log_list = [] + word_vector = {} + + # log = log_file_dir+log_file_name + word2vec = log_file_dir+WORD2VEC_FILE + string_vector = log_file_dir+STRING_VECTOR_FILE in_abnormal = log_file_dir+log_file_abnormal_label + log_value_dir = ['logvalue_train/', 'logvalue_test/'] log_value_train_directory = log_preprocessor_dir+log_value_dir[0] log_value_test_directory = log_preprocessor_dir +log_value_dir[1] @@ -83,53 +93,58 @@ def generate_log_value(log_file_dir,log_file_name,log_file_abnormal_label,log_pr if not os.path.exists(log_value_test_directory): os.makedirs(log_value_test_directory) - log_list = [] - with open(log, 'r') as file: + with open(string_vector, 'r') as file: content_list = file.readlines() log_list = [x.strip() for x in content_list] + with open(word2vec, 'r') as r: + for line in r.readlines(): + list_line = line.split(' ') + value = list(map(float, list_line[1:])) + key = list_line[0] + word_vector[key] = value + abnormal = get_abnormal(in_abnormal) clusters = get_logkey(log_fttree_out_dir)[0] num = [0, 170000, 199999] - for i in range(0, 2): - for j in range(1, 62): + for i in range(len(log_value_dir)): + for j in range(N_CLUSTER): print("process:", i, j) - para1 = [] - para2 = [] - para3 = [] - out_path = log_preprocessor_dir + log_value_dir[i] + str(j) + ".txt" - for t in clusters[j - 1]: + 
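Note: get_block_stage_info, called twice in output_logkey_and_logvalue above, shuffles the block indices once and hands out contiguous slices of the shuffled order to the train/validate/test stages. A minimal sketch of that assignment with illustrative sizes (not the 2000/1000/1000 and 800/200/200 splits configured above):

import random

def assign_stages(total, stage_sizes):
    order = list(range(total))
    random.shuffle(order)                              # one global shuffle of the block indices
    table = [-1] * total                               # -1 means "not used in any stage"
    start = 0
    for stage, size in enumerate(stage_sizes):
        for idx in order[start:start + size]:
            table[idx] = stage
        start += size
    return table

print(assign_stages(10, [4, 3, 2]))                    # 4 zeros, 3 ones, 2 twos and one -1, in shuffled positions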
out_path = log_preprocessor_dir + log_value_dir[i] + str(j+1) + ".txt" + write_list = [] + for t in clusters[j]: s = int(t) - if (i != 1 and s not in abnormal and s >= num[i] and s < num[i + 1]) or ( - i == 1 and s >= num[i] and s < num[i + 1]): - templog = [] - for word in log_list[s].split(' '): - templog.append(word) - para1.append(int(templog[0])) - para2.append(int(templog[1])) - para3.append(int(templog[2])) + if (i != 1 and s not in abnormal and num[i] <= s < num[i + 1]) or ( + i == 1 and num[i] <= s < num[i + 1]): + output = calc_sentence_vector(log_list[s],word_vector) + write_list.append(output) elif s >= num[i + 1]: - break; - if len(para1) > 0: - para1 = preprocessing.scale(para1) - if len(para2) > 0: - para2 = preprocessing.scale(para2) - if len(para3) > 0: - para3 = preprocessing.scale(para3) + break with open(out_path, mode='w', encoding='utf-8') as f: - for w in range(0, len(para1)): - print(para1[w], file=f, end='') - print(' ', file=f, end='') - print(para2[w], file=f, end='') - print(' ', file=f, end='') - print(para3[w], file=f, end='') - print(' ', file=f, end='') - print(' ', file=f) - - - - - + f.write('\n'.join(write_list)) + +def calc_sentence_vector(sentence,word_vector): + VECTOR_DIMENSION = 10 + + words = sentence.split(' ') + old_vector = [0.0 for i in range(VECTOR_DIMENSION)] + for word in words: + # print(word) + if word not in word_vector.keys(): + another_vector = [0.0 for i in range(VECTOR_DIMENSION)] + else: + another_vector = word_vector[word] + new_vector = [] + for i,j in zip(old_vector,another_vector): + new_vector.append(i+j) + old_vector = new_vector + + word_count = len(words) + for idx,value in enumerate(old_vector): + old_vector[idx] = value/word_count + vector_str = list(map(str, old_vector)) + output = ','.join(vector_str) + return output diff --git a/extractfeature/hdfs_ft_preprocessor.py b/extractfeature/hdfs_ft_preprocessor.py index 52df723..981614f 100644 --- a/extractfeature/hdfs_ft_preprocessor.py +++ b/extractfeature/hdfs_ft_preprocessor.py @@ -124,7 +124,7 @@ def preprocessor_hdfs_ft(cluster_directory, anomaly_file_path, wordvec_path, out for f in log_cluster[i]: train_file_obj.write(str(f)) train_file_obj.write(' ') - if count % 10 == 0: + if count % 200 == 0: train_file_obj.write('\n') else: train_file_obj.write(', ') @@ -138,8 +138,60 @@ def preprocessor_hdfs_ft(cluster_directory, anomaly_file_path, wordvec_path, out for f in log_cluster[i]: test_file_obj.write(str(f)) test_file_obj.write(' ') - if count % 10 == 0: + if count % 200 == 0: test_file_obj.write('\n') else: test_file_obj.write(', ') - count = count + 1 \ No newline at end of file + count = count + 1 + + +def preprocessor_hdfs_ft_split_abnormal(cluster_directory, anomaly_file_path, wordvec_path, out_dic, train_out_file_name, + test_out_file_name, label_out_file_name, pattern_vec_out_path, degree, num_of_lines): + anomaly_log_lines = set() + with open(anomaly_file_path, 'r') as anomaly_file: + line = anomaly_file.readline() + lines_str = line.split(' ') + anomaly_log_lines.update([int(x) for x in lines_str if len(x) > 0]) + + pattern_vec = pattern_to_vec(cluster_directory, wordvec_path, pattern_vec_out_path) + + log_cluster = {} + file_names = os.listdir(cluster_directory) + for file_name in file_names: + with open(cluster_directory + file_name, 'r') as cluster: + lines = cluster.readlines() + line_numbers = [int(x) for x in lines[1].split(' ') if len(x) > 0] + for number in line_numbers: + if not (number in anomaly_log_lines and number < int(degree * num_of_lines)): + 
log_cluster[number] = pattern_vec[lines[0].strip()] + + with open(out_dic + train_out_file_name, 'w+') as train_file_obj, open(out_dic + test_out_file_name, + 'w+') as test_file_obj, open( + out_dic + label_out_file_name, 'w+') as label_file_obj: + count = 1 + last_i = 0 + for i in sorted(log_cluster): + if i < int(degree * num_of_lines): + if i - last_i > 1: + train_file_obj.write('\n') + else: + train_file_obj.write(', ') + for f in log_cluster[i]: + train_file_obj.write(str(f)) + train_file_obj.write(' ') + count = count + 1 + else: + if i == int(degree * num_of_lines): + count = 1 + if i in anomaly_log_lines: + label_file_obj.write(str(count)) + label_file_obj.write(' ') + for f in log_cluster[i]: + test_file_obj.write(str(f)) + test_file_obj.write(' ') + if count % 200 == 0: + test_file_obj.write('\n') + else: + test_file_obj.write(', ') + count = count + 1 + last_i = i \ No newline at end of file diff --git a/extractfeature/hdfs_robust_preprocessor.py b/extractfeature/hdfs_robust_preprocessor.py new file mode 100644 index 0000000..e6760a5 --- /dev/null +++ b/extractfeature/hdfs_robust_preprocessor.py @@ -0,0 +1,166 @@ +# -*- coding: UTF-8 -*- +import os +import io +import re +import random +import math +import json +import pandas as pd +import numpy as np +block_id_regex = r'blk_(|-)[0-9]+' +special_patterns = {'dfs.FSNamesystem:': ['dfs', 'FS', 'Name', 'system'], 'dfs.FSDataset:': ['dfs', 'FS', 'dataset']} + + +def get_anomaly_block_id_set(anomaly_label_file): + datafile = open(anomaly_label_file, 'r', encoding='UTF-8') + data = pd.read_csv(datafile) + + data = data[data['Label'].isin(['Anomaly'])] + # 16838 anomaly block right with the log anomaly paper + anomaly_block_set = set(data['BlockId']) + return anomaly_block_set + + +def get_log_template_dic(logparser_event_file): + dic = {} + datafile = open(logparser_event_file, 'r', encoding='UTF-8') + data = pd.read_csv(datafile) + for _, row in data.iterrows(): + dic[row['EventId']] = row['numberID'] + return dic + + +# log parser_file should be structed.csv +def generate_train_and_test_file(logparser_structed_file, logparser_event_file, anomaly_label_file, out_dic, train_out_file_name, validation_out_file_name, test_out_file_name, wordvec_path, pattern_vec_out_path, variable_symbol): + anomaly_block_set = get_anomaly_block_id_set(anomaly_label_file) + log_template_dic = get_log_template_dic(logparser_event_file) + session_dic = {} + logparser_result = pd.read_csv(logparser_structed_file, header=0) + normal_block_ids = set() + abnormal_block_ids = set() + for _, row in logparser_result.iterrows(): + key = row['EventTemplate'] + content = row['Content'] + block_id = re.search(block_id_regex, content).group() + session_dic.setdefault(block_id, []).append(log_template_dic[row['EventId']]) + if block_id in anomaly_block_set: + abnormal_block_ids.add(block_id) + else: + normal_block_ids.add(block_id) + abnormal_block_ids = list(abnormal_block_ids) + normal_block_ids = list(normal_block_ids) + random.shuffle(abnormal_block_ids) + random.shuffle(normal_block_ids) + with open(out_dic + train_out_file_name, 'w+') as train_file_obj, open(out_dic + test_out_file_name, + 'w+') as test_file_obj, open( + out_dic + validation_out_file_name, 'w+') as validation_file_obj: + train_file_obj.write('Sequence,label\n') + test_file_obj.write('Sequence,label\n') + validation_file_obj.write('Sequence,label\n') + for i in range(len(normal_block_ids)): + if i < 6000: + train_file_obj.write(' '.join([str(num_id) for num_id in session_dic[normal_block_ids[i]]])) 
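Note: the session building above hinges on the blk_ regular expression and dict.setdefault. A tiny runnable illustration on two fabricated log contents; the template numberIDs are invented stand-ins for the Drain parser output:

import re

block_id_regex = r'blk_(|-)[0-9]+'
rows = [
    ("Receiving block blk_-160899 src: /10.0.0.1", 5),            # (Content, numberID of its template)
    ("PacketResponder 1 for block blk_-160899 terminating", 9),
]

session_dic = {}
for content, template_id in rows:
    block_id = re.search(block_id_regex, content).group()          # first blk_ token in the line
    session_dic.setdefault(block_id, []).append(template_id)

print(session_dic)                                                  # {'blk_-160899': [5, 9]}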
+ train_file_obj.write(', 0\n') + elif i < 6000 + 50000: + validation_file_obj.write(' '.join([str(num_id) for num_id in session_dic[normal_block_ids[i]]])) + validation_file_obj.write(', 0\n') + else: + test_file_obj.write(' '.join([str(num_id) for num_id in session_dic[normal_block_ids[i]]])) + test_file_obj.write(', 0\n') + + for i in range(len(abnormal_block_ids)): + if i < 6000: + train_file_obj.write(' '.join([str(num_id) for num_id in session_dic[abnormal_block_ids[i]]])) + train_file_obj.write(', 1\n') + elif i < 6000 + 1000: + validation_file_obj.write(' '.join([str(num_id) for num_id in session_dic[abnormal_block_ids[i]]])) + validation_file_obj.write(', 1\n') + else: + test_file_obj.write(' '.join([str(num_id) for num_id in session_dic[abnormal_block_ids[i]]])) + test_file_obj.write(', 1\n') + + pattern_to_vec(logparser_event_file, wordvec_path, pattern_vec_out_path, variable_symbol) + + +def load_vectors(fname): + fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') + data = {} + for line in fin: + tokens = line.rstrip().split(' ') + data[tokens[0]] = list(map(float, tokens[1:])) + return data + + +def get_lower_case_name(text): + word_list = [] + if text in special_patterns: + return + for index, char in enumerate(text): + if not char.isupper(): + break + else: + if index == len(text) - 1: + return [text] + lst = [] + for index, char in enumerate(text): + if char.isupper() and index != 0: + word_list.append("".join(lst)) + lst = [] + lst.append(char) + word_list.append("".join(lst)) + return word_list + + +def preprocess_pattern(log_pattern): + special_list = [] + if log_pattern.split(' ')[0] in special_patterns.keys(): + special_list = special_patterns[log_pattern.split(' ')[0]] + log_pattern = log_pattern[len(log_pattern.split(' ')[0]):] + pattern = r'\*|,|\.|/|;|\'|`|\[|\]|<|>|\?|:|"|\{|\}|\~|!|@|#|\$|%|\^|&|\(|\)|-|=|\_|\+|,|。|、|;|‘|’|【|】|·|!| |…|(|)' + result_list = [x for x in re.split(pattern, log_pattern) if len(x) > 0] + final_list = list(map(get_lower_case_name, result_list)) + final_list.append(special_list) + return [x for x in re.split(pattern, final_list.__str__()) if len(x) > 0] + + +def pattern_to_vec(logparser_event_file, wordvec_path, pattern_vec_out_path, variable_symbol): + data = load_vectors(wordvec_path) + pattern_to_words = {} + pattern_to_vectors = {} + datafile = open(logparser_event_file, 'r', encoding='UTF-8') + df = pd.read_csv(datafile) + pattern_num = len(df) + for _, row in df.iterrows(): + wd_list = preprocess_pattern(row['EventTemplate'].replace(variable_symbol, '').strip()) + pattern_to_words[row['EventTemplate'].replace(variable_symbol, '').strip()] = wd_list + print(pattern_to_words) + IDF = {} + for key in pattern_to_words.keys(): + wd_list = pattern_to_words[key] + pattern_vector = np.array([0.0 for _ in range(300)]) + word_used = 0 + for word in wd_list: + if not word in data.keys(): + print('out of 0.1m words', ' ', word) + else: + word_used = word_used + 1 + weight = wd_list.count(word)/1.0/len(pattern_to_words[key]) + if word in IDF.keys(): + pattern_vector = pattern_vector + weight * IDF[word] * np.array(data[word]) + else: + pattern_occur_num = 0 + for k in pattern_to_words.keys(): + if word in pattern_to_words[k]: + pattern_occur_num = pattern_occur_num + 1 + IDF[word] = math.log10(pattern_num/1.0/pattern_occur_num) + #print('tf', weight, 'idf', IDF[word], word) + #print(data[word]) + pattern_vector = pattern_vector + weight * IDF[word] * np.array(data[word]) + pattern_to_vectors[key] = pattern_vector / 
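Note: pattern_to_vec above builds each template vector as a TF-IDF-weighted combination of pretrained word vectors, divided by the number of in-vocabulary word occurrences. The essential arithmetic on a two-template toy vocabulary, using 2-dimensional vectors instead of 300-dimensional ones (all names and values invented for illustration):

import math
import numpy as np

word_vecs = {"receiving": np.array([1.0, 0.0]),
             "block": np.array([0.0, 1.0]),
             "terminating": np.array([1.0, 1.0])}
templates = {"T1": ["receiving", "block"],
             "T2": ["block", "terminating"]}

def template_vector(words, all_templates):
    vec = np.zeros(2)
    used = 0
    for w in words:
        if w not in word_vecs:
            continue
        used += 1
        tf = words.count(w) / len(words)                            # term frequency inside this template
        df = sum(1 for ws in all_templates.values() if w in ws)     # number of templates containing the word
        idf = math.log10(len(all_templates) / df)
        vec += tf * idf * word_vecs[w]
    return vec / used

print(template_vector(templates["T1"], templates))                  # "block" occurs in both templates, so its idf is 0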
word_used + numberid2vec = {} + for _, row in df.iterrows(): + numberid2vec[row['numberID']] = pattern_to_vectors[row['EventTemplate'].replace(variable_symbol, '').strip()].tolist() + json_str = json.dumps(numberid2vec) + with open(pattern_vec_out_path, 'w+') as file_obj: + file_obj.write(json_str) + return pattern_to_vectors \ No newline at end of file diff --git a/extractfeature/k8s/__pycache__/__init__.cpython-37.pyc b/extractfeature/k8s/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a8ef954f9635b7ac4e25d5ef679663622ae9860 GIT binary patch literal 173 zcmZ?b<>g`kf+OD?;z0Cc5CH>>K!yVl7qb9~6oz01O-8?!3`HPe1o11(*(%1jGA%PF zwHQd}q$U>SW#*;F+n{4hlT literal 0 HcmV?d00001 diff --git a/extractfeature/k8s/__pycache__/log_preprocessor.cpython-36.pyc b/extractfeature/k8s/__pycache__/log_preprocessor.cpython-36.pyc index c4f5232844f5ef1bc900b85d7093bf2bd56a4515..f9df3d5de99bc249ad3a972af9ece2d8a8b53f5c 100644 GIT binary patch delta 16 XcmeCO>#}1v=H=zmUo&kZJF`3hCJh8S delta 16 XcmeCO>#}1v=H=x|Ug5lvomn0LCBg(3 diff --git a/extractfeature/k8s/__pycache__/log_preprocessor.cpython-37.pyc b/extractfeature/k8s/__pycache__/log_preprocessor.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9a0756f9daa54394623ff33c0a34ac5f8e832261 GIT binary patch literal 7916 zcmcIpU1%IxcCK4h-Cb4v-)c*;#`bD#%eHJ;lD#vtlfjvvwLLo_Ud3yB*G{cH?RMXi z+*1EYw@Tx7Qw?F|yo7O9fn*8$5Lyt1$*?S88A2ei%)=7G639N}ArxT=B!p22OUOeQ z{E~d!iJ<;65a$HW}JtZ&7NqP8*a9@y9)g$t&@&$PW zwWIQ=JcjmT@+J8dIStNN-^k1)1TYXu4wmu>Yn%JJ!ea}3!PWI#hBg_XgTFx@XmQo%#n#n&ATv(dDfEH>j&Ni zcj3CWEjB*5tzm@)nV8VL)AF30dum|sv-14D8rHyMy;lZT96|lDVZ>5t&z8 zn@zlprhIvqL~~c5=pLD0xKa(O3)k8!mm8aY;Hev)zu5@Ruf6IobPvrhc%491Yr(Qt z4K|gxz-)W7Rd2WE8|{^D`8!@MXe)mqs5d=XUtXRMIzhKQzu;Y=7=9rV^KPkvUHqo^*slx+t8VdN#|3w6)D0ULOxaDnS8Z&1{z8qu zclbwN_$D5|jKb4gNWQB};TlqhZW%ZqGjgugTi()U^MR)n_1`s6EX^h&t5RuIn_i_7 zIh9JYEjJrPXDXGA&1z%NlDYoYty@=bELE=k#?sX&v-It6e*Nl=n_T-dP315X7th|k z{My}bK3=XjJRk3dS5>WgYvpe9@gp4F!}j`Jd`Y!cZ9Mkt{#_ixTSNN!)4z+;r>>QC z<*loB&GY@XnqPmcQdsE}9^XV^2~#j%&LAM40HCGO?PFNtt>oc7%T>NP?HzfexuN+-mK$s_jryt6`#N zO1r0RoC&PH9qK#Kw$Kcm(72?9hIA&ho)ND}JvT7RD(sj{%gj@qX698hWUG2fnUnb^ zT9~|o^?J$9I8pasC4m&M-ojjwKED`=&Yp~@JO)2=E;Z_YP-{2WUw@M( zqnVRh_r!Wt`QDyd%wKA>Yt@GTI*4Ka*<56{*S%JBlqIAZK$lyUs+0<1vNtp`d@qP> z5Nziw%M`@@oO?45|M#<_rmXIT6Mb>LkQhGQ$ z4+GX4^&m2^=B(vf_2tTPUHL(js<$cuBsoga&S?4JoRu~2v72dB{h%^fB{FGOQJTtF zD}8kshj8GaqWpmqIY+0ph(d$(XG9XEDRSbNC^2OUT{yzRe^Mm%66hSBBY4z2#*8$D z3?n_PDgOc@(E890$RX*B==>ru0yBgpvQPM=rw55Z54BKdk4oC~r_dn*={I_bAPKEX zc78uJNTwmR--FbefwgL{I`FEiVG43?N)y^(Zd~J*G`IY9nCfSCw0>45u(Cy(2Mw@T z11x9&C}@BkCi}V2W)1k`+ggzC7s4cKS|TWh0-7dxZb5SoSJ|&68%N+RNH46VSS5r~ zMSL@hy&4Dgf0A%DfsxRbNt)^l?oLgi;oEqI(mfO3B?-sCoPVKwmm4pb(V(CQE{G%FF1s#__>e%2D9Ze=DG#L4xtdQu=hY5(t$r7m)+JoKc)R`>-Rq1ze7N#W$aOxso#ZK$}GU8P;Z7AnWNECpBTYk zuIJ^|Ld&!#uR3ylh5$Ur^6zjM>@cFZ3zaU z_vst>4kmipjZa7Geuz~eBXSboq^l42&m#2$vl!}6hO=OX44p-6@_6ho>6Q+R(A7T- zN6_7Ge2N`qwgk!)_LbN+)X8`rX~sPI?$Mmw8uO(_EwKb)?X3EgR(q9h-R>8l9~ru3 zlIdP4{sw7=B^tvgQi^ApfVrjhv0p>|-WTlmz!!$2al`dDK84(qCcuD`Caj3w81(1x zm*~$F^ryi3Qw+zR*P76V34dYfnV5<3ia5*Ge1L7bCL!|O|HfZ8$54L&xi2n6qT1Ex zrX%6J?WP8xht2*j!4B6X5cEu(eBZqr4f|)I=S7-u*0E!v3+YQ&ItpnFI*WDQ2RB8{a${oRus#5qo z56vSM_7aI@*CwyRj=^QLLOq#>{dgTWUE5R@>VocWo`Flg0KytdrLpX4q+1R;J7$u5SLs6x9T|~2ZVDH z^`t22)8I~m8VBw%;Zml!pM!bytpPknMgJ8(Qh-Xi+YD8fy|ALkYaeG6E(EEECW z$mn!BBcMQnGV($npc2Bm!)ufS0H`?VzNx{^ivX}HFkt1x_wV%d&h?P2-8%>i%+T0E zV6tWcAc;OeQ~*C95wejE2G*92RzN1!#u2}2ZOl<`@?!`H=JvxJtYph1>yM70!3hz> 
z?Dskhfl^D|>{5H0-28OCwZ0kb68_oc5bhGFuD<9*;?Zn6N~|j&c!m{GhAx<)0U;8; z?h#Ok(Z@-Q1>}ZsStJ)%5FxAU#N*Dy;_>Plc!)+J_YDAP79sUp;Ke9n6iITF5T&Vo zARK>%PCh{`O{Z&$VJ~5YX_3Q4WrVtqwi;Mbe-Trwy*6Aw&TS;3 zS|>3ZX>V#x0P|#mv`i0mo?3Wi0pDw_h%i&udf84Ru*i>MulIXa=T2yVx(*a*^R9DY zjQ^Au_qn22ibm3X2aYLp}nwW$yAsPtxJ@YdxiE!Hlg!`=MQ^G zp7tD>0^WKM+E~%;<98Rh6SfZr_OmpoSeaQ`Wx z{sRisRuQNLMVWi5;|`OkmqLViGXIot2-U}i^13~g z;Qs!g)XEa{R7Xlfmt$;AVFUVyLg3;3)-+5)kIPG+!v=AtB}OA^0h*#>vpP2&Ydhfz z(r-s{*CW0p8vIb=kCtO?~}8_O~~N!x(C9n z0&J|HAO8O1jRApC>B@VYs3b6?ZliH_JdO-Us6@RDZX6?WG)R^^wlR!9IF=-JS9ee# zr{M#^AlsoU*z0CBs6CAI*1DU3<*7c761;iWYF-8&=3>-Zrn7VOMG6*M7ZY9p0SmG=kjINgr3RWpR zb%?{Q1v3Zak<&8+qnW`hQ+Nn<5UmzYkFM(S}+5ButQ*fx3O%vw~~@qt>%K7lT|(v-vvSbHY@Rjyckv)IulzeL#I2 zIz9Wkw(&7kpNuemp@9Q*WSV{(2ppN&t3mD23ZszXH-+6<&h;M$mE&Ch@lc2Jr5gPP z@EWK*^Ir>*b>f<-eu#HrO~V^u|HcpZyA1|A#W>&NCIMfNf$NUN?}hkPLBhY@VkPO8 z4zvJNcZ<*SAZp$5=ULSUt+r}H1Pi#@*n3l62|kinA z5pprJ`6xF?qf~tO>zcPlf#vM>W)OdFWHE{Fi&8}4tD+1M!*3aOKpHcNd?pr9q%@R|;!Gt2 zwMeYRih4lV@?OWQZ3Z3^_nftV5mN4x{Ucam=}F*eO6|n|q+^uuq5P%A+Gi4yry3w0 zC57lI2-T(a@iRpe*Hx)>_4$SW=bPMjiK>akzx)z)HtP7@X$hB(kHs%47y0&F(}S^6 z9AG9u;l~RP&2H*p)qmKiKZv75vJI~D2oX~I(am}n*=Jhq1Qjz>w5j-QDqJe=QL#bA z5h^ZHaT mse_threshold: + FP += 1 + model2_success = True + break + if (model2_success): + break + if(FP==1): + print("predict result: abnormal") + else: + print("predict result: normal") + + + + + +generate_log_key_and_value() +log_predict(use_model2) + + + + + + + + diff --git a/java/detect_log/clusters/1 b/java/detect_log/clusters/1 new file mode 100644 index 0000000..f647c99 --- /dev/null +++ b/java/detect_log/clusters/1 @@ -0,0 +1 @@ +0 2 3 \ No newline at end of file diff --git a/java/detect_log/clusters/2 b/java/detect_log/clusters/2 new file mode 100644 index 0000000..7b57bd1 --- /dev/null +++ b/java/detect_log/clusters/2 @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/java/detect_log/clusters/3 b/java/detect_log/clusters/3 new file mode 100644 index 0000000..b4fe77f --- /dev/null +++ b/java/detect_log/clusters/3 @@ -0,0 +1 @@ +4 5 8 \ No newline at end of file diff --git a/java/detect_log/clusters/4 b/java/detect_log/clusters/4 new file mode 100644 index 0000000..cea0e89 --- /dev/null +++ b/java/detect_log/clusters/4 @@ -0,0 +1 @@ +6 7 9 \ No newline at end of file diff --git a/java/detect_log/detect.log b/java/detect_log/detect.log new file mode 100644 index 0000000..2cefae8 --- /dev/null +++ b/java/detect_log/detect.log @@ -0,0 +1,10 @@ +081109 203518 143 INFO dfs.DataNode$DataXceiver: Receiving block blk_-1608999687919862906 src: /10.250.19.102:54106 dest: /10.250.19.102:50010 +081109 203518 35 INFO dfs.FSNamesystem: BLOCK* NameSystem.allocateBlock: /mnt/hadoop/mapred/system/job_200811092030_0001/job.jar. 
blk_-1608999687919862906 +081109 203519 143 INFO dfs.DataNode$DataXceiver: Receiving block blk_-1608999687919862906 src: /10.250.10.6:40524 dest: /10.250.10.6:50010 +081109 203519 145 INFO dfs.DataNode$DataXceiver: Receiving block blk_-1608999687919862906 src: /10.250.14.224:42420 dest: /10.250.14.224:50010 +081109 203519 145 INFO dfs.DataNode$PacketResponder: PacketResponder 1 for block blk_-1608999687919862906 terminating +081109 203519 145 INFO dfs.DataNode$PacketResponder: PacketResponder 2 for block blk_-1608999687919862906 terminating +081109 203519 145 INFO dfs.DataNode$PacketResponder: Received block blk_-1608999687919862906 of size 91178 from /10.250.10.6 +081109 203519 145 INFO dfs.DataNode$PacketResponder: Received block blk_-1608999687919862906 of size 91178 from /10.250.19.102 +081109 203519 147 INFO dfs.DataNode$PacketResponder: PacketResponder 0 for block blk_-1608999687919862906 terminating +081109 203519 147 INFO dfs.DataNode$PacketResponder: Received block blk_-1608999687919862906 of size 91178 from /10.250.14.224 \ No newline at end of file diff --git a/java/detect_log/logkey.txt b/java/detect_log/logkey.txt new file mode 100644 index 0000000..4b52ee1 --- /dev/null +++ b/java/detect_log/logkey.txt @@ -0,0 +1 @@ +1 2 1 1 3 3 4 4 3 4 diff --git a/java/detect_log/logvalue/1 b/java/detect_log/logvalue/1 new file mode 100644 index 0000000..f60f1ab --- /dev/null +++ b/java/detect_log/logvalue/1 @@ -0,0 +1 @@ +-0.27948891666666664,0.39378741666666667,0.4394363333333333,0.4158543333333334,0.9347174166666669,-0.08590000000000002,-0.5342015,-0.36786,-1.1335827499999998,-0.6025183333333334 -0.34474441666666666,0.46685858333333335,0.5180023333333333,0.5142760000000001,1.0520995,-0.20836200000000002,-0.4979293333333333,-0.19980916666666668,-1.1568605833333332,-0.6416392500000001 -0.5807055833333334,0.34877700000000006,0.5318064166666666,0.6473328333333334,1.0815334166666668,-0.37340983333333333,-0.2918375,-0.3758109166666667,-1.3126602499999998,-0.6551390833333334 diff --git a/java/detect_log/logvalue/10 b/java/detect_log/logvalue/10 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/10 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/11 b/java/detect_log/logvalue/11 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/11 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/12 b/java/detect_log/logvalue/12 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/12 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/13 b/java/detect_log/logvalue/13 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/13 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/14 b/java/detect_log/logvalue/14 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/14 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/15 b/java/detect_log/logvalue/15 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/15 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/16 b/java/detect_log/logvalue/16 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/16 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/17 b/java/detect_log/logvalue/17 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/17 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/18 b/java/detect_log/logvalue/18 new file mode 100644 index 
0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/18 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/19 b/java/detect_log/logvalue/19 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/19 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/2 b/java/detect_log/logvalue/2 new file mode 100644 index 0000000..760d7e6 --- /dev/null +++ b/java/detect_log/logvalue/2 @@ -0,0 +1 @@ +0.7032797777777778,-0.2436938888888889,-0.16089766666666666,-0.024760222222222225,0.4287812222222222,0.505934,0.17868633333333334,0.4231786666666667,0.08776533333333335,-0.18805511111111112 diff --git a/java/detect_log/logvalue/20 b/java/detect_log/logvalue/20 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/20 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/21 b/java/detect_log/logvalue/21 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/21 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/22 b/java/detect_log/logvalue/22 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/22 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/23 b/java/detect_log/logvalue/23 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/23 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/24 b/java/detect_log/logvalue/24 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/24 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/25 b/java/detect_log/logvalue/25 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/25 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/26 b/java/detect_log/logvalue/26 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/26 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/27 b/java/detect_log/logvalue/27 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/27 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/28 b/java/detect_log/logvalue/28 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/28 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/29 b/java/detect_log/logvalue/29 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/29 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/3 b/java/detect_log/logvalue/3 new file mode 100644 index 0000000..79646df --- /dev/null +++ b/java/detect_log/logvalue/3 @@ -0,0 +1 @@ +0.0664019999999999,0.18280827272727274,0.28105936363636363,0.2908666363636364,1.4609825454545453,0.28252763636363637,0.3609693636363636,-0.17393918181818188,-1.6194566363636362,-0.6568330909090908 0.08106327272727269,0.17677836363636365,0.27736190909090913,0.28871454545454545,1.4653256363636364,0.29304009090909094,0.36668563636363627,-0.1619914545454546,-1.6175321818181816,-0.6542397272727273 -0.10534100000000005,0.09833072727272728,0.2820625454545454,0.3601499090909091,1.5713586363636365,0.22264509090909093,0.4250600909090909,-0.34029572727272733,-1.7507773636363633,-0.7001486363636363 diff --git a/java/detect_log/logvalue/30 b/java/detect_log/logvalue/30 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/30 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/31 b/java/detect_log/logvalue/31 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/31 @@ 
-0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/4 b/java/detect_log/logvalue/4 new file mode 100644 index 0000000..2f33070 --- /dev/null +++ b/java/detect_log/logvalue/4 @@ -0,0 +1 @@ +-0.7186064615384615,0.4540127692307693,0.4793014615384616,0.5282222307692307,1.2471863846153846,-0.4967562307692308,-0.2032558461538462,-0.3366557692307693,-1.765158,-0.28286999999999995 -0.7186064615384615,0.4540127692307693,0.4793014615384616,0.5282222307692307,1.2471863846153846,-0.4967562307692308,-0.2032558461538462,-0.3366557692307693,-1.765158,-0.28286999999999995 -0.485389923076923,0.6849405384615385,0.4014096153846154,0.28411176923076914,1.465224923076923,-0.21241961538461537,-0.00538892307692312,-0.26430853846153846,-2.068485,-0.2633912307692307 diff --git a/java/detect_log/logvalue/5 b/java/detect_log/logvalue/5 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/5 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/6 b/java/detect_log/logvalue/6 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/6 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/7 b/java/detect_log/logvalue/7 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/7 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/8 b/java/detect_log/logvalue/8 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/8 @@ -0,0 +1 @@ +-1 diff --git a/java/detect_log/logvalue/9 b/java/detect_log/logvalue/9 new file mode 100644 index 0000000..3a2e3f4 --- /dev/null +++ b/java/detect_log/logvalue/9 @@ -0,0 +1 @@ +-1 diff --git a/java/java.iml b/java/java.iml new file mode 100644 index 0000000..c90834f --- /dev/null +++ b/java/java.iml @@ -0,0 +1,11 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/java/out/production/java/deeplog.class b/java/out/production/java/deeplog.class new file mode 100644 index 0000000000000000000000000000000000000000..954dd5a4ddcf2437189affc301dea791ffb93ed7 GIT binary patch literal 1831 zcmZuyU0WMP7(Ek`YzPa5P(ISu8ft4vDbUuBN>Zx@p;E$!1gWu6TsD)iG}+CwyRqfQ zzr%ICfIjxoOE3H{{sr;OCV@rClkCpSdEfJX%$dnw|NQm`fE?;cM9`xlsw0NDz~H{| zvoTvY?Aq-5?!GjAf%qfKw*1EeJ(Jn(1QG(tiu4K1)SX%aNda-c7drYi4CuIo!E-Fd z1J9QYIy)_LjT8cY%bDG9E!!{pt~46?1X2Pwl8djU+43#BmT4aNdybv)OxJ4qnW|Nn z8QXa^lf-3=Xc*NohH-)63%CTdredz!3^v)07F5GfWX`J!t_b8i5f*Y~&u>)@%BEA1 z<$_atVjJ}X&+^KOyIhg7$s%{uWTttLx#jVj#5KI9;ku6ZkvU&$5Q817I&Q;YxpNm} z@FJ8{yT$ozXsMHD5;yRHhMPKW;X{F;pwys{&6e$34ar@oN&j>*G8rc6Eaw%=q>e1^ z2t*~@D{%XQl)|nXu478i%WJqRFxu_#e5+cOuB>cIqas}mGXm)@wrn?Botltu_(1=ILcY4f4LcqjG$m}Iw;n8KVg%25<&M(u34 zohzw=V;c2s!?o1Bvl#XFEKi`f(DBccpkY`%6{C}3 z1)d%Y;%1{lm(>5oNUTX&u+|XP6^cMa(T=wv?#+3vsKE80-GaHEo&Qj`Xp#vLU(^uA zVz@P?H+(_%0)6cpX-8cZJQ*!d?ljo0`U8Rfp!KUpGw6(lUE-&W_B2@CEeup}iqNX) zv|Ljz1(bTHk27lHFo88*o9Yt*Wr8@L8=TGZE%=VzIfi%>z~AtxU_iqrpL*K>MQm}_ zi*K<_5;}`tNThMiOKF8v@9MAUOAQ@iIF*k5j!3Czs#uENExyIowYQj^c|*3 z?=W3TM~^Ujj8Bg7SuQ4W@pSA6`BE$}h9z4CZP79V28hhZ4R=lw3FQf@qA46ZnB>oE%a7h#lJ5!RRNB z*d7>+0*;$7X~*l9Q7Rlo7#Vno*a=BA4OI=-y5D;mrZ@&hPFMl|@H!xbhNY((R-&lU OIZC=c{`NVJ;N`#A2fGgd literal 0 HcmV?d00001 diff --git a/java/src/deeplog.java b/java/src/deeplog.java new file mode 100644 index 0000000..3e496a9 --- /dev/null +++ b/java/src/deeplog.java @@ -0,0 +1,29 @@ +import java.io.BufferedInputStream; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.util.Scanner; +import java.io.IOException; + +public class deeplog { + public static void main(String[] args) throws Exception { + String detect_file 
= "detect.log"; + String use_model2 = "1"; + System.out.println("\nExecuting python script file now."); + try { + String cmds = String.format("python C:\\study\\code\\LogAnalysis\\java\\deeplog_java.py %s %s", + detect_file,use_model2); + Process proc = Runtime.getRuntime().exec(cmds); + BufferedReader in = new BufferedReader(new InputStreamReader(proc.getInputStream())); + String line = null; + while ((line = in.readLine()) != null) { + System.out.println(line); + } + in.close(); + proc.waitFor(); + } catch (IOException e) { + e.printStackTrace(); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } +} diff --git a/l_a_d_bi_lstm.py b/l_a_d_bi_lstm.py new file mode 100644 index 0000000..8ba6690 --- /dev/null +++ b/l_a_d_bi_lstm.py @@ -0,0 +1,91 @@ +# -*- coding: UTF-8 -*- +from extractfeature.k8s import log_preprocessor +from extractfeature.k8s import value_extract +import os +from logparsing.fttree import fttree +from extractfeature import hdfs_ft_preprocessor +from anomalydetection.loganomaly import log_anomaly_sequential_train +from anomalydetection.loganomaly import log_anomaly_sequential_predict +from anomalydetection.bi_lstm_only import bi_lstm_train +from anomalydetection.bi_lstm_only import bi_lstm_predict + +# parameters for early prepare +log_file_dir = './Data/log/hdfs/' +log_file_name = 'HDFS_split' +log_fttree_out_directory = './Data/FTTreeResult-HDFS/clusters/' +# anomaly file name used which is also used in ./Data/log/file_split +anomaly_line_file = './Data/log/hdfs/HDFs_split_anomaly' +wordvec_file_path = './Data/pretrainedwordvec/crawl-300d-2M.vec(0.1M)' +sequential_directory = './Data/FTTreeResult-HDFS/sequential_files/' +train_file_name = 'train_file' +test_file_name = 'test_file' +label_file_name = 'label_file' +pattern_vec_out_path = './Data/FTTreeResult-HDFS/pattern_vec' +split_degree = 0.2 +# log file line used which is also used in ./Data/log/file_split +log_line_num = 200000 + +# bi lstm only model parameters +window_length = 20 +input_size = 300 +hidden_size = 128 +num_of_layers = 2 +num_of_classes = 26 +num_epochs = 10 +batch_size = 1000 +# for bi lstm only +train_root_path = './Data/FTTreeResult-HDFS/bi_model_train/' +model_out_path = train_root_path + 'bi_model_out/' +data_file = sequential_directory + train_file_name +pattern_vec_file = pattern_vec_out_path + +# predict parameters + +# log anomaly sequential model parameters + +if not os.path.exists(log_fttree_out_directory): + os.makedirs(log_fttree_out_directory) +if not os.path.exists(sequential_directory): + os.makedirs(sequential_directory) +if not os.path.exists(train_root_path): + os.makedirs(train_root_path) + + +def pattern_extract(): + fttree.pattern_extract(log_file_dir, log_file_name, log_fttree_out_directory, 5, 4, 2) + +# 同时生成train file 和 test file好点 +def extract_feature(): + hdfs_ft_preprocessor.preprocessor_hdfs_ft(log_fttree_out_directory, anomaly_line_file, wordvec_file_path, sequential_directory, train_file_name, test_file_name, label_file_name, pattern_vec_out_path, split_degree, log_line_num) + + +def pattern_extract_test(): + fttree.pattern_extract(log_file_dir, log_file_name, log_fttree_out_directory, 5, 4, 2) + + +def extract_feature_test(): + hdfs_ft_preprocessor.preprocessor_hdfs_ft(log_fttree_out_directory, anomaly_line_file, wordvec_file_path, sequential_directory, 'train_file') + + +def train_model(): + #log_anomaly_sequential_train.train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, 
model_out_path, data_file, pattern_vec_file) + bi_lstm_train.train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, model_out_path, data_file, pattern_vec_file) + + +def test_model(): + # do something + #log_anomaly_sequential_predict.do_predict(input_size, hidden_size, num_of_layers, num_of_classes, window_length, model_out_path + 'Adam_batch_size=200;epoch=200.pt', sequential_directory + label_file_name, sequential_directory + test_file_name, 3, pattern_vec_file) + bi_lstm_predict.do_predict(input_size, hidden_size, num_of_layers, num_of_classes, window_length, model_out_path + 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) + '.pt', sequential_directory + label_file_name, sequential_directory + test_file_name, 10, pattern_vec_file) + + +#extract_feature() +#train_model() +test_model() + +# deep log +# log_preprocessor.execute_process() +# value_extract.get_value() +# value_extract.value_deal() +# value_extract.value_extract() +# train predict + diff --git a/log_anomaly_detection.py b/log_anomaly_detection.py index ddc34dd..6f710cf 100644 --- a/log_anomaly_detection.py +++ b/log_anomaly_detection.py @@ -19,18 +19,18 @@ test_file_name = 'test_file' label_file_name = 'label_file' pattern_vec_out_path = './Data/FTTreeResult-HDFS/pattern_vec' -split_degree = 0.2 +split_degree = 0.8 # log file line used which is also used in ./Data/log/file_split log_line_num = 200000 # log anomaly sequential model parameters some parameter maybe changed to train similar models -window_length = 4 +window_length = 20 input_size = 300 -hidden_size = 30 +hidden_size = 128 num_of_layers = 2 -num_of_classes = 61 -num_epochs = 200 -batch_size = 200 +num_of_classes = 26 +num_epochs = 10 +batch_size = 1000 # for log anomaly train_root_path = './Data/FTTreeResult-HDFS/model_train/' model_out_path = train_root_path + 'model_out/' @@ -41,7 +41,7 @@ pattern_vec_file = pattern_vec_out_path # predict parameters - +num_of_candidates = 10 # log anomaly sequential model parameters if not os.path.exists(log_fttree_out_directory): @@ -69,17 +69,17 @@ def extract_feature_test(): def train_model(): - #log_anomaly_sequential_train.train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, model_out_path, data_file, pattern_vec_file) - bi_lstm_att_train.train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, model_out_path, data_file, pattern_vec_file) + log_anomaly_sequential_train.train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, model_out_path, data_file, pattern_vec_file) + #bi_lstm_att_train.train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, model_out_path, data_file, pattern_vec_file) def test_model(): # do something - log_anomaly_sequential_predict.do_predict(input_size, hidden_size, num_of_layers, num_of_classes, window_length, model_out_path + 'Adam_batch_size=200;epoch=200.pt', sequential_directory + label_file_name, sequential_directory + test_file_name, 3, pattern_vec_file) - #bi_lstm_att_predict.do_predict(input_size, hidden_size, num_of_layers, num_of_classes, window_length, model_out_path + 'Adam_batch_size=200;epoch=200.pt', sequential_directory + label_file_name, sequential_directory + test_file_name, 3, pattern_vec_file) + 
log_anomaly_sequential_predict.do_predict(input_size, hidden_size, num_of_layers, num_of_classes, window_length, model_out_path + 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) + '.pt', sequential_directory + label_file_name, sequential_directory + test_file_name, 10, pattern_vec_file) + #bi_lstm_att_predict.do_predict(input_size, hidden_size, num_of_layers, num_of_classes, window_length, model_out_path + 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) + '.pt', sequential_directory + label_file_name, sequential_directory + train_file_name, num_of_candidates, pattern_vec_file) -pattern_extract() -extract_feature() +#pattern_extract() +#extract_feature() train_model() test_model() diff --git a/log_deep_data_anomaly.py b/log_deep_data_anomaly.py new file mode 100644 index 0000000..9db9090 --- /dev/null +++ b/log_deep_data_anomaly.py @@ -0,0 +1,69 @@ +# -*- coding: UTF-8 -*- +# -*- coding: UTF-8 -*- +from extractfeature.k8s import log_preprocessor +from extractfeature.k8s import value_extract +import os +import torch +from torch.utils.data import TensorDataset, DataLoader +from logparsing.fttree import fttree +from extractfeature import hdfs_ft_preprocessor +from anomalydetection.self_att_lstm import self_att_lstm_train +from anomalydetection.self_att_lstm import self_att_lstm_predict + +sequential_directory = './Data/logdeepdata/' +train_file_name = 'hdfs_train' +test_abnormal_name = 'hdfs_test_abnormal' +test_normal_name = 'hdfs_test_normal' +pattern_vec_out_path = './Data/FTTreeResult-HDFS/pattern_vec' + + +# lstm att model parameters +window_length = 10 +input_size = 1 +hidden_size = 128 +num_of_layers = 2 +num_of_classes = 28 +num_epochs = 20 +batch_size = 2000 +# for self att lstm +train_root_path = './Data/Logdeep_Result/self_att_lstm_model_train/' +model_out_path = train_root_path + 'sa_lstm_model_out/' +data_file = sequential_directory + train_file_name +pattern_vec_file = pattern_vec_out_path + +# predict parameters +num_of_candidates = 8 +# log anomaly sequential model parameters + +if not os.path.exists(sequential_directory): + os.makedirs(sequential_directory) +if not os.path.exists(train_root_path): + os.makedirs(train_root_path) + + + +def train_model(): + #log_anomaly_sequential_train.train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, model_out_path, data_file, pattern_vec_file) + self_att_lstm_train.train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, model_out_path, data_file, pattern_vec_file) + + +def test_model(): + # do something + #log_anomaly_sequential_predict.do_predict(input_size, hidden_size, num_of_layers, num_of_classes, window_length, model_out_path + 'Adam_batch_size=200;epoch=200.pt', sequential_directory + label_file_name, sequential_directory + test_file_name, 3, pattern_vec_file) + self_att_lstm_predict.do_log_deep_predict(input_size, hidden_size, num_of_layers, num_of_classes, window_length, model_out_path + 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) + '.pt', sequential_directory + test_normal_name, sequential_directory + test_abnormal_name, num_of_candidates, pattern_vec_file) + + +#pattern_extract() +#extract_feature_spilt_abnormal() +#train_model() +#get_label_sequentials('./Data/FTTreeResult-HDFS/pattern_sequntials') +test_model() + +# deep log +# log_preprocessor.execute_process() +# value_extract.get_value() +# value_extract.value_deal() +# 
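Note: the drivers above now derive the checkpoint file name from the same batch_size and num_epochs values used for training, instead of the hard-coded 'Adam_batch_size=200;epoch=200.pt', so the name used at predict time matches the one written by train_model. A purely illustrative one-liner of the convention:

batch_size, num_epochs = 1000, 10
checkpoint_name = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) + '.pt'
print(checkpoint_name)                                 # Adam_batch_size=1000;epoch=10.pt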
value_extract.value_extract() +# train predict + +# -*- coding: UTF-8 -*- \ No newline at end of file diff --git a/log_predict.py b/log_predict.py new file mode 100644 index 0000000..8b5268d --- /dev/null +++ b/log_predict.py @@ -0,0 +1,305 @@ +#!/usr/bin/python +# -*- coding:utf-8 -*- +import torch +import time +from enum import Enum +from anomalydetection.deeplog.Model1.log_key_LSTM_train import Model as Model1 +from anomalydetection.deeplog.Model2.variable_LSTM_train import Model as Model2 +import torch.nn as nn +import os +import matplotlib.pyplot as plt +from collections import Counter + +# use cuda if available otherwise use cpu +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# 记录每个 key 对应的 value 的长度 +value_length_of_key = [] + +# 继承枚举类 +class LineNumber(Enum): + PATTERN_LINE = 0 + NUMBERS_LINE = 3 + + + +def generate(name,window_length): + log_keys_sequences=list() + length=0 + with open(name, 'r') as f: + for line in f.readlines(): + line = list(map(lambda n: n, map(int, line.strip().split()))) + line = line + [-1] * (window_length + 1 - len(line)) + # for i in range(len(line) - window_size): + # inputs.add(tuple(line[i:i+window_size])) + # log_keys_sequences[tuple(line)] = log_keys_sequences.get(tuple(line), 0) + 1 + log_keys_sequences.append(tuple(line)) + length+=1 + return log_keys_sequences,length + + +def get_value_length(log_preprocessor_dir,log_fttree_out_dir): + global value_length_of_key + value_length_of_key = [10]*(len(os.listdir(log_fttree_out_dir)) + 1) + log_value_folder = log_preprocessor_dir + 'logvalue_train/' + file_names = os.listdir(log_value_folder) + # for i in range(len(file_names)): + # with open(log_value_folder + str(i+1), 'r') as f: + # x = f.readlines() + # if len(x) == 0 or x[0].strip('\n') == '-1': + # value_length_of_key.append(0) + # else: + # line = x[0].strip('\n') + # key_values = line.split(' ') + # value_length_of_key[i+1] = len(key_values[0].split(',')) + + +def load_model1(model_dir,model_name,input_size, hidden_size, num_layers): + num_classes = len(value_length_of_key) + # num_classes = 28 + print("Model1 num_classes: ", num_classes) + model1_dir = model_dir + 'model1/' + model_path = model1_dir + model_name + model1 = Model1(input_size, hidden_size, num_layers, num_classes).to(device) + model1.load_state_dict(torch.load(model_path, map_location='cpu')) + model1.eval() + print('model_path: {}'.format(model_path)) + return model1 + + +def load_model2(model_dir,epoch,input_size, hidden_size, num_layers): + model2_dir = model_dir+ 'model2/' + model2 = [] + for i in range(len(value_length_of_key)): + if value_length_of_key[i] == 0: + model2.append(None) + continue + input_size = value_length_of_key[i] + out_size = input_size + model_name = str(i+1) + '_epoch=' + str(epoch)+ '.pt' + model_path = model2_dir + str(i+1) + '/' + model_name + if not os.path.exists(model_path): + model2.append(None) + continue + model = Model2(input_size, hidden_size, num_layers, out_size).to(device) + model.load_state_dict(torch.load(model_path, map_location='cpu')) + model.eval() + print('model_path: {}'.format(model_path)) + model2.append(model) + return model2 + + +def draw_evaluation(title, indexs, values, xlabel, ylabel): + fig = plt.figure(figsize=(15,10)) + x = indexs + y = values + plt.bar(x, y, align='center', alpha=0.5, width=0.4) + plt.xticks(x, x) + plt.ylabel(ylabel) + plt.xlabel(xlabel) + plt.title(title) + plt.show() + + +def 
do_predict(log_preprocessor_dir,log_fttree_out_dir,model_dir,model1_name,model2_num_epochs,window_length,input_size, hidden_size, num_layers,num_candidates,mse_threshold,use_model2): + # abnormal_label_file = log_preprocessor_dir + 'HDFS_abnormal_label.txt' + + get_value_length(log_preprocessor_dir,log_fttree_out_dir) + + model1 = load_model1(model_dir, model1_name, input_size, hidden_size, num_layers) + + model2 = load_model2(model_dir,model2_num_epochs,10, hidden_size, num_layers) + + # for Model2's prediction, store which log currently predicts for each log_key. + # When model one predicts normal, model2 makes predictions. + # At this time, the forward few logs with the same log_key are needed to be predicted + # so the pattern_index is used to record the log_key to be predicted. + #pattern_index = [0]*len(pattern2value) + #pattern_index = [0] * 63 + start_time = time.time() + criterion = nn.MSELoss() + TP = 0 + FP = 0 + TN = 0 + FN = 0 + ALL = 0 + test_normal_loader, test_normal_length = generate(log_preprocessor_dir+ 'logkey/logkey_normal',window_length) + test_abnormal_loader, test_abnormal_length=generate(log_preprocessor_dir+'logkey/logkey_abnormal',window_length) + + + print('predict start') + + #normal test + with torch.no_grad(): + for line_num,line in enumerate(test_normal_loader): + model1_success=False + for i in range(len(line) - window_length-1): + seq0 = line[i:i + window_length] + label = line[i + window_length] + + + seq0 = torch.tensor(seq0, dtype=torch.float).view( + -1,window_length,input_size).to(device) + label = torch.tensor(label).view(-1).to(device) + output = model1(seq0) + predicted = torch.argsort(output, + 1)[0][-num_candidates:] + if label not in predicted: + FP += 1 + model1_success=True + break + if(model1_success): + continue + + + #如果模型二预测normal TN+1 否则FP+1 + + #现在有63个预测normal value 文件 对一个line 找对应的 value normal下的行 进行预测 + + # When model one predicts normal, model2 makes predictions. 
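+            # use_model2 branch: for each log key, read the value vectors recorded for this line
+            # from logvalue_normal/<key>, convert the comma-separated strings to floats, pad short
+            # value sequences, and feed a window of them to that key's Model2 (the parameter-value
+            # LSTM). The MSE between its prediction and the observed next value vector is compared
+            # with mse_threshold; a window whose error exceeds the threshold is treated as anomalous
+            # (counted as FP in this normal-data loop, as TP in the abnormal-data loop below).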
+ # values:all log's value vector belongs to log_key(whose id is pattern_id) + + # 是否使用模型二 + if use_model2: + + seq=[] #得到63个normal预测文件下的这个window的seq + for i in range(26): + with open(log_preprocessor_dir+'/logvalue_normal/'+str(i+1),'r')as f: + key_values=f.readlines() + key_values=key_values[line_num].strip('\n') + if(key_values=='-1'): + continue + seq.append(key_values.split(' ')) + #将字符串转为数字 + for k1 in range(len(seq)): + for k2 in range(len(seq[k1])): + seq[k1][k2]=seq[k1][k2].strip('\n') + seq[k1][k2]=seq[k1][k2].split(',') + for k3 in range(len(seq[k1][k2])): + if(seq[k1][k2][k3]!=''): + seq[k1][k2][k3]=float(seq[k1][k2][k3]) + + #补全 + for i in range(len(seq)): + if(len(seq[i]) mse_threshold: + FP+=1 + model2_success=True + break + if(model2_success): + break + + + #abnormal test + with torch.no_grad(): + for line in test_abnormal_loader: + model1_success=False + for i in range(len(line) - window_length): + seq0 = line[i:i + window_length] + label = line[i + window_length] + + seq0 = torch.tensor(seq0, dtype=torch.float).view( + -1, window_length, input_size).to(device) + + label = torch.tensor(label,).view(-1).to(device) + output = model1(seq0) + predicted = torch.argsort(output, + 1)[0][-num_candidates:] + if label not in predicted: + TP += 1 + model1_success=True + break + if(model1_success): + continue + + # 是否使用模型二 + if use_model2: + seq=[] #得到63个normal预测文件下的这个window的seq + for i in range(26): + with open(log_preprocessor_dir+'/logvalue_abnormal/'+str(i+1),'r')as f: + key_values=f.readlines() + key_values=key_values[line_num].strip('\n') + if(key_values=='-1'): + continue + seq.append(key_values.split(' ')) + #将字符串转为数字 + for k1 in range(len(seq)): + for k2 in range(len(seq[k1])): + seq[k1][k2]=seq[k1][k2].strip('\n') + seq[k1][k2]=seq[k1][k2].split(',') + for k3 in range(len(seq[k1][k2])): + if(seq[k1][k2][k3]!=''): + seq[k1][k2][k3]=float(seq[k1][k2][k3]) + + #补全 + for i in range(len(seq)): + if(len(seq[i]) mse_threshold: + TP+=1 + model2_success=True + break + if(model2_success): + break + + #现在有63个预测normal value 文件 对一个line 找对应的 value normal下的行 进行预测 + + + # Compute precision, recall and F1-measure + FN = test_abnormal_length - TP + TN=test_normal_length-FP + + print('FP: {}, FN: {}, TP: {}, TN: {}'.format(FP, FN, TP, TN)) + Acc = (TP + TN) * 100 /(TP+TN+FP+FN) + P = 100 * TP / (TP + FP) + R = 100 * TP / (TP + FN) + F1 = 2 * P * R / (P + R) + print('Finished Predicting') + elapsed_time = time.time() - start_time + print('elapsed_time: {}'.format(elapsed_time)) + + + print('FP: {}, FN: {}, TP: {}, TN: {}'.format(FP, FN, TP, TN)) + # print('Acc: {:.3f}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(Acc, P, R, F1)) + print('Finished Predicting') + elapsed_time = time.time() - start_time + print('elapsed_time: {}'.format(elapsed_time)) + + #draw_evaluation("Evaluations", ['Acc', 'Precision', 'Recall', 'F1-measure'],[Acc, P, R, F1], 'evaluations', '%') + + + + + + + diff --git a/logparsing/converter/__init__.py b/logparsing/converter/__init__.py new file mode 100644 index 0000000..9764abf --- /dev/null +++ b/logparsing/converter/__init__.py @@ -0,0 +1 @@ +# -*- coding: UTF-8 -*- \ No newline at end of file diff --git a/logparsing/converter/__pycache__/__init__.cpython-36.pyc b/logparsing/converter/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..36fcdb4469308b381ae6f8e1ed5c10e6b5726338 GIT binary patch literal 171 zcmXr!<>m6&bSO@ofq~&M5W@i@kmUfx#VkM~g&~+hlhJP_LlH4|xXIhDnk#W6nl>5ecqP`n_qs2HR;IX|x~ 
lwWuVuC?-BWGcU6wK3=b&@)n0pZhlH>PO2Tq#$q65006v%Froke literal 0 HcmV?d00001 diff --git a/logparsing/converter/__pycache__/eventid2number.cpython-36.pyc b/logparsing/converter/__pycache__/eventid2number.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d1f0a5f7afeae6157c9bdd57e38e898cf35a08d3 GIT binary patch literal 496 zcmYjNy-ve05VoBr4G0PY0$!mD18fK(P+5?Gg{_JjnVwsM949)vfl6U2YZu;v7h>?r z#49jyE+BEzeZIf%etfjKx&GR`kJdP2AMBq9vd>8E4goOWnhiM!h+f!G0KaAZ&L?Hq z&6t=g35*oMm!|@^$)<~wS{jfRyX3B7tL6T|7A1UoQdTh2>NNbA4Zo6#Z^|5k AcK`qY literal 0 HcmV?d00001 diff --git a/logparsing/converter/eventid2number.py b/logparsing/converter/eventid2number.py new file mode 100644 index 0000000..ceba5e0 --- /dev/null +++ b/logparsing/converter/eventid2number.py @@ -0,0 +1,8 @@ +import pandas as pd + +def add_numberid(logparser_templates_file): + df = pd.read_csv(logparser_templates_file, header=0) + df['numberID'] = range(1, len(df) + 1) + print(df) + + df.to_csv(logparser_templates_file, columns=df.columns, index=0, header=1) \ No newline at end of file diff --git a/logparsing/converter/logparser2cluster.py b/logparsing/converter/logparser2cluster.py new file mode 100644 index 0000000..48b95ba --- /dev/null +++ b/logparsing/converter/logparser2cluster.py @@ -0,0 +1,25 @@ +# coding:utf-8 +import pandas as pd +import os + +# log parser_file should be structed.csv output should be './Data/FTTreeResult-HDFS/clusters/' +def logparser2cluster(logparser_file, output_dir): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + logparser_result = pd.read_csv(logparser_file, header=0) + key_dict = {} + value_dict = {} + for _, row in logparser_result.iterrows(): + key = row['EventTemplate'] + if not key in key_dict: + key_dict[key] = [] + key_dict[key].append(str(row['LineId'])) + key_num = 1 + for key, lines in key_dict.items(): + with open(output_dir + "/" + str(key_num), 'w') as f: + f.write(key + "\n") + f.write(" ".join(lines)) + key_num += 1 + +if __name__ == "__main__": + logparser2cluster("Drain_result/HDFS.log_structured.csv", "clusters") diff --git a/logparsing/drain/.gitignore b/logparsing/drain/.gitignore new file mode 100644 index 0000000..546f7e3 --- /dev/null +++ b/logparsing/drain/.gitignore @@ -0,0 +1,9 @@ +**/__pycache__/* +MANIFEST +dist/* +venv/* +.idea/* +drain3.egg-info/* +snapshot.txt +examples/snapshot.txt +*.bin diff --git a/logparsing/drain/CONTRIBUTING.md b/logparsing/drain/CONTRIBUTING.md new file mode 100644 index 0000000..b54d7be --- /dev/null +++ b/logparsing/drain/CONTRIBUTING.md @@ -0,0 +1,48 @@ +All contributors must agree to the Developer Certificate of Origin Version 1.1. (DCO 1.1) by signing their commits with: + +``` +Signed-off-by: [NAME] <[EMAIL]> +``` + +This can be simply achieved with `git commit -s` when formatting your commit message. + +The full text of the DCO 1.1 is as follows: + +``` +Developer Certificate of Origin +Version 1.1 + +Copyright (C) 2004, 2006 The Linux Foundation and its contributors. +660 York Street, Suite 102, +San Francisco, CA 94110 USA + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. 
+ + +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I +have the right to submit it under the open source license +indicated in the file; or + +(b) The contribution is based upon previous work that, to the best +of my knowledge, is covered under an appropriate open source +license and I have the right under that license to submit that +work with modifications, whether created in whole or in part +by me, under the same open source license (unless I am +permitted to submit under a different license), as indicated +in the file; or + +(c) The contribution was provided directly to me by some other +person who certified (a), (b) or (c) and I have not modified +it. + +(d) I understand and agree that this project and the contribution +are public and that a record of the contribution (including all +personal information I submit with it, including my sign-off) is +maintained indefinitely and may be redistributed consistent with +this project or the open source license(s) involved. +``` diff --git a/logparsing/drain/HDFS_drain.py b/logparsing/drain/HDFS_drain.py new file mode 100644 index 0000000..b14e226 --- /dev/null +++ b/logparsing/drain/HDFS_drain.py @@ -0,0 +1,34 @@ +import configparser +import json +import logging +import sys +import os +import shutil + +from logparsing.drain.drain3.template_miner import TemplateMiner +from logparsing.drain.drain3.file_persistence import FilePersistence + + +def get_hdfs_drain_clusters(log,drain_out,bin_dir): + persistence_type = "FILE" + config = configparser.ConfigParser() + config.read('drain3.ini') + logger = logging.getLogger(__name__) + logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(message)s') + persistence = FilePersistence(bin_dir) + template_miner = TemplateMiner(persistence) + shutil.rmtree(drain_out) + os.makedirs(drain_out,exist_ok=True) + with open(log,'r') as file: + lineNum = 0 + for line in file.readlines(): + print(lineNum) + result = template_miner.add_log_message(line) + cluster_id = json.dumps(result["cluster_id"]) + cluster_id = int(cluster_id[2:-1]) + with open(drain_out+str(cluster_id),'a') as outfile: + outfile.write(str(lineNum) + " ") + lineNum += 1 + # print("Clusters:") + #for cluster in template_miner.drain.clusters: + #print(cluster) diff --git a/logparsing/drain/LICENSE.txt b/logparsing/drain/LICENSE.txt new file mode 100644 index 0000000..d152f60 --- /dev/null +++ b/logparsing/drain/LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 International Business Machines + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/logparsing/drain/README.md b/logparsing/drain/README.md new file mode 100644 index 0000000..ec7d78a --- /dev/null +++ b/logparsing/drain/README.md @@ -0,0 +1,169 @@ +# Drain3 +## Introduction + +Drain3 is an online log template miner that can extract templates (clusters) from a stream of log messages +in a timely manner. It employs a parse tree with fixed depth to guide the log group search process, +which effectively avoids constructing a very deep and unbalanced tree. + +Drain3 continuously learns on-the-fly and automatically extracts "log templates" from raw log entries. + +#### Example: + +For the input: + +``` +connected to 10.0.0.1 +connected to 10.0.0.2 +connected to 10.0.0.3 +Hex number 0xDEADBEAF +Hex number 0x10000 +user davidoh logged in +user eranr logged in +``` + +Drain3 extracts the following templates: + +``` +A0001 (size 3): connected to +A0002 (size 2): Hex number +A0003 (size 2): user <*> logged in +``` + +This project is an upgrade of the original [Drain](https://github.com/logpai/logparser/blob/master/logparser/Drain) +project by LogPAI from Python 2.7 to Python 3.6 or later with some bug-fixes and additional features. + +Read more information about Drain from the following paper: + +- Pinjia He, Jieming Zhu, Zibin Zheng, and Michael R. Lyu. [Drain: An Online Log Parsing Approach with Fixed Depth Tree](http://jmzhu.logpai.com/pub/pjhe_icws2017.pdf), Proceedings of the 24th International Conference on Web Services (ICWS), 2017. + +A possible Drain3 use case in this blog post: [Use open source Drain3 log-template mining project to monitor for network outages](https://developer.ibm.com/blogs/how-mining-log-templates-can-help-ai-ops-in-cloud-scale-data-centers). + + +#### New features + +- **Persistence**. Save and load Drain state into an [Apache Kafka](https://kafka.apache.org) topic or a file. +- **Streaming**. Support feeding Drain with messages one-be-one. +- **Masking**. Replace some message parts (e.g numbers, IPs, emails) with wildcards. This improves the accuracy of template mining. +- **Packaging**. As a pip package. + +#### Expected Input and Output + +The input for Drain3 is the unstructured free-text portion log messages. It is recommended to extract +structured headers like timestamp, hostname. severity, etc.. from log messages before passing to Drain3, +in order to improve mining accuracy. + +The output is a dictionary with the following fields: +- `change_type`: indicates either if a new template was identified, an existing template was changed or message added to an existing cluster. 
+- `cluster_id`: Sequential ID of the cluster that the log belongs to, for example, `A0008` +- `cluster_size`: The size (message count) of the cluster that the log belongs to +- `cluster_count`: Count clusters seen so far +- `template_mined`: the last template of above cluster_id + +Templates may change over time based on input, for example: + +``` +aa aa aa +{"change_type": "cluster_created", "cluster_id": "A0001", "cluster_size": 1, "template_mined": "aa aa aa", "cluster_count": 1} + +aa aa ab +{"change_type": "cluster_template_changed", "cluster_id": "A0001", "cluster_size": 2, "template_mined": "aa aa <*>", "cluster_count": 1} +``` + +**Explanation:** *Drain3 learned that the third token is a parameter* + +## Configuration + +Drain3 is configured using [configparser](https://docs.python.org/3.4/library/configparser.html) using file `drain3.ini` available parameters are: +- `[DEFAULT]/snapshot_poll_timeout_sec` - maximum timeout for restoring snapshot from Kafka (default 60) +- `[DEFAULT]/sim_th` - recognition threshold (default 0.4) +- `[DEFAULT]/masking` - parameters masking - in json format (default "") +- `[DEFAULT]/snapshot_interval_minutes` - interval for new snapshots (default 1) +- `[DEFAULT]/compress_state` - whether to compress the state before saving it. This can be useful when using Kafka persistence. + +## Masking + +This feature allows masking of specific parameters in log message to specific keywords. Use a list of regular expression +dictionaries in the configuration file with the format {'regex_pattern', 'mask_with'} to set custom masking. + +In order to mask an IP address created the file `drain3.ini` : + +``` +[DEFAULT] +masking = [ + {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})((?=[^A-Za-z0-9])|$)", "mask_with": "IP"}, + ] +``` + +Now, Drain3 recognizes IP addresses in templates, for example with input such as: +``` +IP is 12.12.12.12 +{"change_type": "cluster_created", "cluster_id": "A0013", "cluster_size": 1, "template_mined": "IP is ", "cluster_count": 13} +``` + +Note: template parameters that do not match custom masking are output as <*> + +## Persistence +The persistence feature saves and loads a snapshot of Drain3 state in (compressed) json format. This feature adds restart resiliency +to Drain allowing continuation of activity and knowledge across restarts. + +Drain3 state includes the search tree and all the clusters that were identified up until snapshot time. + +The snapshot also persist number of occurrences per cluster, and the cluster_id. + +An example of a snapshot: +``` +{"clusters": [{"cluster_id": "A0001", "log_template_tokens": `["aa", "aa", "<\*>"]`, "py/object": "drain3_core.LogCluster", "size": 2}, {"cluster_id": "A0002", "log_template_tokens": `["My", "IP", "is", ""]`, "py/object": "drain3_core.LogCluster", "size": 1}]... +``` + +This example snapshot persist two clusters with the templates: + +> `["aa", "aa", "<\*>"]` - occurs twice +> +> `["My", "IP", "is", ""]` - occurs once + +Snapshots are created in the following events: + +- `cluster_created` - in any new template +- `cluster_template_changed` - in any update of a template +- `periodic` - after n minutes from the last snapshot. This is intended to save cluster sizes even if no new template was identified. + +Drain3 currently supports 3 persistence modes: + +- **Kafka** - The snapshot is saved in a dedicated topic used only for snapshots - the last message in this topic +is the last snapshot that will be loaded after restart. 
+For Kafka persistence, you need to provide: `topic_name` and `server_name`. + +- **File** - The snapshot is saved to a file. + +- **None** - No persistence. + +Drain3 persistence modes can be easily extended to another medium / database by +inheriting the [PersistenceHandler](drain3/persistence_handler.py) class. + + +## Installation + +Drain3 is available from [PyPI](https://pypi.org/project/drain3). To install use `pip`: + +```pip3 install drain3``` + + +## Examples + +Run [examples/drain_stdin_demo.py](examples/drain_stdin_demo.py) from the root folder of the repository by: + +``` +python -m examples.drain_stdin_demo +``` + +Use Drain3 with input from stdin and persist to either Kafka / file / no persistence. + +Enter several log lines using the command line. Press `q` to end execution. + +Change `persistence_type` variable in the example to change persistence mode. + +An example drain3.ini file with masking instructions exists in the `examples` folder. + +## Contributing + +Our project welcomes external contributions. Please refer to [CONTRIBUTING.md](CONTRIBUTING.md) for further details. diff --git a/logparsing/drain/__init__.py b/logparsing/drain/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/logparsing/drain/drain3/__init__.py b/logparsing/drain/drain3/__init__.py new file mode 100644 index 0000000..3113a50 --- /dev/null +++ b/logparsing/drain/drain3/__init__.py @@ -0,0 +1,2 @@ +from logparsing.drain.drain3.template_miner import TemplateMiner + diff --git a/logparsing/drain/drain3/drain.py b/logparsing/drain/drain3/drain.py new file mode 100644 index 0000000..9b961d4 --- /dev/null +++ b/logparsing/drain/drain3/drain.py @@ -0,0 +1,258 @@ +""" +Description : This file implements the Drain algorithm for log parsing +Author : LogPAI team +Modified by : david.ohana@ibm.com, moshikh@il.ibm.com +License : MIT +""" + +param_str = '<*>' + + +class LogCluster: + def __init__(self, log_template_tokens: list, cluster_id): + self.log_template_tokens = log_template_tokens + self.cluster_id = cluster_id + self.size = 1 + + def get_template(self): + return ' '.join(self.log_template_tokens) + + def __str__(self): + return f"{self.cluster_id} (size {self.size}): {self.get_template()}" + + +class Node: + def __init__(self, key, depth): + self.depth = depth + self.key = key + self.key_to_child_node = {} + self.clusters = [] + + +class Drain: + + def __init__(self, depth=4, sim_th=0.4, max_children=100): + """ + Attributes + ---------- + depth : depth of all leaf nodes + sim_th : similarity threshold + max_children : max number of children of an internal node + """ + self.depth = depth - 2 + self.sim_th = sim_th + self.max_children = max_children + self.root_node = Node("(ROOT)", 0) + self.clusters = [] + + @staticmethod + def has_numbers(s): + return any(char.isdigit() for char in s) + + def tree_search(self, root_node: Node, tokens): + + token_count = len(tokens) + parent_node = root_node.key_to_child_node.get(token_count) + + # no template with same token count yet + if parent_node is None: + return None + + # handle case of empty log string + if token_count == 0: + return parent_node.clusters[0] + + cluster = None + current_depth = 1 + for token in tokens: + at_max_depth = current_depth == self.depth + is_last_token = current_depth == token_count + + if at_max_depth or is_last_token: + break + + key_to_child_node = parent_node.key_to_child_node + if token in key_to_child_node: + parent_node = key_to_child_node[token] + elif param_str in key_to_child_node: + parent_node = 
key_to_child_node[param_str] + else: + return cluster + current_depth += 1 + + cluster = self.fast_match(parent_node.clusters, tokens) + + return cluster + + def add_seq_to_prefix_tree(self, root_node, cluster: LogCluster): + token_count = len(cluster.log_template_tokens) + if token_count not in root_node.key_to_child_node: + first_layer_node = Node(key=token_count, depth=1) + root_node.key_to_child_node[token_count] = first_layer_node + else: + first_layer_node = root_node.key_to_child_node[token_count] + + parent_node = first_layer_node + + # handle case of empty log string + if len(cluster.log_template_tokens) == 0: + parent_node.clusters.append(cluster) + return + + current_depth = 1 + for token in cluster.log_template_tokens: + + # Add current log cluster to the leaf node + at_max_depth = current_depth == self.depth + is_last_token = current_depth == token_count + if at_max_depth or is_last_token: + parent_node.clusters.append(cluster) + break + + # If token not matched in this layer of existing tree. + if token not in parent_node.key_to_child_node: + if not self.has_numbers(token): + if param_str in parent_node.key_to_child_node: + if len(parent_node.key_to_child_node) < self.max_children: + new_node = Node(key=token, depth=current_depth + 1) + parent_node.key_to_child_node[token] = new_node + parent_node = new_node + else: + parent_node = parent_node.key_to_child_node[param_str] + else: + if len(parent_node.key_to_child_node) + 1 < self.max_children: + new_node = Node(key=token, depth=current_depth + 1) + parent_node.key_to_child_node[token] = new_node + parent_node = new_node + elif len(parent_node.key_to_child_node) + 1 == self.max_children: + new_node = Node(key=param_str, depth=current_depth + 1) + parent_node.key_to_child_node[param_str] = new_node + parent_node = new_node + else: + parent_node = parent_node.key_to_child_node[param_str] + + else: + if param_str not in parent_node.key_to_child_node: + new_node = Node(key=param_str, depth=current_depth + 1) + parent_node.key_to_child_node[param_str] = new_node + parent_node = new_node + else: + parent_node = parent_node.key_to_child_node[param_str] + + # If the token is matched + else: + parent_node = parent_node.key_to_child_node[token] + + current_depth += 1 + + # seq1 is template + @staticmethod + def get_seq_distance(seq1, seq2): + assert len(seq1) == len(seq2) + sim_tokens = 0 + param_count = 0 + + for token1, token2 in zip(seq1, seq2): + if token1 == param_str: + param_count += 1 + continue + if token1 == token2: + sim_tokens += 1 + + ret_val = float(sim_tokens) / len(seq1) + + return ret_val, param_count + + def fast_match(self, cluster_list: list, tokens): + match_cluster = None + + max_sim = -1 + max_param_count = -1 + max_cluster = None + + for cluster in cluster_list: + cur_sim, param_count = self.get_seq_distance(cluster.log_template_tokens, tokens) + if cur_sim > max_sim or (cur_sim == max_sim and param_count > max_param_count): + max_sim = cur_sim + max_param_count = param_count + max_cluster = cluster + + if max_sim >= self.sim_th: + match_cluster = max_cluster + + return match_cluster + + @staticmethod + def get_template(seq1, seq2): + assert len(seq1) == len(seq2) + ret_val = [] + + i = 0 + for word in seq1: + if word == seq2[i]: + ret_val.append(word) + else: + ret_val.append(param_str) + + i += 1 + + return ret_val + + def print_tree(self): + self.print_node(self.root_node, 0) + + def print_node(self, node, depth): + out_str = '' + for i in range(depth): + out_str += '\t' + + if node.depth == 0: + out_str += 
'Root' + elif node.depth == 1: + out_str += '<' + str(node.key) + '>' + else: + out_str += node.key + + print(out_str) + + if node.depth == self.depth: + return 1 + for child in node.key_to_child_node: + self.print_node(node.key_to_child_node[child], depth + 1) + + @staticmethod + def num_to_cluster_id(num): + cluster_id = "A{:04d}".format(num) + return cluster_id + + def add_log_message(self, content: str): + content = content.strip() + content_tokens = content.split() + match_cluster = self.tree_search(self.root_node, content_tokens) + + # Match no existing log cluster + if match_cluster is None: + cluster_num = len(self.clusters) + 1 + cluster_id = self.num_to_cluster_id(cluster_num) + match_cluster = LogCluster(content_tokens, cluster_id) + self.clusters.append(match_cluster) + self.add_seq_to_prefix_tree(self.root_node, match_cluster) + update_type = "cluster_created" + + # Add the new log message to the existing cluster + else: + new_template_tokens = self.get_template(content_tokens, match_cluster.log_template_tokens) + if ' '.join(new_template_tokens) != ' '.join(match_cluster.log_template_tokens): + match_cluster.log_template_tokens = new_template_tokens + update_type = "cluster_template_changed" + else: + update_type = "none" + match_cluster.size += 1 + + return match_cluster, update_type + + def get_total_cluster_size(self): + size = 0 + for c in self.clusters: + size += c.size + return size diff --git a/logparsing/drain/drain3/file_persistence.py b/logparsing/drain/drain3/file_persistence.py new file mode 100644 index 0000000..26faf66 --- /dev/null +++ b/logparsing/drain/drain3/file_persistence.py @@ -0,0 +1,25 @@ +""" +Description : This file implements the persist/restore from file +Author : Moshik Hershcovitch +Author_email: moshikh@il.ibm.com +License : MIT +""" + +import os +import pathlib + +from logparsing.drain.drain3.persistence_handler import PersistenceHandler + + +class FilePersistence(PersistenceHandler): + def __init__(self, file_path): + self.file_path = file_path + + def save_state(self, state): + pathlib.Path(self.file_path).write_bytes(state) + + def load_state(self): + if not os.path.exists(self.file_path): + return None + + return pathlib.Path(self.file_path).read_bytes() diff --git a/logparsing/drain/drain3/kafka_persistence.py b/logparsing/drain/drain3/kafka_persistence.py new file mode 100644 index 0000000..c5a05d4 --- /dev/null +++ b/logparsing/drain/drain3/kafka_persistence.py @@ -0,0 +1,45 @@ +""" +Author : Moshik Hershcovitch +Author : David Ohana, Moshik Hershcovitch, Eran Raichstein +Author_email: david.ohana@ibm.com, moshikh@il.ibm.com, eranra@il.ibm.com +License : MIT +""" +import configparser + +import kafka + +# logger = logging.getLogger(__name__) +from logparsing.drain.drain3.persistence_handler import PersistenceHandler + +config = configparser.ConfigParser() +config.read('drain3.ini') + + +class KafkaPersistence(PersistenceHandler): + def __init__(self, server_list, topic): + self.server_list = server_list + self.topic = topic + self.producer = kafka.KafkaProducer(bootstrap_servers=server_list) + + def save_state(self, state): + self.producer.send(self.topic, value=state) + + def load_state(self): + consumer = kafka.KafkaConsumer(bootstrap_servers=self.server_list) + partition = kafka.TopicPartition(self.topic, 0) + consumer.assign([partition]) + end_offsets = consumer.end_offsets([partition]) + end_offset = list(end_offsets.values())[0] + if end_offset > 0: + consumer.seek(partition, end_offset - 1) + snapshot_poll_timeout_ms = 
int(config.get('DEFAULT', 'snapshot_poll_timeout_sec', fallback=60)) * 1000 + records = consumer.poll(snapshot_poll_timeout_ms) + if not records: + raise RuntimeError(f"No message received from Kafka during restore even though end_offset>0") + last_msg = records[partition][0] + state = last_msg.value + else: + state = None + + consumer.close() + return state diff --git a/logparsing/drain/drain3/masking.py b/logparsing/drain/drain3/masking.py new file mode 100644 index 0000000..a57b9bb --- /dev/null +++ b/logparsing/drain/drain3/masking.py @@ -0,0 +1,65 @@ +""" +Description : This file implements the persist/restore from Kafka +Author : Moshik Hershcovitch +Author_email: moshikh@il.ibm.com +License : MIT +""" +import configparser +import json +import logging +import re +from typing import List + +logger = logging.getLogger(__name__) +config = configparser.ConfigParser() +config.read('drain3.ini') + + +class MaskingInstruction: + def __init__(self, regex_pattern: str, mask_with: str): + self.regex_pattern = regex_pattern + self.mask_with = mask_with + self.regex = re.compile(regex_pattern) + self.mask_with_wrapped = "<" + mask_with + ">" + + +class RegexMasker: + def __init__(self, masking_instructions: List[MaskingInstruction]): + self.masking_instructions = masking_instructions + + def mask(self, content: str): + for mi in self.masking_instructions: + content = re.sub(mi.regex, mi.mask_with_wrapped, content) + return content + + +# Some masking examples +# --------------------- +# +# masking_instances = [ +# MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)(([0-9a-f]{2,}:){3,}([0-9a-f]{2,}))((?=[^A-Za-z0-9])|$)', "ID"), +# MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})((?=[^A-Za-z0-9])|$)', "IP"), +# MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)([0-9a-f]{6,} ?){3,}((?=[^A-Za-z0-9])|$)', "SEQ"), +# MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)([0-9A-F]{4} ?){4,}((?=[^A-Za-z0-9])|$)', "SEQ"), +# +# MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)(0x[a-f0-9A-F]+)((?=[^A-Za-z0-9])|$)', "HEX"), +# MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)([\-\+]?\d+)((?=[^A-Za-z0-9])|$)', "NUM"), +# MaskingInstruction(r'(?<=executed cmd )(".+?")', "CMD"), +# ] + + +class LogMasker: + def __init__(self): + masking_instances = [] + self.masker = None + m = json.loads(config.get('DEFAULT', 'masking', fallback="[]")) + for i in range(len(m)): + logger.info("Adding custom mask {0} --> {1}".format(str(m[i]['mask_with']), str(m[i]['regex_pattern']))) + masking_instances.append(MaskingInstruction(m[i]['regex_pattern'], m[i]['mask_with'])) + self.masker = RegexMasker(masking_instances) + + def mask(self, content: str): + if self.masker is not None: + return self.masker.mask(content) + else: + return content diff --git a/logparsing/drain/drain3/persistence_handler.py b/logparsing/drain/drain3/persistence_handler.py new file mode 100644 index 0000000..a1e5076 --- /dev/null +++ b/logparsing/drain/drain3/persistence_handler.py @@ -0,0 +1,18 @@ +""" +Description : This file implements an abstract class for implementing a Drain3 persistence handler +Author : David Ohana +Author_email: david.ohana@ibm.com +License : MIT +""" +from abc import ABC, abstractmethod + + +class PersistenceHandler(ABC): + + @abstractmethod + def save_state(self, state): + pass + + @abstractmethod + def load_state(self): + pass diff --git a/logparsing/drain/drain3/template_miner.py b/logparsing/drain/drain3/template_miner.py new file mode 100644 index 0000000..aeb4b79 --- /dev/null +++ b/logparsing/drain/drain3/template_miner.py @@ -0,0 
+1,98 @@ +""" +Description : This file implements wrapper of the Drain core algorithm - add persistent and recovery +Author : David Ohana, Moshik Hershcovitch, Eran Raichstein +Author_email: david.ohana@ibm.com, moshikh@il.ibm.com, eranra@il.ibm.com +License : MIT +""" +import base64 +import configparser +import logging +import time +import zlib + +import jsonpickle + +from logparsing.drain.drain3.drain import Drain +from logparsing.drain.drain3.masking import LogMasker +from logparsing.drain.drain3.persistence_handler import PersistenceHandler + +logger = logging.getLogger(__name__) +config = configparser.ConfigParser() +config.read('drain3.ini') + + +class TemplateMiner: + + def __init__(self, persistence_handler: PersistenceHandler): + logger.info("Starting Drain3 template miner") + self.compress_state = config.get('DEFAULT', 'compress_state', fallback=True) + self.persistence_handler = persistence_handler + self.snapshot_interval_seconds = int(config.get('DEFAULT', 'snapshot_interval_minutes', fallback=5)) * 60 + self.drain = Drain(sim_th=float(config.get('DEFAULT', 'sim_th', fallback=0.4))) + self.masker = LogMasker() + self.last_save_time = time.time() + if persistence_handler is not None: + self.load_state() + + def load_state(self): + logger.info("Checking for saved state") + + state = self.persistence_handler.load_state() + if state is None: + logger.info("Saved state not found") + return + + if self.compress_state: + state = zlib.decompress(base64.b64decode(state)) + + drain: Drain = jsonpickle.loads(state) + + # After loading, the keys of "parser.root_node.key_to_child" are string instead of int, + # so we have to cast them to int + keys = [] + for i in drain.root_node.key_to_child_node.keys(): + keys.append(i) + for key in keys: + drain.root_node.key_to_child_node[int(key)] = drain.root_node.key_to_child_node.pop(key) + + self.drain = drain + logger.info("Restored {0} clusters with {1} messages".format( + len(drain.clusters), drain.get_total_cluster_size())) + + def save_state(self, snapshot_reason): + state = jsonpickle.dumps(self.drain).encode('utf-8') + if self.compress_state: + state = base64.b64encode(zlib.compress(state)) + + logger.info(f"Saving state of {len(self.drain.clusters)} clusters " + f"with {self.drain.get_total_cluster_size()} messages, {len(state)} bytes, " + f"reason: {snapshot_reason}") + self.persistence_handler.save_state(state) + + def get_snapshot_reason(self, change_type): + if change_type != "none": + return change_type + + diff_time_sec = time.time() - self.last_save_time + if diff_time_sec >= self.snapshot_interval_seconds: + return "periodic" + + return None + + def add_log_message(self, log_message: str): + masked_content = self.masker.mask(log_message) + cluster, change_type = self.drain.add_log_message(masked_content) + result = { + "change_type": change_type, + "cluster_id": cluster.cluster_id, + "cluster_size": cluster.size, + "template_mined": cluster.get_template(), + "cluster_count": len(self.drain.clusters) + } + + if self.persistence_handler is not None: + snapshot_reason = self.get_snapshot_reason(change_type) + if snapshot_reason: + self.save_state(snapshot_reason) + self.last_save_time = time.time() + return result diff --git a/logparsing/drain/examples/drain3.ini b/logparsing/drain/examples/drain3.ini new file mode 100644 index 0000000..8cd0ec8 --- /dev/null +++ b/logparsing/drain/examples/drain3.ini @@ -0,0 +1,14 @@ +[DEFAULT] +sim_th = 0.4 +snapshot_interval_minutes = 10 +snapshot_poll_timeout_sec = 60 +masking = [ + 
{"regex_pattern":"((?<=[^A-Za-z0-9])|^)(([0-9a-f]{2,}:){3,}([0-9a-f]{2,}))((?=[^A-Za-z0-9])|$)", "mask_with": "ID"}, + {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})((?=[^A-Za-z0-9])|$)", "mask_with": "IP"}, + {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([0-9a-f]{6,} ?){3,}((?=[^A-Za-z0-9])|$)", "mask_with": "SEQ"}, + {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([0-9A-F]{4} ?){4,}((?=[^A-Za-z0-9])|$)", "mask_with": "SEQ"}, + {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(0x[a-f0-9A-F]+)((?=[^A-Za-z0-9])|$)", "mask_with": "HEX"}, + {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "mask_with": "NUM"} + ] + + diff --git a/logparsing/drain/examples/drain_stdin_demo.py b/logparsing/drain/examples/drain_stdin_demo.py new file mode 100644 index 0000000..00b45de --- /dev/null +++ b/logparsing/drain/examples/drain_stdin_demo.py @@ -0,0 +1,36 @@ +""" +Description : Example of using Drain3 with Kafka persistence +Author : David Ohana, Moshik Hershcovitch, Eran Raichstein +Author_email: david.ohana@ibm.com, moshikh@il.ibm.com, eranra@il.ibm.com +License : MIT +""" +import configparser +import json +import logging +import sys +sys.path.append('../') + +from logparsing.drain.drain3.template_miner import TemplateMiner +from logparsing.drain.drain3.file_persistence import FilePersistence + +persistence_type = "FILE" + +config = configparser.ConfigParser() +config.read('drain3.ini') + +logger = logging.getLogger(__name__) +logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(message)s') +persistence = FilePersistence("drain3_state.bin") +template_miner = TemplateMiner(persistence) +print(f"Drain3 started with '{persistence_type}' persistence, reading from std-in (input 'q' to finish)") +while True: + log_line = input() + if log_line == 'q': + break + result = template_miner.add_log_message(log_line) + result_json = json.dumps(result) + print(result_json) + +print("Clusters:") +for cluster in template_miner.drain.clusters: + print(cluster) diff --git a/logparsing/drain/requirements.txt b/logparsing/drain/requirements.txt new file mode 100644 index 0000000..6fa3443 --- /dev/null +++ b/logparsing/drain/requirements.txt @@ -0,0 +1,5 @@ +jsonpickle==1.3 +kafka==1.3.5 + + + diff --git a/logparsing/drain/setup.cfg b/logparsing/drain/setup.cfg new file mode 100644 index 0000000..b88034e --- /dev/null +++ b/logparsing/drain/setup.cfg @@ -0,0 +1,2 @@ +[metadata] +description-file = README.md diff --git a/logparsing/drain/setup.py b/logparsing/drain/setup.py new file mode 100644 index 0000000..cfb897e --- /dev/null +++ b/logparsing/drain/setup.py @@ -0,0 +1,32 @@ +from setuptools import setup +from os import path + +this_directory = path.abspath(path.dirname(__file__)) +with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f: + long_description = f.read() + +setup( + name='drain3', + packages= ['drain3'], + version="0.7.2", + license='MIT', + description="persistent log parser", + long_description=long_description, + long_description_content_type="text/markdown", + author="IBM Research Haifa", + author_email="drain3@il.ibm.com", + url="https://github.com/IBM/Drain3", + download_url = 'https://github.com/IBM/Drain3/archive/v_01.tar.gz', + keywords = ['drain', 'log', 'parser', 'IBM'], + install_requires=[ + 'jsonpickle==1.3', + 'kafka==1.3.5' + ], + classifiers=[ + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Topic 
:: Software Development :: Libraries", + ], +) diff --git a/logparsing/fttree/__pycache__/__init__.cpython-36.pyc b/logparsing/fttree/__pycache__/__init__.cpython-36.pyc index aac5a61d3dace8d92be6fbf1239c3e15cc68e376..43201daff2aae4fd04596d139da61ee9bb6337c7 100644 GIT binary patch delta 26 hcmZ3%xPp<@n3tDp#za;J&a}*&)Z!T5%AARQQ2<|l2mk;8 delta 26 hcmZ3%xPp<@n3tDp`b1U-PPfdQ)Z!T5N}q{-Q2Eq%@^6COJPPHO41D-7zmQr?NP+I6g16 Md}6l_>t~R105zl&kN^Mx delta 102 zcmbQlxQCJ3iI3$JYBN<>Dpx=>dB7ICp)@hOch@2UH@eF rwr5+rU#y=ReTqC%@ipav{nBQ80) zmQ{+Gsc^&O0_J)~y~!V#<=OSZ8743nicJ<{2~}NJ!w)o86l4fcG1zFJ+`4dv8h((d z4M=6fo9&60f& z&^auHOD21Z=vmZ#zi7AM`q?IbB?mco0JC$Z);)=g$& zvt&O2bQDYBipfrFnT(qzuVIVlbj!?1EspW6^w})J{+Chi7EexodVCsCFg_)-=oT-W zmzS8EdTX*irx)X`$yJ>0JhzyEQny&j^NUh$O+L$c7$`iEYbB%9WNU6$Alb;Rz^c!{ Yz;J8wGHx?Q{mFN^a~M@8d+|5`0Mj0Ki2wiq diff --git a/logparsing/fttree/__pycache__/fttree.cpython-37.pyc b/logparsing/fttree/__pycache__/fttree.cpython-37.pyc index ab64d4f528b2da46ad7c4552f13830f38def4e80..693b5b2c63c70518db10c5cf77b64bad2b0edf39 100644 GIT binary patch delta 681 zcmZuv&ubG=5T4oJW;cmiHb2~m5rtq6Ihcc_=1{9%w1g_+wh|R_n-D0eYu37jx1|z9 z@RahP2k}P*74cAnUOb3@LC+%Iya(lR`fPRd%_EW>}h;Ts+h(x zkf8=0!!!7*)F#YV$*I2-_kqF^J?u%R;PQH8wIIr{%!b4g%*w-4;tHF?E>l#7oikCE zCgj3#wwuLumd*1!>ONWWU-LjNp}`-@j=huoVq}GNqB8ilT$(F0ThI{2*VMP%~)ibrAcIr@M!5+=w!@3tsKHlenYWan=Y`l=!qFuT?lk9E$j(& zW{Z=Sb?7gI<&hbM$YRchMApO;dgvLsct&6G_X-~72z+={;P^M?plM|P-k;$)^++u- z!uX4}|30kkH1gyBjV$1M{#dQpG&U;!Thr6C;OYFcTEZos(yIDfvPtou@w#@LUci0L z86zQ)<&vrA;5^JiU-e({H`?rhLG|QBqZ{1vmK)a>z4^xUV(Ucb+QM?T-M!kGzs=|L wX7VP|OB-PEi#WHo^%cZRTW5Ef5@xt-o~HQ7EaEtSXHHQ3Z7$$p-n1tF0Q_RCVE_OC delta 728 zcmZ`%&ui0g6o20&O=^>-j;(F7u3J?~5mB*KWOIi)oIgN@%CI}sUy!+VmP%|L8GKU* z9f(Y!xs9f(dzfdGC|=`MeL`ZEaas z2mF2?;C1HX-K*W&oH|cJb5AFU#JWg|c-aMVAM;Nniuq5{+b-!)Aq2`&4(v8-xWpS0 z8{ks1cpP}>hl1A-*)*5Kf4*^%doP@RNua(8sc@&@%7U$ofgPfuMUMKYqQer-vL>1+ z*{L*tKi)=zw!Vwo2Qms)MalooZC=hbm{j%0NlRFKt3GWJ7tx zvuQqjMBGtKgUA_j_z;=TfHnS)w#&9+4Pi}WANT}GvhRFPAen(GZ+0Tua?q?>i2Bh1 zS9RLOo)z}R*JuD|2Oaf}N|%E-*D^o@P8GHB<_xO}J?P4+5RKp#E=Djzs92&A$5AWk zGBkuevKao`A7UMz{b|+dw7B>im2Nj{t)A8X;~AUsbdegH_oRYIZO3+`8n}qz4dcUV zo&+f~QS6o8;lhIXoSha=9QH1%M!V5512jtaI<{BH65a%;VxUA@$$$YD;S!8M)nl%) zRdM9pkL!4p2K%yGPo8bPTy|dd%?I_(+3c}c 0])) + if len(line) < window_length: + continue + for i in range(len(line) - window_length): + label_line = [] + for j in range(window_length): + label_line.append(vec_to_class_type[line[i+j]]) + label_line.append(vec_to_class_type[line[i + window_length]]) + input_data.append(label_line) + return input_data + + +def get_label_sequentials(sequential_out_file): + vec_to_class_type = {} + with open(pattern_vec_file, 'r') as pattern_file: + i = 0 + for line in pattern_file.readlines(): + pattern, vec = line.split('[:]') + pattern_vector = tuple(map(float, vec.strip().split(' '))) + vec_to_class_type[pattern_vector] = i + i = i + 1 + with open(sequential_out_file, 'w+') as file: + sequence_data_set = generate_seq_label(data_file, window_length, pattern_vec_file) + for line in sequence_data_set: + for label in line: + file.write(str(label)) + file.write(',') + file.write('\n') + + +#pattern_extract() +#extract_feature_spilt_abnormal() +#train_model() +#get_label_sequentials('./Data/FTTreeResult-HDFS/pattern_sequntials') +test_model() + +# deep log +# log_preprocessor.execute_process() +# value_extract.get_value() +# value_extract.value_deal() +# value_extract.value_extract() +# train predict + +# -*- coding: UTF-8 -*- \ No newline at end of file diff --git a/self_att_supervised_detection.py 
b/self_att_supervised_detection.py new file mode 100644 index 0000000..df27674 --- /dev/null +++ b/self_att_supervised_detection.py @@ -0,0 +1,62 @@ +# -*- coding: UTF-8 -*- +# -*- coding: UTF-8 -*- + +import os +from logparsing.fttree import fttree +from extractfeature import hdfs_ft_preprocessor +from anomalydetection.loganomaly import log_anomaly_sequential_train +from anomalydetection.loganomaly import log_anomaly_sequential_predict +from anomalydetection.self_att_lstm_supervised import self_att_lstm_supervised_train +from anomalydetection.self_att_lstm_supervised import self_att_lstm_supervised_predict + +# parameters for early prepare + +temp_directory = './Data/logdeepdata/' +train_file_name = 'robust_log_train.csv' +test_file_name = 'robust_log_test.csv' +valid_file_name = 'robust_log_valid.csv' + +# log anomaly sequential model parameters some parameter maybe changed to train similar models +sequence_length = 50 +input_size = 300 +hidden_size = 128 +num_of_layers = 2 +# 1 using sigmoid, 2 using softmax +num_of_classes = 1 +num_epochs = 20 +batch_size = 1000 +# for robust attention bi +train_root_path = './Data/FTTreeResult-HDFS/self_att_supervised_model_train/' +model_out_path = train_root_path + 'model_out/' +train_file = temp_directory + train_file_name +pattern_vec_json = './Data/logdeepdata/event2semantic_vec.json' + + +# predict parameters +# log anomaly sequential model parameters + +if not os.path.exists(train_root_path): + os.makedirs(train_root_path) + + +def train_model(): + self_att_lstm_supervised_train.train_model(sequence_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, model_out_path, train_file, pattern_vec_json) + + +def test_model(): + # do something + self_att_lstm_supervised_predict.do_predict(input_size, hidden_size, num_of_layers, num_of_classes, sequence_length, model_out_path + 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) + '.pt', temp_directory + test_file_name, batch_size, pattern_vec_json) + +#pattern_extract() +#extract_feature() +#train_model() +#train_model() +test_model() + +# deep log +# log_preprocessor.execute_process() +# value_extract.get_value() +# value_extract.value_deal() +# value_extract.value_extract() +# train predict +
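
The driver scripts in this patch share one convention: train_model() writes a checkpoint named
'<optimizer>_batch_size=<batch_size>;epoch=<num_epochs>.pt' under model_out_path, and test_model()
rebuilds exactly that name to load it. A minimal sketch of guarding prediction on that hand-off,
reusing the names defined in self_att_supervised_detection.py above (model_out_path, batch_size,
num_epochs, train_model, test_model), might look like:

```
import os

# Rebuild the checkpoint name the same way test_model() does.
model_file = model_out_path + 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) + '.pt'

if not os.path.exists(model_file):
    # No trained model on disk yet: run training first so prediction has something to load.
    train_model()
test_model()
```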
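
The Drain3 wrapper added under logparsing/drain is driven through TemplateMiner, as
examples/drain_stdin_demo.py and HDFS_drain.py do. A short sketch of that flow, assuming the
logparsing.drain.drain3 package from this patch is importable, jsonpickle is installed (see
requirements.txt), and the snapshot path drain3_state.bin is writable:

```
import json

from logparsing.drain.drain3.template_miner import TemplateMiner
from logparsing.drain.drain3.file_persistence import FilePersistence

# Persist the miner state (parse tree + clusters) to a local snapshot file so that
# cluster ids survive restarts, per the README's Persistence section.
persistence = FilePersistence("drain3_state.bin")
template_miner = TemplateMiner(persistence)

for line in ["connected to 10.0.0.1",
             "connected to 10.0.0.2",
             "user davidoh logged in"]:
    result = template_miner.add_log_message(line)
    # result holds change_type, cluster_id, cluster_size, template_mined and cluster_count.
    print(json.dumps(result))
```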