From ca9e4e1ae498617041ef8f768c2829e03a734b3e Mon Sep 17 00:00:00 2001 From: diogodebastos Date: Wed, 9 Aug 2017 16:18:01 +0100 Subject: [PATCH 01/22] Ignore local configurations --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 58fe940..5c3cdf2 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ keras-tf/* *.root *.h5 scaler.sav +localConfig.py From dd6ffde90d3f2212258b7da3ab10786f255d564d Mon Sep 17 00:00:00 2001 From: diogodebastos Date: Wed, 9 Aug 2017 18:00:42 +0100 Subject: [PATCH 02/22] Plotting make-up. Add weights to histogram plot for normalization --- .gitignore | 1 + localConfig.py | 2 +- trainNN.py | 99 +++++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 88 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index 5c3cdf2..af27e48 100644 --- a/.gitignore +++ b/.gitignore @@ -4,5 +4,6 @@ keras-tf/* *.pcm *.root *.h5 +*.pyc scaler.sav localConfig.py diff --git a/localConfig.py b/localConfig.py index 584cd36..4963e7f 100644 --- a/localConfig.py +++ b/localConfig.py @@ -1,3 +1,3 @@ #!/usr/bin/env python -loc = "./" +loc = "/home/diogo/LIP/DATA/" diff --git a/trainNN.py b/trainNN.py index 4c27800..366e391 100644 --- a/trainNN.py +++ b/trainNN.py @@ -240,6 +240,17 @@ def getSELUClassifier(nIn, nOut, compileArgs): print "Calculating FOM:" dataVal["NN"] = valPredict +dataDev["NN"] = devPredict + +sig_dataValIdx=(dataVal.category==1) +bkg_dataValIdx=(dataVal.category==0) +sig_dataDevIdx=(dataVal.category==1) +bkg_dataDevIdx=(dataVal.category==0) + +sig_dataVal=dataVal[sig_dataValIdx] +bkg_dataVal=dataVal[bkg_dataValIdx] +sig_dataDev=dataDev[sig_dataDevIdx] +bkg_dataDev=dataDev[bkg_dataDevIdx] def getYields(dataVal, cut=0.5, luminosity=35866, splitFactor=2): selectedValIdx = (dataVal.NN>cut) @@ -310,39 +321,101 @@ def FullFOM(sIn, bIn, fValue=0.2): #history = model.fit(XDev, YDev, validation_data=(XVal,YVal,weightVal), sample_weight=weightDev, **trainParams) print(history.history.keys()) +fomEvo = [] +fomCut = [] + +bkgEff = [] +sigEff = [] + +luminosity=35866 +sig_Init = sigDataVal.weight.sum() * luminosity * 2; +bkg_Init = bkgDataVal.weight.sum() * luminosity * 2; + +for cut in np.arange(0.0, 0.9999999, 0.001): + sig, bkg = getYields(dataVal, cut=cut, luminosity=luminosity) + if sig[0] > 0 and bkg[0] > 0: + fom, fomUnc = FullFOM(sig, bkg) + fomEvo.append(fom) + fomCut.append(cut) + bkgEff.append(bkg[0]/bkg_Init) + sigEff.append(sig[0]/sig_Init) + +max_FOM=0 +"""" +for x in fomCut: + flag=0 + for y in fomCut: + if abs(x-y)<0.1 and abs(fomEvo[fomCut.index(x)]-fomEvo[fomCut.index(y)])>0.1: + flag=1 + if fomEvo[fomCut.index(x)]>max_FOM and flag==0: + max_FOM=fomEvo[fomCut.index(x)] +""" + +print "Maximizing FOM" +for x in fomEvo: + if x>max_FOM: + max_FOM=x + + +print "Maximizacao da FOM:", max_FOM , "com corte em: " , fomCut[fomEvo.index(max_FOM)] +Eff = zip(bkgEff, sigEff) + +print "Plotting" + +plt.hist(sig_dataDev["NN"], 50, facecolor='blue', alpha=0.5, normed=1, weights=sig_dataDev["weight"]) +plt.hist(bkg_dataDev["NN"], 50, facecolor='red', alpha=0.5, normed=1, weights=bkg_dataDev["weight"]) +plt.hist(sig_dataVal["NN"], 50, facecolor='blue', alpha=1, normed=1, weights=sig_dataVal["weight"], histtype="step") +plt.hist(bkg_dataVal["NN"], 50, facecolor='red', alpha=1, normed=1, weights=bkg_dataVal["weight"], histtype="step") +plt.xlabel('NN output') +plt.title("TMVA overtraining check for classifier: NN") +plt.legend(['Signal (Dev sample)', 'Background (Dev sample)', 'Signal (Val sample)', 'Background 
(Val sample)'], loc='upper right') +plt.show() + +both_dataDev=bkg_dataDev["NN"].append(sig_dataDev["NN"]) +plt.hist(bkg_dataDev["NN"], 50, facecolor='red', alpha=1, weights=bkg_dataDev["weight"]) +plt.hist(both_dataDev, 50, facecolor="blue", histtype="step") +plt.show() + +plt.hist(bkg_dataDev["NN"], 50, facecolor='red', alpha=1, weights=bkg_dataDev["weight"]) +plt.hist(both_dataDev, 50, facecolor="blue", histtype="step") +plt.yscale('log', nonposy='clip') +plt.show() + +plt.subplots_adjust(hspace=0.25) +plt.subplot(221) plt.plot(history.history['acc']) plt.plot(history.history['val_acc']) plt.title('model accuracy') plt.ylabel('accuracy') plt.xlabel('epoch') plt.legend(['train', 'test'], loc='upper left') -plt.show() +plt.subplot(222) plt.plot(history.history['loss']) plt.plot(history.history['val_loss']) plt.title('model loss') plt.ylabel('loss') plt.xlabel('epoch') plt.legend(['train', 'test'], loc='upper left') -plt.show() - -fomEvo = [] -fomCut = [] -for cut in np.arange(0.0, 0.9999999, 0.001): - sig, bkg = getYields(dataVal, cut=cut) - if sig[0] > 0 and bkg[0] > 0: - fom, fomUnc = FullFOM(sig, bkg) - fomEvo.append(fom) - fomCut.append(cut) +plt.subplot(223) plt.plot(fomCut, fomEvo) plt.title("FOM") plt.ylabel("FOM") plt.xlabel("ND") -plt.legend(['test'], loc='upper left') +plt.legend(["Max. FOM: {0}".format(max_FOM)], loc='upper left') + + +plt.subplot(224) +plt.semilogy(fomCut, Eff) +plt.axvspan(fomCut[fomEvo.index(max_FOM)], 1, facecolor='#2ca02c', alpha=0.3) +#plt.axvline(x=fomCut[fomEvo.index(max_FOM)], ymin=0, ymax=1) +plt.title("Efficiency") +plt.ylabel("Eff") +plt.xlabel("ND") +plt.legend(['Background', 'Signal'], loc='upper left') plt.show() -#print fomEvo sys.exit("Done!") From f5320e2ad5415dd5a7db7424b97fd98de886390f Mon Sep 17 00:00:00 2001 From: Diogo de Bastos Date: Wed, 9 Aug 2017 18:07:40 +0100 Subject: [PATCH 03/22] Delete localConfig.py --- localConfig.py | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 localConfig.py diff --git a/localConfig.py b/localConfig.py deleted file mode 100644 index 4963e7f..0000000 --- a/localConfig.py +++ /dev/null @@ -1,3 +0,0 @@ -#!/usr/bin/env python - -loc = "/home/diogo/LIP/DATA/" From 7ce635039877a345e4cc0ca11be118f6fc866b69 Mon Sep 17 00:00:00 2001 From: Gustavo Marques Date: Thu, 10 Aug 2017 10:53:26 +0100 Subject: [PATCH 04/22] fix plots --- trainNN.py | 52 +++++++++++++++++++++++----------------------------- 1 file changed, 23 insertions(+), 29 deletions(-) mode change 100644 => 100755 trainNN.py diff --git a/trainNN.py b/trainNN.py old mode 100644 new mode 100755 index 366e391..7afd8bf --- a/trainNN.py +++ b/trainNN.py @@ -168,7 +168,7 @@ compileArgs = {'loss': 'binary_crossentropy', 'optimizer': 'adam', 'metrics': ["accuracy"]} -trainParams = {'epochs': 2, 'batch_size': 20, 'verbose': 1} +trainParams = {'epochs': 1, 'batch_size': 20, 'verbose': 1} learning_rate = 0.001/5.0 myAdam = Adam(lr=learning_rate) compileArgs['optimizer'] = myAdam @@ -235,6 +235,7 @@ def getSELUClassifier(nIn, nOut, compileArgs): print "Dev score:", scoreDev print "Val score:", scoreVal print confusion_matrix(YVal, valPredict.round()) +cohen_kappa=cohen_kappa_score(YVal, valPredict.round()) print cohen_kappa_score(YVal, valPredict.round()) @@ -341,15 +342,6 @@ def FullFOM(sIn, bIn, fValue=0.2): sigEff.append(sig[0]/sig_Init) max_FOM=0 -"""" -for x in fomCut: - flag=0 - for y in fomCut: - if abs(x-y)<0.1 and abs(fomEvo[fomCut.index(x)]-fomEvo[fomCut.index(y)])>0.1: - flag=1 - if fomEvo[fomCut.index(x)]>max_FOM and flag==0: - 
max_FOM=fomEvo[fomCut.index(x)] -""" print "Maximizing FOM" for x in fomEvo: @@ -357,32 +349,32 @@ def FullFOM(sIn, bIn, fValue=0.2): max_FOM=x -print "Maximizacao da FOM:", max_FOM , "com corte em: " , fomCut[fomEvo.index(max_FOM)] +print "FOM maximization: ", max_FOM , "with cut at: " , fomCut[fomEvo.index(max_FOM)] Eff = zip(bkgEff, sigEff) print "Plotting" -plt.hist(sig_dataDev["NN"], 50, facecolor='blue', alpha=0.5, normed=1, weights=sig_dataDev["weight"]) -plt.hist(bkg_dataDev["NN"], 50, facecolor='red', alpha=0.5, normed=1, weights=bkg_dataDev["weight"]) -plt.hist(sig_dataVal["NN"], 50, facecolor='blue', alpha=1, normed=1, weights=sig_dataVal["weight"], histtype="step") -plt.hist(bkg_dataVal["NN"], 50, facecolor='red', alpha=1, normed=1, weights=bkg_dataVal["weight"], histtype="step") +plt.hist(sig_dataDev["NN"], 50, facecolor='blue', alpha=0.7, normed=1, weights=sig_dataDev["weight"]) +plt.hist(bkg_dataDev["NN"], 50, facecolor='red', alpha=0.7, normed=1, weights=bkg_dataDev["weight"]) +plt.hist(sig_dataVal["NN"], 50, color='blue', alpha=1, normed=1, weights=sig_dataVal["weight"], histtype="step") +plt.hist(bkg_dataVal["NN"], 50, color='red', alpha=1, normed=1, weights=bkg_dataVal["weight"], histtype="step") plt.xlabel('NN output') -plt.title("TMVA overtraining check for classifier: NN") -plt.legend(['Signal (Dev sample)', 'Background (Dev sample)', 'Signal (Val sample)', 'Background (Val sample)'], loc='upper right') +plt.title("Cohen's kapa: {0}".format(cohen_kappa), fontsize=10) +plt.suptitle("MVA overtraining check for classifier: NN", fontsize=13) +plt.legend(['Signal (Test sample)', 'Background (Test sample)', 'Signal (Train sample)', 'Background (Train sample)'], loc='upper right') plt.show() -both_dataDev=bkg_dataDev["NN"].append(sig_dataDev["NN"]) -plt.hist(bkg_dataDev["NN"], 50, facecolor='red', alpha=1, weights=bkg_dataDev["weight"]) -plt.hist(both_dataDev, 50, facecolor="blue", histtype="step") -plt.show() - -plt.hist(bkg_dataDev["NN"], 50, facecolor='red', alpha=1, weights=bkg_dataDev["weight"]) -plt.hist(both_dataDev, 50, facecolor="blue", histtype="step") -plt.yscale('log', nonposy='clip') +both_dataDev=bkg_dataDev.append(sig_dataDev) +plt.xlabel('NN output') +plt.title("Number of Events") +#plt.yscale('log', nonposy='clip') +plt.legend(['Background + Signal (test sample)', 'Background (test sample)'], loc="upper left" ) +plt.hist(bkg_dataDev["NN"], 50, facecolor='red', weights=bkg_dataDev["weight"]) +plt.hist(both_dataDev["NN"], 50, color="blue", histtype="step", weights=both_dataDev["weight"]) plt.show() plt.subplots_adjust(hspace=0.25) -plt.subplot(221) +plt.subplot(121) plt.plot(history.history['acc']) plt.plot(history.history['val_acc']) plt.title('model accuracy') @@ -390,15 +382,17 @@ def FullFOM(sIn, bIn, fValue=0.2): plt.xlabel('epoch') plt.legend(['train', 'test'], loc='upper left') -plt.subplot(222) +plt.subplot(122) plt.plot(history.history['loss']) plt.plot(history.history['val_loss']) plt.title('model loss') plt.ylabel('loss') plt.xlabel('epoch') plt.legend(['train', 'test'], loc='upper left') +plt.show() -plt.subplot(223) +plt.subplots_adjust(hspace=0.25) +plt.subplot(121) plt.plot(fomCut, fomEvo) plt.title("FOM") plt.ylabel("FOM") @@ -406,7 +400,7 @@ def FullFOM(sIn, bIn, fValue=0.2): plt.legend(["Max. 
FOM: {0}".format(max_FOM)], loc='upper left') -plt.subplot(224) +plt.subplot(122) plt.semilogy(fomCut, Eff) plt.axvspan(fomCut[fomEvo.index(max_FOM)], 1, facecolor='#2ca02c', alpha=0.3) #plt.axvline(x=fomCut[fomEvo.index(max_FOM)], ymin=0, ymax=1) From 9d6b5667a28f85de2c5d1fb33eb8a9831357fc67 Mon Sep 17 00:00:00 2001 From: Gustavo Marques Date: Thu, 10 Aug 2017 15:37:29 +0100 Subject: [PATCH 05/22] mais estetica dos graficos --- .gitignore | 2 ++ trainNN.py | 29 +++++++++++++++++------------ 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index af27e48..3ca082c 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,8 @@ keras-tf/* *.pcm *.root *.h5 +*.png +*.json *.pyc scaler.sav localConfig.py diff --git a/trainNN.py b/trainNN.py index 7afd8bf..f98689b 100755 --- a/trainNN.py +++ b/trainNN.py @@ -168,7 +168,7 @@ compileArgs = {'loss': 'binary_crossentropy', 'optimizer': 'adam', 'metrics': ["accuracy"]} -trainParams = {'epochs': 1, 'batch_size': 20, 'verbose': 1} +trainParams = {'epochs': 10, 'batch_size': 20, 'verbose': 1} learning_rate = 0.001/5.0 myAdam = Adam(lr=learning_rate) compileArgs['optimizer'] = myAdam @@ -210,10 +210,10 @@ def getSELUClassifier(nIn, nOut, compileArgs): name = "myNN" model.save(name+".h5") -#model_json = model.to_json() -#with open(name + ".json", "w") as json_file: -# json_file.write(model_json) -#model.save_weights(name + ".h5") +model_json = model.to_json() +with open(name + ".json", "w") as json_file: + json_file.write(model_json) +model.save_weights(name + ".h5") ## To load: #from keras.models import model_from_json @@ -359,9 +359,10 @@ def FullFOM(sIn, bIn, fValue=0.2): plt.hist(sig_dataVal["NN"], 50, color='blue', alpha=1, normed=1, weights=sig_dataVal["weight"], histtype="step") plt.hist(bkg_dataVal["NN"], 50, color='red', alpha=1, normed=1, weights=bkg_dataVal["weight"], histtype="step") plt.xlabel('NN output') -plt.title("Cohen's kapa: {0}".format(cohen_kappa), fontsize=10) +plt.title("Cohen's kappa: {0}".format(cohen_kappa), fontsize=10) plt.suptitle("MVA overtraining check for classifier: NN", fontsize=13) plt.legend(['Signal (Test sample)', 'Background (Test sample)', 'Signal (Train sample)', 'Background (Train sample)'], loc='upper right') +#plt.savefig('hist.png', bbox_inches='tight') plt.show() both_dataDev=bkg_dataDev.append(sig_dataDev) @@ -371,10 +372,11 @@ def FullFOM(sIn, bIn, fValue=0.2): plt.legend(['Background + Signal (test sample)', 'Background (test sample)'], loc="upper left" ) plt.hist(bkg_dataDev["NN"], 50, facecolor='red', weights=bkg_dataDev["weight"]) plt.hist(both_dataDev["NN"], 50, color="blue", histtype="step", weights=both_dataDev["weight"]) +#plt.savefig('pred.png', bbox_inches='tight') plt.show() -plt.subplots_adjust(hspace=0.25) -plt.subplot(121) +plt.subplots_adjust(hspace=0.5) +plt.subplot(211) plt.plot(history.history['acc']) plt.plot(history.history['val_acc']) plt.title('model accuracy') @@ -382,17 +384,19 @@ def FullFOM(sIn, bIn, fValue=0.2): plt.xlabel('epoch') plt.legend(['train', 'test'], loc='upper left') -plt.subplot(122) +plt.subplot(212) plt.plot(history.history['loss']) plt.plot(history.history['val_loss']) plt.title('model loss') plt.ylabel('loss') +plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0)) plt.xlabel('epoch') plt.legend(['train', 'test'], loc='upper left') +#plt.savefig('NN.png', papertype="a4") plt.show() -plt.subplots_adjust(hspace=0.25) -plt.subplot(121) +plt.subplots_adjust(hspace=0.5) +plt.subplot(211) plt.plot(fomCut, fomEvo) plt.title("FOM") 
plt.ylabel("FOM") @@ -400,7 +404,7 @@ def FullFOM(sIn, bIn, fValue=0.2): plt.legend(["Max. FOM: {0}".format(max_FOM)], loc='upper left') -plt.subplot(122) +plt.subplot(212) plt.semilogy(fomCut, Eff) plt.axvspan(fomCut[fomEvo.index(max_FOM)], 1, facecolor='#2ca02c', alpha=0.3) #plt.axvline(x=fomCut[fomEvo.index(max_FOM)], ymin=0, ymax=1) @@ -408,6 +412,7 @@ def FullFOM(sIn, bIn, fValue=0.2): plt.ylabel("Eff") plt.xlabel("ND") plt.legend(['Background', 'Signal'], loc='upper left') +#plt.savefig('FOM.png', bbox_inches='tight') plt.show() From 937e6cb695f0cb3f660b26822f885c0f49fa67f0 Mon Sep 17 00:00:00 2001 From: Gustavo Marques Date: Thu, 10 Aug 2017 18:24:43 +0100 Subject: [PATCH 06/22] Set2 added + KS test --- trainNN.py | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/trainNN.py b/trainNN.py index f98689b..f500efd 100755 --- a/trainNN.py +++ b/trainNN.py @@ -11,9 +11,13 @@ from keras.layers import Dense, Dropout, AlphaDropout from keras.optimizers import Adam from sklearn.metrics import confusion_matrix, cohen_kappa_score +from scipy.stats import ks_2samp +import matplotlib.pyplot as plt import localConfig as cfg + + treeName = "bdttree" baseSigName = "T2DegStop_300_270" bkgDatasets = [ @@ -38,7 +42,7 @@ ] -myFeatures = ["Jet1Pt", "Met", "Njet", "LepPt", "LepEta", "LepChg", "HT", "NbLoose"] +myFeatures = ["LepPt", "LepEta", "LepChg", "Met", "Jet1Pt", "HT", "NbLoose", "Njet", "JetHBpt", "DrJetHBLep", "JetHBCSV"] inputBranches = list(myFeatures) inputBranches.append("XS") inputBranches.append("weight") @@ -103,6 +107,11 @@ bkgDataDev["sampleWeight"] = 1 bkgDataVal["sampleWeight"] = 1 + + + + + # Calculate event weights # The input files already have a branch called weight, which contains the per-event weights # These precomputed weights have all scale factors applied. 
If we desire to not use the scale factors @@ -167,8 +176,9 @@ joblib.dump(scaler, scalerfile) + compileArgs = {'loss': 'binary_crossentropy', 'optimizer': 'adam', 'metrics': ["accuracy"]} -trainParams = {'epochs': 10, 'batch_size': 20, 'verbose': 1} +trainParams = {'epochs': 25, 'batch_size': 20, 'verbose': 1} learning_rate = 0.001/5.0 myAdam = Adam(lr=learning_rate) compileArgs['optimizer'] = myAdam @@ -208,7 +218,7 @@ def getSELUClassifier(nIn, nOut, compileArgs): history = model.fit(XDev, YDev, validation_data=(XVal,YVal,weightVal), sample_weight=weightDev, **trainParams) print("Training took ", time.time()-start, " seconds") -name = "myNN" +name = "myNN2" model.save(name+".h5") model_json = model.to_json() with open(name + ".json", "w") as json_file: @@ -317,7 +327,7 @@ def FullFOM(sIn, bIn, fValue=0.2): ######################################################### # Let's repeat the above, but monitor the evolution of the loss function -import matplotlib.pyplot as plt + #history = model.fit(XDev, YDev, validation_data=(XVal,YVal,weightVal), sample_weight=weightDev, **trainParams) print(history.history.keys()) @@ -352,29 +362,38 @@ def FullFOM(sIn, bIn, fValue=0.2): print "FOM maximization: ", max_FOM , "with cut at: " , fomCut[fomEvo.index(max_FOM)] Eff = zip(bkgEff, sigEff) +print "Kolmogorov-Smirnov test" +km_value=ks_2samp((sig_dataDev["NN"].append(bkg_dataDev["NN"])),(sig_dataVal["NN"].append(bkg_dataVal["NN"]))) +print km_value + print "Plotting" +plt.figure(figsize=(7,6)) plt.hist(sig_dataDev["NN"], 50, facecolor='blue', alpha=0.7, normed=1, weights=sig_dataDev["weight"]) plt.hist(bkg_dataDev["NN"], 50, facecolor='red', alpha=0.7, normed=1, weights=bkg_dataDev["weight"]) plt.hist(sig_dataVal["NN"], 50, color='blue', alpha=1, normed=1, weights=sig_dataVal["weight"], histtype="step") plt.hist(bkg_dataVal["NN"], 50, color='red', alpha=1, normed=1, weights=bkg_dataVal["weight"], histtype="step") plt.xlabel('NN output') -plt.title("Cohen's kappa: {0}".format(cohen_kappa), fontsize=10) -plt.suptitle("MVA overtraining check for classifier: NN", fontsize=13) -plt.legend(['Signal (Test sample)', 'Background (Test sample)', 'Signal (Train sample)', 'Background (Train sample)'], loc='upper right') -#plt.savefig('hist.png', bbox_inches='tight') +#plt.title("Cohen's kappa: {0}".format(cohen_kappa), fontsize=10) +plt.suptitle("MVA overtraining check for classifier: NN", fontsize=13, fontweight='bold') +plt.title("Cohen's kappa: {0}\nKolmogorov Smirnov test: {1}".format(cohen_kappa, km_value[1]), fontsize=10) +plt.legend(['Signal (Test sample)', 'Background (Test sample)', 'Signal (Train sample)', 'Background (Train sample)\nasdfgh'], loc='upper right') +plt.savefig('hist2.png', bbox_inches='tight') plt.show() + both_dataDev=bkg_dataDev.append(sig_dataDev) +plt.figure(figsize=(7,6)) plt.xlabel('NN output') plt.title("Number of Events") #plt.yscale('log', nonposy='clip') plt.legend(['Background + Signal (test sample)', 'Background (test sample)'], loc="upper left" ) plt.hist(bkg_dataDev["NN"], 50, facecolor='red', weights=bkg_dataDev["weight"]) plt.hist(both_dataDev["NN"], 50, color="blue", histtype="step", weights=both_dataDev["weight"]) -#plt.savefig('pred.png', bbox_inches='tight') +plt.savefig('pred2.png', bbox_inches='tight') plt.show() +plt.figure(figsize=(7,6)) plt.subplots_adjust(hspace=0.5) plt.subplot(211) plt.plot(history.history['acc']) @@ -392,9 +411,10 @@ def FullFOM(sIn, bIn, fValue=0.2): plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0)) plt.xlabel('epoch') 
plt.legend(['train', 'test'], loc='upper left')
-#plt.savefig('NN.png', papertype="a4")
+plt.savefig('NN2.png', papertype="a4")
 plt.show()

+plt.figure(figsize=(7,6))
 plt.subplots_adjust(hspace=0.5)
 plt.subplot(211)
 plt.plot(fomCut, fomEvo)
 plt.title("FOM")
@@ -412,7 +432,7 @@ def FullFOM(sIn, bIn, fValue=0.2):
 plt.ylabel("Eff")
 plt.xlabel("ND")
 plt.legend(['Background', 'Signal'], loc='upper left')
-#plt.savefig('FOM.png', bbox_inches='tight')
+plt.savefig('FOM2.png', bbox_inches='tight')
 plt.show()


From 55a9290c7eb6bcf7f54e9210157ea11169e34c16 Mon Sep 17 00:00:00 2001
From: diogodebastos
Date: Fri, 11 Aug 2017 17:17:40 +0100
Subject: [PATCH 07/22] Added gridSearch

---
 gridSearch.py | 46 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 gridSearch.py

diff --git a/gridSearch.py b/gridSearch.py
new file mode 100644
index 0000000..0fa70e6
--- /dev/null
+++ b/gridSearch.py
@@ -0,0 +1,46 @@
+import time
+
+import numpy
+from sklearn.model_selection import GridSearchCV
+from keras.models import Sequential
+from keras.layers import Dense
+from keras.wrappers.scikit_learn import KerasClassifier
+
+import trainNN
+
+trainFeatures = trainNN.trainFeatures
+compileArgs = trainNN.compileArgs
+XDev = trainNN.XDev
+YDev = trainNN.YDev
+
+# Fix seed for reproducibility
+seed = 42
+numpy.random.seed(seed)
+
+# Tune the number of hidden layers and the number of neurons per layer
+def myClassifier(nIn=len(trainFeatures), nOut=1, compileArgs=compileArgs, layers=1, neurons=1):
+    model = Sequential()
+    model.add(Dense(nIn, input_dim=nIn, kernel_initializer='he_normal', activation='relu'))
+    for i in range(0, layers):
+        model.add(Dense(neurons, kernel_initializer='he_normal', activation='relu'))
+    model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal'))
+    model.compile(**compileArgs)
+    return model
+
+model = KerasClassifier(build_fn=myClassifier, epochs=2, batch_size=20, verbose=1)
+
+neurons = [1, 5, 10, 15, 20, 25]
+layers = [1, 2, 3, 4, 5]
+param_grid = dict(neurons=neurons, layers=layers)
+grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)  # n_jobs=-1 -> use all available CPU cores
+print("Starting the training")
+start = time.time()
+grid_result = grid.fit(XDev, YDev)
+print("Training took ", time.time()-start, " seconds")
+
+print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
+means = grid_result.cv_results_['mean_test_score']
+stds = grid_result.cv_results_['std_test_score']
+params = grid_result.cv_results_['params']
+for mean, stdev, param in zip(means, stds, params):
+    print("%f (%f) with: %r" % (mean, stdev, param))

From a75a412c4916d26063f3ed7a48c613a61c9a8deb Mon Sep 17 00:00:00 2001
From: Gustavo Marques
Date: Tue, 22 Aug 2017 15:30:47 +0100
Subject: [PATCH 08/22] Alteration in commonFunctions - different Dev and Val signal samples.
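
The loader now reads the training ("Dev") signal from train/ and the test
("Val") signal from test/, chosen independently through the signal and test
arguments. A minimal usage sketch (the feature list is illustrative, not
prescribed by this patch):

    import localConfig as cfg
    from commonFunctions import StopDataLoader

    # Train on the full DM=30 grid, validate on the 550/520 mass point only
    dev, val = StopDataLoader(cfg.loc, ["LepPt", "Met", "HT"],
                              signal="DM30", test="550_520",
                              suffix="_skimmed", fraction=1.0)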
--- commonFunctions.py | 162 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 commonFunctions.py diff --git a/commonFunctions.py b/commonFunctions.py new file mode 100644 index 0000000..49f44fe --- /dev/null +++ b/commonFunctions.py @@ -0,0 +1,162 @@ +import root_numpy +import pandas + +signalMap = { + "DM30" : ["T2DegStop_250_220", + "T2DegStop_275_245", + "T2DegStop_300_270", + "T2DegStop_325_295", + "T2DegStop_350_320", + "T2DegStop_375_345", + "T2DegStop_400_370", + "T2DegStop_425_395", + "T2DegStop_450_420", + "T2DegStop_475_445", + "T2DegStop_500_470", + "T2DegStop_525_495", + "T2DegStop_550_520", + "T2DegStop_575_545", + "T2DegStop_600_570", + "T2DegStop_625_595", + "T2DegStop_650_620", + "T2DegStop_675_645", + "T2DegStop_700_670", + "T2DegStop_725_695", + "T2DegStop_750_720", + "T2DegStop_775_745", + "T2DegStop_800_770"], + "300_270" : ["T2DegStop_300_270"], + "550_520" : ["T2DegStop_550_520"] + } +bkgDatasets = [ + "Wjets_70to100", + "Wjets_100to200", + "Wjets_200to400", + "Wjets_400to600", + "Wjets_600to800", + "Wjets_800to1200", + "Wjets_1200to2500", + "Wjets_2500toInf", + "TTJets_DiLepton", + "TTJets_SingleLeptonFromTbar", + "TTJets_SingleLeptonFromT", + "ZJetsToNuNu_HT100to200", + "ZJetsToNuNu_HT200to400", + "ZJetsToNuNu_HT400to600", + "ZJetsToNuNu_HT600to800", + "ZJetsToNuNu_HT800to1200", + "ZJetsToNuNu_HT1200to2500", + "ZJetsToNuNu_HT2500toInf" + ] + + +def StopDataLoader(path, features, test="550_520", selection="", treename="bdttree", suffix="", signal="DM30", fraction=1.0): + if signal not in signalMap: + raise KeyError("Unknown training signal requested ("+signal+")") + if test not in signalMap: + raise KeyError("Unknown test signal requested ("+test+")") + if fraction >= 1.0: + fraction = 1.0 + if fraction < 0.0: + raise ValueError("An invalid fraction was chosen") + if "XS" not in features: + features.append("XS") + if "weight" not in features: + features.append("weight") + + sigDev = None + sigVal = None + + + for sigName_test in signalMap[test]: + tmp = root_numpy.root2array( + path + "test/" + sigName_test + suffix + ".root", + treename=treename, + selection=selection, + branches=features + ) + if fraction < 1.0: + tmp = tmp[:int(len(tmp)*fraction)] + if sigVal is None: + sigVal = pandas.DataFrame(tmp) + else: + sigVal = sigVal.append(pandas.DataFrame(tmp), ignore_index=True) + + + for sigName in signalMap[signal]: + tmp = root_numpy.root2array( + path + "train/" + sigName + suffix + ".root", + treename=treename, + selection=selection, + branches=features + ) + if fraction < 1.0: + tmp = tmp[:int(len(tmp)*fraction)] + if sigDev is None: + sigDev = pandas.DataFrame(tmp) + else: + sigDev = sigDev.append(pandas.DataFrame(tmp), ignore_index=True) + + + + bkgDev = None + bkgVal = None + for bkgName in bkgDatasets: + tmp = root_numpy.root2array( + path + "/train/" + bkgName + suffix + ".root", + treename=treename, + selection=selection, + branches=features + ) + if fraction < 1.0: + tmp = tmp[:int(len(tmp)*fraction)] + if bkgDev is None: + bkgDev = pandas.DataFrame(tmp) + else: + bkgDev = bkgDev.append(pandas.DataFrame(tmp), ignore_index=True) + + tmp = root_numpy.root2array( + path + "/test/" + bkgName + suffix + ".root", + treename=treename, + selection=selection, + branches=features + ) + if fraction < 1.0: + tmp = tmp[:int(len(tmp)*fraction)] + if bkgVal is None: + bkgVal = pandas.DataFrame(tmp) + else: + bkgVal = bkgVal.append(pandas.DataFrame(tmp), ignore_index=True) + + sigDev["category"] = 1 + sigVal["category"] = 
1 + bkgDev["category"] = 0 + bkgVal["category"] = 0 + sigDev["sampleWeight"] = 1 + sigVal["sampleWeight"] = 1 + bkgDev["sampleWeight"] = 1 + bkgVal["sampleWeight"] = 1 + + if fraction < 1.0: + sigDev.weight = sigDev.weight/fraction + sigVal.weight = sigVal.weight/fraction + bkgDev.weight = bkgDev.weight/fraction + bkgVal.weight = bkgVal.weight/fraction + + sigDev.sampleWeight = sigDev.weight/sigDev.XS + sigVal.sampleWeight = sigVal.weight/sigVal.XS + bkgDev.sampleWeight = bkgDev.weight + bkgVal.sampleWeight = bkgVal.weight + + sigDev.sampleWeight = sigDev.sampleWeight/sigDev.sampleWeight.sum() + sigVal.sampleWeight = sigVal.sampleWeight/sigVal.sampleWeight.sum() + bkgDev.sampleWeight = bkgDev.sampleWeight/bkgDev.sampleWeight.sum() + bkgVal.sampleWeight = bkgVal.sampleWeight/bkgVal.sampleWeight.sum() + + dev = sigDev.copy() + dev = dev.append(bkgDev.copy(), ignore_index=True) + val = sigVal.copy() + val = val.append(bkgVal.copy(), ignore_index=True) + + return dev, val + \ No newline at end of file From 6b785374bee3cde01997a54c1233e029e6adae47 Mon Sep 17 00:00:00 2001 From: g-marques Date: Tue, 22 Aug 2017 15:42:07 +0100 Subject: [PATCH 09/22] update --- trainNN.py | 393 +++++++++++++++++++++++++++++------------------------ 1 file changed, 213 insertions(+), 180 deletions(-) diff --git a/trainNN.py b/trainNN.py index f500efd..b84eac1 100755 --- a/trainNN.py +++ b/trainNN.py @@ -10,16 +10,14 @@ from keras.models import Sequential from keras.layers import Dense, Dropout, AlphaDropout from keras.optimizers import Adam +from keras.callbacks import EarlyStopping from sklearn.metrics import confusion_matrix, cohen_kappa_score from scipy.stats import ks_2samp import matplotlib.pyplot as plt - import localConfig as cfg - - treeName = "bdttree" -baseSigName = "T2DegStop_300_270" +baseSigName = "T2DegStop_550_520" bkgDatasets = [ "Wjets_70to100", "Wjets_100to200", @@ -42,7 +40,7 @@ ] -myFeatures = ["LepPt", "LepEta", "LepChg", "Met", "Jet1Pt", "HT", "NbLoose", "Njet", "JetHBpt", "DrJetHBLep", "JetHBCSV"] +myFeatures = ["LepPt", "LepEta", "LepChg", "Met", "Jet1Pt", "HT"] #, "NbLoose", "Njet", "JetHBpt", "DrJetHBLep", "JetHBCSV" inputBranches = list(myFeatures) inputBranches.append("XS") inputBranches.append("weight") @@ -107,11 +105,6 @@ bkgDataDev["sampleWeight"] = 1 bkgDataVal["sampleWeight"] = 1 - - - - - # Calculate event weights # The input files already have a branch called weight, which contains the per-event weights # These precomputed weights have all scale factors applied. 
If we desire to not use the scale factors @@ -183,145 +176,167 @@ myAdam = Adam(lr=learning_rate) compileArgs['optimizer'] = myAdam -def getDefinedClassifier(nIn, nOut, compileArgs): - model = Sequential() - model.add(Dense(16, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) - #model.add(Dropout(0.2)) - model.add(Dense(10, kernel_initializer='he_normal', activation='relu')) - #model.add(Dropout(0.2)) - model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal')) - model.compile(**compileArgs) - return model - -def getSELUClassifier(nIn, nOut, compileArgs): - model = Sequential() - model.add(Dense(16, input_dim=nIn, kernel_initializer='he_normal', activation='selu')) - model.add(AlphaDropout(0.2)) - model.add(Dense(32, kernel_initializer='he_normal', activation='selu')) - model.add(AlphaDropout(0.2)) - model.add(Dense(32, kernel_initializer='he_normal', activation='selu')) - model.add(AlphaDropout(0.2)) - model.add(Dense(32, kernel_initializer='he_normal', activation='selu')) - model.add(AlphaDropout(0.2)) - model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal')) - model.compile(**compileArgs) - return model - -print type(XVal) -print XVal.shape -print XVal.dtype - -print("Starting the training") -start = time.time() -model = getDefinedClassifier(len(trainFeatures), 1, compileArgs) -#model = getSELUClassifier(len(trainFeatures), 1, compileArgs) -history = model.fit(XDev, YDev, validation_data=(XVal,YVal,weightVal), sample_weight=weightDev, **trainParams) -print("Training took ", time.time()-start, " seconds") - -name = "myNN2" -model.save(name+".h5") -model_json = model.to_json() -with open(name + ".json", "w") as json_file: - json_file.write(model_json) -model.save_weights(name + ".h5") - -## To load: -#from keras.models import model_from_json -#with open('model.json', 'r') as json_file: -# loaded_model_json = json_file.read() -#loaded_model = model_from_json(loaded_model_json) -#loaded_model.load_weights("model.h5") - -print("Getting predictions") -devPredict = model.predict(XDev) -valPredict = model.predict(XVal) - -print("Getting scores") - -scoreDev = model.evaluate(XDev, YDev, sample_weight=weightDev, verbose = 1) -scoreVal = model.evaluate(XVal, YVal, sample_weight=weightVal, verbose = 1) -print "" - -print "Dev score:", scoreDev -print "Val score:", scoreVal -print confusion_matrix(YVal, valPredict.round()) -cohen_kappa=cohen_kappa_score(YVal, valPredict.round()) -print cohen_kappa_score(YVal, valPredict.round()) - - -print "Calculating FOM:" -dataVal["NN"] = valPredict -dataDev["NN"] = devPredict - -sig_dataValIdx=(dataVal.category==1) -bkg_dataValIdx=(dataVal.category==0) -sig_dataDevIdx=(dataVal.category==1) -bkg_dataDevIdx=(dataVal.category==0) - -sig_dataVal=dataVal[sig_dataValIdx] -bkg_dataVal=dataVal[bkg_dataValIdx] -sig_dataDev=dataDev[sig_dataDevIdx] -bkg_dataDev=dataDev[bkg_dataDevIdx] - -def getYields(dataVal, cut=0.5, luminosity=35866, splitFactor=2): - selectedValIdx = (dataVal.NN>cut) - selectedVal = dataVal[selectedValIdx] - - selectedSigIdx = (selectedVal.category == 1) - selectedBkgIdx = (selectedVal.category == 0) - selectedSig = selectedVal[selectedSigIdx] - selectedBkg = selectedVal[selectedBkgIdx] - - sigYield = selectedSig.weight.sum() - sigYieldUnc = np.sqrt(np.sum(np.square(selectedSig.weight))) - bkgYield = selectedBkg.weight.sum() - bkgYieldUnc = np.sqrt(np.sum(np.square(selectedBkg.weight))) - - sigYield = sigYield * luminosity * splitFactor # The factor 2 comes from the splitting - 
sigYieldUnc = sigYieldUnc * luminosity * splitFactor - bkgYield = bkgYield * luminosity * splitFactor - bkgYieldUnc = bkgYieldUnc * luminosity * splitFactor - - return ((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) - -tmpSig, tmpBkg = getYields(dataVal) -sigYield, sigYieldUnc = tmpSig -bkgYield, bkgYieldUnc = tmpBkg - -print "Signal@Presel:", sigDataVal.weight.sum() * 35866 * 2 -print "Background@Presel:", bkgDataVal.weight.sum() * 35866 * 2 -print "Signal:", sigYield, "+-", sigYieldUnc -print "Background:", bkgYield, "+-", bkgYieldUnc - +## DEFINITIONS def FOM1(sIn, bIn): - s, sErr = sIn - b, bErr = bIn - fom = s / (b**0.5) - fomErr = ((sErr / (b**0.5))**2+(bErr*s / (2*(b)**(1.5)) )**2)**0.5 - return (fom, fomErr) + s, sErr = sIn + b, bErr = bIn + fom = s / (b**0.5) + fomErr = ((sErr / (b**0.5))**2+(bErr*s / (2*(b)**(1.5)) )**2)**0.5 + return (fom, fomErr) def FOM2(sIn, bIn): - s, sErr = sIn - b, bErr = bIn - fom = s / ((s+b)**0.5) - fomErr = ((sErr*(2*b + s)/(2*(b + s)**1.5))**2 + (bErr * s / (2*(b + s)**1.5))**2)**0.5 - return (fom, fomErr) + s, sErr = sIn + b, bErr = bIn + fom = s / ((s+b)**0.5) + fomErr = ((sErr*(2*b + s)/(2*(b + s)**1.5))**2 + (bErr * s / (2*(b + s)**1.5))**2)**0.5 + return (fom, fomErr) def FullFOM(sIn, bIn, fValue=0.2): - from math import log - s, sErr = sIn - b, bErr = bIn - fomErr = 0.0 # Add the computation of the uncertainty later - fomA = 2*(s+b)*log(((s+b)*(b + (fValue*b)**2))/(b**2 + (s + b) * (fValue*b)**2)) - fomB = log(1 + (s*b*b*fValue*fValue)/(b*(b+(fValue*b)**2)))/(fValue**2) - fom = (fomA - fomB)**0.5 - return (fom, fomErr) - -print "Basic FOM (s/SQRT(b)):", FOM1((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) -print "Basic FOM (s/SQRT(s+b)):", FOM2((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) -print "Full FOM:", FullFOM((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) - -import sys + from math import log + s, sErr = sIn + b, bErr = bIn + fomErr = 0.0 # Add the computation of the uncertainty later + fomA = 2*(s+b)*log(((s+b)*(b + (fValue*b)**2))/(b**2 + (s + b) * (fValue*b)**2)) + fomB = log(1 + (s*b*b*fValue*fValue)/(b*(b+(fValue*b)**2)))/(fValue**2) + fom = (fomA - fomB)**0.5 + return (fom, fomErr) + +number_of_neurons_all=[] +max_FOM_all=[] +FOM_cut_all=[] +cohen_kappa_score_all=[] +KS_test_all=[] + + +for x in number_of_neurons_all: + print x, " ", max_FOM_all[number_of_neurons_all.index(x)], " ", FOM_cut_all[number_of_neurons_all.index(x)], " ", cohen_kappa_score_all[number_of_neurons_all.index(x)], " ", KS_test_all[number_of_neurons_all.index(x)] + + +for x in range(7,8): + print "======> NUMBER OF NEURONS", x + number_of_neurons_all.append(x) + def getDefinedClassifier(nIn, nOut, compileArgs): + model = Sequential() + model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) + #model.add(Dropout(0.2)) + #model.add(Dense(10, kernel_initializer='he_normal', activation='relu')) + #model.add(Dropout(0.2)) + model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal')) + model.compile(**compileArgs) + return model + + """ + def getSELUClassifier(nIn, nOut, compileArgs): + model = Sequential() + model.add(Dense(16, input_dim=nIn, kernel_initializer='he_normal', activation='selu')) + model.add(AlphaDropout(0.2)) + model.add(Dense(32, kernel_initializer='he_normal', activation='selu')) + model.add(AlphaDropout(0.2)) + model.add(Dense(32, kernel_initializer='he_normal', activation='selu')) + model.add(AlphaDropout(0.2)) + model.add(Dense(32, kernel_initializer='he_normal', activation='selu')) + 
model.add(AlphaDropout(0.2)) + model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal')) + model.compile(**compileArgs) + return model + """ + #print type(XVal) + #print XVal.shape + #print XVal.dtype + + call = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-7, patience=5, verbose=1, mode='auto') + print("Starting the training") + start = time.time() + model = getDefinedClassifier(len(trainFeatures), 1, compileArgs) + #model = getSELUClassifier(len(trainFeatures), 1, compileArgs) + history = model.fit(XDev, YDev, validation_data=(XVal,YVal,weightVal), sample_weight=weightDev, callbacks=[call], **trainParams) + print("Training took ", time.time()-start, " seconds") + + name = "myNN_4_"+str(x) + model.save(name+".h5") + model_json = model.to_json() + with open(name + ".json", "w") as json_file: + json_file.write(model_json) + model.save_weights(name + ".h5") + + ## To load: + #from keras.models import model_from_json + #with open('model.json', 'r') as json_file: + # loaded_model_json = json_file.read() + #loaded_model = model_from_json(loaded_model_json) + #loaded_model.load_weights("model.h5") + + print("Getting predictions") + devPredict = model.predict(XDev) + valPredict = model.predict(XVal) + + print("Getting scores") + + scoreDev = model.evaluate(XDev, YDev, sample_weight=weightDev, verbose = 1) + scoreVal = model.evaluate(XVal, YVal, sample_weight=weightVal, verbose = 1) + print "" + + print "Dev score:", scoreDev + print "Val score:", scoreVal + print confusion_matrix(YVal, valPredict.round()) + cohen_kappa=cohen_kappa_score(YVal, valPredict.round()) + cohen_kappa_score_all.append(cohen_kappa) + print cohen_kappa_score(YVal, valPredict.round()) + + + print "Calculating FOM:" + dataVal["NN"] = valPredict + dataDev["NN"] = devPredict + + sig_dataValIdx=(dataVal.category==1) + bkg_dataValIdx=(dataVal.category==0) + sig_dataDevIdx=(dataVal.category==1) + bkg_dataDevIdx=(dataVal.category==0) + + sig_dataVal=dataVal[sig_dataValIdx] + bkg_dataVal=dataVal[bkg_dataValIdx] + sig_dataDev=dataDev[sig_dataDevIdx] + bkg_dataDev=dataDev[bkg_dataDevIdx] + + def getYields(dataVal, cut=0.5, luminosity=35866, splitFactor=2): + selectedValIdx = (dataVal.NN>cut) + selectedVal = dataVal[selectedValIdx] + + selectedSigIdx = (selectedVal.category == 1) + selectedBkgIdx = (selectedVal.category == 0) + selectedSig = selectedVal[selectedSigIdx] + selectedBkg = selectedVal[selectedBkgIdx] + + + print len(selectedSig) + print len(selectedBkg) + + sigYield = selectedSig.weight.sum() + sigYieldUnc = np.sqrt(np.sum(np.square(selectedSig.weight))) + bkgYield = selectedBkg.weight.sum() + bkgYieldUnc = np.sqrt(np.sum(np.square(selectedBkg.weight))) + + sigYield = sigYield * luminosity * splitFactor # The factor 2 comes from the splitting + sigYieldUnc = sigYieldUnc * luminosity * splitFactor + bkgYield = bkgYield * luminosity * splitFactor + bkgYieldUnc = bkgYieldUnc * luminosity * splitFactor + + return ((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) + + tmpSig, tmpBkg = getYields(dataVal) + sigYield, sigYieldUnc = tmpSig + bkgYield, bkgYieldUnc = tmpBkg + + #print "Signal@Presel:", sigDataVal.weight.sum() * 35866 * 2 + #print "Background@Presel:", bkgDataVal.weight.sum() * 35866 * 2 + #print "Signal:", sigYield, "+-", sigYieldUnc + #print "Background:", bkgYield, "+-", bkgYieldUnc + + #print "Basic FOM (s/SQRT(b)):", FOM1((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) + #print "Basic FOM (s/SQRT(s+b)):", FOM2((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) + #print 
"Full FOM:", FullFOM((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) + + import sys #sys.exit("Done!") ######################################################### @@ -330,42 +345,50 @@ def FullFOM(sIn, bIn, fValue=0.2): #history = model.fit(XDev, YDev, validation_data=(XVal,YVal,weightVal), sample_weight=weightDev, **trainParams) -print(history.history.keys()) - -fomEvo = [] -fomCut = [] - -bkgEff = [] -sigEff = [] - -luminosity=35866 -sig_Init = sigDataVal.weight.sum() * luminosity * 2; -bkg_Init = bkgDataVal.weight.sum() * luminosity * 2; - -for cut in np.arange(0.0, 0.9999999, 0.001): - sig, bkg = getYields(dataVal, cut=cut, luminosity=luminosity) - if sig[0] > 0 and bkg[0] > 0: - fom, fomUnc = FullFOM(sig, bkg) - fomEvo.append(fom) - fomCut.append(cut) - bkgEff.append(bkg[0]/bkg_Init) - sigEff.append(sig[0]/sig_Init) - -max_FOM=0 - -print "Maximizing FOM" -for x in fomEvo: - if x>max_FOM: - max_FOM=x - - -print "FOM maximization: ", max_FOM , "with cut at: " , fomCut[fomEvo.index(max_FOM)] -Eff = zip(bkgEff, sigEff) - -print "Kolmogorov-Smirnov test" -km_value=ks_2samp((sig_dataDev["NN"].append(bkg_dataDev["NN"])),(sig_dataVal["NN"].append(bkg_dataVal["NN"]))) -print km_value - + print(history.history.keys()) + + fomEvo = [] + fomCut = [] + + bkgEff = [] + sigEff = [] + + luminosity=35866 + sig_Init = sigDataVal.weight.sum() * luminosity * 2; + bkg_Init = bkgDataVal.weight.sum() * luminosity * 2; + + for cut in np.arange(0.0, 0.9999999, 0.001): + print cut + sig, bkg = getYields(dataVal, cut=cut, luminosity=luminosity) + if sig[0] > 0 and bkg[0] > 0: + fom, fomUnc = FullFOM(sig, bkg) + fomEvo.append(fom) + print fom + print "" + fomCut.append(cut) + bkgEff.append(bkg[0]/bkg_Init) + sigEff.append(sig[0]/sig_Init) + + max_FOM=0 + + print "Maximizing FOM" + for x in fomEvo: + if x>max_FOM: + max_FOM=x + + max_FOM_all.append(max_FOM) + FOM_cut_all.append(fomCut[fomEvo.index(max_FOM)]) + + print "FOM maximization: ", max_FOM , "with cut at: " , fomCut[fomEvo.index(max_FOM)] + Eff = zip(bkgEff, sigEff) + + print "Kolmogorov-Smirnov test" + km_value=ks_2samp((sig_dataDev["NN"].append(bkg_dataDev["NN"])),(sig_dataVal["NN"].append(bkg_dataVal["NN"]))) + print km_value + KS_test_all.append(km_value) + + +""" print "Plotting" plt.figure(figsize=(7,6)) @@ -435,6 +458,16 @@ def FullFOM(sIn, bIn, fValue=0.2): plt.savefig('FOM2.png', bbox_inches='tight') plt.show() +""" +for x in number_of_neurons_all: + print "4" + print x + print cohen_kappa_score_all[number_of_neurons_all.index(x)] + print max_FOM_all[number_of_neurons_all.index(x)] + print FOM_cut_all[number_of_neurons_all.index(x)] + print KS_test_all[number_of_neurons_all.index(x)][0] + print KS_test_all[number_of_neurons_all.index(x)][1] + sys.exit("Done!") From ec56f28183bdcfd612a6272326e4ab5eddf0e67b Mon Sep 17 00:00:00 2001 From: diogodebastos Date: Tue, 22 Aug 2017 15:47:33 +0100 Subject: [PATCH 10/22] DM = 30 data --- commonFunctions.py | 69 ++++++++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 23 deletions(-) diff --git a/commonFunctions.py b/commonFunctions.py index ce06396..6dc6060 100644 --- a/commonFunctions.py +++ b/commonFunctions.py @@ -2,30 +2,53 @@ import pandas signalMap = { - "DM30" : [], - "300_270" : ["T2DegStop_300_270"], - } -bkgDatasets = [ - "Wjets_70to100", - "Wjets_100to200", - "Wjets_200to400", - "Wjets_400to600", - "Wjets_600to800", - "Wjets_800to1200", - "Wjets_1200to2500", - "Wjets_2500toInf", - "TTJets_DiLepton", - "TTJets_SingleLeptonFromTbar", - "TTJets_SingleLeptonFromT", - 
"ZJetsToNuNu_HT100to200", - "ZJetsToNuNu_HT200to400", - "ZJetsToNuNu_HT400to600", - "ZJetsToNuNu_HT600to800", - "ZJetsToNuNu_HT800to1200", - "ZJetsToNuNu_HT1200to2500", - "ZJetsToNuNu_HT2500toInf" - ] + "DM30" : [ + "T2DegStop_250_220", + "T2DegStop_275_245", + "T2DegStop_300_270", + "T2DegStop_325_295", + "T2DegStop_350_320", + "T2DegStop_375_345", + "T2DegStop_400_370", + "T2DegStop_425_395", + "T2DegStop_450_420", + "T2DegStop_475_445", + "T2DegStop_500_470", + "T2DegStop_525_495", + "T2DegStop_550_520", + "T2DegStop_575_545", + "T2DegStop_600_570", + "T2DegStop_625_595", + "T2DegStop_650_620", + "T2DegStop_675_645", + "T2DegStop_700_670", + "T2DegStop_725_695", + "T2DegStop_750_720", + "T2DegStop_775_745", + "T2DegStop_800_770"], + "300_270" : ["T2DegStop_300_270"], + "550_520" : ["T2DegStop_550_520"] +} +bkgDatasets = [ + "Wjets_70to100", + "Wjets_100to200", + "Wjets_200to400", + "Wjets_400to600", + "Wjets_600to800", + "Wjets_800to1200", + "Wjets_1200to2500", + "Wjets_2500toInf", + "TTJets_DiLepton", + "TTJets_SingleLeptonFromTbar", + "TTJets_SingleLeptonFromT", + "ZJetsToNuNu_HT100to200", + "ZJetsToNuNu_HT200to400", + "ZJetsToNuNu_HT400to600", + "ZJetsToNuNu_HT600to800", + "ZJetsToNuNu_HT800to1200", + "ZJetsToNuNu_HT1200to2500", + "ZJetsToNuNu_HT2500toInf"] def StopDataLoader(path, features, selection="", treename="bdttree", suffix="", signal="DM30", fraction=1.0): if signal not in signalMap: From bcec950756add8943c96b55ce17f02091b76af0e Mon Sep 17 00:00:00 2001 From: diogodebastos Date: Tue, 22 Aug 2017 15:57:15 +0100 Subject: [PATCH 11/22] fixed trainNN print --- trainNN.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/trainNN.py b/trainNN.py index 861fe13..49015d4 100644 --- a/trainNN.py +++ b/trainNN.py @@ -159,6 +159,9 @@ def getYields(dataVal, cut=0.5, luminosity=35866, splitFactor=2): sigYield, sigYieldUnc = tmpSig bkgYield, bkgYieldUnc = tmpBkg +sigDataVal = dataVal[dataVal.category==1] +bkgDataVal = dataVal[dataVal.category==0] + print "Signal@Presel:", sigDataVal.weight.sum() * 35866 * 2 print "Background@Presel:", bkgDataVal.weight.sum() * 35866 * 2 print "Signal:", sigYield, "+-", sigYieldUnc From 24924d6ef3190798276b44ab514811bed4a35307 Mon Sep 17 00:00:00 2001 From: g-marques Date: Tue, 22 Aug 2017 16:01:01 +0100 Subject: [PATCH 12/22] commonFunctions now with different train and test signal data sets --- commonFunctions.py | 162 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 commonFunctions.py diff --git a/commonFunctions.py b/commonFunctions.py new file mode 100644 index 0000000..49f44fe --- /dev/null +++ b/commonFunctions.py @@ -0,0 +1,162 @@ +import root_numpy +import pandas + +signalMap = { + "DM30" : ["T2DegStop_250_220", + "T2DegStop_275_245", + "T2DegStop_300_270", + "T2DegStop_325_295", + "T2DegStop_350_320", + "T2DegStop_375_345", + "T2DegStop_400_370", + "T2DegStop_425_395", + "T2DegStop_450_420", + "T2DegStop_475_445", + "T2DegStop_500_470", + "T2DegStop_525_495", + "T2DegStop_550_520", + "T2DegStop_575_545", + "T2DegStop_600_570", + "T2DegStop_625_595", + "T2DegStop_650_620", + "T2DegStop_675_645", + "T2DegStop_700_670", + "T2DegStop_725_695", + "T2DegStop_750_720", + "T2DegStop_775_745", + "T2DegStop_800_770"], + "300_270" : ["T2DegStop_300_270"], + "550_520" : ["T2DegStop_550_520"] + } +bkgDatasets = [ + "Wjets_70to100", + "Wjets_100to200", + "Wjets_200to400", + "Wjets_400to600", + "Wjets_600to800", + "Wjets_800to1200", + "Wjets_1200to2500", + "Wjets_2500toInf", + "TTJets_DiLepton", + 
"TTJets_SingleLeptonFromTbar", + "TTJets_SingleLeptonFromT", + "ZJetsToNuNu_HT100to200", + "ZJetsToNuNu_HT200to400", + "ZJetsToNuNu_HT400to600", + "ZJetsToNuNu_HT600to800", + "ZJetsToNuNu_HT800to1200", + "ZJetsToNuNu_HT1200to2500", + "ZJetsToNuNu_HT2500toInf" + ] + + +def StopDataLoader(path, features, test="550_520", selection="", treename="bdttree", suffix="", signal="DM30", fraction=1.0): + if signal not in signalMap: + raise KeyError("Unknown training signal requested ("+signal+")") + if test not in signalMap: + raise KeyError("Unknown test signal requested ("+test+")") + if fraction >= 1.0: + fraction = 1.0 + if fraction < 0.0: + raise ValueError("An invalid fraction was chosen") + if "XS" not in features: + features.append("XS") + if "weight" not in features: + features.append("weight") + + sigDev = None + sigVal = None + + + for sigName_test in signalMap[test]: + tmp = root_numpy.root2array( + path + "test/" + sigName_test + suffix + ".root", + treename=treename, + selection=selection, + branches=features + ) + if fraction < 1.0: + tmp = tmp[:int(len(tmp)*fraction)] + if sigVal is None: + sigVal = pandas.DataFrame(tmp) + else: + sigVal = sigVal.append(pandas.DataFrame(tmp), ignore_index=True) + + + for sigName in signalMap[signal]: + tmp = root_numpy.root2array( + path + "train/" + sigName + suffix + ".root", + treename=treename, + selection=selection, + branches=features + ) + if fraction < 1.0: + tmp = tmp[:int(len(tmp)*fraction)] + if sigDev is None: + sigDev = pandas.DataFrame(tmp) + else: + sigDev = sigDev.append(pandas.DataFrame(tmp), ignore_index=True) + + + + bkgDev = None + bkgVal = None + for bkgName in bkgDatasets: + tmp = root_numpy.root2array( + path + "/train/" + bkgName + suffix + ".root", + treename=treename, + selection=selection, + branches=features + ) + if fraction < 1.0: + tmp = tmp[:int(len(tmp)*fraction)] + if bkgDev is None: + bkgDev = pandas.DataFrame(tmp) + else: + bkgDev = bkgDev.append(pandas.DataFrame(tmp), ignore_index=True) + + tmp = root_numpy.root2array( + path + "/test/" + bkgName + suffix + ".root", + treename=treename, + selection=selection, + branches=features + ) + if fraction < 1.0: + tmp = tmp[:int(len(tmp)*fraction)] + if bkgVal is None: + bkgVal = pandas.DataFrame(tmp) + else: + bkgVal = bkgVal.append(pandas.DataFrame(tmp), ignore_index=True) + + sigDev["category"] = 1 + sigVal["category"] = 1 + bkgDev["category"] = 0 + bkgVal["category"] = 0 + sigDev["sampleWeight"] = 1 + sigVal["sampleWeight"] = 1 + bkgDev["sampleWeight"] = 1 + bkgVal["sampleWeight"] = 1 + + if fraction < 1.0: + sigDev.weight = sigDev.weight/fraction + sigVal.weight = sigVal.weight/fraction + bkgDev.weight = bkgDev.weight/fraction + bkgVal.weight = bkgVal.weight/fraction + + sigDev.sampleWeight = sigDev.weight/sigDev.XS + sigVal.sampleWeight = sigVal.weight/sigVal.XS + bkgDev.sampleWeight = bkgDev.weight + bkgVal.sampleWeight = bkgVal.weight + + sigDev.sampleWeight = sigDev.sampleWeight/sigDev.sampleWeight.sum() + sigVal.sampleWeight = sigVal.sampleWeight/sigVal.sampleWeight.sum() + bkgDev.sampleWeight = bkgDev.sampleWeight/bkgDev.sampleWeight.sum() + bkgVal.sampleWeight = bkgVal.sampleWeight/bkgVal.sampleWeight.sum() + + dev = sigDev.copy() + dev = dev.append(bkgDev.copy(), ignore_index=True) + val = sigVal.copy() + val = val.append(bkgVal.copy(), ignore_index=True) + + return dev, val + \ No newline at end of file From 36ab26138eb3e4b3ce86f399b3f3c0c09a911876 Mon Sep 17 00:00:00 2001 From: g-marques Date: Tue, 22 Aug 2017 17:34:35 +0100 Subject: [PATCH 13/22] train_loop 
- grid search on #layers and #neurons
---
 trainNN_loop.py | 364 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 364 insertions(+)
 create mode 100644 trainNN_loop.py

diff --git a/trainNN_loop.py b/trainNN_loop.py
new file mode 100644
index 0000000..7883558
--- /dev/null
+++ b/trainNN_loop.py
@@ -0,0 +1,364 @@
+import root_numpy
+import numpy as np
+import pandas
+import keras
+import time
+from sklearn.externals import joblib
+from sklearn.preprocessing import StandardScaler
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, AlphaDropout
+from keras.optimizers import Adam
+from sklearn.metrics import confusion_matrix, cohen_kappa_score
+from scipy.stats import ks_2samp
+import matplotlib.pyplot as plt
+import localConfig as cfg
+from commonFunctions import StopDataLoader
+import sys
+from math import log
+
+myFeatures = ["LepPt", "LepEta", "LepChg", "Met", "Jet1Pt", "HT", "NbLoose", "Njet", "JetHBpt", "DrJetHBLep", "JetHBCSV", "mt"]
+inputBranches = list(myFeatures)
+inputBranches.append("XS")
+inputBranches.append("weight")
+preselection = "(DPhiJet1Jet2 < 2.5 || Jet2Pt < 60) && (Met > 280) && (HT > 200) && (isTight == 1) && (Jet1Pt > 110)"
+suffix = "_skimmed"
+luminosity = 35866
+number_of_events_print = 0
+test_point = "550_520"
+train_DM = "DM30"
+
+print "Loading datasets..."
+dataDev, dataVal = StopDataLoader(cfg.loc, inputBranches, selection=preselection, suffix=suffix, signal=train_DM, test="550_520") #
+
+if number_of_events_print == 1:
+    np_dataDev, np_dataVal = StopDataLoader(cfg.loc, inputBranches, suffix=suffix, signal=train_DM, test=test_point) #
+    print " ==> BEFORE PRE-SELECTION:"
+    print " Train Signal Events:", len(np_dataDev[np_dataDev.category==1])
+    print " Train Background Events:", len(np_dataDev[np_dataDev.category==0])
+    print " Test Signal Events:", len(np_dataVal[np_dataVal.category==1])
+    print " Test Background Events:", len(np_dataVal[np_dataVal.category==0])
+    print ""
+    print " ==> AFTER PRE-SELECTION:"
+    print " Train Signal Events:", len(dataDev[dataDev.category==1])
+    print " Train Background Events:", len(dataDev[dataDev.category==0])
+    print " Test Signal Events:", len(dataVal[dataVal.category==1])
+    print " Test Background Events:", len(dataVal[dataVal.category==0])
+
+data = dataDev.copy()
+data = data.append(dataVal.copy(), ignore_index=True)
+
+print 'Finding features of interest'
+trainFeatures = [var for var in data.columns if var in myFeatures]
+otherFeatures = [var for var in data.columns if var not in trainFeatures]
+
+print "Preparing the data for the NN"
+XDev = dataDev.ix[:,0:len(trainFeatures)] # Why X and Y??? Equivalent to sig and bkg?
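+# (X here is the matrix of input-feature columns and Y the 0/1 category label
+#  vector -- the usual scikit-learn X/y convention -- not separate signal and
+#  background sets; each of X and Y mixes both classes.)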
+XVal = dataVal.ix[:,0:len(trainFeatures)] +YDev = np.ravel(dataDev.category) +YVal = np.ravel(dataVal.category) +weightDev = np.ravel(dataDev.sampleWeight) +weightVal = np.ravel(dataVal.sampleWeight) + +print("Fitting the scaler and scaling the input variables") +scaler = StandardScaler().fit(XDev) +XDev = scaler.transform(XDev) +XVal = scaler.transform(XVal) + +scalerfile = 'scaler_'+train_DM+'.sav' +joblib.dump(scaler, scalerfile) + +compileArgs = {'loss': 'binary_crossentropy', 'optimizer': 'adam', 'metrics': ["accuracy"]} +trainParams = {'epochs': 25, 'batch_size': 20, 'verbose': 1} +learning_rate = 0.001/5.0 +myAdam = Adam(lr=learning_rate) +compileArgs['optimizer'] = myAdam + +## DEFINITIONS +def FOM1(sIn, bIn): + s, sErr = sIn + b, bErr = bIn + fom = s / (b**0.5) + fomErr = ((sErr / (b**0.5))**2+(bErr*s / (2*(b)**(1.5)) )**2)**0.5 + return (fom, fomErr) + +def FOM2(sIn, bIn): + s, sErr = sIn + b, bErr = bIn + fom = s / ((s+b)**0.5) + fomErr = ((sErr*(2*b + s)/(2*(b + s)**1.5))**2 + (bErr * s / (2*(b + s)**1.5))**2)**0.5 + return (fom, fomErr) + +def FullFOM(sIn, bIn, fValue=0.2): + s, sErr = sIn + b, bErr = bIn + fomErr = 0.0 # Add the computation of the uncertainty later + fomA = 2*(s+b)*log(((s+b)*(b + (fValue*b)**2))/(b**2 + (s + b) * (fValue*b)**2)) + fomB = log(1 + (s*b*b*fValue*fValue)/(b*(b+(fValue*b)**2)))/(fValue**2) + fom = (fomA - fomB)**0.5 + return (fom, fomErr) + +print "Opening file" +f = open('DATA_loop_test_f_'+test_point+'.txt', 'w') + +for y in range(1, 2): + for x in range(7,8): + + if y==1: + def getDefinedClassifier(nIn, nOut, compileArgs): + model = Sequential() + model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) + model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal')) + model.compile(**compileArgs) + return model + if y==2: + def getDefinedClassifier(nIn, nOut, compileArgs): + model = Sequential() + model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) + model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) + model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal')) + model.compile(**compileArgs) + return model + if y==3: + def getDefinedClassifier(nIn, nOut, compileArgs): + model = Sequential() + model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) + model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) + model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) + model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal')) + model.compile(**compileArgs) + return model + if y==4: + def getDefinedClassifier(nIn, nOut, compileArgs): + model = Sequential() + model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) + model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) + model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) + model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) + #model.add(Dropout(0.2)) + #model.add(Dense(10, kernel_initializer='he_normal', activation='relu')) + #model.add(Dropout(0.2)) + model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal')) + model.compile(**compileArgs) + return model + + print("Starting the training") + call = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-7, patience=5, verbose=1, mode='auto') + 
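    # EarlyStopping ends the fit once val_loss has failed to improve by at least
    # min_delta for `patience` (here 5) consecutive epochs, so epochs=25 in
    # trainParams acts only as an upper cap on the training length.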
model = getDefinedClassifier(len(trainFeatures), 1, compileArgs) + history = model.fit(XDev, YDev, validation_data=(XVal,YVal,weightVal), sample_weight=weightDev,callbacks=[call], **trainParams) + + name = "myNN_"+str(y)+"_"+str(x)+"_"+train_DM + model.save(name+".h5") + model_json = model.to_json() + with open(name + ".json", "w") as json_file: + json_file.write(model_json) + model.save_weights(name + ".h5") + + # To load: + #from keras.models import model_from_json + #with open("myNN_"+str(y)+"_"+str(x)+"_"+test_point+'.json', 'r') as json_file: + # loaded_model_json = json_file.read() + #model = model_from_json(loaded_model_json) + #model.load_weights("myNN_"+str(y)+"_"+str(x)+"_"+test_point+".h5") + #model.compile(loss = 'binary_crossentropy', optimizer = 'adam') + + print("Getting predictions") + devPredict = model.predict(XDev) + #print devPredict + valPredict = model.predict(XVal) + #print valPredict + + + print("Getting scores") + + scoreDev = model.evaluate(XDev, YDev, sample_weight=weightDev, verbose = 1) + scoreVal = model.evaluate(XVal, YVal, sample_weight=weightVal, verbose = 1) + print "" + #print "Dev score:", scoreDev + #print "Val score:", scoreVal + #print confusion_matrix(YVal, valPredict.round()) + cohen_kappa=cohen_kappa_score(YVal, valPredict.round()) + + + print "Calculating FOM:" + dataDev["NN"] = devPredict + dataVal["NN"] = valPredict + + #dataVal[dataVal.category==1].weight = dataVal[dataVal.category==1].weight*dataVal[dataVal.category==1].XS + sig_dataDev=dataDev[dataDev.category==1] + bkg_dataDev=dataDev[dataDev.category==0] + sig_dataVal=dataVal[dataVal.category==1] + bkg_dataVal=dataVal[dataVal.category==0] + + + def getYields(dataVal, cut=0.5, luminosity=35866, splitFactor=2): + #defines the selected test data + selectedVal = dataVal[dataVal.NN>cut] + unselectedVal = dataVal[ dataVal.NN<=cut ] + + #separates the true positives from false negatives + selectedSig = selectedVal[selectedVal.category == 1] + selectedBkg = selectedVal[selectedVal.category == 0] + #unselectedSig = unselectedVal[unselectedVal.category == 1] + #unselectedBkg = unselectedVal[unselectedVal.category == 0] + + #print cut + #print "TP", len(selectedSig) + #print "FP", len(selectedBkg) + #print "FN", len(unselectedSig) + #print "TN", len(unselectedBkg) + #print "" + + sigYield = selectedSig.weight.sum() + sigYieldUnc = np.sqrt(np.sum(np.square(selectedSig.weight))) + bkgYield = selectedBkg.weight.sum() + bkgYieldUnc = np.sqrt(np.sum(np.square(selectedBkg.weight))) + + sigYield = sigYield * luminosity * splitFactor #The factor 2 comes from the splitting + sigYieldUnc = sigYieldUnc * luminosity * splitFactor + bkgYield = bkgYield * luminosity * splitFactor + bkgYieldUnc = bkgYieldUnc * luminosity * splitFactor + + return ((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) + + tmpSig, tmpBkg = getYields(dataVal) + sigYield, sigYieldUnc = tmpSig + bkgYield, bkgYieldUnc = tmpBkg + + + #print "Signal@Presel:", dataVal[dataVal.category == 1].weight.sum() * 35866 * 2 + #print "Background@Presel:", dataVal[dataVal.category == 0].weight.sum() * 35866 * 2 + #print "Signal:", sigYield, "+-", sigYieldUnc + #print "Background:", bkgYield, "+-", bkgYieldUnc + + #print "Basic FOM (s/SQRT(b)):", FOM1((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) + #print "Basic FOM (s/SQRT(s+b)):", FOM2((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) + #print "Full FOM:", FullFOM((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) + + #sys.exit("Done!") + + ######################################################### + + # 
Let's repeat the above, but monitor the evolution of the loss function
+
+
+  #history = model.fit(XDev, YDev, validation_data=(XVal,YVal,weightVal), sample_weight=weightDev, **trainParams)
+  #print(history.history.keys())
+
+  fomEvo = []
+  fomCut = []
+
+  bkgEff = []
+  sigEff = []
+
+  sig_Init = dataVal[dataVal.category == 1].weight.sum() * 35866 * 2
+  bkg_Init = dataVal[dataVal.category == 0].weight.sum() * 35866 * 2
+
+  for cut in np.arange(0.0, 0.9999999, 0.001):
+    #print cut
+    sig, bkg = getYields(dataVal, cut=cut, luminosity=luminosity)
+    if sig[0] > 0 and bkg[0] > 0:
+      fom, fomUnc = FullFOM(sig, bkg)
+      fomEvo.append(fom)
+      fomCut.append(cut)
+      #print fom
+      #print ""
+      bkgEff.append(bkg[0]/bkg_Init)
+      sigEff.append(sig[0]/sig_Init)
+
+  max_FOM=0
+
+  print "Maximizing FOM"
+  for k in fomEvo:
+    if k>max_FOM:
+      max_FOM=k
+
+  Eff = zip(bkgEff, sigEff)
+
+  km_value=ks_2samp((sig_dataDev["NN"].append(bkg_dataDev["NN"])),(sig_dataVal["NN"].append(bkg_dataVal["NN"])))
+
+  f.write(str(y)+"\n")
+  print "Layers:", y
+  f.write(str(x)+"\n")
+  print "Neurons:", x
+  f.write(str(cohen_kappa)+"\n")
+  print "Cohen Kappa score:", cohen_kappa
+  f.write(str(max_FOM)+"\n")
+  print "Maximized FOM:", max_FOM
+  f.write(str(fomCut[fomEvo.index(max_FOM)])+"\n")
+  print "FOM Cut:", fomCut[fomEvo.index(max_FOM)]
+  f.write(str(km_value[0])+"\n")
+  print "KS test statistic:", km_value[0]
+  f.write(str(km_value[1])+"\n")
+  print "KS test p-value:", km_value[1]
+
+  f.close()
+
+  selectedVal = dataVal[dataVal.NN>fomCut[fomEvo.index(max_FOM)]]
+  selectedSig = selectedVal[selectedVal.category == 1]
+  selectedBkg = selectedVal[selectedVal.category == 0]
+  sigYield = selectedSig.weight.sum()
+  bkgYield = selectedBkg.weight.sum()
+  sigYield = sigYield * luminosity * 2 #The factor 2 comes from the splitting
+  bkgYield = bkgYield * luminosity * 2
+
+  print fomCut[fomEvo.index(max_FOM)]
+  print "Number of selected Signal Events:", len(selectedSig)
+  print "Number of selected Background Events:", len(selectedBkg)
+  print "Sig Yield", sigYield
+  print "Bkg Yield", bkgYield
+
+  print "Plotting"
+
+  plt.figure(figsize=(7,6))
+  plt.hist(sig_dataDev["NN"], 50, facecolor='blue', alpha=0.7, normed=1, weights=sig_dataDev["weight"])
+  plt.hist(bkg_dataDev["NN"], 50, facecolor='red', alpha=0.7, normed=1, weights=bkg_dataDev["weight"])
+  plt.hist(sig_dataVal["NN"], 50, color='blue', alpha=1, normed=1, weights=sig_dataVal["weight"], histtype="step")
+  plt.hist(bkg_dataVal["NN"], 50, color='red', alpha=1, normed=1, weights=bkg_dataVal["weight"], histtype="step")
+  plt.xlabel('NN output')
+  #plt.title("Cohen's kappa: {0}".format(cohen_kappa), fontsize=10)
+  plt.suptitle("MVA overtraining check for classifier: NN", fontsize=13, fontweight='bold')
+  plt.title("Cohen's kappa: {0}\nKolmogorov Smirnov test: {1}".format(cohen_kappa, km_value[1]), fontsize=10)
+  plt.legend(['Signal (Train sample)', 'Background (Train sample)', 'Signal (Test sample)', 'Background (Test sample)'], loc='upper right')
+  plt.savefig('hist_'+str(y)+'_'+str(x)+'_'+test_point+'.png', bbox_inches='tight')
+  plt.show()
+
+
+  both_dataDev=bkg_dataDev.append(sig_dataDev)
+  plt.figure(figsize=(7,6))
+  plt.xlabel('NN output')
+  plt.title("Number of Events")
+  #plt.yscale('log', nonposy='clip')
+  plt.hist(bkg_dataDev["NN"], 50, facecolor='red', weights=bkg_dataDev["weight"])
+  plt.hist(both_dataDev["NN"], 50, color="blue", histtype="step", weights=both_dataDev["weight"])
+  plt.legend(['Background (train sample)', 'Background + Signal (train sample)'], loc="upper left")
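+  # The filled red histogram above is the background alone, while the blue
+  # outline stacks signal on top of background, so any blue excess over red
+  # marks where the expected signal accumulates in the NN output.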
+ plt.savefig('pred_'+str(y)+'_'+str(x)+'_'+test_point+'.png', bbox_inches='tight') + plt.show() + + + plt.figure(figsize=(7,6)) + plt.subplots_adjust(hspace=0.5) + plt.subplot(211) + plt.plot(fomCut, fomEvo) + plt.title("FOM") + plt.ylabel("FOM") + plt.xlabel("ND") + plt.legend(["Max. FOM: {0}".format(max_FOM)], loc='upper left') + + + plt.subplot(212) + plt.semilogy(fomCut, Eff) + plt.axvspan(fomCut[fomEvo.index(max_FOM)], 1, facecolor='#2ca02c', alpha=0.3) + #plt.axvline(x=fomCut[fomEvo.index(max_FOM)], ymin=0, ymax=1) + plt.title("Efficiency") + plt.ylabel("Eff") + plt.xlabel("ND") + plt.legend(['Background', 'Signal'], loc='upper left') + plt.savefig('FOM_'+str(y)+'_'+str(x)+'_'+test_point+'.png', bbox_inches='tight') + plt.show() + + + + +sys.exit("Done!") + From 06eeddbaa96885560710e343e3ee4af338992f55 Mon Sep 17 00:00:00 2001 From: diogodebastos Date: Tue, 22 Aug 2017 18:12:16 +0100 Subject: [PATCH 14/22] Set2 fix --- commonFunctions.py | 137 ++++++++++++++++++++++----------------------- trainNN.py | 47 ++++++++-------- 2 files changed, 92 insertions(+), 92 deletions(-) diff --git a/commonFunctions.py b/commonFunctions.py index 6dc6060..25c80a5 100644 --- a/commonFunctions.py +++ b/commonFunctions.py @@ -2,57 +2,59 @@ import pandas signalMap = { - "DM30" : [ - "T2DegStop_250_220", - "T2DegStop_275_245", - "T2DegStop_300_270", - "T2DegStop_325_295", - "T2DegStop_350_320", - "T2DegStop_375_345", - "T2DegStop_400_370", - "T2DegStop_425_395", - "T2DegStop_450_420", - "T2DegStop_475_445", - "T2DegStop_500_470", - "T2DegStop_525_495", - "T2DegStop_550_520", - "T2DegStop_575_545", - "T2DegStop_600_570", - "T2DegStop_625_595", - "T2DegStop_650_620", - "T2DegStop_675_645", - "T2DegStop_700_670", - "T2DegStop_725_695", - "T2DegStop_750_720", - "T2DegStop_775_745", - "T2DegStop_800_770"], - "300_270" : ["T2DegStop_300_270"], - "550_520" : ["T2DegStop_550_520"] -} - + "DM30" : ["T2DegStop_250_220", + "T2DegStop_275_245", + "T2DegStop_300_270", + "T2DegStop_325_295", + "T2DegStop_350_320", + "T2DegStop_375_345", + "T2DegStop_400_370", + "T2DegStop_425_395", + "T2DegStop_450_420", + "T2DegStop_475_445", + "T2DegStop_500_470", + "T2DegStop_525_495", + "T2DegStop_550_520", + "T2DegStop_575_545", + "T2DegStop_600_570", + "T2DegStop_625_595", + "T2DegStop_650_620", + "T2DegStop_675_645", + "T2DegStop_700_670", + "T2DegStop_725_695", + "T2DegStop_750_720", + "T2DegStop_775_745", + "T2DegStop_800_770"], + "300_270" : ["T2DegStop_300_270"], + "550_520" : ["T2DegStop_550_520"] + } bkgDatasets = [ - "Wjets_70to100", - "Wjets_100to200", - "Wjets_200to400", - "Wjets_400to600", - "Wjets_600to800", - "Wjets_800to1200", - "Wjets_1200to2500", - "Wjets_2500toInf", - "TTJets_DiLepton", - "TTJets_SingleLeptonFromTbar", - "TTJets_SingleLeptonFromT", - "ZJetsToNuNu_HT100to200", - "ZJetsToNuNu_HT200to400", - "ZJetsToNuNu_HT400to600", - "ZJetsToNuNu_HT600to800", - "ZJetsToNuNu_HT800to1200", - "ZJetsToNuNu_HT1200to2500", - "ZJetsToNuNu_HT2500toInf"] - -def StopDataLoader(path, features, selection="", treename="bdttree", suffix="", signal="DM30", fraction=1.0): + "Wjets_70to100", + "Wjets_100to200", + "Wjets_200to400", + "Wjets_400to600", + "Wjets_600to800", + "Wjets_800to1200", + "Wjets_1200to2500", + "Wjets_2500toInf", + "TTJets_DiLepton", + "TTJets_SingleLeptonFromTbar", + "TTJets_SingleLeptonFromT", + "ZJetsToNuNu_HT100to200", + "ZJetsToNuNu_HT200to400", + "ZJetsToNuNu_HT400to600", + "ZJetsToNuNu_HT600to800", + "ZJetsToNuNu_HT800to1200", + "ZJetsToNuNu_HT1200to2500", + "ZJetsToNuNu_HT2500toInf" + ] + + +def 
StopDataLoader(path, features, test="550_520", selection="", treename="bdttree", suffix="", signal="DM30", fraction=1.0): if signal not in signalMap: - raise KeyError("Unknown signal requested ("+signal+")") + raise KeyError("Unknown training signal requested ("+signal+")") + if test not in signalMap: + raise KeyError("Unknown test signal requested ("+test+")") if fraction >= 1.0: fraction = 1.0 if fraction < 0.0: @@ -62,43 +64,40 @@ def StopDataLoader(path, features, selection="", treename="bdttree", suffix="", if "weight" not in features: features.append("weight") - - sigDev = None sigVal = None - for sigName in signalMap[signal]: - stopM = int(sigName[10:13]) + + + for sigName_test in signalMap[test]: tmp = root_numpy.root2array( - path + "/train/" + sigName + suffix + ".root", + path + "test/" + sigName_test + suffix + ".root", treename=treename, selection=selection, branches=features ) if fraction < 1.0: tmp = tmp[:int(len(tmp)*fraction)] - if sigDev is None: - sigDev = pandas.DataFrame(tmp) - sigDev["stopM"] = stopM + if sigVal is None: + sigVal = pandas.DataFrame(tmp) else: - tmp2 = pandas.DataFrame(tmp) - tmp2["stopM"] = stopM - sigDev = sigDev.append(tmp2, ignore_index=True) + sigVal = sigVal.append(pandas.DataFrame(tmp), ignore_index=True) + + for sigName in signalMap[signal]: tmp = root_numpy.root2array( - path + "/test/" + sigName + suffix + ".root", + path + "train/" + sigName + suffix + ".root", treename=treename, selection=selection, branches=features ) if fraction < 1.0: tmp = tmp[:int(len(tmp)*fraction)] - if sigVal is None: - sigVal = pandas.DataFrame(tmp) - sigVal["stopM"] = stopM + if sigDev is None: + sigDev = pandas.DataFrame(tmp) else: - tmp2 = pandas.DataFrame(tmp) - tmp2["stopM"] = stopM - sigVal = sigVal.append(tmp2, ignore_index=True) + sigDev = sigDev.append(pandas.DataFrame(tmp), ignore_index=True) + + bkgDev = None bkgVal = None @@ -138,9 +137,6 @@ def StopDataLoader(path, features, selection="", treename="bdttree", suffix="", bkgDev["sampleWeight"] = 1 bkgVal["sampleWeight"] = 1 - bkgDev["stopM"] = -1 - bkgVal["stopM"] = -1 - if fraction < 1.0: sigDev.weight = sigDev.weight/fraction sigVal.weight = sigVal.weight/fraction @@ -163,3 +159,4 @@ def StopDataLoader(path, features, selection="", treename="bdttree", suffix="", val = val.append(bkgVal.copy(), ignore_index=True) return dev, val + diff --git a/trainNN.py b/trainNN.py index 49015d4..2637d99 100644 --- a/trainNN.py +++ b/trainNN.py @@ -15,7 +15,7 @@ import localConfig as cfg from commonFunctions import StopDataLoader -myFeatures = ["Jet1Pt", "Met", "Njet", "LepPt", "LepEta", "LepChg", "HT", "NbLoose"] +myFeatures = ["Jet1Pt", "Met", "mt", "LepPt", "LepEta", "LepChg", "HT", "NbLoose","Njet", "JetHBpt", "DrJetHBLep", "JetHBCSV"] inputBranches = list(myFeatures) inputBranches.append("XS") inputBranches.append("weight") @@ -24,7 +24,7 @@ luminosity = 35866 print "Loading datasets..." 
-dataDev, dataVal = StopDataLoader(cfg.loc, inputBranches, selection=preselection, suffix=suffix, signal="300_270") +dataDev, dataVal = StopDataLoader(cfg.loc, inputBranches, selection=preselection, suffix=suffix, signal="DM30", test="550_520") #print dataDev.describe() #print dataVal.describe() data = dataDev.copy() @@ -59,18 +59,17 @@ scalerfile = 'scaler.sav' joblib.dump(scaler, scalerfile) - compileArgs = {'loss': 'binary_crossentropy', 'optimizer': 'adam', 'metrics': ["accuracy"]} -trainParams = {'epochs': 2, 'batch_size': 20, 'verbose': 1} +trainParams = {'epochs': 25, 'batch_size': 20, 'verbose': 1} learning_rate = 0.001/5.0 myAdam = Adam(lr=learning_rate) compileArgs['optimizer'] = myAdam def getDefinedClassifier(nIn, nOut, compileArgs): model = Sequential() - model.add(Dense(16, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) + model.add(Dense(7, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) #model.add(Dropout(0.2)) - model.add(Dense(10, kernel_initializer='he_normal', activation='relu')) + #model.add(Dense(10, kernel_initializer='he_normal', activation='relu')) #model.add(Dropout(0.2)) model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal')) model.compile(**compileArgs) @@ -94,26 +93,33 @@ def getSELUClassifier(nIn, nOut, compileArgs): print XVal.shape print XVal.dtype + + print("Starting the training") start = time.time() +call = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-7, patience=5, verbose=1, mode='auto') model = getDefinedClassifier(len(trainFeatures), 1, compileArgs) #model = getSELUClassifier(len(trainFeatures), 1, compileArgs) -history = model.fit(XDev, YDev, validation_data=(XVal,YVal,weightVal), sample_weight=weightDev, **trainParams) +history = model.fit(XDev, YDev, validation_data=(XVal,YVal,weightVal), sample_weight=weightDev,callbacks=[call], **trainParams) print("Training took ", time.time()-start, " seconds") -name = "myNN" +# To save: +name = "myNN_DM30" model.save(name+".h5") -#model_json = model.to_json() -#with open(name + ".json", "w") as json_file: -# json_file.write(model_json) -#model.save_weights(name + ".h5") - -## To load: -#from keras.models import model_from_json -#with open('model.json', 'r') as json_file: -# loaded_model_json = json_file.read() -#loaded_model = model_from_json(loaded_model_json) -#loaded_model.load_weights("model.h5") +model_json = model.to_json() +with open(name + ".json", "w") as json_file: + json_file.write(model_json) +model.save_weights(name + ".h5") + +# To load: +''' +from keras.models import model_from_json +with open(name + '.json', 'r') as json_file: + loaded_model_json = json_file.read() +model = model_from_json(loaded_model_json) +model.load_weights(name+".h5") +model.compile(loss = 'binary_crossentropy', optimizer = 'adam') +''' print("Getting predictions") devPredict = model.predict(XDev) @@ -259,6 +265,3 @@ def FullFOM(sIn, bIn, fValue=0.2): print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) cvscores.append(scores[1] * 100) print("%.2f%% (+/- %.2f%%)" % (numpy.mean(cvscores), numpy.std(cvscores))) - - - From 859fa6569ee1f402d9c12508cba7a57c2f5c4039 Mon Sep 17 00:00:00 2001 From: Diogo de Bastos Date: Tue, 22 Aug 2017 18:15:18 +0100 Subject: [PATCH 15/22] Maximize FOM --- trainNN.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/trainNN.py b/trainNN.py index 2637d99..f0dac77 100644 --- a/trainNN.py +++ b/trainNN.py @@ -230,12 +230,38 @@ def FullFOM(sIn, bIn, fValue=0.2): fomEvo = [] fomCut = 
[] + +bkgEff = [] +sigEff = [] + +sig_Init = dataVal[dataVal.category == 1].weight.sum() * 35866 * 2 +bkg_Init = dataVal[dataVal.category == 0].weight.sum() * 35866 * 2 + for cut in np.arange(0.0, 0.9999999, 0.001): sig, bkg = getYields(dataVal, cut=cut) if sig[0] > 0 and bkg[0] > 0: fom, fomUnc = FullFOM(sig, bkg) fomEvo.append(fom) fomCut.append(cut) + +max_FOM=0 + +print "Maximizing FOM" +for k in fomEvo: + if k>max_FOM: + max_FOM=k + +Eff = zip(bkgEff, sigEff) + +km_value=ks_2samp((sig_dataDev["NN"].append(bkg_dataDev["NN"])),(sig_dataVal["NN"].append(bkg_dataVal["NN"]))) + +print "Layers:", y +print "Neurons:", x +print "Cohen Kappa score:", cohen_kappa +print "Maximized FOM:", max_FOM +print "FOM Cut:", fomCut[fomEvo.index(max_FOM)] +print "KS test statistic:", km_value[0] +print "KS test p-value:", km_value[1] plt.plot(fomCut, fomEvo) plt.title("FOM") From 90f8e9752426f787b95d0df96339ef1810459e58 Mon Sep 17 00:00:00 2001 From: diogodebastos Date: Wed, 23 Aug 2017 11:51:20 +0100 Subject: [PATCH 16/22] Separated DATA preparation from training and fixed gridsearch --- gridSearch.py | 22 +++++++++---- prepareDATA.py | 55 +++++++++++++++++++++++++++++++++ trainNN.py | 84 ++++++++++---------------------------------------- 3 files changed, 88 insertions(+), 73 deletions(-) create mode 100644 prepareDATA.py diff --git a/gridSearch.py b/gridSearch.py index 0fa70e6..4ff3777 100644 --- a/gridSearch.py +++ b/gridSearch.py @@ -1,12 +1,20 @@ +import numpy from sklearn.model_selection import GridSearchCV from keras.wrappers.scikit_learn import KerasClassifier +import time +from keras.models import Sequential +from keras.layers import Dense, Dropout, AlphaDropout +from keras.wrappers.scikit_learn import KerasClassifier +from keras.constraints import maxnorm -import trainNN +from prepareDATA import * -trainFeatures = trainNN.trainFeatures -compileArgs = trainNN.compileArgs +''' XDev = trainNN.XDev YDev = trainNN.YDev +''' +compileArgs = {'loss': 'binary_crossentropy', 'optimizer': 'adam', 'metrics': ["accuracy"]} +trainParams = {'epochs': 2, 'batch_size': 10, 'verbose': 1} # Fix seed for reproducibility seed = 42 @@ -20,13 +28,15 @@ def myClassifier(nIn=len(trainFeatures), nOut=1, compileArgs=compileArgs, layers model.add(Dense(neurons, kernel_initializer='he_normal', activation='relu')) model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal')) model.compile(**compileArgs) + print("Training with %i layers and %i neurons" % (layers, neurons)) return model model = KerasClassifier(build_fn=myClassifier, epochs = 2, batch_size = 20, verbose = 1) -neurons = [1, 5, 10, 15, 20, 25] -layers = [1,2,3,4,5] +neurons = [10, 12, 14] +layers = [1,2] +#layers = [1,2,3,4,5] param_grid = dict(neurons=neurons, layers=layers) grid = GridSearchCV(estimator = model, param_grid = param_grid, n_jobs=-1) #n_jobs = -1 -> Total number of CPU/GPU cores print("Starting the training") @@ -38,5 +48,5 @@ def myClassifier(nIn=len(trainFeatures), nOut=1, compileArgs=compileArgs, layers means = grid_result.cv_results_['mean_test_score'] stds = grid_result.cv_results_['std_test_score'] params = grid_result.cv_results_['params'] -for mean, stdev, param in (zipmeans, stds, params): +for mean, stdev, param in zip(means, stds, params): print("%f (%f) with: %r" % (mean, stdev, param)) diff --git a/prepareDATA.py b/prepareDATA.py new file mode 100644 index 0000000..46cb30e --- /dev/null +++ b/prepareDATA.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python + +import root_numpy +import numpy as np +from sklearn.preprocessing 
import StandardScaler +from sklearn.externals import joblib + +import localConfig as cfg +from commonFunctions import StopDataLoader + +myFeatures = ["Jet1Pt", "Met", "mt", "LepPt", "LepEta", "LepChg", "HT", "NbLoose","Njet", "JetHBpt", "DrJetHBLep", "JetHBCSV"] +inputBranches = list(myFeatures) +inputBranches.append("XS") +inputBranches.append("weight") +preselection = "(DPhiJet1Jet2 < 2.5 || Jet2Pt < 60) && (Met > 280) && (HT > 200) && (isTight == 1) && (Jet1Pt > 110)" +suffix = "_skimmed" +luminosity = 35866 + +print "Loading datasets..." +dataDev, dataVal = StopDataLoader(cfg.loc, inputBranches, selection=preselection, suffix=suffix, signal="DM30", test="550_520") +#print dataDev.describe() +#print dataVal.describe() +data = dataDev.copy() +data = data.append(dataVal.copy(), ignore_index=True) +print 'Datasets contain a total of', len(data), '(', data.weight.sum()*luminosity, 'weighted) events:' +print ' Development (train):', len(dataDev), '(', dataDev.weight.sum()*luminosity, 'weighted)' +print ' Signal:', len(dataDev[dataDev.category == 1]), '(', dataDev[dataDev.category == 1].weight.sum()*luminosity, 'weighted)' +print ' Background:', len(dataDev[dataDev.category == 0]), '(', dataDev[dataDev.category == 0].weight.sum()*luminosity, 'weighted)' +print ' Validation (test):', len(dataVal), '(', dataVal.weight.sum()*luminosity, 'weighted)' +print ' Signal:', len(dataVal[dataVal.category == 1]), '(', dataVal[dataVal.category == 1].weight.sum()*luminosity, 'weighted)' +print ' Background:', len(dataVal[dataVal.category == 0]), '(', dataVal[dataVal.category == 0].weight.sum()*luminosity, 'weighted)' + +print 'Finding features of interest' +trainFeatures = [var for var in data.columns if var in myFeatures] +otherFeatures = [var for var in data.columns if var not in trainFeatures] + +###################################### + +print "Preparing the data for the NN" +XDev = dataDev.ix[:,0:len(trainFeatures)] +XVal = dataVal.ix[:,0:len(trainFeatures)] +YDev = np.ravel(dataDev.category) +YVal = np.ravel(dataVal.category) +weightDev = np.ravel(dataDev.sampleWeight) +weightVal = np.ravel(dataVal.sampleWeight) + +print "Fitting the scaler and scaling the input variables" +scaler = StandardScaler().fit(XDev) +XDev = scaler.transform(XDev) +XVal = scaler.transform(XVal) + +scalerfile = 'scaler.sav' +joblib.dump(scaler, scalerfile) + +print "DATA is ready!" diff --git a/trainNN.py b/trainNN.py index f0dac77..837cae7 100644 --- a/trainNN.py +++ b/trainNN.py @@ -1,66 +1,16 @@ -#!/usr/bin/env python - -import root_numpy -import numpy as np -import pandas -import keras +from keras.optimizers import Adam import time -from sklearn.externals import joblib -from sklearn.preprocessing import StandardScaler +import keras +import pandas from keras.models import Sequential from keras.layers import Dense, Dropout, AlphaDropout -from keras.optimizers import Adam from sklearn.metrics import confusion_matrix, cohen_kappa_score +#from scipy.stats import ks_2samp -import localConfig as cfg -from commonFunctions import StopDataLoader - -myFeatures = ["Jet1Pt", "Met", "mt", "LepPt", "LepEta", "LepChg", "HT", "NbLoose","Njet", "JetHBpt", "DrJetHBLep", "JetHBCSV"] -inputBranches = list(myFeatures) -inputBranches.append("XS") -inputBranches.append("weight") -preselection = "(DPhiJet1Jet2 < 2.5 || Jet2Pt < 60) && (Met > 280) && (HT > 200) && (isTight == 1) && (Jet1Pt > 110)" -suffix = "_skimmed" -luminosity = 35866 - -print "Loading datasets..." 
-dataDev, dataVal = StopDataLoader(cfg.loc, inputBranches, selection=preselection, suffix=suffix, signal="DM30", test="550_520") -#print dataDev.describe() -#print dataVal.describe() -data = dataDev.copy() -data = data.append(dataVal.copy(), ignore_index=True) -print 'Datasets contain a total of', len(data), '(', data.weight.sum()*luminosity, 'weighted) events:' -print ' Development (train):', len(dataDev), '(', dataDev.weight.sum()*luminosity, 'weighted)' -print ' Signal:', len(dataDev[dataDev.category == 1]), '(', dataDev[dataDev.category == 1].weight.sum()*luminosity, 'weighted)' -print ' Background:', len(dataDev[dataDev.category == 0]), '(', dataDev[dataDev.category == 0].weight.sum()*luminosity, 'weighted)' -print ' Validation (test):', len(dataVal), '(', dataVal.weight.sum()*luminosity, 'weighted)' -print ' Signal:', len(dataVal[dataVal.category == 1]), '(', dataVal[dataVal.category == 1].weight.sum()*luminosity, 'weighted)' -print ' Background:', len(dataVal[dataVal.category == 0]), '(', dataVal[dataVal.category == 0].weight.sum()*luminosity, 'weighted)' - -print 'Finding features of interest' -trainFeatures = [var for var in data.columns if var in myFeatures] -otherFeatures = [var for var in data.columns if var not in trainFeatures] - -###################################### - -print "Preparing the data for the NN" -XDev = dataDev.ix[:,0:len(trainFeatures)] -XVal = dataVal.ix[:,0:len(trainFeatures)] -YDev = np.ravel(dataDev.category) -YVal = np.ravel(dataVal.category) -weightDev = np.ravel(dataDev.sampleWeight) -weightVal = np.ravel(dataVal.sampleWeight) - -print("Fitting the scaler and scaling the input variables") -scaler = StandardScaler().fit(XDev) -XDev = scaler.transform(XDev) -XVal = scaler.transform(XVal) - -scalerfile = 'scaler.sav' -joblib.dump(scaler, scalerfile) +from prepareDATA import * compileArgs = {'loss': 'binary_crossentropy', 'optimizer': 'adam', 'metrics': ["accuracy"]} -trainParams = {'epochs': 25, 'batch_size': 20, 'verbose': 1} +trainParams = {'epochs': 2, 'batch_size': 20, 'verbose': 1} learning_rate = 0.001/5.0 myAdam = Adam(lr=learning_rate) compileArgs['optimizer'] = myAdam @@ -206,6 +156,10 @@ def FullFOM(sIn, bIn, fValue=0.2): ######################################################### +fomEvo = [] +fomCut = [] + +bkgEff = [] # Let's repeat the above, but monitor the evolution of the loss function import matplotlib.pyplot as plt @@ -228,10 +182,6 @@ def FullFOM(sIn, bIn, fValue=0.2): plt.legend(['train', 'test'], loc='upper left') plt.show() -fomEvo = [] -fomCut = [] - -bkgEff = [] sigEff = [] sig_Init = dataVal[dataVal.category == 1].weight.sum() * 35866 * 2 @@ -253,15 +203,15 @@ def FullFOM(sIn, bIn, fValue=0.2): Eff = zip(bkgEff, sigEff) -km_value=ks_2samp((sig_dataDev["NN"].append(bkg_dataDev["NN"])),(sig_dataVal["NN"].append(bkg_dataVal["NN"]))) - -print "Layers:", y -print "Neurons:", x -print "Cohen Kappa score:", cohen_kappa +#km_value=ks_2samp((sig_dataDev["NN"].append(bkg_dataDev["NN"])),(sig_dataVal["NN"].append(bkg_dataVal["NN"]))) + +#print "Layers:", y +#print "Neurons:", x +#print "Cohen Kappa score:", cohen_kappa print "Maximized FOM:", max_FOM print "FOM Cut:", fomCut[fomEvo.index(max_FOM)] -print "KS test statistic:", km_value[0] -print "KS test p-value:", km_value[1] +#print "KS test statistic:", km_value[0] +#print "KS test p-value:", km_value[1] plt.plot(fomCut, fomEvo) plt.title("FOM") From c925984c03dd9d603f25fc222df4af02eeda65f7 Mon Sep 17 00:00:00 2001 From: diogodebastos Date: Wed, 23 Aug 2017 14:24:16 +0100 Subject: [PATCH 
17/22] optimize trainNN --- .gitignore | 2 + gridSearch.py | 13 +++---- prepareDATA.py | 26 +++++++------ trainNN.py | 102 ++++++++++++++++++++++++++----------------------- 4 files changed, 76 insertions(+), 67 deletions(-) diff --git a/.gitignore b/.gitignore index 3ca082c..f2ae9b4 100644 --- a/.gitignore +++ b/.gitignore @@ -7,5 +7,7 @@ keras-tf/* *.png *.json *.pyc +*.txt +*.sav scaler.sav localConfig.py diff --git a/gridSearch.py b/gridSearch.py index 4ff3777..bd0541b 100644 --- a/gridSearch.py +++ b/gridSearch.py @@ -1,3 +1,5 @@ +import os +os.environ['TF_CPP_MIN_LOG_LEVEL']='2' import numpy from sklearn.model_selection import GridSearchCV from keras.wrappers.scikit_learn import KerasClassifier @@ -9,12 +11,7 @@ from prepareDATA import * -''' -XDev = trainNN.XDev -YDev = trainNN.YDev -''' compileArgs = {'loss': 'binary_crossentropy', 'optimizer': 'adam', 'metrics': ["accuracy"]} -trainParams = {'epochs': 2, 'batch_size': 10, 'verbose': 1} # Fix seed for reproducibility seed = 42 @@ -32,10 +29,10 @@ def myClassifier(nIn=len(trainFeatures), nOut=1, compileArgs=compileArgs, layers return model -model = KerasClassifier(build_fn=myClassifier, epochs = 2, batch_size = 20, verbose = 1) +model = KerasClassifier(build_fn=myClassifier, epochs = 10, batch_size = 20, verbose = 1) -neurons = [10, 12, 14] -layers = [1,2] +neurons = [10,12,14,16] +layers = [1,2,3] #layers = [1,2,3,4,5] param_grid = dict(neurons=neurons, layers=layers) grid = GridSearchCV(estimator = model, param_grid = param_grid, n_jobs=-1) #n_jobs = -1 -> Total number of CPU/GPU cores diff --git a/prepareDATA.py b/prepareDATA.py index 46cb30e..b6fdc51 100644 --- a/prepareDATA.py +++ b/prepareDATA.py @@ -15,27 +15,31 @@ preselection = "(DPhiJet1Jet2 < 2.5 || Jet2Pt < 60) && (Met > 280) && (HT > 200) && (isTight == 1) && (Jet1Pt > 110)" suffix = "_skimmed" luminosity = 35866 +number_of_events_print = 0 +test_point = "550_520" +train_DM = "DM30" print "Loading datasets..." 
-dataDev, dataVal = StopDataLoader(cfg.loc, inputBranches, selection=preselection, suffix=suffix, signal="DM30", test="550_520")
+dataDev, dataVal = StopDataLoader(cfg.loc, inputBranches, selection=preselection, suffix=suffix, signal=train_DM, test=test_point)
 #print dataDev.describe()
 #print dataVal.describe()
 data = dataDev.copy()
 data = data.append(dataVal.copy(), ignore_index=True)
-print 'Datasets contain a total of', len(data), '(', data.weight.sum()*luminosity, 'weighted) events:'
-print '  Development (train):', len(dataDev), '(', dataDev.weight.sum()*luminosity, 'weighted)'
-print '    Signal:', len(dataDev[dataDev.category == 1]), '(', dataDev[dataDev.category == 1].weight.sum()*luminosity, 'weighted)'
-print '    Background:', len(dataDev[dataDev.category == 0]), '(', dataDev[dataDev.category == 0].weight.sum()*luminosity, 'weighted)'
-print '  Validation (test):', len(dataVal), '(', dataVal.weight.sum()*luminosity, 'weighted)'
-print '    Signal:', len(dataVal[dataVal.category == 1]), '(', dataVal[dataVal.category == 1].weight.sum()*luminosity, 'weighted)'
-print '    Background:', len(dataVal[dataVal.category == 0]), '(', dataVal[dataVal.category == 0].weight.sum()*luminosity, 'weighted)'
+
+if number_of_events_print == 1:
+  print 'Datasets contain a total of', len(data), '(', data.weight.sum()*luminosity, 'weighted) events:'
+  print '  Development (train):', len(dataDev), '(', dataDev.weight.sum()*luminosity, 'weighted)'
+  print '    Signal:', len(dataDev[dataDev.category == 1]), '(', dataDev[dataDev.category == 1].weight.sum()*luminosity, 'weighted)'
+  print '    Background:', len(dataDev[dataDev.category == 0]), '(', dataDev[dataDev.category == 0].weight.sum()*luminosity, 'weighted)'
+  print '  Validation (test):', len(dataVal), '(', dataVal.weight.sum()*luminosity, 'weighted)'
+  print '    Signal:', len(dataVal[dataVal.category == 1]), '(', dataVal[dataVal.category == 1].weight.sum()*luminosity, 'weighted)'
+  print '    Background:', len(dataVal[dataVal.category == 0]), '(', dataVal[dataVal.category == 0].weight.sum()*luminosity, 'weighted)'

 print 'Finding features of interest'
 trainFeatures = [var for var in data.columns if var in myFeatures]
 otherFeatures = [var for var in data.columns if var not in trainFeatures]

-######################################
-
 print "Preparing the data for the NN"
 XDev = dataDev.ix[:,0:len(trainFeatures)]
 XVal = dataVal.ix[:,0:len(trainFeatures)]
@@ -49,7 +53,7 @@
 XDev = scaler.transform(XDev)
 XVal = scaler.transform(XVal)

-scalerfile = 'scaler.sav'
+scalerfile = 'scaler_'+train_DM+'.sav'
 joblib.dump(scaler, scalerfile)

 print "DATA is ready!"
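A minimal sketch (not part of the patch) of how a separate evaluation script
could reload the scaler saved above; it assumes the same train_DM value, a
scikit-learn version compatible with the one that wrote the pickle, and XNew
standing in for a new feature matrix:

    from sklearn.externals import joblib
    scaler = joblib.load('scaler_' + train_DM + '.sav')
    XNew = scaler.transform(XNew)  # apply the training-time scaling to new events

Reusing the fitted StandardScaler matters: the network only ever sees inputs
in the scaled space, so standardising new events with their own mean and
width would silently shift the NN output.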
diff --git a/trainNN.py b/trainNN.py index 837cae7..94f9638 100644 --- a/trainNN.py +++ b/trainNN.py @@ -1,3 +1,5 @@ +import os +os.environ['TF_CPP_MIN_LOG_LEVEL']='2' from keras.optimizers import Adam import time import keras @@ -7,34 +9,50 @@ from sklearn.metrics import confusion_matrix, cohen_kappa_score #from scipy.stats import ks_2samp -from prepareDATA import * +from prepareDATA import * + +n_neurons = 10 +n_layers = 2 +n_epochs = 10 compileArgs = {'loss': 'binary_crossentropy', 'optimizer': 'adam', 'metrics': ["accuracy"]} -trainParams = {'epochs': 2, 'batch_size': 20, 'verbose': 1} +trainParams = {'epochs': n_epochs, 'batch_size': 20, 'verbose': 1} learning_rate = 0.001/5.0 myAdam = Adam(lr=learning_rate) compileArgs['optimizer'] = myAdam -def getDefinedClassifier(nIn, nOut, compileArgs): - model = Sequential() - model.add(Dense(7, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) - #model.add(Dropout(0.2)) - #model.add(Dense(10, kernel_initializer='he_normal', activation='relu')) - #model.add(Dropout(0.2)) - model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal')) - model.compile(**compileArgs) - return model +def FOM1(sIn, bIn): + s, sErr = sIn + b, bErr = bIn + fom = s / (b**0.5) + fomErr = ((sErr / (b**0.5))**2+(bErr*s / (2*(b)**(1.5)) )**2)**0.5 + return (fom, fomErr) + +def FOM2(sIn, bIn): + s, sErr = sIn + b, bErr = bIn + fom = s / ((s+b)**0.5) + fomErr = ((sErr*(2*b + s)/(2*(b + s)**1.5))**2 + (bErr * s / (2*(b + s)**1.5))**2)**0.5 + return (fom, fomErr) + +def FullFOM(sIn, bIn, fValue=0.2): + from math import log + s, sErr = sIn + b, bErr = bIn + fomErr = 0.0 # Add the computation of the uncertainty later + fomA = 2*(s+b)*log(((s+b)*(b + (fValue*b)**2))/(b**2 + (s + b) * (fValue*b)**2)) + fomB = log(1 + (s*b*b*fValue*fValue)/(b*(b+(fValue*b)**2)))/(fValue**2) + fom = (fomA - fomB)**0.5 + return (fom, fomErr) + +print "Opening file" +f = open("DATA_"+test_point+'.txt','w') -def getSELUClassifier(nIn, nOut, compileArgs): +def getDefinedClassifier(nIn, nOut, compileArgs, neurons, layers): model = Sequential() - model.add(Dense(16, input_dim=nIn, kernel_initializer='he_normal', activation='selu')) - model.add(AlphaDropout(0.2)) - model.add(Dense(32, kernel_initializer='he_normal', activation='selu')) - model.add(AlphaDropout(0.2)) - model.add(Dense(32, kernel_initializer='he_normal', activation='selu')) - model.add(AlphaDropout(0.2)) - model.add(Dense(32, kernel_initializer='he_normal', activation='selu')) - model.add(AlphaDropout(0.2)) + model.add(Dense(nIn, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) + for i in range(0,layers): + model.add(Dense(neurons, kernel_initializer='he_normal', activation='relu')) model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal')) model.compile(**compileArgs) return model @@ -43,12 +61,10 @@ def getSELUClassifier(nIn, nOut, compileArgs): print XVal.shape print XVal.dtype - - print("Starting the training") start = time.time() call = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-7, patience=5, verbose=1, mode='auto') -model = getDefinedClassifier(len(trainFeatures), 1, compileArgs) +model = getDefinedClassifier(len( trainFeatures), 1, compileArgs, n_neurons, n_layers) #model = getSELUClassifier(len(trainFeatures), 1, compileArgs) history = model.fit(XDev, YDev, validation_data=(XVal,YVal,weightVal), sample_weight=weightDev,callbacks=[call], **trainParams) print("Training took ", time.time()-start, " seconds") @@ -123,30 +139,6 @@ def 
getYields(dataVal, cut=0.5, luminosity=35866, splitFactor=2): print "Signal:", sigYield, "+-", sigYieldUnc print "Background:", bkgYield, "+-", bkgYieldUnc -def FOM1(sIn, bIn): - s, sErr = sIn - b, bErr = bIn - fom = s / (b**0.5) - fomErr = ((sErr / (b**0.5))**2+(bErr*s / (2*(b)**(1.5)) )**2)**0.5 - return (fom, fomErr) - -def FOM2(sIn, bIn): - s, sErr = sIn - b, bErr = bIn - fom = s / ((s+b)**0.5) - fomErr = ((sErr*(2*b + s)/(2*(b + s)**1.5))**2 + (bErr * s / (2*(b + s)**1.5))**2)**0.5 - return (fom, fomErr) - -def FullFOM(sIn, bIn, fValue=0.2): - from math import log - s, sErr = sIn - b, bErr = bIn - fomErr = 0.0 # Add the computation of the uncertainty later - fomA = 2*(s+b)*log(((s+b)*(b + (fValue*b)**2))/(b**2 + (s + b) * (fValue*b)**2)) - fomB = log(1 + (s*b*b*fValue*fValue)/(b*(b+(fValue*b)**2)))/(fValue**2) - fom = (fomA - fomB)**0.5 - return (fom, fomErr) - print "Basic FOM (s/SQRT(b)):", FOM1((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) print "Basic FOM (s/SQRT(s+b)):", FOM2((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) print "Full FOM:", FullFOM((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) @@ -193,7 +185,7 @@ def FullFOM(sIn, bIn, fValue=0.2): fom, fomUnc = FullFOM(sig, bkg) fomEvo.append(fom) fomCut.append(cut) - + max_FOM=0 print "Maximizing FOM" @@ -220,6 +212,20 @@ def FullFOM(sIn, bIn, fValue=0.2): plt.legend(['test'], loc='upper left') plt.show() + +selectedVal = dataVal[dataVal.NN>fomCut[fomEvo.index(max_FOM)]] +selectedSig = selectedVal[selectedVal.category == 1] +selectedBkg = selectedVal[selectedVal.category == 0] +sigYield = selectedSig.weight.sum() +bkgYield = selectedBkg.weight.sum() +sigYield = sigYield * luminosity * 2 #The factor 2 comes from the splitting +bkgYield = bkgYield * luminosity * 2 + +print fomCut[fomEvo.index(max_FOM)] +print "Number of selected Signal Events:", len(selectedSig) +print 'Number of selected Background Events:', len(selectedBkg) +print "Sig Yield", sigYield +print "Bkg Yield", bkgYield #print fomEvo sys.exit("Done!") From e74c1ebe2964462219d68c7a321ecfebea6d7de0 Mon Sep 17 00:00:00 2001 From: g-marques Date: Wed, 23 Aug 2017 14:24:17 +0100 Subject: [PATCH 18/22] NN testing software --- testNN.py | 261 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 261 insertions(+) create mode 100644 testNN.py diff --git a/testNN.py b/testNN.py new file mode 100644 index 0000000..6d0fe69 --- /dev/null +++ b/testNN.py @@ -0,0 +1,261 @@ +import root_numpy +import numpy as np +import pandas +import keras +import time +from sklearn.externals import joblib +from sklearn.preprocessing import StandardScaler +from keras.models import Sequential +from keras.layers import Dense, Dropout, AlphaDropout +from keras.optimizers import Adam +from sklearn.metrics import confusion_matrix, cohen_kappa_score +from scipy.stats import ks_2samp +import matplotlib.pyplot as plt +import localConfig as cfg +from commonFunctions import StopDataLoader +from commonFunctions import StopNNGraphics +import sys +from math import log +from keras.models import model_from_json + +myFeatures = ["LepPt", "LepEta", "LepChg", "Met", "Jet1Pt", "HT", "NbLoose", "Njet", "JetHBpt", "DrJetHBLep", "JetHBCSV", "mt"] +inputBranches = list(myFeatures) +inputBranches.append("XS") +inputBranches.append("weight") +preselection = "(DPhiJet1Jet2 < 2.5 || Jet2Pt < 60) && (Met > 280) && (HT > 200) && (isTight == 1) && (Jet1Pt > 110)" +suffix = "_skimmed" +luminosity = 35866 +number_of_events_print = 0 +test_point = "550_520" +train_DM = "DM30" +model_name = 
"myNN_test_2_15_DM30" + +print "Loading datasets..." +dataDev, dataVal = StopDataLoader(cfg.loc, inputBranches, selection=preselection, suffix=suffix, signal=train_DM, test=test_point) + +if number_of_events_print == 1: + np_dataDev, np_dataVal = StopDataLoader(cfg.loc, inputBranches, suffix=suffix, signal=train_DM, test=test_point) # + print " ==> BEFORE PRE-SELECTION:" + print " Train Signal Events:", len(np_dataDev[np_dataDev.category==1]) + print " Train Background Events:",len(np_dataDev[np_dataDev.category==0]) + print " Test Signal Events:", len(np_dataVal[np_dataVal.category==1]) + print " Test Background Events:", len(np_dataVal[np_dataVal.category==0]) + print "" + print " ==> AFTER PRE-SELECTION:" + print " Train Signal Events:", len(dataDev[dataDev.category==1]) + print " Train Background Events:",len(dataDev[dataDev.category==0]) + print " Test Signal Events:", len(dataVal[dataVal.category==1]) + print " Test Background Events:", len(dataVal[dataVal.category==0]) + +data = dataDev.copy() +data = data.append(dataVal.copy(), ignore_index=True) + +print 'Finding features of interest' +trainFeatures = [var for var in data.columns if var in myFeatures] +otherFeatures = [var for var in data.columns if var not in trainFeatures] + +print "Preparing the data for the NN" +XDev = dataDev.ix[:,0:len(trainFeatures)] # PORQUe X E Y??? EQUIVALENTE A SIG E BACK? +XVal = dataVal.ix[:,0:len(trainFeatures)] +YDev = np.ravel(dataDev.category) +YVal = np.ravel(dataVal.category) +weightDev = np.ravel(dataDev.sampleWeight) +weightVal = np.ravel(dataVal.sampleWeight) + +print("Fitting the scaler and scaling the input variables") +scaler = StandardScaler().fit(XDev) +XDev = scaler.transform(XDev) +XVal = scaler.transform(XVal) + +scalerfile = 'scaler_'+train_DM+'.sav' +joblib.dump(scaler, scalerfile) + +## DEFINITIONS +def FOM1(sIn, bIn): + s, sErr = sIn + b, bErr = bIn + fom = s / (b**0.5) + fomErr = ((sErr / (b**0.5))**2+(bErr*s / (2*(b)**(1.5)) )**2)**0.5 + return (fom, fomErr) + +def FOM2(sIn, bIn): + s, sErr = sIn + b, bErr = bIn + fom = s / ((s+b)**0.5) + fomErr = ((sErr*(2*b + s)/(2*(b + s)**1.5))**2 + (bErr * s / (2*(b + s)**1.5))**2)**0.5 + return (fom, fomErr) + +def FullFOM(sIn, bIn, fValue=0.2): + s, sErr = sIn + b, bErr = bIn + fomErr = 0.0 # Add the computation of the uncertainty later + fomA = 2*(s+b)*log(((s+b)*(b + (fValue*b)**2))/(b**2 + (s + b) * (fValue*b)**2)) + fomB = log(1 + (s*b*b*fValue*fValue)/(b*(b+(fValue*b)**2)))/(fValue**2) + fom = (fomA - fomB)**0.5 + return (fom, fomErr) + +print "Loading Model ..." 
+with open(model_name+'.json', 'r') as json_file: + loaded_model_json = json_file.read() +model = model_from_json(loaded_model_json) +model.load_weights(model_name+".h5") +model.compile(loss = 'binary_crossentropy', optimizer = 'adam') + + +print("Getting predictions") +devPredict = model.predict(XDev) +valPredict = model.predict(XVal) + + +print("Getting scores") + +scoreDev = model.evaluate(XDev, YDev, sample_weight=weightDev, verbose = 1) +scoreVal = model.evaluate(XVal, YVal, sample_weight=weightVal, verbose = 1) +print "" +cohen_kappa=cohen_kappa_score(YVal, valPredict.round()) + + +print "Calculating parameters" +dataDev["NN"] = devPredict +dataVal["NN"] = valPredict + + +sig_dataDev=dataDev[dataDev.category==1] +bkg_dataDev=dataDev[dataDev.category==0] +sig_dataVal=dataVal[dataVal.category==1] +bkg_dataVal=dataVal[dataVal.category==0] + + +def getYields(dataVal, cut=0.5, luminosity=35866, splitFactor=2): + #defines the selected test data + selectedVal = dataVal[dataVal.NN>cut] + + #separates the true positives from false negatives + selectedSig = selectedVal[selectedVal.category == 1] + selectedBkg = selectedVal[selectedVal.category == 0] + + sigYield = selectedSig.weight.sum() + sigYieldUnc = np.sqrt(np.sum(np.square(selectedSig.weight))) + bkgYield = selectedBkg.weight.sum() + bkgYieldUnc = np.sqrt(np.sum(np.square(selectedBkg.weight))) + + sigYield = sigYield * luminosity * splitFactor #The factor 2 comes from the splitting + sigYieldUnc = sigYieldUnc * luminosity * splitFactor + bkgYield = bkgYield * luminosity * splitFactor + bkgYieldUnc = bkgYieldUnc * luminosity * splitFactor + + return ((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) + +tmpSig, tmpBkg = getYields(dataVal) +sigYield, sigYieldUnc = tmpSig +bkgYield, bkgYieldUnc = tmpBkg + + +fomEvo = [] +fomCut = [] + +bkgEff = [] +sigEff = [] + +sig_Init = dataVal[dataVal.category == 1].weight.sum() * 35866 * 2 +bkg_Init = dataVal[dataVal.category == 0].weight.sum() * 35866 * 2 + +for cut in np.arange(0.0, 0.9999, 0.001): + sig, bkg = getYields(dataVal, cut=cut, luminosity=luminosity) + if sig[0] > 0 and bkg[0] > 0: + fom, fomUnc = FullFOM(sig, bkg) + fomEvo.append(fom) + fomCut.append(cut) + bkgEff.append(bkg[0]/bkg_Init) + sigEff.append(sig[0]/sig_Init) + +max_FOM=0 + +for k in fomEvo: + if k>max_FOM: + max_FOM=k + +Eff = zip(bkgEff, sigEff) + +km_value=ks_2samp((sig_dataDev["NN"].append(bkg_dataDev["NN"])),(sig_dataVal["NN"].append(bkg_dataVal["NN"]))) + +#f.write(str(y)+"\n") +#print "Layers:", y +#f.write(str(x)+"\n") +#print "Neurons:", x +#f.write(str(cohen_kappa)+"\n") +print "Cohen Kappa score:", cohen_kappa +#f.write(str(max_FOM)+"\n") +print "Maximized FOM:", max_FOM +#f.write(str(fomCut[fomEvo.index(max_FOM)])+"\n") +print "FOM Cut:", fomCut[fomEvo.index(max_FOM)] +#f.write(str(km_value[0])+"\n") +print "KS test statistic:", km_value[0] +#f.write(str(km_value[1])+"\n") +print "KS test p-value:", km_value[1] + + #f.close() + +selectedVal = dataVal[dataVal.NN>fomCut[fomEvo.index(max_FOM)]] +selectedSig = selectedVal[selectedVal.category == 1] +selectedBkg = selectedVal[selectedVal.category == 0] +sigYield = selectedSig.weight.sum() +bkgYield = selectedBkg.weight.sum() +sigYield = sigYield * luminosity * 2 #The factor 2 comes from the splitting +bkgYield = bkgYield * luminosity * 2 + + +print "Selected events left after cut @", fomCut[fomEvo.index(max_FOM)] +print " Number of selected Signal Events:", len(selectedSig) +print " Number of selected Background Events:", len(selectedBkg) +print " Sig Yield", sigYield 
+print " Bkg Yield", bkgYield + +print "Plotting" + +plt.figure(figsize=(7,6)) +plt.hist(sig_dataDev["NN"], 50, facecolor='blue', alpha=0.7, normed=1, weights=sig_dataDev["weight"]) +plt.hist(bkg_dataDev["NN"], 50, facecolor='red', alpha=0.7, normed=1, weights=bkg_dataDev["weight"]) +plt.hist(sig_dataVal["NN"], 50, color='blue', alpha=1, normed=1, weights=sig_dataVal["weight"], histtype="step") +plt.hist(bkg_dataVal["NN"], 50, color='red', alpha=1, normed=1, weights=bkg_dataVal["weight"], histtype="step") +plt.xlabel('NN output') +plt.suptitle("MVA overtraining check for classifier: NN", fontsize=13, fontweight='bold') +plt.title("Cohen's kappa: {0}\nKolmogorov Smirnov test: {1}".format(cohen_kappa, km_value[1]), fontsize=10) +plt.legend(['Signal (Test sample)', 'Background (Test sample)', 'Signal (Train sample)', 'Background (Train sample)\nasdfgh'], loc='upper right') +plt.savefig('hist_'+model_name+'.png', bbox_inches='tight') +#plt.show() + + +both_dataDev=bkg_dataDev.append(sig_dataDev) +plt.figure(figsize=(7,6)) +plt.xlabel('NN output') +plt.title("Number of Events") +#plt.yscale('log', nonposy='clip') +plt.legend(['Background + Signal (test sample)', 'Background (test sample)'], loc="upper left" ) +plt.hist(bkg_dataDev["NN"], 50, facecolor='red', weights=bkg_dataDev["weight"]) +plt.hist(both_dataDev["NN"], 50, color="blue", histtype="step", weights=both_dataDev["weight"]) +plt.savefig('pred_'+model_name+'.png', bbox_inches='tight') +#plt.show() + +plt.figure(figsize=(7,6)) +plt.subplots_adjust(hspace=0.5) +plt.subplot(211) +plt.plot(fomCut, fomEvo) +plt.title("FOM") +plt.ylabel("FOM") +plt.xlabel("ND") +plt.legend(["Max. FOM: {0}".format(max_FOM)], loc='upper left') + + +plt.subplot(212) +plt.semilogy(fomCut, Eff) +plt.axvspan(fomCut[fomEvo.index(max_FOM)], 1, facecolor='#2ca02c', alpha=0.3) +#plt.axvline(x=fomCut[fomEvo.index(max_FOM)], ymin=0, ymax=1) +plt.title("Efficiency") +plt.ylabel("Eff") +plt.xlabel("ND") +plt.legend(['Background', 'Signal'], loc='upper left') +plt.savefig('FOM_'+model_name+'.png', bbox_inches='tight') +#plt.show() + +sys.exit("Done!") \ No newline at end of file From e27516f2ef7517d74ea0556dd3f255c5ede7441c Mon Sep 17 00:00:00 2001 From: diogodebastos Date: Wed, 23 Aug 2017 14:52:45 +0100 Subject: [PATCH 19/22] Added testNN --- commonFunctions.py | 44 ++++++++++++ prepareDATA.py | 16 ++++- testNN.py | 166 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 225 insertions(+), 1 deletion(-) create mode 100644 testNN.py diff --git a/commonFunctions.py b/commonFunctions.py index 25c80a5..1570565 100644 --- a/commonFunctions.py +++ b/commonFunctions.py @@ -1,5 +1,7 @@ import root_numpy import pandas +import numpy as np +from math import log signalMap = { "DM30" : ["T2DegStop_250_220", @@ -160,3 +162,45 @@ def StopDataLoader(path, features, test="550_520", selection="", treename="bdttr return dev, val +def FOM1(sIn, bIn): + s, sErr = sIn + b, bErr = bIn + fom = s / (b**0.5) + fomErr = ((sErr / (b**0.5))**2+(bErr*s / (2*(b)**(1.5)) )**2)**0.5 + return (fom, fomErr) + +def FOM2(sIn, bIn): + s, sErr = sIn + b, bErr = bIn + fom = s / ((s+b)**0.5) + fomErr = ((sErr*(2*b + s)/(2*(b + s)**1.5))**2 + (bErr * s / (2*(b + s)**1.5))**2)**0.5 + return (fom, fomErr) + +def FullFOM(sIn, bIn, fValue=0.2): + s, sErr = sIn + b, bErr = bIn + fomErr = 0.0 # Add the computation of the uncertainty later + fomA = 2*(s+b)*log(((s+b)*(b + (fValue*b)**2))/(b**2 + (s + b) * (fValue*b)**2)) + fomB = log(1 + (s*b*b*fValue*fValue)/(b*(b+(fValue*b)**2)))/(fValue**2) + fom = (fomA 
- fomB)**0.5 + return (fom, fomErr) + +def getYields(dataVal, cut=0.5, luminosity=35866, splitFactor=2): + #defines the selected test data + selectedVal = dataVal[dataVal.NN>cut] + + #separates the true positives from false negatives + selectedSig = selectedVal[selectedVal.category == 1] + selectedBkg = selectedVal[selectedVal.category == 0] + + sigYield = selectedSig.weight.sum() + sigYieldUnc = np.sqrt(np.sum(np.square(selectedSig.weight))) + bkgYield = selectedBkg.weight.sum() + bkgYieldUnc = np.sqrt(np.sum(np.square(selectedBkg.weight))) + + sigYield = sigYield * luminosity * splitFactor #The factor 2 comes from the splitting + sigYieldUnc = sigYieldUnc * luminosity * splitFactor + bkgYield = bkgYield * luminosity * splitFactor + bkgYieldUnc = bkgYieldUnc * luminosity * splitFactor + + return ((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) diff --git a/prepareDATA.py b/prepareDATA.py index b6fdc51..a09b6ed 100644 --- a/prepareDATA.py +++ b/prepareDATA.py @@ -23,7 +23,21 @@ dataDev, dataVal = StopDataLoader(cfg.loc, inputBranches, selection=preselection, suffix=suffix, signal=train_DM, test=test_point) #print dataDev.describe() #print dataVal.describe() - +''' +if number_of_events_print == 1: + np_dataDev, np_dataVal = StopDataLoader(cfg.loc, inputBranches, suffix=suffix, signal=train_DM, test=test_point) # + print " ==> BEFORE PRE-SELECTION:" + print " Train Signal Events:", len(np_dataDev[np_dataDev.category==1]) + print " Train Background Events:",len(np_dataDev[np_dataDev.category==0]) + print " Test Signal Events:", len(np_dataVal[np_dataVal.category==1]) + print " Test Background Events:", len(np_dataVal[np_dataVal.category==0]) + print "" + print " ==> AFTER PRE-SELECTION:" + print " Train Signal Events:", len(dataDev[dataDev.category==1]) + print " Train Background Events:",len(dataDev[dataDev.category==0]) + print " Test Signal Events:", len(dataVal[dataVal.category==1]) + print " Test Background Events:", len(dataVal[dataVal.category==0]) +''' if number_of_events_print == 1: print 'Datasets contain a total of', len(data), '(', data.weight.sum()*luminosity, 'weighted) events:' print ' Development (train):', len(dataDev), '(', dataDev.weight.sum()*luminosity, 'weighted)' diff --git a/testNN.py b/testNN.py new file mode 100644 index 0000000..762b97e --- /dev/null +++ b/testNN.py @@ -0,0 +1,166 @@ +import os +os.environ['TF_CPP_MIN_LOG_LEVEL']='2' +import root_numpy +import numpy as np +import pandas +import keras +import time +from sklearn.externals import joblib +from sklearn.preprocessing import StandardScaler +from keras.models import Sequential +from keras.layers import Dense, Dropout, AlphaDropout +from keras.optimizers import Adam +from sklearn.metrics import confusion_matrix, cohen_kappa_score +from scipy.stats import ks_2samp +import matplotlib.pyplot as plt +import localConfig as cfg +from commonFunctions import StopDataLoader, FOM1, FOM2, FullFOM, getYields +import sys + +from keras.models import model_from_json + +from prepareDATA import * + +model_name = "myNN_DM30" + +print "Loading Model ..." 
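+# model_name must match the name used when the network was saved by
+# trainNN.py; if the .json/.h5 pair is missing, the open() below raises IOError.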
+with open(model_name+'.json', 'r') as json_file: + loaded_model_json = json_file.read() +model = model_from_json(loaded_model_json) +model.load_weights(model_name+".h5") +model.compile(loss = 'binary_crossentropy', optimizer = 'adam') + +print("Getting predictions") +devPredict = model.predict(XDev) +valPredict = model.predict(XVal) + + +print("Getting scores") + +scoreDev = model.evaluate(XDev, YDev, sample_weight=weightDev, verbose = 1) +scoreVal = model.evaluate(XVal, YVal, sample_weight=weightVal, verbose = 1) +print "" +cohen_kappa=cohen_kappa_score(YVal, valPredict.round()) + +print "Calculating parameters" +dataDev["NN"] = devPredict +dataVal["NN"] = valPredict + +sig_dataDev=dataDev[dataDev.category==1] +bkg_dataDev=dataDev[dataDev.category==0] +sig_dataVal=dataVal[dataVal.category==1] +bkg_dataVal=dataVal[dataVal.category==0] + +tmpSig, tmpBkg = getYields(dataVal) +sigYield, sigYieldUnc = tmpSig +bkgYield, bkgYieldUnc = tmpBkg + + +fomEvo = [] +fomCut = [] + +bkgEff = [] +sigEff = [] + +sig_Init = dataVal[dataVal.category == 1].weight.sum() * 35866 * 2 +bkg_Init = dataVal[dataVal.category == 0].weight.sum() * 35866 * 2 + +for cut in np.arange(0.0, 0.9999, 0.001): + sig, bkg = getYields(dataVal, cut=cut, luminosity=luminosity) + if sig[0] > 0 and bkg[0] > 0: + fom, fomUnc = FullFOM(sig, bkg) + fomEvo.append(fom) + fomCut.append(cut) + bkgEff.append(bkg[0]/bkg_Init) + sigEff.append(sig[0]/sig_Init) + +max_FOM=0 + +for k in fomEvo: + if k>max_FOM: + max_FOM=k + +Eff = zip(bkgEff, sigEff) + +km_value=ks_2samp((sig_dataDev["NN"].append(bkg_dataDev["NN"])),(sig_dataVal["NN"].append(bkg_dataVal["NN"]))) + +#f.write(str(y)+"\n") +#print "Layers:", y +#f.write(str(x)+"\n") +#print "Neurons:", x +#f.write(str(cohen_kappa)+"\n") +print "Cohen Kappa score:", cohen_kappa +#f.write(str(max_FOM)+"\n") +print "Maximized FOM:", max_FOM +#f.write(str(fomCut[fomEvo.index(max_FOM)])+"\n") +print "FOM Cut:", fomCut[fomEvo.index(max_FOM)] +#f.write(str(km_value[0])+"\n") +print "KS test statistic:", km_value[0] +#f.write(str(km_value[1])+"\n") +print "KS test p-value:", km_value[1] + + #f.close() + +selectedVal = dataVal[dataVal.NN>fomCut[fomEvo.index(max_FOM)]] +selectedSig = selectedVal[selectedVal.category == 1] +selectedBkg = selectedVal[selectedVal.category == 0] +sigYield = selectedSig.weight.sum() +bkgYield = selectedBkg.weight.sum() +sigYield = sigYield * luminosity * 2 #The factor 2 comes from the splitting +bkgYield = bkgYield * luminosity * 2 + + +print "Selected events left after cut @", fomCut[fomEvo.index(max_FOM)] +print " Number of selected Signal Events:", len(selectedSig) +print " Number of selected Background Events:", len(selectedBkg) +print " Sig Yield", sigYield +print " Bkg Yield", bkgYield + +print "Plotting" + +plt.figure(figsize=(7,6)) +plt.hist(sig_dataDev["NN"], 50, facecolor='blue', alpha=0.7, normed=1, weights=sig_dataDev["weight"]) +plt.hist(bkg_dataDev["NN"], 50, facecolor='red', alpha=0.7, normed=1, weights=bkg_dataDev["weight"]) +plt.hist(sig_dataVal["NN"], 50, color='blue', alpha=1, normed=1, weights=sig_dataVal["weight"], histtype="step") +plt.hist(bkg_dataVal["NN"], 50, color='red', alpha=1, normed=1, weights=bkg_dataVal["weight"], histtype="step") +plt.xlabel('NN output') +plt.suptitle("MVA overtraining check for classifier: NN", fontsize=13, fontweight='bold') +plt.title("Cohen's kappa: {0}\nKolmogorov Smirnov test: {1}".format(cohen_kappa, km_value[1]), fontsize=10) +plt.legend(['Signal (Test sample)', 'Background (Test sample)', 'Signal (Train sample)', 
'Background (Train sample)'], loc='upper right')
+plt.savefig('hist_'+model_name+'.png', bbox_inches='tight')
+plt.show()
+
+
+both_dataDev=bkg_dataDev.append(sig_dataDev)
+plt.figure(figsize=(7,6))
+plt.xlabel('NN output')
+plt.title("Number of Events")
+#plt.yscale('log', nonposy='clip')
+plt.hist(bkg_dataDev["NN"], 50, facecolor='red', weights=bkg_dataDev["weight"])
+plt.hist(both_dataDev["NN"], 50, color="blue", histtype="step", weights=both_dataDev["weight"])
+plt.legend(['Background (train sample)', 'Background + Signal (train sample)'], loc="upper left")
+plt.savefig('pred_'+model_name+'.png', bbox_inches='tight')
+plt.show()
+
+plt.figure(figsize=(7,6))
+plt.subplots_adjust(hspace=0.5)
+plt.subplot(211)
+plt.plot(fomCut, fomEvo)
+plt.title("FOM")
+plt.ylabel("FOM")
+plt.xlabel("ND")
+plt.legend(["Max. FOM: {0}".format(max_FOM)], loc='upper left')
+
+
+plt.subplot(212)
+plt.semilogy(fomCut, Eff)
+plt.axvspan(fomCut[fomEvo.index(max_FOM)], 1, facecolor='#2ca02c', alpha=0.3)
+#plt.axvline(x=fomCut[fomEvo.index(max_FOM)], ymin=0, ymax=1)
+plt.title("Efficiency")
+plt.ylabel("Eff")
+plt.xlabel("ND")
+plt.legend(['Background', 'Signal'], loc='upper left')
+plt.savefig('FOM_'+model_name+'.png', bbox_inches='tight')
+plt.show()
+
+sys.exit("Done!")

From 28a7e6171023e5a6abbd82ad4ca86ae9a42d4563 Mon Sep 17 00:00:00 2001
From: diogodebastos
Date: Wed, 23 Aug 2017 15:23:09 +0100
Subject: [PATCH 20/22] trainNN only trains one NN and saves its parameters

---
 trainNN.py | 184 +++++++++--------------------------------------------
 1 file changed, 31 insertions(+), 153 deletions(-)

diff --git a/trainNN.py b/trainNN.py
index 94f9638..4b5abc3 100644
--- a/trainNN.py
+++ b/trainNN.py
@@ -7,13 +7,16 @@
 from keras.models import Sequential
 from keras.layers import Dense, Dropout, AlphaDropout
 from sklearn.metrics import confusion_matrix, cohen_kappa_score
+import matplotlib.pyplot as plt
+from commonFunctions import getYields, FullFOM
 #from scipy.stats import ks_2samp

 from prepareDATA import *

 n_neurons = 10
 n_layers = 2
-n_epochs = 10
+n_epochs = 2
+name = "myNN_N"+str(n_neurons)+"_L"+str(n_layers)+"_E"+str(n_epochs)+"_Dev"+train_DM+"_Val"+test_point

 compileArgs = {'loss': 'binary_crossentropy', 'optimizer': 'adam', 'metrics': ["accuracy"]}
 trainParams = {'epochs': n_epochs, 'batch_size': 20, 'verbose': 1}
@@ -21,33 +24,6 @@
 myAdam = Adam(lr=learning_rate)
 compileArgs['optimizer'] = myAdam

-def FOM1(sIn, bIn):
-  s, sErr = sIn
-  b, bErr = bIn
-  fom = s / (b**0.5)
-  fomErr = ((sErr / (b**0.5))**2+(bErr*s / (2*(b)**(1.5)) )**2)**0.5
-  return (fom, fomErr)
-
-def FOM2(sIn, bIn):
-  s, sErr = sIn
-  b, bErr = bIn
-  fom = s / ((s+b)**0.5)
-  fomErr = ((sErr*(2*b + s)/(2*(b + s)**1.5))**2 + (bErr * s / (2*(b + s)**1.5))**2)**0.5
-  return (fom, fomErr)
-
-def FullFOM(sIn, bIn, fValue=0.2):
-  from math import log
-  s, sErr = sIn
-  b, bErr = bIn
-  fomErr = 0.0 # Add the computation of the uncertainty later
-  fomA = 2*(s+b)*log(((s+b)*(b + (fValue*b)**2))/(b**2 + (s + b) * (fValue*b)**2))
-  fomB = log(1 + (s*b*b*fValue*fValue)/(b*(b+(fValue*b)**2)))/(fValue**2)
-  fom = (fomA - fomB)**0.5
-  return (fom, fomErr)
-
-print "Opening file"
-f = open("DATA_"+test_point+'.txt','w')
-
 def getDefinedClassifier(nIn, nOut, compileArgs, neurons, layers):
   model = Sequential()
   model.add(Dense(nIn, input_dim=nIn, kernel_initializer='he_normal', activation='relu'))
   for i in range(0,layers):
     model.add(Dense(neurons, kernel_initializer='he_normal', activation='relu'))
   model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal'))
   model.compile(**compileArgs)
   return model

-print type(XVal)
-print XVal.shape
-print XVal.dtype - print("Starting the training") start = time.time() call = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-7, patience=5, verbose=1, mode='auto') model = getDefinedClassifier(len( trainFeatures), 1, compileArgs, n_neurons, n_layers) -#model = getSELUClassifier(len(trainFeatures), 1, compileArgs) history = model.fit(XDev, YDev, validation_data=(XVal,YVal,weightVal), sample_weight=weightDev,callbacks=[call], **trainParams) print("Training took ", time.time()-start, " seconds") # To save: -name = "myNN_DM30" model.save(name+".h5") model_json = model.to_json() with open(name + ".json", "w") as json_file: json_file.write(model_json) model.save_weights(name + ".h5") -# To load: -''' -from keras.models import model_from_json -with open(name + '.json', 'r') as json_file: - loaded_model_json = json_file.read() -model = model_from_json(loaded_model_json) -model.load_weights(name+".h5") -model.compile(loss = 'binary_crossentropy', optimizer = 'adam') -''' - print("Getting predictions") devPredict = model.predict(XDev) valPredict = model.predict(XVal) @@ -95,38 +55,11 @@ def getDefinedClassifier(nIn, nOut, compileArgs, neurons, layers): scoreDev = model.evaluate(XDev, YDev, sample_weight=weightDev, verbose = 1) scoreVal = model.evaluate(XVal, YVal, sample_weight=weightVal, verbose = 1) -print "" - -print "Dev score:", scoreDev -print "Val score:", scoreVal -print confusion_matrix(YVal, valPredict.round()) -print cohen_kappa_score(YVal, valPredict.round()) print "Calculating FOM:" dataVal["NN"] = valPredict -def getYields(dataVal, cut=0.5, luminosity=35866, splitFactor=2): - selectedValIdx = (dataVal.NN>cut) - selectedVal = dataVal[selectedValIdx] - - selectedSigIdx = (selectedVal.category == 1) - selectedBkgIdx = (selectedVal.category == 0) - selectedSig = selectedVal[selectedSigIdx] - selectedBkg = selectedVal[selectedBkgIdx] - - sigYield = selectedSig.weight.sum() - sigYieldUnc = np.sqrt(np.sum(np.square(selectedSig.weight))) - bkgYield = selectedBkg.weight.sum() - bkgYieldUnc = np.sqrt(np.sum(np.square(selectedBkg.weight))) - - sigYield = sigYield * luminosity * splitFactor # The factor 2 comes from the splitting - sigYieldUnc = sigYieldUnc * luminosity * splitFactor - bkgYield = bkgYield * luminosity * splitFactor - bkgYieldUnc = bkgYieldUnc * luminosity * splitFactor - - return ((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) - tmpSig, tmpBkg = getYields(dataVal) sigYield, sigYieldUnc = tmpSig bkgYield, bkgYieldUnc = tmpBkg @@ -134,116 +67,61 @@ def getYields(dataVal, cut=0.5, luminosity=35866, splitFactor=2): sigDataVal = dataVal[dataVal.category==1] bkgDataVal = dataVal[dataVal.category==0] +fomEvo = [] +fomCut = [] + +for cut in np.arange(0.0, 0.9999999, 0.001): + sig, bkg = getYields(dataVal, cut=cut) + if sig[0] > 0 and bkg[0] > 0: + fom, fomUnc = FullFOM(sig, bkg) + fomEvo.append(fom) + fomCut.append(cut) + +max_FOM=0 + +print "Maximizing FOM" +for k in fomEvo: + if k>max_FOM: + max_FOM=k + print "Signal@Presel:", sigDataVal.weight.sum() * 35866 * 2 print "Background@Presel:", bkgDataVal.weight.sum() * 35866 * 2 print "Signal:", sigYield, "+-", sigYieldUnc print "Background:", bkgYield, "+-", bkgYieldUnc -print "Basic FOM (s/SQRT(b)):", FOM1((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) -print "Basic FOM (s/SQRT(s+b)):", FOM2((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) -print "Full FOM:", FullFOM((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) +print "Maximized FOM:", max_FOM +print "FOM Cut:", fomCut[fomEvo.index(max_FOM)] import sys 
#sys.exit("Done!") ######################################################### -fomEvo = [] -fomCut = [] -bkgEff = [] # Let's repeat the above, but monitor the evolution of the loss function -import matplotlib.pyplot as plt + #history = model.fit(XDev, YDev, validation_data=(XVal,YVal,weightVal), sample_weight=weightDev, **trainParams) -print(history.history.keys()) +plt.figure(figsize=(7,6)) +plt.subplots_adjust(hspace=0.5) +plt.subplot(211) plt.plot(history.history['acc']) plt.plot(history.history['val_acc']) plt.title('model accuracy') plt.ylabel('accuracy') plt.xlabel('epoch') plt.legend(['train', 'test'], loc='upper left') -plt.show() +plt.subplot(212) plt.plot(history.history['loss']) plt.plot(history.history['val_loss']) plt.title('model loss') plt.ylabel('loss') +plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0)) plt.xlabel('epoch') plt.legend(['train', 'test'], loc='upper left') -plt.show() - -sigEff = [] - -sig_Init = dataVal[dataVal.category == 1].weight.sum() * 35866 * 2 -bkg_Init = dataVal[dataVal.category == 0].weight.sum() * 35866 * 2 - -for cut in np.arange(0.0, 0.9999999, 0.001): - sig, bkg = getYields(dataVal, cut=cut) - if sig[0] > 0 and bkg[0] > 0: - fom, fomUnc = FullFOM(sig, bkg) - fomEvo.append(fom) - fomCut.append(cut) - -max_FOM=0 - -print "Maximizing FOM" -for k in fomEvo: - if k>max_FOM: - max_FOM=k - -Eff = zip(bkgEff, sigEff) - -#km_value=ks_2samp((sig_dataDev["NN"].append(bkg_dataDev["NN"])),(sig_dataVal["NN"].append(bkg_dataVal["NN"]))) - -#print "Layers:", y -#print "Neurons:", x -#print "Cohen Kappa score:", cohen_kappa -print "Maximized FOM:", max_FOM -print "FOM Cut:", fomCut[fomEvo.index(max_FOM)] -#print "KS test statistic:", km_value[0] -#print "KS test p-value:", km_value[1] - -plt.plot(fomCut, fomEvo) -plt.title("FOM") -plt.ylabel("FOM") -plt.xlabel("ND") -plt.legend(['test'], loc='upper left') -plt.show() - - -selectedVal = dataVal[dataVal.NN>fomCut[fomEvo.index(max_FOM)]] -selectedSig = selectedVal[selectedVal.category == 1] -selectedBkg = selectedVal[selectedVal.category == 0] -sigYield = selectedSig.weight.sum() -bkgYield = selectedBkg.weight.sum() -sigYield = sigYield * luminosity * 2 #The factor 2 comes from the splitting -bkgYield = bkgYield * luminosity * 2 - -print fomCut[fomEvo.index(max_FOM)] -print "Number of selected Signal Events:", len(selectedSig) -print 'Number of selected Background Events:', len(selectedBkg) -print "Sig Yield", sigYield -print "Bkg Yield", bkgYield -#print fomEvo +plt.savefig(name+'.png') +#plt.savefig('NN2_'+str(y)+''+str(x)+''+test_point+"_"+str(max_FOM)+'.png') sys.exit("Done!") - -######################################################### - -print "Attempting KFold" - -from sklearn.model_selection import StratifiedKFold - -seed = 42 -np.random.seed(seed) -kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) -cvscores = [] -for train, test in kfold.split(XDev, YDev): - model = getDefinedClassifier(len(trainFeatures), 1, compileArgs) - model.fit(XDev[train], YDev[train], validation_data=(XDev[test],YDev[test],weightDev[test]), sample_weight=weightDev[train], **trainParams) - scores = model.evaluate(XDev[test], YDev[test], sample_weight=weightDev[test], verbose=1) - print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) - cvscores.append(scores[1] * 100) -print("%.2f%% (+/- %.2f%%)" % (numpy.mean(cvscores), numpy.std(cvscores))) From fd0dfae224db0503c3ed50baf1949d36e9f2a784 Mon Sep 17 00:00:00 2001 From: g-marques <30872659+g-marques@users.noreply.github.com> Date: Wed, 23 Aug 2017 
17:40:18 +0100 Subject: [PATCH 21/22] Delete trainNN_loop.py --- trainNN_loop.py | 364 ------------------------------------------------ 1 file changed, 364 deletions(-) delete mode 100644 trainNN_loop.py diff --git a/trainNN_loop.py b/trainNN_loop.py deleted file mode 100644 index 7883558..0000000 --- a/trainNN_loop.py +++ /dev/null @@ -1,364 +0,0 @@ -import root_numpy -import numpy as np -import pandas -import keras -import time -from sklearn.externals import joblib -from sklearn.preprocessing import StandardScaler -from keras.models import Sequential -from keras.layers import Dense, Dropout, AlphaDropout -from keras.optimizers import Adam -from sklearn.metrics import confusion_matrix, cohen_kappa_score -from scipy.stats import ks_2samp -import matplotlib.pyplot as plt -import localConfig as cfg -from commonFunctions import StopDataLoader -import sys -from math import log - -myFeatures = ["LepPt", "LepEta", "LepChg", "Met", "Jet1Pt", "HT", "NbLoose", "Njet", "JetHBpt", "DrJetHBLep", "JetHBCSV", "mt"] -inputBranches = list(myFeatures) -inputBranches.append("XS") -inputBranches.append("weight") -preselection = "(DPhiJet1Jet2 < 2.5 || Jet2Pt < 60) && (Met > 280) && (HT > 200) && (isTight == 1) && (Jet1Pt > 110)" -suffix = "_skimmed" -luminosity = 35866 -number_of_events_print = 0 -test_point = "550_520" -train_DM = "DM30" - -print "Loading datasets..." -dataDev, dataVal = StopDataLoader(cfg.loc, inputBranches, selection=preselection, suffix=suffix, signal=train_DM, test="550_520") # - -if number_of_events_print == 1: - np_dataDev, np_dataVal = StopDataLoader(cfg.loc, inputBranches, suffix=suffix, signal=train_DM, test=test_point) # - print " ==> BEFORE PRE-SELECTION:" - print " Train Signal Events:", len(np_dataDev[np_dataDev.category==1]) - print " Train Background Events:",len(np_dataDev[np_dataDev.category==0]) - print " Test Signal Events:", len(np_dataVal[np_dataVal.category==1]) - print " Test Background Events:", len(np_dataVal[np_dataVal.category==0]) - print "" - print " ==> AFTER PRE-SELECTION:" - print " Train Signal Events:", len(dataDev[dataDev.category==1]) - print " Train Background Events:",len(dataDev[dataDev.category==0]) - print " Test Signal Events:", len(dataVal[dataVal.category==1]) - print " Test Background Events:", len(dataVal[dataVal.category==0]) - -data = dataDev.copy() -data = data.append(dataVal.copy(), ignore_index=True) - -print 'Finding features of interest' -trainFeatures = [var for var in data.columns if var in myFeatures] -otherFeatures = [var for var in data.columns if var not in trainFeatures] - -print "Preparing the data for the NN" -XDev = dataDev.ix[:,0:len(trainFeatures)] # PORQUe X E Y??? EQUIVALENTE A SIG E BACK? 
-XVal = dataVal.ix[:,0:len(trainFeatures)] -YDev = np.ravel(dataDev.category) -YVal = np.ravel(dataVal.category) -weightDev = np.ravel(dataDev.sampleWeight) -weightVal = np.ravel(dataVal.sampleWeight) - -print("Fitting the scaler and scaling the input variables") -scaler = StandardScaler().fit(XDev) -XDev = scaler.transform(XDev) -XVal = scaler.transform(XVal) - -scalerfile = 'scaler_'+train_DM+'.sav' -joblib.dump(scaler, scalerfile) - -compileArgs = {'loss': 'binary_crossentropy', 'optimizer': 'adam', 'metrics': ["accuracy"]} -trainParams = {'epochs': 25, 'batch_size': 20, 'verbose': 1} -learning_rate = 0.001/5.0 -myAdam = Adam(lr=learning_rate) -compileArgs['optimizer'] = myAdam - -## DEFINITIONS -def FOM1(sIn, bIn): - s, sErr = sIn - b, bErr = bIn - fom = s / (b**0.5) - fomErr = ((sErr / (b**0.5))**2+(bErr*s / (2*(b)**(1.5)) )**2)**0.5 - return (fom, fomErr) - -def FOM2(sIn, bIn): - s, sErr = sIn - b, bErr = bIn - fom = s / ((s+b)**0.5) - fomErr = ((sErr*(2*b + s)/(2*(b + s)**1.5))**2 + (bErr * s / (2*(b + s)**1.5))**2)**0.5 - return (fom, fomErr) - -def FullFOM(sIn, bIn, fValue=0.2): - s, sErr = sIn - b, bErr = bIn - fomErr = 0.0 # Add the computation of the uncertainty later - fomA = 2*(s+b)*log(((s+b)*(b + (fValue*b)**2))/(b**2 + (s + b) * (fValue*b)**2)) - fomB = log(1 + (s*b*b*fValue*fValue)/(b*(b+(fValue*b)**2)))/(fValue**2) - fom = (fomA - fomB)**0.5 - return (fom, fomErr) - -print "Opening file" -f = open('DATA_loop_test_f_'+test_point+'.txt', 'w') - -for y in range(1, 2): - for x in range(7,8): - - if y==1: - def getDefinedClassifier(nIn, nOut, compileArgs): - model = Sequential() - model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) - model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal')) - model.compile(**compileArgs) - return model - if y==2: - def getDefinedClassifier(nIn, nOut, compileArgs): - model = Sequential() - model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) - model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) - model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal')) - model.compile(**compileArgs) - return model - if y==3: - def getDefinedClassifier(nIn, nOut, compileArgs): - model = Sequential() - model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) - model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) - model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) - model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal')) - model.compile(**compileArgs) - return model - if y==4: - def getDefinedClassifier(nIn, nOut, compileArgs): - model = Sequential() - model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) - model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) - model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) - model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) - #model.add(Dropout(0.2)) - #model.add(Dense(10, kernel_initializer='he_normal', activation='relu')) - #model.add(Dropout(0.2)) - model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal')) - model.compile(**compileArgs) - return model - - print("Starting the training") - call = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-7, patience=5, verbose=1, mode='auto') - 
model = getDefinedClassifier(len(trainFeatures), 1, compileArgs) - history = model.fit(XDev, YDev, validation_data=(XVal,YVal,weightVal), sample_weight=weightDev,callbacks=[call], **trainParams) - - name = "myNN_"+str(y)+"_"+str(x)+"_"+train_DM - model.save(name+".h5") - model_json = model.to_json() - with open(name + ".json", "w") as json_file: - json_file.write(model_json) - model.save_weights(name + ".h5") - - # To load: - #from keras.models import model_from_json - #with open("myNN_"+str(y)+"_"+str(x)+"_"+test_point+'.json', 'r') as json_file: - # loaded_model_json = json_file.read() - #model = model_from_json(loaded_model_json) - #model.load_weights("myNN_"+str(y)+"_"+str(x)+"_"+test_point+".h5") - #model.compile(loss = 'binary_crossentropy', optimizer = 'adam') - - print("Getting predictions") - devPredict = model.predict(XDev) - #print devPredict - valPredict = model.predict(XVal) - #print valPredict - - - print("Getting scores") - - scoreDev = model.evaluate(XDev, YDev, sample_weight=weightDev, verbose = 1) - scoreVal = model.evaluate(XVal, YVal, sample_weight=weightVal, verbose = 1) - print "" - #print "Dev score:", scoreDev - #print "Val score:", scoreVal - #print confusion_matrix(YVal, valPredict.round()) - cohen_kappa=cohen_kappa_score(YVal, valPredict.round()) - - - print "Calculating FOM:" - dataDev["NN"] = devPredict - dataVal["NN"] = valPredict - - #dataVal[dataVal.category==1].weight = dataVal[dataVal.category==1].weight*dataVal[dataVal.category==1].XS - sig_dataDev=dataDev[dataDev.category==1] - bkg_dataDev=dataDev[dataDev.category==0] - sig_dataVal=dataVal[dataVal.category==1] - bkg_dataVal=dataVal[dataVal.category==0] - - - def getYields(dataVal, cut=0.5, luminosity=35866, splitFactor=2): - #defines the selected test data - selectedVal = dataVal[dataVal.NN>cut] - unselectedVal = dataVal[ dataVal.NN<=cut ] - - #separates the true positives from false negatives - selectedSig = selectedVal[selectedVal.category == 1] - selectedBkg = selectedVal[selectedVal.category == 0] - #unselectedSig = unselectedVal[unselectedVal.category == 1] - #unselectedBkg = unselectedVal[unselectedVal.category == 0] - - #print cut - #print "TP", len(selectedSig) - #print "FP", len(selectedBkg) - #print "FN", len(unselectedSig) - #print "TN", len(unselectedBkg) - #print "" - - sigYield = selectedSig.weight.sum() - sigYieldUnc = np.sqrt(np.sum(np.square(selectedSig.weight))) - bkgYield = selectedBkg.weight.sum() - bkgYieldUnc = np.sqrt(np.sum(np.square(selectedBkg.weight))) - - sigYield = sigYield * luminosity * splitFactor #The factor 2 comes from the splitting - sigYieldUnc = sigYieldUnc * luminosity * splitFactor - bkgYield = bkgYield * luminosity * splitFactor - bkgYieldUnc = bkgYieldUnc * luminosity * splitFactor - - return ((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) - - tmpSig, tmpBkg = getYields(dataVal) - sigYield, sigYieldUnc = tmpSig - bkgYield, bkgYieldUnc = tmpBkg - - - #print "Signal@Presel:", dataVal[dataVal.category == 1].weight.sum() * 35866 * 2 - #print "Background@Presel:", dataVal[dataVal.category == 0].weight.sum() * 35866 * 2 - #print "Signal:", sigYield, "+-", sigYieldUnc - #print "Background:", bkgYield, "+-", bkgYieldUnc - - #print "Basic FOM (s/SQRT(b)):", FOM1((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) - #print "Basic FOM (s/SQRT(s+b)):", FOM2((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) - #print "Full FOM:", FullFOM((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) - - #sys.exit("Done!") - - ######################################################### - - # 
Let's repeat the above, but monitor the evolution of the loss function - - - #history = model.fit(XDev, YDev, validation_data=(XVal,YVal,weightVal), sample_weight=weightDev, **trainParams) - #print(history.history.keys()) - - fomEvo = [] - fomCut = [] - - bkgEff = [] - sigEff = [] - - sig_Init = dataVal[dataVal.category == 1].weight.sum() * 35866 * 2 - bkg_Init = dataVal[dataVal.category == 0].weight.sum() * 35866 * 2 - - for cut in np.arange(0.0, 0.9999999, 0.001): - #print cut - sig, bkg = getYields(dataVal, cut=cut, luminosity=luminosity) - if sig[0] > 0 and bkg[0] > 0: - fom, fomUnc = FullFOM(sig, bkg) - fomEvo.append(fom) - fomCut.append(cut) - #print fom - #print "" - bkgEff.append(bkg[0]/bkg_Init) - sigEff.append(sig[0]/sig_Init) - - max_FOM=0 - - print "Maximizing FOM" - for k in fomEvo: - if k>max_FOM: - max_FOM=k - - Eff = zip(bkgEff, sigEff) - - km_value=ks_2samp((sig_dataDev["NN"].append(bkg_dataDev["NN"])),(sig_dataVal["NN"].append(bkg_dataVal["NN"]))) - - f.write(str(y)+"\n") - print "Layers:", y - f.write(str(x)+"\n") - print "Neurons:", x - f.write(str(cohen_kappa)+"\n") - print "Cohen Kappa score:", cohen_kappa - f.write(str(max_FOM)+"\n") - print "Maximized FOM:", max_FOM - f.write(str(fomCut[fomEvo.index(max_FOM)])+"\n") - print "FOM Cut:", fomCut[fomEvo.index(max_FOM)] - f.write(str(km_value[0])+"\n") - print "KS test statistic:", km_value[0] - f.write(str(km_value[1])+"\n") - print "KS test p-value:", km_value[1] - - f.close() - - selectedVal = dataVal[dataVal.NN>fomCut[fomEvo.index(max_FOM)]] - selectedSig = selectedVal[selectedVal.category == 1] - selectedBkg = selectedVal[selectedVal.category == 0] - sigYield = selectedSig.weight.sum() - bkgYield = selectedBkg.weight.sum() - sigYield = sigYield * luminosity * 2 #The factor 2 comes from the splitting - bkgYield = bkgYield * luminosity * 2 - - - print fomCut[fomEvo.index(max_FOM)] - print "Number of selected Signal Events:", len(selectedSig) - print "Number of selected Background Events:", len(selectedBkg) - print "Sig Yield", sigYield - print "Bkg Yield", bkgYield - - print "Plotting" - - plt.figure(figsize=(7,6)) - plt.hist(sig_dataDev["NN"], 50, facecolor='blue', alpha=0.7, normed=1, weights=sig_dataDev["weight"]) - plt.hist(bkg_dataDev["NN"], 50, facecolor='red', alpha=0.7, normed=1, weights=bkg_dataDev["weight"]) - plt.hist(sig_dataVal["NN"], 50, color='blue', alpha=1, normed=1, weights=sig_dataVal["weight"], histtype="step") - plt.hist(bkg_dataVal["NN"], 50, color='red', alpha=1, normed=1, weights=bkg_dataVal["weight"], histtype="step") - plt.xlabel('NN output') - #plt.title("Cohen's kappa: {0}".format(cohen_kappa), fontsize=10) - plt.suptitle("MVA overtraining check for classifier: NN", fontsize=13, fontweight='bold') - plt.title("Cohen's kappa: {0}\nKolmogorov Smirnov test: {1}".format(cohen_kappa, km_value[1]), fontsize=10) - plt.legend(['Signal (Test sample)', 'Background (Test sample)', 'Signal (Train sample)', 'Background (Train sample)\nasdfgh'], loc='upper right') - plt.savefig('hist_'+str(y)+'_'+str(x)+'_'+test_point+'.png', bbox_inches='tight') - plt.show() - - - both_dataDev=bkg_dataDev.append(sig_dataDev) - plt.figure(figsize=(7,6)) - plt.xlabel('NN output') - plt.title("Number of Events") - #plt.yscale('log', nonposy='clip') - plt.legend(['Background + Signal (test sample)', 'Background (test sample)'], loc="upper left" ) - plt.hist(bkg_dataDev["NN"], 50, facecolor='red', weights=bkg_dataDev["weight"]) - plt.hist(both_dataDev["NN"], 50, color="blue", histtype="step", weights=both_dataDev["weight"]) 
- plt.savefig('pred_'+str(y)+'_'+str(x)+'_'+test_point+'.png', bbox_inches='tight')
- plt.show()
-
-
- plt.figure(figsize=(7,6))
- plt.subplots_adjust(hspace=0.5)
- plt.subplot(211)
- plt.plot(fomCut, fomEvo)
- plt.title("FOM")
- plt.ylabel("FOM")
- plt.xlabel("ND")
- plt.legend(["Max. FOM: {0}".format(max_FOM)], loc='upper left')
-
-
- plt.subplot(212)
- plt.semilogy(fomCut, Eff)
- plt.axvspan(fomCut[fomEvo.index(max_FOM)], 1, facecolor='#2ca02c', alpha=0.3)
- #plt.axvline(x=fomCut[fomEvo.index(max_FOM)], ymin=0, ymax=1)
- plt.title("Efficiency")
- plt.ylabel("Eff")
- plt.xlabel("ND")
- plt.legend(['Background', 'Signal'], loc='upper left')
- plt.savefig('FOM_'+str(y)+'_'+str(x)+'_'+test_point+'.png', bbox_inches='tight')
- plt.show()
-
-
-
-
-sys.exit("Done!")
-

From f1f2aa61a08f7f8ed4dbb5de6f5c9de54edc9876 Mon Sep 17 00:00:00 2001
From: g-marques
Date: Wed, 23 Aug 2017 17:41:33 +0100
Subject: [PATCH 22/22] manualGridSearch uploaded

---
 commonFunctions.py  |   1 -
 graphicsNN.py       |   2 +-
 manualGridSearch.py | 137 +++++++++++++++++
 testNN.py           |   2 +-
 trainNN.py          |   4 +-
 trainNN_loop.py     | 364 --------------------------------------------
 6 files changed, 141 insertions(+), 369 deletions(-)
 create mode 100644 manualGridSearch.py
 delete mode 100644 trainNN_loop.py

diff --git a/commonFunctions.py b/commonFunctions.py
index 1570565..b2c572f 100755
--- a/commonFunctions.py
+++ b/commonFunctions.py
@@ -159,7 +159,6 @@ def StopDataLoader(path, features, test="550_520", selection="", treename="bdttr
 dev = dev.append(bkgDev.copy(), ignore_index=True)
 val = sigVal.copy()
 val = val.append(bkgVal.copy(), ignore_index=True)
-
 return dev, val
 
 def FOM1(sIn, bIn):
diff --git a/graphicsNN.py b/graphicsNN.py
index 9e00d05..ee8d7cd 100644
--- a/graphicsNN.py
+++ b/graphicsNN.py
@@ -3,7 +3,7 @@ import numpy as np
 from mpl_toolkits.mplot3d import Axes3D
 
-f = open('trainingDATA.txt', 'r')
+f = open('DATA_loop_test_trash_550_520.txt', 'r')
 
 layers = []
 neurons = []
diff --git a/manualGridSearch.py b/manualGridSearch.py
new file mode 100644
index 0000000..6f055d2
--- /dev/null
+++ b/manualGridSearch.py
@@ -0,0 +1,137 @@
+import root_numpy
+import numpy as np
+import pandas
+import keras
+import time
+from sklearn.externals import joblib
+from sklearn.preprocessing import StandardScaler
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, AlphaDropout
+from keras.optimizers import Adam
+from sklearn.metrics import confusion_matrix, cohen_kappa_score
+from scipy.stats import ks_2samp
+import matplotlib.pyplot as plt
+import localConfig as cfg
+from commonFunctions import StopDataLoader, FullFOM, getYields
+
+import sys
+from math import log
+
+from prepareDATA import *
+
+compileArgs = {'loss': 'binary_crossentropy', 'optimizer': 'adam', 'metrics': ["accuracy"]}
+trainParams = {'epochs': 25, 'batch_size': 20, 'verbose': 1}
+learning_rate = 0.001/5.0
+myAdam = Adam(lr=learning_rate)
+compileArgs['optimizer'] = myAdam
+
+print "Opening file"
+f = open('trainNN_outputs_run1_'+test_point+'.txt', 'w')
+
+def getDefinedClassifier(nIn, nOut, compileArgs, neurons=12, layers=1):
+    model = Sequential()
+    model.add(Dense(neurons, input_dim=nIn, kernel_initializer='he_normal', activation='relu'))
+    for x in range(0, layers):
+        model.add(Dense(neurons, kernel_initializer='he_normal', activation='relu'))
+    model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal'))
+    model.compile(**compileArgs)
+    return model
+
+for y in range(1,3): # LAYERS
+    for x in range(10, 20): # NEURONS
+        print " ==> #LAYERS:", y, " #NEURONS:", x, " <=="
+        print("Starting the training")
+        model = getDefinedClassifier(len(trainFeatures), 1, compileArgs, x, y)
+        history = model.fit(XDev, YDev, validation_data=(XVal,YVal,weightVal), sample_weight=weightDev, **trainParams)
+        name = "myNN_run1_L"+str(y)+"_N"+str(x)+"_"+train_DM
+        model.save(name+".h5")
+        model_json = model.to_json()
+        with open(name + ".json", "w") as json_file:
+            json_file.write(model_json)
+        model.save_weights(name + ".h5")
+
+        print("Getting predictions")
+        devPredict = model.predict(XDev)
+        valPredict = model.predict(XVal)
+
+        print("Getting scores")
+
+        scoreDev = model.evaluate(XDev, YDev, sample_weight=weightDev, verbose = 1)
+        scoreVal = model.evaluate(XVal, YVal, sample_weight=weightVal, verbose = 1)
+        cohen_kappa=cohen_kappa_score(YVal, valPredict.round())
+
+        print "Calculating FOM:"
+        dataDev["NN"] = devPredict
+        dataVal["NN"] = valPredict
+
+        sig_dataDev=dataDev[dataDev.category==1]
+        bkg_dataDev=dataDev[dataDev.category==0]
+        sig_dataVal=dataVal[dataVal.category==1]
+        bkg_dataVal=dataVal[dataVal.category==0]
+
+
+        tmpSig, tmpBkg = getYields(dataVal)
+        sigYield, sigYieldUnc = tmpSig
+        bkgYield, bkgYieldUnc = tmpBkg
+
+        fomEvo = []
+        fomCut = []
+
+        bkgEff = []
+        sigEff = []
+
+        sig_Init = dataVal[dataVal.category == 1].weight.sum() * 35866 * 2
+        bkg_Init = dataVal[dataVal.category == 0].weight.sum() * 35866 * 2
+
+        for cut in np.arange(0.0, 0.9999999, 0.001):
+            sig, bkg = getYields(dataVal, cut=cut, luminosity=luminosity)
+            if sig[0] > 0 and bkg[0] > 0:
+                fom, fomUnc = FullFOM(sig, bkg)
+                fomEvo.append(fom)
+                fomCut.append(cut)
+                bkgEff.append(bkg[0]/bkg_Init)
+                sigEff.append(sig[0]/sig_Init)
+
+        max_FOM=0
+
+        print "Maximizing FOM"
+        for k in fomEvo:
+            if k>max_FOM:
+                max_FOM=k
+
+        Eff = zip(bkgEff, sigEff)
+
+        km_value=ks_2samp((sig_dataDev["NN"].append(bkg_dataDev["NN"])),(sig_dataVal["NN"].append(bkg_dataVal["NN"])))
+
+        f.write(str(y)+"\n")
+        f.write(str(x)+"\n")
+        f.write(str(cohen_kappa)+"\n")
+        f.write(str(max_FOM)+"\n")
+        f.write(str(fomCut[fomEvo.index(max_FOM)])+"\n")
+        f.write(str(km_value[0])+"\n")
+        f.write(str(km_value[1])+"\n")
+
+        print "Plotting"
+        plt.figure(figsize=(7,6))
+        plt.subplots_adjust(hspace=0.5)
+        plt.subplot(211)
+        plt.plot(history.history['acc'])
+        plt.plot(history.history['val_acc'])
+        plt.title('model accuracy')
+        plt.ylabel('accuracy')
+        plt.xlabel('epoch')
+        plt.legend(['train', 'test'], loc='upper left')
+
+        plt.subplot(212)
+        plt.plot(history.history['loss'])
+        plt.plot(history.history['val_loss'])
+        plt.title('model loss')
+        plt.ylabel('loss')
+        plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0))
+        plt.xlabel('epoch')
+        plt.legend(['train', 'test'], loc='upper left')
+        plt.savefig(name+"_FOM:"+str(max_FOM)+'.png')
+        #plt.show()
+
+sys.exit("Done!")
diff --git a/testNN.py b/testNN.py
index 762b97e..5de2316 100755
--- a/testNN.py
+++ b/testNN.py
@@ -21,7 +21,7 @@
 from prepareDATA import *
 
-model_name = "myNN_DM30"
+model_name = "myNN_N13_L2_E25_DevDM30_Val550_520"
 
 print "Loading Model ..."
with open(model_name+'.json', 'r') as json_file: diff --git a/trainNN.py b/trainNN.py index 4b5abc3..5d41e8f 100755 --- a/trainNN.py +++ b/trainNN.py @@ -13,9 +13,9 @@ from prepareDATA import * -n_neurons = 10 +n_neurons = 13 n_layers = 2 -n_epochs = 2 +n_epochs = 25 name = "myNN_N"+str(n_neurons)+"_L"+str(n_layers)+"_E"+str(n_epochs)+"_Dev"+train_DM+"_Val"+test_point compileArgs = {'loss': 'binary_crossentropy', 'optimizer': 'adam', 'metrics': ["accuracy"]} diff --git a/trainNN_loop.py b/trainNN_loop.py deleted file mode 100644 index 7883558..0000000 --- a/trainNN_loop.py +++ /dev/null @@ -1,364 +0,0 @@ -import root_numpy -import numpy as np -import pandas -import keras -import time -from sklearn.externals import joblib -from sklearn.preprocessing import StandardScaler -from keras.models import Sequential -from keras.layers import Dense, Dropout, AlphaDropout -from keras.optimizers import Adam -from sklearn.metrics import confusion_matrix, cohen_kappa_score -from scipy.stats import ks_2samp -import matplotlib.pyplot as plt -import localConfig as cfg -from commonFunctions import StopDataLoader -import sys -from math import log - -myFeatures = ["LepPt", "LepEta", "LepChg", "Met", "Jet1Pt", "HT", "NbLoose", "Njet", "JetHBpt", "DrJetHBLep", "JetHBCSV", "mt"] -inputBranches = list(myFeatures) -inputBranches.append("XS") -inputBranches.append("weight") -preselection = "(DPhiJet1Jet2 < 2.5 || Jet2Pt < 60) && (Met > 280) && (HT > 200) && (isTight == 1) && (Jet1Pt > 110)" -suffix = "_skimmed" -luminosity = 35866 -number_of_events_print = 0 -test_point = "550_520" -train_DM = "DM30" - -print "Loading datasets..." -dataDev, dataVal = StopDataLoader(cfg.loc, inputBranches, selection=preselection, suffix=suffix, signal=train_DM, test="550_520") # - -if number_of_events_print == 1: - np_dataDev, np_dataVal = StopDataLoader(cfg.loc, inputBranches, suffix=suffix, signal=train_DM, test=test_point) # - print " ==> BEFORE PRE-SELECTION:" - print " Train Signal Events:", len(np_dataDev[np_dataDev.category==1]) - print " Train Background Events:",len(np_dataDev[np_dataDev.category==0]) - print " Test Signal Events:", len(np_dataVal[np_dataVal.category==1]) - print " Test Background Events:", len(np_dataVal[np_dataVal.category==0]) - print "" - print " ==> AFTER PRE-SELECTION:" - print " Train Signal Events:", len(dataDev[dataDev.category==1]) - print " Train Background Events:",len(dataDev[dataDev.category==0]) - print " Test Signal Events:", len(dataVal[dataVal.category==1]) - print " Test Background Events:", len(dataVal[dataVal.category==0]) - -data = dataDev.copy() -data = data.append(dataVal.copy(), ignore_index=True) - -print 'Finding features of interest' -trainFeatures = [var for var in data.columns if var in myFeatures] -otherFeatures = [var for var in data.columns if var not in trainFeatures] - -print "Preparing the data for the NN" -XDev = dataDev.ix[:,0:len(trainFeatures)] # PORQUe X E Y??? EQUIVALENTE A SIG E BACK? 
-XVal = dataVal.ix[:,0:len(trainFeatures)] -YDev = np.ravel(dataDev.category) -YVal = np.ravel(dataVal.category) -weightDev = np.ravel(dataDev.sampleWeight) -weightVal = np.ravel(dataVal.sampleWeight) - -print("Fitting the scaler and scaling the input variables") -scaler = StandardScaler().fit(XDev) -XDev = scaler.transform(XDev) -XVal = scaler.transform(XVal) - -scalerfile = 'scaler_'+train_DM+'.sav' -joblib.dump(scaler, scalerfile) - -compileArgs = {'loss': 'binary_crossentropy', 'optimizer': 'adam', 'metrics': ["accuracy"]} -trainParams = {'epochs': 25, 'batch_size': 20, 'verbose': 1} -learning_rate = 0.001/5.0 -myAdam = Adam(lr=learning_rate) -compileArgs['optimizer'] = myAdam - -## DEFINITIONS -def FOM1(sIn, bIn): - s, sErr = sIn - b, bErr = bIn - fom = s / (b**0.5) - fomErr = ((sErr / (b**0.5))**2+(bErr*s / (2*(b)**(1.5)) )**2)**0.5 - return (fom, fomErr) - -def FOM2(sIn, bIn): - s, sErr = sIn - b, bErr = bIn - fom = s / ((s+b)**0.5) - fomErr = ((sErr*(2*b + s)/(2*(b + s)**1.5))**2 + (bErr * s / (2*(b + s)**1.5))**2)**0.5 - return (fom, fomErr) - -def FullFOM(sIn, bIn, fValue=0.2): - s, sErr = sIn - b, bErr = bIn - fomErr = 0.0 # Add the computation of the uncertainty later - fomA = 2*(s+b)*log(((s+b)*(b + (fValue*b)**2))/(b**2 + (s + b) * (fValue*b)**2)) - fomB = log(1 + (s*b*b*fValue*fValue)/(b*(b+(fValue*b)**2)))/(fValue**2) - fom = (fomA - fomB)**0.5 - return (fom, fomErr) - -print "Opening file" -f = open('DATA_loop_test_f_'+test_point+'.txt', 'w') - -for y in range(1, 2): - for x in range(7,8): - - if y==1: - def getDefinedClassifier(nIn, nOut, compileArgs): - model = Sequential() - model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) - model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal')) - model.compile(**compileArgs) - return model - if y==2: - def getDefinedClassifier(nIn, nOut, compileArgs): - model = Sequential() - model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) - model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) - model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal')) - model.compile(**compileArgs) - return model - if y==3: - def getDefinedClassifier(nIn, nOut, compileArgs): - model = Sequential() - model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) - model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) - model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) - model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal')) - model.compile(**compileArgs) - return model - if y==4: - def getDefinedClassifier(nIn, nOut, compileArgs): - model = Sequential() - model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) - model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) - model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) - model.add(Dense(x, input_dim=nIn, kernel_initializer='he_normal', activation='relu')) - #model.add(Dropout(0.2)) - #model.add(Dense(10, kernel_initializer='he_normal', activation='relu')) - #model.add(Dropout(0.2)) - model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal')) - model.compile(**compileArgs) - return model - - print("Starting the training") - call = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-7, patience=5, verbose=1, mode='auto') - 
model = getDefinedClassifier(len(trainFeatures), 1, compileArgs) - history = model.fit(XDev, YDev, validation_data=(XVal,YVal,weightVal), sample_weight=weightDev,callbacks=[call], **trainParams) - - name = "myNN_"+str(y)+"_"+str(x)+"_"+train_DM - model.save(name+".h5") - model_json = model.to_json() - with open(name + ".json", "w") as json_file: - json_file.write(model_json) - model.save_weights(name + ".h5") - - # To load: - #from keras.models import model_from_json - #with open("myNN_"+str(y)+"_"+str(x)+"_"+test_point+'.json', 'r') as json_file: - # loaded_model_json = json_file.read() - #model = model_from_json(loaded_model_json) - #model.load_weights("myNN_"+str(y)+"_"+str(x)+"_"+test_point+".h5") - #model.compile(loss = 'binary_crossentropy', optimizer = 'adam') - - print("Getting predictions") - devPredict = model.predict(XDev) - #print devPredict - valPredict = model.predict(XVal) - #print valPredict - - - print("Getting scores") - - scoreDev = model.evaluate(XDev, YDev, sample_weight=weightDev, verbose = 1) - scoreVal = model.evaluate(XVal, YVal, sample_weight=weightVal, verbose = 1) - print "" - #print "Dev score:", scoreDev - #print "Val score:", scoreVal - #print confusion_matrix(YVal, valPredict.round()) - cohen_kappa=cohen_kappa_score(YVal, valPredict.round()) - - - print "Calculating FOM:" - dataDev["NN"] = devPredict - dataVal["NN"] = valPredict - - #dataVal[dataVal.category==1].weight = dataVal[dataVal.category==1].weight*dataVal[dataVal.category==1].XS - sig_dataDev=dataDev[dataDev.category==1] - bkg_dataDev=dataDev[dataDev.category==0] - sig_dataVal=dataVal[dataVal.category==1] - bkg_dataVal=dataVal[dataVal.category==0] - - - def getYields(dataVal, cut=0.5, luminosity=35866, splitFactor=2): - #defines the selected test data - selectedVal = dataVal[dataVal.NN>cut] - unselectedVal = dataVal[ dataVal.NN<=cut ] - - #separates the true positives from false negatives - selectedSig = selectedVal[selectedVal.category == 1] - selectedBkg = selectedVal[selectedVal.category == 0] - #unselectedSig = unselectedVal[unselectedVal.category == 1] - #unselectedBkg = unselectedVal[unselectedVal.category == 0] - - #print cut - #print "TP", len(selectedSig) - #print "FP", len(selectedBkg) - #print "FN", len(unselectedSig) - #print "TN", len(unselectedBkg) - #print "" - - sigYield = selectedSig.weight.sum() - sigYieldUnc = np.sqrt(np.sum(np.square(selectedSig.weight))) - bkgYield = selectedBkg.weight.sum() - bkgYieldUnc = np.sqrt(np.sum(np.square(selectedBkg.weight))) - - sigYield = sigYield * luminosity * splitFactor #The factor 2 comes from the splitting - sigYieldUnc = sigYieldUnc * luminosity * splitFactor - bkgYield = bkgYield * luminosity * splitFactor - bkgYieldUnc = bkgYieldUnc * luminosity * splitFactor - - return ((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) - - tmpSig, tmpBkg = getYields(dataVal) - sigYield, sigYieldUnc = tmpSig - bkgYield, bkgYieldUnc = tmpBkg - - - #print "Signal@Presel:", dataVal[dataVal.category == 1].weight.sum() * 35866 * 2 - #print "Background@Presel:", dataVal[dataVal.category == 0].weight.sum() * 35866 * 2 - #print "Signal:", sigYield, "+-", sigYieldUnc - #print "Background:", bkgYield, "+-", bkgYieldUnc - - #print "Basic FOM (s/SQRT(b)):", FOM1((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) - #print "Basic FOM (s/SQRT(s+b)):", FOM2((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) - #print "Full FOM:", FullFOM((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc)) - - #sys.exit("Done!") - - ######################################################### - - # 
Let's repeat the above, but monitor the evolution of the loss function - - - #history = model.fit(XDev, YDev, validation_data=(XVal,YVal,weightVal), sample_weight=weightDev, **trainParams) - #print(history.history.keys()) - - fomEvo = [] - fomCut = [] - - bkgEff = [] - sigEff = [] - - sig_Init = dataVal[dataVal.category == 1].weight.sum() * 35866 * 2 - bkg_Init = dataVal[dataVal.category == 0].weight.sum() * 35866 * 2 - - for cut in np.arange(0.0, 0.9999999, 0.001): - #print cut - sig, bkg = getYields(dataVal, cut=cut, luminosity=luminosity) - if sig[0] > 0 and bkg[0] > 0: - fom, fomUnc = FullFOM(sig, bkg) - fomEvo.append(fom) - fomCut.append(cut) - #print fom - #print "" - bkgEff.append(bkg[0]/bkg_Init) - sigEff.append(sig[0]/sig_Init) - - max_FOM=0 - - print "Maximizing FOM" - for k in fomEvo: - if k>max_FOM: - max_FOM=k - - Eff = zip(bkgEff, sigEff) - - km_value=ks_2samp((sig_dataDev["NN"].append(bkg_dataDev["NN"])),(sig_dataVal["NN"].append(bkg_dataVal["NN"]))) - - f.write(str(y)+"\n") - print "Layers:", y - f.write(str(x)+"\n") - print "Neurons:", x - f.write(str(cohen_kappa)+"\n") - print "Cohen Kappa score:", cohen_kappa - f.write(str(max_FOM)+"\n") - print "Maximized FOM:", max_FOM - f.write(str(fomCut[fomEvo.index(max_FOM)])+"\n") - print "FOM Cut:", fomCut[fomEvo.index(max_FOM)] - f.write(str(km_value[0])+"\n") - print "KS test statistic:", km_value[0] - f.write(str(km_value[1])+"\n") - print "KS test p-value:", km_value[1] - - f.close() - - selectedVal = dataVal[dataVal.NN>fomCut[fomEvo.index(max_FOM)]] - selectedSig = selectedVal[selectedVal.category == 1] - selectedBkg = selectedVal[selectedVal.category == 0] - sigYield = selectedSig.weight.sum() - bkgYield = selectedBkg.weight.sum() - sigYield = sigYield * luminosity * 2 #The factor 2 comes from the splitting - bkgYield = bkgYield * luminosity * 2 - - - print fomCut[fomEvo.index(max_FOM)] - print "Number of selected Signal Events:", len(selectedSig) - print "Number of selected Background Events:", len(selectedBkg) - print "Sig Yield", sigYield - print "Bkg Yield", bkgYield - - print "Plotting" - - plt.figure(figsize=(7,6)) - plt.hist(sig_dataDev["NN"], 50, facecolor='blue', alpha=0.7, normed=1, weights=sig_dataDev["weight"]) - plt.hist(bkg_dataDev["NN"], 50, facecolor='red', alpha=0.7, normed=1, weights=bkg_dataDev["weight"]) - plt.hist(sig_dataVal["NN"], 50, color='blue', alpha=1, normed=1, weights=sig_dataVal["weight"], histtype="step") - plt.hist(bkg_dataVal["NN"], 50, color='red', alpha=1, normed=1, weights=bkg_dataVal["weight"], histtype="step") - plt.xlabel('NN output') - #plt.title("Cohen's kappa: {0}".format(cohen_kappa), fontsize=10) - plt.suptitle("MVA overtraining check for classifier: NN", fontsize=13, fontweight='bold') - plt.title("Cohen's kappa: {0}\nKolmogorov Smirnov test: {1}".format(cohen_kappa, km_value[1]), fontsize=10) - plt.legend(['Signal (Test sample)', 'Background (Test sample)', 'Signal (Train sample)', 'Background (Train sample)\nasdfgh'], loc='upper right') - plt.savefig('hist_'+str(y)+'_'+str(x)+'_'+test_point+'.png', bbox_inches='tight') - plt.show() - - - both_dataDev=bkg_dataDev.append(sig_dataDev) - plt.figure(figsize=(7,6)) - plt.xlabel('NN output') - plt.title("Number of Events") - #plt.yscale('log', nonposy='clip') - plt.legend(['Background + Signal (test sample)', 'Background (test sample)'], loc="upper left" ) - plt.hist(bkg_dataDev["NN"], 50, facecolor='red', weights=bkg_dataDev["weight"]) - plt.hist(both_dataDev["NN"], 50, color="blue", histtype="step", weights=both_dataDev["weight"]) 
- plt.savefig('pred_'+str(y)+'_'+str(x)+'_'+test_point+'.png', bbox_inches='tight') - plt.show() - - - plt.figure(figsize=(7,6)) - plt.subplots_adjust(hspace=0.5) - plt.subplot(211) - plt.plot(fomCut, fomEvo) - plt.title("FOM") - plt.ylabel("FOM") - plt.xlabel("ND") - plt.legend(["Max. FOM: {0}".format(max_FOM)], loc='upper left') - - - plt.subplot(212) - plt.semilogy(fomCut, Eff) - plt.axvspan(fomCut[fomEvo.index(max_FOM)], 1, facecolor='#2ca02c', alpha=0.3) - #plt.axvline(x=fomCut[fomEvo.index(max_FOM)], ymin=0, ymax=1) - plt.title("Efficiency") - plt.ylabel("Eff") - plt.xlabel("ND") - plt.legend(['Background', 'Signal'], loc='upper left') - plt.savefig('FOM_'+str(y)+'_'+str(x)+'_'+test_point+'.png', bbox_inches='tight') - plt.show() - - - - -sys.exit("Done!") -
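For reference, the artifacts this series leaves behind are a <name>.json architecture file, a <name>.h5 weights file and a pickled StandardScaler. Note that trainNN.py first calls model.save(name+".h5") and then overwrites the same path with model.save_weights(name+".h5"), so the JSON-plus-weights route is the reliable way back to a usable network. A minimal standalone reload sketch, assuming the Keras and scikit-learn versions used above, the file stem produced by trainNN.py, and the scaler_DM30.sav name used in these scripts (prepareDATA.py may use a different one); XNew is a hypothetical placeholder for an already-prepared feature array:

import numpy as np
from keras.models import model_from_json
from sklearn.externals import joblib

name = "myNN_N13_L2_E25_DevDM30_Val550_520"  # stem written by trainNN.py

# rebuild the network from its JSON description and the stored weights
with open(name + ".json", "r") as json_file:
    model = model_from_json(json_file.read())
model.load_weights(name + ".h5")
model.compile(loss="binary_crossentropy", optimizer="adam")

# reuse the StandardScaler fitted at training time
scaler = joblib.load("scaler_DM30.sav")

# XNew: hypothetical (nEvents, 12) array holding the same twelve features,
# in the same order, as trainFeatures in prepareDATA.py
XNew = np.zeros((1, 12))
print model.predict(scaler.transform(XNew))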