diff --git a/.gitignore b/.gitignore
index 58fe940..f2ae9b4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,10 @@ keras-tf/*
 *.pcm
 *.root
 *.h5
+*.png
+*.json
+*.pyc
+*.txt
+*.sav
 scaler.sav
+localConfig.py
diff --git a/README.md b/README.md
old mode 100644
new mode 100755
diff --git a/Stop4Body.sublime-project b/Stop4Body.sublime-project
old mode 100644
new mode 100755
diff --git a/applyNN.py b/applyNN.py
old mode 100644
new mode 100755
diff --git a/commonFunctions.py b/commonFunctions.py
old mode 100644
new mode 100755
index ce06396..b2c572f
--- a/commonFunctions.py
+++ b/commonFunctions.py
@@ -1,9 +1,34 @@
 import root_numpy
 import pandas
+import numpy as np
+from math import log
 
 signalMap = {
-    "DM30" : [],
+    "DM30" : ["T2DegStop_250_220",
+              "T2DegStop_275_245",
+              "T2DegStop_300_270",
+              "T2DegStop_325_295",
+              "T2DegStop_350_320",
+              "T2DegStop_375_345",
+              "T2DegStop_400_370",
+              "T2DegStop_425_395",
+              "T2DegStop_450_420",
+              "T2DegStop_475_445",
+              "T2DegStop_500_470",
+              "T2DegStop_525_495",
+              "T2DegStop_550_520",
+              "T2DegStop_575_545",
+              "T2DegStop_600_570",
+              "T2DegStop_625_595",
+              "T2DegStop_650_620",
+              "T2DegStop_675_645",
+              "T2DegStop_700_670",
+              "T2DegStop_725_695",
+              "T2DegStop_750_720",
+              "T2DegStop_775_745",
+              "T2DegStop_800_770"],
     "300_270" : ["T2DegStop_300_270"],
+    "550_520" : ["T2DegStop_550_520"]
 }
 
 bkgDatasets = [
     "Wjets_70to100",
@@ -27,9 +52,11 @@
 ]
 
-def StopDataLoader(path, features, selection="", treename="bdttree", suffix="", signal="DM30", fraction=1.0):
+def StopDataLoader(path, features, test="550_520", selection="", treename="bdttree", suffix="", signal="DM30", fraction=1.0):
     if signal not in signalMap:
-        raise KeyError("Unknown signal requested ("+signal+")")
+        raise KeyError("Unknown training signal requested ("+signal+")")
+    if test not in signalMap:
+        raise KeyError("Unknown test signal requested ("+test+")")
     if fraction >= 1.0:
         fraction = 1.0
     if fraction < 0.0:
@@ -39,43 +66,40 @@ def StopDataLoader(path, features, selection="", treename="bdttree", suffix="",
     if "weight" not in features:
         features.append("weight")
 
     sigDev = None
     sigVal = None
-    for sigName in signalMap[signal]:
-        stopM = int(sigName[10:13])
+
+    for sigName_test in signalMap[test]:
         tmp = root_numpy.root2array(
-            path + "/train/" + sigName + suffix + ".root",
+            path + "test/" + sigName_test + suffix + ".root",
             treename=treename,
             selection=selection,
             branches=features
         )
         if fraction < 1.0:
             tmp = tmp[:int(len(tmp)*fraction)]
-        if sigDev is None:
-            sigDev = pandas.DataFrame(tmp)
-            sigDev["stopM"] = stopM
+        if sigVal is None:
+            sigVal = pandas.DataFrame(tmp)
         else:
-            tmp2 = pandas.DataFrame(tmp)
-            tmp2["stopM"] = stopM
-            sigDev = sigDev.append(tmp2, ignore_index=True)
+            sigVal = sigVal.append(pandas.DataFrame(tmp), ignore_index=True)
+
+    for sigName in signalMap[signal]:
         tmp = root_numpy.root2array(
-            path + "/test/" + sigName + suffix + ".root",
+            path + "train/" + sigName + suffix + ".root",
             treename=treename,
             selection=selection,
             branches=features
         )
         if fraction < 1.0:
             tmp = tmp[:int(len(tmp)*fraction)]
-        if sigVal is None:
-            sigVal = pandas.DataFrame(tmp)
-            sigVal["stopM"] = stopM
+        if sigDev is None:
+            sigDev = pandas.DataFrame(tmp)
         else:
-            tmp2 = pandas.DataFrame(tmp)
-            tmp2["stopM"] = stopM
-            sigVal = sigVal.append(tmp2, ignore_index=True)
+            sigDev = sigDev.append(pandas.DataFrame(tmp), ignore_index=True)
 
     bkgDev = None
     bkgVal = None
@@ -115,9 +139,6 @@ def StopDataLoader(path, features, selection="", treename="bdttree", suffix="",
     bkgDev["sampleWeight"] = 1
     bkgVal["sampleWeight"] = 1
 
-    bkgDev["stopM"] = -1
-    bkgVal["stopM"] = -1
-
     if fraction < 1.0:
         sigDev.weight = sigDev.weight/fraction
         sigVal.weight = sigVal.weight/fraction
@@ -138,5 +159,47 @@ def StopDataLoader(path, features, selection="", treename="bdttree", suffix="",
     dev = dev.append(bkgDev.copy(), ignore_index=True)
     val = sigVal.copy()
     val = val.append(bkgVal.copy(), ignore_index=True)
-
     return dev, val
+
+def FOM1(sIn, bIn):
+    s, sErr = sIn
+    b, bErr = bIn
+    fom = s / (b**0.5)
+    fomErr = ((sErr / (b**0.5))**2 + (bErr*s / (2*(b)**(1.5)))**2)**0.5
+    return (fom, fomErr)
+
+def FOM2(sIn, bIn):
+    s, sErr = sIn
+    b, bErr = bIn
+    fom = s / ((s+b)**0.5)
+    fomErr = ((sErr*(2*b + s)/(2*(b + s)**1.5))**2 + (bErr * s / (2*(b + s)**1.5))**2)**0.5
+    return (fom, fomErr)
+
+def FullFOM(sIn, bIn, fValue=0.2):
+    s, sErr = sIn
+    b, bErr = bIn
+    fomErr = 0.0 # TODO: add the computation of the uncertainty
+    fomA = 2*(s+b)*log(((s+b)*(b + (fValue*b)**2))/(b**2 + (s + b) * (fValue*b)**2))
+    fomB = log(1 + (s*b*b*fValue*fValue)/(b*(b+(fValue*b)**2)))/(fValue**2)
+    fom = (fomA - fomB)**0.5
+    return (fom, fomErr)
+
+def getYields(dataVal, cut=0.5, luminosity=35866, splitFactor=2):
+    # Select the test events that pass the cut on the NN output
+    selectedVal = dataVal[dataVal.NN>cut]
+
+    # Split the selected events into signal (true positives) and background (false positives)
+    selectedSig = selectedVal[selectedVal.category == 1]
+    selectedBkg = selectedVal[selectedVal.category == 0]
+
+    sigYield = selectedSig.weight.sum()
+    sigYieldUnc = np.sqrt(np.sum(np.square(selectedSig.weight)))
+    bkgYield = selectedBkg.weight.sum()
+    bkgYieldUnc = np.sqrt(np.sum(np.square(selectedBkg.weight)))
+
+    # The splitFactor (default 2) undoes the halving of the sample from the train/test split
+    sigYield = sigYield * luminosity * splitFactor
+    sigYieldUnc = sigYieldUnc * luminosity * splitFactor
+    bkgYield = bkgYield * luminosity * splitFactor
+    bkgYieldUnc = bkgYieldUnc * luminosity * splitFactor
+
+    return ((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc))
diff --git a/graphicsNN.py b/graphicsNN.py
new file mode 100644
index 0000000..ee8d7cd
--- /dev/null
+++ b/graphicsNN.py
@@ -0,0 +1,62 @@
+#from matplotlib.colors import LogNorm
+import matplotlib.pyplot as plt
+import numpy as np
+from mpl_toolkits.mplot3d import Axes3D
+
+f = open('DATA_loop_test_trash_550_520.txt', 'r')
+
+layers = []
+neurons = []
+cohen_kappa = []
+FOM_max = []
+FOM_cut = []
+KS_test_stat = []
+KS_test_pval = []
+layers_legend = []
+line_index = 0
+
+# The file holds one seven-line record per trained configuration:
+# layers, neurons, cohen_kappa, FOM_max, FOM_cut, KS_stat, KS_pval
+for line in f:
+    if line_index % 7 == 0:
+        layers.append(float(line))
+    elif line_index % 7 == 1:
+        neurons.append(float(line))
+    elif line_index % 7 == 2:
+        cohen_kappa.append(float(line))
+    elif line_index % 7 == 3:
+        FOM_max.append(float(line))
+    elif line_index % 7 == 4:
+        FOM_cut.append(float(line))
+    elif line_index % 7 == 5:
+        KS_test_stat.append(float(line))
+    elif line_index % 7 == 6:
+        KS_test_pval.append(float(line))
+    line_index = line_index + 1
+
+for l in sorted(set(layers)):
+    layers_legend.append(str(int(l))+" layers")
+
+nol = len(set(layers))   # number of layer counts scanned
+non = len(set(neurons))  # number of neuron counts scanned
+
+plt.figure(figsize=(7,6))
+plt.xlabel('Number of neurons per layer')
+plt.ylabel('F.O.M.')
+plt.suptitle("FOM for several configurations of Neural Nets", fontsize=13, fontweight='bold')
+#plt.title("Cohen's kappa: {0}\nKolmogorov Smirnov test: {1}".format(cohen_kappa, km_value[1]), fontsize=10)
+for x in range(0, nol):
+    plt.plot(neurons[int(x*non):int((x+1)*non)], FOM_max[int(x*non):int((x+1)*non)])
+plt.legend(layers_legend, loc='best')
+plt.show()
+
+plt.hist2d(neurons, layers, bins=[non,nol], weights=FOM_max)
+plt.colorbar()
+plt.show()
\ No newline at end of file
diff --git a/gridSearch.py b/gridSearch.py
new file mode 100755
index 0000000..bd0541b
--- /dev/null
+++ b/gridSearch.py
@@ -0,0 +1,49 @@
+import os
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+import numpy
+import time
+from sklearn.model_selection import GridSearchCV
+from keras.wrappers.scikit_learn import KerasClassifier
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, AlphaDropout
+from keras.constraints import maxnorm
+
+from prepareDATA import *
+
+compileArgs = {'loss': 'binary_crossentropy', 'optimizer': 'adam', 'metrics': ["accuracy"]}
+
+# Fix seed for reproducibility
+seed = 42
+numpy.random.seed(seed)
+
+# Tune the number of hidden layers and the number of neurons per layer
+def myClassifier(nIn=len(trainFeatures), nOut=1, compileArgs=compileArgs, layers=1, neurons=1):
+    model = Sequential()
+    model.add(Dense(nIn, input_dim=nIn, kernel_initializer='he_normal', activation='relu'))
+    for i in range(0, layers):
+        model.add(Dense(neurons, kernel_initializer='he_normal', activation='relu'))
+    model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal'))
+    model.compile(**compileArgs)
+    print("Training with %i layers and %i neurons" % (layers, neurons))
+    return model
+
+model = KerasClassifier(build_fn=myClassifier, epochs=10, batch_size=20, verbose=1)
+
+neurons = [10, 12, 14, 16]
+layers = [1, 2, 3]
+#layers = [1,2,3,4,5]
+param_grid = dict(neurons=neurons, layers=layers)
+grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1) # n_jobs=-1 uses all available cores
+print("Starting the training")
+start = time.time()
+grid_result = grid.fit(XDev, YDev)
+print("Training took ", time.time()-start, " seconds")
+
+print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
+means = grid_result.cv_results_['mean_test_score']
+stds = grid_result.cv_results_['std_test_score']
+params = grid_result.cv_results_['params']
+for mean, stdev, param in zip(means, stds, params):
+    print("%f (%f) with: %r" % (mean, stdev, param))
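Note that gridSearch.py fits without the per-event sample weights that trainNN.py and manualGridSearch.py pass to fit(), so the scanned configurations are ranked on unweighted accuracy. A minimal sketch of forwarding the weights, assuming a scikit-learn version that routes fit parameters through GridSearchCV and slices array-likes of length len(X) per CV fold:

# sample_weight is forwarded to KerasClassifier.fit for every fold
grid_result = grid.fit(XDev, YDev, sample_weight=weightDev)

If the installed version does not slice fit parameters per fold, this raises a length mismatch, and the weighting would have to be handled inside build_fn or via class weights instead.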
diff --git a/localConfig.py b/localConfig.py
deleted file mode 100644
index 584cd36..0000000
--- a/localConfig.py
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/usr/bin/env python
-
-loc = "./"
diff --git a/manualGridSearch.py b/manualGridSearch.py
new file mode 100644
index 0000000..6f055d2
--- /dev/null
+++ b/manualGridSearch.py
@@ -0,0 +1,137 @@
+import root_numpy
+import numpy as np
+import pandas
+import keras
+import time
+from sklearn.externals import joblib
+from sklearn.preprocessing import StandardScaler
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, AlphaDropout
+from keras.optimizers import Adam
+from sklearn.metrics import confusion_matrix, cohen_kappa_score
+from scipy.stats import ks_2samp
+import matplotlib.pyplot as plt
+import localConfig as cfg
+from commonFunctions import StopDataLoader, FullFOM, getYields
+
+import sys
+from math import log
+
+from prepareDATA import *
+
+compileArgs = {'loss': 'binary_crossentropy', 'optimizer': 'adam', 'metrics': ["accuracy"]}
+trainParams = {'epochs': 25, 'batch_size': 20, 'verbose': 1}
+learning_rate = 0.001/5.0
+myAdam = Adam(lr=learning_rate)
+compileArgs['optimizer'] = myAdam
+
+print "Opening file"
+f = open('trainNN_outputs_run1_'+test_point+'.txt', 'w')
+
+def getDefinedClassifier(nIn, nOut, compileArgs, neurons=12, layers=1):
+    model = Sequential()
+    model.add(Dense(neurons, input_dim=nIn, kernel_initializer='he_normal', activation='relu'))
+    for x in range(0, layers):
+        model.add(Dense(neurons, kernel_initializer='he_normal', activation='relu'))
+    model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal'))
+    model.compile(**compileArgs)
+    return model
+
+for y in range(1, 3):       # layers
+    for x in range(10, 20): # neurons per layer
+        print " ==> #LAYERS:", y, "  #NEURONS:", x, " <=="
+
+        print("Starting the training")
+        model = getDefinedClassifier(len(trainFeatures), 1, compileArgs, x, y)
+        history = model.fit(XDev, YDev, validation_data=(XVal,YVal,weightVal), sample_weight=weightDev, **trainParams)
+        name = "myNN_run1_L"+str(y)+"_N"+str(x)+"_"+train_DM
+        model_json = model.to_json()
+        with open(name + ".json", "w") as json_file:
+            json_file.write(model_json)
+        model.save_weights(name + ".h5")
+
+        print("Getting predictions")
+        devPredict = model.predict(XDev)
+        valPredict = model.predict(XVal)
+
+        print("Getting scores")
+        scoreDev = model.evaluate(XDev, YDev, sample_weight=weightDev, verbose=1)
+        scoreVal = model.evaluate(XVal, YVal, sample_weight=weightVal, verbose=1)
+        cohen_kappa = cohen_kappa_score(YVal, valPredict.round())
+
+        print "Calculating FOM:"
+        dataDev["NN"] = devPredict
+        dataVal["NN"] = valPredict
+
+        sig_dataDev = dataDev[dataDev.category==1]
+        bkg_dataDev = dataDev[dataDev.category==0]
+        sig_dataVal = dataVal[dataVal.category==1]
+        bkg_dataVal = dataVal[dataVal.category==0]
+
+        tmpSig, tmpBkg = getYields(dataVal)
+        sigYield, sigYieldUnc = tmpSig
+        bkgYield, bkgYieldUnc = tmpBkg
+
+        fomEvo = []
+        fomCut = []
+
+        bkgEff = []
+        sigEff = []
+
+        sig_Init = dataVal[dataVal.category == 1].weight.sum() * luminosity * 2
+        bkg_Init = dataVal[dataVal.category == 0].weight.sum() * luminosity * 2
+
+        for cut in np.arange(0.0, 0.9999999, 0.001):
+            sig, bkg = getYields(dataVal, cut=cut, luminosity=luminosity)
+            if sig[0] > 0 and bkg[0] > 0:
+                fom, fomUnc = FullFOM(sig, bkg)
+                fomEvo.append(fom)
+                fomCut.append(cut)
+                bkgEff.append(bkg[0]/bkg_Init)
+                sigEff.append(sig[0]/sig_Init)
+
+        print "Maximizing FOM"
+        max_FOM = max(fomEvo) if fomEvo else 0
+
+        Eff = zip(bkgEff, sigEff)
+
+        km_value = ks_2samp(sig_dataDev["NN"].append(bkg_dataDev["NN"]), sig_dataVal["NN"].append(bkg_dataVal["NN"]))
+
+        f.write(str(y)+"\n")
+        f.write(str(x)+"\n")
+        f.write(str(cohen_kappa)+"\n")
+        f.write(str(max_FOM)+"\n")
+        f.write(str(fomCut[fomEvo.index(max_FOM)])+"\n")
+        f.write(str(km_value[0])+"\n")
+        f.write(str(km_value[1])+"\n")
+
+        print "Plotting"
+        plt.figure(figsize=(7,6))
+        plt.subplots_adjust(hspace=0.5)
+        plt.subplot(211)
+        plt.plot(history.history['acc'])
+        plt.plot(history.history['val_acc'])
+        plt.title('model accuracy')
+        plt.ylabel('accuracy')
+        plt.xlabel('epoch')
+        plt.legend(['train', 'test'], loc='upper left')
+
+        plt.subplot(212)
+        plt.plot(history.history['loss'])
+        plt.plot(history.history['val_loss'])
+        plt.title('model loss')
+        plt.ylabel('loss')
+        plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0))
+        plt.xlabel('epoch')
+        plt.legend(['train', 'test'], loc='upper left')
+        plt.savefig(name+"_FOM_"+str(max_FOM)+'.png')
+        #plt.show()
+
+f.close()
+sys.exit("Done!")
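manualGridSearch.py writes one seven-line record per trained configuration (layers, neurons, Cohen's kappa, best FOM, FOM cut, KS statistic, KS p-value), which graphicsNN.py re-parses with a modulo counter. A small reader along the same lines (a hypothetical helper, not part of this PR):

import itertools

FIELDS = ["layers", "neurons", "cohen_kappa", "FOM_max", "FOM_cut", "KS_stat", "KS_pval"]

def readRecords(path):
    # Parse the flat text file back into one dict per trained NN configuration
    records = []
    with open(path) as f:
        while True:
            block = list(itertools.islice(f, len(FIELDS)))
            if len(block) < len(FIELDS):
                break
            records.append(dict(zip(FIELDS, map(float, block))))
    return records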
diff --git a/prepareDATA.py b/prepareDATA.py
new file mode 100755
index 0000000..a09b6ed
--- /dev/null
+++ b/prepareDATA.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+
+import root_numpy
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+from sklearn.externals import joblib
+
+import localConfig as cfg
+from commonFunctions import StopDataLoader
+
+myFeatures = ["Jet1Pt", "Met", "mt", "LepPt", "LepEta", "LepChg", "HT", "NbLoose", "Njet", "JetHBpt", "DrJetHBLep", "JetHBCSV"]
+inputBranches = list(myFeatures)
+inputBranches.append("XS")
+inputBranches.append("weight")
+preselection = "(DPhiJet1Jet2 < 2.5 || Jet2Pt < 60) && (Met > 280) && (HT > 200) && (isTight == 1) && (Jet1Pt > 110)"
+suffix = "_skimmed"
+luminosity = 35866
+number_of_events_print = 0
+test_point = "550_520"
+train_DM = "DM30"
+
+print "Loading datasets..."
+dataDev, dataVal = StopDataLoader(cfg.loc, inputBranches, selection=preselection, suffix=suffix, signal=train_DM, test=test_point)
+#print dataDev.describe()
+#print dataVal.describe()
+
+# Combined view of both samples, used by the summaries below
+data = dataDev.copy()
+data = data.append(dataVal.copy(), ignore_index=True)
+
+'''
+if number_of_events_print == 1:
+    np_dataDev, np_dataVal = StopDataLoader(cfg.loc, inputBranches, suffix=suffix, signal=train_DM, test=test_point)
+    print " ==> BEFORE PRE-SELECTION:"
+    print "   Train Signal Events:", len(np_dataDev[np_dataDev.category==1])
+    print "   Train Background Events:", len(np_dataDev[np_dataDev.category==0])
+    print "   Test Signal Events:", len(np_dataVal[np_dataVal.category==1])
+    print "   Test Background Events:", len(np_dataVal[np_dataVal.category==0])
+    print ""
+    print " ==> AFTER PRE-SELECTION:"
+    print "   Train Signal Events:", len(dataDev[dataDev.category==1])
+    print "   Train Background Events:", len(dataDev[dataDev.category==0])
+    print "   Test Signal Events:", len(dataVal[dataVal.category==1])
+    print "   Test Background Events:", len(dataVal[dataVal.category==0])
'''
+if number_of_events_print == 1:
+    print 'Datasets contain a total of', len(data), '(', data.weight.sum()*luminosity, 'weighted) events:'
+    print '  Development (train):', len(dataDev), '(', dataDev.weight.sum()*luminosity, 'weighted)'
+    print '    Signal:', len(dataDev[dataDev.category == 1]), '(', dataDev[dataDev.category == 1].weight.sum()*luminosity, 'weighted)'
+    print '    Background:', len(dataDev[dataDev.category == 0]), '(', dataDev[dataDev.category == 0].weight.sum()*luminosity, 'weighted)'
+    print '  Validation (test):', len(dataVal), '(', dataVal.weight.sum()*luminosity, 'weighted)'
+    print '    Signal:', len(dataVal[dataVal.category == 1]), '(', dataVal[dataVal.category == 1].weight.sum()*luminosity, 'weighted)'
+    print '    Background:', len(dataVal[dataVal.category == 0]), '(', dataVal[dataVal.category == 0].weight.sum()*luminosity, 'weighted)'
+
+print 'Finding features of interest'
+trainFeatures = [var for var in data.columns if var in myFeatures]
+otherFeatures = [var for var in data.columns if var not in trainFeatures]
+
+print "Preparing the data for the NN"
+XDev = dataDev.ix[:,0:len(trainFeatures)]
+XVal = dataVal.ix[:,0:len(trainFeatures)]
+YDev = np.ravel(dataDev.category)
+YVal = np.ravel(dataVal.category)
+weightDev = np.ravel(dataDev.sampleWeight)
+weightVal = np.ravel(dataVal.sampleWeight)
+
+print "Fitting the scaler and scaling the input variables"
+scaler = StandardScaler().fit(XDev)
+XDev = scaler.transform(XDev)
+XVal = scaler.transform(XVal)
+
+scalerfile = 'scaler_'+train_DM+'.sav'
+joblib.dump(scaler, scalerfile)
+
+print "DATA is ready!"
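prepareDATA.py persists the fitted StandardScaler so that any later application step reproduces exactly the same input transformation. A minimal sketch of the reload side (newData and model are placeholders for whatever applyNN.py works with):

from sklearn.externals import joblib

scaler = joblib.load('scaler_'+train_DM+'.sav')   # must match the scalerfile written above
XNew = scaler.transform(newData[trainFeatures])   # same features, same column order as XDev
predictions = model.predict(XNew)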
diff --git a/skimmer.cc b/skimmer.cc
old mode 100644
new mode 100755
diff --git a/testNN.py b/testNN.py
new file mode 100755
index 0000000..5de2316
--- /dev/null
+++ b/testNN.py
@@ -0,0 +1,166 @@
+import os
+os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
+import root_numpy
+import numpy as np
+import pandas
+import keras
+import time
+from sklearn.externals import joblib
+from sklearn.preprocessing import StandardScaler
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, AlphaDropout
+from keras.optimizers import Adam
+from sklearn.metrics import confusion_matrix, cohen_kappa_score
+from scipy.stats import ks_2samp
+import matplotlib.pyplot as plt
+import localConfig as cfg
+from commonFunctions import StopDataLoader, FOM1, FOM2, FullFOM, getYields
+import sys
+
+from keras.models import model_from_json
+
+from prepareDATA import *
+
+model_name = "myNN_N13_L2_E25_DevDM30_Val550_520"
+
+print "Loading Model ..."
+with open(model_name+'.json', 'r') as json_file:
+    loaded_model_json = json_file.read()
+model = model_from_json(loaded_model_json)
+model.load_weights(model_name+".h5")
+model.compile(loss='binary_crossentropy', optimizer='adam')
+
+print("Getting predictions")
+devPredict = model.predict(XDev)
+valPredict = model.predict(XVal)
+
+print("Getting scores")
+scoreDev = model.evaluate(XDev, YDev, sample_weight=weightDev, verbose=1)
+scoreVal = model.evaluate(XVal, YVal, sample_weight=weightVal, verbose=1)
+print ""
+cohen_kappa = cohen_kappa_score(YVal, valPredict.round())
+
+print "Calculating parameters"
+dataDev["NN"] = devPredict
+dataVal["NN"] = valPredict
+
+sig_dataDev = dataDev[dataDev.category==1]
+bkg_dataDev = dataDev[dataDev.category==0]
+sig_dataVal = dataVal[dataVal.category==1]
+bkg_dataVal = dataVal[dataVal.category==0]
+
+tmpSig, tmpBkg = getYields(dataVal)
+sigYield, sigYieldUnc = tmpSig
+bkgYield, bkgYieldUnc = tmpBkg
+
+fomEvo = []
+fomCut = []
+
+bkgEff = []
+sigEff = []
+
+sig_Init = dataVal[dataVal.category == 1].weight.sum() * luminosity * 2
+bkg_Init = dataVal[dataVal.category == 0].weight.sum() * luminosity * 2
+
+for cut in np.arange(0.0, 0.9999, 0.001):
+    sig, bkg = getYields(dataVal, cut=cut, luminosity=luminosity)
+    if sig[0] > 0 and bkg[0] > 0:
+        fom, fomUnc = FullFOM(sig, bkg)
+        fomEvo.append(fom)
+        fomCut.append(cut)
+        bkgEff.append(bkg[0]/bkg_Init)
+        sigEff.append(sig[0]/sig_Init)
+
+max_FOM = max(fomEvo) if fomEvo else 0
+
+Eff = zip(bkgEff, sigEff)
+
+km_value = ks_2samp(sig_dataDev["NN"].append(bkg_dataDev["NN"]), sig_dataVal["NN"].append(bkg_dataVal["NN"]))
+
+print "Cohen Kappa score:", cohen_kappa
+print "Maximized FOM:", max_FOM
+print "FOM Cut:", fomCut[fomEvo.index(max_FOM)]
+print "KS test statistic:", km_value[0]
+print "KS test p-value:", km_value[1]
+
+selectedVal = dataVal[dataVal.NN>fomCut[fomEvo.index(max_FOM)]]
+selectedSig = selectedVal[selectedVal.category == 1]
+selectedBkg = selectedVal[selectedVal.category == 0]
+sigYield = selectedSig.weight.sum() * luminosity * 2 # The factor 2 undoes the train/test splitting
+bkgYield = selectedBkg.weight.sum() * luminosity * 2
+
+print "Selected events left after cut @", fomCut[fomEvo.index(max_FOM)]
+print "   Number of selected Signal Events:", len(selectedSig)
+print "   Number of selected Background Events:", len(selectedBkg)
+print "   Sig Yield", sigYield
+print "   Bkg Yield", bkgYield
+
+print "Plotting"
+
+plt.figure(figsize=(7,6))
+plt.hist(sig_dataDev["NN"], 50, facecolor='blue', alpha=0.7, normed=1, weights=sig_dataDev["weight"])
+plt.hist(bkg_dataDev["NN"], 50, facecolor='red', alpha=0.7, normed=1, weights=bkg_dataDev["weight"])
+plt.hist(sig_dataVal["NN"], 50, color='blue', alpha=1, normed=1, weights=sig_dataVal["weight"], histtype="step")
+plt.hist(bkg_dataVal["NN"], 50, color='red', alpha=1, normed=1, weights=bkg_dataVal["weight"], histtype="step")
+plt.xlabel('NN output')
+plt.suptitle("MVA overtraining check for classifier: NN", fontsize=13, fontweight='bold')
+plt.title("Cohen's kappa: {0}\nKolmogorov Smirnov test: {1}".format(cohen_kappa, km_value[1]), fontsize=10)
+plt.legend(['Signal (train sample)', 'Background (train sample)', 'Signal (test sample)', 'Background (test sample)'], loc='upper right')
+plt.savefig('hist_'+model_name+'.png', bbox_inches='tight')
+plt.show()
+
+both_dataDev = bkg_dataDev.append(sig_dataDev)
+plt.figure(figsize=(7,6))
+plt.hist(bkg_dataDev["NN"], 50, facecolor='red', weights=bkg_dataDev["weight"])
+plt.hist(both_dataDev["NN"], 50, color="blue", histtype="step", weights=both_dataDev["weight"])
+plt.xlabel('NN output')
+plt.title("Number of Events")
+#plt.yscale('log', nonposy='clip')
+plt.legend(['Background (train sample)', 'Background + Signal (train sample)'], loc="upper left")
+plt.savefig('pred_'+model_name+'.png', bbox_inches='tight')
+plt.show()
+
+plt.figure(figsize=(7,6))
+plt.subplots_adjust(hspace=0.5)
+plt.subplot(211)
+plt.plot(fomCut, fomEvo)
+plt.title("FOM")
+plt.ylabel("FOM")
+plt.xlabel("ND")
+plt.legend(["Max. FOM: {0}".format(max_FOM)], loc='upper left')
+
+plt.subplot(212)
+plt.semilogy(fomCut, Eff)
+plt.axvspan(fomCut[fomEvo.index(max_FOM)], 1, facecolor='#2ca02c', alpha=0.3)
+#plt.axvline(x=fomCut[fomEvo.index(max_FOM)], ymin=0, ymax=1)
+plt.title("Efficiency")
+plt.ylabel("Eff")
+plt.xlabel("ND")
+plt.legend(['Background', 'Signal'], loc='upper left')
+plt.savefig('FOM_'+model_name+'.png', bbox_inches='tight')
+plt.show()
+
+sys.exit("Done!")
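testNN.py and manualGridSearch.py now inline the same cut-scan loop. A candidate helper for commonFunctions.py that both could call (a sketch, not part of this PR):

import numpy as np
from commonFunctions import getYields, FullFOM

def scanFOM(dataVal, fomFunc=FullFOM, cuts=np.arange(0.0, 0.9999, 0.001), luminosity=35866):
    # Scan cuts on the NN output; keep only cuts with nonzero signal and background yields
    fomCut, fomEvo = [], []
    for cut in cuts:
        sig, bkg = getYields(dataVal, cut=cut, luminosity=luminosity)
        if sig[0] > 0 and bkg[0] > 0:
            fomCut.append(cut)
            fomEvo.append(fomFunc(sig, bkg)[0])
    return fomCut, fomEvo

With this, the maximum and its cut follow as max(fomEvo) and fomCut[fomEvo.index(max(fomEvo))], exactly as in the scripts above.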
diff --git a/trainNN.py b/trainNN.py
old mode 100644
new mode 100755
index 861fe13..5d41e8f
--- a/trainNN.py
+++ b/trainNN.py
@@ -1,119 +1,51 @@
-#!/usr/bin/env python
-
-import root_numpy
-import numpy as np
-import pandas
-import keras
+import os
+os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
+from keras.optimizers import Adam
 import time
-from sklearn.externals import joblib
-from sklearn.preprocessing import StandardScaler
+import keras
+import pandas
 from keras.models import Sequential
 from keras.layers import Dense, Dropout, AlphaDropout
-from keras.optimizers import Adam
 from sklearn.metrics import confusion_matrix, cohen_kappa_score
+import matplotlib.pyplot as plt
+from commonFunctions import getYields, FullFOM
+#from scipy.stats import ks_2samp
 
-import localConfig as cfg
-from commonFunctions import StopDataLoader
-
-myFeatures = ["Jet1Pt", "Met", "Njet", "LepPt", "LepEta", "LepChg", "HT", "NbLoose"]
-inputBranches = list(myFeatures)
-inputBranches.append("XS")
-inputBranches.append("weight")
-preselection = "(DPhiJet1Jet2 < 2.5 || Jet2Pt < 60) && (Met > 280) && (HT > 200) && (isTight == 1) && (Jet1Pt > 110)"
-suffix = "_skimmed"
-luminosity = 35866
-
-print "Loading datasets..."
-dataDev, dataVal = StopDataLoader(cfg.loc, inputBranches, selection=preselection, suffix=suffix, signal="300_270")
-#print dataDev.describe()
-#print dataVal.describe()
-data = dataDev.copy()
-data = data.append(dataVal.copy(), ignore_index=True)
-print 'Datasets contain a total of', len(data), '(', data.weight.sum()*luminosity, 'weighted) events:'
-print '  Development (train):', len(dataDev), '(', dataDev.weight.sum()*luminosity, 'weighted)'
-print '    Signal:', len(dataDev[dataDev.category == 1]), '(', dataDev[dataDev.category == 1].weight.sum()*luminosity, 'weighted)'
-print '    Background:', len(dataDev[dataDev.category == 0]), '(', dataDev[dataDev.category == 0].weight.sum()*luminosity, 'weighted)'
-print '  Validation (test):', len(dataVal), '(', dataVal.weight.sum()*luminosity, 'weighted)'
-print '    Signal:', len(dataVal[dataVal.category == 1]), '(', dataVal[dataVal.category == 1].weight.sum()*luminosity, 'weighted)'
-print '    Background:', len(dataVal[dataVal.category == 0]), '(', dataVal[dataVal.category == 0].weight.sum()*luminosity, 'weighted)'
-
-print 'Finding features of interest'
-trainFeatures = [var for var in data.columns if var in myFeatures]
-otherFeatures = [var for var in data.columns if var not in trainFeatures]
-
-######################################
-
-print "Preparing the data for the NN"
-XDev = dataDev.ix[:,0:len(trainFeatures)]
-XVal = dataVal.ix[:,0:len(trainFeatures)]
-YDev = np.ravel(dataDev.category)
-YVal = np.ravel(dataVal.category)
-weightDev = np.ravel(dataDev.sampleWeight)
-weightVal = np.ravel(dataVal.sampleWeight)
-
-print("Fitting the scaler and scaling the input variables")
-scaler = StandardScaler().fit(XDev)
-XDev = scaler.transform(XDev)
-XVal = scaler.transform(XVal)
-
-scalerfile = 'scaler.sav'
-joblib.dump(scaler, scalerfile)
+from prepareDATA import *
 
+n_neurons = 13
+n_layers = 2
+n_epochs = 25
+name = "myNN_N"+str(n_neurons)+"_L"+str(n_layers)+"_E"+str(n_epochs)+"_Dev"+train_DM+"_Val"+test_point
 
 compileArgs = {'loss': 'binary_crossentropy', 'optimizer': 'adam', 'metrics': ["accuracy"]}
-trainParams = {'epochs': 2, 'batch_size': 20, 'verbose': 1}
+trainParams = {'epochs': n_epochs, 'batch_size': 20, 'verbose': 1}
 learning_rate = 0.001/5.0
 myAdam = Adam(lr=learning_rate)
 compileArgs['optimizer'] = myAdam
 
-def getDefinedClassifier(nIn, nOut, compileArgs):
+def getDefinedClassifier(nIn, nOut, compileArgs, neurons, layers):
     model = Sequential()
-    model.add(Dense(16, input_dim=nIn, kernel_initializer='he_normal', activation='relu'))
-    #model.add(Dropout(0.2))
-    model.add(Dense(10, kernel_initializer='he_normal', activation='relu'))
-    #model.add(Dropout(0.2))
+    model.add(Dense(nIn, input_dim=nIn, kernel_initializer='he_normal', activation='relu'))
+    for i in range(0, layers):
+        model.add(Dense(neurons, kernel_initializer='he_normal', activation='relu'))
     model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal'))
     model.compile(**compileArgs)
     return model
 
-def getSELUClassifier(nIn, nOut, compileArgs):
-    model = Sequential()
-    model.add(Dense(16, input_dim=nIn, kernel_initializer='he_normal', activation='selu'))
-    model.add(AlphaDropout(0.2))
-    model.add(Dense(32, kernel_initializer='he_normal', activation='selu'))
-    model.add(AlphaDropout(0.2))
-    model.add(Dense(32, kernel_initializer='he_normal', activation='selu'))
-    model.add(AlphaDropout(0.2))
-    model.add(Dense(32, kernel_initializer='he_normal', activation='selu'))
-    model.add(AlphaDropout(0.2))
-    model.add(Dense(nOut, activation="sigmoid", kernel_initializer='glorot_normal'))
-    model.compile(**compileArgs)
-    return model
-
-print type(XVal)
-print XVal.shape
-print XVal.dtype
-
 print("Starting the training")
 start = time.time()
-model = getDefinedClassifier(len(trainFeatures), 1, compileArgs)
-#model = getSELUClassifier(len(trainFeatures), 1, compileArgs)
-history = model.fit(XDev, YDev, validation_data=(XVal,YVal,weightVal), sample_weight=weightDev, **trainParams)
+call = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-7, patience=5, verbose=1, mode='auto')
+model = getDefinedClassifier(len(trainFeatures), 1, compileArgs, n_neurons, n_layers)
+history = model.fit(XDev, YDev, validation_data=(XVal,YVal,weightVal), sample_weight=weightDev, callbacks=[call], **trainParams)
 print("Training took ", time.time()-start, " seconds")
 
-name = "myNN"
+# To save:
 model.save(name+".h5")
-#model_json = model.to_json()
-#with open(name + ".json", "w") as json_file:
-#    json_file.write(model_json)
-#model.save_weights(name + ".h5")
-
-## To load:
-#from keras.models import model_from_json
-#with open('model.json', 'r') as json_file:
-#    loaded_model_json = json_file.read()
-#loaded_model = model_from_json(loaded_model_json)
-#loaded_model.load_weights("model.h5")
+model_json = model.to_json()
+with open(name + ".json", "w") as json_file:
+    json_file.write(model_json)
+model.save_weights(name + ".h5")
 
 print("Getting predictions")
 devPredict = model.predict(XDev)
@@ -123,139 +55,73 @@ def getSELUClassifier(nIn, nOut, compileArgs):
 
 scoreDev = model.evaluate(XDev, YDev, sample_weight=weightDev, verbose = 1)
 scoreVal = model.evaluate(XVal, YVal, sample_weight=weightVal, verbose = 1)
-print ""
-
-print "Dev score:", scoreDev
-print "Val score:", scoreVal
-print confusion_matrix(YVal, valPredict.round())
-print cohen_kappa_score(YVal, valPredict.round())
 
 print "Calculating FOM:"
 dataVal["NN"] = valPredict
 
-def getYields(dataVal, cut=0.5, luminosity=35866, splitFactor=2):
-    selectedValIdx = (dataVal.NN>cut)
-    selectedVal = dataVal[selectedValIdx]
+tmpSig, tmpBkg = getYields(dataVal)
+sigYield, sigYieldUnc = tmpSig
+bkgYield, bkgYieldUnc = tmpBkg
 
-    selectedSigIdx = (selectedVal.category == 1)
-    selectedBkgIdx = (selectedVal.category == 0)
-    selectedSig = selectedVal[selectedSigIdx]
-    selectedBkg = selectedVal[selectedBkgIdx]
+sigDataVal = dataVal[dataVal.category==1]
+bkgDataVal = dataVal[dataVal.category==0]
 
-    sigYield = selectedSig.weight.sum()
-    sigYieldUnc = np.sqrt(np.sum(np.square(selectedSig.weight)))
-    bkgYield = selectedBkg.weight.sum()
-    bkgYieldUnc = np.sqrt(np.sum(np.square(selectedBkg.weight)))
+fomEvo = []
+fomCut = []
 
-    sigYield = sigYield * luminosity * splitFactor # The factor 2 comes from the splitting
-    sigYieldUnc = sigYieldUnc * luminosity * splitFactor
-    bkgYield = bkgYield * luminosity * splitFactor
-    bkgYieldUnc = bkgYieldUnc * luminosity * splitFactor
+for cut in np.arange(0.0, 0.9999999, 0.001):
+    sig, bkg = getYields(dataVal, cut=cut)
+    if sig[0] > 0 and bkg[0] > 0:
+        fom, fomUnc = FullFOM(sig, bkg)
+        fomEvo.append(fom)
+        fomCut.append(cut)
 
-    return ((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc))
-
-tmpSig, tmpBkg = getYields(dataVal)
-sigYield, sigYieldUnc = tmpSig
-bkgYield, bkgYieldUnc = tmpBkg
+print "Maximizing FOM"
+max_FOM = max(fomEvo) if fomEvo else 0
 
 print "Signal@Presel:", sigDataVal.weight.sum() * 35866 * 2
 print "Background@Presel:", bkgDataVal.weight.sum() * 35866 * 2
 print "Signal:", sigYield, "+-", sigYieldUnc
 print "Background:", bkgYield, "+-", bkgYieldUnc
 
-def FOM1(sIn, bIn):
-    s, sErr = sIn
-    b, bErr = bIn
-    fom = s / (b**0.5)
-    fomErr = ((sErr / (b**0.5))**2+(bErr*s / (2*(b)**(1.5)) )**2)**0.5
-    return (fom, fomErr)
-
-def FOM2(sIn, bIn):
-    s, sErr = sIn
-    b, bErr = bIn
-    fom = s / ((s+b)**0.5)
-    fomErr = ((sErr*(2*b + s)/(2*(b + s)**1.5))**2 + (bErr * s / (2*(b + s)**1.5))**2)**0.5
-    return (fom, fomErr)
-
-def FullFOM(sIn, bIn, fValue=0.2):
-    from math import log
-    s, sErr = sIn
-    b, bErr = bIn
-    fomErr = 0.0 # Add the computation of the uncertainty later
-    fomA = 2*(s+b)*log(((s+b)*(b + (fValue*b)**2))/(b**2 + (s + b) * (fValue*b)**2))
-    fomB = log(1 + (s*b*b*fValue*fValue)/(b*(b+(fValue*b)**2)))/(fValue**2)
-    fom = (fomA - fomB)**0.5
-    return (fom, fomErr)
-
-print "Basic FOM (s/SQRT(b)):", FOM1((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc))
-print "Basic FOM (s/SQRT(s+b)):", FOM2((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc))
-print "Full FOM:", FullFOM((sigYield, sigYieldUnc), (bkgYield, bkgYieldUnc))
+print "Maximized FOM:", max_FOM
+print "FOM Cut:", fomCut[fomEvo.index(max_FOM)]
 
 import sys
 #sys.exit("Done!")
 
 #########################################################
+
 # Let's repeat the above, but monitor the evolution of the loss function
-import matplotlib.pyplot as plt
+
 #history = model.fit(XDev, YDev, validation_data=(XVal,YVal,weightVal), sample_weight=weightDev, **trainParams)
-print(history.history.keys())
 
+plt.figure(figsize=(7,6))
+plt.subplots_adjust(hspace=0.5)
+plt.subplot(211)
 plt.plot(history.history['acc'])
 plt.plot(history.history['val_acc'])
 plt.title('model accuracy')
 plt.ylabel('accuracy')
 plt.xlabel('epoch')
 plt.legend(['train', 'test'], loc='upper left')
-plt.show()
 
+plt.subplot(212)
 plt.plot(history.history['loss'])
 plt.plot(history.history['val_loss'])
 plt.title('model loss')
 plt.ylabel('loss')
+plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0))
 plt.xlabel('epoch')
 plt.legend(['train', 'test'], loc='upper left')
-plt.show()
-
-fomEvo = []
-fomCut = []
-
-for cut in np.arange(0.0, 0.9999999, 0.001):
-    sig, bkg = getYields(dataVal, cut=cut)
-    if sig[0] > 0 and bkg[0] > 0:
-        fom, fomUnc = FullFOM(sig, bkg)
-        fomEvo.append(fom)
-        fomCut.append(cut)
-
-plt.plot(fomCut, fomEvo)
-plt.title("FOM")
-plt.ylabel("FOM")
-plt.xlabel("ND")
-plt.legend(['test'], loc='upper left')
-plt.show()
-
-#print fomEvo
+plt.savefig(name+'.png')
+#plt.savefig('NN2_'+str(y)+'_'+str(x)+'_'+test_point+"_"+str(max_FOM)+'.png')
 
 sys.exit("Done!")
-
-#########################################################
-
-print "Attempting KFold"
-
-from sklearn.model_selection import StratifiedKFold
-
-seed = 42
-np.random.seed(seed)
-kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
-cvscores = []
-for train, test in kfold.split(XDev, YDev):
-    model = getDefinedClassifier(len(trainFeatures), 1, compileArgs)
-    model.fit(XDev[train], YDev[train], validation_data=(XDev[test],YDev[test],weightDev[test]), sample_weight=weightDev[train], **trainParams)
-    scores = model.evaluate(XDev[test], YDev[test], sample_weight=weightDev[test], verbose=1)
-    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
-    cvscores.append(scores[1] * 100)
-print("%.2f%% (+/- %.2f%%)" % (numpy.mean(cvscores), numpy.std(cvscores)))
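The EarlyStopping callback added to trainNN.py halts training on a stagnating val_loss but keeps the weights of the last epoch run, which with patience=5 can be several epochs past the best point. A sketch of one way to also keep the best epoch, using the standard Keras ModelCheckpoint callback (the "_best" filename is illustrative, not part of this PR):

from keras.callbacks import EarlyStopping, ModelCheckpoint

call = EarlyStopping(monitor='val_loss', min_delta=1e-7, patience=5, verbose=1, mode='auto')
best = ModelCheckpoint(name+"_best.h5", monitor='val_loss', save_best_only=True, verbose=1)
history = model.fit(XDev, YDev, validation_data=(XVal, YVal, weightVal),
                    sample_weight=weightDev, callbacks=[call, best], **trainParams)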