diff --git a/DFconverter.py b/DFconverter.py deleted file mode 100644 index 5cb2f23..0000000 --- a/DFconverter.py +++ /dev/null @@ -1,22 +0,0 @@ -from hipe4ml.tree_handler import TreeHandler -import tomli -import sys - - -def convertDF(input_file): - """ - Opens input file in toml format, retrives signal, background and deploy data - like TreeHandler objects - - Parameters - ------------------------------------------------ - df: str - input toml file - """ - with open(str(input_file), encoding="utf-8") as inp_file: - inp_dict = tomli.load(inp_file) - signal = TreeHandler(inp_dict["signal"]["path"], inp_dict["signal"]["tree"]) - background = TreeHandler(inp_dict["background"]["path"], inp_dict["background"]["tree"]) - deploy = TreeHandler(inp_dict["deploy"]["path"], inp_dict["deploy"]["tree"]) - - return signal, background, deploy diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..7995e55 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 CandidatesClassifier + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/MLconfig_XGboostParameters.py b/MLconfig_XGboostParameters.py deleted file mode 100644 index fcf6798..0000000 --- a/MLconfig_XGboostParameters.py +++ /dev/null @@ -1,58 +0,0 @@ -import numpy as np -import pandas as pd -import xgboost as xgb -import matplotlib.pyplot as plt - -from bayes_opt import BayesianOptimization -import gc - - -class XGBoostParams: - - def __init__(self, params, dtrain): - self.params = params - self.dtrain = dtrain - - def bo_tune_xgb(self, max_depth, gamma, alpha, n_estimators ,learning_rate): - - params1 = {'max_depth': int(max_depth), - 'gamma': gamma, - 'alpha':alpha, - 'n_estimators': n_estimators, - 'learning_rate':learning_rate, - 'subsample': 0.8, - 'eta': 0.3, - 'eval_metric': 'auc', 'nthread' : 7} - - cv_result = xgb.cv(params1, self.dtrain, num_boost_round=10, nfold=5) - return cv_result['test-auc-mean'].iloc[-1] - - def get_best_params(self): - """ - Performs Bayesian Optimization and looks for the best parameters - - Parameters: - None - """ - #Invoking the Bayesian Optimizer with the specified parameters to tune - xgb_bo = BayesianOptimization(self.bo_tune_xgb, {'max_depth': (4, 10), - 'gamma': (0, 1), - 'alpha': (2,20), - 'learning_rate':(0,1), - 'n_estimators':(100,500) - }) - #performing Bayesian optimization for 5 iterations with 8 steps of random exploration - # with an #acquisition function of expected improvement - xgb_bo.maximize(n_iter=1, init_points=1) - - max_param = xgb_bo.max['params'] - param1= {'alpha': max_param['alpha'], 'gamma': max_param['gamma'], 'learning_rate': max_param['learning_rate'], - 'max_depth': int(round(max_param['max_depth'],0)), 'n_estimators': int(round(max_param['n_estimators'],0)), - 'objective': 'reg:logistic'} - gc.collect() - - - #To train the algorithm using the parameters selected by bayesian optimization - #Fit/train on training data - bst = xgb.train(param1, self.dtrain) - return bst diff --git a/MLconfig_variables.py b/cand_class/MLconfig_variables.py similarity index 79% rename from MLconfig_variables.py rename to cand_class/MLconfig_variables.py index bc013b8..605c183 100644 --- a/MLconfig_variables.py +++ b/cand_class/MLconfig_variables.py @@ -6,6 +6,15 @@ from scipy.stats import binned_statistic as b_s import matplotlib as mpl +from hipe4ml import plot_utils + +def correlation_matrix(bgr, sign, vars_to_draw, leg_labels, output_path): + res_s_b = plot_utils.plot_corr([bgr, sign], vars_to_draw, leg_labels) + res_s_b[0].savefig(output_path+'/'+'corr_matrix_bgr.png') + res_s_b[1].savefig(output_path+'/'+'corr_matrix_sign.png') + + + def calculate_correlation(df, vars_to_corr, target_var) : """ Calculates correlations with target variable variable and standart errors @@ -56,18 +65,18 @@ def plot1Dcorrelation(vars_to_draw,var_to_corr, corr_signal, corr_signal_errors, background covariance standart error of the mean output_path: path that contains output plot - """ - fig, ax = plt.subplots(figsize=(20,10)) - plt.errorbar(vars_to_draw, corr_signal, yerr=corr_signal_errors, fmt='') - plt.errorbar(vars_to_draw, corr_bg, yerr=corr_bg_errors, fmt='') + fig, ax = plt.subplots(figsize=(10,6)) + plt.errorbar(vars_to_draw, corr_signal, yerr=corr_signal_errors, fmt='--o') + plt.errorbar(vars_to_draw, corr_bg, yerr=corr_bg_errors, fmt='--o') ax.grid(zorder=0) - ax.set_xticklabels(vars_to_draw, fontsize=25, rotation =70) - ax.set_yticklabels([-0.5,-0.4, -0.2,0, -0.2, 0.4], fontsize=25) - plt.legend(('signal','background'), fontsize = 25) - plt.title('Correlation of all variables with '+ var_to_corr+' along with SEM', fontsize = 25) - plt.ylabel('Correlation coefficient', fontsize = 25) + ax.set_xticklabels(vars_to_draw, fontsize=15, rotation =70) + # ax.set_yticklabels([-0.5,-0.4, -0.2,0, -0.2, 0.4], fontsize=25) + ax.yaxis.set_tick_params(labelsize=15) + plt.legend(('signal','background'), fontsize = 15) + plt.title('Correlation of all variables with '+ var_to_corr+' along with SEM', fontsize = 18) + plt.ylabel('Correlation coefficient', fontsize = 15) fig.tight_layout() fig.savefig(output_path+'/all_vars_corr-'+ var_to_corr+'.png') @@ -78,33 +87,24 @@ def profile_mass(df,variable_xaxis, sign, peak, edge_left, edge_right, pdf_key): This function takes the entries of the variables and distributes them in 25 bins. The function then plots the bin centers of the first variable on the x-axis and the mean values of the bins of the second variable on the y-axis, along with its bin stds. - Parameters ------------------------------------------------ df: pandas.DataFrame input DataFrame - variable_xaxis: str variable to be plotted on x axis (invariant mass) - x_unit: str x axis variable units - variable_yaxis: str variable to be plotted on y axis - sgn: int(0 or 1) signal definition(0 background, 1 signal) - pdf_key: matplotlib.backends.backend_pdf.PdfPages output pdf file - peak: int invariant mass peak position - edge_left: int left edge of x axis variable - edge_right: int left edge of y axis variable """ @@ -119,7 +119,7 @@ def profile_mass(df,variable_xaxis, sign, peak, edge_left, edge_right, pdf_key): for var in df.columns: if var != variable_xaxis: - fig, axs = plt.subplots(figsize=(20, 15)) + fig, axs = plt.subplots(figsize=(10, 6)) bin_means, bin_edges, binnumber = b_s(df[variable_xaxis],df[var], statistic='mean', bins=25) bin_std, bin_edges, binnumber = b_s(df[variable_xaxis],df[var], statistic='std', bins=25) @@ -134,19 +134,21 @@ def profile_mass(df,variable_xaxis, sign, peak, edge_left, edge_right, pdf_key): bin_std = np.delete(bin_std , nan_ind) - plt.errorbar(x=bin_centers, y=bin_means, yerr=(bin_std/np.sqrt(bin_count)), linestyle='none', marker='.',mfc='red', ms=10) - + plt.errorbar(x=bin_centers, y=bin_means, yerr=(bin_std/np.sqrt(bin_count)), linestyle='none', linewidth = 2, marker='.',mfc='red', ms=15) + plt.locator_params(axis='y', nbins=5) + plt.locator_params(axis='x', nbins=5) - plt.title('Mean of ' +var+ ' plotted versus bin centers of '+variable_xaxis+ \ - '('+keyword+')', fontsize=25) - plt.xlabel('Mass', fontsize=25) - plt.ylabel("Mean of each bin with the SEM ($\dfrac{bin\ std}{\sqrt{bin\ count}}$) of bin", fontsize=25) + plt.title('Mean of ' +var+ ' vs bin centers of '+variable_xaxis+ \ + '('+keyword+')', fontsize=19) + plt.xlabel('Mass', fontsize=17) + plt.ylabel(" SEM ($\dfrac{bin\ std}{\sqrt{bin\ count}}$) of bin", fontsize=17) - plt.vlines(x=peak,ymin=bin_means.min(),ymax=bin_means.max(), color='r', linestyle='-') - + plt.vlines(x=peak,ymin=bin_means.min(),ymax=bin_means.max(), color='r', linestyle='-', linewidth = 3) + axs.xaxis.set_tick_params(labelsize=16) + axs.yaxis.set_tick_params(labelsize=16) fig.tight_layout() plt.savefig(pdf_key,format='pdf') @@ -160,13 +162,10 @@ def plot2D_all(df, sample, sgn, pdf_key): ------------------------------------------------ df: pandas.DataFrame input dataframe - sample: str title of the sample - sgn: int(0 or 1) signal definition(0 background, 1 signal) - pdf_key: matplotlib.backends.backend_pdf.PdfPages output pdf file """ @@ -206,48 +205,49 @@ def plot2D_mass(df, sample, mass_var, mass_range, sgn, peak, pdf_key): ------------------------------------------------ df: pandas.DataFrame input dataframe - sample: str title of the sample - - mass_var: str name of the invariant mass variable - mass_range: list mass range to be plotted - sgn: int(0 or 1) signal definition(0 background, 1 signal) - peak: int invariant mass value - pdf_key: matplotlib.backends.backend_pdf.PdfPages output pdf file """ for var in df.columns: if var != mass_var: - fig, axs = plt.subplots(figsize=(15, 10)) + fig, axs = plt.subplots(figsize=(6, 4)) cax = plt.hist2d(df[mass_var],df[var],range=[mass_range, [df[var].min(), df[var].max()]], bins=100, norm=mpl.colors.LogNorm(), cmap=plt.cm.viridis) if sgn==1: - plt.title('Signal candidates ' + sample, fontsize = 25) + plt.title('Signal candidates ' + sample, fontsize = 15) if sgn==0: - plt.title('Background candidates ' + sample, fontsize = 25) + plt.title('Background candidates ' + sample, fontsize = 15) - plt.xlabel(mass_var, fontsize=25) - plt.ylabel(var, fontsize=25) + plt.xlabel(mass_var, fontsize=16) + plt.ylabel(var, fontsize=16) - plt.vlines(x=peak,ymin=df[var].min(),ymax=df[var].max(), color='r', linestyle='-') + plt.vlines(x=peak,ymin=df[var].min(),ymax=df[var].max(), color='r', linestyle='-', linewidth = 4) mpl.pyplot.colorbar() + + axs.xaxis.set_tick_params(labelsize=11) + axs.yaxis.set_tick_params(labelsize=11) + + plt.locator_params(axis='y', nbins=5) + plt.locator_params(axis='x', nbins=5) + + plt.legend(shadow=True,title =str(len(df))+ " samples") fig.tight_layout() diff --git a/cand_class/apply_model.py b/cand_class/apply_model.py new file mode 100644 index 0000000..146deb8 --- /dev/null +++ b/cand_class/apply_model.py @@ -0,0 +1,428 @@ +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import xgboost as xgb +from hipe4ml.plot_utils import plot_roc_train_test +from cand_class.helper import * +from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score + +from dataclasses import dataclass + +from matplotlib.font_manager import FontProperties + +from matplotlib.ticker import (MultipleLocator, FormatStrFormatter, + AutoMinorLocator) + +import gc +import matplotlib as mpl +from cand_class.helper import * +from hipe4ml import plot_utils + +mpl.rc('figure', max_open_warning = 0) + + +@dataclass +class ApplyXGB: + """ + A class used to apply XGBoost predictions on the data + + ... + + Attributes + ---------- + x_train : pd.core.frame.DataFrame + train dataframe + x_test : pd.core.frame.DataFrame + test dataframe + y_pred_train : np.ndarray + array with XGBoost predictions for train dataset + y_pred_test : np.ndarray + array with XGBoost predictions for test dataset + output_path : str + output path for plots + + Methods + ------- + get_predictions() + Returns train and test dataframes with predictions + features_importance(bst) + + """ + + x_train : pd.core.frame.DataFrame + x_test : pd.core.frame.DataFrame + + y_pred_train : np.ndarray + y_pred_test : np.ndarray + + y_train : np.ndarray + y_test : np.ndarray + + output_path : str + + __train_res = pd.DataFrame() + __test_res = pd.DataFrame() + + __best_train_thr : int = 0 + __best_test_thr : int = 0 + + + def get_predictions(self): + """ + Makes XGB predictions + + Returns + -------- + + Train and test dataframes with predictions + """ + self.__train_res = self.x_train.copy() + self.__train_res['xgb_preds'] = self.y_pred_train + + self.__test_res = self.x_test.copy() + self.__test_res['xgb_preds'] = self.y_pred_test + + return self.__train_res, self.__test_res + + + def apply_prob_cut(self, ams, train_thr, test_thr): + """ + Applies BDT cut on XGB probabilities and returns 'xgb_preds1' ==1 if + sample's prediction > BDT threshold, otherwise 'xgb_preds1' ==0 + If ams==1 BDT cut is computed with respect to AMS metrics optimization + If ams==0 and train_thr!=0 and test_thr!=0, one should specify + thresholds for test and train datasets manually + """ + + if ams==1: + self.__best_train_thr, self.__best_test_thr, roc_curve_data = AMS(self.y_train, self.y_pred_train, + self.y_test, self.y_pred_test, self.output_path) + + if ams==0 and train_thr!=0 and test_thr!=0: + self.__best_train_thr = train_thr + self.__best_test_thr = test_thr + + train_pred = ((self.y_pred_train > self.__best_train_thr)*1) + test_pred = ((self.y_pred_test > self.__best_test_thr)*1) + + + self.__train_res['xgb_preds1'] = train_pred + + self.__test_res['xgb_preds1'] = test_pred + + return self.__train_res, self.__test_res + + + def print_roc(self): + plot_utils.plot_roc_train_test(self.y_test, self.__test_res['xgb_preds1'], + self.y_train, self.__train_res['xgb_preds1'], None, ['background', 'signal']) + plt.savefig(str(self.output_path)+'/roc_curve.png') + + + + + def features_importance(self, bst): + """ + Plots confusion matrix. A Confusion Matrix C is such that Cij is equal to + the number of observations known to be in group i and predicted to be in + group j. Thus in binary classification, the count of true positives is C00, + false negatives C01,false positives is C10, and true neagtives is C11. + + Confusion matrix is applied to previously unseen by model data, so we can + estimate model's performance + + Parameters + ---------- + bst: xgboost.sklearn.XGBClassifier + model's XGB classifier + + Returns + ------- + + Saves plot with XGB features imporance + + """ + ax = xgb.plot_importance(bst) + plt.rcParams['figure.figsize'] = [6, 3] + plt.rcParams['font.size'] = 15 + ax.xaxis.set_tick_params(labelsize=13) + ax.yaxis.set_tick_params(labelsize=13) + ax.figure.tight_layout() + ax.figure.savefig(str(self.output_path)+"/xgb_train_variables_rank.png") + + + def CM_plot_train_test(self, issignal): + """ + Plots confusion matrix. A Confusion Matrix C is such that Cij is equal to + the number of observations known to be in group i and predicted to be in + group j. Thus in binary classification, the count of true positives is C00, + false negatives C01,false positives is C10, and true neagtives is C11. + + Confusion matrix is applied to previously unseen by model data, so we can + estimate model's performance + + Parameters + ---------- + issignal: str + signal label + + Returns + ------- + + Saves plot with confusion matrix + + """ + + cnf_matrix_train = confusion_matrix(self.__train_res[issignal], self.__train_res['xgb_preds1'], labels=[1,0]) + np.set_printoptions(precision=2) + fig_train, axs_train = plt.subplots(figsize=(8, 6)) + axs_train.yaxis.set_label_coords(-0.04,.5) + axs_train.xaxis.set_label_coords(0.5,-.005) + + axs_train.xaxis.set_tick_params(labelsize=15) + axs_train.yaxis.set_tick_params(labelsize=15) + + plot_confusion_matrix(cnf_matrix_train, classes=['signal','background'], + title=' Train Dataset Confusion Matrix for cut > '+"%.4f"%self.__best_train_thr) + plt.savefig(str(self.output_path)+'/confusion_matrix_extreme_gradient_boosting_train.png') + + + cnf_matrix_test = confusion_matrix(self.__test_res[issignal], self.__test_res['xgb_preds1'], labels=[1,0]) + np.set_printoptions(precision=2) + fig_test, axs_test = plt.subplots(figsize=(8, 6)) + axs_test.yaxis.set_label_coords(-0.04,.5) + axs_test.xaxis.set_label_coords(0.5,-.005) + + axs_test.xaxis.set_tick_params(labelsize=15) + axs_test.yaxis.set_tick_params(labelsize=15) + + plot_confusion_matrix(cnf_matrix_test, classes=['signal','background'], + title=' Test Dataset Confusion Matrix for cut > '+"%.4f"%self.__best_test_thr) + plt.savefig(str(self.output_path)+'/confusion_matrix_extreme_gradient_boosting_test.png') + + + def pT_vs_rapidity(self, df, sign_label, pred_label, x_range, y_range, data_name, pt_rap): + """ + Plots distribution in pT-rapidity phase space + + Parameters + ---------- + df: pd.DataFrame + dataframe with XGBoost predictions + sign_label: str + dataframe column that srecifies if the sample is signal or not + pred_label: str + dataframe column with XGBoost prediction + x_range: list + rapidity range + y_range: list + pT range + data_name: str + name of the dataset (for example, train or test) + pt_rap: list + list with pt and rapidity labels(for example ['pt', 'rapid']) + + Returns + ------- + + Saves istribution in pT-rapidity phase space + + """ + fig, axs = plt.subplots(1,3, figsize=(15, 4), gridspec_kw={'width_ratios': [1, 1, 1]}) + + df_orig = df[df[sign_label]==1] + + df_cut = df[df[pred_label]==1] + + diff_vars = pt_rap + + difference = pd.concat([df_orig[diff_vars], df_cut[diff_vars]]).drop_duplicates(keep=False) + + s_label = 'Signal ' + + axs[0].set_aspect(aspect = 'auto') + axs[1].set_aspect(aspect = 'auto') + axs[2].set_aspect(aspect = 'auto') + + rej = round((1 - (df_cut.shape[0] / df_orig.shape[0])) * 100, 5) + saved = round((df_cut.shape[0] / df_orig.shape[0]) * 100, 5) + diff = df_orig.shape[0] - df_cut.shape[0] + axs[0].legend(shadow=True, title =str(len(df_orig))+' samples', fontsize =14) + axs[1].legend(shadow=True, title =str(len(df_cut))+' samples', fontsize =14) + axs[2].legend(shadow=True, title ='ML cut saves \n'+ str(saved) +'% of '+ s_label, fontsize =14) + + + + counts0, xedges0, yedges0, im0 = axs[0].hist2d(df_orig[pt_rap[1]], df_orig[pt_rap[0]] , range = [x_range, y_range], bins=100, + norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow) + + axs[0].set_title(s_label + ' candidates before ML cut '+data_name, fontsize = 16) + axs[0].set_xlabel('rapidity', fontsize=15) + axs[0].set_ylabel('pT, GeV', fontsize=15) + + + mpl.pyplot.colorbar(im0, ax = axs[0]) + + + + axs[0].xaxis.set_major_locator(MultipleLocator(1)) + axs[0].xaxis.set_major_formatter(FormatStrFormatter('%d')) + + axs[0].xaxis.set_tick_params(which='both', width=2) + + + fig.tight_layout() + + + counts1, xedges1, yedges1, im1 = axs[1].hist2d(df_cut[pt_rap[1]], df_cut[pt_rap[0]] , range = [x_range, y_range], bins=100, + norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow) + + axs[1].set_title(s_label + ' candidates after ML cut '+data_name, fontsize = 16) + axs[1].set_xlabel('rapidity', fontsize=15) + axs[1].set_ylabel('pT, GeV', fontsize=15) + + mpl.pyplot.colorbar(im1, ax = axs[1]) + + axs[1].xaxis.set_major_locator(MultipleLocator(1)) + axs[1].xaxis.set_major_formatter(FormatStrFormatter('%d')) + + axs[1].xaxis.set_tick_params(which='both', width=2) + + fig.tight_layout() + + + counts2, xedges2, yedges2, im2 = axs[2].hist2d(difference[pt_rap[1]], difference[pt_rap[0]] , range = [x_range, y_range], bins=100, + norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow) + + axs[2].set_title(s_label + ' difference ', fontsize = 16) + axs[2].set_xlabel('rapidity', fontsize=15) + axs[2].set_ylabel('pT, GeV', fontsize=15) + + mpl.pyplot.colorbar(im1, ax = axs[2]) + + + axs[2].xaxis.set_major_locator(MultipleLocator(1)) + axs[2].xaxis.set_major_formatter(FormatStrFormatter('%d')) + + axs[2].xaxis.set_tick_params(which='both', width=2) + + fig.tight_layout() + + fig.savefig(self.output_path+'/pT_rapidity_'+s_label+'_ML_cut_'+data_name+'.png') + + + + def hist_variables(self, mass_var, df, sign_label, pred_label, sample, pdf_key): + """ + Applied quality cuts and created distributions for all the features in pdf + file + Parameters + ---------- + df_s: dataframe + signal + df_b: dataframe + background + feature: str + name of the feature to be plotted + pdf_key: PdfPages object + name of pdf document with distributions + """ + + dfs_orig = df[df[sign_label]==1] + dfb_orig = df[df[sign_label]==0] + + dfs_cut = df[(df[sign_label]==1) & (df[pred_label]==1)] + dfb_cut = df[(df[sign_label]==0) & (df[pred_label]==1)] + + diff_vars = dfs_orig.columns.drop([sign_label, pred_label]) + + difference_s = pd.concat([dfs_orig[diff_vars], dfs_cut[diff_vars]]).drop_duplicates(keep=False) + + + + for feature in diff_vars: + fig, ax = plt.subplots(3, figsize=(15, 10)) + + + fontP = FontProperties() + fontP.set_size('xx-large') + + ax[0].hist(dfs_orig[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') + ax[0].hist(dfb_orig[feature], label = 'background', bins = 500, alpha = 0.4, color = 'red') + ax[0].legend(shadow=True,title = 'S/B='+ str(round(len(dfs_orig)/len(dfb_orig), 3)) + + + '\n S samples: '+str(dfs_orig.shape[0]) + '\n B samples: '+ str(dfb_orig.shape[0]) + + '\nquality cuts ', + title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), + loc='upper left', prop=fontP,) + + ax[0].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max()) + + ax[0].xaxis.set_tick_params(labelsize=15) + ax[0].yaxis.set_tick_params(labelsize=15) + + ax[0].set_title(str(feature) + ' MC '+ sample + ' before ML cut', fontsize = 25) + ax[0].set_xlabel(feature, fontsize = 25) + + if feature!=mass_var: + ax[0].set_yscale('log') + + fig.tight_layout() + + + if len(dfb_cut) !=0: + s_b_cut = round(len(dfs_cut)/len(dfb_cut), 3) + title1 = 'S/B='+ str(s_b_cut) + else: + title1 = 'S = '+str(len(dfs_cut)) + ' all bgr was cut' + + + + ax[1].hist(dfs_cut[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') + ax[1].hist(dfb_cut[feature], label = 'background', bins = 500, alpha = 0.4, color = 'red') + ax[1].legend(shadow=True,title = title1 + + '\n S samples: '+str(dfs_cut.shape[0]) + '\n B samples: '+ str(dfb_cut.shape[0]) + + '\nquality cuts + ML cut', + title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), + loc='upper left', prop=fontP,) + + + ax[1].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max()) + + ax[1].xaxis.set_tick_params(labelsize=15) + ax[1].yaxis.set_tick_params(labelsize=15) + + ax[1].set_title(feature + ' MC '+ sample+ ' after ML cut', fontsize = 25) + ax[1].set_xlabel(feature, fontsize = 25) + + if feature!='mass': + ax[1].set_yscale('log') + + fig.tight_layout() + + + + + ax[2].hist(difference_s[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') + ax[2].legend(shadow=True,title ='S samples: '+str(len(difference_s)) +'\nsignal difference', + title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), + loc='upper left', prop=fontP,) + + + ax[2].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max()) + + ax[2].xaxis.set_tick_params(labelsize=15) + ax[2].yaxis.set_tick_params(labelsize=15) + + ax[2].set_title(feature + ' MC '+ sample +' signal difference', fontsize = 25) + ax[2].set_xlabel(feature, fontsize = 25) + + if feature!=mass_var: + ax[2].set_yscale('log') + + fig.tight_layout() + + plt.savefig(pdf_key,format='pdf') + + + pdf_key.close() diff --git a/cand_class/config_reader.py b/cand_class/config_reader.py new file mode 100644 index 0000000..24b47eb --- /dev/null +++ b/cand_class/config_reader.py @@ -0,0 +1,53 @@ +from hipe4ml.tree_handler import TreeHandler +import tomli +import sys + + +def convertDF(input_file, mass_var): + """ + Opens input file in toml format, retrives signal, background and deploy data + like TreeHandler objects + + Parameters + ------------------------------------------------ + df: str + input toml file + """ + with open(str(input_file), encoding="utf-8") as inp_file: + inp_info = tomli.load(inp_file) + + signal = TreeHandler(inp_info["signal"]["path"], inp_info["signal"]["tree"]) + background = TreeHandler(inp_info["background"]["path"], inp_info["background"]["tree"]) + + + + + + bgr_left_edge = inp_info["peak_range"]["bgr_left_edge"] + bgr_right_edge = inp_info["peak_range"]["bgr_right_edge"] + + peak_left_edge = inp_info["peak_range"]["sgn_left_edge"] + peak_right_edge = inp_info["peak_range"]["sgn_right_edge"] + + + selection = str(bgr_left_edge)+'< '+mass_var+' <'+str(peak_left_edge)+' or '+str(peak_right_edge)+\ + '< '+mass_var+' <'+str(bgr_right_edge) + + signalH = signal.get_subset(size = inp_info["number_of_events"]["number_of_signal_events"]) + bkgH = background.get_subset(selection, size=inp_info["number_of_events"]["number_of_background_events"]) + + return signalH, bkgH + + +def read_log_vars(inp_file): + with open(str(inp_file), encoding="utf-8") as inp_file: + inp_dict = tomli.load(inp_file) + + return inp_dict['non_log_scale']['variables'], inp_dict['log_scale']['variables'] + + + +def read_train_vars(inp_file): + with open(str(inp_file), encoding="utf-8") as inp_file: + inp_dict = tomli.load(inp_file) + return inp_dict['train_vars'] diff --git a/helper.py b/cand_class/helper.py similarity index 76% rename from helper.py rename to cand_class/helper.py index a3dfc99..9b4aa7c 100644 --- a/helper.py +++ b/cand_class/helper.py @@ -2,16 +2,14 @@ import pandas as pd import matplotlib.pyplot as plt -from read_configs import * - import xgboost as xgb from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score from numpy import sqrt, log, argmax import itertools +import treelite - -def transform_df_to_log(df, vars, inp_file): +def transform_df_to_log(df, vars, non_log_x, log_x): """ Transforms DataFrame to DataFrame with features in log scale Parameters @@ -24,17 +22,12 @@ def transform_df_to_log(df, vars, inp_file): config TOML file with list of features that should and shouldn't be transformed to log scale """ - df_new = pd.DataFrame() - - non_log_x, log_x = read_log_vars(inp_file) - - + df_new = df.copy() for var in vars: if var in log_x: - df_new['log('+ var+')'] = np.log(df[var]) - if var in non_log_x: - df_new[var] = df[var] + df_new[var] = df_new[var].apply(np.log) + df_new = df_new.rename(columns={var: 'log('+ var+')'}) return df_new @@ -87,24 +80,14 @@ def AMS(y_true, y_predict, y_true1, y_predict1, output_path): S0_best_threshold1 = (thresholds[xi1]) - fig, ax = plt.subplots(figsize=(12, 8), dpi = 100) - plt.plot(fpr, tpr, linewidth=3 ,linestyle=':',color='darkorange',label='ROC curve train (area = %0.6f)' % roc_auc) - plt.plot(fpr1, tpr1, color='green',label='ROC curve test (area = %0.6f)' % roc_auc1) - plt.plot([0, 1], [0, 1], color='navy', linestyle='--', label='Random guess') - #plt.scatter(fpr[xi], tpr[xi], marker='o', color='black', label= 'Best Threshold train set = '+"%.4f" % S0_best_threshold +'\n AMS = '+ "%.2f" % S0[xi]) - plt.scatter(fpr1[xi1], tpr1[xi1], marker='o', s=80, color='blue', label= 'Best Threshold test set = '+"%.4f" % S0_best_threshold1 +'\n AMS = '+ "%.2f" % S01[xi1]) - plt.xlabel('False Positive Rate', fontsize = 18) - plt.ylabel('True Positive Rate', fontsize = 18) - plt.legend(loc="lower right", fontsize = 18) - plt.title('Receiver operating characteristic', fontsize = 18) - ax.tick_params(axis='both', which='major', labelsize=18) - plt.xlim([-0.01, 1.0]) - plt.ylim([0, 1.02]) - #axs.axis([-0.01, 1, 0.9, 1]) - fig.tight_layout() - fig.savefig(str(output_path)+'/hists.png') - plt.show() - return S0_best_threshold, S0_best_threshold1 + roc_curve_data = dict() + roc_curve_data["fpr_train"] = fpr + roc_curve_data["tpr_train"] = tpr + + roc_curve_data["fpr_test"] = fpr1 + roc_curve_data["tpr_test"] = tpr1 + + return S0_best_threshold, S0_best_threshold1, roc_curve_data def plot_confusion_matrix(cm, classes, @@ -119,8 +102,11 @@ def plot_confusion_matrix(cm, classes, print(cm) + # plt.subplots(figsize=(12, 9)) + # plt.figure(figsize = (6, 5)) plt.imshow(cm, interpolation='nearest', cmap=cmap) - plt.title(title) + + plt.title(title, fontsize = 20) plt.colorbar() tick_marks = np.arange(len(classes)) plt.xticks(tick_marks, classes, rotation=45) @@ -131,7 +117,7 @@ def plot_confusion_matrix(cm, classes, for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center", - color="white" if cm[i, j] > thresh else "black") + color="white" if cm[i, j] > thresh else "black", size = 15) plt.tight_layout() plt.ylabel('True label',fontsize = 15) @@ -186,7 +172,27 @@ def difference_df(df_orig, df_cut, cut): def diff_SB_cut(df, target_label): - dfs_cut = df[(df['xgb_preds1']==1) & (df[target_label]==1)] + dfs_cut = df[(df['xgb_preds1']==1)] dfb_cut = df[(df['xgb_preds1']==1) & (df[target_label]==0)] return dfs_cut, dfb_cut + + + +def save_model_lib(bst_model, output_path): + bst = bst_model.get_booster() + + #create an object out of your model, bst in our case + model = treelite.Model.from_xgboost(bst) + #use GCC compiler + toolchain = 'gcc' + #parallel_comp can be changed upto as many processors as one have + model.export_lib(toolchain=toolchain, libpath=output_path+'/xgb_model.so', + params={'parallel_comp': 4}, verbose=True) + + + # Operating system of the target machine + platform = 'unix' + model.export_srcpkg(platform=platform, toolchain=toolchain, + pkgpath=output_path+'/XGBmodel.zip', libname='xgb_model.so', + verbose=True) diff --git a/cand_class/hipe_conf_params.py b/cand_class/hipe_conf_params.py new file mode 100644 index 0000000..3884f03 --- /dev/null +++ b/cand_class/hipe_conf_params.py @@ -0,0 +1,64 @@ +from hipe4ml.model_handler import ModelHandler +from dataclasses import dataclass +import xgboost as xgb + +import matplotlib.pyplot as plt +from hipe4ml import plot_utils + +import xgboost as xgb + + +@dataclass +class XGBmodel(): + features_for_train: list + hyper_pars_ranges: dict + train_test_data: list + output_path : str + __model_hdl: ModelHandler = (None, None, None) + metrics: str = 'roc_auc' + nfold: int = 3 + init_points: int = 1 + n_iter: int = 2 + n_jobs: int = -1 + + + + + def modelBO(self): + model_clf = xgb.XGBClassifier() + self.__model_hdl = ModelHandler(model_clf, self.features_for_train) + self.__model_hdl.optimize_params_bayes(self.train_test_data, self.hyper_pars_ranges, + self.metrics, self.nfold, self.init_points, self.n_iter, self.n_jobs) + + + + def train_test_pred(self): + self.__model_hdl.train_test_model(self.train_test_data) + + y_pred_train = self.__model_hdl.predict(self.train_test_data[0], False) + y_pred_test = self.__model_hdl.predict(self.train_test_data[2], False) + + + return y_pred_train, y_pred_test + + + def save_predictions(self, filename): + print(self.__model_hdl.get_original_model()) + self.__model_hdl.dump_original_model(self.output_path+'/'+filename, xgb_format=False) + + + def load_model(self, filename): + self.__model_hdl.load_model_handler(filename) + + + def get_mode_booster(self): + return self.__model_hdl.model + + + def plot_dists(self): + + leg_labels = ['background', 'signal'] + ml_out_fig = plot_utils.plot_output_train_test(self.__model_hdl, self.train_test_data, 100, + False, leg_labels, True, density=True) + + plt.savefig(str(self.output_path)+'/thresholds.png') diff --git a/cand_class/hists_root.py b/cand_class/hists_root.py new file mode 100644 index 0000000..9542182 --- /dev/null +++ b/cand_class/hists_root.py @@ -0,0 +1,291 @@ +import ROOT + +from array import array + +from cand_class.helper import * + +from dataclasses import dataclass + +@dataclass +class HistBuilder: + + output_path : str + root_output_name = 'hists.root' + + def __post_init__(self): + + __hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE") + __hist_out.cd() + + ROOT.gDirectory.mkdir('Signal') + ROOT.gDirectory.mkdir('Background') + + __hist_out.cd() + __hist_out.cd('Signal') + + + ROOT.gDirectory.mkdir('train') + ROOT.gDirectory.mkdir('test') + + __hist_out.cd() + ROOT.gDirectory.cd('Signal/train') + ROOT.gDirectory.mkdir('pt_rap') + ROOT.gDirectory.mkdir('hists') + + __hist_out.cd() + ROOT.gDirectory.cd('Signal/test') + ROOT.gDirectory.mkdir('pt_rap') + ROOT.gDirectory.mkdir('hists') + + + __hist_out.cd() + __hist_out.cd('Background') + + ROOT.gDirectory.mkdir('train') + ROOT.gDirectory.mkdir('test') + + + __hist_out.cd() + ROOT.gDirectory.cd('Background/train') + ROOT.gDirectory.mkdir('pt_rap') + ROOT.gDirectory.mkdir('hists') + + __hist_out.cd() + ROOT.gDirectory.cd('Background/test') + ROOT.gDirectory.mkdir('pt_rap') + ROOT.gDirectory.mkdir('hists') + + __hist_out.Close() + + + def roc_curve_root(self): + __hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); + + __hist_out.cd() + + fpr = roc_curve_data['fpr_train'] + tpr = roc_curve_data['tpr_train'] + + + fpr1 = roc_curve_data['fpr_test'] + tpr1 = roc_curve_data['tpr_test'] + + + fpr_d_tr = array('d', fpr.tolist()) + tpr_d_tr = array('d', tpr.tolist()) + + fpr_d_ts = array('d', fpr1.tolist()) + tpr_d_ts = array('d', tpr1.tolist()) + + train_roc = ROOT.TGraph(len(fpr_d_tr), fpr_d_tr, tpr_d_tr) + test_roc = ROOT.TGraph(len(fpr_d_ts), fpr_d_ts, tpr_d_ts) + + train_roc.SetLineColor(ROOT.kRed + 2) + test_roc.SetLineColor(ROOT.kBlue + 2) + + + train_roc.SetLineWidth(3) + test_roc.SetLineWidth(3) + + + train_roc.SetLineStyle(9) + test_roc.SetLineStyle(9) + + + train_roc.SetTitle("Receiver operating characteristic train") + test_roc.SetTitle("Receiver operating characteristic test") + + train_roc.GetXaxis().SetTitle('FPR'); + train_roc.GetYaxis().SetTitle('TPR'); + + test_roc.GetXaxis().SetTitle('FPR'); + test_roc.GetYaxis().SetTitle('TPR'); + + train_roc.Write("Train_roc") + test_roc.Write("Test_roc") + + __hist_out.Close() + + def pt_rap_root(self, df_orig, df_cut, difference, sign, x_range, y_range, data_name): + + if sign ==0: + s_label = 'Background' + m = 5 + + if sign==1: + s_label = 'Signal' + m = 1 + + rej = round((1 - (df_cut.shape[0] / df_orig.shape[0])) * 100, 5) + saved = round((df_cut.shape[0] / df_orig.shape[0]) * 100, 5) + diff = df_orig.shape[0] - df_cut.shape[0] + + __hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); + __hist_out.cd() + ROOT.gDirectory.cd(s_label+'/'+data_name+'/'+'pt_rap') + + + rapidity_orig = array('d', df_orig['rapidity'].values.tolist()) + pT_orig = array('d', df_orig['pT'].values.tolist()) + pT_rap_before_cut = ROOT.TH2D( 'pT_rap_before_ML'+data_name, 'pT_rap_before_ML_'+data_name, 100, min(x_range), + max(x_range), 100, min(x_range), max(x_range)) + + + for i in range(len(rapidity_orig)): + pT_rap_before_cut.Fill(rapidity_orig[i], pT_orig[i]) + + ROOT.gStyle.SetOptStat(0) + ROOT.gStyle.SetPalette(ROOT.kBird) + + pT_rap_before_cut.GetXaxis().SetTitle('rapidity'); + pT_rap_before_cut.GetYaxis().SetTitle('pT, GeV'); + pT_rap_before_cut.Draw('COLZ') + + + + + + rapidity_cut = array('d', df_cut['rapidity'].values.tolist()) + pT_cut = array('d', df_cut['pT'].values.tolist()) + + pT_rap_cut = ROOT.TH2D( 'pT_rap_after_ML_'+data_name, 'pT_rap_after_ML_'+data_name, 100, min(x_range), + max(x_range), 100, min(x_range), max(x_range)) + + + for i in range(len(rapidity_cut)): + pT_rap_cut.Fill(rapidity_cut[i], pT_cut[i]) + + ROOT.gStyle.SetOptStat(0) + ROOT.gStyle.SetPalette(ROOT.kBird) + + pT_rap_cut.GetXaxis().SetTitle('rapidity'); + pT_rap_cut.GetYaxis().SetTitle('pT, GeV'); + pT_rap_cut.Draw('COLZ') + + + + + + rapidity_diff = array('d', difference['rapidity'].values.tolist()) + pT_diff = array('d', difference['pT'].values.tolist()) + + pT_rap_diff = ROOT.TH2D('pT_rap_diff_'+data_name, 'pT_rap_diff_'+data_name, 100, min(x_range), + max(x_range), 100, min(x_range), max(x_range)) + + + for i in range(len(rapidity_diff)): + pT_rap_diff.Fill(rapidity_diff[i], pT_diff[i]) + + ROOT.gStyle.SetOptStat(0) + ROOT.gStyle.SetPalette(ROOT.kBird) + + pT_rap_diff.GetXaxis().SetTitle('rapidity'); + pT_rap_diff.GetYaxis().SetTitle('pT, GeV'); + pT_rap_diff.Draw('COLZ') + + + pT_rap_before_cut.Write() + pT_rap_cut.Write() + pT_rap_diff.Write() + + __hist_out.Close() + + + def hist_variables_root(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut,difference_s, sample): + """ + Applied quality cuts and created distributions for all the features in pdf + file + Parameters + ---------- + df_s: dataframe + signal + df_b: dataframe + background + feature: str + name of the feature to be plotted + pdf_key: PdfPages object + name of pdf document with distributions + """ + + __hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); + __hist_out.cd() + + + for feature in dfs_orig.columns: + + dfs_orig_feat = array('d', dfs_orig[feature].values.tolist()) + dfb_orig_feat = array('d', dfb_orig[feature].values.tolist()) + + + dfs_cut_feat = array('d', dfs_cut[feature].values.tolist()) + dfb_cut_feat = array('d', dfb_cut[feature].values.tolist()) + + + dfs_diff_feat = array('d', difference_s[feature].values.tolist()) + + + dfs_orig_root = ROOT.TH1D('signal before ML '+feature, 'signal before ML '+feature, 500, + min(dfs_orig[feature].values.tolist()), max(dfs_orig[feature].values.tolist())) + + for i in range(len(dfs_orig_feat)): + dfs_orig_root.Fill(dfs_orig_feat[i]) + + dfs_orig_root.GetXaxis().SetTitle(feature); + + dfs_orig_root.Draw() + + dfb_orig_root = ROOT.TH1D('background before ML '+feature, 'background before ML '+feature, 500, + min(dfb_orig[feature].values.tolist()), max(dfb_orig[feature].values.tolist())) + + for i in range(len(dfb_orig_feat)): + dfb_orig_root.Fill(dfb_orig_feat[i]) + + dfb_orig_root.GetXaxis().SetTitle(feature); + dfb_orig_root.Draw() + + + dfs_cut_root = ROOT.TH1D('signal after ML '+feature, 'signal after ML '+feature, 500, + min(dfs_cut[feature].values.tolist()), max(dfs_cut[feature].values.tolist())) + + for i in range(len(dfs_cut_feat)): + dfs_cut_root.Fill(dfs_cut_feat[i]) + + dfs_cut_root.GetXaxis().SetTitle(feature); + dfs_cut_root.Draw() + + + dfb_cut_root = ROOT.TH1D('background after ML '+feature, 'background after ML '+feature, 500, + min(dfb_cut[feature].values.tolist()), max(dfb_cut[feature].values.tolist())) + + for i in range(len(dfb_cut_feat)): + dfb_cut_root.Fill(dfb_cut_feat[i]) + + dfb_cut_root.GetXaxis().SetTitle(feature); + dfb_cut_root.Draw() + + + dfs_diff_root = ROOT.TH1D('signal difference '+feature, 'signal difference '+feature, 500, + min(difference_s[feature].values.tolist()), max(difference_s[feature].values.tolist())) + + for i in range(len(dfs_diff_feat)): + dfs_diff_root.Fill(dfs_diff_feat[i]) + + dfs_diff_root.GetXaxis().SetTitle(feature); + dfs_diff_root.Draw() + + + __hist_out.cd() + + __hist_out.cd('Signal'+'/'+sample+'/'+'hists') + dfs_orig_root.Write() + dfs_cut_root.Write() + dfs_diff_root.Write() + + __hist_out.cd() + + __hist_out.cd('Background'+'/'+sample+'/'+'hists') + + dfb_orig_root.Write() + dfb_cut_root.Write() + + __hist_out.Close() diff --git a/read_configs.py b/read_configs.py deleted file mode 100644 index 96de294..0000000 --- a/read_configs.py +++ /dev/null @@ -1,22 +0,0 @@ -import numpy as np -import pandas as pd -import tomli - - -def read_log_vars(inp_file): - with open(str(inp_file), encoding="utf-8") as inp_file: - inp_dict = tomli.load(inp_file) - - return inp_dict['non_log_variables'], inp_dict['log_variables'] - - -def read_peak(inp_file): - with open(str(inp_file), encoding="utf-8") as inp_file: - inp_dict = tomli.load(inp_file) - return inp_dict - - -def read_train_vars(inp_file): - with open(str(inp_file), encoding="utf-8") as inp_file: - inp_dict = tomli.load(inp_file) - return inp_dict['train_vars'] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..91f39c0 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +uproot +pandas +numpy +scikit-learn +xgboost +bayesian-optimization +hipe4ml +treelite +treelite_runtime diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..93a33d4 --- /dev/null +++ b/setup.py @@ -0,0 +1,11 @@ +from setuptools import setup, find_packages + + +setup( + name="CandidatesClassifier", + version="0.1.0", + author="olha lavoryk", + license="MIT", + url="https://github.com/conformist89/CandidatesClassifier.git", + packages=find_packages() +) diff --git a/train_test_xgboost.py b/train_test_xgboost.py deleted file mode 100644 index 04d3032..0000000 --- a/train_test_xgboost.py +++ /dev/null @@ -1,368 +0,0 @@ -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -import xgboost as xgb -from hipe4ml.plot_utils import plot_roc_train_test -from helper import * -from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score - - -from matplotlib.font_manager import FontProperties - -from matplotlib.ticker import (MultipleLocator, FormatStrFormatter, - AutoMinorLocator) - -import gc -import matplotlib as mpl - - - -mpl.rc('figure', max_open_warning = 0) - - -class TrainTestXGBoost: - """ - Class used for train and test model on parameters found by Bayesian Optimization - Parameters - ------------------------------------------------- - bst: - model to be used - - dtrain: xgboost Matrix - train dataset - - y_train: - train target label - - dtest: xgboost Matrix - test dataset - - y_test: - test target label - - bst_train: - dataframe with predictions - - - """ - - - def __init__(self, bst, dtrain, y_train, dtest, y_test, output_path): - - self.bst = bst - self.dtrain = dtrain - self.y_train = y_train - self.dtest = dtest - self.y_test = y_test - self.output_path = output_path - - self.__bst_train = None - self.__bst_test = None - - self.__train_best_thr = None - self.__test_best_thr = None - - self.__train_pred = None - self.__test_pred = None - - def apply_predictions(self): - - self.__bst_train= pd.DataFrame(data=self.bst.predict(self.dtrain, output_margin=False), columns=["xgb_preds"]) - self.__bst_train['issignal']=self.y_train - - - self.__bst_test= pd.DataFrame(data=self.bst.predict(self.dtest, output_margin=False), columns=["xgb_preds"]) - self.__bst_test['issignal']=self.y_test - - return self.__bst_train, self.__bst_test - - - def get_threshold(self, train_y, test_y): - self.__train_best_thr, self.__test_best_thr = AMS(train_y, self.__bst_train['xgb_preds'], test_y, self.__bst_test['xgb_preds'], self.output_path) - return self.__train_best_thr, self.__test_best_thr - - - - def apply_threshold(self): - cut_train = self.__train_best_thr - self.__train_pred = ((self.__bst_train['xgb_preds']>cut_train)*1) - - cut_test = self.__test_best_thr - self.__test_pred = ((self.__bst_test['xgb_preds']>cut_test)*1) - - return self.__train_pred, self.__test_pred - - - def get_result(self, x_train, x_test): - - train_with_preds = x_train.copy() - train_with_preds['xgb_preds1'] = self.__train_pred.values - - - test_with_preds = x_test.copy() - test_with_preds['xgb_preds1'] = self.__test_pred.values - - return train_with_preds, test_with_preds - - - - def features_importance(self): - ax = xgb.plot_importance(self.bst) - plt.rcParams['figure.figsize'] = [6, 3] - ax.figure.tight_layout() - ax.figure.savefig(str(self.output_path)+"/xgb_train_variables_rank.png") - - - - def CM_plot_train_test(self): - """ - Plots confusion matrix. A Confusion Matrix C is such that Cij is equal to - the number of observations known to be in group i and predicted to be in - group j. Thus in binary classification, the count of true positives is C00, - false negatives C01,false positives is C10, and true neagtives is C11. - - Confusion matrix is applied to previously unseen by model data, so we can - estimate model's performance - - Parameters - ---------- - test_best: numpy.float32 - best threshold - - x_train: dataframe - we want to get confusion matrix on training datasets - """ - #lets take the best threshold and look at the confusion matrix - - cnf_matrix_train = confusion_matrix(self.__bst_train['issignal'], self.__train_pred, labels=[1,0]) - np.set_printoptions(precision=2) - fig_train, axs_train = plt.subplots(figsize=(10, 8)) - axs_train.yaxis.set_label_coords(-0.04,.5) - axs_train.xaxis.set_label_coords(0.5,-.005) - plot_confusion_matrix(cnf_matrix_train, classes=['signal','background'], - title=' Train Dataset Confusion Matrix for XGB for cut > '+str(self.__train_best_thr)) - plt.savefig(str(self.output_path)+'/confusion_matrix_extreme_gradient_boosting_train.png') - - - cnf_matrix_test = confusion_matrix(self.__bst_test['issignal'], self.__test_pred, labels=[1,0]) - np.set_printoptions(precision=2) - fig_test, axs_test = plt.subplots(figsize=(10, 8)) - axs_test.yaxis.set_label_coords(-0.04,.5) - axs_test.xaxis.set_label_coords(0.5,-.005) - plot_confusion_matrix(cnf_matrix_test, classes=['signal','background'], - title=' Test Dataset Confusion Matrix for XGB for cut > '+str(self.__test_best_thr)) - plt.savefig(str(self.output_path)+'/confusion_matrix_extreme_gradient_boosting_test.png') - - - def preds_prob(self, preds, true, dataset): - if dataset =='train': - label1 = 'XGB Predictions on the training data set' - else: - label1 = 'XGB Predictions on the test data set' - fig, ax = plt.subplots(figsize=(12, 8)) - bins1=100 - plt.hist(self.__bst_test[preds], bins=bins1,facecolor='green',alpha = 0.3, label=label1) - - TP = self.__bst_test[(self.__bst_test[true]==1)] - TN = self.__bst_test[(self.__bst_test[true]==0)] - #TP[preds].plot.hist(ax=ax, bins=bins1,facecolor='blue', histtype='stepfilled',alpha = 0.3, label='True Positives/signal in predictions') - hist, bins = np.histogram(TP[preds], bins=bins1) - err = np.sqrt(hist) - center = (bins[:-1] + bins[1:]) / 2 - - - hist1, bins1 = np.histogram(TN[preds], bins=bins1) - err1 = np.sqrt(hist1) - plt.errorbar(center, hist1, yerr=err1, fmt='o', - c='Red', label='Background in predictions') - - plt.errorbar(center, hist, yerr=err, fmt='o', - c='blue', label='Signal in predictions') - - ax.set_yscale('log') - plt.xlabel('Probability',fontsize=18) - plt.ylabel('Counts', fontsize=18) - plt.legend(fontsize=18) - ax.set_xticks(np.arange(0,1.1,0.1)) - ax.tick_params(axis='both', which='major', labelsize=18) - ax.tick_params(axis='both', which='minor', labelsize=16) - plt.show() - fig.tight_layout() - fig.savefig(str(self.output_path)+'/test_best_pred.png') - - - def pT_vs_rapidity(self, df_orig, df_cut, difference, sign, x_range, y_range, data_name): - fig, axs = plt.subplots(1,3, figsize=(15, 4), gridspec_kw={'width_ratios': [1, 1, 1]}) - - - if sign ==0: - s_label = 'Background' - m = 5 - - if sign==1: - s_label = 'Signal' - m = 1 - - axs[0].set_aspect(aspect = 'auto') - axs[1].set_aspect(aspect = 'auto') - axs[2].set_aspect(aspect = 'auto') - - rej = round((1 - (df_cut.shape[0] / df_orig.shape[0])) * 100, 5) - diff = df_orig.shape[0] - df_cut.shape[0] - axs[0].legend(shadow=True, title =str(len(df_orig))+' samples', fontsize =14) - axs[1].legend(shadow=True, title =str(len(df_cut))+' samples', fontsize =14) - axs[2].legend(shadow=True, title ='ML cut rejects \n'+ str(rej) +'% of '+ s_label + - '\n ' + str(diff)+ ' samples were rejected ', - fontsize =14) - - counts0, xedges0, yedges0, im0 = axs[0].hist2d(df_orig['rapidity'], df_orig['pT'] , range = [x_range, y_range], bins=100, - norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow) - - axs[0].set_title(s_label + ' candidates before ML cut '+data_name, fontsize = 16) - axs[0].set_xlabel('rapidity', fontsize=15) - axs[0].set_ylabel('pT, GeV', fontsize=15) - - - mpl.pyplot.colorbar(im0, ax = axs[0]) - - - - axs[0].xaxis.set_major_locator(MultipleLocator(1)) - axs[0].xaxis.set_major_formatter(FormatStrFormatter('%d')) - - axs[0].xaxis.set_tick_params(which='both', width=2) - - - fig.tight_layout() - - - counts1, xedges1, yedges1, im1 = axs[1].hist2d(df_cut['rapidity'], df_cut['pT'] , range = [x_range, y_range], bins=100, - norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow) - - axs[1].set_title(s_label + ' candidates after ML cut '+data_name, fontsize = 16) - axs[1].set_xlabel('rapidity', fontsize=15) - axs[1].set_ylabel('pT, GeV', fontsize=15) - - mpl.pyplot.colorbar(im1, ax = axs[1]) - - axs[1].xaxis.set_major_locator(MultipleLocator(1)) - axs[1].xaxis.set_major_formatter(FormatStrFormatter('%d')) - - axs[1].xaxis.set_tick_params(which='both', width=2) - - fig.tight_layout() - - - counts2, xedges2, yedges2, im2 = axs[2].hist2d(difference['rapidity'], difference['pT'] , range = [x_range, y_range], bins=100, - norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow) - - axs[2].set_title(s_label + ' difference ', fontsize = 16) - axs[2].set_xlabel('rapidity', fontsize=15) - axs[2].set_ylabel('pT, GeV', fontsize=15) - - mpl.pyplot.colorbar(im1, ax = axs[2]) - - - axs[2].xaxis.set_major_locator(MultipleLocator(1)) - axs[2].xaxis.set_major_formatter(FormatStrFormatter('%d')) - - axs[2].xaxis.set_tick_params(which='both', width=2) - - fig.tight_layout() - - fig.savefig(self.output_path+'/pT_rapidity_'+s_label+'_ML_cut_'+data_name+'.png') - - - - def hist_variables(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s, sample, pdf_key): - """ - Applied quality cuts and created distributions for all the features in pdf - file - Parameters - ---------- - df_s: dataframe - signal - df_b: dataframe - background - feature: str - name of the feature to be plotted - pdf_key: PdfPages object - name of pdf document with distributions - """ - - for feature in dfs_orig.columns: - fig, ax = plt.subplots(3, figsize=(20, 10)) - - - fontP = FontProperties() - fontP.set_size('xx-large') - - ax[0].hist(dfs_orig[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') - ax[0].hist(dfb_orig[feature], label = 'background', bins = 500, alpha = 0.4, color = 'red') - ax[0].legend(shadow=True,title = 'S/B='+ str(round(len(dfs_orig)/len(dfb_orig), 3)) + - - '\n S samples: '+str(dfs_orig.shape[0]) + '\n B samples: '+ str(dfb_orig.shape[0]) + - '\nquality cuts ', - title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), - loc='upper left', prop=fontP,) - - ax[0].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max()) - - ax[0].xaxis.set_tick_params(labelsize=15) - ax[0].yaxis.set_tick_params(labelsize=15) - - ax[0].set_title(str(feature) + ' MC '+ sample, fontsize = 25) - ax[0].set_xlabel(feature, fontsize = 25) - - if feature!=mass_var: - ax[0].set_yscale('log') - - fig.tight_layout() - - - ax[1].hist(dfs_cut[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') - ax[1].hist(dfb_cut[feature], label = 'background', bins = 500, alpha = 0.4, color = 'red') - ax[1].legend(shadow=True,title = 'S/B='+ str(round(len(dfs_cut)/len(dfb_cut), 3)) + - '\n S samples: '+str(dfs_cut.shape[0]) + '\n B samples: '+ str(dfb_cut.shape[0]) + - '\nquality cuts + ML cut', - title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), - loc='upper left', prop=fontP,) - - - ax[1].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max()) - - ax[1].xaxis.set_tick_params(labelsize=15) - ax[1].yaxis.set_tick_params(labelsize=15) - - ax[1].set_title(feature + ' MC '+ sample, fontsize = 25) - ax[1].set_xlabel(feature, fontsize = 25) - - if feature!='mass': - ax[1].set_yscale('log') - - fig.tight_layout() - - - - - ax[2].hist(difference_s[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') - ax[2].legend(shadow=True,title ='S samples: '+str(len(difference_s)) +'\nsignal difference', - title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), - loc='upper left', prop=fontP,) - - - ax[2].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max()) - - ax[2].xaxis.set_tick_params(labelsize=15) - ax[2].yaxis.set_tick_params(labelsize=15) - - ax[2].set_title(feature + ' MC '+ sample, fontsize = 25) - ax[2].set_xlabel(feature, fontsize = 25) - - if feature!=mass_var: - ax[2].set_yscale('log') - - fig.tight_layout() - - plt.savefig(pdf_key,format='pdf') - pdf_key.close()