diff --git a/doc/makedoc b/doc/makedoc index 1fbcf3d8..6c413e12 100755 --- a/doc/makedoc +++ b/doc/makedoc @@ -121,7 +121,7 @@ class MakeLatex(): return False, [] # Splitting the lines - msg = out.split('\n') + msg = str(out).split('\n') # Removing irrelevant component msg2 = [] diff --git a/madanalysis/misc/database.py b/madanalysis/misc/database.py new file mode 100644 index 00000000..c9498638 --- /dev/null +++ b/madanalysis/misc/database.py @@ -0,0 +1,153 @@ +################################################################################ +# +# Copyright (C) 2012-2022 Jack Araz, Eric Conte & Benjamin Fuks +# The MadAnalysis development team, email: +# +# This file is part of MadAnalysis 5. +# Official website: +# +# MadAnalysis 5 is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# MadAnalysis 5 is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with MadAnalysis 5. If not, see +# +################################################################################ + +from typing import Text +from math import sqrt + +""" +TODO: Instead of using regiondata: dict we can change our datastructure to be more object oriented + which will give us more/easy control over the data. These functions are written to replicate `regiondata` + in run_recast.py file. In order not to change too much functionality `dict` properties have been implemented + this will allow the complete transition to be spread over time. +""" + +class Region: + """ + Data structure for an analysis region + + :param analysis: name of the analysis + :param regionID: name of the region + :param nobs: number of observed events + :param nb: number of expected events + :param deltanb: total uncertainty on the expected events + :param lumi: luminosity of the analysis + """ + + def __init__( + self, + regionID: Text, + nobs: int, + nb: float, + deltanb: float, + lumi: float = 0.0, + analysis: Text = "__unknown_analysis__", + ) -> None: + self.analysis = analysis + self.region_id = regionID + + self.nobs = nobs + self.nb = nb + self.deltanb = deltanb + + self.final_nevt = -1 + self.initial_nevt = -1 + self.xsec = -1 + self.lumi = lumi + + self.s95exp = -1 + self.s95obs = -1 + self.cls = -1 + self.best = 0 + + # define getitem and setitem attributes for backwards compatibility. + # This will allow us not to change current development too much. 
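+        # For example, region["Nf"] reads self.final_nevt and region["N0"] reads
+        # self.initial_nevt, so existing regiondata[reg]["Nf"]-style lookups in
+        # run_recast.py keep working once regiondata stores Region objects.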
+
+    def __getitem__(self, item):
+        input_key = item
+        if item == "Nf":
+            input_key = "final_nevt"
+        elif item == "N0":
+            input_key = "initial_nevt"
+        return vars(self)[input_key]
+
+    def __setitem__(self, key, value):
+        input_key = key
+        if key == "Nf":
+            input_key = "final_nevt"
+        elif key == "N0":
+            input_key = "initial_nevt"
+        elif key not in vars(self).keys():
+            return
+
+        setattr(self, input_key, value)
+
+    @property
+    def eff(self) -> float:
+        """
+        Get efficiency of the region
+        :return: float
+        """
+        return self.final_nevt / self.initial_nevt
+
+    @property
+    def stat(self) -> float:
+        """
+        Statistical uncertainty of the final number of events
+        """
+        return sqrt(self.eff * (1 - self.eff) / abs(self.initial_nevt * self.lumi))
+
+    @property
+    def luminosity(self) -> float:
+        return self.lumi
+
+    @luminosity.setter
+    def luminosity(self, value: float):
+        self.lumi = value
+
+    @property
+    def nsignal(self):
+        return self.xsec * self.lumi * 1000.0 * self.eff
+
+    @property
+    def n95(self) -> float:
+        """
+        number of expected events for excluded SR at 95% CL
+        """
+        return self.s95exp * self.lumi * 1000.0 * self.eff
+
+    @property
+    def rSR(self) -> float:
+        """
+        Ratio between number of signal events and number of expected events for excluded SR at 95% CL
+        """
+        return self.nsignal / self.n95
+
+
+class RegionData:
+    def __init__(self):
+        self.regiondata = {}
+
+    def keys(self):
+        return self.regiondata.keys()
+
+    def items(self):
+        return self.regiondata.items()
+
+    def __getitem__(self, item):
+        return self.regiondata.get(item, {})
+
+    def __setitem__(self, key, value):
+        if isinstance(value, dict):
+            self.regiondata[key] = Region(
+                regionID=key, nobs=value["nobs"], nb=value["nb"], deltanb=value["deltanb"]
+            )
diff --git a/madanalysis/misc/pyhf_interface.py b/madanalysis/misc/pyhf_interface.py
new file mode 100644
index 00000000..ff2defb3
--- /dev/null
+++ b/madanalysis/misc/pyhf_interface.py
@@ -0,0 +1,461 @@
+################################################################################
+#
+#  Copyright (C) 2012-2022 Jack Araz, Eric Conte & Benjamin Fuks
+#  The MadAnalysis development team, email:
+#
+#  This file is part of MadAnalysis 5.
+#  Official website:
+#
+#  MadAnalysis 5 is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 3 of the License, or
+#  (at your option) any later version.
+#
+#  MadAnalysis 5 is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with MadAnalysis 5.
If not, see +# +################################################################################ + +import logging, sys, scipy +from typing import Sequence, Dict, Union, Optional, Tuple +from numpy import warnings, isnan +import numpy as np +from madanalysis.misc.histfactory_reader import HF_Background, HF_Signal + +try: + import pyhf +except ImportError as err: + logging.getLogger("MA5").error("Can't import pyhf.") + sys.exit() + +from pyhf.optimize import mixins + +pyhf.pdf.log.setLevel(logging.CRITICAL) +pyhf.workspace.log.setLevel(logging.CRITICAL) +mixins.log.setLevel(logging.CRITICAL) +pyhf.set_backend("numpy", precision="64b") + + +class PyhfInterface: + """ + pyhf interface for MadAnalysis 5 + :param signal: json patch signal + :param background: json dict for background + """ + + def __init__( + self, + signal: Union[HF_Signal, float], + background: Union[HF_Background, float], + nb: Optional[float] = None, + delta_nb: Optional[float] = None, + ): + self.model = None + self.data = None + + self.background = background + self.signal = signal + self.nb = nb + self.delta_nb = delta_nb + + @staticmethod + def _initialize_workspace( + signal: Union[Sequence, float], + background: Union[Dict, float], + nb: Optional[float] = None, + delta_nb: Optional[float] = None, + expected: Optional[bool] = False, + ): + """ + Initialize pyhf workspace + + Parameters + ---------- + signal: Union[Sequence, float] + number of signal events or json patch + background: Union[Dict, float] + number of observed events or json dictionary + nb: Optional[float] + number of expected background events (MC) + delta_nb: Optional[float] + uncertainty on expected background events + expected: bool + if true prepare apriori expected workspace, default False + + Returns + ------- + Tuple: + Workspace(can be none in simple case), model, data + """ + workspace, model, data = None, None, None + try: + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + if isinstance(signal, float) and isinstance(background, (float, int)): + if expected: + background = nb + # Create model from uncorrelated region + model = pyhf.simplemodels.uncorrelated_background( + [max(signal, 0.0)], [nb], [delta_nb] + ) + data = [background] + model.config.auxdata + + elif isinstance(signal, HF_Signal) and isinstance(background, HF_Background): + HF = background() + if expected: + HF = background.get_expected() + workspace = pyhf.Workspace(HF) + model = workspace.model( + patches=[signal], + modifier_settings={ + "normsys": {"interpcode": "code4"}, + "histosys": {"interpcode": "code4p"}, + }, + ) + data = workspace.data(model) + elif isinstance(signal, list) and isinstance(background, dict): + workspace = pyhf.Workspace(background) + model = workspace.model( + patches=[signal], + modifier_settings={ + "normsys": {"interpcode": "code4"}, + "histosys": {"interpcode": "code4p"}, + }, + ) + data = workspace.data(model) + except (pyhf.exceptions.InvalidSpecification, KeyError) as err: + logging.getLogger("MA5").error("Invalid JSON file!! 
" + str(err)) + except Exception as err: + logging.getLogger("MA5").debug("Unknown error, check PyhfInterface " + str(err)) + + return workspace, model, data + + def computeCLs( + self, mu: float = 1.0, expected: bool = False, iteration_threshold: int = 3, **kwargs + ) -> Union[float, Dict]: + """ + Compute 1-CLs values + + Parameters + ---------- + mu: float + POI signal strength + CLs_exp: bool + if true return expected CLs value + CLs_obs: bool + if true return observed CLs value + iteration_threshold: int + maximum number of trials + + Returns + ------- + float or dict: + CLs values {"CLs_obs": xx, "CLs_exp": [xx] * 5} or single CLs value + """ + _, self.model, self.data = self._initialize_workspace( + self.signal, self.background, self.nb, self.delta_nb, expected + ) + + if self.model is None or self.data is None: + if "CLs_exp" in kwargs.keys() or "CLs_obs" in kwargs.keys(): + return -1 + else: + return {"CLs_obs": -1, "CLs_exp": [-1] * 5} + + def get_CLs(model, data, **keywordargs): + try: + CLs_obs, CLs_exp = pyhf.infer.hypotest( + mu, + data, + model, + test_stat=keywordargs.get("stats", "qtilde"), + par_bounds=keywordargs.get("bounds", model.config.suggested_bounds()), + return_expected_set=True, + ) + + except (AssertionError, pyhf.exceptions.FailedMinimization, ValueError) as err: + logging.getLogger("MA5").debug(str(err)) + # dont use false here 1.-CLs = 0 can be interpreted as false + return "update bounds" + + # if isnan(float(CLs_obs)) or any([isnan(float(x)) for x in CLs_exp]): + # return "update mu" + CLs_obs = float(CLs_obs[0]) if isinstance(CLs_obs, (list, tuple)) else float(CLs_obs) + + return { + "CLs_obs": 1.0 - CLs_obs, + "CLs_exp": list(map(lambda x: float(1.0 - x), CLs_exp)), + } + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + # pyhf can raise an error if the poi_test bounds are too stringent + # they need to be updated dynamically. 
+ arguments = dict(bounds=self.model.config.suggested_bounds(), stats="qtilde") + it = 0 + while True: + CLs = get_CLs(self.model, self.data, **arguments) + if CLs == "update bounds": + arguments["bounds"][self.model.config.poi_index] = ( + arguments["bounds"][self.model.config.poi_index][0], + 2 * arguments["bounds"][self.model.config.poi_index][1], + ) + logging.getLogger("MA5").debug( + "Hypothesis test inference integration bounds has been increased to " + + str(arguments["bounds"][self.model.config.poi_index]) + ) + it += 1 + elif isinstance(CLs, dict): + if isnan(CLs["CLs_obs"]) or any([isnan(x) for x in CLs["CLs_exp"]]): + arguments["stats"] = "q" + arguments["bounds"][self.model.config.poi_index] = ( + arguments["bounds"][self.model.config.poi_index][0] - 5, + arguments["bounds"][self.model.config.poi_index][1], + ) + logging.getLogger("MA5").debug( + "Hypothesis test inference integration bounds has been increased to " + + str(arguments["bounds"][self.model.config.poi_index]) + ) + else: + break + else: + it += 1 + # hard limit on iteration required if it exceeds this value it means + # Nsig >>>>> Nobs + if it >= iteration_threshold: + if "CLs_exp" in kwargs.keys() or "CLs_obs" in kwargs.keys(): + return 1 + return {"CLs_obs": 1.0, "CLs_exp": [1.0] * 5} + + if kwargs.get("CLs_exp", False): + return CLs["CLs_exp"][2] + elif kwargs.get("CLs_obs", False): + return CLs["CLs_obs"] + + return CLs + + def likelihood( + self, + mu: Optional[float] = 1.0, + nll: Optional[bool] = False, + expected: Optional[bool] = False, + mu_lim: Optional[Tuple[float, float]] = (-20., 40.), + iteration_threshold: Optional[int] = 3, + ) -> float: + """ + Returns the value of the likelihood. + Inspired by the `pyhf.infer.mle` module but for non-log likelihood + + Parameters + ---------- + mu: float + POI: signal strength + nll: bool + if true, return nll, not llhd + expected: bool + if true, compute expected likelihood, else observed. + mu_lim: Optional[Sequence[float,float]] + threshold on mu (POI). 
default [-20, 40] + iteration_threshold: int + maximum number of trials to find maximum log-likelihood + """ + _, self.model, self.data = self._initialize_workspace( + self.signal, self.background, self.nb, self.delta_nb, expected + ) + + if self.model is None or self.data is None: + return -1 + # set a threshold for mu + if not isinstance(mu, float): + mu = 1.0 + if mu < mu_lim[0]: + mu = mu_lim[0] + elif mu > mu_lim[1]: + mu = mu_lim[1] + + def computellhd(model, data, bounds): + try: + _, twice_nllh = pyhf.infer.mle.fixed_poi_fit( + mu, + self.data, + self.model, + return_fitted_val=True, + maxiter=200, + par_bounds=bounds, + ) + except (AssertionError, pyhf.exceptions.FailedMinimization, ValueError) as err: + logging.getLogger("MA5").debug(str(err)) + return "update bounds" + + return twice_nllh + + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Values in x were outside bounds during a minimize step, clipping to bounds", + ) + + bounds = self.model.config.suggested_bounds() + it = 0 + while True: + twice_nllh = computellhd(self.model, self.data, bounds) + if twice_nllh == "update bounds": + bounds[self.model.config.poi_index] = ( + bounds[self.model.config.poi_index][0] - 5.0, + 2.0 * bounds[self.model.config.poi_index][1], + ) + it += 1 + else: + break + if it >= iteration_threshold: + logging.getLogger("MA5").debug("pyhf mle.fit failed") + return float("nan") + + return np.exp(-twice_nllh / 2.0) if not nll else twice_nllh / 2.0 + + def sigma_mu(self, expected: bool = False): + """ + get an estimate for the standard deviation of mu around mu_hat this is only used for + initialisation of the optimizer, so can be skipped in the first version, + + adapted from smodels.tools.pyhfInterface + + expected: + get muhat for the expected likelihood, not observed. + """ + workspace, self.model, self.data = self._initialize_workspace( + self.signal, self.background, self.nb, self.delta_nb, expected + ) + + if workspace is not None: + obss, bgs, bgVars, nsig = {}, {}, {}, {} + channels = workspace.channels + for chdata in workspace["channels"]: + if not chdata["name"] in channels: + continue + bg, var = 0.0, 0.0 + for sample in chdata["samples"]: + if sample["name"] == "Bkg": + tbg = sample["data"][0] + bg += tbg + hi = sample["modifiers"][0]["data"]["hi_data"][0] + lo = sample["modifiers"][0]["data"]["lo_data"][0] + delta = max((hi - bg, bg - lo)) + var += delta**2 + if sample["name"] == "bsm": + ns = sample["data"][0] + nsig[chdata["name"]] = ns + bgs[chdata["name"]] = bg + bgVars[chdata["name"]] = var + for chdata in workspace["observations"]: + if not chdata["name"] in channels: + continue + obss[chdata["name"]] = chdata["data"][0] + vars = [] + for c in channels: + # poissonian error + if nsig[c] == 0.0: + nsig[c] = 1e-5 + poiss = (obss[c] - bgs[c]) / nsig[c] + gauss = bgVars[c] / nsig[c] ** 2 + vars.append(poiss + gauss) + var_mu = np.sum(vars) + n = len(obss) + print(f"sigma_mu = {float(np.sqrt(var_mu / (n**2)))}") + return float(np.sqrt(var_mu / (n**2))) + + else: + res = ( + float(np.sqrt(self.delta_nb**2 + self.nb) / self.signal) + if self.signal > 0.0 + else 0.0 + ) + return res + + def muhat( + self, + expected: bool = False, + allowNegativeSignals: bool = True, + iteration_threshold: int = 3, + ): + """ + get the value of mu for which the likelihood is maximal. this is only used for + initialisation of the optimizer, so can be skipped in the first version of the code. 
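+        If the fit fails, the POI bounds are doubled and the fit is repeated, up to
+        iteration_threshold times; if it still fails, NaN is returned.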
+ + Parameters + ---------- + expected: bool + get muhat for the expected likelihood, not observed. + allowNegativeSignals: bool + if true, allow negative values for mu + """ + workspace, self.model, self.data = self._initialize_workspace( + self.signal, self.background, self.nb, self.delta_nb, expected + ) + + def computeMu(model, data, **keywordargs): + try: + muhat, maxNllh = pyhf.infer.mle.fit( + data, + model, + return_fitted_val=True, + par_bounds=keywordargs.get("bounds", model.config.suggested_bounds()), + ) + except (AssertionError, pyhf.exceptions.FailedMinimization, ValueError) as err: + logging.getLogger("MA5").debug(str(err)) + # dont use false here 1.-CLs = 0 can be interpreted as false + return "update bounds", None + + return muhat, maxNllh + + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Values in x were outside bounds during a minimize step, clipping to bounds", + ) + bounds = self.model.config.suggested_bounds() + if allowNegativeSignals: + bounds[self.model.config.poi_index] = (-5.0, 10.0) + arguments = dict(bounds=bounds) + it = 0 + while True: + muhat, maxNllh = computeMu(self.model, self.data, **arguments) + if muhat == "update bounds": + arguments["bounds"][self.model.config.poi_index] = ( + 2.0 * arguments["bounds"][self.model.config.poi_index][0], + 2.0 * arguments["bounds"][self.model.config.poi_index][1], + ) + it += 1 + else: + break + if it >= iteration_threshold: + logging.getLogger("MA5").debug("pyhf mle.fit failed") + return float("nan") + + return muhat[self.model.config.poi_index] + + def computeUpperLimitOnMu(self, expected: bool = False): + """ + Compute upper limit on POI (signal strength) + Parameters + ---------- + expected:bool + """ + computer = lambda mu: self.computeCLs(mu=mu, expected=expected, CLs_obs=True) - 0.95 + + low, hig = 1.0, 1.0 + while computer(low) > 0.95: + low *= 0.1 + if low < 1e-10: + break + while computer(hig) < 0.95: + hig *= 10.0 + if hig > 1e10: + break + + return scipy.optimize.brentq(computer, low, hig, xtol=low / 100.0) diff --git a/madanalysis/misc/run_recast.py b/madanalysis/misc/run_recast.py index 4ebc21d6..31a42f89 100644 --- a/madanalysis/misc/run_recast.py +++ b/madanalysis/misc/run_recast.py @@ -35,9 +35,10 @@ from string_tools import StringTools from six.moves import map, range, input import copy, logging, math, os, shutil, time, sys, json +from typing import Text, Dict -class RunRecast(): +class RunRecast: def __init__(self, main, dirname): self.dirname = dirname @@ -74,13 +75,17 @@ def init(self): def SetCLsCalculator(self): + def pyhf_wrapper(nobs, nb, deltanb, nsignal, ntoys, CLs_obs = True): + from madanalysis.misc.pyhf_interface import PyhfInterface + interface = PyhfInterface(nsignal, nobs, nb, deltanb) + return interface.computeCLs(CLs_obs = CLs_obs) if self.main.session_info.has_pyhf and self.main.recasting.CLs_calculator_backend == "pyhf": - self.cls_calculator = pyhf_wrapper_py3 + self.cls_calculator = pyhf_wrapper elif not self.main.session_info.has_pyhf: self.main.recasting.CLs_calculator_backend = "native" if self.main.session_info.has_pyhf and self.main.recasting.expectation_assumption == "aposteriori": - self.cls_calculator = pyhf_wrapper_py3 + self.cls_calculator = pyhf_wrapper self.main.recasting.CLs_calculator_backend = "pyhf" self.is_apriori = False elif not self.main.session_info.has_pyhf and self.main.recasting.expectation_assumption == "aposteriori": @@ -130,15 +135,17 @@ def edit_recasting_card(self): if self.forced or self.main.script: return 
self.logger.info("Would you like to edit the recasting Card ? (Y/N)") - allowed_answers=['n','no','y','yes'] - answer="" - while answer not in allowed_answers: - answer=input("Answer: ") - answer=answer.lower() - if answer=="no" or answer=="n": + allowed_answers = ['n', 'no', 'y', 'yes'] + answer = "" + while answer not in allowed_answers: + answer = input("Answer: ") + answer = answer.lower() + if answer == "no" or answer == "n": return else: - err = os.system(self.main.session_info.editor+" "+self.dirname+"/Input/recasting_card.dat") + err = os.system( + self.main.session_info.editor + " " + self.dirname + "/Input/recasting_card.dat" + ) return @@ -316,10 +323,8 @@ def run_SimplifiedFastSim(self,dataset,card,analysislist): self.main.script = script_mode old_fastsim = self.main.fastsim.package self.main.fastsim.package="fastjet" + output_name = "SFS_events.lhe" + self.main.archi_info.has_zlib * ".gz" if self.main.recasting.store_events: - output_name = "SFS_events.lhe" - if self.main.archi_info.has_zlib: - output_name += ".gz" self.logger.debug(" Setting the output LHE file :"+output_name) # Initializing the JobWriter @@ -332,7 +337,7 @@ def run_SimplifiedFastSim(self,dataset,card,analysislist): self.logger.info(" Copying 'SampleAnalyzer' source files...") if not jobber.CopyLHEAnalysis(): return False - if not jobber.CreateBldDir(analysisName="SFSRun",outputName="SFSRun.saf"): + if not jobber.CreateBldDir(analysisName = "SFSRun", outputName = "SFSRun.saf"): return False if not jobber.WriteSelectionHeader(self.main): return False @@ -347,14 +352,20 @@ def run_SimplifiedFastSim(self,dataset,card,analysislist): if not jobber.WriteMakefiles(): return False # Copying the analysis files - analysisList = open(self.dirname+'_SFSRun/Build/SampleAnalyzer/User/Analyzer/analysisList.h','w') + analysisList = open( + self.dirname + '_SFSRun/Build/SampleAnalyzer/User/Analyzer/analysisList.h', 'w' + ) for ana in analysislist: - analysisList.write('#include "SampleAnalyzer/User/Analyzer/'+ana+'.h"\n') + analysisList.write('#include "SampleAnalyzer/User/Analyzer/' + ana + '.h"\n') analysisList.write('#include "SampleAnalyzer/Process/Analyzer/AnalyzerManager.h"\n') analysisList.write('#include "SampleAnalyzer/Commons/Service/LogStream.h"\n\n') - analysisList.write('// -----------------------------------------------------------------------------\n') + analysisList.write( + '// -----------------------------------------------------------------------------\n' + ) analysisList.write('// BuildUserTable\n') - analysisList.write('// -----------------------------------------------------------------------------\n') + analysisList.write( + '// -----------------------------------------------------------------------------\n' + ) analysisList.write('void BuildUserTable(MA5::AnalyzerManager& manager)\n') analysisList.write('{\n') analysisList.write(' using namespace MA5;\n') @@ -395,28 +406,40 @@ def run_SimplifiedFastSim(self,dataset,card,analysislist): if self.main.recasting.store_events: newfile.write(' //Getting pointer to the writer\n') newfile.write(' WriterBase* writer1 = \n') - newfile.write(' manager.InitializeWriter("lhe","'+output_name+'");\n') + newfile.write(' manager.InitializeWriter("lhe","' + output_name + '");\n') newfile.write(' if (writer1==0) return 1;\n\n') - elif '// Post initialization (creates the new output directory structure)' in line and self.TACO_output!='': - newfile.write(' std::ofstream out;\n out.open(\"../Output/' + self.TACO_output+'\");\n') - newfile.write('\n manager.HeadSR(out);\n out 
<< std::endl;\n'); + elif 'if(!manager.PostInitialize()) return 1;' in line: + ignore = False + newfile.write(line) + if self.TACO_output != "": + if self.TACO_output != '': + newfile.write(" // Initialize TACO output\n") + newfile.write( + ' std::ofstream TACO_file;\n TACO_file.open(\"../Output/' + self.TACO_output + '\");\n' + ) + newfile.write(' manager.HeadSR(TACO_file);\n TACO_file << std::endl;\n') elif '//Getting pointer to the clusterer' in line: ignore=False newfile.write(line) elif '!analyzer1' in line and not ignore: - ignore=True + ignore = True if self.main.recasting.store_events: - newfile.write(' writer1->WriteEvent(myEvent,mySample);\n') + newfile.write(' writer1->WriteEvent(myEvent, mySample);\n') for analysis in analysislist: - newfile.write(' if (!analyzer_'+analysis+'->Execute(mySample,myEvent)) continue;\n') - if self.TACO_output!='': - newfile.write('\n manager.DumpSR(out);\n'); + newfile.write( + ' if (!analyzer_' + analysis + '->Execute(mySample, myEvent)) continue;\n' + ) + if self.TACO_output != '': + newfile.write(" // Fill TACO file\n") + newfile.write(' manager.DumpSR(TACO_file);\n') elif ' }' in line: newfile.write(line) - ignore=False - elif 'manager.Finalize(mySamples,myEvent);' in line and self.TACO_output!='': - newfile.write(line); - newfile.write(' out.close();\n'); + ignore = False + elif 'manager.Finalize(mySamples,myEvent);' in line and self.TACO_output != '': + newfile.write(line) + if self.TACO_output != '': + newfile.write(" // Close TACO file\n") + newfile.write(' TACO_file.close();\n') elif not ignore: newfile.write(line) mainfile.close() @@ -639,24 +662,30 @@ def update_pad_main(self,analysislist): newfile.write(' AnalyzerBase* analyzer_'+analysis+'=\n') newfile.write(' manager.InitializeAnalyzer(\"'+analysis+'\",\"'+analysis+'.saf\",'+\ 'prm'+analysis+');\n') - newfile.write( ' if (analyzer_'+analysis+'==0) return 1;\n\n') - elif '// Post initialization (creates the new output directory structure)' in line: - ignore=False + newfile.write(' if (analyzer_'+analysis+'==0) return 1;\n\n') + elif 'if(!manager.PostInitialize()) return 1;' in line: + ignore = False newfile.write(line) - if self.TACO_output!='': - newfile.write(' std::ofstream out;\n out.open(\"../Output/' + self.TACO_output+'\");\n') - newfile.write(' manager.HeadSR(out);\n out << std::endl;\n'); + if self.TACO_output != '': + newfile.write(" // Initialize TACO output\n") + newfile.write(' std::ofstream TACO_file;\n TACO_file.open(\"../Output/' + self.TACO_output+'\");\n') + newfile.write(' manager.HeadSR(TACO_file);\n TACO_file << std::endl;\n') elif '!analyzer_' in line and not ignore: ignore=True for analysis in analysislist: - newfile.write(' if (!analyzer_'+analysis+'->Execute(mySample,myEvent)) continue;\n') + newfile.write( + ' if (!analyzer_' + analysis + '->Execute(mySample, myEvent)) continue;\n' + ) elif '!analyzer1' in line: - if self.TACO_output!='': - newfile.write('\nmanager.DumpSR(out);\n'); + if self.TACO_output != '': + newfile.write(" // Fill TACO file\n") + newfile.write(' manager.DumpSR(TACO_file);\n') ignore=False elif 'manager.Finalize(mySamples,myEvent);' in line and self.TACO_output!='': - newfile.write(line); - newfile.write(' out.close();\n'); + newfile.write(line) + if self.TACO_output != '': + newfile.write(" // Close TACO file\n") + newfile.write(' TACO_file.close();\n'); elif not ignore: newfile.write(line) @@ -741,7 +770,8 @@ def save_output(self, eventfile, setname, analyses, card): for analysis in analyses: 
shutil.move(self.dirname+'_RecastRun/Output/SAF/PADevents/'+analysis+'_0',self.dirname+'/Output/SAF/'+setname+'/'+analysis) if self.TACO_output!='': - filename = '.'.join(self.TACO_output.split('.')[:-1]) + '_' + card.replace('tcl','') + self.TACO_output.split('.')[-1] + filename = '.'.join(self.TACO_output.split('.')[:-1]) + '_' + card.replace('tcl', '') + \ + self.TACO_output.split('.')[-1] shutil.move(self.dirname+'_RecastRun/Output/'+self.TACO_output,self.dirname+'/Output/SAF/'+setname+'/'+filename) return True @@ -827,17 +857,17 @@ def compute_cls(self, analyses, dataset): return False ## Performing the CLS calculation - regiondata=self.extract_sig_cls(regiondata,regions,lumi,"exp") + regiondata=self.extract_sig_cls(regiondata, regions, lumi, "exp") if self.cov_config != {}: - regiondata=self.extract_sig_lhcls(regiondata,lumi,"exp") + regiondata=self.extract_sig_lhcls(regiondata, lumi, "exp") # CLs calculation for pyhf regiondata = self.pyhf_sig95Wrapper(lumi, regiondata, "exp") if extrapolated_lumi=='default': if self.cov_config != {}: - regiondata=self.extract_sig_lhcls(regiondata,lumi,"obs") - regiondata = self.extract_sig_cls(regiondata,regions,lumi,"obs") - regiondata = self.pyhf_sig95Wrapper(lumi,regiondata,'obs') + regiondata=self.extract_sig_lhcls(regiondata, lumi, "obs") + regiondata = self.extract_sig_cls(regiondata, regions, lumi, "obs") + regiondata = self.pyhf_sig95Wrapper(lumi, regiondata, 'obs') else: for reg in regions: regiondata[reg]["nobs"]=regiondata[reg]["nb"] @@ -1375,90 +1405,93 @@ def read_cutflows(self, path, regions, regiondata): def extract_cls(self,regiondata,regions,xsection,lumi): self.logger.debug('Compute CLs...') ## computing fi a region belongs to the best expected ones, and derive the CLs in all cases - bestreg=[] + bestreg = [] rMax = -1 for reg in regions: nsignal = xsection * lumi * 1000. * regiondata[reg]["Nf"] / regiondata[reg]["N0"] - nb = regiondata[reg]["nb"] - nobs = regiondata[reg]["nobs"] + nb = regiondata[reg]["nb"] + nobs = regiondata[reg]["nobs"] deltanb = regiondata[reg]["deltanb"] - if nsignal<=0: - rSR = -1 + if nsignal <= 0: + rSR = -1 myCLs = 0 else: - n95 = float(regiondata[reg]["s95exp"]) * lumi * 1000. * regiondata[reg]["Nf"] / regiondata[reg]["N0"] - rSR = nsignal/n95 - myCLs = self.cls_calculator(nobs, nb, deltanb, nsignal, self.ntoys, CLs_obs = True) + n95 = float(regiondata[reg]["s95exp"]) * lumi * 1000. * regiondata[reg]["Nf"] / \ + regiondata[reg]["N0"] + rSR = nsignal / n95 + myCLs = self.cls_calculator(nobs, nb, deltanb, nsignal, self.ntoys, CLs_obs = True) regiondata[reg]["rSR"] = rSR regiondata[reg]["CLs"] = myCLs if rSR > rMax: - regiondata[reg]["best"]=1 + regiondata[reg]["best"] = 1 for mybr in bestreg: - regiondata[mybr]["best"]=0 + regiondata[mybr]["best"] = 0 bestreg = [reg] rMax = rSR else: - regiondata[reg]["best"]=0 + regiondata[reg]["best"] = 0 if self.cov_config != {}: minsig95, bestreg = 1e99, [] for cov_subset in self.cov_config.keys(): cov_regions = self.cov_config[cov_subset]["cov_regions"] - covariance = self.cov_config[cov_subset]["covariance" ] + covariance = self.cov_config[cov_subset]["covariance"] if all(s <= 0. for s in [regiondata[reg]["Nf"] for reg in cov_regions]): - regiondata["cov_subset"][cov_subset]["CLs"]= 0. + regiondata["cov_subset"][cov_subset]["CLs"] = 0. 
continue - CLs = self.slhCLs(regiondata,cov_regions,xsection,lumi,covariance, ntoys = self.ntoys) s95 = float(regiondata["cov_subset"][cov_subset]["s95exp"]) - regiondata["cov_subset"][cov_subset]["CLs"] = CLs + regiondata["cov_subset"][cov_subset]["CLs"] = self.slhCLs( + regiondata, cov_regions, xsection, lumi, covariance + ) if 0. < s95 < minsig95: regiondata['cov_subset'][cov_subset]["best"] = 1 for mybr in bestreg: - regiondata['cov_subset'][mybr]["best"]=0 + regiondata['cov_subset'][mybr]["best"] = 0 bestreg = [cov_subset] minsig95 = s95 else: - regiondata['cov_subset'][cov_subset]["best"]=0 + regiondata['cov_subset'][cov_subset]["best"] = 0 #initialize pyhf for cls calculation - iterator = [] if self.pyhf_config=={} else copy.deepcopy(list(self.pyhf_config.items())) + iterator = [] if self.pyhf_config == {} else copy.deepcopy(list(self.pyhf_config.items())) minsig95, bestreg = 1e99, [] for n, (likelihood_profile, config) in enumerate(iterator): - self.logger.debug(' * Running CLs for '+likelihood_profile) + self.logger.debug(' * Running CLs for ' + likelihood_profile) # safety check, just in case - if regiondata.get('pyhf',{}).get(likelihood_profile, False) == False: + if regiondata.get('pyhf', {}).get(likelihood_profile, False) == False: continue background = HF_Background(config) - self.logger.debug('current pyhf Configuration = '+str(config)) - signal = HF_Signal(config,regiondata,xsection=xsection) + self.logger.debug('current pyhf Configuration = ' + str(config)) + signal = HF_Signal(config, regiondata, xsection = xsection) is_not_extrapolated = signal.lumi == lumi - CLs = -1 + CLs = -1 if signal.isAlive(): sig_HF = signal(lumi) bkg_HF = background(lumi) if self.main.developer_mode: setattr(self, "hf_sig_test", sig_HF) setattr(self, "hf_bkg_test", bkg_HF) - CLs = pyhf_wrapper(bkg_HF, sig_HF) + from madanalysis.misc.pyhf_interface import PyhfInterface + interface = PyhfInterface(sig_HF, bkg_HF) + CLs = interface.computeCLs() # Take observed if default lumi used, use expected if extrapolated CLs_out = CLs['CLs_obs'] if is_not_extrapolated else CLs['CLs_exp'][2] regiondata['pyhf'][likelihood_profile]['full_CLs_output'] = CLs if CLs_out >= 0.: - regiondata['pyhf'][likelihood_profile]['CLs'] = CLs_out + regiondata['pyhf'][likelihood_profile]['CLs'] = CLs_out s95 = float(regiondata['pyhf'][likelihood_profile]['s95exp']) if 0. < s95 < minsig95: regiondata['pyhf'][likelihood_profile]["best"] = 1 for mybr in bestreg: - regiondata['pyhf'][mybr]["best"]=0 + regiondata['pyhf'][mybr]["best"] = 0 bestreg = [likelihood_profile] minsig95 = s95 else: - regiondata['pyhf'][likelihood_profile]["best"]=0 + regiondata['pyhf'][likelihood_profile]["best"] = 0 return regiondata - @staticmethod - def slhCLs(regiondata,cov_regions,xsection,lumi,covariance,expected=False, ntoys = 10000): + def slhCLs(self, regiondata, cov_regions, xsection, lumi, covariance, expected = False): """ (slh for simplified likelihood) Compute a global CLs combining the different region yields by using a simplified likelihood method (see CMS-NOTE-2017-001 for more information). It relies on the @@ -1467,17 +1500,23 @@ def slhCLs(regiondata,cov_regions,xsection,lumi,covariance,expected=False, ntoys observed, backgrounds, nsignal = [], [], [] # Collect the input data necessary for the simplified_likelyhood.py method for reg in cov_regions: - nsignal.append(xsection*lumi*1000.*regiondata[reg]["Nf"]/regiondata[reg]["N0"]) + nsignal.append(xsection * lumi * 1000. 
* regiondata[reg]["Nf"] / regiondata[reg]["N0"])
             backgrounds.append(regiondata[reg]["nb"])
             observed.append(regiondata[reg]["nobs"])
         # data
-        from madanalysis.misc.simplified_likelihood import Data
-        LHdata = Data(observed, backgrounds, covariance, None, nsignal)
-        from madanalysis.misc.simplified_likelihood import CLsComputer
-        computer = CLsComputer(ntoys = ntoys, cl = .95)
+        from madanalysis.misc.simplified_likelihood import Data, UpperLimitComputer
+        LHdata = Data(
+            observed = observed,
+            backgrounds = backgrounds,
+            covariance = covariance,
+            nsignal = nsignal,
+            deltas_rel = 0.0,
+            lumi = lumi,
+        )
+        computer = UpperLimitComputer(ntoys = self.ntoys, cl = .95)
         # calculation and output
         try:
-            return computer.computeCLs(LHdata, expected=expected)
+            return computer.computeCLs(LHdata, expected = expected)
         except Exception as err:
             logging.getLogger('MA5').debug("slhCLs : " + str(err))
             return 0.0
@@ -1529,7 +1568,7 @@ def sig95(xsection):
         return regiondata
 
     # Calculating the upper limits on sigma with simplified likelihood
-    def extract_sig_lhcls(self,regiondata,lumi,tag):
+    def extract_sig_lhcls(self, regiondata: Dict, lumi: float, tag: Text) -> dict:
         """
         Compute global upper limit on cross section.
 
@@ -1548,24 +1587,31 @@ def extract_sig_lhcls(self,regiondata,lumi,tag):
 
         def get_s95(regs, matrix):
             def sig95(xsection):
-                return self.slhCLs(regiondata,regs,xsection,lumi,matrix,(tag=="exp"), ntoys = self.ntoys)-0.95
+                return self.slhCLs(
+                    regiondata, regs, xsection, lumi, matrix, (tag == "exp")
+                ) - 0.95
+
             return sig95
 
         for cov_subset in self.cov_config.keys():
             cov_regions = self.cov_config[cov_subset]["cov_regions"]
-            covariance = self.cov_config[cov_subset]["covariance" ]
+            covariance = self.cov_config[cov_subset]["covariance"]
             if cov_subset not in regiondata["cov_subset"].keys():
                 regiondata["cov_subset"][cov_subset] = {}
             if all(s <= 0. for s in [regiondata[reg]["Nf"] for reg in cov_regions]):
-                regiondata["cov_subset"][cov_subset]["s95"+tag]= "-1"
+                regiondata["cov_subset"][cov_subset][f"s95{tag}"] = "-1"
                 continue
             low, hig = 1., 1.
-            while self.slhCLs(regiondata,cov_regions,low,lumi,covariance,(tag=="exp"), ntoys = self.ntoys)>0.95:
+            while self.slhCLs(
+                regiondata, cov_regions, low, lumi, covariance, (tag == "exp"),
+            ) > 0.95:
                 self.logger.debug('lower bound = ' + str(low))
                 low *= 0.1
                 if low < 1e-10: break
-            while self.slhCLs(regiondata,cov_regions,hig,lumi,covariance,(tag=="exp"), ntoys = self.ntoys)<0.95:
+            while self.slhCLs(
+                regiondata, cov_regions, hig, lumi, covariance, (tag == "exp"),
+            ) < 0.95:
                 self.logger.debug('upper bound = ' + str(hig))
                 hig *= 10.
                 if hig > 1e10: break
@@ -1573,16 +1619,15 @@ def sig95(xsection):
             try:
                 import scipy
                 sig95 = get_s95(cov_regions, covariance)
-                s95 = scipy.optimize.brentq(sig95,low,hig,xtol=low/100.)
+                s95 = scipy.optimize.brentq(sig95, low, hig, xtol=low / 100.)
except ImportError as err: self.logger.debug("Can't import scipy") - s95=-1 + s95 = -1 except Exception as err: - self.logger.debug(str(err)) - s95=-1 - - self.logger.debug('s95 = ' + str(s95) + ' pb') - regiondata["cov_subset"][cov_subset]["s95"+tag] = ("%.7f" % s95) + self.logger.debug(str(err.args[0])) + s95 = -1 + self.logger.debug(f"s95{tag} = {s95:.5f} [pb]") + regiondata["cov_subset"][cov_subset][f"s95{tag}"] = f"{s95:.7f}" return regiondata @@ -1593,8 +1638,10 @@ def pyhf_sig95Wrapper(self, lumi, regiondata, tag): if 'pyhf' not in list(regiondata.keys()): regiondata['pyhf'] = {} - def get_pyhf_result(*args): - rslt = pyhf_wrapper(*args) + def get_pyhf_result(background, signal): + from madanalysis.misc.pyhf_interface import PyhfInterface + interface = PyhfInterface(signal, background) + rslt = interface.computeCLs() if tag == "exp" and not self.is_apriori: return rslt["CLs_exp"][2] return rslt['CLs_obs'] @@ -1605,42 +1652,50 @@ def CLs(xsec): return get_pyhf_result(bkg(lumi), signal(lumi))-0.95 return CLs - iterator = [] if self.pyhf_config=={} else copy.deepcopy(list(self.pyhf_config.items())) + iterator = [] if self.pyhf_config == {} else copy.deepcopy(list(self.pyhf_config.items())) for n, (likelihood_profile, config) in enumerate(iterator): - self.logger.debug(' * Running sig95'+tag+' for '+likelihood_profile) + self.logger.debug(' * Running sig95' + tag + ' for ' + likelihood_profile) if likelihood_profile not in list(regiondata['pyhf'].keys()): regiondata['pyhf'][likelihood_profile] = {} - background = HF_Background(config, expected=(tag=='exp' and self.is_apriori)) - self.logger.debug('Config : '+str(config)) - if not HF_Signal(config, regiondata, xsection=1., background=background).isAlive(): - self.logger.debug(likelihood_profile+' has no signal event.') - regiondata['pyhf'][likelihood_profile]["s95"+tag] = "-1" + background = HF_Background(config, expected = (tag == 'exp' and self.is_apriori)) + self.logger.debug('Config : ' + str(config)) + if not HF_Signal(config, regiondata, xsection = 1., background = background).isAlive(): + self.logger.debug(likelihood_profile + ' has no signal event.') + regiondata['pyhf'][likelihood_profile]["s95" + tag] = "-1" continue low, hig = 1., 1. - while get_pyhf_result(background(lumi),\ - HF_Signal(config, regiondata,xsection=low)(lumi)) > 0.95: - self.logger.debug(tag+': profile '+likelihood_profile+\ - ', lower bound = '+str(low)) + while get_pyhf_result( + background(lumi), + HF_Signal(config, regiondata, xsection = low)(lumi) + ) > 0.95: + self.logger.debug( + tag + ': profile ' + likelihood_profile + \ + ', lower bound = ' + str(low) + ) low *= 0.1 if low < 1e-10: break - while get_pyhf_result(background(lumi),\ - HF_Signal(config, regiondata,xsection=hig)(lumi)) < 0.95: - self.logger.debug(tag+': profile '+likelihood_profile+\ - ', higher bound = '+str(hig)) + while get_pyhf_result( + background(lumi), + HF_Signal(config, regiondata, xsection = hig)(lumi) + ) < 0.95: + self.logger.debug( + tag + ': profile ' + likelihood_profile + \ + ', higher bound = ' + str(hig) + ) hig *= 10. if hig > 1e10: break try: import scipy s95 = scipy.optimize.brentq( - sig95(config, regiondata, background),low,hig,xtol=low/100. + sig95(config, regiondata, background), low, hig, xtol = low / 100. 
) except Exception as err: self.logger.debug(str(err)) - self.logger.debug('Can not calculate sig95'+tag+' for '+likelihood_profile) - s95=-1 - regiondata['pyhf'][likelihood_profile]["s95"+tag] = "{:.7f}".format(s95) - self.logger.debug(likelihood_profile+' sig95'+tag+' = {:.7f} pb'.format(s95)) + self.logger.debug('Can not calculate sig95' + tag + ' for ' + likelihood_profile) + s95 = -1 + regiondata['pyhf'][likelihood_profile]["s95" + tag] = "{:.7f}".format(s95) + self.logger.debug(likelihood_profile + ' sig95' + tag + ' = {:.7f} pb'.format(s95)) return regiondata diff --git a/madanalysis/misc/simplified_likelihood.py b/madanalysis/misc/simplified_likelihood.py index 6ce03010..962e5727 100644 --- a/madanalysis/misc/simplified_likelihood.py +++ b/madanalysis/misc/simplified_likelihood.py @@ -1,8 +1,8 @@ ################################################################################ -# +# # Copyright (C) 2012-2022 Jack Araz, Eric Conte & Benjamin Fuks # The MadAnalysis development team, email: -# +# # This file is part of MadAnalysis 5. # Official website: # @@ -10,128 +10,234 @@ # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. -# +# # MadAnalysis 5 is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. -# +# # You should have received a copy of the GNU General Public License # along with MadAnalysis 5. If not, see -# +# ################################################################################ - #!/usr/bin/env python3 + """ -.. module:: simplified_likelihood +.. module:: simplifiedLikelihoods :synopsis: Code that implements the simplified likelihoods as presented in CMS-NOTE-2017-001, see https://cds.cern.ch/record/2242860. - Copied from the SModelS v1.2 simplifiedLikelihood.py module (arXiv:1811.10624) + In collaboration with Andy Buckley, Sylvain Fichet and Nickolas Wardle. + Simplified likelihoods v2 (JHEP_021P_1018) are partly implemented. + Adapted from SModelS .. 
moduleauthor:: Wolfgang Waltenberger """ -from __future__ import print_function -from __future__ import absolute_import -from scipy import stats, optimize, integrate, special -from scipy import __version__ as scipy_version -from numpy import sqrt, exp, log, sign, array, ndarray +from scipy import stats, optimize, integrate, special, linalg +from numpy import sqrt, exp, log, sign, array, ndarray from functools import reduce -import numpy as NP -import math, copy, logging +from typing import Text, Optional, Union, Tuple + +# from smodels.tools.statistics import rootFromNLLs, determineBrentBracket + +import numpy as np +import math import copy -from six.moves import range -from six.moves import zip +import logging -logger = logging.getLogger('MA5') +logger = logging.getLogger("MA5") -class Data: - """ A very simple observed container to collect all the data - needed to fully define a specific statistical model """ +# CLsfromNLL, determineBrentBracket are retreived from smodels.tools.statistics +def CLsfromNLL( + nllA: float, nll0A: float, nll: float, nll0: float, return_type: Text = "CLs-alpha" +) -> float: + """ + compute the CLs - alpha from the NLLs + TODO: following needs explanation + :param nllA: + :param nll0A: + :param nll: + :param nll0: + :param return_type: (Text) "CLs-alpha" or "1-CLs" + :return: + """ + assert return_type in ["CLs-alpha", "1-CLs"], f"Unknown return type: {return_type}." + qmu = 0.0 if 2 * (nll - nll0) < 0.0 else 2 * (nll - nll0) + sqmu = np.sqrt(qmu) + qA = 2 * (nllA - nll0A) + if qA < 0.0: + qA = 0.0 + sqA = np.sqrt(qA) + if qA >= qmu: + CLsb = 1.0 - stats.multivariate_normal.cdf(sqmu) + CLb = stats.multivariate_normal.cdf(sqA - sqmu) + else: + CLsb = 1.0 if qA == 0.0 else 1.0 - stats.multivariate_normal.cdf((qmu + qA) / (2 * sqA)) + CLb = 1.0 if qA == 0.0 else 1.0 - stats.multivariate_normal.cdf((qmu - qA) / (2 * sqA)) + + CLs = CLsb / CLb if CLb > 0 else 0.0 + + if return_type == "1-CLs": + return 1.0 - CLs + + cl = 0.95 + root = CLs - 1.0 + cl + return root + + +def determineBrentBracket(mu_hat, sigma_mu, rootfinder): + """find a, b for brent bracketing + :param mu_hat: mu that maximizes likelihood + :param sigm_mu: error on mu_hat (not too reliable) + :param rootfinder: function that finds the root (usually root_func) + """ + sigma_mu = max(sigma_mu, 0.5) # there is a minimum on sigma_mu + # the root should be roughly at mu_hat + 2*sigma_mu + a = mu_hat + 1.5 * sigma_mu + ntrials = 20 + i = 0 + foundExtra = False + while rootfinder(a) < 0.0: + # if this is negative, we move it to the left + i += 1 + a -= (i**2.0) * sigma_mu + if i > ntrials or a < -10000.0: + for a in [0.0, 1.0, -1.0, 3.0, -3.0, 10.0, -10.0, 0.1, -0.1, 0.01, -0.01]: + if rootfinder(a) > 0.0: + foundExtra = True + break + if not foundExtra: + logger.error( + f"cannot find an a that is left of the root. last attempt, a={a:.2f}, root = {rootfinder(a):.2f}." + ) + logger.error(f"mu_hat={mu_hat:.2f}, sigma_mu={sigma_mu:.2f}") + return 0.0, 0.0 + i = 0 + foundExtra = False + b = mu_hat + 2.5 * sigma_mu + while rootfinder(b) > 0.0: + # if this is positive, we move it to the right + i += 1 + b += (i**2.0) * sigma_mu + closestr, closest = float("inf"), None + if i > ntrials: + for b in [1.0, 0.0, 3.0, -1.0, 10.0, -3.0, 0.1, -0.1, -10.0, 100.0, -100.0, 1000.0]: + root = rootfinder(b) + if root < 0.0: + foundExtra = True + break + if root < closestr: + closestr = root + closest = b + if not foundExtra: + logger.error(f"cannot find a b that is right of the root (i.e. 
rootfinder(b) < 0).") + logger.error(f"closest to zero rootfinder({closest})={closestr}") + logger.error(f"mu_hat was at {mu_hat:.2f} sigma_mu at {sigma_mu:.2f}") + return 0.0, 0.0 + return a, b - def __init__(self, observed, backgrounds, covariance, third_moment=None, - nsignal=None, name="model", deltas_rel = 0.2): + +class Data: + """A very simple observed container to collect all the data + needed to fully define a specific statistical model""" + + def __init__( + self, + observed, + backgrounds, + covariance, + third_moment=None, + nsignal=None, + name="model", + deltas_rel=0.2, + lumi=None, + ): """ :param observed: number of observed events per dataset :param backgrounds: expected bg per dataset :param covariance: uncertainty in background, as a covariance matrix :param nsignal: number of signal events in each dataset :param name: give the model a name, just for convenience - :param deltas_rel: the assumed relative error on the signal hypotheses. + :param deltas_rel: the assumed relative error on the signal hypotheses. The default is 20%. + :param lumi: luminosity of dataset in 1/fb, or None """ - self.observed = NP.around(self.convert(observed)) #Make sure observed number of events are integers + self.observed = self.convert(observed) # Make sure observed number of events are integers + ## self.observed = np.around(self.convert(observed)) #Make sure observed number of events are integers self.backgrounds = self.convert(backgrounds) self.n = len(self.observed) self.covariance = self._convertCov(covariance) self.nsignal = self.convert(nsignal) + self.lumi = lumi if self.nsignal is None: - self.signal_rel = self.convert(1.) + if len(self.backgrounds) == 1: + # doesnt matter, does it? + self.nsignal = np.array([1.0]) + self.signal_rel = self.convert(1.0) elif self.nsignal.sum(): - self.signal_rel = self.nsignal/self.nsignal.sum() - else: - self.signal_rel = array([0.]*len(self.nsignal)) - + self.signal_rel = self.nsignal / self.nsignal.sum() + else: + self.signal_rel = array([0.0] * len(self.nsignal)) + self.third_moment = self.convert(third_moment) - if type(self.third_moment) != type(None) and NP.sum([ abs(x) for x in self.third_moment ]) < 1e-10: + if ( + type(self.third_moment) != type(None) + and np.sum([abs(x) for x in self.third_moment]) < 1e-10 + ): self.third_moment = None self.name = name self.deltas_rel = deltas_rel self._computeABC() - # Checking the scip version - v = float(scipy_version.split(".")[0]) - if v < 1.0: - logger.warning("You're using an old version of scipy. This might impact the global CLs calculation.") - - def totalCovariance ( self, nsig ): - """ get the total covariance matrix, taking into account - also signal uncertainty for the signal hypothesis . + + def totalCovariance(self, nsig): + """get the total covariance matrix, taking into account + also signal uncertainty for the signal hypothesis . If nsig is None, the predefined signal hypothesis is taken. """ if self.isLinear(): - cov_tot = self.V + self.var_s( nsig ) + cov_tot = self.V + self.var_s(nsig) else: - cov_tot = self.covariance+ self.var_s(nsig) + cov_tot = self.covariance + self.var_s(nsig) return cov_tot - def zeroSignal(self): """ Is the total number of signal events zero? """ - - return len(self.nsignal[self.nsignal>0.]) == 0 + if self.nsignal is None: + return True + return len(self.nsignal[self.nsignal > 0.0]) == 0 - def var_s(self,nsig=None): + def var_s(self, nsig=None): """ The signal variances. Convenience function. 
- + :param nsig: If None, it will use the model expected number of signal events, otherwise will return the variances for the input value using the relative signal uncertainty defined for the model. - + """ - + if nsig is None: nsig = self.nsignal else: nsig = self.convert(nsig) - return NP.diag((nsig*self.deltas_rel)**2) + return np.diag((nsig * self.deltas_rel) ** 2) def isScalar(self, obj): """ Determine if obj is a scalar (float or int) """ - - if type(obj) == ndarray: + + if isinstance(obj, ndarray): ## need to treat separately since casting array([0.]) to float works return False try: _ = float(obj) return True - except: + except (ValueError, TypeError): pass return False @@ -141,7 +247,7 @@ def convert(self, obj): If object is a float or int, it is converted to a one element array. """ - + if type(obj) == type(None): return obj if self.isScalar(obj): @@ -152,18 +258,18 @@ def __str__(self): return self.name + " (%d dims)" % self.n def _convertCov(self, obj): - + if self.isScalar(obj): - return array ( [ [ obj ] ] ) - if type(obj[0]) == list: - return array ( obj ) - if type(obj[0]) == float: + return array([[obj]]) + if isinstance(obj[0], list): + return array(obj) + if isinstance(obj[0], float): ## if the matrix is flattened, unflatten it. - return array([ obj[self.n*i:self.n*(i+1)] for i in range(self.n)]) - + return array([obj[self.n * i : self.n * (i + 1)] for i in range(self.n)]) + return obj - def _computeABC( self ): + def _computeABC(self): """ Compute the terms A, B, C, rho, V. Corresponds with Eqs. 1.27-1.30 in the second paper. @@ -176,202 +282,340 @@ def _computeABC( self ): return covD = self.diagCov() - C=[] - for m2,m3 in zip(covD, self.third_moment): - if m3 == 0.: + C = [] + for m2, m3 in zip(covD, self.third_moment): + if m3 == 0.0: m3 = 1e-30 - k = -NP.sign(m3)*sqrt(2.*m2 ) - dm = sqrt ( 8.*m2**3/m3**2 - 1. ) - C.append( k*NP.cos ( 4.*NP.pi/3. + NP.arctan(dm) / 3. )) - - self.C=NP.array(C) ## C, as define in Eq. 1.27 (?) in the second paper - self.B = sqrt( covD - 2*self.C**2 ) ## B, as defined in Eq. 1.28(?) - self.A = self.backgrounds - self.C ## A, Eq. 1.30(?) - self.rho = NP.array( [ [0.]*self.n ]*self.n ) ## Eq. 1.29 (?) + k = -np.sign(m3) * sqrt(2.0 * m2) + dm = sqrt(8.0 * m2**3 / m3**2 - 1.0) + C.append(k * np.cos(4.0 * np.pi / 3.0 + np.arctan(dm) / 3.0)) + + self.C = np.array(C) ## C, as define in Eq. 1.27 (?) in the second paper + self.B = sqrt(covD - 2 * self.C**2) ## B, as defined in Eq. 1.28(?) + self.A = self.backgrounds - self.C ## A, Eq. 1.30(?) + self.rho = np.array([[0.0] * self.n] * self.n) ## Eq. 1.29 (?) 
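+        # The loop below fills rho pairwise: each entry is the positive root of
+        # cov_xy = B_x*B_y*rho_xy + 2*C_x*C_y*rho_xy**2 (cf. Eq. 1.29).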
for x in range(self.n): - for y in range(x,self.n): - bxby=self.B[x]*self.B[y] - cxcy=self.C[x]*self.C[y] - e=(4.*cxcy)**(-1)*(sqrt( bxby**2+8*cxcy*self.covariance[x][y])-bxby) - self.rho[x][y]=e - self.rho[y][x]=e + for y in range(x, self.n): + bxby = self.B[x] * self.B[y] + cxcy = self.C[x] * self.C[y] + e = (4.0 * cxcy) ** (-1) * ( + sqrt(bxby**2 + 8 * cxcy * self.covariance[x][y]) - bxby + ) + self.rho[x][y] = e + self.rho[y][x] = e self.sandwich() # self.V = sandwich ( self.B, self.rho ) - def sandwich( self ): + def sandwich(self): """ Sandwich product """ - - ret = NP.array ( [ [0.]*len(self.B) ]*len(self.B) ) + + ret = np.array([[0.0] * len(self.B)] * len(self.B)) for x in range(len(self.B)): - for y in range(x,len(self.B)): - T=self.B[x]*self.B[y]*self.rho[x][y] - ret[x][y]=T - ret[y][x]=T + for y in range(x, len(self.B)): + T = self.B[x] * self.B[y] * self.rho[x][y] + ret[x][y] = T + ret[y][x] = T self.V = ret def isLinear(self): """ Statistical model is linear, i.e. no quadratic term in poissonians """ - + return type(self.C) == type(None) def diagCov(self): """ Diagonal elements of covariance matrix. Convenience function. """ - - return NP.diag( self.covariance ) + + return np.diag(self.covariance) def correlations(self): """ Correlation matrix, computed from covariance matrix. - Convenience function. + Convenience function. """ - + if hasattr(self, "corr"): return self.corr - + self.corr = copy.deepcopy(self.covariance) for x in range(self.n): - self.corr[x][x]=1. - for y in range(x+1,self.n): - rho=self.corr[x][y]/sqrt(self.covariance[x][x]*self.covariance[y][y]) - self.corr[x][y]=rho - self.corr[y][x]=rho + self.corr[x][x] = 1.0 + for y in range(x + 1, self.n): + rho = self.corr[x][y] / sqrt(self.covariance[x][x] * self.covariance[y][y]) + self.corr[x][y] = rho + self.corr[y][x] = rho return self.corr def signals(self, mu): """ Returns the number of expected signal events, for all datasets, given total signal strength mu. - + :param mu: Total number of signal events summed over all datasets. """ - - return (mu*self.nsignal) + + return mu * self.signal_rel + + def nsignals(self, mu): + """ + Returns the number of expected signal events, for all datasets, + given total signal strength mu. + + :param mu: Total number of signal events summed over all datasets. + """ + + return mu * self.nsignal + class LikelihoodComputer: - def __init__(self, data, ntoys = 10000 ): + + debug_mode = False + + def __init__(self, data, toys=30000): """ :param data: a Data object. - :param ntoys: number of toys when marginalizing + :param toys: number of toys when marginalizing """ - + self.model = data - self.ntoys = ntoys + self.toys = toys - def dLdMu(self, mu, signal_rel, theta_hat): + def dNLLdMu(self, mu, signal_rel, theta_hat): """ - d (ln L)/d mu, if L is the likelihood. The function + d (- ln L)/d mu, if L is the likelihood. The function whose root gives us muhat, i.e. the mu that maximizes the likelihood. - + :param mu: total number of signal events :param signal_rel: array with the relative signal strengths for each dataset (signal region) :param theta_hat: array with nuisance parameters - - """ - - #Define relative signal strengths: - denominator = mu*signal_rel + self.model.backgrounds + theta_hat - for ctr,d in enumerate(denominator): - if d == 0.: - if (self.model.observed[ctr]*signal_rel[ctr]) == 0.: - # logger.debug("zero denominator, but numerator also zero, so we set denom to 1.") - denominator[ctr]=1. 
+ """ + if not self.model.isLinear(): + logger.debug("implemented only for linear model") + # n_pred^i := mu s_i + b_i + theta_i + # NLL = sum_i [ - n_obs^i * ln ( n_pred^i ) + n_pred^i ] + # d NLL / d mu = sum_i [ - ( n_obs^i * s_ i ) / n_pred_i + s_i ] + + # Define relative signal strengths: + n_pred = mu * signal_rel + self.model.backgrounds + theta_hat + + for ctr, d in enumerate(n_pred): + if d == 0.0: + if (self.model.observed[ctr] * signal_rel[ctr]) == 0.0: + # logger.debug("zero denominator, but numerator also zero, so we set denom to 1.") + n_pred[ctr] = 1e-5 else: - raise Exception("we have a zero value in the denominator at pos %d, with a non-zero numerator. dont know how to handle." % ctr) - ret = self.model.observed*signal_rel/denominator - signal_rel - - if type(ret) in [ array, ndarray, list ]: + # n_pred[ctr]=1e-5 + raise Exception( + "we have a zero value in the denominator at pos %d, with a non-zero numerator. dont know how to handle." + % ctr + ) + ret = -self.model.observed * signal_rel / n_pred + signal_rel + + if type(ret) in [array, ndarray, list]: ret = sum(ret) return ret - def findMuHat(self, signal_rel): + def extendedOutput(self, extended_output, default=None): + if extended_output: + ret = {"muhat": default, "sigma_mu": default, "lmax": default} + return ret + return default + + def findAvgr(self, mu, theta_hat, signal_rel): + """from the difference observed - background, find got inital + values for lower and upper""" + mu_c = self.model.observed - self.model.backgrounds - theta_hat + mu_r, wmu_r = [], [] + n_pred = mu * signal_rel + self.model.backgrounds + theta_hat + obs = self.model.observed + for i, s in enumerate(n_pred): + if s == 0.0: + n_pred[i] = 1e-6 + hessian = self.model.observed * signal_rel**2 / n_pred**2 + wtot = 0.0 + for s in zip(mu_c, signal_rel, hessian): + if s[1] > 1e-10: + w = 1.0 + if s[2] > 0.0: + w = s[2] + wtot += w + mu_r.append(s[0] / s[1]) + wmu_r.append(w * s[0] / s[1]) + ret = min(mu_r), sum(wmu_r) / wtot, max(mu_r) + return ret + + def findMuHat( + self, nsig, allowNegativeSignals=False, extended_output=False, nll=False, marginalize=False + ): """ Find the most likely signal strength mu given the relative signal strengths in each dataset (signal region). - - :param signal_rel: array with relative signal strengths - - :returns: mu_hat, the total signal yield. + + :param nsig: array with relative signal strengths or signal yields + :param allowNegativeSignals: if true, then also allow for negative values + :param extended_output: if true, return also sigma_mu, the estimate of the error of mu_hat, and lmax, the likelihood at mu_hat + :param nll: if true, return nll instead of lmax in the extended output + + :returns: mu_hat, i.e. the maximum likelihood estimate of mu, if extended output is requested, it returns mu_hat, sigma_mu -- the standard deviation around mu_hat, and llhd, the likelihood at mu_hat """ - if (self.model.backgrounds == self.model.observed).all(): - return 0. - - if type(signal_rel) in [list, ndarray]: - signal_rel = array(signal_rel) - - signal_rel[signal_rel==0.] 
= 1e-20 - if sum(signal_rel<0.): + return self.extendedOutput(extended_output, 0.0) + + if type(nsig) in [list, ndarray]: + nsig = array(nsig) + + nsig[nsig == 0.0] = 1e-20 + if sum(nsig < 0.0): raise Exception("Negative relative signal strengths!") ## we need a very rough initial guess for mu(hat), to come ## up with a first theta - self.nsig = array([0.]*len(self.model.observed)) + # self.nsig = array([0.]*len(self.model.observed)) + self.nsig = nsig ## we start with theta_hat being all zeroes - theta_hat = array([0.]*len(self.model.observed)) - mu_hat_old, mu_hat = 0., 1. - ctr=0 - widener=3. - while abs(mu_hat - mu_hat_old)>1e-10 and abs(mu_hat - mu_hat_old )/(mu_hat+mu_hat_old) > .5e-2 and ctr < 20: - ctr+=1 + # theta_hat = array([0.]*len(self.model.observed)) + mu_hat_old, mu_hat = 0.0, 1.0 + ctr = 0 + widener = 3.0 + while ( + abs(mu_hat - mu_hat_old) > 1e-10 + and abs(mu_hat - mu_hat_old) / (mu_hat + mu_hat_old) > 0.5e-2 + and ctr < 20 + ): + theta_hat, _ = self.findThetaHat(mu_hat * nsig) + ctr += 1 mu_hat_old = mu_hat - #logger.info ( "theta hat[%d]=%s" % (ctr,list( theta_hat[:11] ) ) ) - #logger.info ( " mu hat[%d]=%s" % (ctr, mu_hat ) ) - mu_c = NP.abs(self.model.observed - self.model.backgrounds - theta_hat)/signal_rel + minr, avgr, maxr = self.findAvgr(mu_hat, theta_hat, nsig) + # for i,s in enumerate ( signal_rel ): + # if abs(s) < 1e-19: + # mu_c[i]=0. ## find mu_hat by finding the root of 1/L dL/dmu. We know ## that the zero has to be between min(mu_c) and max(mu_c). - lower,upper = 0.,widener*max(mu_c) - lower_v = self.dLdMu(lower, signal_rel, theta_hat) - upper_v = self.dLdMu(upper, signal_rel, theta_hat) - total_sign = NP.sign(lower_v * upper_v) - if total_sign > -.5: - if upper_v < lower_v < 0.: - ## seems like we really want to go for mu_hat = 0. - return 0. - logger.debug ( "weird. cant find a zero in the Brent bracket "\ - "for finding mu(hat). Let me try with a very small" - " value." ) - lower = 1e-4*max(mu_c) - lower_v = self.dLdMu( lower, signal_rel, theta_hat ) - total_sign = NP.sign( lower_v * upper_v ) - if total_sign > -.5: - logger.debug ( "cant find zero in Brentq bracket. l,u,ctr=%s,%s,%s" % \ - ( lower, upper, ctr ) ) - widener=widener*1.5 - continue - mu_hat = optimize.brentq ( self.dLdMu, lower, upper, args=(signal_rel, theta_hat ) ) - theta_hat,_ = self.findThetaHat( mu_hat*signal_rel) - ctr+=1 - + lstarters = [ + avgr - 0.2 * abs(avgr), + minr, + 0.0, + -1.0, + 1.0, + 10.0, + -0.1, + 0.1, + -100.0, + 100.0, + -1000.0, + ] + closestl, closestr = None, float("inf") + for lower in lstarters: + lower_v = self.dNLLdMu(lower, nsig, theta_hat) + if lower_v < 0.0: + break + if lower_v < closestr: + closestl, closestr = lower, lower_v + if lower_v > 0.0: + logger.debug( + f"did not find a lower value with rootfinder(lower) < 0. 
Closest: f({closestl})={closestr}" + ) + return self.extendedOutput(extended_output, 0.0) + ustarters = [ + avgr + 0.2 * abs(avgr), + maxr, + 0.0, + 1.0, + 10.0, + -1.0 - 0.1, + 0.1, + 100.0, + -100.0, + 1000.0, + -1000.0, + 0.01, + -0.01, + ] + closestl, closestr = None, float("inf") + for upper in ustarters: + upper_v = self.dNLLdMu(upper, nsig, theta_hat) + if upper_v > 0.0: + break + if upper_v < closestr: + closestl, closestr = upper, upper_v + if upper_v < 0.0: + logger.debug("did not find an upper value with rootfinder(upper) > 0.") + return self.extendedOutput(extended_output, 0.0) + mu_hat = optimize.brentq(self.dNLLdMu, lower, upper, args=(nsig, theta_hat), rtol=1e-9) + if not allowNegativeSignals and mu_hat < 0.0: + mu_hat = 0.0 + theta_hat, _ = self.findThetaHat(mu_hat * nsig) + if self.debug_mode: + self.theta_hat = theta_hat + + # print ( f">> found mu_hat {mu_hat:.2g} for signal {sum(signal_rel):.2f} allowNeg {allowNegativeSignals} l,u={lower},{upper} lastavgr={avgr}" ) + # print ( f">>>> obs {self.model.observed[:2]} bg {self.model.backgrounds[:2]}" ) + # print ( f">>>> mu_c {np.mean(mu_c)}" ) + if extended_output: + sigma_mu = self.getSigmaMu(mu_hat, nsig, theta_hat) + llhd = self.likelihood(self.model.signals(mu_hat), marginalize=marginalize, nll=nll) + ret = {"muhat": mu_hat, "sigma_mu": sigma_mu, "lmax": llhd} + return ret return mu_hat - def getSigmaMu(self, signal_rel): + def getSigmaMu(self, mu, nsig, theta_hat): """ - Get a rough estimate for the variance of mu around mu_max. - - :param signal_rel: array with relative signal strengths in each dataset (signal region) + Get an estimate for the standard deviation of mu at , from + the inverse hessian + + :param nsig: array with signal yields or relative signal strengths + in each dataset (signal region) + """ + if not self.model.isLinear(): + logger.debug("implemented only for linear model") + # d^2 mu NLL / d mu^2 = sum_i [ n_obs^i * s_i**2 / n_pred^i**2 ] + + # Define relative signal strengths: + n_pred = mu * nsig + self.model.backgrounds + theta_hat + + for ctr, d in enumerate(n_pred): + if d == 0.0: + if (self.model.observed[ctr] * nsig[ctr]) == 0.0: + # logger.debug("zero denominator, but numerator also zero, so we set denom to 1.") + n_pred[ctr] = 1.0 + else: + raise Exception( + "we have a zero value in the denominator at pos %d, with a non-zero numerator. dont know how to handle." 
+ % ctr + ) + hessian = self.model.observed * nsig**2 / n_pred**2 + + if type(hessian) in [array, ndarray, list]: + hessian = sum(hessian) + # the error is the square root of the inverse of the hessian + if hessian == 0.0: + # if all observations are zero, we replace them by the expectations + if sum(self.model.observed) == 0: + hessian = sum(nsig**2 / n_pred) + stderr = float(np.sqrt(1.0 / hessian)) + return stderr """ - if type(signal_rel) in [ list, ndarray ]: - s_effs = sum(signal_rel) - - sgm_mu = sqrt(sum(self.model.observed) + sum(NP.diag(self.model.covariance)))/s_effs - + if type(nsig) in [ list, ndarray ]: + s_effs = sum(nsig) + + sgm_mu = float(sqrt(sum(self.model.observed) + sum(np.diag(self.model.covariance)))/s_effs) + return sgm_mu + """ - #Define integrand (gaussian_(bg+signal)*poisson(nobs)): + # Define integrand (gaussian_(bg+signal)*poisson(nobs)): # def prob(x0, x1 ) - - def probMV(self, nll, theta ): - """ probability, for nuicance parameters theta - :params nll: if True, compute negative log likelihood """ + def probMV(self, nll, theta): + """probability, for nuicance parameters theta + :params nll: if True, compute negative log likelihood""" # theta = array ( thetaA ) # ntot = self.model.backgrounds + self.nsig # lmbda = theta + self.ntot ## the lambda for the Poissonian @@ -379,301 +623,357 @@ def probMV(self, nll, theta ): lmbda = self.model.backgrounds + self.nsig + theta else: lmbda = self.nsig + self.model.A + theta + self.model.C * theta**2 / self.model.B**2 - lmbda[lmbda<=0.] = 1e-30 ## turn zeroes to small values + lmbda[lmbda <= 0.0] = 1e-30 ## turn zeroes to small values + obs = self.model.observed + + def is_integer(x): + if type(x) in [int, np.int64]: + return True + if type(x) in [float]: + return x.is_integer() + return False + + ## not needed for now + allintegers = np.all([is_integer(i) for i in obs]) if nll: - #poisson = self.model.observed * log ( lmbda ) - lmbda #- self.gammaln - poisson = stats.poisson.logpmf( self.model.observed, lmbda ) - #print ( "p",poisson,poisson2 ) + if allintegers: + poisson = stats.poisson.logpmf(obs, lmbda) + else: + poisson = -lmbda + obs * np.log(lmbda) - special.loggamma(obs + 1) else: - poisson = stats.poisson.pmf( self.model.observed, lmbda ) - #print ( "nonll",poisson ) + if allintegers: + poisson = stats.poisson.pmf(obs, lmbda) + else: + # poisson = np.exp(-lmbda)*np.power(lmbda,obs)/special.gamma(obs+1) + logpoiss = -lmbda + obs * np.log(lmbda) - special.loggamma(obs + 1) + poisson = np.exp(logpoiss) try: - M = [0.]*len(theta) + M = [0.0] * len(theta) C = self.model.V - if self.model.n == 1: - C = self.model.totalCovariance(self.nsig) + # if self.model.n == 1: I think not a good idea + # C = self.model.totalCovariance(self.nsig) + dTheta = theta - M + expon = -0.5 * np.dot(np.dot(dTheta, self.weight), dTheta) + self.logcoeff + # print ( "expon", expon, "coeff", self.coeff ) if nll: - gaussian = stats.multivariate_normal.logpdf(theta,mean=M,cov=C) - ret = - gaussian - sum(poisson) + gaussian = expon # np.log ( self.coeff ) + # gaussian2 = stats.multivariate_normal.logpdf(theta,mean=M,cov=C) + ret = -gaussian - sum(poisson) else: - gaussian = stats.multivariate_normal.pdf(theta,mean=M,cov=C) - ret = gaussian * ( reduce(lambda x, y: x*y, poisson) ) + gaussian = np.exp(expon) + # gaussian = self.coeff * np.exp ( expon ) + # gaussian2 = stats.multivariate_normal.pdf(theta,mean=M,cov=C) + ret = gaussian * (reduce(lambda x, y: x * y, poisson)) return ret except ValueError as e: - raise Exception("ValueError %s, %s" % ( 
e, self.model.totalCovariance(self.nsig) )) + raise Exception("ValueError %s, %s" % (e, self.model.V)) + # raise Exception("ValueError %s, %s" % ( e, self.model.totalCovariance(self.nsig) )) # raise Exception("ValueError %s, %s" % ( e, self.model.V )) - def nll( self, theta ): - """ probability, for nuicance parameters theta, - as a negative log likelihood. """ - return self.probMV(True,theta) + def nllOfNuisances(self, theta): + """probability, for nuicance parameters theta, + as a negative log likelihood.""" + return self.probMV(True, theta) - def nllprime( self, theta ): - """ the derivative of nll as a function of the thetas. - Makes it easier to find the maximum likelihood. """ + def dNLLdTheta(self, theta): + """the derivative of nll as a function of the thetas. + Makes it easier to find the maximum likelihood.""" if self.model.isLinear(): xtot = theta + self.model.backgrounds + self.nsig - xtot[xtot<=0.] = 1e-30 ## turn zeroes to small values - nllp_ = self.ones - self.model.observed / xtot + NP.dot( theta , self.weight ) + xtot[xtot <= 0.0] = 1e-30 ## turn zeroes to small values + nllp_ = self.ones - self.model.observed / xtot + np.dot(theta, self.weight) return nllp_ lmbda = self.nsig + self.model.A + theta + self.model.C * theta**2 / self.model.B**2 - lmbda[lmbda<=0.] = 1e-30 ## turn zeroes to small values - # nllp_ = ( self.ones - self.model.observed / lmbda + NP.dot( theta , self.weight ) ) * ( self.ones + 2*self.model.C * theta / self.model.B**2 ) - T=self.ones + 2*self.model.C/self.model.B**2*theta - nllp_ = T - self.model.observed / lmbda * ( T ) + NP.dot( theta , self.weight ) + lmbda[lmbda <= 0.0] = 1e-30 ## turn zeroes to small values + # nllp_ = ( self.ones - self.model.observed / lmbda + np.dot( theta , self.weight ) ) * ( self.ones + 2*self.model.C * theta / self.model.B**2 ) + T = self.ones + 2 * self.model.C / self.model.B**2 * theta + nllp_ = T - self.model.observed / lmbda * (T) + np.dot(theta, self.weight) return nllp_ - def nllHess( self, theta ): - """ the Hessian of nll as a function of the thetas. - Makes it easier to find the maximum likelihood. """ + def d2NLLdTheta2(self, theta): + """the Hessian of nll as a function of the thetas. + Makes it easier to find the maximum likelihood.""" # xtot = theta + self.ntot if self.model.isLinear(): xtot = theta + self.model.backgrounds + self.nsig - xtot[xtot<=0.] = 1e-30 ## turn zeroes to small values - nllh_ = self.weight + NP.diag ( self.model.observed / (xtot**2) ) + xtot[xtot <= 0.0] = 1e-30 ## turn zeroes to small values + nllh_ = self.weight + np.diag(self.model.observed / (xtot**2)) return nllh_ lmbda = self.nsig + self.model.A + theta + self.model.C * theta**2 / self.model.B**2 - lmbda[lmbda<=0.] = 1e-30 ## turn zeroes to small values - T=self.ones + 2*self.model.C/self.model.B**2*theta + lmbda[lmbda <= 0.0] = 1e-30 ## turn zeroes to small values + T = self.ones + 2 * self.model.C / self.model.B**2 * theta # T_i = 1_i + 2*c_i/B_i**2 * theta_i - nllh_ = self.weight + NP.diag ( self.model.observed * T**2 / (lmbda**2) ) - NP.diag ( self.model.observed / lmbda * 2 * self.model.C / self.model.B**2 ) + NP.diag ( 2*self.model.C/self.model.B**2 ) + nllh_ = ( + self.weight + + np.diag(self.model.observed * T**2 / (lmbda**2)) + - np.diag(self.model.observed / lmbda * 2 * self.model.C / self.model.B**2) + + np.diag(2 * self.model.C / self.model.B**2) + ) return nllh_ - def getThetaHat(self, nobs, nb, nsig, covb, max_iterations ): - """ Compute nuisance parameter theta that - maximizes our likelihood (poisson*gauss). 
""" - self.nsig = nsig - sigma2 = covb + self.model.var_s(nsig) ## NP.diag ( (self.model.deltas)**2 ) - ## for now deal with variances only - ntot = nb + nsig - cov = NP.array(sigma2) - weight = NP.linalg.inv(cov) ## weight matrix - diag_cov = NP.diag(cov) - # first: no covariances: - q = diag_cov * ( ntot - nobs ) + def getThetaHat(self, nobs, nb, nsig, covb, max_iterations): + """Compute nuisance parameter theta that + maximizes our likelihood (poisson*gauss).""" + self.nsig = nsig + sigma2 = covb + self.model.var_s(nsig) ## np.diag ( (self.model.deltas)**2 ) + ## for now deal with variances only + ntot = nb + nsig + cov = np.array(sigma2) + # weight = cov**(-1) ## weight matrix + weight = linalg.inv(cov) + diag_cov = np.diag(cov) + # first: no covariances: + q = diag_cov * (ntot - nobs) + p = ntot + diag_cov + thetamaxes = [] + thetamax = -p / 2.0 * (1 - sign(p) * sqrt(1.0 - 4 * q / p**2)) + thetamaxes.append(thetamax) + ndims = len(p) + + def distance(theta1, theta2): + for ctr, i in enumerate(theta1): + if i == 0.0: + theta1[ctr] = 1e-20 + for ctr, i in enumerate(theta2): + if i == 0.0: + theta2[ctr] = 1e-20 + return sum(np.abs(theta1 - theta2) / np.abs(theta1 + theta2)) + + ictr = 0 + while ictr < max_iterations: + ictr += 1 + q = diag_cov * (ntot - nobs) p = ntot + diag_cov - thetamaxes = [] - thetamax = -p/2. * ( 1 - sign(p) * sqrt ( 1. - 4*q / p**2 ) ) - thetamaxes.append ( thetamax ) - ndims = len(p) - def distance ( theta1, theta2 ): - for ctr,i in enumerate ( theta1 ): - if i == 0.: - theta1[ctr]=1e-20 - for ctr,i in enumerate ( theta2 ): - if i == 0.: - theta2[ctr]=1e-20 - return sum ( NP.abs(theta1 - theta2) / NP.abs ( theta1+theta2 ) ) - - ictr = 0 - while ictr < max_iterations: - ictr += 1 - q = diag_cov * ( ntot - nobs ) - p = ntot + diag_cov - for i in range(ndims): - #q[i] = diag_cov[i] * ( ntot[i] - nobs[i] ) - #p[i] = ntot[i] + diag_cov[i] - for j in range(ndims): - if i==j: continue - dq = thetamax[j]*ntot[i]*diag_cov[i]*weight[i,j] - dp = thetamax[j]*weight[i,j]*diag_cov[i] - if abs ( dq / q[i] ) > .3: - #logger.warning ( "too big a change in iteration." ) - dq=NP.abs( .3 * q[i] ) * NP.sign ( dq ) - if abs ( dp / p[i] ) > .3: - #logger.warning ( "too big a change in iteration." ) - dp=NP.abs( .3 * p[i] ) * NP.sign ( dp ) - q[i] += dq - p[i] += dp - thetamax = -p/2. * ( 1 - sign(p) * sqrt ( 1. - 4*q / p**2 ) ) - thetamaxes.append ( thetamax ) - if len(thetamaxes)>2: - d1 = distance ( thetamaxes[-2], thetamax ) - d2 = distance ( thetamaxes[-3], thetamaxes[-2] ) - if d1 > d2: - raise Exception("diverging when computing thetamax: %f > %f" % ( d1, d2 )) - if d1 < 1e-5: - return thetamax - return thetamax + for i in range(ndims): + # q[i] = diag_cov[i] * ( ntot[i] - nobs[i] ) + # p[i] = ntot[i] + diag_cov[i] + for j in range(ndims): + if i == j: + continue + dq = thetamax[j] * ntot[i] * diag_cov[i] * weight[i, j] + dp = thetamax[j] * weight[i, j] * diag_cov[i] + if abs(dq / q[i]) > 0.3: + # logger.warning ( "too big a change in iteration." ) + dq = np.abs(0.3 * q[i]) * np.sign(dq) + if abs(dp / p[i]) > 0.3: + # logger.warning ( "too big a change in iteration." 
) + dp = np.abs(0.3 * p[i]) * np.sign(dp) + q[i] += dq + p[i] += dp + thetamax = -p / 2.0 * (1 - sign(p) * sqrt(1.0 - 4 * q / p**2)) + thetamaxes.append(thetamax) + if len(thetamaxes) > 2: + d1 = distance(thetamaxes[-2], thetamax) + d2 = distance(thetamaxes[-3], thetamaxes[-2]) + if d1 > d2: + raise Exception("diverging when computing thetamax: %f > %f" % (d1, d2)) + if d1 < 1e-5: + return thetamax + return thetamax def findThetaHat(self, nsig): - """ Compute nuisance parameter theta that maximizes our likelihood - (poisson*gauss). - """ - - ## first step is to disregard the covariances and solve the - ## quadratic equations - ini = self.getThetaHat ( self.model.observed, self.model.backgrounds, nsig, self.model.covariance, 0 ) - self.cov_tot = self.model.V - if self.model.n == 1: - self.cov_tot = self.model.totalCovariance ( nsig ) - # self.ntot = self.model.backgrounds + self.nsig - # if not self.model.isLinear(): - # self.cov_tot = self.model.V + self.model.var_s(nsig) - # self.cov_tot = self.model.totalCovariance (nsig) - #self.ntot = None - self.weight = NP.linalg.inv(self.cov_tot) - self.ones = 1. - if type ( self.model.observed) in [ list, ndarray ]: - self.ones = NP.ones ( len (self.model.observed) ) - self.gammaln = special.gammaln(self.model.observed + 1) - try: - ret_c = optimize.fmin_ncg ( self.nll, ini, fprime=self.nllprime, - fhess=self.nllHess, full_output=True, disp=0 ) - # then always continue with TNC - if type ( self.model.observed ) in [ int, float ]: - bounds = [ ( -10*self.model.observed, 10*self.model.observed ) ] - else: - bounds = [ ( -10*x, 10*x ) for x in self.model.observed ] - ini = ret_c - ret_c = optimize.fmin_tnc ( self.nll, ret_c[0], fprime=self.nllprime, - disp=0, bounds=bounds ) - # print ( "[findThetaHat] mu=%s bg=%s observed=%s V=%s, nsig=%s theta=%s, nll=%s" % ( self.nsig[0]/self.model.efficiencies[0], self.model.backgrounds, self.model.observed,self.model.covariance, self.nsig, ret_c[0], self.nll(ret_c[0]) ) ) - if ret_c[-1] not in [ 0, 1, 2 ]: - return ret_c[0],ret_c[-1] - else: - return ret_c[0],0 - logger.debug ( "tnc worked." ) - - ret = ret_c[0] - return ret,-2 - except Exception as e: - logger.error("exception: %s. ini[-3:]=%s" % (e,ini[-3:]) ) - raise Exception("cov-1=%s" % (self.model.covariance+self.model.var_s(nsig))**(-1)) - return ini,-1 + """Compute nuisance parameter theta that maximizes our likelihood + (poisson*gauss). + """ - def marginalizedLLHD1D(self, nsig, nll): - """ - Return the likelihood (of 1 signal region) to observe nobs events given the - predicted background nb, error on this background (deltab), - expected number of signal events nsig and the relative error on the signal (deltas_rel). + ## first step is to disregard the covariances and solve the + ## quadratic equations + ini = self.getThetaHat( + self.model.observed, self.model.backgrounds, nsig, self.model.covariance, 0 + ) + self.cov_tot = self.model.V + # if self.model.n == 1: + # self.cov_tot = self.model.totalCovariance ( nsig ) + # self.ntot = self.model.backgrounds + self.nsig + # if not self.model.isLinear(): + # self.cov_tot = self.model.V + self.model.var_s(nsig) + # self.cov_tot = self.model.totalCovariance (nsig) + # self.ntot = None + self.weight = np.linalg.inv(self.cov_tot) + # self.coeff = 1. 
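+ # Editor's note (not part of the original patch): the two lines below cache the
+ # log of the multivariate-Gaussian normalisation, log[(2*pi)^(-n/2) * det(V)^(-1/2)],
+ # via numpy.linalg.slogdet; probMV later adds this logcoeff to the quadratic form,
+ # so the Gaussian factor is handled in log space and avoids underflow for large n.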
+ logdet = np.linalg.slogdet(self.cov_tot) + self.logcoeff = -self.model.n / 2 * np.log(2 * np.pi) - 0.5 * logdet[1] + # self.coeff = (2*np.pi)**(-self.model.n/2) * np.exp(-.5* logdet[1] ) + # print ( "coeff", self.coeff, "n", self.model.n, "det", np.linalg.slogdet ( self.cov_tot ) ) + # print ( "cov_tot", self.cov_tot[:10] ) + self.ones = 1.0 + if type(self.model.observed) in [list, ndarray]: + self.ones = np.ones(len(self.model.observed)) + self.gammaln = special.gammaln(self.model.observed + 1) + try: + ret_c = optimize.fmin_ncg( + self.nllOfNuisances, + ini, + fprime=self.dNLLdTheta, + fhess=self.d2NLLdTheta2, + full_output=True, + disp=0, + ) + # then always continue with TNC + if type(self.model.observed) in [int, float]: + bounds = [(-10 * self.model.observed, 10 * self.model.observed)] + else: + bounds = [(-10 * x, 10 * x) for x in self.model.observed] + ini = ret_c + ret_c = optimize.fmin_tnc( + self.nllOfNuisances, ret_c[0], fprime=self.dNLLdTheta, disp=0, bounds=bounds + ) + # print ( "[findThetaHat] mu=%s bg=%s observed=%s V=%s, nsig=%s theta=%s, nll=%s" % ( self.nsig[0]/self.model.efficiencies[0], self.model.backgrounds, self.model.observed,self.model.covariance, self.nsig, ret_c[0], self.nllOfNuisances(ret_c[0]) ) ) + if ret_c[-1] not in [0, 1, 2]: + return ret_c[0], ret_c[-1] + else: + return ret_c[0], 0 + logger.debug("tnc worked.") - :param nsig: predicted signal (float) - :param nobs: number of observed events (float) - :param nb: predicted background (float) - :param deltab: uncertainty on background (float) + ret = ret_c[0] + return ret, -2 + except (IndexError, ValueError) as e: + logger.error("exception: %s. ini[-3:]=%s" % (e, ini[-3:])) + raise Exception("cov-1=%s" % (self.model.covariance + self.model.var_s(nsig)) ** (-1)) + return ini, -1 - :return: likelihood to observe nobs events (float) + def marginalizedLLHD1D(self, nsig, nll): + """ + Return the likelihood (of 1 signal region) to observe nobs events given the + predicted background nb, error on this background (deltab), + expected number of signal events nsig and the relative error on the signal (deltas_rel). - """ - self.sigma2 = self.model.covariance + self.model.var_s(nsig)## (self.model.deltas)**2 - self.sigma_tot = sqrt(self.sigma2) - self.lngamma = math.lgamma(self.model.observed[0] + 1) - # Why not a simple gamma function for the factorial: - # ----------------------------------------------------- - # The scipy.stats.poisson.pmf probability mass function - # for the Poisson distribution only works for discrete - # numbers. The gamma distribution is used to create a - # continuous Poisson distribution. - # - # Why not a simple gamma function for the factorial: - # ----------------------------------------------------- - # The gamma function does not yield results for integers - # larger than 170. Since the expression for the Poisson - # probability mass function as a whole should not be huge, - # the exponent of the log of this expression is calculated - # instead to avoid using large numbers. - - - #Define integrand (gaussian_(bg+signal)*poisson(nobs)): - def prob( x, nsig ): - poisson = exp(self.model.observed*log(x) - x - self.lngamma ) - gaussian = stats.norm.pdf(x,loc=self.model.backgrounds+nsig,scale=self.sigma_tot) - - return poisson*gaussian - - #Compute maximum value for the integrand: - xm = self.model.backgrounds + nsig - self.sigma2 - #If nb + nsig = sigma2, shift the values slightly: - if xm == 0.: - xm = 0.001 - xmax = xm*(1.+sign(xm)*sqrt(1. + 4.*self.model.observed*self.sigma2/xm**2))/2. 
- - #Define initial integration range: - nrange = 5. - a = max(0.,xmax-nrange*sqrt(self.sigma2)) - b = xmax+nrange*self.sigma_tot - like = integrate.quad(prob,a,b,(nsig), epsabs=0.,epsrel=1e-3)[0] - if like==0.: - return 0. - - #Increase integration range until integral converges - err = 1. - ctr=0 - while err > 0.01: - ctr+=1 - if ctr > 10.: - raise Exception("Could not compute likelihood within required precision") - - like_old = like - nrange = nrange*2 - a = max(0.,(xmax-nrange*self.sigma_tot)[0][0] ) - b = (xmax+nrange*self.sigma_tot)[0][0] - like = integrate.quad(prob,a,b,(nsig), - epsabs=0.,epsrel=1e-3)[0] - if like == 0.: - continue - err = abs(like_old-like)/like - - #Renormalize the likelihood to account for the cut at x = 0. - #The integral of the gaussian from 0 to infinity gives: - #(1/2)*(1 + Erf(mu/sqrt(2*sigma2))), so we need to divide by it - #(for mu - sigma >> 0, the normalization gives 1.) - norm = (1./2.)*(1. + special.erf((self.model.backgrounds+nsig)/sqrt(2.*self.sigma2))) - like = like/norm + :param nsig: predicted signal (float) + :param nobs: number of observed events (float) + :param nb: predicted background (float) + :param deltab: uncertainty on background (float) - if nll: - like = - log ( like ) - - return like[0][0] - - def marginalizedLikelihood(self, nsig, nll ): - """ compute the marginalized likelihood of observing nsig signal event""" - if self.model.isLinear() and self.model.n == 1: ## 1-dimensional non-skewed llhds we can integrate analytically - return self.marginalizedLLHD1D ( nsig, nll ) - - vals=[] - self.gammaln = special.gammaln(self.model.observed + 1) - thetas = stats.multivariate_normal.rvs(mean=[0.]*self.model.n, - # cov=(self.model.totalCovariance(nsig)), - cov=self.model.V, - size=self.ntoys ) ## get ntoys values - for theta in thetas : - if self.model.isLinear(): - lmbda = nsig + self.model.backgrounds + theta - else: - lmbda = nsig + self.model.A + theta + self.model.C*theta**2/self.model.B**2 - if self.model.isScalar( lmbda ): - lmbda = array([lmbda]) - for ctr,v in enumerate( lmbda ): - if v<=0.: lmbda[ctr]=1e-30 -# print ( "lmbda=",lmbda ) - poisson = self.model.observed*NP.log(lmbda) - lmbda - self.gammaln - # poisson = NP.exp(self.model.observed*NP.log(lmbda) - lmbda - self.model.backgrounds - self.gammaln) - vals.append( NP.exp ( sum(poisson) ) ) - #vals.append ( reduce(lambda x, y: x*y, poisson) ) - mean = NP.mean( vals ) - if nll: - if mean == 0.: - mean = 1e-100 - mean = - log ( mean ) - return mean + :return: likelihood to observe nobs events (float) + """ + self.sigma2 = self.model.covariance + self.model.var_s(nsig) ## (self.model.deltas)**2 + self.sigma_tot = sqrt(self.sigma2) + self.lngamma = math.lgamma(self.model.observed[0] + 1) + # Why not a simple gamma function for the factorial: + # ----------------------------------------------------- + # The scipy.stats.poisson.pmf probability mass function + # for the Poisson distribution only works for discrete + # numbers. The gamma distribution is used to create a + # continuous Poisson distribution. + # + # Why not a simple gamma function for the factorial: + # ----------------------------------------------------- + # The gamma function does not yield results for integers + # larger than 170. Since the expression for the Poisson + # probability mass function as a whole should not be huge, + # the exponent of the log of this expression is calculated + # instead to avoid using large numbers. 
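+ # Editor's note (not part of the original patch): the block below evaluates the
+ # marginalised 1D likelihood, L(nobs) = int dx Pois(nobs | x) * Gauss(x ; nb + nsig, sigma_tot),
+ # with scipy.integrate.quad on a bracket centred on the integrand's maximum that is
+ # doubled until the result is stable, then renormalised for the x >= 0 truncation.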
+ + # Define integrand (gaussian_(bg+signal)*poisson(nobs)): + def prob(x, nsig): + poisson = exp(self.model.observed * log(x) - x - self.lngamma) + gaussian = stats.norm.pdf(x, loc=self.model.backgrounds + nsig, scale=self.sigma_tot) + + return poisson * gaussian + + # Compute maximum value for the integrand: + xm = self.model.backgrounds + nsig - self.sigma2 + # If nb + nsig = sigma2, shift the values slightly: + if xm == 0.0: + xm = 0.001 + xmax = ( + xm + * (1.0 + sign(xm) * sqrt(1.0 + 4.0 * self.model.observed * self.sigma2 / xm**2)) + / 2.0 + ) + + # Define initial integration range: + nrange = 5.0 + a = max(0.0, xmax - nrange * sqrt(self.sigma2)) + b = xmax + nrange * self.sigma_tot + like = integrate.quad(prob, a, b, (nsig), epsabs=0.0, epsrel=1e-3)[0] + if like == 0.0: + return 0.0 + + # Increase integration range until integral converges + err = 1.0 + ctr = 0 + while err > 0.01: + ctr += 1 + if ctr > 10.0: + raise Exception("Could not compute likelihood within required precision") + + like_old = like + nrange = nrange * 2 + a = max(0.0, (xmax - nrange * self.sigma_tot)[0][0]) + b = (xmax + nrange * self.sigma_tot)[0][0] + like = integrate.quad(prob, a, b, (nsig), epsabs=0.0, epsrel=1e-3)[0] + if like == 0.0: + continue + err = abs(like_old - like) / like + + # Renormalize the likelihood to account for the cut at x = 0. + # The integral of the gaussian from 0 to infinity gives: + # (1/2)*(1 + Erf(mu/sqrt(2*sigma2))), so we need to divide by it + # (for mu - sigma >> 0, the normalization gives 1.) + norm = (1.0 / 2.0) * ( + 1.0 + special.erf((self.model.backgrounds + nsig) / sqrt(2.0 * self.sigma2)) + ) + like = like / norm - def profileLikelihood( self, nsig, nll ): - """ compute the profiled likelihood for nsig. - Warning: not normalized. - Returns profile likelihood and error code (0=no error) + if nll: + like = -log(like) + + return like[0][0] + + def marginalizedLikelihood(self, nsig, nll): + """compute the marginalized likelihood of observing nsig signal event""" + if ( + self.model.isLinear() and self.model.n == 1 + ): ## 1-dimensional non-skewed llhds we can integrate analytically + return self.marginalizedLLHD1D(nsig, nll) + + vals = [] + self.gammaln = special.gammaln(self.model.observed + 1) + thetas = stats.multivariate_normal.rvs( + mean=[0.0] * self.model.n, + # cov=(self.model.totalCovariance(nsig)), + cov=self.model.V, + size=self.toys, + ) ## get ntoys values + for theta in thetas: + if self.model.isLinear(): + lmbda = nsig + self.model.backgrounds + theta + else: + lmbda = nsig + self.model.A + theta + self.model.C * theta**2 / self.model.B**2 + if self.model.isScalar(lmbda): + lmbda = array([lmbda]) + for ctr, v in enumerate(lmbda): + if v <= 0.0: + lmbda[ctr] = 1e-30 + # print ( "lmbda=",lmbda ) + poisson = self.model.observed * np.log(lmbda) - lmbda - self.gammaln + # poisson = np.exp(self.model.observed*np.log(lmbda) - lmbda - self.model.backgrounds - self.gammaln) + vals.append(np.exp(sum(poisson))) + # vals.append ( reduce(lambda x, y: x*y, poisson) ) + mean = np.mean(vals) + if nll: + if mean == 0.0: + mean = 1e-100 + mean = -log(mean) + return mean + + def profileLikelihood(self, nsig, nll): + """compute the profiled likelihood for nsig. + Warning: not normalized. 
+ Returns profile likelihood and error code (0=no error) """ # compute the profiled (not normalized) likelihood of observing # nsig signal events - theta_hat,_ = self.findThetaHat ( nsig ) - ret = self.probMV ( nll, theta_hat ) + theta_hat, _ = self.findThetaHat(nsig) + if self.debug_mode: + self.theta_hat = theta_hat + ret = self.probMV(nll, theta_hat) return ret - def likelihood(self, nsig, marginalize=False, nll=False ): - """ compute likelihood for nsig, profiling the nuisances + def likelihood(self, nsig, marginalize=False, nll=False): + """compute likelihood for nsig, profiling the nuisances :param marginalize: if true, marginalize, if false, profile :param nll: return nll instead of likelihood """ @@ -686,270 +986,394 @@ def likelihood(self, nsig, marginalize=False, nll=False ): else: return self.profileLikelihood(nsig, nll) + def lmax(self, nsig=None, marginalize=False, nll=False, allowNegativeSignals=False): + """convenience function, computes likelihood for nsig = nobs-nbg, + :param marginalize: if true, marginalize, if false, profile nuisances. + :param nsig: number of signal events, needed only for combinations + if None, then it gets replaced with obsN - expBG + :param nll: return nll instead of likelihood + :param allowNegativeSignals: if False, then negative nsigs are replaced with 0. + """ + if type(nsig) == type(None): + nsig = self.model.observed - self.model.backgrounds + if len(self.model.observed) == 1: + if not allowNegativeSignals and nsig[0] < 0.0: + nsig = [0.0] + self.muhat = float(nsig[0]) + if abs(self.model.nsignal) > 1e-100: + self.muhat = float(nsig[0] / self.model.nsignal[0]) + self.sigma_mu = np.sqrt(self.model.observed[0] + self.model.covariance[0][0]) + return self.likelihood(nsig, marginalize=marginalize, nll=nll) + fmh = self.findMuHat( + nsig, allowNegativeSignals=allowNegativeSignals, extended_output=True, nll=nll + ) + muhat_, sigma_mu, lmax = fmh["muhat"], fmh["sigma_mu"], fmh["lmax"] + self.muhat = muhat_ + self.sigma_mu = sigma_mu + return self.likelihood(muhat_ * nsig, marginalize=marginalize, nll=nll) + + def findMuHatViaGradient( + self, + signal_rel, + allowNegativeSignals=False, + extended_output=False, + nll=False, + marginalize=False, + ): + """currently not used but finding muhat via gradient descent""" + + def myllhd(mu): + return self.likelihood(mu * signal_rel, nll=True, marginalize=marginalize) + + import scipy.optimize + + bounds = None + if not allowNegativeSignals: + bounds = [(0, 1000.0)] + o = scipy.optimize.minimize(myllhd, 0.0, bounds=bounds) + llhd = o.fun + if not nll: + llhd = np.exp(-o.fun) + hess = o.hess_inv + try: + hess = hess.todense() + except Exception as e: + pass + mu_hat = float(o.x[0]) + if extended_output: + sigma_mu = float(np.sqrt(hess[0][0])) + return mu_hat, sigma_mu, llhd + return mu_hat + def chi2(self, nsig, marginalize=False): - """ - Computes the chi2 for a given number of observed events nobs given - the predicted background nb, error on this background deltab, - expected number of signal events nsig and the relative error on - signal (deltas_rel). - :param marginalize: if true, marginalize, if false, profile - :param nsig: number of signal events - :return: chi2 (float) + """ + Computes the chi2 for a given number of observed events nobs given + the predicted background nb, error on this background deltab, + expected number of signal events nsig and the relative error on + signal (deltas_rel). 
+ :param marginalize: if true, marginalize, if false, profile + :param nsig: number of signal events + :return: chi2 (float) - """ - nsig = self.model.convert(nsig) - - # Compute the likelhood for the null hypothesis (signal hypothesis) H0: - llhd = self.likelihood(nsig, marginalize=marginalize, nll=True) - - # Compute the maximum likelihood H1, which sits at nsig = nobs - nb - # (keeping the same % error on signal): - dn = self.model.observed-self.model.backgrounds - maxllhd = self.likelihood(dn, marginalize=marginalize, nll=True ) - - chi2=2*(llhd-maxllhd) - - if not NP.isfinite ( chi2 ): - logger.error("chi2 is not a finite number! %s,%s,%s" % \ - (chi2, llhd,maxllhd)) - # Return the test statistic -2log(H0/H1) - return chi2 - -class CLsComputer: - def __init__(self, ntoys=10000, cl=.95): + """ + nsig = self.model.convert(nsig) + + # Compute the likelhood for the null hypothesis (signal hypothesis) H0: + llhd = self.likelihood(nsig, marginalize=marginalize, nll=True) + + # Compute the maximum likelihood H1, which sits at nsig = nobs - nb + # (keeping the same % error on signal): + if len(nsig) == 1: + nsig = self.model.observed - self.model.backgrounds + maxllhd = self.likelihood(nsig, marginalize=marginalize, nll=True) + else: + maxllhd = self.lmax(nsig, marginalize=marginalize, nll=True, allowNegativeSignals=False) + chi2 = 2 * (llhd - maxllhd) + + if not np.isfinite(chi2): + logger.error("chi2 is not a finite number! %s,%s,%s" % (chi2, llhd, maxllhd)) + # Return the test statistic -2log(H0/H1) + return chi2 + + +class UpperLimitComputer: + debug_mode = False + + def __init__(self, ntoys: float = 30000, cl: float = 0.95): """ :param ntoys: number of toys when marginalizing :param cl: desired quantile for limits """ - self.ntoys = ntoys + self.toys = ntoys self.cl = cl - def ulSigma(self, model, marginalize=False, toys=None, expected=False ): - """ upper limit obtained from the defined Data (using the signal prediction + def getUpperLimitOnSigmaTimesEff( + self, model, marginalize=False, toys=None, expected=False, trylasttime=False + ): + """upper limit on the fiducial cross section sigma times efficiency, + summed over all signal regions, i.e. sum_i xsec^prod_i eff_i + obtained from the defined Data (using the signal prediction for each signal regio/dataset), by using the q_mu test statistic from the CCGV paper (arXiv:1007.1727). :params marginalize: if true, marginalize nuisances, else profile them :params toys: specify number of toys. Use default is none - :params expected: compute the expected value, not the observed. 
- :returns: upper limit on *production* xsec (efficiencies unfolded) + :params expected: if false, compute observed, + true: compute a priori expected, "posteriori": + compute a posteriori expected + :params trylasttime: if True, then dont try extra + :returns: upper limit on fiducial cross section """ + ul = self.getUpperLimitOnMu( + model, marginalize=marginalize, toys=toys, expected=expected, trylasttime=trylasttime + ) + + if ul == None: + return ul + if model.lumi is None: + logger.error(f"asked for upper limit on fiducial xsec, but no lumi given with the data") + return ul + return ul / model.lumi + + def _ul_preprocess( + self, + model: Data, + marginalize: Optional[bool] = False, + toys: Optional[float] = None, + expected: Optional[Union[bool, Text]] = False, + trylasttime: Optional[bool] = False, + signal_type: Optional[Text] = "signal_rel", + ) -> Tuple: + """ + Process the upper limit calculator + :param model: statistical model + :param marginalize: if true, marginalize nuisances, else profile them + :param toys: specify number of toys. Use default is none + :param expected: if false, compute observed, + true: compute a priori expected, "posteriori": + compute a posteriori expected + :param trylasttime: if True, then dont try extra + :param signal_type: signal_type will allow both SModelS and MadAnalysis interface + to use this function simultaneously. For signal_rel upper limit + is calculated for normalised signal events for nsignals upper limit + is calculated for number of signal events. + :return: mu_hat, sigma_mu, root_func + """ + assert signal_type in [ + "signal_rel", + "nsignal", + ], f"Signal type can only be `signal_rel` or `nsignal`. `{signal_type}` is given." + # if expected: + # marginalize = True if model.zeroSignal(): - """ only zeroes in efficiencies? cannot give a limit! """ - return None - if toys==None: - toys=self.ntoys + """only zeroes in efficiencies? cannot give a limit!""" + return None, None, None + if toys == None: + toys = self.toys oldmodel = model if expected: model = copy.deepcopy(oldmodel) - #model.observed = model.backgrounds - for i,d in enumerate(model.backgrounds): - model.observed[i]=int(NP.round(d)) + if expected == "posteriori": + # print ( "here!" ) + tempc = LikelihoodComputer(oldmodel, toys) + theta_hat_, _ = tempc.findThetaHat(0 * getattr(oldmodel, signal_type)) + # model.observed = model.backgrounds + for i, d in enumerate(model.backgrounds): + # model.observed[i]=int(np.ceil(d)) + # model.observed[i]=int(np.round(d)) + if expected == "posteriori": + d += theta_hat_[i] + model.observed[i] = float(d) computer = LikelihoodComputer(model, toys) - mu_hat = computer.findMuHat(model.signal_rel) - theta_hat0,_ = computer.findThetaHat(0*model.signal_rel) - sigma_mu = computer.getSigmaMu(model.signal_rel) - + mu_hat = computer.findMuHat( + getattr(model, signal_type), allowNegativeSignals=False, extended_output=False + ) + theta_hat0, _ = computer.findThetaHat(0 * getattr(model, signal_type)) + sigma_mu = computer.getSigmaMu(mu_hat, getattr(model, signal_type), theta_hat0) + + nll0 = computer.likelihood( + getattr(model, "signals" if signal_type == "signal_rel" else "nsignals")(mu_hat), + marginalize=marginalize, + nll=True, + ) + # print ( f"SL nll0 {nll0:.3f} muhat {mu_hat:.3f} sigma_mu {sigma_mu:.3f}" ) + if np.isinf(nll0) and not marginalize and not trylasttime: + logger.warning( + "nll is infinite in profiling! we switch to marginalization, but only for this one!" 
+ ) + marginalize = True + nll0 = computer.likelihood( + getattr(model, "signals" if signal_type == "signal_rel" else "nsignals")(mu_hat), + marginalize=True, + nll=True, + ) + if np.isinf(nll0): + logger.warning("marginalization didnt help either. switch back.") + marginalize = False + else: + logger.warning("marginalization worked.") aModel = copy.deepcopy(model) - aModel.observed = array([NP.round(x+y) for x,y in zip(model.backgrounds,theta_hat0)]) - #print ( "aModeldata=", aModel.observed ) - #aModel.observed = array ( [ round(x) for x in model.backgrounds ] ) + aModel.observed = array([x + y for x, y in zip(model.backgrounds, theta_hat0)]) aModel.name = aModel.name + "A" + # print ( f"SL finding mu hat with {aModel.signal_rel}: mu_hatA, obs: {aModel.observed}" ) compA = LikelihoodComputer(aModel, toys) ## compute - mu_hatA = compA.findMuHat(aModel.signal_rel) - if mu_hat < 0.: - mu_hat = 0. - nll0 = computer.likelihood(model.signals(mu_hat), - marginalize=marginalize, - nll=True) - if NP.isinf(nll0) and marginalize==False: - logger.warning("nll is infinite in profiling! we switch to marginalization, but only for this one!" ) - marginalize=True - nll0 = computer.likelihood(model.signals(mu_hat), - marginalize=True, - nll=True) - if NP.isinf(nll0): - logger.warning("marginalization didnt help either. switch back.") - marginalize=False - else: - logger.warning("marginalization worked.") - nll0A = compA.likelihood(aModel.signals(mu_hatA), - marginalize=marginalize, - nll=True) - - def root_func(mu): - ## the function to minimize. - nsig = model.signals(mu) + mu_hatA = compA.findMuHat(getattr(aModel, signal_type)) + nll0A = compA.likelihood( + getattr(aModel, "signals" if signal_type == "signal_rel" else "nsignals")(mu_hatA), + marginalize=marginalize, + nll=True, + ) + # print ( f"SL nll0A {nll0A:.3f} mu_hatA {mu_hatA:.3f} bg {aModel.backgrounds[0]:.3f} obs {aModel.observed[0]:.3f}" ) + # return 1. + + def clsRoot(mu: float, return_type: Text = "CLs-alpha") -> float: + """ + Calculate the root + :param mu: float POI + :param get_cls: if true returns 1-CLs value + """ + ## the function to find the zero of (ie CLs - alpha) + nsig = getattr(model, "signals" if signal_type == "signal_rel" else "nsignals")(mu) computer.ntot = model.backgrounds + nsig - nll = computer.likelihood(nsig, marginalize=marginalize, nll=True ) - nllA = compA.likelihood(nsig, marginalize=marginalize, nll=True ) - qmu = 2*( nll - nll0 ) - if qmu<0.: qmu=0. - sqmu = sqrt (qmu) - qA = 2*( nllA - nll0A ) - # print ( "mu: %s, qMu: %s, qA: %s nll0A: %s nllA: %s" % ( mu, qmu, qA, nll0A, nllA ) ) - if qA<0.: - qA=0. - sqA = sqrt(qA) - CLsb = 1. - stats.multivariate_normal.cdf(sqmu) - CLb = 0. - if qA >= qmu: - CLb = stats.multivariate_normal.cdf(sqA - sqmu) - else: - if qA == 0.: - CLsb = 1. - CLb = 1. - else: - CLsb = 1. - stats.multivariate_normal.cdf( (qmu + qA)/(2*sqA) ) - CLb = 1. - stats.multivariate_normal.cdf( (qmu - qA)/(2*sqA) ) - CLs = 0. - if CLb > 0.: - CLs = CLsb/CLb - root = CLs - 1. + self.cl - return root - - - a,b=1.5*mu_hat,2.5*mu_hat+2*sigma_mu - ctr=0 - while True: - while ( NP.sign ( root_func(a)* root_func(b) ) > -.5 ): - b=1.4*b ## widen bracket FIXME make a linear extrapolation! - a=a-(b-a)*.3 ## widen bracket - if a < 0.: a=0. - ctr+=1 - if ctr>20: ## but stop after 20 trials - if toys > 2000: - logger.error("cannot find brent bracket after 20 trials. a,b=%s(%s),%s(%s)" % ( root_func(a),a,root_func(b),b ) ) - return None - else: - logger.debug("cannot find brent bracket after 20 trials. 
but very low number of toys") - return self.ulSigma ( model, marginalize, 4*toys ) - try: - mu_lim = optimize.brentq ( root_func, a, b, rtol=1e-03, xtol=1e-06 ) - return mu_lim - except ValueError as e: ## it could still be that the signs arent opposite - # in that case, try again - pass + nll = computer.likelihood(nsig, marginalize=marginalize, nll=True) + nllA = compA.likelihood(nsig, marginalize=marginalize, nll=True) + return CLsfromNLL(nllA, nll0A, nll, nll0, return_type=return_type) - """ - .. method:: computeCLs - :synopsis: Method that implements CLs calculation from simplified likelihoods as presented - in CMS-NOTE-2017-001. It basically consist in fixing mu to 1.0 instead of varying it - like it is done in the previous ulSigma method - - .. methodauthor:: Gael Alguero + return mu_hat, sigma_mu, clsRoot - """ + def getUpperLimitOnMu( + self, model, marginalize=False, toys=None, expected=False, trylasttime=False + ): + """upper limit on the signal strength multiplier mu + obtained from the defined Data (using the signal prediction - def computeCLs(self, model, marginalize=False, toys=None, expected=False ): - """ exclusion confidence level obtained from the defined Data (using the signal prediction for each signal regio/dataset), by using the q_mu test statistic from the CCGV paper (arXiv:1007.1727). :params marginalize: if true, marginalize nuisances, else profile them :params toys: specify number of toys. Use default is none - :params expected: compute the expected value, not the observed. - :returns: exclusion confidence level (1-CLs) + :params expected: if false, compute observed, + true: compute a priori expected, "posteriori": + compute a posteriori expected + :params trylasttime: if True, then dont try extra + :returns: upper limit on the signal strength multiplier mu """ - if model.zeroSignal(): - """ only zeroes in efficiencies? cannot give a limit! """ + mu_hat, sigma_mu, clsRoot = self._ul_preprocess( + model, marginalize, toys, expected, trylasttime + ) + if mu_hat == None: return None - if toys==None: - toys=self.ntoys - oldmodel = model - if expected: - model = copy.deepcopy(oldmodel) - #model.observed = model.backgrounds - for i,d in enumerate(model.backgrounds): - model.observed[i]=int(NP.round(d)) - computer = LikelihoodComputer(model, toys) - mu_hat = computer.findMuHat(model.nsignal) - theta_hat0,_ = computer.findThetaHat(0*model.nsignal) - sigma_mu = computer.getSigmaMu(model.nsignal) + a, b = determineBrentBracket(mu_hat, sigma_mu, clsRoot) + mu_lim = optimize.brentq(clsRoot, a, b, rtol=1e-03, xtol=1e-06) + return mu_lim + + def computeCLs( + self, + model: Data, + marginalize: bool = False, + toys: float = None, + expected: Union[bool, Text] = False, + trylasttime: bool = False, + ) -> float: + """ + Compute the exclusion confidence level of the model (1-CLs) + :param model: statistical model + :param marginalize: if true, marginalize nuisances, else profile them + :param toys: specify number of toys. 
Use default is none + :param expected: if false, compute observed, + true: compute a priori expected, "posteriori": + compute a posteriori expected + :param trylasttime: if True, then dont try extra + :return: 1 - CLs value + """ + _, _, clsRoot = self._ul_preprocess(model, marginalize, toys, expected, trylasttime) - aModel = copy.deepcopy(model) - aModel.observed = array([NP.round(x+y) for x,y in zip(model.backgrounds,theta_hat0)]) - #print ( "aModeldata=", aModel.observed ) - #aModel.observed = array ( [ round(x) for x in model.backgrounds ] ) - aModel.name = aModel.name + "A" - compA = LikelihoodComputer(aModel, toys) - ## compute - mu_hatA = compA.findMuHat(aModel.nsignal) - # if mu_hat < 0.: - # mu_hat = 0. - # -log L(mu_hat, theta_hat(mu_hat)) - nll0 = computer.likelihood(model.signals(mu_hat), - marginalize=marginalize, - nll=True) - if NP.isinf(nll0) and marginalize==False: - logger.warning("nll is infinite in profiling! we switch to marginalization, but only for this one!" ) - marginalize=True - nll0 = computer.likelihood(model.signals(mu_hat), - marginalize=True, - nll=True) - if NP.isinf(nll0): - logger.warning("marginalization didnt help either. switch back.") - marginalize=False - else: - logger.warning("marginalization worked.") - nll0A = compA.likelihood(aModel.signals(mu_hatA), - marginalize=marginalize, - nll=True) - - # nsig = model.signals(1.) - computer.ntot = model.backgrounds + model.nsignal - # -log L(mu = 1, theta(1)) - nll = computer.likelihood(model.nsignal, marginalize=marginalize, nll=True ) - nllA = compA.likelihood(model.nsignal, marginalize=marginalize, nll=True ) - qmu = 2*( nll - nll0 ) - if qmu<0.: qmu=0. - sqmu = sqrt (qmu) - qA = 2*( nllA - nll0A ) - # print ( "mu: %s, qMu: %s, qA: %s nll0A: %s nllA: %s" % ( mu, qmu, qA, nll0A, nllA ) ) - if qA<0.: - qA=0. - sqA = sqrt(qA) - if qA >= qmu: - CLsb = 1. - stats.multivariate_normal.cdf(sqmu) - CLb = stats.multivariate_normal.cdf(sqA - sqmu) - else: - if qA == 0.: - CLsb = 1. - CLb = 1. - else: - CLsb = 1. - stats.multivariate_normal.cdf( (qmu + qA)/(2*sqA) ) - CLb = 1. - stats.multivariate_normal.cdf( (qmu - qA)/(2*sqA) ) - # CLs = 0. - # if CLb > 0.: - CLs = CLsb/CLb if CLb > 0. else 1. - return 1 - CLs + # 1-(CLs+alpha) -> alpha = 0.05 + return clsRoot(1.0, return_type="1-CLs") if __name__ == "__main__": - C = [ 18774.2, -2866.97, -5807.3, -4460.52, -2777.25, -1572.97, -846.653, -442.531, - -2866.97, 496.273, 900.195, 667.591, 403.92, 222.614, 116.779, 59.5958, - -5807.3, 900.195, 1799.56, 1376.77, 854.448, 482.435, 258.92, 134.975, - -4460.52, 667.591, 1376.77, 1063.03, 664.527, 377.714, 203.967, 106.926, - -2777.25, 403.92, 854.448, 664.527, 417.837, 238.76, 129.55, 68.2075, - -1572.97, 222.614, 482.435, 377.714, 238.76, 137.151, 74.7665, 39.5247, - -846.653, 116.779, 258.92, 203.967, 129.55, 74.7665, 40.9423, 21.7285, - -442.531, 59.5958, 134.975, 106.926, 68.2075, 39.5247, 21.7285, 11.5732] - nsignal = [ x/100. for x in [47,29.4,21.1,14.3,9.4,7.1,4.7,4.3] ] - m=Data( observed=[1964,877,354,182,82,36,15,11], - backgrounds=[2006.4,836.4,350.,147.1,62.0,26.2,11.1,4.7], - covariance= C, -# third_moment = [ 0.1, 0.02, 0.1, 0.1, 0.003, 0.0001, 0.0002, 0.0005 ], - third_moment = [ 0. 
] * 8, - nsignal = nsignal, - name="CMS-NOTE-2017-001 model" ) - ulComp = UpperLimitComputer(ntoys=500, cl=.95) - #uls = ulComp.ulSigma ( Data ( 15,17.5,3.2,0.00454755 ) ) - #print ( "uls=", uls ) - ul_old = 131.828*sum(nsignal) #With respect to the older refernece value one must normalize the xsec - print ( "old ul=", ul_old ) - ul = ulComp.ulSigma ( m ) - print ( "ul (marginalized)", ul ) - ul = ulComp.ulSigma ( m, marginalize=False ) - print ( "ul (profiled)", ul ) + C = [ + 18774.2, + -2866.97, + -5807.3, + -4460.52, + -2777.25, + -1572.97, + -846.653, + -442.531, + -2866.97, + 496.273, + 900.195, + 667.591, + 403.92, + 222.614, + 116.779, + 59.5958, + -5807.3, + 900.195, + 1799.56, + 1376.77, + 854.448, + 482.435, + 258.92, + 134.975, + -4460.52, + 667.591, + 1376.77, + 1063.03, + 664.527, + 377.714, + 203.967, + 106.926, + -2777.25, + 403.92, + 854.448, + 664.527, + 417.837, + 238.76, + 129.55, + 68.2075, + -1572.97, + 222.614, + 482.435, + 377.714, + 238.76, + 137.151, + 74.7665, + 39.5247, + -846.653, + 116.779, + 258.92, + 203.967, + 129.55, + 74.7665, + 40.9423, + 21.7285, + -442.531, + 59.5958, + 134.975, + 106.926, + 68.2075, + 39.5247, + 21.7285, + 11.5732, + ] + nsignal = [x / 100.0 for x in [47, 29.4, 21.1, 14.3, 9.4, 7.1, 4.7, 4.3]] + m = Data( + observed=[1964, 877, 354, 182, 82, 36, 15, 11], + backgrounds=[2006.4, 836.4, 350.0, 147.1, 62.0, 26.2, 11.1, 4.7], + covariance=C, + # third_moment = [ 0.1, 0.02, 0.1, 0.1, 0.003, 0.0001, 0.0002, 0.0005 ], + third_moment=[0.0] * 8, + nsignal=nsignal, + name="CMS-NOTE-2017-001 model", + ) + ulComp = UpperLimitComputer(ntoys=500, cl=0.95) + # uls = ulComp.ulSigma ( Data ( 15,17.5,3.2,0.00454755 ) ) + # print ( "uls=", uls ) + ul_old = 131.828 * sum( + nsignal + ) # With respect to the older refernece value one must normalize the xsec + print("old ul=", ul_old) + ul = ulComp.getUpperLimitOnMu(m, marginalize=True) + + cls = ulComp.computeCLs(m, marginalize=True) + print("ul (marginalized)", ul) + print("CLs (marginalized)", cls) + + ul = ulComp.getUpperLimitOnMu(m, marginalize=False) + cls = ulComp.computeCLs(m, marginalize=False) + print("ul (profiled)", ul) + print("CLs (profiled)", cls) + + """ + results: + old ul= 180.999844 + ul (marginalized) 184.8081186162269 + CLs (marginalized) 1.0 + ul (profiled) 180.68039063387553 + CLs (profiled) 0.75 + """ diff --git a/madanalysis/misc/taco/__init__.py b/madanalysis/misc/taco/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/madanalysis/misc/taco/base.py b/madanalysis/misc/taco/base.py new file mode 100644 index 00000000..6874332e --- /dev/null +++ b/madanalysis/misc/taco/base.py @@ -0,0 +1,143 @@ +################################################################################ +# +# Copyright (C) 2012-2022 Jack Araz, Eric Conte & Benjamin Fuks +# The MadAnalysis development team, email: +# +# This file is part of MadAnalysis 5. +# Official website: +# +# MadAnalysis 5 is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# MadAnalysis 5 is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with MadAnalysis 5. 
If not, see +# +################################################################################ + +from abc import ABC, abstractmethod +from typing import Text + + +class MA5DataSet: + def __init__(self, srname): + """param srname: the name of the signal region + ("dataset" is SModelS' name for a signal region)""" + self.srname = srname + + def getID(self): + return self.srname + + +class MA5CrossSection: + def __init__(self, value: float, sqrts: float = 13.0): + """production cross section + :param value: the value of the production cross section + :param sqrts: center of mass energy + """ + self.value = value + self.sqrts = sqrts + + +class TACOBase(ABC): + def __init__( + self, + analysis: Text = "__unknown_analysis__", + regionID: Text = "__unknown_region__", + xsection: float = -1, + ): + """ + Abstract Class for TACO interface + + Parameters + ---------- + analysis: (Text) analysis name + regionID: (Text) region name + """ + self.analysis = analysis + self.dataset = MA5DataSet(regionID) + self.xsection = MA5CrossSection(xsection, 13.0) + + def __repr__(self): + return f"Analysis: {self.analysis}, Region: {self.dataset.srname}" + + @abstractmethod + def likelihood( + self, mu: float = 1.0, nll: bool = False, expected: bool = False, useCached=False + ) -> float: + """ + Returns the value of the likelihood. + Inspired by the `pyhf.infer.mle` module but for non-log likelihood + + Parameters + ---------- + mu: float + POI: signal strength + nll: bool + if true, return nll, not llhd + expected: bool + if true, compute expected likelihood, else observed. + """ + raise NotImplementedError + + @abstractmethod + def lmax(self, expected: bool = False, nll: bool = False) -> float: + """ + Compute maximum likelihood + + Parameters + ---------- + expected: bool + if expected, compute expected max likelihood + nll: bool + if true, return nll, not llhd + """ + raise NotImplementedError + + @abstractmethod + def sigma_mu(self, expected: bool = False) -> float: + """ + get an estimate for the standard deviation of mu around mu_hat this is only used for + initialisation of the optimizer, so can be skipped in the first version, + + expected: + get muhat for the expected likelihood, not observed. + """ + raise NotImplementedError + + @abstractmethod + def muhat(self, expected: bool = False, allowNegativeSignals: bool = True) -> float: + """ + get the value of mu for which the likelihood is maximal. this is only used for + initialisation of the optimizer, so can be skipped in the first version of the code. + + Parameters + ---------- + expected: bool + get muhat for the expected likelihood, not observed. + allowNegativeSignals: bool + if true, allow negative values for mu + """ + raise NotImplementedError + + @abstractmethod + def getUpperLimit(self, expected: bool = False): + """ + code here that retrieves the upper limit, expected or observed + """ + raise NotImplementedError + + def dataType(self): # a technicality + return "efficiencyMap" + + def analysisId(self): + """ + return the analysis id, e.g. "atlas_susy_2018_02" + """ + return self.analysis diff --git a/madanalysis/misc/taco/pyhf_backend.py b/madanalysis/misc/taco/pyhf_backend.py new file mode 100644 index 00000000..a31c3d2c --- /dev/null +++ b/madanalysis/misc/taco/pyhf_backend.py @@ -0,0 +1,144 @@ +################################################################################ +# +# Copyright (C) 2012-2022 Jack Araz, Eric Conte & Benjamin Fuks +# The MadAnalysis development team, email: +# +# This file is part of MadAnalysis 5. 
+# Official website: +# +# MadAnalysis 5 is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# MadAnalysis 5 is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with MadAnalysis 5. If not, see +# +################################################################################ + +from madanalysis.misc.taco.base import TACOBase +from madanalysis.misc.pyhf_interface import PyhfInterface +from typing import Text + + +class pyhfSingleRegion(TACOBase): + def __init__( + self, + analysis: Text, + regionID: Text, + nobs: int, + nb: float, + deltanb: float, + N0: float, + Nf: float, + xsection: float, + lumi: float, + sqrts: float = 13.0, + ): + super(pyhfSingleRegion, self).__init__(analysis, regionID, xsection) + self.xsection.sqrts = sqrts + self.Nf = Nf + self.N0 = N0 + self.lumi = lumi + + self.experimental_data = [nobs, nb, deltanb] + + self.marginalize = False + self.cachedLlhds = {"exp": {}, "obs": {}} + + def nsignal(self, mu: float = 1.0): + """ + Number of signal events + + Parameters + ---------- + mu: (float) POI signal strength + """ + return mu * self.xsection.value * 1000.0 * self.lumi * self.Nf / self.N0 + + def computeCLs(self, expected: bool = False) -> float: + """ + for testing purposes + """ + interface = PyhfInterface(self.nsignal(), *self.experimental_data) + if expected: + return interface.computeCLs(CLs_exp=True) + else: + return interface.computeCLs(CLs_obs=True) + + def likelihood( + self, + mu: float = 1.0, + expected: bool = False, + nll: bool = False, + useCached=False, + ) -> float: + """ + + Parameters + ---------- + mu: (float) POI signal strength + expected: if true, compute expected likelihood, else observed. + nll: if True, return the negative log likelihood instead of the likelihood + useCached: if true reuse cached results + + Returns + ------- + likelihood with respect to given POI + """ + + if useCached: + cached = None + if expected: + cached = self.cachedLlhds["exp"].get(mu, None) + else: + cached = self.cachedLlhds["obs"].get(mu, None) + + if cached is not None: + return cached + + interface = PyhfInterface(self.nsignal(), *self.experimental_data) + llhd = interface.likelihood(mu, expected=expected, nll=nll) + + if expected: + cached = self.cachedLlhds["exp"][mu] = llhd + else: + cached = self.cachedLlhds["obs"][mu] = llhd + + return llhd + + def sigma_mu(self, expected: bool = False, **kwargs) -> float: + """ + get an estimate for the standard deviation of mu around mu_hat this is only used for + initialisation of the optimizer, so can be skipped in the first version, it's even + less important than muhat + + Parameters + ---------- + expected: (float) if true, compute expected likelihood, else observed. + if "posteriori", compute a posteriori expected likelihood + (FIXME do we need this?) + """ + interface = PyhfInterface(self.nsignal(), *self.experimental_data) + return interface.sigma_mu(expected=expected) + + def muhat(self, expected: bool = False, allowNegativeSignals: bool = True) -> float: + """ + Find the most likely signal strength mu. 
+ + Parameters + ---------- + expected + allowNegativeSignals: if true, then also allow for negative values + """ + interface = PyhfInterface(self.nsignal(), *self.experimental_data) + return interface.muhat(expected, allowNegativeSignals) + + def getUpperLimit(self, expected: bool = False): + interface = PyhfInterface(self.nsignal(), *self.experimental_data) + return interface.computeUpperLimitOnMu(expected = expected) diff --git a/madanalysis/misc/taco/sl_backend.py b/madanalysis/misc/taco/sl_backend.py new file mode 100644 index 00000000..33183d84 --- /dev/null +++ b/madanalysis/misc/taco/sl_backend.py @@ -0,0 +1,158 @@ +################################################################################ +# +# Copyright (C) 2012-2022 Jack Araz, Eric Conte & Benjamin Fuks +# The MadAnalysis development team, email: +# +# This file is part of MadAnalysis 5. +# Official website: +# +# MadAnalysis 5 is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# MadAnalysis 5 is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with MadAnalysis 5. If not, see +# +################################################################################ + +from madanalysis.misc.taco.base import TACOBase +from madanalysis.misc.simplified_likelihood import Data, LikelihoodComputer, UpperLimitComputer +from typing import Text + + +class slSingleRegion(TACOBase): + def __init__( + self, + analysis: Text, + regionID: Text, + nobs: int, + nb: float, + deltanb: float, + N0: float, + Nf: float, + xsection: float, + lumi: float, + sqrts: float = 13.0, + ): + super(slSingleRegion, self).__init__(analysis, regionID, xsection) + self.xsection.sqrts = sqrts + self.Nf = Nf + self.N0 = N0 + self.lumi = lumi + + self.experimental_data = [nobs, nb, deltanb] + self.data = Data(*self.experimental_data, nsignal=self.nsignal()) + + self.marginalize = False + self.cachedLlhds = {"exp": {}, "obs": {}} + + def nsignal(self, mu: float = 1.0): + """ + Number of signal events + + Parameters + ---------- + mu: (float) POI signal strength + """ + return mu * self.xsection.value * 1000.0 * self.lumi * self.Nf / self.N0 + + def computeCLs(self, expected: bool = False) -> float: + """ + for testing purposes + """ + interface = UpperLimitComputer() + return interface.computeCLs(self.data, expected=expected) + + def likelihood( + self, + mu: float = 1.0, + expected: bool = False, + nll: bool = False, + useCached=False, + ) -> float: + """ + + Parameters + ---------- + mu: (float) POI signal strength + expected: if true, compute expected likelihood, else observed. 
+ nll: if True, return the negative log likelihood instead of the likelihood + useCached: if true reuse cached results + + Returns + ------- + likelihood with respect to given POI + """ + + if useCached: + cached = None + if expected: + cached = self.cachedLlhds["exp"].get(mu, None) + else: + cached = self.cachedLlhds["obs"].get(mu, None) + if cached is not None: + return cached + + if expected: + data = Data( + self.experimental_data[1], *self.experimental_data[1:], nsignal=self.nsignal() + ) + interface = LikelihoodComputer(data) + llhd = interface.likelihood(self.nsignal(mu), marginalize=self.marginalize, nll=nll) + cached = self.cachedLlhds["exp"][mu] = llhd + else: + interface = LikelihoodComputer(self.data) + llhd = interface.likelihood(self.nsignal(mu), marginalize=self.marginalize, nll=nll) + cached = self.cachedLlhds["obs"][mu] = llhd + return llhd + + def sigma_mu(self, expected: bool = False, **kwargs) -> float: + """ + get an estimate for the standard deviation of mu around mu_hat. This is only used for + the initialisation of the optimizer, so it can be skipped in a first version; it is even + less important than muhat + + Parameters + ---------- + expected: (bool) if true, compute expected likelihood, else observed. + if "posteriori", compute a posteriori expected likelihood + (FIXME do we need this?) + """ + + if expected: + data = Data( + self.experimental_data[1], *self.experimental_data[1:], nsignal=self.nsignal() + ) + else: + data = self.data + interface = LikelihoodComputer(data) + theta_hat = interface.findThetaHat(self.nsignal()) + return interface.getSigmaMu(1.0, self.nsignal(), theta_hat) + + def muhat(self, expected: bool = False, allowNegativeSignals: bool = True) -> float: + """ + Find the most likely signal strength mu. + + Parameters + ---------- + expected + allowNegativeSignals: if true, then also allow for negative values + """ + if expected: + data = Data( + self.experimental_data[1], *self.experimental_data[1:], nsignal=self.nsignal() + ) + else: + data = self.data + interface = LikelihoodComputer(data) + return interface.findMuHat([self.nsignal()], allowNegativeSignals=allowNegativeSignals) + + def getUpperLimit(self, expected: bool = False): + interface = UpperLimitComputer() + return interface.getUpperLimitOnMu(self.data, marginalize=self.marginalize, expected=expected) diff --git a/madanalysis/misc/taco_interface.py b/madanalysis/misc/taco_interface.py new file mode 100644 index 00000000..6de94cf0 --- /dev/null +++ b/madanalysis/misc/taco_interface.py @@ -0,0 +1,257 @@ +################################################################################ +# +# Copyright (C) 2012-2022 Jack Araz, Eric Conte & Benjamin Fuks +# The MadAnalysis development team, email: +# +# This file is part of MadAnalysis 5. +# Official website: +# +# MadAnalysis 5 is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# MadAnalysis 5 is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with MadAnalysis 5.
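Both single-region backends expose the same calls, so the simplified-likelihood one defined above can be driven as in the sketch below. The analysis name and all numbers are hypothetical; only the slSingleRegion constructor arguments shown in sl_backend.py are assumed.

from madanalysis.misc.taco.sl_backend import slSingleRegion

# Hypothetical region: 23 events observed on top of 19.5 +- 3.2 expected background,
# 2% selection efficiency (Nf/N0), 0.05 pb signal cross section, 139 fb^-1 of data.
region = slSingleRegion(
    analysis="atlas_susy_2099_99",
    regionID="SRA",
    nobs=23, nb=19.5, deltanb=3.2,
    N0=1.0e5, Nf=2.0e3,
    xsection=0.05,
    lumi=139.0,
)

obs_nll = region.likelihood(mu=1.0, nll=True)                 # observed dataset
exp_nll = region.likelihood(mu=1.0, nll=True, expected=True)  # nobs replaced by nb
mu_hat = region.muhat()
mu_ul = region.getUpperLimit()                                # 95% CL limit on mu
print(obs_nll, exp_nll, mu_hat, mu_ul)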
If not, see +# +################################################################################ + +from typing import Text + +from madanalysis.misc.database import Region + +import warnings +import numpy as np +import pyhf + +# TODO: remove pyhf dependency from TACO interface + + +class TACORegion(Region): + def __init__( + self, + analysis: Text, + regionID: Text, + nobs: int, + nb: float, + deltanb: float, + N0: float, + Nf: float, + xsection: float, + lumi: float, + s95exp: float, + s95obs: float, + cls: float, + best: bool = None # we might not need this + ) -> None: + """ + Initialization of MadAnalysis Signal Region + :param analysis: analysis name + :param regionID: region name + :param nobs: number of observed events + :param nb: number of expected events + :param deltanb: total uncertainty on the expected events + :param N0: sum of weights before the analysis + :param Nf: sum of weights after the final cut + :param xsection: production cross section + :param lumi: Luminosity of the analysis [fb^-1] + :param s95exp: [pb] excluded xsec at 95% CLs calculated via expected events + :param s95obs: [pb] excluded xsec at 95% CLs calculated via observed events + :param cls: exclusion CLs + """ + super(TACORegion, self).__init__( + regionID=regionID, nobs=nobs, nb=nb, deltanb=deltanb, lumi=lumi, analysis=analysis + ) + self.final_nevt = Nf + self.initial_nevt = N0 + + # Temporary: SMODELS requires xsection to be a class with value attr + class XSEC: + def __init__(self, xsec): + self.value = xsec + + self.xsection = XSEC(xsection) + self.xsec = xsection + + self.s95exp = s95exp + self.s95obs = s95obs + self.cls = cls + self.best = best + + self.marginalize = False + + self.cachedLlhds = {"exp": {}, "obs": {}} + + @property + def analysisId(self): + return self.analysis + + @property + def dataType(self) -> Text: + return "efficiencyMap" + + def getUpperLimit(self, expected: bool = False) -> float: + """ + get exclusion cross section at 95% CLs + :param expected: is xsec value expected or observed + :return: float cross section in pb + """ + return getattr(self, "s95exp" if expected else "s95obs") + + def getRValue(self, expected: bool = False) -> float: + return self.getUpperLimit(expected) / self.xsection.value + + def likelihood( + self, + mu: float = 1.0, + expected: bool = False, + nll: bool = False, + useCached=False, + ) -> float: + """ + compute the likelihood for the signal strength mu + + :param mu: signal strength + :param expected: if true, compute expected likelihood, else observed. + if "posteriori", compute a posteriori expected likelihood + (FIXME do we need this?)
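The likelihood() method that follows builds a single-bin pyhf model on the fly; the standalone construction it relies on looks like the sketch below, with illustrative numbers and the pyhf >= 0.6 simplemodels interface assumed.

import pyhf

nobs, nb, deltanb, nsignal = 23, 19.5, 3.2, 12.0   # illustrative single-bin inputs

model = pyhf.simplemodels.uncorrelated_background(
    signal=[nsignal], bkg=[nb], bkg_uncertainty=[deltanb]
)
data = [nobs] + model.config.auxdata

# pyhf.infer.mle.fixed_poi_fit returns the best-fit nuisance parameters and twice
# the negative log-likelihood at the fixed signal strength mu
_, twice_nll = pyhf.infer.mle.fixed_poi_fit(1.0, data, model, return_fitted_val=True)
print(float(twice_nll) / 2.0)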
+ :param nll: if True, return the negative log likelihood instead of the + likelihood + """ + + if useCached: + cached = None + if expected: + cached = self.cachedLlhds["exp"].get(mu, None) + else: + cached = self.cachedLlhds["obs"].get(mu, None) + + if cached is not None: + return cached + + def get_twice_nllh(model, data, par_bounds): + try: + with warnings.catch_warnings(): + # `pyhf.infer.mle.fixed_poi_fit` returns twice negative log likelihood + _, twice_nllh = pyhf.infer.mle.fixed_poi_fit( + mu, data, model, return_fitted_val=True, par_bounds=par_bounds + ) + return twice_nllh + except ValueError as err: + return "update bounds" + + model = pyhf.simplemodels.uncorrelated_background([self.nsignal], [self.nb], [self.deltanb]) + par_bounds = model.config.suggested_bounds() + # print(par_bounds) + data = [self.nobs if not expected else self.nb] + model.config.auxdata + + counter = 0 + while True: + twice_nllh = get_twice_nllh(model, data, par_bounds) + if twice_nllh == "update bounds": + par_bounds = [ + (par_bounds[0][0], par_bounds[0][1] * 2.0), + (par_bounds[1][0], par_bounds[1][1] * 2.0), + ] + # print(f"update bounds: {par_bounds}") + else: + break + counter += 1 + if counter > 3: + twice_nllh = -1 + break + + if nll: + return float(twice_nllh / 2.0) + + llhd = float(np.exp(-twice_nllh / 2.0)) + if expected: + cached = self.cachedLlhds["exp"][mu] = llhd + else: + cached = self.cachedLlhds["obs"][mu] = llhd + + return llhd + + +if __name__ == "__main__": + import argparse, json, os + from smodels.tools.theoryPredictionsCombiner import TheoryPredictionsCombiner + from smodels.tools.statistics import rootFromNLLs + + parser = argparse.ArgumentParser(description="Test TACO interface") + path = parser.add_argument_group("Path handling") + path.add_argument("REGIONDATA", type=str, help="Path to the JSON sterilized region data file.") + + param = parser.add_argument_group("Parameter handling") + param.add_argument("-xs", "--xsection", type=float, dest="XSEC", help="Cross section in pb.") + param.add_argument("-lumi", type=float, dest="LUMI", help="Luminosity in fb^-1.") + + args = parser.parse_args() + if not os.path.isfile(args.REGIONDATA): + raise FileNotFoundError + + with open(args.REGIONDATA, "r") as f: + regiondata = json.load(f) + + SRs = [] + for analysis, regdat in regiondata.items(): + + for regionID, data in regdat["regiondata"].items(): + if "nobs" not in data.keys(): + continue + current_region = TACORegion( + analysis, + regionID, + float(data["nobs"]), + float(data["nb"]), + float(data["deltanb"]), + float(data["N0"]), + float(data["Nf"]), + args.XSEC, + args.LUMI, + float(data["s95exp"]), + float(data["s95obs"]), + float(data["CLs"]), + ) + SRs.append(current_region) + break + + print(f"Analysis : {analysis}") + + for region in SRs: + if region.final_nevt > 0.0: + print(f"\nRegion name: {region.region_id}") + print(f"Expected upper limit : {region.s95exp}") + print(f"Obs upper limit : {region.s95obs}") + print(f"R-value : {region.getRValue()}") + print(f"Likelihood mu = 1 : {region.likelihood(1.)}") + print(f"Likelihood mu = 0 : {region.likelihood(0.)}") + + def clsRoot(mu, combiner, expected=False): + # at - infinity this should be .95, + # at + infinity it should -.05 + # Make sure to always compute the correct llhd value (from theoryPrediction) + # and not used the cached value (which is constant for mu~=1 an mu~=0) + + mu_hat, sigma_mu, lmax = combiner.findMuHat(allowNegativeSignals=True, extended_output=True) + nll0 = combiner.likelihood(mu_hat, expected=expected, 
nll=True) + # a posteriori expected is needed here + # mu_hat is mu_hat for signal_rel + mu_hatA, _, nll0A = combiner.findMuHat( + expected="posteriori", nll=True, extended_output=True + ) + + nll = combiner.likelihood(mu, nll=True, expected=expected, useCached=False) + nllA = combiner.likelihood(mu, expected="posteriori", nll=True, useCached=False) + ret = rootFromNLLs(nllA, nll0A, nll, nll0) + return ret + + combiner = TheoryPredictionsCombiner(SRs) + print("\nTheoryPredictionsCombiner: ") + print("ul", combiner.getUpperLimitOnMu(expected=False)) + print("expected ul", combiner.getUpperLimitOnMu(expected=True)) + print("r-value", combiner.getRValue()) + print(f"expected 1-CLs : {1. - clsRoot(1., combiner, True)}") + print(f"obs 1-CLs : {1. - clsRoot(1., combiner, False)}") diff --git a/tools/SampleAnalyzer/Process/Core/SampleAnalyzer.cpp b/tools/SampleAnalyzer/Process/Core/SampleAnalyzer.cpp index 8818fc20..7b505700 100644 --- a/tools/SampleAnalyzer/Process/Core/SampleAnalyzer.cpp +++ b/tools/SampleAnalyzer/Process/Core/SampleAnalyzer.cpp @@ -920,23 +920,25 @@ void SampleAnalyzer::UpdateProgressBar() } - +// Write header for the TACO file in CSV format void SampleAnalyzer::HeadSR(std::ostream &outwriter) { - outwriter << "#"; - for(MAuint32 i=0; i<analyzers_.size(); i++) - { - analyzers_[i]->Manager()->HeadSR(outwriter, analyzers_[i]->name()); - } + for(MAuint32 i=0; i<analyzers_.size(); i++) + { + analyzers_[i]->Manager()->HeadSR(outwriter, analyzers_[i]->name(), is_first); + } } - +// Write body of the TACO file in CSV format void SampleAnalyzer::DumpSR(std::ostream &outwriter) { - for(MAuint32 i=0; i<analyzers_.size(); i++) analyzers_[i]->Manager()->DumpSR(outwriter); - outwriter << std::endl; + for(MAuint32 i=0; i<analyzers_.size(); i++) + { + analyzers_[i]->Manager()->DumpSR(outwriter, is_first); + } + outwriter << std::endl; } void SampleAnalyzer::AddDefaultHadronic() diff --git a/tools/SampleAnalyzer/Process/RegionSelection/RegionSelectionManager.cpp b/tools/SampleAnalyzer/Process/RegionSelection/RegionSelectionManager.cpp index b666a4af..2e835fcd 100644 --- a/tools/SampleAnalyzer/Process/RegionSelection/RegionSelectionManager.cpp +++ b/tools/SampleAnalyzer/Process/RegionSelection/RegionSelectionManager.cpp @@ -164,16 +164,23 @@ void RegionSelectionManager::WriteHistoDefinition(SAFWriter& output) -void RegionSelectionManager::HeadSR(std::ostream &outwriter, const std::string &ananame) +void RegionSelectionManager::HeadSR(std::ostream &outwriter, const std::string &ananame, MAbool &is_first) { - for (MAuint32 i=0;i<regions_.size();i++) outwriter << ananame << "-" << regions_[i]->GetName(); + // Set first SR out of the for loop to avoid many if executions + // use :: instead of - since - is generally used in SR names + if (regions_.size() > 0 && is_first) outwriter << ananame << "::" << regions_[0]->GetName(); + + for (MAuint32 i = is_first ? 1 : 0; i < regions_.size(); i++) + outwriter << "," << ananame << "::" << regions_[i]->GetName(); } -void RegionSelectionManager::DumpSR(std::ostream &outwriter) +void RegionSelectionManager::DumpSR(std::ostream &outwriter, MAbool &is_first) { - for (MAuint32 i=0;i<regions_.size();i++) outwriter << regions_[i]->IsSurviving(); + // Set first SR out of the for loop to avoid many if executions + if (regions_.size() > 0 && is_first) outwriter << regions_[0]->IsSurviving(); + + for (MAuint32 i = is_first ?
1 : 0; i < regions_.size(); i++) + outwriter << "," << regions_[i]->IsSurviving(); } diff --git a/tools/SampleAnalyzer/Process/RegionSelection/RegionSelectionManager.h b/tools/SampleAnalyzer/Process/RegionSelection/RegionSelectionManager.h index 2e250199..30955623 100644 --- a/tools/SampleAnalyzer/Process/RegionSelection/RegionSelectionManager.h +++ b/tools/SampleAnalyzer/Process/RegionSelection/RegionSelectionManager.h @@ -475,8 +475,8 @@ class RegionSelectionManager } /// Dumping the content of the counters - void HeadSR(std::ostream &, const std::string&); - void DumpSR(std::ostream &); + void HeadSR(std::ostream &, const std::string&, MAbool&); + void DumpSR(std::ostream &, MAbool&); };
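On the C++ side, HeadSR and DumpSR now emit a plain CSV table: one header line of analysis::region column names (the :: separator is used because dashes often appear in SR names) followed by one 0/1 survival flag per region and per event, with the shared is_first flag ensuring that only the very first column omits the leading comma. A sketch of how such a table could be consumed downstream, assuming a hypothetical file name and pandas:

import pandas as pd

# Hypothetical path: whichever file MadAnalysis writes the TACO table to.
taco = pd.read_csv("taco_output.csv")

print(taco.columns.tolist())   # e.g. ['my_analysis::SR1', 'my_analysis::SR2', ...]
efficiencies = taco.mean()     # per-region fraction of surviving events
overlaps = taco.T.dot(taco)    # counts of events surviving each pair of regions
print(efficiencies)
print(overlaps)

The per-event flags are exactly the input a TACO-style combination needs to estimate how strongly signal regions overlap before choosing which ones to combine.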