From afbbd1c01527acf42e702a026f36eaf3f06f988d Mon Sep 17 00:00:00 2001 From: Nathan Petersen Date: Tue, 13 Feb 2024 20:06:33 -0600 Subject: [PATCH 1/3] Run design evaluation in its own process to protect against crashes --- mach_opt/mach_opt.py | 48 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/mach_opt/mach_opt.py b/mach_opt/mach_opt.py index 447ed338..c9791d90 100644 --- a/mach_opt/mach_opt.py +++ b/mach_opt/mach_opt.py @@ -9,6 +9,7 @@ from abc import abstractmethod, ABC import numpy as np import pickle +import multiprocessing as mp __all__ = [ "DesignOptimizationMOEAD", @@ -32,13 +33,13 @@ def initial_pop(self, pop_size): pop = pg.population(self.prob, size=pop_size) return pop - def run_optimization(self, pop, gen_size, filepath=None): + def run_optimization(self, pop, gen_size, filepath=None, pg_neighbors=20): algo = pg.algorithm( pg.moead( gen=1, weight_generation="grid", decomposition="tchebycheff", - neighbours=20, + neighbours=pg_neighbors, CR=1, F=0.5, eta_m=20, @@ -108,6 +109,16 @@ def __init__( dh.save_designer(designer) + @staticmethod + def evaluate_design_func(evaluator, design, queue: mp.Queue): + # Run the evaluator (this is slow and might crash!) + full_results = evaluator.evaluate(design) + + # Give the result to the caller process + ret = queue.get() + ret["full_results"] = full_results + queue.put(ret) + def fitness(self, x: "tuple") -> "tuple": """Calculates the fitness or objectives of each design based on evaluation results. 
@@ -125,7 +136,38 @@ def fitness(self, x: "tuple") -> "tuple": """ try: design = self.__designer.create_design(x) - full_results = self.__evaluator.evaluate(design) + + ############################################### + # Evaluate the design + ############################################### + + USE_CRASH_SAFE_EVAL_METHOD = True + if not USE_CRASH_SAFE_EVAL_METHOD: + full_results = self.__evaluator.evaluate(design) + else: + # Make a new process to evaluate the design + queue = mp.Queue() + ret = {} + queue.put(ret) + p = mp.Process( + target=self.evaluate_design_func, + args=( + self.__evaluator, + design, + queue, + ), + ) + p.start() + p.join() + + if p.exitcode != 0: + raise InvalidDesign("Evaluation crashed") + + ret = queue.get() + full_results = ret["full_results"] + + ############################################### + objs = self.__design_space.get_objectives(full_results) self.__dh.save_to_archive(x, design, full_results, objs) # print('The fitness values are', objs) From be9911d19bb663b3b4742c92e656723dbcfb5c7b Mon Sep 17 00:00:00 2001 From: Nathan Petersen Date: Tue, 13 Feb 2024 22:07:40 -0600 Subject: [PATCH 2/3] Working crash-safe code to run design eval --- mach_opt/mach_opt.py | 62 +++++++++++++++++++++++++++++++++----------- 1 file changed, 47 insertions(+), 15 deletions(-) diff --git a/mach_opt/mach_opt.py b/mach_opt/mach_opt.py index c9791d90..d67c6ff6 100644 --- a/mach_opt/mach_opt.py +++ b/mach_opt/mach_opt.py @@ -9,6 +9,8 @@ from abc import abstractmethod, ABC import numpy as np import pickle +import time +import sys import multiprocessing as mp __all__ = [ @@ -111,13 +113,24 @@ def __init__( @staticmethod def evaluate_design_func(evaluator, design, queue: mp.Queue): - # Run the evaluator (this is slow and might crash!) - full_results = evaluator.evaluate(design) + try: + # Run the evaluator (this is slow and might crash!) 
+ full_results = evaluator.evaluate(design) + except InvalidDesign: + # Tell caller this design is invalid with code 1 + sys.exit(1) + except Exception: + # Some other failure... tell caller with code 2 + sys.exit(2) + + # Tell parent we are done + queue.put(True) # Give the result to the caller process - ret = queue.get() - ret["full_results"] = full_results - queue.put(ret) + queue.put(full_results) + + # Code of 0 means this eval was a success + sys.exit(0) def fitness(self, x: "tuple") -> "tuple": """Calculates the fitness or objectives of each design based on evaluation results. @@ -147,8 +160,6 @@ def fitness(self, x: "tuple") -> "tuple": else: # Make a new process to evaluate the design queue = mp.Queue() - ret = {} - queue.put(ret) p = mp.Process( target=self.evaluate_design_func, args=( @@ -158,15 +169,36 @@ def fitness(self, x: "tuple") -> "tuple": ), ) p.start() - p.join() - - if p.exitcode != 0: - raise InvalidDesign("Evaluation crashed") - ret = queue.get() - full_results = ret["full_results"] - - ############################################### + # Wait for evaluation to complete, or it to crash + is_done = False + while not is_done: + if queue.empty(): + time.sleep(0.1) + + if p.exitcode is not None: + # Child process (evaluation) is done + if p.exitcode != 0: + if p.exitcode == 2: + # Unknown error during design evaluation + # (NOT InvalidDesign) + # Breakpoint here catches JMAG crash + pass + + # It was not successful + raise InvalidDesign("Bad design (code %d)" % p.exitcode) + else: + is_done = queue.get() + + # We know the child process will put the results + # into the queue right NOW, so pull them out to + # trigger the queue's buffer to flush......see: + # https://stackoverflow.com/questions/26025486/#comment40796894_26041762 + full_results = queue.get() + + # The process should be done by now, + # but make sure by joining it here + p.join() objs = self.__design_space.get_objectives(full_results) self.__dh.save_to_archive(x, design, 
full_results, objs) From 7a0b96a3cbdbe70c366951a376f0790d6ef7e71f Mon Sep 17 00:00:00 2001 From: Nathan Petersen Date: Wed, 14 Feb 2024 13:02:10 -0600 Subject: [PATCH 3/3] Allow user to configure crash safe eval mode --- mach_opt/mach_opt.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mach_opt/mach_opt.py b/mach_opt/mach_opt.py index d67c6ff6..9ba5f6a9 100644 --- a/mach_opt/mach_opt.py +++ b/mach_opt/mach_opt.py @@ -96,11 +96,13 @@ def __init__( design_space: "DesignSpace", dh: "DataHandler", invalid_design_objs=None, + crash_safe_evaluation=False, ): self.__designer = designer self.__evaluator = evaluator self.__design_space = design_space self.__dh = dh + self.__crash_safe_evaluation = crash_safe_evaluation if invalid_design_objs is None: self.__invalid_design_objs = 1e4 * np.ones([1, self.get_nobj()]) @@ -154,8 +156,7 @@ def fitness(self, x: "tuple") -> "tuple": # Evaluate the design ############################################### - USE_CRASH_SAFE_EVAL_METHOD = True - if not USE_CRASH_SAFE_EVAL_METHOD: + if not self.__crash_safe_evaluation: full_results = self.__evaluator.evaluate(design) else: # Make a new process to evaluate the design @@ -179,10 +180,10 @@ def fitness(self, x: "tuple") -> "tuple": if p.exitcode is not None: # Child process (evaluation) is done if p.exitcode != 0: - if p.exitcode == 2: + if p.exitcode not in [1, 2]: # Unknown error during design evaluation - # (NOT InvalidDesign) - # Breakpoint here catches JMAG crash + # (NOT InvalidDesign or Exception) + # Breakpoint here can catch JMAG crash pass # It was not successful