diff --git a/src/deepxtrace/diagnose.py b/src/deepxtrace/diagnose.py index a9b8c55..198b319 100644 --- a/src/deepxtrace/diagnose.py +++ b/src/deepxtrace/diagnose.py @@ -109,7 +109,7 @@ class Diagnose: Environment variables: DEEPEP_DIAGNOSE_ENABLE: determine diagnose enable switch from environment variable. Default 1. - DEEPEP_DIAGNOSE_INTERVAL: controls the diagnose cycle period in seconds. Default 10. + DEEPEP_DIAGNOSE_INTERVAL: controls the diagnose cycle period in seconds. Default 20. DEEPEP_DIAGNOSE_WARMING_TIME: controls the diagnose start warming up time. Default same as 'DEEPEP_DIAGNOSE_INTERVAL'. DEEPEP_DIAGNOSE_SYNC_STEP: controls the diagnose step counter. Default: 580. DEEPEP_DIAGNOSE_LOG_PATH: set the output file path for diagnose logs. Default ".". @@ -124,7 +124,7 @@ class Diagnose: def __init__( self, group: dist.ProcessGroup, - interval: int = 10, + interval: int = 20, enable_ll_diagnose: bool = True, enable_normal_diagnose: bool = False, enable_async: bool = False) -> None: @@ -133,7 +133,7 @@ def __init__( Arguments: group: the communication group(i.e., the EP communication group). - interval: diagnose interval. Default 10. + interval: diagnose interval. Default 20. enable_ll_diagnose: enable low latency mode diagnose. Default `True`. enable_normal_diagnose: enable normal mode diagnose. Default `False`. enable_async: enable async diagnose mode. Default `False`. @@ -172,14 +172,14 @@ def __init__( self.enable_ll_diagnose = enable_ll_diagnose and enable_diagnose self.enable_normal_diagnose = enable_normal_diagnose and enable_diagnose self.enable_async = enable_async - # Controls the diagnose cycle period in seconds. Default: 10 + # Controls the diagnose cycle period in seconds. Default: 20 self.interval = int(os.getenv("DEEPEP_DIAGNOSE_INTERVAL", interval)) # Controls the diagnose warming up time. Default: same as interval self.warm_time = int( os.getenv( "DEEPEP_DIAGNOSE_WARMING_TIME", self.interval)) - # Controls the diagnose step counter. Default: 100 + # Controls the diagnose step counter. Default: 580 self.sync_step = np.uint64(os.getenv("DEEPEP_DIAGNOSE_SYNC_STEP", 580)) self.stop_diagnose = threading.Event() @@ -469,7 +469,7 @@ def diagnose_ll_sync(self, diagnose_step: int = 0) -> List[Dict[str, Any]]: Notes: In synchronous (sync) mode, **all ranks in the EP domain must call this - function at the same code location (for example, once every 100 steps)**. + function at the same code location (for example, once every 580 steps)**. Failing to do so can result in deadlocks or hangs due to distributed synchronization. Returns: @@ -504,7 +504,7 @@ def diagnose_normal_sync( Notes: In synchronous (sync) mode, **all ranks in the EP domain must call this - function at the same code location (for example, once every 100 steps)**. + function at the same code location (for example, once every 580 steps)**. Failing to do so can result in deadlocks or hangs due to distributed synchronization. Returns: