Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions src/deepxtrace/diagnose.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ class Diagnose:

Environment variables:
DEEPEP_DIAGNOSE_ENABLE: global switch that turns diagnose on or off. Default 1.
DEEPEP_DIAGNOSE_INTERVAL: controls the diagnose cycle period in seconds. Default 10.
DEEPEP_DIAGNOSE_INTERVAL: controls the diagnose cycle period in seconds. Default 20.
DEEPEP_DIAGNOSE_WARMING_TIME: controls the warm-up time before diagnose starts. Default: same as 'DEEPEP_DIAGNOSE_INTERVAL'.
DEEPEP_DIAGNOSE_SYNC_STEP: controls the diagnose step counter. Default: 580.
DEEPEP_DIAGNOSE_LOG_PATH: set the output file path for diagnose logs. Default ".".
Expand All @@ -124,7 +124,7 @@ class Diagnose:
def __init__(
self,
group: dist.ProcessGroup,
interval: int = 10,
interval: int = 20,
enable_ll_diagnose: bool = True,
enable_normal_diagnose: bool = False,
enable_async: bool = False) -> None:
Expand All @@ -133,7 +133,7 @@ def __init__(

Arguments:
group: the communication group (i.e., the EP communication group).
interval: diagnose interval. Default 10.
interval: diagnose interval. Default 20.
enable_ll_diagnose: enable low latency mode diagnose. Default `True`.
enable_normal_diagnose: enable normal mode diagnose. Default `False`.
enable_async: enable async diagnose mode. Default `False`.
Expand Down Expand Up @@ -172,14 +172,14 @@ def __init__(
self.enable_ll_diagnose = enable_ll_diagnose and enable_diagnose
self.enable_normal_diagnose = enable_normal_diagnose and enable_diagnose
self.enable_async = enable_async
# Controls the diagnose cycle period in seconds. Default: 10
# Controls the diagnose cycle period in seconds. Default: 20
self.interval = int(os.getenv("DEEPEP_DIAGNOSE_INTERVAL", interval))
# Controls the diagnose warming up time. Default: same as interval
self.warm_time = int(
os.getenv(
"DEEPEP_DIAGNOSE_WARMING_TIME",
self.interval))
# Controls the diagnose step counter. Default: 100
# Controls the diagnose step counter. Default: 580
self.sync_step = np.uint64(os.getenv("DEEPEP_DIAGNOSE_SYNC_STEP", 580))
self.stop_diagnose = threading.Event()

Expand Down Expand Up @@ -469,7 +469,7 @@ def diagnose_ll_sync(self, diagnose_step: int = 0) -> List[Dict[str, Any]]:

Notes:
In synchronous (sync) mode, **all ranks in the EP domain must call this
function at the same code location (for example, once every 100 steps)**.
function at the same code location (for example, once every 580 steps)**.
Failing to do so can result in deadlocks or hangs due to distributed synchronization.

Returns:
Expand Down Expand Up @@ -504,7 +504,7 @@ def diagnose_normal_sync(

Notes:
In synchronous (sync) mode, **all ranks in the EP domain must call this
function at the same code location (for example, once every 100 steps)**.
function at the same code location (for example, once every 580 steps)**.
Failing to do so can result in deadlocks or hangs due to distributed synchronization.

Returns:
Expand Down