-
Notifications
You must be signed in to change notification settings - Fork 26
Correctness - BugFix - Drain P2P messages in PRESUSPEND event instead of PRECHECKPOINT event. #312
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1030,6 +1030,42 @@ mpi_plugin_event_hook(DmtcpEvent_t event, DmtcpEventData_t *data) | |
| // FIXME: See comment at: dmtcpplugin.cpp:'case DMTCP_EVENT_PRESUSPEND' | ||
| drain_mpi_collective(); | ||
| openCkptFileFds(); | ||
|
|
||
| /* P2P messages draining */ | ||
| fprintf(stdout, "\n[Rank-%d] Suspending P2P communication...", | ||
| g_world_rank); | ||
| fflush(stdout); | ||
|
|
||
| // Suspend the global P2P communication | ||
| global_p2p_communication = 0; | ||
| sleep(2); // Wait for the user thread to get stuck in the P2P API | ||
|
|
||
| fprintf(stdout, "\n[Rank-%d] MPI:Register-local-sends-and-receives\n", | ||
tarunsmalviya marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| g_world_rank); | ||
| fflush(stdout); | ||
| dmtcp_global_barrier("MPI:Register-local-sends-and-receives"); | ||
|
|
||
| fprintf(stdout, "\n[Rank-%d] mana_state = CKPT_P2P\n", g_world_rank); | ||
| fflush(stdout); | ||
| mana_state = CKPT_P2P; | ||
|
|
||
| fprintf(stdout, "\n[Rank-%d] registerLocalSendsAndRecvs()\n", | ||
| g_world_rank); | ||
| fflush(stdout); | ||
| registerLocalSendsAndRecvs(); // p2p_drain_send_recv.cpp | ||
|
|
||
| fprintf(stdout, "\n[Rank-%d] MPI:Drain-Send-Recv\n", g_world_rank); | ||
| fflush(stdout); | ||
| dmtcp_global_barrier("MPI:Drain-Send-Recv"); | ||
|
|
||
| fprintf(stdout, "\n[Rank-%d] drainSendRecv()\n", g_world_rank); | ||
| fflush(stdout); | ||
| drainSendRecv(); // p2p_drain_send_recv.cpp | ||
|
|
||
| fprintf(stdout, "\n[Rank-%d] Exiting DMTCP_EVENT_PRESUSPEND\n", | ||
| g_world_rank); | ||
| fflush(stdout); | ||
|
|
||
| break; | ||
| } | ||
|
|
||
|
|
@@ -1042,11 +1078,12 @@ mpi_plugin_event_hook(DmtcpEvent_t event, DmtcpEventData_t *data) | |
| getLocalRankInfo(); // p2p_log_replay.cpp | ||
| dmtcp_global_barrier("MPI:update-ckpt-dir-by-rank"); | ||
| updateCkptDirByRank(); // mpi_plugin.cpp | ||
| dmtcp_global_barrier("MPI:Register-local-sends-and-receives"); | ||
| mana_state = CKPT_P2P; | ||
| registerLocalSendsAndRecvs(); // p2p_drain_send_recv.cpp | ||
| dmtcp_global_barrier("MPI:Drain-Send-Recv"); | ||
| drainSendRecv(); // p2p_drain_send_recv.cpp | ||
| // dmtcp_global_barrier("MPI:Register-local-sends-and-receives"); | ||
| // mana_state = CKPT_P2P; | ||
| // registerLocalSendsAndRecvs(); // p2p_drain_send_recv.cpp | ||
| // dmtcp_global_barrier("MPI:Drain-Send-Recv"); | ||
| // drainSendRecv(); // p2p_drain_send_recv.cpp | ||
| dmtcp_global_barrier("MPI:computeUnionOfCkptImageAddresses"); | ||
| computeUnionOfCkptImageAddresses(); | ||
| dmtcp_global_barrier("MPI:save-mana-header-and-mpi-files"); | ||
| const char *file = get_mana_header_file_name(); | ||
|
|
@@ -1062,6 +1099,9 @@ mpi_plugin_event_hook(DmtcpEvent_t event, DmtcpEventData_t *data) | |
| } | ||
|
|
||
| case DMTCP_EVENT_RESUME: { | ||
| // Resume the global P2P communication | ||
| global_p2p_communication = 1; | ||
|
|
||
| processingOpenCkpFileFds = false; | ||
| dmtcp_local_barrier("MPI:Reset-Drain-Send-Recv-Counters"); | ||
| resetDrainCounters(); // p2p_drain_send_recv.cpp | ||
|
|
@@ -1072,6 +1112,9 @@ mpi_plugin_event_hook(DmtcpEvent_t event, DmtcpEventData_t *data) | |
| } | ||
|
|
||
| case DMTCP_EVENT_RESTART: { | ||
| // Resume the global P2P communication | ||
| global_p2p_communication = 1; | ||
|
|
||
| processingOpenCkpFileFds = false; | ||
| logCkptFileFds(); | ||
|
|
||
|
|
@@ -1123,6 +1166,36 @@ mpi_plugin_event_hook(DmtcpEvent_t event, DmtcpEventData_t *data) | |
| } | ||
| } | ||
|
|
||
| void | ||
| global_p2p_communication_barrier() | ||
| { | ||
| if (global_p2p_communication == 1 || | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Does this have a race condition between the checkpoint thread and the user application threads? Should we use a lock to protect access to global_p2p_communication? |
||
| (global_p2p_communication == 0 && internal_p2p_communication == 1)) | ||
| return; | ||
|
|
||
| time_t my_time = time(NULL); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If I understand this code, a call to MPI_Send will always call this routine, which will always print a message. Computing a time and then doing a print is very expensive compared to MPI_Send. So, we should compute a time (and do some printing) only in the uncommon case, and not in the common case. Do I understand the code correctly? If this is not a problem, then could you add some comments somewhere to make it clear that this happens only in the uncommon case? Maybe my confusion comes from not understanding the variable […]. Finally, should we be printing only in the case that we want to debug ([…])? |
||
| char *time_str = ctime(&my_time); | ||
|
|
||
| my_time = time(NULL); | ||
| time_str = ctime(&my_time); | ||
| time_str[strlen(time_str) - 1] = '\0'; | ||
|
|
||
| fprintf(stdout, "\n%s [Rank-%d] Global P2P communication barrier entered.", | ||
tarunsmalviya marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| time_str, g_world_rank); | ||
| fflush(stdout); | ||
|
|
||
| while (global_p2p_communication == 0) { | ||
| } | ||
|
|
||
| my_time = time(NULL); | ||
| time_str = ctime(&my_time); | ||
| time_str[strlen(time_str) - 1] = '\0'; | ||
|
|
||
| fprintf(stdout, "\n%s [Rank-%d] Global P2P communication barrier exited.\n", | ||
| time_str, g_world_rank); | ||
| fflush(stdout); | ||
| } | ||
|
|
||
| DmtcpPluginDescriptor_t mpi_plugin = { | ||
| DMTCP_PLUGIN_API_VERSION, | ||
| PACKAGE_VERSION, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -67,4 +67,21 @@ enum mana_state_t { | |
|
|
||
| extern mana_state_t mana_state; | ||
|
|
||
| /******************************************************************************/ | ||
| /* | ||
| In applications using MPI, an MPI thread is responsible for managing asynchronous requests in the background. During checkpointing, it is necessary to suspend Collective and P2P communication so that MANA can drain messages from the network and create a checkpoint image. Previously, we had been draining collective messages during the PRESUSPEND event, while all threads were still running. This is because draining collective messages requires all ranks to be in a consistent state, which can only be achieved through trial and error and by allowing the user application to make progress. However, we were draining P2P messages after suspending all threads, including the MPI thread. This was because each rank explicitly asks other ranks for any pending messages floating in the network, and MANA drains them into its internal buffer so that they can be passed to the user application during checkpoint restart. Additionally, we rely on MPI to provide information about pending requests or messages in the network. | ||
tarunsmalviya marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| While this approach seemed reasonable, it did not account for a case where the MPI thread was creating metadata for a P2P request and was suspended in between. In such a scenario, that message would not be visible to other ranks since the MPI APIs used by MANA to gather information about pending requests would report that there are no messages, even though there is a message pending that is not yet visible to all ranks because its metadata has not yet been generated. | ||
tarunsmalviya marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| */ | ||
|
|
||
| /* | ||
| The P2P communication in an application is controlled globally by the <global_p2p_communication> variable. When we say "control", we mean that it suspends P2P communication and prevents the user application from being in the lower half. This allows for P2P message draining and checkpointing during the PRESUSPEND checkpoint event. If the P2P message draining is performed after PRESUSPEND and before PRECHECKPOINT, the variable mentioned above is not needed. | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reformat to 80-character lines, as above.
|
||
| */ | ||
| static int global_p2p_communication = 1; | ||
| static int internal_p2p_communication = 0; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's almost always wrong to allocate storage for a variable within a .h file. We should declare the type in a .h file (e.g., […]). If you don't follow this convention, you can create lots of bugs. Right now, you're saying that if […]. Also, the convention within MANA is to use the prefix […]. Even worse, you seem to use […] in both […]. |
||
|
|
||
| void global_p2p_communication_barrier(); | ||
|
|
||
| /******************************************************************************/ | ||
|
|
||
| #endif // ifndef _MPI_PLUGIN_H | ||
Uh oh!
There was an error while loading. Please reload this page.