diff --git a/LanguageModeling/bert_oneflow/README.md b/LanguageModeling/bert-oneflow/README.md similarity index 100% rename from LanguageModeling/bert_oneflow/README.md rename to LanguageModeling/bert-oneflow/README.md diff --git a/LanguageModeling/bert_oneflow/dataset/dataset.py b/LanguageModeling/bert-oneflow/dataset/dataset.py similarity index 100% rename from LanguageModeling/bert_oneflow/dataset/dataset.py rename to LanguageModeling/bert-oneflow/dataset/dataset.py diff --git a/LanguageModeling/bert_oneflow/dataset/vocab.py b/LanguageModeling/bert-oneflow/dataset/vocab.py similarity index 100% rename from LanguageModeling/bert_oneflow/dataset/vocab.py rename to LanguageModeling/bert-oneflow/dataset/vocab.py diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.INFO b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.INFO new file mode 120000 index 0000000..06d2785 --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.INFO @@ -0,0 +1 @@ +oneflow.VS002.lichunyou.log.INFO.20210621-141811.2328 \ No newline at end of file diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210615-153653.216106 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210615-153653.216106 new file mode 100644 index 0000000..678624d --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210615-153653.216106 @@ -0,0 +1,40 @@ +Log file created at: 2021/06/15 15:36:53 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0615 15:36:53.881130 216106 global.h:36] NewGlobal 14cudaDeviceProp +I0615 15:36:53.946734 216106 global.h:36] NewGlobal N7oneflow7EnvDescE +I0615 15:36:53.946768 216106 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0615 15:36:53.946776 216106 env_global_objects_scope.cpp:112] using rpc backend: local +I0615 15:36:53.946787 216106 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0615 15:36:53.946794 216106 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0615 15:36:53.953372 216106 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0615 15:36:53.954262 216106 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0615 15:36:53.954270 216106 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0615 15:36:53.954275 216106 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0615 15:36:53.954282 216106 global.h:36] NewGlobal N7oneflow9OneflowVME +I0615 15:36:53.962713 216106 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0615 15:36:53.962735 216106 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0615 15:36:53.962810 216106 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:42334 +I0615 15:36:53.962831 216106 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0615 15:36:53.962906 216106 global.h:36] NewGlobal N7oneflow9TransportE +I0615 15:36:54.047060 216106 global.h:43] DeleteGlobal N7oneflow21EnvGlobalObjectsScopeE +I0615 15:36:54.047080 216106 global.h:43] DeleteGlobal N7oneflow9TransportE +I0615 15:36:54.047129 216106 global.h:43] DeleteGlobal N7oneflow12EpollCommNetE +I0615 15:36:54.047137 216106 epoll_comm_network.cpp:87] CommNet Thread 0 finish +I0615 15:36:54.047158 216106 epoll_comm_network.cpp:87] CommNet Thread 1 finish +I0615 15:36:54.047178 216106 epoll_comm_network.cpp:87] CommNet Thread 2 finish +I0615 15:36:54.047197 216106 epoll_comm_network.cpp:87] CommNet Thread 3 finish +I0615 15:36:54.047350 216106 global.h:43] DeleteGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0615 15:36:54.047358 216106 global.h:43] DeleteGlobal N7oneflow2vm19VirtualMachineScopeE +I0615 15:36:54.047363 216106 global.h:43] DeleteGlobal N7oneflow9OneflowVME +I0615 15:36:54.052028 216106 global.h:43] DeleteGlobal N7oneflow18CudnnConvAlgoCacheE +I0615 15:36:54.052049 216106 global.h:43] DeleteGlobal N7oneflow16EagerNcclCommMgrE +I0615 15:36:54.052057 216106 global.h:43] DeleteGlobal N7oneflow10ThreadPoolE +I0615 15:36:54.053153 216106 global.h:43] DeleteGlobal N7oneflow12ResourceDescE +I0615 15:36:54.053164 216106 global.h:43] DeleteGlobal N7oneflow12ResourceDescE +I0615 15:36:54.053169 216106 global.h:43] DeleteGlobal N7oneflow6device27NodeDeviceDescriptorManagerE +I0615 15:36:54.053181 216106 global.h:43] DeleteGlobal N7oneflow10RpcManagerE +I0615 15:36:54.053187 216106 global.h:43] DeleteGlobal N7oneflow10CtrlClientE +I0615 15:36:54.053194 216106 global.h:43] DeleteGlobal N7oneflow10ProcessCtxE +I0615 15:36:54.053201 216106 global.h:43] DeleteGlobal N7oneflow7EnvDescE +I0615 15:36:54.053208 216106 global.h:43] DeleteGlobal 14cudaDeviceProp diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210615-153942.217036 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210615-153942.217036 new file mode 100644 index 0000000..d700f4b --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210615-153942.217036 @@ -0,0 +1,19 @@ +Log file created at: 2021/06/15 15:39:42 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0615 15:39:42.711254 217036 global.h:36] NewGlobal 14cudaDeviceProp +I0615 15:39:42.772075 217036 global.h:36] NewGlobal N7oneflow7EnvDescE +I0615 15:39:42.772105 217036 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0615 15:39:42.772114 217036 env_global_objects_scope.cpp:112] using rpc backend: local +I0615 15:39:42.772123 217036 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0615 15:39:42.772130 217036 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0615 15:39:42.778702 217036 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0615 15:39:42.779572 217036 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0615 15:39:42.779580 217036 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0615 15:39:42.779584 217036 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0615 15:39:42.779589 217036 global.h:36] NewGlobal N7oneflow9OneflowVME +I0615 15:39:42.787849 217036 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0615 15:39:42.787869 217036 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0615 15:39:42.787940 217036 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:40496 +I0615 15:39:42.787958 217036 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0615 15:39:42.788033 217036 global.h:36] NewGlobal N7oneflow9TransportE diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210615-155552.219680 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210615-155552.219680 new file mode 100644 index 0000000..81cb4aa --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210615-155552.219680 @@ -0,0 +1,40 @@ +Log file created at: 2021/06/15 15:55:52 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0615 15:55:52.643743 219680 global.h:36] NewGlobal 14cudaDeviceProp +I0615 15:55:52.703361 219680 global.h:36] NewGlobal N7oneflow7EnvDescE +I0615 15:55:52.703390 219680 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0615 15:55:52.703397 219680 env_global_objects_scope.cpp:112] using rpc backend: local +I0615 15:55:52.703408 219680 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0615 15:55:52.703413 219680 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0615 15:55:52.709794 219680 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0615 15:55:52.710682 219680 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0615 15:55:52.710691 219680 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0615 15:55:52.710695 219680 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0615 15:55:52.710700 219680 global.h:36] NewGlobal N7oneflow9OneflowVME +I0615 15:55:52.718988 219680 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0615 15:55:52.719008 219680 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0615 15:55:52.719079 219680 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:35038 +I0615 15:55:52.719099 219680 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0615 15:55:52.719175 219680 global.h:36] NewGlobal N7oneflow9TransportE +I0615 15:58:40.093083 219680 global.h:43] DeleteGlobal N7oneflow21EnvGlobalObjectsScopeE +I0615 15:58:40.093164 219680 global.h:43] DeleteGlobal N7oneflow9TransportE +I0615 15:58:40.093287 219680 global.h:43] DeleteGlobal N7oneflow12EpollCommNetE +I0615 15:58:40.093299 219680 epoll_comm_network.cpp:87] CommNet Thread 0 finish +I0615 15:58:40.093397 219680 epoll_comm_network.cpp:87] CommNet Thread 1 finish +I0615 15:58:40.093466 219680 epoll_comm_network.cpp:87] CommNet Thread 2 finish +I0615 15:58:40.093623 219680 epoll_comm_network.cpp:87] CommNet Thread 3 finish +I0615 15:58:40.093811 219680 global.h:43] DeleteGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0615 15:58:40.093822 219680 global.h:43] DeleteGlobal N7oneflow2vm19VirtualMachineScopeE +I0615 15:58:40.093827 219680 global.h:43] DeleteGlobal N7oneflow9OneflowVME +I0615 15:58:40.123791 219680 global.h:43] DeleteGlobal N7oneflow18CudnnConvAlgoCacheE +I0615 15:58:40.123827 219680 global.h:43] DeleteGlobal N7oneflow16EagerNcclCommMgrE +I0615 15:58:40.123838 219680 global.h:43] DeleteGlobal N7oneflow10ThreadPoolE +I0615 15:58:40.126832 219680 global.h:43] DeleteGlobal N7oneflow12ResourceDescE +I0615 15:58:40.126843 219680 global.h:43] DeleteGlobal N7oneflow12ResourceDescE +I0615 15:58:40.126853 219680 global.h:43] DeleteGlobal N7oneflow6device27NodeDeviceDescriptorManagerE +I0615 15:58:40.126874 219680 global.h:43] DeleteGlobal N7oneflow10RpcManagerE +I0615 15:58:40.126884 219680 global.h:43] DeleteGlobal N7oneflow10CtrlClientE +I0615 15:58:40.126891 219680 global.h:43] DeleteGlobal N7oneflow10ProcessCtxE +I0615 15:58:40.126904 219680 global.h:43] DeleteGlobal N7oneflow7EnvDescE +I0615 15:58:40.126916 219680 global.h:43] DeleteGlobal 14cudaDeviceProp diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210615-160705.221626 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210615-160705.221626 new file mode 100644 index 0000000..525ca3c --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210615-160705.221626 @@ -0,0 +1,19 @@ +Log file created at: 2021/06/15 16:07:05 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0615 16:07:05.230154 221626 global.h:36] NewGlobal 14cudaDeviceProp +I0615 16:07:05.301681 221626 global.h:36] NewGlobal N7oneflow7EnvDescE +I0615 16:07:05.301717 221626 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0615 16:07:05.301725 221626 env_global_objects_scope.cpp:112] using rpc backend: local +I0615 16:07:05.301738 221626 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0615 16:07:05.301744 221626 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0615 16:07:05.309193 221626 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0615 16:07:05.310214 221626 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0615 16:07:05.310225 221626 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0615 16:07:05.310235 221626 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0615 16:07:05.310242 221626 global.h:36] NewGlobal N7oneflow9OneflowVME +I0615 16:07:05.319964 221626 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0615 16:07:05.319988 221626 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0615 16:07:05.320072 221626 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:33728 +I0615 16:07:05.320096 221626 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0615 16:07:05.320184 221626 global.h:36] NewGlobal N7oneflow9TransportE diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210615-160849.222566 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210615-160849.222566 new file mode 100644 index 0000000..a5d9977 --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210615-160849.222566 @@ -0,0 +1,19 @@ +Log file created at: 2021/06/15 16:08:49 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0615 16:08:49.703088 222566 global.h:36] NewGlobal 14cudaDeviceProp +I0615 16:08:49.766950 222566 global.h:36] NewGlobal N7oneflow7EnvDescE +I0615 16:08:49.766980 222566 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0615 16:08:49.766988 222566 env_global_objects_scope.cpp:112] using rpc backend: local +I0615 16:08:49.766997 222566 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0615 16:08:49.767004 222566 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0615 16:08:49.773383 222566 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0615 16:08:49.774260 222566 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0615 16:08:49.774269 222566 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0615 16:08:49.774273 222566 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0615 16:08:49.774279 222566 global.h:36] NewGlobal N7oneflow9OneflowVME +I0615 16:08:49.782550 222566 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0615 16:08:49.782570 222566 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0615 16:08:49.782644 222566 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:44700 +I0615 16:08:49.782673 222566 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0615 16:08:49.782743 222566 global.h:36] NewGlobal N7oneflow9TransportE diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210615-161144.223722 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210615-161144.223722 new file mode 100644 index 0000000..a27e077 --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210615-161144.223722 @@ -0,0 +1,40 @@ +Log file created at: 2021/06/15 16:11:44 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0615 16:11:44.690667 223722 global.h:36] NewGlobal 14cudaDeviceProp +I0615 16:11:44.762032 223722 global.h:36] NewGlobal N7oneflow7EnvDescE +I0615 16:11:44.762066 223722 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0615 16:11:44.762075 223722 env_global_objects_scope.cpp:112] using rpc backend: local +I0615 16:11:44.762087 223722 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0615 16:11:44.762095 223722 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0615 16:11:44.769475 223722 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0615 16:11:44.770489 223722 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0615 16:11:44.770499 223722 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0615 16:11:44.770505 223722 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0615 16:11:44.770512 223722 global.h:36] NewGlobal N7oneflow9OneflowVME +I0615 16:11:44.780143 223722 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0615 16:11:44.780164 223722 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0615 16:11:44.780247 223722 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:33194 +I0615 16:11:44.780270 223722 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0615 16:11:44.780357 223722 global.h:36] NewGlobal N7oneflow9TransportE +I0615 16:12:06.573643 223722 global.h:43] DeleteGlobal N7oneflow21EnvGlobalObjectsScopeE +I0615 16:12:06.573709 223722 global.h:43] DeleteGlobal N7oneflow9TransportE +I0615 16:12:06.573839 223722 global.h:43] DeleteGlobal N7oneflow12EpollCommNetE +I0615 16:12:06.573853 223722 epoll_comm_network.cpp:87] CommNet Thread 0 finish +I0615 16:12:06.573976 223722 epoll_comm_network.cpp:87] CommNet Thread 1 finish +I0615 16:12:06.574168 223722 epoll_comm_network.cpp:87] CommNet Thread 2 finish +I0615 16:12:06.574260 223722 epoll_comm_network.cpp:87] CommNet Thread 3 finish +I0615 16:12:06.574414 223722 global.h:43] DeleteGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0615 16:12:06.574424 223722 global.h:43] DeleteGlobal N7oneflow2vm19VirtualMachineScopeE +I0615 16:12:06.574431 223722 global.h:43] DeleteGlobal N7oneflow9OneflowVME +I0615 16:12:06.606727 223722 global.h:43] DeleteGlobal N7oneflow18CudnnConvAlgoCacheE +I0615 16:12:06.606770 223722 global.h:43] DeleteGlobal N7oneflow16EagerNcclCommMgrE +I0615 16:12:06.606781 223722 global.h:43] DeleteGlobal N7oneflow10ThreadPoolE +I0615 16:12:06.609259 223722 global.h:43] DeleteGlobal N7oneflow12ResourceDescE +I0615 16:12:06.609272 223722 global.h:43] DeleteGlobal N7oneflow12ResourceDescE +I0615 16:12:06.609282 223722 global.h:43] DeleteGlobal N7oneflow6device27NodeDeviceDescriptorManagerE +I0615 16:12:06.609308 223722 global.h:43] DeleteGlobal N7oneflow10RpcManagerE +I0615 16:12:06.609314 223722 global.h:43] DeleteGlobal N7oneflow10CtrlClientE +I0615 16:12:06.609323 223722 global.h:43] DeleteGlobal N7oneflow10ProcessCtxE +I0615 16:12:06.609333 223722 global.h:43] DeleteGlobal N7oneflow7EnvDescE +I0615 16:12:06.609344 223722 global.h:43] DeleteGlobal 14cudaDeviceProp diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210615-161246.224493 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210615-161246.224493 new file mode 100644 index 0000000..846a4d8 --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210615-161246.224493 @@ -0,0 +1,19 @@ +Log file created at: 2021/06/15 16:12:46 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0615 16:12:46.772022 224493 global.h:36] NewGlobal 14cudaDeviceProp +I0615 16:12:46.844890 224493 global.h:36] NewGlobal N7oneflow7EnvDescE +I0615 16:12:46.844928 224493 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0615 16:12:46.844936 224493 env_global_objects_scope.cpp:112] using rpc backend: local +I0615 16:12:46.844949 224493 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0615 16:12:46.844956 224493 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0615 16:12:46.852439 224493 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0615 16:12:46.853443 224493 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0615 16:12:46.853453 224493 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0615 16:12:46.853459 224493 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0615 16:12:46.853466 224493 global.h:36] NewGlobal N7oneflow9OneflowVME +I0615 16:12:46.863137 224493 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0615 16:12:46.863163 224493 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0615 16:12:46.863245 224493 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:46335 +I0615 16:12:46.863268 224493 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0615 16:12:46.863353 224493 global.h:36] NewGlobal N7oneflow9TransportE diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210615-162501.226843 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210615-162501.226843 new file mode 100644 index 0000000..6df7d63 --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210615-162501.226843 @@ -0,0 +1,19 @@ +Log file created at: 2021/06/15 16:25:01 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0615 16:25:01.193742 226843 global.h:36] NewGlobal 14cudaDeviceProp +I0615 16:25:01.254277 226843 global.h:36] NewGlobal N7oneflow7EnvDescE +I0615 16:25:01.254308 226843 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0615 16:25:01.254315 226843 env_global_objects_scope.cpp:112] using rpc backend: local +I0615 16:25:01.254325 226843 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0615 16:25:01.254331 226843 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0615 16:25:01.260859 226843 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0615 16:25:01.261742 226843 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0615 16:25:01.261749 226843 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0615 16:25:01.261754 226843 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0615 16:25:01.261759 226843 global.h:36] NewGlobal N7oneflow9OneflowVME +I0615 16:25:01.270251 226843 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0615 16:25:01.270270 226843 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0615 16:25:01.270340 226843 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:44430 +I0615 16:25:01.270360 226843 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0615 16:25:01.270432 226843 global.h:36] NewGlobal N7oneflow9TransportE diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-121348.68726 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-121348.68726 new file mode 100644 index 0000000..62369fc --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-121348.68726 @@ -0,0 +1,40 @@ +Log file created at: 2021/06/16 12:13:48 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0616 12:13:48.738842 68726 global.h:36] NewGlobal 14cudaDeviceProp +I0616 12:13:49.115579 68726 global.h:36] NewGlobal N7oneflow7EnvDescE +I0616 12:13:49.115646 68726 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0616 12:13:49.115658 68726 env_global_objects_scope.cpp:112] using rpc backend: local +I0616 12:13:49.115706 68726 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0616 12:13:49.115717 68726 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0616 12:13:49.124030 68726 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0616 12:13:49.125347 68726 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0616 12:13:49.125360 68726 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0616 12:13:49.125367 68726 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0616 12:13:49.125375 68726 global.h:36] NewGlobal N7oneflow9OneflowVME +I0616 12:13:49.134821 68726 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0616 12:13:49.134842 68726 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0616 12:13:49.134929 68726 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:36657 +I0616 12:13:49.134959 68726 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0616 12:13:49.135046 68726 global.h:36] NewGlobal N7oneflow9TransportE +I0616 12:13:49.205109 68726 global.h:43] DeleteGlobal N7oneflow21EnvGlobalObjectsScopeE +I0616 12:13:49.205139 68726 global.h:43] DeleteGlobal N7oneflow9TransportE +I0616 12:13:49.205246 68726 global.h:43] DeleteGlobal N7oneflow12EpollCommNetE +I0616 12:13:49.205256 68726 epoll_comm_network.cpp:87] CommNet Thread 0 finish +I0616 12:13:49.205293 68726 epoll_comm_network.cpp:87] CommNet Thread 1 finish +I0616 12:13:49.205324 68726 epoll_comm_network.cpp:87] CommNet Thread 2 finish +I0616 12:13:49.205354 68726 epoll_comm_network.cpp:87] CommNet Thread 3 finish +I0616 12:13:49.205478 68726 global.h:43] DeleteGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0616 12:13:49.205489 68726 global.h:43] DeleteGlobal N7oneflow2vm19VirtualMachineScopeE +I0616 12:13:49.205497 68726 global.h:43] DeleteGlobal N7oneflow9OneflowVME +I0616 12:13:49.212520 68726 global.h:43] DeleteGlobal N7oneflow18CudnnConvAlgoCacheE +I0616 12:13:49.212568 68726 global.h:43] DeleteGlobal N7oneflow16EagerNcclCommMgrE +I0616 12:13:49.212575 68726 global.h:43] DeleteGlobal N7oneflow10ThreadPoolE +I0616 12:13:49.214356 68726 global.h:43] DeleteGlobal N7oneflow12ResourceDescE +I0616 12:13:49.214373 68726 global.h:43] DeleteGlobal N7oneflow12ResourceDescE +I0616 12:13:49.214380 68726 global.h:43] DeleteGlobal N7oneflow6device27NodeDeviceDescriptorManagerE +I0616 12:13:49.214404 68726 global.h:43] DeleteGlobal N7oneflow10RpcManagerE +I0616 12:13:49.214411 68726 global.h:43] DeleteGlobal N7oneflow10CtrlClientE +I0616 12:13:49.214426 68726 global.h:43] DeleteGlobal N7oneflow10ProcessCtxE +I0616 12:13:49.214434 68726 global.h:43] DeleteGlobal N7oneflow7EnvDescE +I0616 12:13:49.214442 68726 global.h:43] DeleteGlobal 14cudaDeviceProp diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-121504.69423 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-121504.69423 new file mode 100644 index 0000000..4d4b251 --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-121504.69423 @@ -0,0 +1,40 @@ +Log file created at: 2021/06/16 12:15:04 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0616 12:15:04.468799 69423 global.h:36] NewGlobal 14cudaDeviceProp +I0616 12:15:04.864488 69423 global.h:36] NewGlobal N7oneflow7EnvDescE +I0616 12:15:04.864612 69423 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0616 12:15:04.864631 69423 env_global_objects_scope.cpp:112] using rpc backend: local +I0616 12:15:04.864665 69423 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0616 12:15:04.864681 69423 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0616 12:15:04.874059 69423 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0616 12:15:04.875102 69423 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0616 12:15:04.875111 69423 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0616 12:15:04.875115 69423 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0616 12:15:04.875121 69423 global.h:36] NewGlobal N7oneflow9OneflowVME +I0616 12:15:04.884797 69423 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0616 12:15:04.884819 69423 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0616 12:15:04.884923 69423 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:45076 +I0616 12:15:04.884955 69423 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0616 12:15:04.885033 69423 global.h:36] NewGlobal N7oneflow9TransportE +I0616 12:15:04.990201 69423 global.h:43] DeleteGlobal N7oneflow21EnvGlobalObjectsScopeE +I0616 12:15:04.990232 69423 global.h:43] DeleteGlobal N7oneflow9TransportE +I0616 12:15:04.990332 69423 global.h:43] DeleteGlobal N7oneflow12EpollCommNetE +I0616 12:15:04.990340 69423 epoll_comm_network.cpp:87] CommNet Thread 0 finish +I0616 12:15:04.990376 69423 epoll_comm_network.cpp:87] CommNet Thread 1 finish +I0616 12:15:04.990406 69423 epoll_comm_network.cpp:87] CommNet Thread 2 finish +I0616 12:15:04.990435 69423 epoll_comm_network.cpp:87] CommNet Thread 3 finish +I0616 12:15:04.990547 69423 global.h:43] DeleteGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0616 12:15:04.990556 69423 global.h:43] DeleteGlobal N7oneflow2vm19VirtualMachineScopeE +I0616 12:15:04.990561 69423 global.h:43] DeleteGlobal N7oneflow9OneflowVME +I0616 12:15:04.995379 69423 global.h:43] DeleteGlobal N7oneflow18CudnnConvAlgoCacheE +I0616 12:15:04.995400 69423 global.h:43] DeleteGlobal N7oneflow16EagerNcclCommMgrE +I0616 12:15:04.995407 69423 global.h:43] DeleteGlobal N7oneflow10ThreadPoolE +I0616 12:15:04.996918 69423 global.h:43] DeleteGlobal N7oneflow12ResourceDescE +I0616 12:15:04.996929 69423 global.h:43] DeleteGlobal N7oneflow12ResourceDescE +I0616 12:15:04.996937 69423 global.h:43] DeleteGlobal N7oneflow6device27NodeDeviceDescriptorManagerE +I0616 12:15:04.996951 69423 global.h:43] DeleteGlobal N7oneflow10RpcManagerE +I0616 12:15:04.996958 69423 global.h:43] DeleteGlobal N7oneflow10CtrlClientE +I0616 12:15:04.996963 69423 global.h:43] DeleteGlobal N7oneflow10ProcessCtxE +I0616 12:15:04.996971 69423 global.h:43] DeleteGlobal N7oneflow7EnvDescE +I0616 12:15:04.996978 69423 global.h:43] DeleteGlobal 14cudaDeviceProp diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-121548.70085 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-121548.70085 new file mode 100644 index 0000000..14a37e4 --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-121548.70085 @@ -0,0 +1,40 @@ +Log file created at: 2021/06/16 12:15:48 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0616 12:15:48.501327 70085 global.h:36] NewGlobal 14cudaDeviceProp +I0616 12:15:48.884912 70085 global.h:36] NewGlobal N7oneflow7EnvDescE +I0616 12:15:48.885016 70085 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0616 12:15:48.885042 70085 env_global_objects_scope.cpp:112] using rpc backend: local +I0616 12:15:48.885076 70085 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0616 12:15:48.885089 70085 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0616 12:15:48.897119 70085 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0616 12:15:48.898788 70085 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0616 12:15:48.898846 70085 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0616 12:15:48.898854 70085 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0616 12:15:48.898861 70085 global.h:36] NewGlobal N7oneflow9OneflowVME +I0616 12:15:48.909502 70085 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0616 12:15:48.909530 70085 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0616 12:15:48.909626 70085 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:46610 +I0616 12:15:48.909657 70085 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0616 12:15:48.909744 70085 global.h:36] NewGlobal N7oneflow9TransportE +I0616 12:15:49.243652 70085 global.h:43] DeleteGlobal N7oneflow21EnvGlobalObjectsScopeE +I0616 12:15:49.243695 70085 global.h:43] DeleteGlobal N7oneflow9TransportE +I0616 12:15:49.243834 70085 global.h:43] DeleteGlobal N7oneflow12EpollCommNetE +I0616 12:15:49.243842 70085 epoll_comm_network.cpp:87] CommNet Thread 0 finish +I0616 12:15:49.243896 70085 epoll_comm_network.cpp:87] CommNet Thread 1 finish +I0616 12:15:49.243925 70085 epoll_comm_network.cpp:87] CommNet Thread 2 finish +I0616 12:15:49.243984 70085 epoll_comm_network.cpp:87] CommNet Thread 3 finish +I0616 12:15:49.244272 70085 global.h:43] DeleteGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0616 12:15:49.244333 70085 global.h:43] DeleteGlobal N7oneflow2vm19VirtualMachineScopeE +I0616 12:15:49.244349 70085 global.h:43] DeleteGlobal N7oneflow9OneflowVME +I0616 12:15:49.252218 70085 global.h:43] DeleteGlobal N7oneflow18CudnnConvAlgoCacheE +I0616 12:15:49.252254 70085 global.h:43] DeleteGlobal N7oneflow16EagerNcclCommMgrE +I0616 12:15:49.252269 70085 global.h:43] DeleteGlobal N7oneflow10ThreadPoolE +I0616 12:15:49.255165 70085 global.h:43] DeleteGlobal N7oneflow12ResourceDescE +I0616 12:15:49.255223 70085 global.h:43] DeleteGlobal N7oneflow12ResourceDescE +I0616 12:15:49.255239 70085 global.h:43] DeleteGlobal N7oneflow6device27NodeDeviceDescriptorManagerE +I0616 12:15:49.255266 70085 global.h:43] DeleteGlobal N7oneflow10RpcManagerE +I0616 12:15:49.255280 70085 global.h:43] DeleteGlobal N7oneflow10CtrlClientE +I0616 12:15:49.255295 70085 global.h:43] DeleteGlobal N7oneflow10ProcessCtxE +I0616 12:15:49.255311 70085 global.h:43] DeleteGlobal N7oneflow7EnvDescE +I0616 12:15:49.255329 70085 global.h:43] DeleteGlobal 14cudaDeviceProp diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-143323.80849 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-143323.80849 new file mode 100644 index 0000000..c4c3e70 --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-143323.80849 @@ -0,0 +1,40 @@ +Log file created at: 2021/06/16 14:33:23 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0616 14:33:23.703843 80849 global.h:36] NewGlobal 14cudaDeviceProp +I0616 14:33:24.055442 80849 global.h:36] NewGlobal N7oneflow7EnvDescE +I0616 14:33:24.055500 80849 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0616 14:33:24.055507 80849 env_global_objects_scope.cpp:112] using rpc backend: local +I0616 14:33:24.055541 80849 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0616 14:33:24.055548 80849 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0616 14:33:24.061887 80849 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0616 14:33:24.062867 80849 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0616 14:33:24.062876 80849 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0616 14:33:24.062880 80849 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0616 14:33:24.062885 80849 global.h:36] NewGlobal N7oneflow9OneflowVME +I0616 14:33:24.073797 80849 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0616 14:33:24.073866 80849 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0616 14:33:24.073969 80849 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:35633 +I0616 14:33:24.074003 80849 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0616 14:33:24.074086 80849 global.h:36] NewGlobal N7oneflow9TransportE +I0616 14:33:29.385749 80849 global.h:43] DeleteGlobal N7oneflow21EnvGlobalObjectsScopeE +I0616 14:33:29.385855 80849 global.h:43] DeleteGlobal N7oneflow9TransportE +I0616 14:33:29.386006 80849 global.h:43] DeleteGlobal N7oneflow12EpollCommNetE +I0616 14:33:29.386024 80849 epoll_comm_network.cpp:87] CommNet Thread 0 finish +I0616 14:33:29.386140 80849 epoll_comm_network.cpp:87] CommNet Thread 1 finish +I0616 14:33:29.386188 80849 epoll_comm_network.cpp:87] CommNet Thread 2 finish +I0616 14:33:29.386268 80849 epoll_comm_network.cpp:87] CommNet Thread 3 finish +I0616 14:33:29.386445 80849 global.h:43] DeleteGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0616 14:33:29.386462 80849 global.h:43] DeleteGlobal N7oneflow2vm19VirtualMachineScopeE +I0616 14:33:29.386474 80849 global.h:43] DeleteGlobal N7oneflow9OneflowVME +I0616 14:33:29.399066 80849 global.h:43] DeleteGlobal N7oneflow18CudnnConvAlgoCacheE +I0616 14:33:29.399102 80849 global.h:43] DeleteGlobal N7oneflow16EagerNcclCommMgrE +I0616 14:33:29.399111 80849 global.h:43] DeleteGlobal N7oneflow10ThreadPoolE +I0616 14:33:29.402797 80849 global.h:43] DeleteGlobal N7oneflow12ResourceDescE +I0616 14:33:29.402856 80849 global.h:43] DeleteGlobal N7oneflow12ResourceDescE +I0616 14:33:29.402871 80849 global.h:43] DeleteGlobal N7oneflow6device27NodeDeviceDescriptorManagerE +I0616 14:33:29.402897 80849 global.h:43] DeleteGlobal N7oneflow10RpcManagerE +I0616 14:33:29.402912 80849 global.h:43] DeleteGlobal N7oneflow10CtrlClientE +I0616 14:33:29.402927 80849 global.h:43] DeleteGlobal N7oneflow10ProcessCtxE +I0616 14:33:29.402942 80849 global.h:43] DeleteGlobal N7oneflow7EnvDescE +I0616 14:33:29.402956 80849 global.h:43] DeleteGlobal 14cudaDeviceProp diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-143407.81483 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-143407.81483 new file mode 100644 index 0000000..088b6de --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-143407.81483 @@ -0,0 +1,40 @@ +Log file created at: 2021/06/16 14:34:07 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0616 14:34:07.170157 81483 global.h:36] NewGlobal 14cudaDeviceProp +I0616 14:34:07.558320 81483 global.h:36] NewGlobal N7oneflow7EnvDescE +I0616 14:34:07.558429 81483 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0616 14:34:07.558437 81483 env_global_objects_scope.cpp:112] using rpc backend: local +I0616 14:34:07.558460 81483 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0616 14:34:07.558468 81483 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0616 14:34:07.565513 81483 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0616 14:34:07.566704 81483 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0616 14:34:07.566715 81483 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0616 14:34:07.566720 81483 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0616 14:34:07.566725 81483 global.h:36] NewGlobal N7oneflow9OneflowVME +I0616 14:34:07.577455 81483 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0616 14:34:07.577481 81483 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0616 14:34:07.577610 81483 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:37443 +I0616 14:34:07.577634 81483 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0616 14:34:07.577718 81483 global.h:36] NewGlobal N7oneflow9TransportE +I0616 14:34:12.959990 81483 global.h:43] DeleteGlobal N7oneflow21EnvGlobalObjectsScopeE +I0616 14:34:12.960086 81483 global.h:43] DeleteGlobal N7oneflow9TransportE +I0616 14:34:12.960222 81483 global.h:43] DeleteGlobal N7oneflow12EpollCommNetE +I0616 14:34:12.960237 81483 epoll_comm_network.cpp:87] CommNet Thread 0 finish +I0616 14:34:12.960294 81483 epoll_comm_network.cpp:87] CommNet Thread 1 finish +I0616 14:34:12.960341 81483 epoll_comm_network.cpp:87] CommNet Thread 2 finish +I0616 14:34:12.960440 81483 epoll_comm_network.cpp:87] CommNet Thread 3 finish +I0616 14:34:12.960605 81483 global.h:43] DeleteGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0616 14:34:12.960628 81483 global.h:43] DeleteGlobal N7oneflow2vm19VirtualMachineScopeE +I0616 14:34:12.960640 81483 global.h:43] DeleteGlobal N7oneflow9OneflowVME +I0616 14:34:12.973390 81483 global.h:43] DeleteGlobal N7oneflow18CudnnConvAlgoCacheE +I0616 14:34:12.973443 81483 global.h:43] DeleteGlobal N7oneflow16EagerNcclCommMgrE +I0616 14:34:12.973456 81483 global.h:43] DeleteGlobal N7oneflow10ThreadPoolE +I0616 14:34:12.975657 81483 global.h:43] DeleteGlobal N7oneflow12ResourceDescE +I0616 14:34:12.975674 81483 global.h:43] DeleteGlobal N7oneflow12ResourceDescE +I0616 14:34:12.975682 81483 global.h:43] DeleteGlobal N7oneflow6device27NodeDeviceDescriptorManagerE +I0616 14:34:12.975708 81483 global.h:43] DeleteGlobal N7oneflow10RpcManagerE +I0616 14:34:12.975716 81483 global.h:43] DeleteGlobal N7oneflow10CtrlClientE +I0616 14:34:12.975725 81483 global.h:43] DeleteGlobal N7oneflow10ProcessCtxE +I0616 14:34:12.975734 81483 global.h:43] DeleteGlobal N7oneflow7EnvDescE +I0616 14:34:12.975744 81483 global.h:43] DeleteGlobal 14cudaDeviceProp diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-143528.82120 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-143528.82120 new file mode 100644 index 0000000..b73089b --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-143528.82120 @@ -0,0 +1,19 @@ +Log file created at: 2021/06/16 14:35:28 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0616 14:35:28.097849 82120 global.h:36] NewGlobal 14cudaDeviceProp +I0616 14:35:28.478919 82120 global.h:36] NewGlobal N7oneflow7EnvDescE +I0616 14:35:28.479009 82120 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0616 14:35:28.479018 82120 env_global_objects_scope.cpp:112] using rpc backend: local +I0616 14:35:28.479033 82120 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0616 14:35:28.479040 82120 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0616 14:35:28.485635 82120 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0616 14:35:28.486738 82120 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0616 14:35:28.486748 82120 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0616 14:35:28.486753 82120 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0616 14:35:28.486759 82120 global.h:36] NewGlobal N7oneflow9OneflowVME +I0616 14:35:28.496556 82120 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0616 14:35:28.496577 82120 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0616 14:35:28.496685 82120 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:40155 +I0616 14:35:28.496708 82120 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0616 14:35:28.496790 82120 global.h:36] NewGlobal N7oneflow9TransportE diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-144741.83066 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-144741.83066 new file mode 100644 index 0000000..d1bd5ea --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-144741.83066 @@ -0,0 +1,40 @@ +Log file created at: 2021/06/16 14:47:41 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0616 14:47:41.597259 83066 global.h:36] NewGlobal 14cudaDeviceProp +I0616 14:47:41.975654 83066 global.h:36] NewGlobal N7oneflow7EnvDescE +I0616 14:47:41.975700 83066 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0616 14:47:41.975708 83066 env_global_objects_scope.cpp:112] using rpc backend: local +I0616 14:47:41.975719 83066 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0616 14:47:41.975726 83066 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0616 14:47:41.982234 83066 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0616 14:47:41.983127 83066 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0616 14:47:41.983135 83066 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0616 14:47:41.983140 83066 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0616 14:47:41.983145 83066 global.h:36] NewGlobal N7oneflow9OneflowVME +I0616 14:47:41.991276 83066 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0616 14:47:41.991298 83066 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0616 14:47:41.991374 83066 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:36373 +I0616 14:47:41.991396 83066 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0616 14:47:41.991464 83066 global.h:36] NewGlobal N7oneflow9TransportE +I0616 14:47:49.087057 83066 global.h:43] DeleteGlobal N7oneflow21EnvGlobalObjectsScopeE +I0616 14:47:49.087123 83066 global.h:43] DeleteGlobal N7oneflow9TransportE +I0616 14:47:49.087374 83066 global.h:43] DeleteGlobal N7oneflow12EpollCommNetE +I0616 14:47:49.087451 83066 epoll_comm_network.cpp:87] CommNet Thread 0 finish +I0616 14:47:49.087553 83066 epoll_comm_network.cpp:87] CommNet Thread 1 finish +I0616 14:47:49.087625 83066 epoll_comm_network.cpp:87] CommNet Thread 2 finish +I0616 14:47:49.087733 83066 epoll_comm_network.cpp:87] CommNet Thread 3 finish +I0616 14:47:49.087904 83066 global.h:43] DeleteGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0616 14:47:49.087924 83066 global.h:43] DeleteGlobal N7oneflow2vm19VirtualMachineScopeE +I0616 14:47:49.087937 83066 global.h:43] DeleteGlobal N7oneflow9OneflowVME +I0616 14:47:49.114552 83066 global.h:43] DeleteGlobal N7oneflow18CudnnConvAlgoCacheE +I0616 14:47:49.114586 83066 global.h:43] DeleteGlobal N7oneflow16EagerNcclCommMgrE +I0616 14:47:49.114600 83066 global.h:43] DeleteGlobal N7oneflow10ThreadPoolE +I0616 14:47:49.116340 83066 global.h:43] DeleteGlobal N7oneflow12ResourceDescE +I0616 14:47:49.116353 83066 global.h:43] DeleteGlobal N7oneflow12ResourceDescE +I0616 14:47:49.116362 83066 global.h:43] DeleteGlobal N7oneflow6device27NodeDeviceDescriptorManagerE +I0616 14:47:49.116387 83066 global.h:43] DeleteGlobal N7oneflow10RpcManagerE +I0616 14:47:49.116397 83066 global.h:43] DeleteGlobal N7oneflow10CtrlClientE +I0616 14:47:49.116405 83066 global.h:43] DeleteGlobal N7oneflow10ProcessCtxE +I0616 14:47:49.116420 83066 global.h:43] DeleteGlobal N7oneflow7EnvDescE +I0616 14:47:49.116434 83066 global.h:43] DeleteGlobal 14cudaDeviceProp diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-144906.83746 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-144906.83746 new file mode 100644 index 0000000..a388a16 --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-144906.83746 @@ -0,0 +1,19 @@ +Log file created at: 2021/06/16 14:49:06 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0616 14:49:06.940276 83746 global.h:36] NewGlobal 14cudaDeviceProp +I0616 14:49:07.297325 83746 global.h:36] NewGlobal N7oneflow7EnvDescE +I0616 14:49:07.297399 83746 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0616 14:49:07.297408 83746 env_global_objects_scope.cpp:112] using rpc backend: local +I0616 14:49:07.297451 83746 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0616 14:49:07.297459 83746 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0616 14:49:07.304072 83746 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0616 14:49:07.305037 83746 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0616 14:49:07.305047 83746 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0616 14:49:07.305053 83746 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0616 14:49:07.305059 83746 global.h:36] NewGlobal N7oneflow9OneflowVME +I0616 14:49:07.313510 83746 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0616 14:49:07.313535 83746 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0616 14:49:07.313640 83746 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:36376 +I0616 14:49:07.313675 83746 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0616 14:49:07.313750 83746 global.h:36] NewGlobal N7oneflow9TransportE diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-145207.84540 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-145207.84540 new file mode 100644 index 0000000..9c21c06 --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-145207.84540 @@ -0,0 +1,19 @@ +Log file created at: 2021/06/16 14:52:07 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0616 14:52:07.388607 84540 global.h:36] NewGlobal 14cudaDeviceProp +I0616 14:52:07.754192 84540 global.h:36] NewGlobal N7oneflow7EnvDescE +I0616 14:52:07.754243 84540 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0616 14:52:07.754253 84540 env_global_objects_scope.cpp:112] using rpc backend: local +I0616 14:52:07.754266 84540 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0616 14:52:07.754274 84540 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0616 14:52:07.761760 84540 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0616 14:52:07.762856 84540 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0616 14:52:07.762867 84540 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0616 14:52:07.762873 84540 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0616 14:52:07.762879 84540 global.h:36] NewGlobal N7oneflow9OneflowVME +I0616 14:52:07.772663 84540 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0616 14:52:07.772688 84540 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0616 14:52:07.772779 84540 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:45780 +I0616 14:52:07.772812 84540 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0616 14:52:07.772897 84540 global.h:36] NewGlobal N7oneflow9TransportE diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-150059.85434 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-150059.85434 new file mode 100644 index 0000000..2c6fcf8 --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-150059.85434 @@ -0,0 +1,40 @@ +Log file created at: 2021/06/16 15:00:59 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0616 15:00:59.435274 85434 global.h:36] NewGlobal 14cudaDeviceProp +I0616 15:00:59.813540 85434 global.h:36] NewGlobal N7oneflow7EnvDescE +I0616 15:00:59.813589 85434 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0616 15:00:59.813598 85434 env_global_objects_scope.cpp:112] using rpc backend: local +I0616 15:00:59.813613 85434 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0616 15:00:59.813621 85434 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0616 15:00:59.821141 85434 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0616 15:00:59.822273 85434 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0616 15:00:59.822283 85434 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0616 15:00:59.822289 85434 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0616 15:00:59.822296 85434 global.h:36] NewGlobal N7oneflow9OneflowVME +I0616 15:00:59.833077 85434 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0616 15:00:59.833104 85434 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0616 15:00:59.833190 85434 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:33941 +I0616 15:00:59.833214 85434 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0616 15:00:59.833305 85434 global.h:36] NewGlobal N7oneflow9TransportE +I0616 15:34:10.653270 85434 global.h:43] DeleteGlobal N7oneflow21EnvGlobalObjectsScopeE +I0616 15:34:10.653389 85434 global.h:43] DeleteGlobal N7oneflow9TransportE +I0616 15:34:10.653527 85434 global.h:43] DeleteGlobal N7oneflow12EpollCommNetE +I0616 15:34:10.653549 85434 epoll_comm_network.cpp:87] CommNet Thread 0 finish +I0616 15:34:10.653633 85434 epoll_comm_network.cpp:87] CommNet Thread 1 finish +I0616 15:34:10.653703 85434 epoll_comm_network.cpp:87] CommNet Thread 2 finish +I0616 15:34:10.653772 85434 epoll_comm_network.cpp:87] CommNet Thread 3 finish +I0616 15:34:10.653913 85434 global.h:43] DeleteGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0616 15:34:10.653932 85434 global.h:43] DeleteGlobal N7oneflow2vm19VirtualMachineScopeE +I0616 15:34:10.653986 85434 global.h:43] DeleteGlobal N7oneflow9OneflowVME +I0616 15:34:10.679136 85434 global.h:43] DeleteGlobal N7oneflow18CudnnConvAlgoCacheE +I0616 15:34:10.679169 85434 global.h:43] DeleteGlobal N7oneflow16EagerNcclCommMgrE +I0616 15:34:10.679176 85434 global.h:43] DeleteGlobal N7oneflow10ThreadPoolE +I0616 15:34:10.683662 85434 global.h:43] DeleteGlobal N7oneflow12ResourceDescE +I0616 15:34:10.683728 85434 global.h:43] DeleteGlobal N7oneflow12ResourceDescE +I0616 15:34:10.683749 85434 global.h:43] DeleteGlobal N7oneflow6device27NodeDeviceDescriptorManagerE +I0616 15:34:10.683794 85434 global.h:43] DeleteGlobal N7oneflow10RpcManagerE +I0616 15:34:10.683810 85434 global.h:43] DeleteGlobal N7oneflow10CtrlClientE +I0616 15:34:10.683827 85434 global.h:43] DeleteGlobal N7oneflow10ProcessCtxE +I0616 15:34:10.683843 85434 global.h:43] DeleteGlobal N7oneflow7EnvDescE +I0616 15:34:10.683861 85434 global.h:43] DeleteGlobal 14cudaDeviceProp diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-155156.93096 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-155156.93096 new file mode 100644 index 0000000..31194ff --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210616-155156.93096 @@ -0,0 +1,19 @@ +Log file created at: 2021/06/16 15:51:56 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0616 15:51:56.472827 93096 global.h:36] NewGlobal 14cudaDeviceProp +I0616 15:51:56.800246 93096 global.h:36] NewGlobal N7oneflow7EnvDescE +I0616 15:51:56.800299 93096 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0616 15:51:56.800307 93096 env_global_objects_scope.cpp:112] using rpc backend: local +I0616 15:51:56.800321 93096 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0616 15:51:56.800329 93096 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0616 15:51:56.806735 93096 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0616 15:51:56.807636 93096 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0616 15:51:56.807644 93096 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0616 15:51:56.807651 93096 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0616 15:51:56.807657 93096 global.h:36] NewGlobal N7oneflow9OneflowVME +I0616 15:51:56.816476 93096 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0616 15:51:56.816498 93096 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0616 15:51:56.816570 93096 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:43218 +I0616 15:51:56.816593 93096 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0616 15:51:56.816663 93096 global.h:36] NewGlobal N7oneflow9TransportE diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210617-132416.22441 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210617-132416.22441 new file mode 100644 index 0000000..d3cb554 --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210617-132416.22441 @@ -0,0 +1,19 @@ +Log file created at: 2021/06/17 13:24:16 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0617 13:24:16.577369 22441 global.h:36] NewGlobal 14cudaDeviceProp +I0617 13:24:16.954938 22441 global.h:36] NewGlobal N7oneflow7EnvDescE +I0617 13:24:16.955024 22441 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0617 13:24:16.955044 22441 env_global_objects_scope.cpp:112] using rpc backend: local +I0617 13:24:16.955061 22441 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0617 13:24:16.955068 22441 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0617 13:24:16.961655 22441 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0617 13:24:16.962766 22441 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0617 13:24:16.962776 22441 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0617 13:24:16.962781 22441 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0617 13:24:16.962787 22441 global.h:36] NewGlobal N7oneflow9OneflowVME +I0617 13:24:16.972095 22441 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0617 13:24:16.972115 22441 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0617 13:24:16.972209 22441 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:38143 +I0617 13:24:16.972234 22441 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0617 13:24:16.972308 22441 global.h:36] NewGlobal N7oneflow9TransportE diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210617-132529.23158 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210617-132529.23158 new file mode 100644 index 0000000..d4b7253 --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210617-132529.23158 @@ -0,0 +1,19 @@ +Log file created at: 2021/06/17 13:25:29 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0617 13:25:29.002583 23158 global.h:36] NewGlobal 14cudaDeviceProp +I0617 13:25:29.371557 23158 global.h:36] NewGlobal N7oneflow7EnvDescE +I0617 13:25:29.371663 23158 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0617 13:25:29.371693 23158 env_global_objects_scope.cpp:112] using rpc backend: local +I0617 13:25:29.371752 23158 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0617 13:25:29.371768 23158 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0617 13:25:29.378367 23158 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0617 13:25:29.379439 23158 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0617 13:25:29.379448 23158 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0617 13:25:29.379453 23158 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0617 13:25:29.379462 23158 global.h:36] NewGlobal N7oneflow9OneflowVME +I0617 13:25:29.388799 23158 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0617 13:25:29.388821 23158 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0617 13:25:29.388970 23158 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:43347 +I0617 13:25:29.388994 23158 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0617 13:25:29.389086 23158 global.h:36] NewGlobal N7oneflow9TransportE diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210621-105503.13951 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210621-105503.13951 new file mode 100644 index 0000000..3d5891a --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210621-105503.13951 @@ -0,0 +1,40 @@ +Log file created at: 2021/06/21 10:55:03 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0621 10:55:03.954901 13951 global.h:36] NewGlobal 14cudaDeviceProp +I0621 10:55:04.140427 13951 global.h:36] NewGlobal N7oneflow7EnvDescE +I0621 10:55:04.140476 13951 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0621 10:55:04.140486 13951 env_global_objects_scope.cpp:112] using rpc backend: local +I0621 10:55:04.140527 13951 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0621 10:55:04.140534 13951 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0621 10:55:04.160535 13951 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0621 10:55:04.161613 13951 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0621 10:55:04.161623 13951 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0621 10:55:04.161628 13951 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0621 10:55:04.161634 13951 global.h:36] NewGlobal N7oneflow9OneflowVME +I0621 10:55:04.171347 13951 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0621 10:55:04.171376 13951 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0621 10:55:04.171459 13951 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:34373 +I0621 10:55:04.171485 13951 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0621 10:55:04.171561 13951 global.h:36] NewGlobal N7oneflow9TransportE +I0621 10:55:07.442209 13951 global.h:43] DeleteGlobal N7oneflow21EnvGlobalObjectsScopeE +I0621 10:55:07.442253 13951 global.h:43] DeleteGlobal N7oneflow9TransportE +I0621 10:55:07.442427 13951 global.h:43] DeleteGlobal N7oneflow12EpollCommNetE +I0621 10:55:07.442502 13951 epoll_comm_network.cpp:87] CommNet Thread 0 finish +I0621 10:55:07.442584 13951 epoll_comm_network.cpp:87] CommNet Thread 1 finish +I0621 10:55:07.442634 13951 epoll_comm_network.cpp:87] CommNet Thread 2 finish +I0621 10:55:07.442726 13951 epoll_comm_network.cpp:87] CommNet Thread 3 finish +I0621 10:55:07.442898 13951 global.h:43] DeleteGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0621 10:55:07.442915 13951 global.h:43] DeleteGlobal N7oneflow2vm19VirtualMachineScopeE +I0621 10:55:07.442929 13951 global.h:43] DeleteGlobal N7oneflow9OneflowVME +I0621 10:55:07.450901 13951 global.h:43] DeleteGlobal N7oneflow18CudnnConvAlgoCacheE +I0621 10:55:07.450938 13951 global.h:43] DeleteGlobal N7oneflow16EagerNcclCommMgrE +I0621 10:55:07.450947 13951 global.h:43] DeleteGlobal N7oneflow10ThreadPoolE +I0621 10:55:07.453130 13951 global.h:43] DeleteGlobal N7oneflow12ResourceDescE +I0621 10:55:07.453140 13951 global.h:43] DeleteGlobal N7oneflow12ResourceDescE +I0621 10:55:07.453146 13951 global.h:43] DeleteGlobal N7oneflow6device27NodeDeviceDescriptorManagerE +I0621 10:55:07.453163 13951 global.h:43] DeleteGlobal N7oneflow10RpcManagerE +I0621 10:55:07.453171 13951 global.h:43] DeleteGlobal N7oneflow10CtrlClientE +I0621 10:55:07.453177 13951 global.h:43] DeleteGlobal N7oneflow10ProcessCtxE +I0621 10:55:07.453184 13951 global.h:43] DeleteGlobal N7oneflow7EnvDescE +I0621 10:55:07.453193 13951 global.h:43] DeleteGlobal 14cudaDeviceProp diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210621-124230.196962 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210621-124230.196962 new file mode 100644 index 0000000..1b4a1e5 --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210621-124230.196962 @@ -0,0 +1,19 @@ +Log file created at: 2021/06/21 12:42:30 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0621 12:42:30.981474 196962 global.h:36] NewGlobal 14cudaDeviceProp +I0621 12:42:31.055187 196962 global.h:36] NewGlobal N7oneflow7EnvDescE +I0621 12:42:31.055233 196962 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0621 12:42:31.055241 196962 env_global_objects_scope.cpp:112] using rpc backend: local +I0621 12:42:31.055275 196962 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0621 12:42:31.055284 196962 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0621 12:42:31.064765 196962 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0621 12:42:31.066110 196962 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0621 12:42:31.066120 196962 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0621 12:42:31.066126 196962 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0621 12:42:31.066133 196962 global.h:36] NewGlobal N7oneflow9OneflowVME +I0621 12:42:31.078323 196962 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0621 12:42:31.078349 196962 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0621 12:42:31.078440 196962 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:41359 +I0621 12:42:31.078474 196962 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0621 12:42:31.078570 196962 global.h:36] NewGlobal N7oneflow9TransportE diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210621-124337.198163 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210621-124337.198163 new file mode 100644 index 0000000..fdf5b2a --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210621-124337.198163 @@ -0,0 +1,19 @@ +Log file created at: 2021/06/21 12:43:37 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0621 12:43:37.827342 198163 global.h:36] NewGlobal 14cudaDeviceProp +I0621 12:43:37.901626 198163 global.h:36] NewGlobal N7oneflow7EnvDescE +I0621 12:43:37.901664 198163 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0621 12:43:37.901672 198163 env_global_objects_scope.cpp:112] using rpc backend: local +I0621 12:43:37.901686 198163 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0621 12:43:37.901695 198163 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0621 12:43:37.909188 198163 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0621 12:43:37.910650 198163 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0621 12:43:37.910661 198163 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0621 12:43:37.910673 198163 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0621 12:43:37.910681 198163 global.h:36] NewGlobal N7oneflow9OneflowVME +I0621 12:43:37.924012 198163 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0621 12:43:37.924034 198163 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0621 12:43:37.924126 198163 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:44248 +I0621 12:43:37.924149 198163 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0621 12:43:37.924260 198163 global.h:36] NewGlobal N7oneflow9TransportE diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210621-130530.205914 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210621-130530.205914 new file mode 100644 index 0000000..c10c95f --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210621-130530.205914 @@ -0,0 +1,19 @@ +Log file created at: 2021/06/21 13:05:30 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0621 13:05:30.001822 205914 global.h:36] NewGlobal 14cudaDeviceProp +I0621 13:05:30.438489 205914 global.h:36] NewGlobal N7oneflow7EnvDescE +I0621 13:05:30.438539 205914 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0621 13:05:30.438547 205914 env_global_objects_scope.cpp:112] using rpc backend: local +I0621 13:05:30.438560 205914 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0621 13:05:30.438567 205914 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0621 13:05:30.445070 205914 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0621 13:05:30.446085 205914 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0621 13:05:30.446094 205914 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0621 13:05:30.446099 205914 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0621 13:05:30.446105 205914 global.h:36] NewGlobal N7oneflow9OneflowVME +I0621 13:05:30.454943 205914 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0621 13:05:30.454964 205914 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0621 13:05:30.455040 205914 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:36618 +I0621 13:05:30.455072 205914 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0621 13:05:30.455154 205914 global.h:36] NewGlobal N7oneflow9TransportE diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210621-140937.227330 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210621-140937.227330 new file mode 100644 index 0000000..f6f8ba4 --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210621-140937.227330 @@ -0,0 +1,19 @@ +Log file created at: 2021/06/21 14:09:37 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0621 14:09:37.440506 227330 global.h:36] NewGlobal 14cudaDeviceProp +I0621 14:09:37.502467 227330 global.h:36] NewGlobal N7oneflow7EnvDescE +I0621 14:09:37.502503 227330 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0621 14:09:37.502511 227330 env_global_objects_scope.cpp:112] using rpc backend: local +I0621 14:09:37.502522 227330 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0621 14:09:37.502528 227330 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0621 14:09:37.508935 227330 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0621 14:09:37.510071 227330 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0621 14:09:37.510080 227330 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0621 14:09:37.510084 227330 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0621 14:09:37.510090 227330 global.h:36] NewGlobal N7oneflow9OneflowVME +I0621 14:09:37.520928 227330 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0621 14:09:37.520952 227330 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0621 14:09:37.521034 227330 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:35867 +I0621 14:09:37.521054 227330 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0621 14:09:37.521147 227330 global.h:36] NewGlobal N7oneflow9TransportE diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210621-141631.1046 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210621-141631.1046 new file mode 100644 index 0000000..d317af8 --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210621-141631.1046 @@ -0,0 +1,19 @@ +Log file created at: 2021/06/21 14:16:31 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0621 14:16:31.812388 1046 global.h:36] NewGlobal 14cudaDeviceProp +I0621 14:16:32.319845 1046 global.h:36] NewGlobal N7oneflow7EnvDescE +I0621 14:16:32.319926 1046 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0621 14:16:32.319934 1046 env_global_objects_scope.cpp:112] using rpc backend: local +I0621 14:16:32.319977 1046 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0621 14:16:32.319989 1046 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0621 14:16:32.326699 1046 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0621 14:16:32.327786 1046 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0621 14:16:32.327795 1046 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0621 14:16:32.327802 1046 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0621 14:16:32.327808 1046 global.h:36] NewGlobal N7oneflow9OneflowVME +I0621 14:16:32.337846 1046 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0621 14:16:32.337867 1046 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0621 14:16:32.338035 1046 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:39423 +I0621 14:16:32.338060 1046 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0621 14:16:32.338157 1046 global.h:36] NewGlobal N7oneflow9TransportE diff --git a/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210621-141811.2328 b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210621-141811.2328 new file mode 100644 index 0000000..7648ecf --- /dev/null +++ b/LanguageModeling/bert-oneflow/log/default_physical_env_log/VS002/oneflow.VS002.lichunyou.log.INFO.20210621-141811.2328 @@ -0,0 +1,19 @@ +Log file created at: 2021/06/21 14:18:11 +Running on machine: VS002 +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0621 14:18:11.731659 2328 global.h:36] NewGlobal 14cudaDeviceProp +I0621 14:18:12.142030 2328 global.h:36] NewGlobal N7oneflow7EnvDescE +I0621 14:18:12.142112 2328 global.h:36] NewGlobal N7oneflow10ProcessCtxE +I0621 14:18:12.142119 2328 env_global_objects_scope.cpp:112] using rpc backend: local +I0621 14:18:12.142175 2328 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0621 14:18:12.142189 2328 global.h:36] NewGlobal N7oneflow12ResourceDescE +I0621 14:18:12.148756 2328 global.h:36] NewGlobal N7oneflow10ThreadPoolE +I0621 14:18:12.149766 2328 global.h:36] NewGlobal N7oneflow16EagerNcclCommMgrE +I0621 14:18:12.149775 2328 global.h:36] NewGlobal N7oneflow18CudnnConvAlgoCacheE +I0621 14:18:12.149778 2328 global.h:36] NewGlobal N7oneflow2vm19VirtualMachineScopeE +I0621 14:18:12.149785 2328 global.h:36] NewGlobal N7oneflow9OneflowVME +I0621 14:18:12.159430 2328 global.h:36] NewGlobal N7oneflow27EagerJobBuildAndInferCtxMgrE +I0621 14:18:12.159451 2328 global.h:36] NewGlobal N7oneflow12EpollCommNetE +I0621 14:18:12.159600 2328 epoll_comm_network.cpp:63] CommNet:Epoll listening on 0.0.0.0:44480 +I0621 14:18:12.159623 2328 epoll_comm_network.cpp:198] machine 0 sockfd -1 +I0621 14:18:12.159706 2328 global.h:36] NewGlobal N7oneflow9TransportE diff --git a/LanguageModeling/bert_oneflow/main.py b/LanguageModeling/bert-oneflow/main.py similarity index 98% rename from LanguageModeling/bert_oneflow/main.py rename to LanguageModeling/bert-oneflow/main.py index 0ab3c30..5d7d81f 100644 --- a/LanguageModeling/bert_oneflow/main.py +++ b/LanguageModeling/bert-oneflow/main.py @@ -27,7 +27,7 @@ def train(): parser.add_argument("-a", "--attn_heads", type=int, default=8, help="number of attention heads") parser.add_argument("-s", "--seq_len", type=int, default=20, help="maximum sequence len") - parser.add_argument("-b", "--batch_size", type=int, default=16, help="number of batch_size") + parser.add_argument("-b", "--batch_size", type=int, default=32, help="number of batch_size") parser.add_argument("-e", "--epochs", type=int, default=10, help="number of epochs") parser.add_argument("-w", "--num_workers", type=int, default=5, help="dataloader worker size") @@ -80,4 +80,4 @@ def train(): -train() \ No newline at end of file +train() diff --git a/LanguageModeling/bert_oneflow/model/attention/multi_head.py b/LanguageModeling/bert-oneflow/model/attention/multi_head.py similarity index 92% rename from LanguageModeling/bert_oneflow/model/attention/multi_head.py rename to LanguageModeling/bert-oneflow/model/attention/multi_head.py index 6252432..4fc0023 100644 --- a/LanguageModeling/bert_oneflow/model/attention/multi_head.py +++ b/LanguageModeling/bert-oneflow/model/attention/multi_head.py @@ -1,6 +1,6 @@ import oneflow.experimental.nn as nn import oneflow as flow -from .single import Attention +from model.attention.single import Attention class MultiHeadedAttention(nn.Module): @@ -23,8 +23,8 @@ def __init__(self, h, d_model, dropout=0.1): def forward(self, query, key, value, mask=None): batch_size = query.size(0) # 16 + - # 1) Do all the linear projections in batch from d_model => h x d_k query, key, value = [l(x).reshape(shape=[batch_size, -1, self.h, self.d_k]).permute(0, 2, 1, 3) for l, x in zip(self.linear_layers, (query, key, value))] # # query,key,value shape >> flow.Size([16, 8, 20, 32]) diff --git a/LanguageModeling/bert_oneflow/model/attention/single.py b/LanguageModeling/bert-oneflow/model/attention/single.py similarity index 100% rename from LanguageModeling/bert_oneflow/model/attention/single.py rename to LanguageModeling/bert-oneflow/model/attention/single.py diff --git a/LanguageModeling/bert_oneflow/model/bert.py b/LanguageModeling/bert-oneflow/model/bert.py similarity index 93% rename from LanguageModeling/bert_oneflow/model/bert.py rename to LanguageModeling/bert-oneflow/model/bert.py index 0d7f54c..ef5b60d 100644 --- a/LanguageModeling/bert_oneflow/model/bert.py +++ b/LanguageModeling/bert-oneflow/model/bert.py @@ -1,7 +1,7 @@ import oneflow.experimental.nn as nn -from .transformer import TransformerBlock -from .embedding import BERTEmbedding +from model.transformer import TransformerBlock +from model.embedding.bert import BERTEmbedding import numpy as np import oneflow.experimental as flow @@ -42,8 +42,9 @@ def forward(self, x, segment_info): # x.shape >> flow.Size([16, 20]) # embedding the indexed sequence to sequence of vectors x = self.embedding(x, segment_info) - + # running over multiple transformer blocks for transformer in self.transformer_blocks: x = transformer.forward(x, mask) + return x diff --git a/LanguageModeling/bert_oneflow/model/embedding/bert.py b/LanguageModeling/bert-oneflow/model/embedding/bert.py similarity index 95% rename from LanguageModeling/bert_oneflow/model/embedding/bert.py rename to LanguageModeling/bert-oneflow/model/embedding/bert.py index 178aab2..8cdd737 100644 --- a/LanguageModeling/bert_oneflow/model/embedding/bert.py +++ b/LanguageModeling/bert-oneflow/model/embedding/bert.py @@ -44,7 +44,7 @@ class BERTEmbedding(nn.Module): sum of all these features are output of BERTEmbedding """ - def __init__(self, vocab_size, embed_size, dropout=0.1): + def __init__(self, vocab_size, embed_size, dropout=0): """ :param vocab_size: total vocab size :param embed_size: embedding size of token embedding @@ -60,6 +60,9 @@ def __init__(self, vocab_size, embed_size, dropout=0.1): self.embed_size = embed_size def forward(self, sequence, segment_label): # sequence/segment_label .shape >>> flow.Size([16, 20]) - sequence = sequence.to(dtype=flow.int) + + x = self.segment(segment_label) + self.token(sequence) + self.position(sequence) + + return self.dropout(x) diff --git a/LanguageModeling/bert_oneflow/model/language_model.py b/LanguageModeling/bert-oneflow/model/language_model.py similarity index 97% rename from LanguageModeling/bert_oneflow/model/language_model.py rename to LanguageModeling/bert-oneflow/model/language_model.py index cf8462c..3012865 100644 --- a/LanguageModeling/bert_oneflow/model/language_model.py +++ b/LanguageModeling/bert-oneflow/model/language_model.py @@ -1,6 +1,6 @@ import oneflow.experimental as flow import oneflow.experimental.nn as nn -from .bert import BERT +from model.bert import BERT import numpy as np @@ -22,6 +22,7 @@ def __init__(self, bert: BERT, vocab_size): self.mask_lm = MaskedLanguageModel(self.bert.hidden, vocab_size) def forward(self, x, segment_label): + x = self.bert(x, segment_label) return self.next_sentence(x), self.mask_lm(x) diff --git a/LanguageModeling/bert_oneflow/model/transformer.py b/LanguageModeling/bert-oneflow/model/transformer.py similarity index 74% rename from LanguageModeling/bert_oneflow/model/transformer.py rename to LanguageModeling/bert-oneflow/model/transformer.py index 8b41eca..c0ac0a5 100644 --- a/LanguageModeling/bert_oneflow/model/transformer.py +++ b/LanguageModeling/bert-oneflow/model/transformer.py @@ -1,9 +1,9 @@ import oneflow.experimental.nn as nn import oneflow.experimental as flow -from .attention import MultiHeadedAttention -from .utils import SublayerConnection, PositionwiseFeedForward - +from model.attention.multi_head import MultiHeadedAttention +from model.utils.sublayer import SublayerConnection +from model.utils.feed_forward import PositionwiseFeedForward class TransformerBlock(nn.Module): """ @@ -19,14 +19,14 @@ def __init__(self, hidden, attn_heads, feed_forward_hidden, dropout): :param dropout: dropout rate """ super().__init__() - self.multihead_attention = MultiHeadedAttention(h=attn_heads, d_model=hidden) + self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden) self.feed_forward = PositionwiseFeedForward(d_model=hidden, d_ff=feed_forward_hidden, dropout=dropout) self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout) self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout) self.dropout = nn.Dropout(p=dropout) def forward(self, x, mask): - # input/output shape >> flow.Size([16, 20, 256]) - x = self.input_sublayer(x, lambda _x: self.multihead_attention.forward(_x, _x, _x, mask=mask)) + + x = self.input_sublayer(x, lambda _x: self.attention.forward(_x, _x, _x, mask=mask)) x = self.output_sublayer(x, self.feed_forward) return self.dropout(x) diff --git a/LanguageModeling/bert_oneflow/model/utils/feed_forward.py b/LanguageModeling/bert-oneflow/model/utils/feed_forward.py similarity index 95% rename from LanguageModeling/bert_oneflow/model/utils/feed_forward.py rename to LanguageModeling/bert-oneflow/model/utils/feed_forward.py index 8ae290d..1bb84d9 100644 --- a/LanguageModeling/bert_oneflow/model/utils/feed_forward.py +++ b/LanguageModeling/bert-oneflow/model/utils/feed_forward.py @@ -1,5 +1,5 @@ import oneflow.experimental.nn as nn -from .gelu import GELU +from model.utils.gelu import GELU class PositionwiseFeedForward(nn.Module): diff --git a/LanguageModeling/bert_oneflow/model/utils/gelu.py b/LanguageModeling/bert-oneflow/model/utils/gelu.py similarity index 97% rename from LanguageModeling/bert_oneflow/model/utils/gelu.py rename to LanguageModeling/bert-oneflow/model/utils/gelu.py index 3fbc7e6..d08507c 100644 --- a/LanguageModeling/bert_oneflow/model/utils/gelu.py +++ b/LanguageModeling/bert-oneflow/model/utils/gelu.py @@ -11,4 +11,4 @@ def __init__(self): def forward(self, x): tmp = flow.Tensor([math.sqrt(2 / math.pi)], device=x.device) - return 0.5 * x * (1 + flow.tanh(tmp) * (x + 0.044715 * x.pow(3.0))) \ No newline at end of file + return 0.5 * x * (1 + flow.tanh(tmp) * (x + 0.044715 * x.pow(3.0))) diff --git a/LanguageModeling/bert_oneflow/model/utils/layer_norm.py b/LanguageModeling/bert-oneflow/model/utils/layer_norm.py similarity index 99% rename from LanguageModeling/bert_oneflow/model/utils/layer_norm.py rename to LanguageModeling/bert-oneflow/model/utils/layer_norm.py index 711c257..fee8e2e 100644 --- a/LanguageModeling/bert_oneflow/model/utils/layer_norm.py +++ b/LanguageModeling/bert-oneflow/model/utils/layer_norm.py @@ -14,5 +14,6 @@ def __init__(self, features, eps=1e-6): def forward(self, x): # x input/output >> shape flow.Size([16, 20, 256]) mean = x.mean(-1, keepdim=True) + std = x.std(dim=-1, keepdim=True) return self.a_2 * (x - mean) / (std + self.eps) + self.b_2 diff --git a/LanguageModeling/bert_oneflow/model/utils/sublayer.py b/LanguageModeling/bert-oneflow/model/utils/sublayer.py similarity index 83% rename from LanguageModeling/bert_oneflow/model/utils/sublayer.py rename to LanguageModeling/bert-oneflow/model/utils/sublayer.py index 810881d..85529dd 100644 --- a/LanguageModeling/bert_oneflow/model/utils/sublayer.py +++ b/LanguageModeling/bert-oneflow/model/utils/sublayer.py @@ -1,5 +1,5 @@ import oneflow.experimental.nn as nn -from .layer_norm import LayerNorm +from model.utils.layer_norm import LayerNorm class SublayerConnection(nn.Module): @@ -14,6 +14,6 @@ def __init__(self, size, dropout): self.dropout = nn.Dropout(dropout) def forward(self, x, sublayer): - "Apply residual connection to any sublayer with the same size." + return x + self.dropout(sublayer(self.norm(x))) # return x + self.dropout(sublayer(x)) diff --git a/LanguageModeling/bert_oneflow/requirements.txt b/LanguageModeling/bert-oneflow/requirements.txt similarity index 100% rename from LanguageModeling/bert_oneflow/requirements.txt rename to LanguageModeling/bert-oneflow/requirements.txt diff --git a/LanguageModeling/bert_oneflow/test.py b/LanguageModeling/bert-oneflow/test.py similarity index 100% rename from LanguageModeling/bert_oneflow/test.py rename to LanguageModeling/bert-oneflow/test.py diff --git a/LanguageModeling/bert_oneflow/test.sh b/LanguageModeling/bert-oneflow/test.sh similarity index 100% rename from LanguageModeling/bert_oneflow/test.sh rename to LanguageModeling/bert-oneflow/test.sh diff --git a/LanguageModeling/bert_oneflow/train.py b/LanguageModeling/bert-oneflow/train.py similarity index 93% rename from LanguageModeling/bert_oneflow/train.py rename to LanguageModeling/bert-oneflow/train.py index 3fd325f..48cafa3 100644 --- a/LanguageModeling/bert_oneflow/train.py +++ b/LanguageModeling/bert-oneflow/train.py @@ -3,9 +3,10 @@ from torch.utils.data import DataLoader # from oneflow.utils.data import DataLoader import oneflow as flow -from model import BERT -from trainer import BERTTrainer -from dataset import BERTDataset, WordVocab +from model.bert import BERT +from trainer.pretrain import BERTTrainer +from dataset.dataset import BERTDataset +from dataset.vocab import WordVocab # eager mode flow.enable_eager_execution() @@ -17,7 +18,7 @@ def main(): parser.add_argument("-c", "--train_dataset", required=False, type=str, default='data/corpus.small', help="train dataset for train bert") parser.add_argument("-t", "--test_dataset", type=str, default='data/corpus.small', help="test set for evaluate train set") parser.add_argument("-v", "--vocab_path", required=False, default='data/vocab.small', type=str, help="built vocab model path with bert-vocab") - parser.add_argument("-o", "--output_path", required=False, default='output/bert.model', type=str, help="ex)output/bert.model") + parser.add_argument("-o", "--output_path", required=False, default='output/', type=str, help="ex)output/bert.model") parser.add_argument("-hs", "--hidden", type=int, default=256, help="hidden size of transformer model") parser.add_argument("-l", "--layers", type=int, default=8, help="number of layers") @@ -70,9 +71,9 @@ def main(): for epoch in range(args.epochs): trainer.train(epoch) # print("Saving model...") - # trainer.save(epoch, args.output_path) + trainer.save(epoch, args.output_path) if test_data_loader is not None: print("Running testing...") trainer.test(epoch) -main() \ No newline at end of file +main() diff --git a/LanguageModeling/bert_oneflow/train.sh b/LanguageModeling/bert-oneflow/train.sh similarity index 100% rename from LanguageModeling/bert_oneflow/train.sh rename to LanguageModeling/bert-oneflow/train.sh diff --git a/LanguageModeling/bert_oneflow/trainer/optim_schedule.py b/LanguageModeling/bert-oneflow/trainer/optim_schedule.py similarity index 100% rename from LanguageModeling/bert_oneflow/trainer/optim_schedule.py rename to LanguageModeling/bert-oneflow/trainer/optim_schedule.py diff --git a/LanguageModeling/bert_oneflow/trainer/pretrain.py b/LanguageModeling/bert-oneflow/trainer/pretrain.py similarity index 90% rename from LanguageModeling/bert_oneflow/trainer/pretrain.py rename to LanguageModeling/bert-oneflow/trainer/pretrain.py index 74fb8ee..b890173 100644 --- a/LanguageModeling/bert_oneflow/trainer/pretrain.py +++ b/LanguageModeling/bert-oneflow/trainer/pretrain.py @@ -4,8 +4,11 @@ # from flow.utils.data import DataLoader from torch.utils.data import DataLoader -from model import BERTLM, BERT -from .optim_schedule import ScheduledOptim +from model.language_model import BERTLM +from model.bert import BERT +from trainer.optim_schedule import ScheduledOptim +flow.enable_eager_execution() + import tqdm import numpy as np @@ -47,6 +50,7 @@ def __init__(self, bert: BERT, vocab_size: int, self.bert = bert.to(self.device) # Initialize the BERT Language Model, with BERT model self.model = BERTLM(bert, vocab_size).to(self.device) + #self.model.load_state_dict(flow.load("output/init")) # # Distributed GPU training if CUDA can detect more than 1 GPU # if with_cuda and flow.cuda.device_count() > 1: @@ -62,6 +66,7 @@ def __init__(self, bert: BERT, vocab_size: int, self.optim_schedule = ScheduledOptim(self.optim, self.bert.hidden, n_warmup_steps=warmup_steps) # Using Negative Log Likelihood Loss function for predicting the masked_token + # self.criterion = nn.NLLLoss(ignore_index=0) self.criterion = nn.NLLLoss(ignore_index=0) self.criterion = self.criterion.to(self.device) @@ -122,9 +127,10 @@ def iteration(self, epoch, data_loader, train=True): self.optim_schedule.step_and_update_lr() self.optim_schedule.zero_grad() - flow.save(self.bert.state_dict(), "checkpoints/bert_%d_loss_%f" % (i, loss.numpy().item())) + #flow.save(self.bert.state_dict(), "checkpoints/bert_%d_loss_%f" % (i, loss.numpy().item())) # next sentence prediction accuracy + # correct = next_sent_output.argmax(dim=-1).eq(data["is_next"]).sum().item() correct = next_sent_output.argmax(dim=-1).eq(data["is_next"]).sum().numpy().item() avg_loss += loss.numpy().item() total_correct += correct @@ -138,9 +144,12 @@ def iteration(self, epoch, data_loader, train=True): "loss": loss.numpy().item() } + if i % self.log_freq == 0: data_iter.write(str(post_fix)) - + + print("total_correct >>>>>>>>>>>>>> ", total_correct) + print("total_element >>>>>>>>>>>>>> ", total_element) print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / len(data_iter), "total_acc=", total_correct * 100.0 / total_element) @@ -153,7 +162,7 @@ def save(self, epoch, file_path="checkpoints"): :param file_path: model output path which gonna be file_path+"ep%d" % epoch :return: final_output_path """ - output_path = file_path + ".ep%d" % epoch + output_path = file_path + "epoch%d" % epoch flow.save(self.bert.state_dict(), output_path) print("EP:%d Model Saved on:" % epoch, output_path) return output_path diff --git a/LanguageModeling/bert_oneflow/bert_pytorch.zip b/LanguageModeling/bert_oneflow/bert_pytorch.zip deleted file mode 100644 index 460c2a8..0000000 Binary files a/LanguageModeling/bert_oneflow/bert_pytorch.zip and /dev/null differ diff --git a/LanguageModeling/bert_oneflow/dataset/__init__.py b/LanguageModeling/bert_oneflow/dataset/__init__.py deleted file mode 100644 index 90e9036..0000000 --- a/LanguageModeling/bert_oneflow/dataset/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .dataset import BERTDataset -from .vocab import WordVocab diff --git a/LanguageModeling/bert_oneflow/model/__init__.py b/LanguageModeling/bert_oneflow/model/__init__.py deleted file mode 100644 index aa318cb..0000000 --- a/LanguageModeling/bert_oneflow/model/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .bert import BERT -from .language_model import BERTLM diff --git a/LanguageModeling/bert_oneflow/model/attention/__init__.py b/LanguageModeling/bert_oneflow/model/attention/__init__.py deleted file mode 100644 index 6a39ec1..0000000 --- a/LanguageModeling/bert_oneflow/model/attention/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .multi_head import MultiHeadedAttention -from .single import Attention diff --git a/LanguageModeling/bert_oneflow/model/embedding/__init__.py b/LanguageModeling/bert_oneflow/model/embedding/__init__.py deleted file mode 100644 index 0eb5843..0000000 --- a/LanguageModeling/bert_oneflow/model/embedding/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .bert import BERTEmbedding diff --git a/LanguageModeling/bert_oneflow/model/utils/__init__.py b/LanguageModeling/bert_oneflow/model/utils/__init__.py deleted file mode 100644 index e7bddc6..0000000 --- a/LanguageModeling/bert_oneflow/model/utils/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .feed_forward import PositionwiseFeedForward -from .layer_norm import LayerNorm -from .sublayer import SublayerConnection -from .gelu import GELU diff --git a/LanguageModeling/bert_oneflow/trainer/__init__.py b/LanguageModeling/bert_oneflow/trainer/__init__.py deleted file mode 100644 index 6a0eb37..0000000 --- a/LanguageModeling/bert_oneflow/trainer/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .pretrain import BERTTrainer