Changes from all commits
41 commits
1559bcf  Add JSON support + wrapper (rvinaybharadwaj, Jul 30, 2024)
649c37b  Rebasing with main (rvinaybharadwaj, Aug 1, 2024)
8e311bc  adding json test data (rvinaybharadwaj, Aug 1, 2024)
6054b0b  adding wrapper tests (rvinaybharadwaj, Aug 2, 2024)
f93765f  code cleanup: make class members private (rvinaybharadwaj, Aug 5, 2024)
8351938  Updating tests (rvinaybharadwaj, Aug 7, 2024)
9796253  changing datatypes to match protobuf (rvinaybharadwaj, Aug 8, 2024)
0ce0ae2  adding missed datatype changes (rvinaybharadwaj, Aug 8, 2024)
67d29cd  updating WrapperTests datatypes (rvinaybharadwaj, Aug 8, 2024)
045bd6f  removing involved dims and minor bug fix (rvinaybharadwaj, Aug 15, 2024)
e5a4627  fixing include path in et_feeder (rvinaybharadwaj, Aug 15, 2024)
46d42d6  add missing break statement (rvinaybharadwaj, Aug 15, 2024)
69dffb1  minor bug fixes (rvinaybharadwaj, Aug 26, 2024)
3b7a0ad  fix include path (rvinaybharadwaj, Aug 26, 2024)
6cbbc4f  fix lint errors (rvinaybharadwaj, Sep 5, 2024)
124de08  merging et_feeder_node (rvinaybharadwaj, Sep 6, 2024)
274826f  adding install setuptools to github workflows (rvinaybharadwaj, Sep 6, 2024)
21417d8  updating workflows (rvinaybharadwaj, Sep 6, 2024)
ad998fc  updating workflows (rvinaybharadwaj, Sep 6, 2024)
bf61b64  updating cpp_lint.yml (rvinaybharadwaj, Sep 6, 2024)
2474103  updating clang-format version (rvinaybharadwaj, Sep 6, 2024)
3a7c2c9  updating clang-format version (rvinaybharadwaj, Sep 6, 2024)
7756ec2  fix lint errors (rvinaybharadwaj, Sep 6, 2024)
f439bf4  Fix rebase error (rvinaybharadwaj, Sep 23, 2024)
58fda3e  addressing reviewer comments (rvinaybharadwaj, Sep 25, 2024)
61c74d8  fix lint errors (rvinaybharadwaj, Sep 25, 2024)
f803f33  fix lint errors (rvinaybharadwaj, Sep 25, 2024)
6ed6e66  Specify the kineto filepath explicitly when running HTA analysis (AlexDenisov, Nov 11, 2024)
b915ab8  Merge pull request #167 from flexaihq/alexdenisov/specify-kineto-file… (tushar-krishna, Nov 16, 2024)
9247489  Merge pull request #145 from rvinaybharadwaj/jsonify (tushar-krishna, Dec 6, 2024)
4fb397e  Update is_cpu_op to default to false (willjwon, Nov 3, 2024)
40ce3be  Fix mishandling All-to-All communication (JoongunPark, Oct 8, 2024)
470bde1  Update logging.info to logging.debug to make it consistent (JoongunPark, Oct 14, 2024)
ed7e286  Eliminate false positive sync dependency (JoongunPark, Oct 14, 2024)
b3dca0b  PyTorch nightly needs to support 1.1.1-chakra.0.0.4. (JoongunPark, Nov 14, 2024)
cc660d0  Get pg_name from record_param_comms for collectives (JoongunPark, Nov 14, 2024)
c7c7c05  Update trace_linker to use external_id for finding GPU op's parent CP… (JoongunPark, Nov 14, 2024)
6d8dea8  Handling HTA Errors in Chakra (JoongunPark, Nov 16, 2024)
f51050d  Fix error encoding METADATA node (JoongunPark, Dec 1, 2024)
810ff88  Implement getter functions for nodes' inputs/outputs (JoongunPark, Dec 2, 2024)
7883df1  Merge branch 'main' into develop (JoongunPark, Jan 20, 2025)
28 changes: 23 additions & 5 deletions src/converter/pytorch_converter.py
@@ -11,6 +11,7 @@
COMM_RECV_NODE,
COMM_SEND_NODE,
COMP_NODE,
METADATA_NODE,
REDUCE_SCATTER,
GlobalMetadata,
)
@@ -338,6 +339,8 @@ def get_protobuf_node_type_from_json_node(
Returns:
int: The corresponding Chakra node type.
"""
if json_node.is_metadata_op():
return METADATA_NODE
if json_node.is_gpu_op():
if "ncclDevKernel_SendRecv" in json_node.name:
parent_node = json_node_map[json_node.parent]
@@ -346,10 +349,17 @@
if parent_node.name == "record_param_comms"
else parent_node.name
)
if parent_node.name == "record_param_comms" and parent_node.pg_name != "":
json_node.pg_name = parent_node.pg_name
if "send" in keyword:
return COMM_SEND_NODE
if "recv" in keyword:
return COMM_RECV_NODE
# In NCCL, all-to-all communication is implemented using point-to-point
# communications. More details can be found here:
# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html
if "nccl:all_to_all" in keyword:
return COMM_COLL_NODE
if "ncclKernel" in json_node.name or "ncclDevKernel" in json_node.name:
return COMM_COLL_NODE
return COMP_NODE
@@ -379,6 +389,10 @@ def get_collective_comm_type(self, name: str) -> int:
for key in comm_type_mapping:
if key in normalized_name:
return comm_type_mapping[key]
# If a COMM_COLL_NODE's name matched no mapping key but contains ncclDevKernel_SendRecv, it is nccl:all_to_all.
if "ncclDevKernel_SendRecv" in name:
return comm_type_mapping["alltoall"]

raise ValueError(
f"The name '{name}' does not correspond to a recognized collective communication type. "
"The converter determines collective communication types based on the node name of a GPU operator. "
@@ -460,11 +474,15 @@ def convert_ctrl_dep_to_data_dep(
if json_node.sync_dep:
for sync_dep in json_node.sync_dep:
if sync_dep not in current_node.data_deps:
current_node.data_deps.append(sync_dep)
logging.info(
f"Node ID {current_node.id} now has an synchonization dependency on Node ID {sync_dep}"
)

# HTA can encode false positive sync dependencies.
# Compare start_time to drop a sync dependency whose source did not start earlier.
prior_node = protobuf_node_map.get(sync_dep)
if prior_node is not None and prior_node.start_time_micros < current_node.start_time_micros:
current_node.data_deps.append(sync_dep)
logging.debug(
f"Node ID {current_node.id} now has a synchronization dependency on Node ID "
f"{sync_dep}"
)
# Add children to the stack
children_chakra_ids = [child.id for child in json_node.children]
for child_chakra_id in sorted(children_chakra_ids, reverse=True):
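For reference, a hedged sketch of the classification order these hunks produce. The constants' values, the derive_keyword stub, and the standalone function shape are hypothetical (the diff elides the exact record_param_comms keyword expression); the branch order mirrors get_protobuf_node_type_from_json_node above.

```python
# Hedged sketch: constant values and stubs are hypothetical;
# the branch order mirrors the converter code above.
METADATA_NODE, COMM_SEND_NODE, COMM_RECV_NODE, COMM_COLL_NODE, COMP_NODE = range(5)

def derive_keyword(parent):
    # Stand-in: the real code derives the keyword from the
    # record_param_comms op (exact expression elided in the diff).
    return parent.name

def classify(json_node, json_node_map):
    if json_node.is_metadata_op():
        return METADATA_NODE
    if json_node.is_gpu_op():
        if "ncclDevKernel_SendRecv" in json_node.name:
            parent = json_node_map[json_node.parent]
            keyword = (
                derive_keyword(parent)
                if parent.name == "record_param_comms"
                else parent.name
            )
            # record_param_comms carries the process-group name for collectives.
            if parent.name == "record_param_comms" and parent.pg_name != "":
                json_node.pg_name = parent.pg_name
            if "send" in keyword:
                return COMM_SEND_NODE
            if "recv" in keyword:
                return COMM_RECV_NODE
            # NCCL implements all-to-all with point-to-point communications:
            # https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html
            if "nccl:all_to_all" in keyword:
                return COMM_COLL_NODE
        if "ncclKernel" in json_node.name or "ncclDevKernel" in json_node.name:
            return COMM_COLL_NODE
    return COMP_NODE
```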
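Similarly, a minimal sketch of the new sync-dependency guard in convert_ctrl_dep_to_data_dep: a dependency is kept only when its source node started strictly earlier. The standalone function and its parameter names are illustrative; the fields match the diff.

```python
import logging

# Hedged sketch: keep a sync dependency only when the prior node started
# strictly earlier; later-starting sources are HTA's false positives.
def add_sync_deps(current_node, json_node, protobuf_node_map):
    for sync_dep in json_node.sync_dep or []:
        if sync_dep in current_node.data_deps:
            continue
        prior_node = protobuf_node_map.get(sync_dep)
        if (
            prior_node is not None
            and prior_node.start_time_micros < current_node.start_time_micros
        ):
            current_node.data_deps.append(sync_dep)
            logging.debug(
                f"Node ID {current_node.id} now has a synchronization "
                f"dependency on Node ID {sync_dep}"
            )
```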
13 changes: 11 additions & 2 deletions src/converter/pytorch_node.py
@@ -47,7 +47,7 @@ class PyTorchNode:
pg_name (str): Process Group name for the inter-GPU communication.
"""

SUPPORTED_VERSIONS = ["1.0.2-chakra.0.0.4", "1.0.3-chakra.0.0.4", "1.1.0-chakra.0.0.4"]
SUPPORTED_VERSIONS = ["1.0.2-chakra.0.0.4", "1.0.3-chakra.0.0.4", "1.1.0-chakra.0.0.4", "1.1.1-chakra.0.0.4"]

def __init__(self, schema: str, node_data: Dict[str, Any]) -> None:
"""
@@ -86,7 +86,7 @@ def parse_data(self, node_data: Dict[str, Any]) -> None:
node_data (Dict[str, Any]): The node data to be parsed.
"""
if self.schema in self.SUPPORTED_VERSIONS:
if self.schema in ["1.0.2-chakra.0.0.4", "1.0.3-chakra.0.0.4", "1.1.0-chakra.0.0.4"]:
if self.schema in ["1.0.2-chakra.0.0.4", "1.0.3-chakra.0.0.4", "1.1.0-chakra.0.0.4", "1.1.1-chakra.0.0.4"]:
self._parse_data_1_0_3_chakra_0_0_4(node_data)
else:
raise ValueError(
@@ -137,6 +137,15 @@ def get_op_type(self) -> PyTorchNodeType:
else:
return PyTorchNodeType.LABEL

def is_metadata_op(self) -> bool:
"""
Check if the node is a METADATA operator.

Returns:
bool: True if the node is a METADATA operator, False otherwise.
"""
return self.get_op_type() == PyTorchNodeType.METADATA

def is_cpu_op(self) -> bool:
"""
Check if the node is a CPU operator.
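A hedged usage sketch for the new predicate. The import path is a guess from the repository layout, the file name is a placeholder, and the schema/nodes layout is assumed from PyTorch execution traces; only the schema string and method names come from this diff.

```python
import json

# Import path is a guess based on the repository layout (src/converter/).
from src.converter.pytorch_node import PyTorchNode

# Placeholder host-side execution trace produced by PyTorch's ET observer.
with open("rank0_host_trace.json") as f:
    trace = json.load(f)

schema = trace["schema"]  # "1.1.1-chakra.0.0.4" is now accepted
for node_data in trace["nodes"]:
    node = PyTorchNode(schema, node_data)
    if node.is_metadata_op():
        # The converter maps these to METADATA_NODE instead of COMP_NODE.
        print(f"metadata op: {node_data['name']}")
```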
2 changes: 1 addition & 1 deletion src/feeder/et_feeder.cpp
@@ -173,4 +173,4 @@ void ETFeeder::readNextWindow() {
dep_free_node_queue_.emplace(node);
}
}
}
}
5 changes: 2 additions & 3 deletions src/feeder/et_feeder.h
@@ -33,13 +33,12 @@ class ETFeeder {
void pushBackIssuableNode(uint64_t node_id);
std::shared_ptr<ETFeederNode> lookupNode(uint64_t node_id);
void freeChildrenNodes(uint64_t node_id);

private:
void readGlobalMetadata();
std::shared_ptr<ETFeederNode> readNode();
void readNextWindow();
void resolveDep();

private:
ProtoInputStream trace_;
const uint32_t window_size_;
bool et_complete_;
@@ -54,4 +53,4 @@
std::unordered_set<std::shared_ptr<ETFeederNode>> dep_unresolved_node_set_{};
};

} // namespace Chakra
} // namespace Chakra
54 changes: 54 additions & 0 deletions src/feeder/et_feeder_node.cpp
@@ -10,6 +10,18 @@ ETFeederNode::ETFeederNode(std::shared_ptr<ChakraProtoMsg::Node> node) {
this->runtime_ = node->duration_micros();
this->is_cpu_op_ = 0;

if (node->has_inputs()) {
this->inputs_values_ = static_cast<string>(node->inputs().values());
this->inputs_shapes_ = static_cast<string>(node->inputs().shapes());
this->inputs_types_ = static_cast<string>(node->inputs().types());
}

if (node->has_outputs()) {
this->outputs_values_ = static_cast<string>(node->outputs().values());
this->outputs_shapes_ = static_cast<string>(node->outputs().shapes());
this->outputs_types_ = static_cast<string>(node->outputs().types());
}

for (const auto& attr : node->attr()) {
const string& attr_name = attr.name();

@@ -144,3 +156,45 @@ uint32_t ETFeederNode::comm_tag() {
string ETFeederNode::pg_name() {
return pg_name_;
}

string ETFeederNode::get_inputs_values() const {
if (node_->has_inputs()) {
return inputs_values_;
}
return "";
}

string ETFeederNode::get_inputs_shapes() const {
if (node_->has_inputs()) {
return inputs_shapes_;
}
return "";
}

string ETFeederNode::get_inputs_types() const {
if (node_->has_inputs()) {
return inputs_types_;
}
return "";
}

string ETFeederNode::get_outputs_values() const {
if (node_->has_outputs()) {
return outputs_values_;
}
return "";
}

string ETFeederNode::get_outputs_shapes() const {
if (node_->has_outputs()) {
return outputs_shapes_;
}
return "";
}

string ETFeederNode::get_outputs_types() const {
if (node_->has_outputs()) {
return outputs_types_;
}
return "";
}
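A hedged usage sketch for the new getters. The trace path, node ID, include path, and the single-argument ETFeeder constructor are assumptions; lookupNode and the six getters are declared in et_feeder.h below.

```cpp
#include <iostream>
#include <memory>

// Include path is an assumption; adjust to the build's include roots.
#include "et_feeder.h"

int main() {
  // Placeholder path to a Chakra execution trace; the single-argument
  // constructor is assumed, not shown in this diff.
  Chakra::ETFeeder feeder("trace.0.et");

  // Node ID 1 is a placeholder; lookupNode is part of the public API.
  std::shared_ptr<Chakra::ETFeederNode> node = feeder.lookupNode(1);

  // Each getter returns the stored string, or "" when the protobuf
  // node has no inputs/outputs field set.
  std::cout << "inputs.values:  " << node->get_inputs_values() << "\n"
            << "inputs.shapes:  " << node->get_inputs_shapes() << "\n"
            << "inputs.types:   " << node->get_inputs_types() << "\n"
            << "outputs.values: " << node->get_outputs_values() << "\n"
            << "outputs.shapes: " << node->get_outputs_shapes() << "\n"
            << "outputs.types:  " << node->get_outputs_types() << "\n";
  return 0;
}
```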
12 changes: 12 additions & 0 deletions src/feeder/et_feeder_node.h
@@ -39,6 +39,12 @@ class ETFeederNode {
uint32_t comm_dst();
uint32_t comm_tag();
std::string pg_name();
std::string get_inputs_values() const;
std::string get_inputs_shapes() const;
std::string get_inputs_types() const;
std::string get_outputs_values() const;
std::string get_outputs_shapes() const;
std::string get_outputs_types() const;

private:
void assign_attr_val(
@@ -67,6 +73,12 @@
uint32_t comm_dst_;
uint32_t comm_tag_;
std::string pg_name_;
std::string inputs_values_;
std::string inputs_shapes_;
std::string inputs_types_;
std::string outputs_values_;
std::string outputs_shapes_;
std::string outputs_types_;
};

} // namespace Chakra