Changes from all commits
41 commits
1559bcf  Add JSON support + wrapper (rvinaybharadwaj, Jul 30, 2024)
649c37b  Rebasing with main (rvinaybharadwaj, Aug 1, 2024)
8e311bc  adding json test data (rvinaybharadwaj, Aug 1, 2024)
6054b0b  adding wrapper tests (rvinaybharadwaj, Aug 2, 2024)
f93765f  code cleanup: make class members private (rvinaybharadwaj, Aug 5, 2024)
8351938  Updating tests (rvinaybharadwaj, Aug 7, 2024)
9796253  changing datatypes to match protobuf (rvinaybharadwaj, Aug 8, 2024)
0ce0ae2  adding missed datatype changes (rvinaybharadwaj, Aug 8, 2024)
67d29cd  updating WrapperTests datatypes (rvinaybharadwaj, Aug 8, 2024)
045bd6f  removing involved dims and minor bug fix (rvinaybharadwaj, Aug 15, 2024)
e5a4627  fixing include path in et_feeder (rvinaybharadwaj, Aug 15, 2024)
46d42d6  add missing break statement (rvinaybharadwaj, Aug 15, 2024)
69dffb1  minor bug fixes (rvinaybharadwaj, Aug 26, 2024)
3b7a0ad  fix include path (rvinaybharadwaj, Aug 26, 2024)
6cbbc4f  fix lint errors (rvinaybharadwaj, Sep 5, 2024)
124de08  merging et_feeder_node (rvinaybharadwaj, Sep 6, 2024)
274826f  adding install setuptools to github workflows (rvinaybharadwaj, Sep 6, 2024)
21417d8  updating workflows (rvinaybharadwaj, Sep 6, 2024)
ad998fc  updating workflows (rvinaybharadwaj, Sep 6, 2024)
bf61b64  updating cpp_lint.yml (rvinaybharadwaj, Sep 6, 2024)
2474103  updating clang-format version (rvinaybharadwaj, Sep 6, 2024)
3a7c2c9  updating clang-format version (rvinaybharadwaj, Sep 6, 2024)
7756ec2  fix lint errors (rvinaybharadwaj, Sep 6, 2024)
f439bf4  Fix rebase error (rvinaybharadwaj, Sep 23, 2024)
58fda3e  addressing reviewer comments (rvinaybharadwaj, Sep 25, 2024)
61c74d8  fix lint errors (rvinaybharadwaj, Sep 25, 2024)
f803f33  fix lint errors (rvinaybharadwaj, Sep 25, 2024)
6ed6e66  Specify the kineto filepath explicitly when running HTA analysis (AlexDenisov, Nov 11, 2024)
b915ab8  Merge pull request #167 from flexaihq/alexdenisov/specify-kineto-file… (tushar-krishna, Nov 16, 2024)
9247489  Merge pull request #145 from rvinaybharadwaj/jsonify (tushar-krishna, Dec 6, 2024)
4fb397e  Update is_cpu_op to default to false (willjwon, Nov 3, 2024)
40ce3be  Fix mishandling All-to-All communication (JoongunPark, Oct 8, 2024)
470bde1  Update logging.info to logging.debug to make it consistent (JoongunPark, Oct 14, 2024)
ed7e286  Eliminate false positive sync dependency (JoongunPark, Oct 14, 2024)
b3dca0b  PyTorch nightly needs to support 1.1.1-chakra.0.0.4. (JoongunPark, Nov 14, 2024)
cc660d0  Get pg_name from record_param_comms for collectives (JoongunPark, Nov 14, 2024)
c7c7c05  Update trace_linker to use external_id for finding GPU op's parent CP… (JoongunPark, Nov 14, 2024)
6d8dea8  Handling HTA Errors in Chakra (JoongunPark, Nov 16, 2024)
f51050d  Fix error encoding METADATA node (JoongunPark, Dec 1, 2024)
810ff88  Implement getter functions for nodes' inputs/outputs (JoongunPark, Dec 2, 2024)
7883df1  Merge branch 'main' into develop (JoongunPark, Jan 20, 2025)
28 changes: 23 additions & 5 deletions src/converter/pytorch_converter.py
@@ -11,6 +11,7 @@
COMM_RECV_NODE,
COMM_SEND_NODE,
COMP_NODE,
METADATA_NODE,
REDUCE_SCATTER,
GlobalMetadata,
)
@@ -338,6 +339,8 @@ def get_protobuf_node_type_from_json_node(
Returns:
int: The corresponding Chakra node type.
"""
if json_node.is_metadata_op():
return METADATA_NODE
if json_node.is_gpu_op():
if "ncclDevKernel_SendRecv" in json_node.name:
parent_node = json_node_map[json_node.parent]
@@ -346,10 +349,17 @@
if parent_node.name == "record_param_comms"
else parent_node.name
)
if parent_node.name == "record_param_comms" and parent_node.pg_name != "":
json_node.pg_name = parent_node.pg_name
if "send" in keyword:
return COMM_SEND_NODE
if "recv" in keyword:
return COMM_RECV_NODE
# In NCCL, all-to-all communication is implemented using point-to-point
# communications. More details can be found here:
# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html
if "nccl:all_to_all" in keyword:
return COMM_COLL_NODE
if "ncclKernel" in json_node.name or "ncclDevKernel" in json_node.name:
return COMM_COLL_NODE
return COMP_NODE
@@ -379,6 +389,10 @@ def get_collective_comm_type(self, name: str) -> int:
for key in comm_type_mapping:
if key in normalized_name:
return comm_type_mapping[key]
# If a COMM_COLL_NODE's name matched no mapping key but contains ncclDevKernel_SendRecv, it is nccl:all_to_all.
if "ncclDevKernel_SendRecv" in name:
return comm_type_mapping["alltoall"]

raise ValueError(
f"The name '{name}' does not correspond to a recognized collective communication type. "
"The converter determines collective communication types based on the node name of a GPU operator. "
@@ -460,11 +474,15 @@ def convert_ctrl_dep_to_data_dep(
if json_node.sync_dep:
for sync_dep in json_node.sync_dep:
if sync_dep not in current_node.data_deps:
current_node.data_deps.append(sync_dep)
logging.info(
f"Node ID {current_node.id} now has an synchonization dependency on Node ID {sync_dep}"
)

# HTA can encode false positive sync dependencies.
# Compare start_time to drop a sync dependency whose source did not start earlier.
prior_node = protobuf_node_map.get(sync_dep)
if prior_node is not None and prior_node.start_time_micros < current_node.start_time_micros:
current_node.data_deps.append(sync_dep)
logging.debug(
f"Node ID {current_node.id} now has a synchronization dependency on Node ID "
f"{sync_dep}"
)
# Add children to the stack
children_chakra_ids = [child.id for child in json_node.children]
for child_chakra_id in sorted(children_chakra_ids, reverse=True):
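For reference, a hedged sketch of the classification order these hunks produce. The constants' values, the derive_keyword stub, and the standalone function shape are hypothetical (the diff elides the exact record_param_comms keyword expression); the branch order mirrors get_protobuf_node_type_from_json_node above.

```python
# Hedged sketch: constant values and stubs are hypothetical;
# the branch order mirrors the converter code above.
METADATA_NODE, COMM_SEND_NODE, COMM_RECV_NODE, COMM_COLL_NODE, COMP_NODE = range(5)

def derive_keyword(parent):
    # Stand-in: the real code derives the keyword from the
    # record_param_comms op (exact expression elided in the diff).
    return parent.name

def classify(json_node, json_node_map):
    if json_node.is_metadata_op():
        return METADATA_NODE
    if json_node.is_gpu_op():
        if "ncclDevKernel_SendRecv" in json_node.name:
            parent = json_node_map[json_node.parent]
            keyword = (
                derive_keyword(parent)
                if parent.name == "record_param_comms"
                else parent.name
            )
            # record_param_comms carries the process-group name for collectives.
            if parent.name == "record_param_comms" and parent.pg_name != "":
                json_node.pg_name = parent.pg_name
            if "send" in keyword:
                return COMM_SEND_NODE
            if "recv" in keyword:
                return COMM_RECV_NODE
            # NCCL implements all-to-all with point-to-point communications:
            # https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html
            if "nccl:all_to_all" in keyword:
                return COMM_COLL_NODE
        if "ncclKernel" in json_node.name or "ncclDevKernel" in json_node.name:
            return COMM_COLL_NODE
    return COMP_NODE
```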
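Similarly, a minimal sketch of the new sync-dependency guard in convert_ctrl_dep_to_data_dep: a dependency is kept only when its source node started strictly earlier. The standalone function and its parameter names are illustrative; the fields match the diff.

```python
import logging

# Hedged sketch: keep a sync dependency only when the prior node started
# strictly earlier; later-starting sources are HTA's false positives.
def add_sync_deps(current_node, json_node, protobuf_node_map):
    for sync_dep in json_node.sync_dep or []:
        if sync_dep in current_node.data_deps:
            continue
        prior_node = protobuf_node_map.get(sync_dep)
        if (
            prior_node is not None
            and prior_node.start_time_micros < current_node.start_time_micros
        ):
            current_node.data_deps.append(sync_dep)
            logging.debug(
                f"Node ID {current_node.id} now has a synchronization "
                f"dependency on Node ID {sync_dep}"
            )
```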
13 changes: 11 additions & 2 deletions src/converter/pytorch_node.py
@@ -47,7 +47,7 @@ class PyTorchNode:
pg_name (str): Process Group name for the inter-GPU communication.
"""

SUPPORTED_VERSIONS = ["1.0.2-chakra.0.0.4", "1.0.3-chakra.0.0.4", "1.1.0-chakra.0.0.4"]
SUPPORTED_VERSIONS = ["1.0.2-chakra.0.0.4", "1.0.3-chakra.0.0.4", "1.1.0-chakra.0.0.4", "1.1.1-chakra.0.0.4"]

def __init__(self, schema: str, node_data: Dict[str, Any]) -> None:
"""
@@ -86,7 +86,7 @@ def parse_data(self, node_data: Dict[str, Any]) -> None:
node_data (Dict[str, Any]): The node data to be parsed.
"""
if self.schema in self.SUPPORTED_VERSIONS:
if self.schema in ["1.0.2-chakra.0.0.4", "1.0.3-chakra.0.0.4", "1.1.0-chakra.0.0.4"]:
if self.schema in ["1.0.2-chakra.0.0.4", "1.0.3-chakra.0.0.4", "1.1.0-chakra.0.0.4", "1.1.1-chakra.0.0.4"]:
self._parse_data_1_0_3_chakra_0_0_4(node_data)
else:
raise ValueError(
@@ -137,6 +137,15 @@ def get_op_type(self) -> PyTorchNodeType:
else:
return PyTorchNodeType.LABEL

def is_metadata_op(self) -> bool:
"""
Check if the node is a METADATA operator.

Returns:
bool: True if the node is a METADATA operator, False otherwise.
"""
return self.get_op_type() == PyTorchNodeType.METADATA

def is_cpu_op(self) -> bool:
"""
Check if the node is a CPU operator.
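A hedged usage sketch for the new predicate. The import path is a guess from the repository layout, the file name is a placeholder, and the schema/nodes layout is assumed from PyTorch execution traces; only the schema string and method names come from this diff.

```python
import json

# Import path is a guess based on the repository layout (src/converter/).
from src.converter.pytorch_node import PyTorchNode

# Placeholder host-side execution trace produced by PyTorch's ET observer.
with open("rank0_host_trace.json") as f:
    trace = json.load(f)

schema = trace["schema"]  # "1.1.1-chakra.0.0.4" is now accepted
for node_data in trace["nodes"]:
    node = PyTorchNode(schema, node_data)
    if node.is_metadata_op():
        # The converter maps these to METADATA_NODE instead of COMP_NODE.
        print(f"metadata op: {node_data['name']}")
```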
2 changes: 1 addition & 1 deletion src/feeder/et_feeder.cpp
@@ -173,4 +173,4 @@ void ETFeeder::readNextWindow() {
dep_free_node_queue_.emplace(node);
}
}
}
}
5 changes: 2 additions & 3 deletions src/feeder/et_feeder.h
@@ -33,13 +33,12 @@ class ETFeeder {
void pushBackIssuableNode(uint64_t node_id);
std::shared_ptr<ETFeederNode> lookupNode(uint64_t node_id);
void freeChildrenNodes(uint64_t node_id);

private:
void readGlobalMetadata();
std::shared_ptr<ETFeederNode> readNode();
void readNextWindow();
void resolveDep();

private:
ProtoInputStream trace_;
const uint32_t window_size_;
bool et_complete_;
@@ -54,4 +53,4 @@
std::unordered_set<std::shared_ptr<ETFeederNode>> dep_unresolved_node_set_{};
};

} // namespace Chakra
} // namespace Chakra
54 changes: 54 additions & 0 deletions src/feeder/et_feeder_node.cpp
@@ -10,6 +10,18 @@ ETFeederNode::ETFeederNode(std::shared_ptr<ChakraProtoMsg::Node> node) {
this->runtime_ = node->duration_micros();
this->is_cpu_op_ = 0;

if (node->has_inputs()) {
this->inputs_values_ = static_cast<string>(node->inputs().values());
this->inputs_shapes_ = static_cast<string>(node->inputs().shapes());
this->inputs_types_ = static_cast<string>(node->inputs().types());
}

if (node->has_outputs()) {
this->outputs_values_ = static_cast<string>(node->outputs().values());
this->outputs_shapes_ = static_cast<string>(node->outputs().shapes());
this->outputs_types_ = static_cast<string>(node->outputs().types());
}

for (const auto& attr : node->attr()) {
const string& attr_name = attr.name();

@@ -144,3 +156,45 @@ uint32_t ETFeederNode::comm_tag() {
string ETFeederNode::pg_name() {
return pg_name_;
}

string ETFeederNode::get_inputs_values() const {
if (node_->has_inputs()) {
return inputs_values_;
}
return "";
}

string ETFeederNode::get_inputs_shapes() const {
if (node_->has_inputs()) {
return inputs_shapes_;
}
return "";
}

string ETFeederNode::get_inputs_types() const {
if (node_->has_inputs()) {
return inputs_types_;
}
return "";
}

string ETFeederNode::get_outputs_values() const {
if (node_->has_outputs()) {
return outputs_values_;
}
return "";
}

string ETFeederNode::get_outputs_shapes() const {
if (node_->has_outputs()) {
return outputs_shapes_;
}
return "";
}

string ETFeederNode::get_outputs_types() const {
if (node_->has_outputs()) {
return outputs_types_;
}
return "";
}
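A hedged usage sketch for the new getters. The trace path, node ID, include path, and the single-argument ETFeeder constructor are assumptions; lookupNode and the six getters are declared in et_feeder.h below.

```cpp
#include <iostream>
#include <memory>

// Include path is an assumption; adjust to the build's include roots.
#include "et_feeder.h"

int main() {
  // Placeholder path to a Chakra execution trace; the single-argument
  // constructor is assumed, not shown in this diff.
  Chakra::ETFeeder feeder("trace.0.et");

  // Node ID 1 is a placeholder; lookupNode is part of the public API.
  std::shared_ptr<Chakra::ETFeederNode> node = feeder.lookupNode(1);

  // Each getter returns the stored string, or "" when the protobuf
  // node has no inputs/outputs field set.
  std::cout << "inputs.values:  " << node->get_inputs_values() << "\n"
            << "inputs.shapes:  " << node->get_inputs_shapes() << "\n"
            << "inputs.types:   " << node->get_inputs_types() << "\n"
            << "outputs.values: " << node->get_outputs_values() << "\n"
            << "outputs.shapes: " << node->get_outputs_shapes() << "\n"
            << "outputs.types:  " << node->get_outputs_types() << "\n";
  return 0;
}
```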
12 changes: 12 additions & 0 deletions src/feeder/et_feeder_node.h
@@ -39,6 +39,12 @@ class ETFeederNode {
uint32_t comm_dst();
uint32_t comm_tag();
std::string pg_name();
std::string get_inputs_values() const;
std::string get_inputs_shapes() const;
std::string get_inputs_types() const;
std::string get_outputs_values() const;
std::string get_outputs_shapes() const;
std::string get_outputs_types() const;

private:
void assign_attr_val(
@@ -67,6 +73,12 @@
uint32_t comm_dst_;
uint32_t comm_tag_;
std::string pg_name_;
std::string inputs_values_;
std::string inputs_shapes_;
std::string inputs_types_;
std::string outputs_values_;
std::string outputs_shapes_;
std::string outputs_types_;
};

} // namespace Chakra