diff --git a/xprof/convert/smart_suggestion/BUILD b/xprof/convert/smart_suggestion/BUILD index 437dc2b3..0eaf0904 100644 --- a/xprof/convert/smart_suggestion/BUILD +++ b/xprof/convert/smart_suggestion/BUILD @@ -109,6 +109,21 @@ cc_library( ], ) +cc_library( + name = "infeed_rule", + hdrs = ["infeed_rule.h"], + deps = [ + ":constants", + ":signal_provider", + ":smart_suggestion_rule", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@org_xprof//plugin/xprof/protobuf:smart_suggestion_proto_cc", + ], +) + cc_library( name = "data_transfer_bound_rule", hdrs = ["data_transfer_bound_rule.h"], @@ -262,6 +277,7 @@ cc_library( ":data_transfer_bound_rule", ":debug_print_rule", ":host_processing_bound_rule", + ":infeed_rule", ":memory_bound_rule", ":smart_suggestion_rule_factory", ":sparse_core_offload_rule", diff --git a/xprof/convert/smart_suggestion/all_rules.h b/xprof/convert/smart_suggestion/all_rules.h index 6c79ecd6..ce85f1fa 100644 --- a/xprof/convert/smart_suggestion/all_rules.h +++ b/xprof/convert/smart_suggestion/all_rules.h @@ -23,6 +23,7 @@ limitations under the License. 
#include "xprof/convert/smart_suggestion/data_transfer_bound_rule.h" #include "xprof/convert/smart_suggestion/debug_print_rule.h" #include "xprof/convert/smart_suggestion/host_processing_bound_rule.h" +#include "xprof/convert/smart_suggestion/infeed_rule.h" #include "xprof/convert/smart_suggestion/memory_bound_rule.h" #include "xprof/convert/smart_suggestion/smart_suggestion_rule_factory.h" #include "xprof/convert/smart_suggestion/sparse_core_offload_rule.h" @@ -41,6 +42,7 @@ inline void RegisterAllRules(SmartSuggestionRuleFactory* f) { f->Register(); f->Register(); f->Register(); + f->Register(); f->Register(); f->Register(); f->Register(); @@ -51,6 +53,7 @@ inline void RegisterAllRules(SmartSuggestionRuleFactory* f) { inline void RegisterAllRulesFor3P(SmartSuggestionRuleFactory* f) { // go/keep-sorted start f->Register(); + f->Register(); // TODO Enable SparseCoreOffloadRule for 3P. // go/keep-sorted end } diff --git a/xprof/convert/smart_suggestion/constants.h b/xprof/convert/smart_suggestion/constants.h index 41a4f7cf..1b01e972 100644 --- a/xprof/convert/smart_suggestion/constants.h +++ b/xprof/convert/smart_suggestion/constants.h @@ -72,6 +72,10 @@ inline constexpr double kSpecialOpBoundThresholdInPercent = 10; // than this threshold, it is considered a bottleneck. inline constexpr double kDebugPrintBoundThresholdInPercent = 5; +// If the percentage of step time that is due to the infeed op is higher than +// this threshold, it is considered a bottleneck. +inline constexpr double kInfeedBoundThresholdInPercent = 10; + // If the percentage of async-done time is higher than this threshold, it is // considered a bottleneck. inline constexpr double kAsyncDoneThresholdInPercent = 10; diff --git a/xprof/convert/smart_suggestion/infeed_rule.h b/xprof/convert/smart_suggestion/infeed_rule.h new file mode 100644 index 00000000..764fb0e4 --- /dev/null +++ b/xprof/convert/smart_suggestion/infeed_rule.h @@ -0,0 +1,123 @@ +/* Copyright 2025 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef THIRD_PARTY_XPROF_CONVERT_SMART_SUGGESTION_INFEED_RULE_H_ +#define THIRD_PARTY_XPROF_CONVERT_SMART_SUGGESTION_INFEED_RULE_H_ + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" +#include "xprof/convert/smart_suggestion/constants.h" +#include "xprof/convert/smart_suggestion/signal_provider.h" +#include "xprof/convert/smart_suggestion/smart_suggestion_rule.h" +#include "plugin/xprof/protobuf/smart_suggestion.pb.h" + +namespace tensorflow { +namespace profiler { + +constexpr char kInfeedOpName[] = "infeed"; + +// Rule that fires when any host reaches kInfeedBoundThresholdInPercent. +class InfeedRule : public SmartSuggestionRule { + public: + bool MeetsConditions(const SignalProvider& signal_provider) const override { + auto host_stats = + signal_provider.GetPerHostAvgEventTimePercent(kInfeedOpName); + if (!host_stats.ok() || host_stats->empty()) { + return false; + } + for (const auto& host_stat : *host_stats) { + if (host_stat.second >= kInfeedBoundThresholdInPercent) { + return true; + } + } + return false; + } + + // Builds the suggestion text listing (or counting) hosts over the threshold. 
+ absl::StatusOr> GenerateSuggestion( + const SignalProvider& signal_provider) const override { + SmartSuggestion suggestion; + suggestion.set_rule_name("InfeedRule"); + // If MeetsConditions passed, GetPerHostAvgEventTimePercent is ok and has + // hosts with fractions >= kInfeedBoundThresholdInPercent. + absl::flat_hash_map high_infeed_hosts; + double max_percent = 0.0; + auto host_stats = + signal_provider.GetPerHostAvgEventTimePercent(kInfeedOpName); + if (host_stats.ok()) { + for (const auto& host_stat : *host_stats) { + if (host_stat.second >= kInfeedBoundThresholdInPercent) { + high_infeed_hosts.insert(host_stat); + if (host_stat.second > max_percent) { + max_percent = host_stat.second; + } + } + } + } + std::string infeed_hosts_list_html; + std::string infeed_suggestion; + if (high_infeed_hosts.size() > 5) { + infeed_hosts_list_html = absl::StrCat( + " ", high_infeed_hosts.size(), + " hosts have an average infeed time fraction above ", + absl::StrFormat("%.1f", kInfeedBoundThresholdInPercent), + "%."); + infeed_suggestion = + absl::StrCat("
  • Investigate Hosts with High Infeed Time:", + infeed_hosts_list_html, "
  • "); + } else { + std::vector host_entries; + for (const auto& [hostname, avg_percent] : high_infeed_hosts) { + host_entries.push_back(absl::StrCat( + "Host ", hostname, " ", + "average infeed time fraction: ", + absl::StrFormat("%.1f", avg_percent), "%")); + } + std::sort(host_entries.begin(), host_entries.end()); + infeed_hosts_list_html = absl::StrJoin(host_entries, ", "); + infeed_suggestion = absl::StrCat( + "
  • Investigate Hosts with High Infeed Time" + ": The following hosts have high infeed time fraction: ", + infeed_hosts_list_html, "
  • "); + } + + auto display_name = absl::StrCat(kInfeedOpName); + std::string suggestion_text = absl::StrCat( + "

    Your program is likely bottlenecked by ", display_name, + " operations: up to ", + absl::StrFormat("%.1f", max_percent), + "% of step time is spent on these operations on some hosts. " + "Please consider the following " + "optimizations:

    ", + "
      ", infeed_suggestion); + absl::StrAppend(&suggestion_text, "
    "); + + suggestion.set_suggestion_text(suggestion_text); + return suggestion; + } +}; + +} // namespace profiler +} // namespace tensorflow + +#endif // THIRD_PARTY_XPROF_CONVERT_SMART_SUGGESTION_INFEED_RULE_H_