From 1c1933f16f6f372c2f3eb9a88ba498e858f7456f Mon Sep 17 00:00:00 2001 From: Victor Li Date: Tue, 11 Mar 2025 20:42:03 -0700 Subject: [PATCH 01/11] MCMC algorithm draft --- .../mcmc/machine_mapping_mutation_set.h | 32 ++ .../include/compiler/mcmc/mcmc_algorithm.h | 22 ++ .../compiler/mcmc/mcmc_graph_optimize_state.h | 35 ++ .../unity_algorithm/graph_optimize_state.h | 4 +- .../mcmc/machine_mapping_mutation_set.cc | 110 ++++++ .../src/compiler/mcmc/mcmc_algorithm.cc | 320 ++++++++++++++++++ .../mcmc/mcmc_graph_optimize_state.cc | 84 +++++ .../test/src/compiler/mcmc/mcmc_algorithm.cc | 88 +++++ .../apply_substitution/apply_substitution.cc | 2 + .../operator_pattern/satisfies_constraint.cc | 5 + .../src/substitutions/pcg_pattern.cc | 13 + .../sub_parallel_computation_graph.cc | 55 ++- .../unlabelled/find_pattern_matches.cc | 8 +- .../unlabelled/pattern_matching.cc | 10 + 14 files changed, 757 insertions(+), 31 deletions(-) create mode 100644 lib/compiler/include/compiler/mcmc/machine_mapping_mutation_set.h create mode 100644 lib/compiler/include/compiler/mcmc/mcmc_algorithm.h create mode 100644 lib/compiler/include/compiler/mcmc/mcmc_graph_optimize_state.h create mode 100644 lib/compiler/src/compiler/mcmc/machine_mapping_mutation_set.cc create mode 100644 lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc create mode 100644 lib/compiler/src/compiler/mcmc/mcmc_graph_optimize_state.cc create mode 100644 lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc diff --git a/lib/compiler/include/compiler/mcmc/machine_mapping_mutation_set.h b/lib/compiler/include/compiler/mcmc/machine_mapping_mutation_set.h new file mode 100644 index 0000000000..e41aad2f71 --- /dev/null +++ b/lib/compiler/include/compiler/mcmc/machine_mapping_mutation_set.h @@ -0,0 +1,32 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MCMC_MACHINE_MAPPING_MUTATION_SET_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MCMC_MACHINE_MAPPING_MUTATION_SET_H + +#include 
"compiler/machine_mapping/machine_mapping.h" +#include "compiler/search_result.dtg.h" + +namespace FlexFlow { +std::vector + get_possible_mutations(SearchResult mapped_pcg, + MachineSpecification const &resource); +MachineMapping permute_layers(std::vector layers, + MachineMapping mapping); +MachineMapping copy_layer(parallel_layer_guid_t source, + parallel_layer_guid_t destination, + MachineMapping mapping); +MachineView change_stride(nonnegative_int stride, + parallel_layer_guid_t layer, + MachineView machine_view, + nonnegative_int dim); +MachineView change_node_idx(nonnegative_int node_ix, + parallel_layer_guid_t layer, + MachineView machine_view); +MachineView change_device_idx(nonnegative_int device_idx, + parallel_layer_guid_t layer, + MachineView machine_view); +MachineView change_projection(MachineSpecificationDimension projection, + parallel_layer_guid_t layer, + MachineView machine_view, + nonnegative_int dim); +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/mcmc/mcmc_algorithm.h b/lib/compiler/include/compiler/mcmc/mcmc_algorithm.h new file mode 100644 index 0000000000..53efa845cf --- /dev/null +++ b/lib/compiler/include/compiler/mcmc/mcmc_algorithm.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_COMPILER_MCMC_ALGORITHM_H +#define _FLEXFLOW_COMPILER_MCMC_ALGORITHM_H + +#include "compiler/cost_estimator/cost_estimator.h" +#include "compiler/search_result.dtg.h" +#include "compiler/unity_algorithm/unity_search_config.dtg.h" +#include "pcg/computation_graph.h" +#include "pcg/machine_specification.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" +#include "substitutions/sub_parallel_computation_graph.h" +#include "substitutions/substitution.h" + +namespace FlexFlow { + +SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, + CostEstimator const &cost_estimator, + MachineSpecification const &resources, + UnitySearchConfig const &search_config); + +} // namespace FlexFlow + +#endif diff --git 
a/lib/compiler/include/compiler/mcmc/mcmc_graph_optimize_state.h b/lib/compiler/include/compiler/mcmc/mcmc_graph_optimize_state.h new file mode 100644 index 0000000000..3306af123a --- /dev/null +++ b/lib/compiler/include/compiler/mcmc/mcmc_graph_optimize_state.h @@ -0,0 +1,35 @@ +#ifndef _FLEXFLOW_COMPILER_MCMC_ALGORITHM_STATE_H +#define _FLEXFLOW_COMPILER_MCMC_ALGORITHM_STATE_H + +#include "compiler/search_result.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" + +namespace FlexFlow { + +struct MCMCOptimizeState { + MCMCOptimizeState() = delete; + explicit MCMCOptimizeState(SearchResult const &mapped_pcg, float runtime); + + SearchResult mapped_pcg; + float runtime; + + bool operator==(MCMCOptimizeState const &other) const; + bool operator!=(MCMCOptimizeState const &other) const; + bool operator<(MCMCOptimizeState const &other) const; +}; + +std::string format_as(MCMCOptimizeState const &); +std::ostream &operator<<(std::ostream &, MCMCOptimizeState const &); + +} // namespace FlexFlow + +namespace std { + +template <> +struct hash<::FlexFlow::MCMCOptimizeState> { + size_t operator()(::FlexFlow::MCMCOptimizeState const &) const; +}; + +} // namespace std + +#endif diff --git a/lib/compiler/include/compiler/unity_algorithm/graph_optimize_state.h b/lib/compiler/include/compiler/unity_algorithm/graph_optimize_state.h index 5f06fd242c..9f609f3118 100644 --- a/lib/compiler/include/compiler/unity_algorithm/graph_optimize_state.h +++ b/lib/compiler/include/compiler/unity_algorithm/graph_optimize_state.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_COMPILER_MCMC_STATE_H -#define _FLEXFLOW_COMPILER_MCMC_STATE_H +#ifndef _FLEXFLOW_COMPILER_UNITY_ALGORITHM_STATE_H +#define _FLEXFLOW_COMPILER_UNITY_ALGORITHM_STATE_H #include "pcg/parallel_computation_graph/parallel_computation_graph.h" diff --git a/lib/compiler/src/compiler/mcmc/machine_mapping_mutation_set.cc b/lib/compiler/src/compiler/mcmc/machine_mapping_mutation_set.cc new file mode 100644 index 
0000000000..d67e4cb592 --- /dev/null +++ b/lib/compiler/src/compiler/mcmc/machine_mapping_mutation_set.cc @@ -0,0 +1,110 @@ +#include "compiler/mcmc/machine_mapping_mutation_set.h" +#include "compiler/machine_mapping/allowed_machine_views.h" +#include "pcg/machine_view.h" +#include "pcg/operator_task_space.h" +#include "utils/containers/vector_of.h" +#include "utils/vector.h" + +namespace FlexFlow { + +bool mutation_is_allowed(ParallelComputationGraph &pcg, + parallel_layer_guid_t layer, + MachineSpecification const &resources, + MachineView machine_view) { + OperatorTaskSpace task = get_operator_task_space(pcg, layer); + std::unordered_set allowed_machine_views = + get_allowed_machine_views(resources, task, DeviceType::GPU); + return allowed_machine_views.count(machine_view); +} + +std::vector + get_possible_mutations(SearchResult mapped_pcg, + MachineSpecification const &resources) { + //each mutation only changes one layer at a time + ParallelComputationGraph pcg = mapped_pcg.pcg; + std::vector layers = topological_ordering(pcg); + std::vector machine_mappings; + for (parallel_layer_guid_t layer : layers) { + MachineMapping original_mapping = mapped_pcg.machine_mapping; + MachineView machine_view = original_mapping.machine_views.at(layer); + OperatorTaskSpace task = get_operator_task_space(pcg, layer); + std::vector allowed_machine_views = + vector_of(get_allowed_machine_views(resources, task, DeviceType::GPU)); + + std::vector new_machine_mappings = + transform(allowed_machine_views, [&](MachineView machine_views) { + MachineMapping original_mapping = mapped_pcg.machine_mapping; + original_mapping.machine_views.at(layer) = machine_views; + return original_mapping; + }); + machine_mappings = concat(machine_mappings, new_machine_mappings); + } + return machine_mappings; +} + +MachineMapping permute_layers(std::vector layers, + MachineMapping mapping) { + NOT_IMPLEMENTED(); +} + +MachineMapping copy_layer(parallel_layer_guid_t source, + parallel_layer_guid_t 
destination, + MachineMapping mapping) { + std::unordered_map machine_views = + mapping.machine_views; + MachineView machine_view_to_copy = machine_views.at(source); + machine_views.try_emplace(destination, machine_view_to_copy); + return MachineMapping{machine_views}; +} + +MachineView change_stride(nonnegative_int stride, + parallel_layer_guid_t layer, + MachineView machine_view, + nonnegative_int dim) { + std::vector strides = get_strides(machine_view); + strides.at(dim.unwrap_nonnegative()) = stride_t{stride}; + MachineView new_machine_view = + machine_view_from_strides_and_machine_spec_dimensions( + machine_view.start, strides, get_dimensions(machine_view)); + return new_machine_view; +} + +MachineView change_node_idx(nonnegative_int node_ix, + parallel_layer_guid_t layer, + MachineView machine_view) { + MachineView new_machine_view = + machine_view_from_strides_and_machine_spec_dimensions( + MachineSpaceCoordinate{node_ix, + machine_view.start.device_idx, + machine_view.start.device_type}, + get_strides(machine_view), + get_dimensions(machine_view)); + return new_machine_view; +} + +MachineView change_device_idx(nonnegative_int device_idx, + parallel_layer_guid_t layer, + MachineView machine_view) { + MachineView new_machine_view = + machine_view_from_strides_and_machine_spec_dimensions( + MachineSpaceCoordinate{machine_view.start.node_idx, + device_idx, + machine_view.start.device_type}, + get_strides(machine_view), + get_dimensions(machine_view)); + return new_machine_view; +} + +MachineView change_projection(MachineSpecificationDimension projection, + parallel_layer_guid_t layer, + MachineView machine_view, + nonnegative_int dim) { + std::vector dims = + get_dimensions(machine_view); + dims.at(dim.unwrap_nonnegative()) = projection; + MachineView new_machine_view = + machine_view_from_strides_and_machine_spec_dimensions( + machine_view.start, get_strides(machine_view), dims); + return new_machine_view; +} +} // namespace FlexFlow diff --git 
a/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc b/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc new file mode 100644 index 0000000000..61f425fec6 --- /dev/null +++ b/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc @@ -0,0 +1,320 @@ +#include "compiler/mcmc/mcmc_algorithm.h" +#include "compiler/machine_mapping/allowed_machine_views.h" +#include "compiler/mcmc/machine_mapping_mutation_set.h" +#include "compiler/mcmc/mcmc_graph_optimize_state.h" +#include "compiler/task_graph_simulator/task_simulator.h" +#include "pcg/operator_task_space.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h" +#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" +#include "substitutions/apply_substitution/apply_substitution.h" +#include "substitutions/apply_substitution/evaluate_substitution_output.h" +#include "substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h" +#include "substitutions/open_parallel_tensor_guid_t.h" +#include "substitutions/pcg_pattern.h" +#include "substitutions/pcg_pattern_match.h" +#include "substitutions/sub_parallel_computation_graph.h" +#include "substitutions/sub_parallel_computation_graph_data.dtg.h" +#include "substitutions/sub_parallel_computation_graph_edge.h" +#include "substitutions/substitution.h" +#include "substitutions/unity_substitution_set.h" +#include "utils/containers/keys.h" +#include "utils/containers/merge_maps.h" +#include "utils/containers/restrict_keys.h" +#include "utils/containers/set_minus.h" +#include "utils/containers/transform.h" +#include "utils/containers/values.h" +#include "utils/deduplicated_priority_queue.h" +#include "utils/full_binary_tree/binary_tree_path.h" +#include "utils/graph/node/algorithms.h" +#include "utils/optional.h" + +namespace FlexFlow { + +std::optional + get_naive_mapping(ParallelComputationGraph &pcg, + MachineSpecification const &resources) { + std::vector layers = 
topological_ordering(pcg); + std::unordered_map machine_views; + for (parallel_layer_guid_t layer : layers) { + OperatorTaskSpace task = get_operator_task_space(pcg, layer); + std::unordered_set allowed_machine_views = + get_allowed_machine_views(resources, task, DeviceType::GPU); + if (allowed_machine_views.empty()) { + return std::nullopt; + } + machine_views.insert({layer, *(allowed_machine_views.begin())}); + } + return MachineMapping{machine_views}; +} + +SearchResult apply_substitution_and_update_machine_mapping( + SearchResult const &mapped_pcg, + Substitution const &sub, + PCGPatternMatch const &match) { + // std::cout << "applying substitution" << std::endl; + SubParallelComputationGraph spcg = sub_pcg_from_full_pcg(mapped_pcg.pcg); + + auto substitution_output_result = + evaluate_substitution_output(spcg, sub, match); + SubParallelComputationGraph substitution_output_graph = + substitution_output_result.first; + OutputExprToResultSubPCGMapping output_expr_to_result_sub_pcg_mapping = + substitution_output_result.second; + + SubParallelComputationGraphData output_graph_data = + get_sub_pcg_data(substitution_output_graph); + SubParallelComputationGraphData pre_data = get_sub_pcg_data(spcg); + + std::unordered_set pre_nodes = + keys(pre_data.node_data); + std::unordered_set matched_nodes = + unordered_set_of(values(match.node_assignment)); + std::unordered_set post_nodes_from_original_graph = + set_minus(pre_nodes, matched_nodes); + + std::unordered_map machine_views = + mapped_pcg.machine_mapping.machine_views; + + std::unordered_set substituted_machine_views = + transform(matched_nodes, [&](parallel_layer_guid_t const &node) { + return machine_views.at(node); + }); + + std::unordered_map post_node_data = + [&] { + std::unordered_map + post_node_data_from_orig = restrict_keys( + pre_data.node_data, post_nodes_from_original_graph); + std::unordered_map + post_node_data_from_sub = output_graph_data.node_data; + + // just taking the first substituted machine 
view, not sure if this + // is fine + for (auto [layer, attrs] : post_node_data_from_sub) { + machine_views.try_emplace(layer, *substituted_machine_views.begin()); + } + + return merge_disjoint_maps(post_node_data_from_orig, + post_node_data_from_sub); + }(); + + std::unordered_set post_edges = [&] { + std::unordered_set post_edges_from_orig = + filter(pre_data.edges, [&](SubParallelComputationGraphEdge const &e) { + if (e.raw_edge.has()) { + return true; + } else { + DataflowEdge dfe = e.raw_edge.get(); + parallel_layer_guid_t src = parallel_layer_guid_t{dfe.src.node}; + parallel_layer_guid_t dst = parallel_layer_guid_t{dfe.dst.node}; + return !(contains(matched_nodes, src) || + contains(matched_nodes, dst)); + } + }); + + std::unordered_set post_edges_from_sub = + filter(output_graph_data.edges, + [&](SubParallelComputationGraphEdge const &e) { + return !e.raw_edge.has(); + }); + + bidict + output_orig_pattern_mapping = get_output_mapping_for_pcg_pattern_match( + match, sub.pcg_pattern, spcg); + bidict + output_post_outexpr_mapping = get_output_graph_expr_output_mapping( + output_expr_to_result_sub_pcg_mapping, + sub.output_graph_expr, + substitution_output_graph); + + std::unordered_set incoming_to_sub_edges; + for (auto const &[pattern_input, base_graph_tensor] : + match.input_assignment) { + OutputGraphExprInput output_expr_input = + sub.inputs_mapping.at_l(pattern_input); + input_parallel_tensor_guid_t output_graph_input = + output_expr_to_result_sub_pcg_mapping.input_mapping.at_r( + output_expr_input); + std::unordered_set uses = get_parallel_tensor_uses( + substitution_output_graph, + open_parallel_tensor_guid_from_input(output_graph_input)); + for (parallel_tensor_use_t const &use : uses) { + SubParallelComputationGraphEdge new_edge = + subpcg_edge_from_tensor_and_use(base_graph_tensor, use); + incoming_to_sub_edges.insert(new_edge); + } + } + + std::unordered_set outgoing_from_sub_edges; + for (ParallelComputationGraphEdge const &outgoing_edge : + 
get_subgraph_outgoing_edges(spcg, matched_nodes)) { + parallel_tensor_guid_t original_tensor = + get_parallel_tensor(outgoing_edge); + PatternNodeOutput pattern_tensor = + output_orig_pattern_mapping.at_r(original_tensor); + OutputGraphExprNodeOutput output_graph_tensor = + sub.outputs_mapping.at_l(pattern_tensor); + parallel_tensor_guid_t new_tensor = + output_post_outexpr_mapping.at_r(output_graph_tensor); + + SubParallelComputationGraphEdge new_edge = + subpcg_edge_from_tensor_and_dst( + new_tensor, + get_dst_layer(outgoing_edge), + get_dst_layer_input_idx(outgoing_edge)); + outgoing_from_sub_edges.insert(new_edge); + } + + return set_union(std::vector{ + post_edges_from_orig, + post_edges_from_sub, + incoming_to_sub_edges, + outgoing_from_sub_edges, + }); + }(); + + std::unordered_set post_inputs = + pre_data.inputs; + + std::unordered_map + post_value_data = [&] { + std::unordered_map + post_value_data_from_orig = filter_keys( + pre_data.value_data, [&](open_parallel_tensor_guid_t const &t) { + return visit_open_parallel_tensor_guid( + t, + overload{ + [&](parallel_tensor_guid_t const &t) { + return contains(post_nodes_from_original_graph, + get_source_layer(t)); + }, + [](input_parallel_tensor_guid_t const &) { + return true; + }, + }); + }); + + std::unordered_map + post_value_data_from_sub = output_graph_data.value_data; + return merge_disjoint_maps(post_value_data_from_orig, + post_value_data_from_sub); + }(); + + SubParallelComputationGraphData post_data = SubParallelComputationGraphData{ + post_node_data, + post_edges, + post_inputs, + post_value_data, + }; + + return SearchResult{ + pcg_from_sub_pcg_by_dropping_inputs(sub_pcg_from_graph_data(post_data)), + MachineMapping{machine_views}}; +} + +std::vector all_pcgs_obtained_by_applying_a_substitution( + SearchResult const &mapped_pcg, + std::vector const &substitutions) { + std::vector results; + SubParallelComputationGraph subpcg = sub_pcg_from_full_pcg(mapped_pcg.pcg); + // std::cout << "len" << 
substitutions.size() << std::endl; + for (Substitution const &substitution : substitutions) { + std::cout << "in outer loop" << std::endl; + for (PCGPatternMatch const &pattern_match : + find_pattern_matches(substitution.pcg_pattern, subpcg)) { + std::cout << "getting stuff" << std::endl; + SearchResult mapped_pcg_from_substitution = + apply_substitution_and_update_machine_mapping( + mapped_pcg, substitution, pattern_match); + results.push_back(mapped_pcg_from_substitution); + } + } + return results; +} + +SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, + CostEstimator const &cost_estimator, + MachineSpecification const &resources, + UnitySearchConfig const &search_config) { + + std::vector substitutions = get_substitution_set(resources); + DeduplicatedPriorityQueue candidates; + + std::optional naive_mapping = + get_naive_mapping(pcg, resources); + if (naive_mapping == std::nullopt) { + throw std::runtime_error("Failed to find any solutions"); + } + + // multiply runtime by -1 to make it minheap instead of maxheap + MCMCOptimizeState best_state = MCMCOptimizeState{ + SearchResult{pcg, naive_mapping.value()}, + -1 * task_simulator_estimate_forward_pass_time( + pcg, cost_estimator, naive_mapping.value(), resources)}; + + candidates.push(best_state); + + for (int iteration = 0; + !candidates.empty() && iteration < search_config.budget; + ++iteration) { + MCMCOptimizeState current_state = candidates.top(); + candidates.pop(); + + SearchResult current_mapped_pcg = current_state.mapped_pcg; + float current_estimate = current_state.runtime * -1; + float best_estimate = best_state.runtime * -1; + + if (current_estimate < best_estimate) { + best_state = current_state; + std::cout << "new best state" << std::endl; + std::cout << current_estimate << std::endl; + std::cout << best_estimate << std::endl; + } else if (current_estimate > best_estimate * search_config.alpha) { + continue; + } else { + std::cout << current_estimate << best_estimate * 
search_config.alpha + << std::endl; + } + // std::cout << "Hello" << std::endl; + + for (SearchResult const &new_mapped_pcg : + all_pcgs_obtained_by_applying_a_substitution(current_mapped_pcg, + substitutions)) { + float new_estimate = task_simulator_estimate_forward_pass_time( + new_mapped_pcg.pcg, + cost_estimator, + new_mapped_pcg.machine_mapping, + resources); + + std::cout << "new substitution" << std::endl; + + std::cout << "new estimate" << new_estimate << std::endl; + if (new_estimate <= search_config.threshold && + get_nodes(new_mapped_pcg.pcg.raw_graph).size() <= + search_config.max_num_ops) { + candidates.push(MCMCOptimizeState{new_mapped_pcg, -1 * new_estimate}); + } + } + + for (MachineMapping const &new_machine_mapping : + get_possible_mutations(current_mapped_pcg, resources)) { + float new_estimate = + task_simulator_estimate_forward_pass_time(current_mapped_pcg.pcg, + cost_estimator, + new_machine_mapping, + resources); + //std::cout << "new mapping" << std::endl; + + //std::cout << "new estimate" << new_estimate << std::endl; + if (new_estimate <= search_config.threshold) { + //std::cout << "pushed" << std::endl; + candidates.push( + MCMCOptimizeState{SearchResult{current_mapped_pcg.pcg, new_machine_mapping}, -1 * new_estimate}); + } + } + } + return best_state.mapped_pcg; +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/mcmc/mcmc_graph_optimize_state.cc b/lib/compiler/src/compiler/mcmc/mcmc_graph_optimize_state.cc new file mode 100644 index 0000000000..2556a50b4d --- /dev/null +++ b/lib/compiler/src/compiler/mcmc/mcmc_graph_optimize_state.cc @@ -0,0 +1,84 @@ +#include "compiler/mcmc/mcmc_graph_optimize_state.h" +#include "pcg/machine_view.h" +#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" + +namespace FlexFlow { + +MCMCOptimizeState::MCMCOptimizeState(SearchResult const &mapped_pcg, + float runtime) + : mapped_pcg(mapped_pcg), runtime(runtime) {} + +bool MCMCOptimizeState::operator==(MCMCOptimizeState const 
&other) const { + return pcgs_are_isomorphic(mapped_pcg.pcg, other.mapped_pcg.pcg) && + mapped_pcg.machine_mapping == other.mapped_pcg.machine_mapping && + runtime == other.runtime; +} + +bool MCMCOptimizeState::operator!=(MCMCOptimizeState const &other) const { + return !(*this == other); +} + +bool MCMCOptimizeState::operator<(MCMCOptimizeState const &other) const { + return runtime < other.runtime; +} + +std::string format_as(MCMCOptimizeState const &r) { + return fmt::format("", + as_dot(r.mapped_pcg.pcg), + r.mapped_pcg.machine_mapping, + r.runtime); +} + +std::ostream &operator<<(std::ostream &s, MCMCOptimizeState const &st) { + return (s << fmt::to_string(st)); +} +} // namespace FlexFlow + +namespace std { + +size_t hash<::FlexFlow::MCMCOptimizeState>::operator()( + ::FlexFlow::MCMCOptimizeState const &state) const { + ::FlexFlow::ParallelComputationGraph pcg = state.mapped_pcg.pcg; + ::FlexFlow::MachineMapping machine_mapping = state.mapped_pcg.machine_mapping; + size_t seed = 0; + ::FlexFlow::hash_combine(seed, state.runtime); + std::vector<::FlexFlow::parallel_layer_guid_t> layers = + topological_ordering(pcg); + ::FlexFlow::hash_combine(seed, layers.size()); + for (::FlexFlow::parallel_layer_guid_t const &layer : layers) { + ::FlexFlow::hash_combine(seed, get_parallel_layer_attrs(pcg, layer)); + std::vector<::FlexFlow::parallel_tensor_guid_t> inputs = + get_incoming_tensors(pcg, layer); + ::FlexFlow::hash_combine(seed, inputs.size()); + for (::FlexFlow::parallel_tensor_guid_t input : inputs) { + for (size_t i = 0; i < layers.size(); ++i) { + if (get_source_layer(input) == layers.at(i)) { + ::FlexFlow::hash_combine(seed, i); + break; + } + } + } + ::FlexFlow::MachineView machine_view = + machine_mapping.machine_views.at(layer); + ::FlexFlow::hash_combine(seed, machine_view.start.node_idx); + ::FlexFlow::hash_combine(seed, machine_view.start.device_idx); + if (get_device_type(machine_view) == ::FlexFlow::DeviceType::CPU) { + ::FlexFlow::hash_combine(seed, 
0); + } else { + ::FlexFlow::hash_combine(seed, 1); + } + for (::FlexFlow::MachineViewDimension dimension : machine_view.dimensions) { + ::FlexFlow::hash_combine(seed, dimension.stride.unwrapped); + if (dimension.projection == + ::FlexFlow::MachineSpecificationDimension::INTRA_NODE) { + ::FlexFlow::hash_combine(seed, 0); + } else { + ::FlexFlow::hash_combine(seed, 1); + } + } + } + + return seed; +} + +} // namespace std diff --git a/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc b/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc new file mode 100644 index 0000000000..d441db199f --- /dev/null +++ b/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc @@ -0,0 +1,88 @@ +#include "compiler/mcmc/mcmc_algorithm.h" +#include "../cost_estimator_for_test.h" +#include "doctest/doctest.h" +#include "op-attrs/parallel_tensor_dims.h" +#include "op-attrs/parallel_tensor_shape.dtg.h" +#include "op-attrs/replica_type.dtg.h" +#include "op-attrs/shard_parallel_dim.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" +#include "pcg/pcg_from_computation_graph.h" +#include "utils/integer_conversions.h" +#include "compiler/task_graph_simulator/task_simulator.h" + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("mcmc_graph_optimize") { + ComputationGraph cg = [&] { + ComputationGraphBuilder b; + TensorShape input_tensor_shape = TensorShape{ + TensorDims{ + FFOrdered{nonnegative_int{32}, + nonnegative_int{64}}, + }, + DataType::FLOAT, + }; + tensor_guid_t t = b.create_input(input_tensor_shape, CreateGrad::YES); + t = b.dense(t, + /*outDim=*/nonnegative_int{16}, + /*activation=*/std::nullopt); + t = b.gelu(t); + t = b.dense(t, + /*outDim=*/nonnegative_int{12}, + /*activation=*/std::nullopt, + /*use_bias=*/false, + /*data_type=*/DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt); + t = b.relu(t); + t = b.dense(t, + 
/*outDim=*/nonnegative_int{8}, + /*activation=*/Activation::RELU); + return b.computation_graph; + }(); + + ParallelComputationGraph pcg = pcg_from_computation_graph(cg); + + CostEstimator cost_estimator = make_fake_cost_estimator( + [](OpCostEstimateKey const &k) { + return OpCostMetrics{ + /*forward_runtime=*/1.0, + /*backward_runtime=*/2.0, + /*memory=*/nonnegative_int{1}, + }; + }, + [](TensorSetMovement const &) { return 1.0; }); + + MachineSpecification full_machine_spec = MachineSpecification{ + /*num_nodes=*/nonnegative_int{2}, + /*num_cpus_per_node=*/nonnegative_int{1}, + /*num_gpus_per_node=*/nonnegative_int{1}, + /*inter_node_bandwidth=*/1, + /*intra_node_bandwidth=*/1, + }; + + UnitySearchConfig search_config = UnitySearchConfig{ + /*alpha=*/1.2, + /*budget=*/10, + /*threshold=*/30.0, + /*max_num_ops=*/100, + }; + + SearchResult result = mcmc_graph_optimize( + pcg, cost_estimator, full_machine_spec, search_config); + + std::cout << task_simulator_estimate_forward_pass_time(result.pcg, + cost_estimator, + result.machine_mapping, + full_machine_spec) << std::endl; + + CHECK(task_simulator_estimate_forward_pass_time(result.pcg, + cost_estimator, + result.machine_mapping, + full_machine_spec) < 16); + + CHECK(false); + } +} diff --git a/lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc b/lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc index 61bfe15d7b..f1354264f8 100644 --- a/lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc +++ b/lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc @@ -159,6 +159,8 @@ SubParallelComputationGraph post_value_data, }; + std::cout << as_dot(sub_pcg_from_graph_data(post_data)) << std::endl; + return sub_pcg_from_graph_data(post_data); } diff --git a/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc b/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc index 
194ae49255..40c69bf4c8 100644 --- a/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc +++ b/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc @@ -13,9 +13,14 @@ bool operator_satisfies_constraint( return false; } + // std::cout << constraint.constraint_type << std::endl; switch (constraint.constraint_type) { case ConstraintType::EQUAL: return expr_val.value() == constraint.attribute_value; + case ConstraintType::DIVISIBLE_BY: + return (expr_val.value().get() % + constraint.attribute_value.get()) == + 0; default: throw mk_runtime_error( fmt::format("Unknown constraint type {}", diff --git a/lib/substitutions/src/substitutions/pcg_pattern.cc b/lib/substitutions/src/substitutions/pcg_pattern.cc index a0af875848..3277789a57 100644 --- a/lib/substitutions/src/substitutions/pcg_pattern.cc +++ b/lib/substitutions/src/substitutions/pcg_pattern.cc @@ -23,6 +23,8 @@ std::unordered_set get_nodes(PCGPattern const &p) { static MatchAdditionalCriterion pcg_pattern_criteria(PCGPattern const &pattern, SubParallelComputationGraph const &pcg) { + // std::cout << "GGETTING pattern criteria" << std::endl; + // std::cout << get_nodes(pattern) << std::endl; return MatchAdditionalCriterion{ [&](PatternNode const &patternNode, Node const &pcgNode) { return operator_satisfies_pattern( @@ -40,6 +42,8 @@ static MatchAdditionalCriterion std::vector find_pattern_matches(PCGPattern const &pattern, SubParallelComputationGraph const &pcg) { + + // std::cout << "IN PATTERN MATCH"<< std::endl; std::vector unlabelled_matches = find_pattern_matches(get_unlabelled_pattern(pattern), pcg.raw_graph, @@ -65,11 +69,20 @@ UnlabelledGraphPattern get_unlabelled_pattern(PCGPattern const &p) { TensorAttributePattern get_tensor_pattern(PCGPattern const &p, PatternValue const &v) { + + // std::cout << "get tensor pattern"<< std::endl; + // std::cout << v << std::endl; + // std::cout << raw_open_dataflow_value_from_pattern_value(v) << std::endl; + 
TensorAttributePattern t = + p.raw_graph.at(raw_open_dataflow_value_from_pattern_value(v)); + // std::cout << "hmm" << std::endl; return p.raw_graph.at(raw_open_dataflow_value_from_pattern_value(v)); } OperatorAttributePattern get_operator_pattern(PCGPattern const &p, PatternNode const &n) { + + // std::cout << "get op pattern"<< std::endl; return p.raw_graph.at(n.raw_node); } diff --git a/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc b/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc index 83df74f21b..0c673f0a8a 100644 --- a/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc +++ b/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc @@ -188,34 +188,33 @@ bool sub_pcgs_are_isomorphic(SubParallelComputationGraph const &lhs, } std::string as_dot(SubParallelComputationGraph const &spcg) { - NOT_IMPLEMENTED(); - // std::function get_node_label = - // [](ParallelLayerAttrs const &a) -> std::string { - // RecordFormatter r = as_dot(a.op_attrs); - // - // if (a.name.has_value()) { - // RecordFormatter rr; - // rr << "Name" << a.name.value(); - // r << rr; - // } - // - // std::ostringstream oss; - // oss << r; - // return oss.str(); - // }; - // - // std::function get_input_label = - // [](ParallelTensorAttrs const &a) -> std::string { - // RecordFormatter r; - // - // r << fmt::to_string(a.shape); - // - // std::ostringstream oss; - // oss << r; - // return oss.str(); - // }; - // - // return as_dot(spcg.raw_graph, get_node_label, get_input_label); + std::function get_node_label = + [](ParallelLayerAttrs const &a) -> std::string { + RecordFormatter r = as_dot(a.op_attrs); + + if (a.name.has_value()) { + RecordFormatter rr; + rr << "Name" << a.name.value(); + r << rr; + } + + std::ostringstream oss; + oss << r; + return oss.str(); + }; + + std::function get_input_label = + [](ParallelTensorAttrs const &a) -> std::string { + RecordFormatter r; + + r << fmt::to_string(a.shape); + + 
std::ostringstream oss; + oss << r; + return oss.str(); + }; + + return as_dot(spcg.raw_graph, get_node_label, get_input_label); } void debug_print_dot(SubParallelComputationGraph const &spcg) { diff --git a/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc b/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc index a7ebc0bff7..1142333f48 100644 --- a/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc +++ b/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc @@ -71,16 +71,22 @@ std::vector find_pattern_matches(UnlabelledGraphPattern const &pattern, OpenDataflowGraphView const &graph, MatchAdditionalCriterion const &additional_criterion) { + // std::cout << "find pattern matches" << std::endl; std::vector matches; if (is_singleton_pattern(pattern)) { + // std::cout << "singleton pattern" << std::endl; for (Node const &graph_node : get_nodes(graph)) { + // std::cout << "11111" << std::endl; std::optional candidate = get_candidate_singleton_match(pattern, graph, graph_node); + // std::cout << "22222" << std::endl; if (candidate.has_value() && unlabelled_pattern_does_match( pattern, graph, candidate.value(), additional_criterion)) { + // std::cout << "2.555" << std::endl; matches.push_back(candidate.value()); } + // std::cout << "33333" << std::endl; } } else { PatternSplit split = find_even_split(pattern); @@ -110,7 +116,7 @@ std::vector } } } - + // std::cout << "return from pattern matches" << std::endl; return matches; } diff --git a/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc b/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc index 304bb8cf46..e4285e37bf 100644 --- a/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc +++ b/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc @@ -97,24 +97,30 @@ bool pattern_matches_subgraph_under( &full_graph_values_to_subgraph_inputs, UnlabelledDataflowGraphPatternMatch const 
&match, MatchAdditionalCriterion const &additional_criterion) { + // std::cout << "pattern amtches subgrpah under" << std::endl; SubgraphConcreteFromPattern concrete_from_pattern{ match, full_graph_values_to_subgraph_inputs}; std::unordered_set concrete_nodes = get_nodes(subgraph); std::unordered_set concrete_nodes_from_match = transform(get_nodes(pattern), concrete_from_pattern); + // std::cout << "mid of pattern amtches subgrpah under" << std::endl; if (concrete_nodes != concrete_nodes_from_match) { return false; } for (PatternNode const &pattern_node : get_nodes(pattern)) { + // std::cout << "hello hello hello" << std::endl; if (!additional_criterion.node_criterion( pattern_node, concrete_from_pattern(pattern_node))) { + // std::cout << "hello hello hello hello hello" << std::endl; return false; } } + // std::cout << "later mid of pattern amtches subgrpah under" << std::endl; + std::unordered_set concrete_edges = get_edges(subgraph); std::unordered_set concrete_edge_from_match = transform(get_edges(pattern), concrete_from_pattern); @@ -138,6 +144,7 @@ bool pattern_matches_subgraph_under( return false; } } + // std::cout << "end of pattern amtches subgrpah under" << std::endl; return true; } @@ -147,12 +154,14 @@ bool unlabelled_pattern_does_match( OpenDataflowGraphView const &graph, UnlabelledDataflowGraphPatternMatch const &match, MatchAdditionalCriterion const &additional_criterion) { + // std::cout << "unlabelled_pattern_does_match" << std::endl; OpenDataflowSubgraphResult subgraph_result = subgraph_matched(graph, match); OpenDataflowGraphView matched_subgraph = subgraph_result.graph; assert(left_entries(match.node_assignment) == get_nodes(pattern)); assert(right_entries(match.node_assignment) == get_nodes(matched_subgraph)); + // std::cout << "middle of" << std::endl; MatchAdditionalCriterion through_subgraph_operation = MatchAdditionalCriterion{ @@ -171,6 +180,7 @@ bool unlabelled_pattern_does_match( }}); }, }; + // std::cout << "end of 
unlabelled_pattern_does_match" << std::endl; return pattern_matches_subgraph_under( pattern, From fe6d9505f8ea11d6ed8d421663b0f1eb1364db7b Mon Sep 17 00:00:00 2001 From: Victor Li Date: Wed, 12 Mar 2025 03:02:45 -0700 Subject: [PATCH 02/11] removing substitution part of MCMC for now --- .envrc | 3 ++ .vimrc | 8 +++++ .../src/compiler/mcmc/mcmc_algorithm.cc | 25 +++----------- .../test/src/compiler/mcmc/mcmc_algorithm.cc | 6 ---- .../src/substitutions/pcg_pattern.cc | 13 -------- .../unlabelled/find_pattern_matches.cc | 7 ---- .../unlabelled/pattern_matching.cc | 33 +++++++++++++------ 7 files changed, 39 insertions(+), 56 deletions(-) create mode 100644 .envrc create mode 100644 .vimrc diff --git a/.envrc b/.envrc new file mode 100644 index 0000000000..2797f0f929 --- /dev/null +++ b/.envrc @@ -0,0 +1,3 @@ +source_up_if_exists + +use flake diff --git a/.vimrc b/.vimrc new file mode 100644 index 0000000000..4c8a8a8279 --- /dev/null +++ b/.vimrc @@ -0,0 +1,8 @@ +" example search path configuration +set path=lib/runtime/**,lib/** + +" set build target +" let g:target = "pcg" + +" set test target +" let g:test_target = "utils-test" diff --git a/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc b/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc index 61f425fec6..025fb586c6 100644 --- a/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc +++ b/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc @@ -28,6 +28,8 @@ #include "utils/full_binary_tree/binary_tree_path.h" #include "utils/graph/node/algorithms.h" #include "utils/optional.h" +#include "utils/graph/labelled_open_dataflow_graph/algorithms/as_dot.h" + namespace FlexFlow { @@ -52,7 +54,6 @@ SearchResult apply_substitution_and_update_machine_mapping( SearchResult const &mapped_pcg, Substitution const &sub, PCGPatternMatch const &match) { - // std::cout << "applying substitution" << std::endl; SubParallelComputationGraph spcg = sub_pcg_from_full_pcg(mapped_pcg.pcg); auto substitution_output_result = @@ -217,19 +218,17 @@ 
std::vector all_pcgs_obtained_by_applying_a_substitution( SearchResult const &mapped_pcg, std::vector const &substitutions) { std::vector results; - SubParallelComputationGraph subpcg = sub_pcg_from_full_pcg(mapped_pcg.pcg); - // std::cout << "len" << substitutions.size() << std::endl; + //currently not functional + /*SubParallelComputationGraph subpcg = sub_pcg_from_full_pcg(mapped_pcg.pcg); for (Substitution const &substitution : substitutions) { - std::cout << "in outer loop" << std::endl; for (PCGPatternMatch const &pattern_match : find_pattern_matches(substitution.pcg_pattern, subpcg)) { - std::cout << "getting stuff" << std::endl; SearchResult mapped_pcg_from_substitution = apply_substitution_and_update_machine_mapping( mapped_pcg, substitution, pattern_match); results.push_back(mapped_pcg_from_substitution); } - } + }*/ return results; } @@ -267,16 +266,9 @@ SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, if (current_estimate < best_estimate) { best_state = current_state; - std::cout << "new best state" << std::endl; - std::cout << current_estimate << std::endl; - std::cout << best_estimate << std::endl; } else if (current_estimate > best_estimate * search_config.alpha) { continue; - } else { - std::cout << current_estimate << best_estimate * search_config.alpha - << std::endl; } - // std::cout << "Hello" << std::endl; for (SearchResult const &new_mapped_pcg : all_pcgs_obtained_by_applying_a_substitution(current_mapped_pcg, @@ -287,9 +279,6 @@ SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, new_mapped_pcg.machine_mapping, resources); - std::cout << "new substitution" << std::endl; - - std::cout << "new estimate" << new_estimate << std::endl; if (new_estimate <= search_config.threshold && get_nodes(new_mapped_pcg.pcg.raw_graph).size() <= search_config.max_num_ops) { @@ -304,11 +293,7 @@ SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, cost_estimator, new_machine_mapping, resources); - //std::cout << "new mapping" 
<< std::endl; - - //std::cout << "new estimate" << new_estimate << std::endl; if (new_estimate <= search_config.threshold) { - //std::cout << "pushed" << std::endl; candidates.push( MCMCOptimizeState{SearchResult{current_mapped_pcg.pcg, new_machine_mapping}, -1 * new_estimate}); } diff --git a/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc b/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc index d441db199f..a7ffa8e5e0 100644 --- a/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc +++ b/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc @@ -73,16 +73,10 @@ TEST_SUITE(FF_TEST_SUITE) { SearchResult result = mcmc_graph_optimize( pcg, cost_estimator, full_machine_spec, search_config); - std::cout << task_simulator_estimate_forward_pass_time(result.pcg, - cost_estimator, - result.machine_mapping, - full_machine_spec) << std::endl; - CHECK(task_simulator_estimate_forward_pass_time(result.pcg, cost_estimator, result.machine_mapping, full_machine_spec) < 16); - CHECK(false); } } diff --git a/lib/substitutions/src/substitutions/pcg_pattern.cc b/lib/substitutions/src/substitutions/pcg_pattern.cc index 3277789a57..a0af875848 100644 --- a/lib/substitutions/src/substitutions/pcg_pattern.cc +++ b/lib/substitutions/src/substitutions/pcg_pattern.cc @@ -23,8 +23,6 @@ std::unordered_set get_nodes(PCGPattern const &p) { static MatchAdditionalCriterion pcg_pattern_criteria(PCGPattern const &pattern, SubParallelComputationGraph const &pcg) { - // std::cout << "GGETTING pattern criteria" << std::endl; - // std::cout << get_nodes(pattern) << std::endl; return MatchAdditionalCriterion{ [&](PatternNode const &patternNode, Node const &pcgNode) { return operator_satisfies_pattern( @@ -42,8 +40,6 @@ static MatchAdditionalCriterion std::vector find_pattern_matches(PCGPattern const &pattern, SubParallelComputationGraph const &pcg) { - - // std::cout << "IN PATTERN MATCH"<< std::endl; std::vector unlabelled_matches = find_pattern_matches(get_unlabelled_pattern(pattern), 
pcg.raw_graph, @@ -69,20 +65,11 @@ UnlabelledGraphPattern get_unlabelled_pattern(PCGPattern const &p) { TensorAttributePattern get_tensor_pattern(PCGPattern const &p, PatternValue const &v) { - - // std::cout << "get tensor pattern"<< std::endl; - // std::cout << v << std::endl; - // std::cout << raw_open_dataflow_value_from_pattern_value(v) << std::endl; - TensorAttributePattern t = - p.raw_graph.at(raw_open_dataflow_value_from_pattern_value(v)); - // std::cout << "hmm" << std::endl; return p.raw_graph.at(raw_open_dataflow_value_from_pattern_value(v)); } OperatorAttributePattern get_operator_pattern(PCGPattern const &p, PatternNode const &n) { - - // std::cout << "get op pattern"<< std::endl; return p.raw_graph.at(n.raw_node); } diff --git a/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc b/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc index 1142333f48..0f26ce93fa 100644 --- a/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc +++ b/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc @@ -71,22 +71,16 @@ std::vector find_pattern_matches(UnlabelledGraphPattern const &pattern, OpenDataflowGraphView const &graph, MatchAdditionalCriterion const &additional_criterion) { - // std::cout << "find pattern matches" << std::endl; std::vector matches; if (is_singleton_pattern(pattern)) { - // std::cout << "singleton pattern" << std::endl; for (Node const &graph_node : get_nodes(graph)) { - // std::cout << "11111" << std::endl; std::optional candidate = get_candidate_singleton_match(pattern, graph, graph_node); - // std::cout << "22222" << std::endl; if (candidate.has_value() && unlabelled_pattern_does_match( pattern, graph, candidate.value(), additional_criterion)) { - // std::cout << "2.555" << std::endl; matches.push_back(candidate.value()); } - // std::cout << "33333" << std::endl; } } else { PatternSplit split = find_even_split(pattern); @@ -116,7 +110,6 @@ std::vector } } } - // 
std::cout << "return from pattern matches" << std::endl; return matches; } diff --git a/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc b/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc index e4285e37bf..cfb34aac3a 100644 --- a/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc +++ b/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc @@ -97,30 +97,26 @@ bool pattern_matches_subgraph_under( &full_graph_values_to_subgraph_inputs, UnlabelledDataflowGraphPatternMatch const &match, MatchAdditionalCriterion const &additional_criterion) { - // std::cout << "pattern amtches subgrpah under" << std::endl; + std::cout << "gamer" << std::endl; + std::cout << get_open_dataflow_values(pattern.raw_graph) << std::endl; SubgraphConcreteFromPattern concrete_from_pattern{ match, full_graph_values_to_subgraph_inputs}; std::unordered_set concrete_nodes = get_nodes(subgraph); std::unordered_set concrete_nodes_from_match = transform(get_nodes(pattern), concrete_from_pattern); - // std::cout << "mid of pattern amtches subgrpah under" << std::endl; if (concrete_nodes != concrete_nodes_from_match) { return false; } for (PatternNode const &pattern_node : get_nodes(pattern)) { - // std::cout << "hello hello hello" << std::endl; if (!additional_criterion.node_criterion( pattern_node, concrete_from_pattern(pattern_node))) { - // std::cout << "hello hello hello hello hello" << std::endl; return false; } } - // std::cout << "later mid of pattern amtches subgrpah under" << std::endl; - std::unordered_set concrete_edges = get_edges(subgraph); std::unordered_set concrete_edge_from_match = transform(get_edges(pattern), concrete_from_pattern); @@ -137,14 +133,20 @@ bool pattern_matches_subgraph_under( if (concrete_values != concrete_values_from_match) { return false; } + std::cout << "later later mid of pattern amtches subgrpah under" << std::endl; + for (PatternValue const &pattern_value : get_values(pattern)) { + std::cout << 
"dfjsahdfkiasjhdfkasjhdfkasdjhdfbgk awerhurgvt " << std::endl; + std::cout << get_open_dataflow_values(pattern.raw_graph) << std::endl; + std::cout << pattern_value << std::endl; if (!additional_criterion.value_criterion( pattern_value, concrete_from_pattern(pattern_value))) { + std::cout << "dfjsahdfkiasjhdfkasjhdfkasdjhdfbgk awerhurgvtfwewefewfewf " << std::endl; return false; } } - // std::cout << "end of pattern amtches subgrpah under" << std::endl; + std::cout << "end of pattern amtches subgrpah under" << std::endl; return true; } @@ -154,14 +156,19 @@ bool unlabelled_pattern_does_match( OpenDataflowGraphView const &graph, UnlabelledDataflowGraphPatternMatch const &match, MatchAdditionalCriterion const &additional_criterion) { - // std::cout << "unlabelled_pattern_does_match" << std::endl; + std::cout << "unlabelled_pattern_does_match" << std::endl; OpenDataflowSubgraphResult subgraph_result = subgraph_matched(graph, match); OpenDataflowGraphView matched_subgraph = subgraph_result.graph; assert(left_entries(match.node_assignment) == get_nodes(pattern)); assert(right_entries(match.node_assignment) == get_nodes(matched_subgraph)); - // std::cout << "middle of" << std::endl; + std::cout << "middle of" << std::endl; + std::cout << get_open_dataflow_values(pattern.raw_graph) << std::endl; + std::cout << left_entries(match.node_assignment) << std::endl; + std::cout << right_entries(match.node_assignment) << std::endl; + std::cout << get_nodes(pattern) << std::endl; + std::cout << get_nodes(matched_subgraph) << std::endl; MatchAdditionalCriterion through_subgraph_operation = MatchAdditionalCriterion{ @@ -169,18 +176,24 @@ bool unlabelled_pattern_does_match( [&](PatternValue const &pv, OpenDataflowValue const &v) { return v.visit(overload{ [&](DataflowOutput const &) { + //std::cout << "whefihweoifhewfi" < Date: Tue, 1 Apr 2025 13:56:25 -0700 Subject: [PATCH 03/11] Adding randomness to MCMC to make it true MCMC, adding secondary non-lazy random mutation generator --- 
.../machine_mapping_mutation_set.h | 29 ++ .../mcmc/machine_mapping_mutation_set.h | 32 -- .../include/compiler/mcmc/mcmc_algorithm.h | 4 +- .../mcmc/mcmc_search_config.struct.toml | 26 ++ .../machine_mapping_mutation_set.cc | 189 ++++++++++++ .../mcmc/machine_mapping_mutation_set.cc | 110 ------- .../src/compiler/mcmc/mcmc_algorithm.cc | 273 +++--------------- .../test/src/compiler/mcmc/mcmc_algorithm.cc | 37 ++- ..._substitution_and_update_machine_mapping.h | 32 ++ ...ly_substitution_and_update_machine_mapping | 185 ++++++++++++ .../unlabelled/pattern_matching.cc | 38 +-- lib/utils/include/utils/random_utils.h | 2 +- 12 files changed, 540 insertions(+), 417 deletions(-) create mode 100644 lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h delete mode 100644 lib/compiler/include/compiler/mcmc/machine_mapping_mutation_set.h create mode 100644 lib/compiler/include/compiler/mcmc/mcmc_search_config.struct.toml create mode 100644 lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc delete mode 100644 lib/compiler/src/compiler/mcmc/machine_mapping_mutation_set.cc create mode 100644 lib/substitutions/include/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping.h create mode 100644 lib/substitutions/src/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h new file mode 100644 index 0000000000..443ab06f02 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h @@ -0,0 +1,29 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MCMC_MACHINE_MAPPING_MUTATION_SET_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MCMC_MACHINE_MAPPING_MUTATION_SET_H + +#include "compiler/machine_mapping/machine_mapping.h" +#include "compiler/search_result.dtg.h" + +namespace FlexFlow { 
+std::optional + get_naive_mapping(ParallelComputationGraph &pcg, + MachineSpecification const &resources); +std::vector + get_possible_mutations(SearchResult mapped_pcg, + MachineSpecification const &resource); +std::optional + get_random_mutation(SearchResult mapped_pcg, + MachineSpecification const &resource, + DeviceType const &device_type = DeviceType::GPU); +MachineView increment_stride(MachineView machine_view, nonnegative_int dim); +MachineView decrement_all_strides(MachineView machine_view); +MachineView change_stride(nonnegative_int stride, + MachineView machine_view, + nonnegative_int dim); +MachineView change_node_idx(nonnegative_int node_ix, MachineView machine_view); +MachineView change_device_idx(nonnegative_int device_idx, + MachineView machine_view); +MachineView switch_projection(MachineView machine_view, nonnegative_int dim); +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/mcmc/machine_mapping_mutation_set.h b/lib/compiler/include/compiler/mcmc/machine_mapping_mutation_set.h deleted file mode 100644 index e41aad2f71..0000000000 --- a/lib/compiler/include/compiler/mcmc/machine_mapping_mutation_set.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MCMC_MACHINE_MAPPING_MUTATION_SET_H -#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MCMC_MACHINE_MAPPING_MUTATION_SET_H - -#include "compiler/machine_mapping/machine_mapping.h" -#include "compiler/search_result.dtg.h" - -namespace FlexFlow { -std::vector - get_possible_mutations(SearchResult mapped_pcg, - MachineSpecification const &resource); -MachineMapping permute_layers(std::vector layers, - MachineMapping mapping); -MachineMapping copy_layer(parallel_layer_guid_t source, - parallel_layer_guid_t destination, - MachineMapping mapping); -MachineView change_stride(nonnegative_int stride, - parallel_layer_guid_t layer, - MachineView machine_view, - nonnegative_int dim); -MachineView change_node_idx(nonnegative_int node_ix, - 
parallel_layer_guid_t layer, - MachineView machine_view); -MachineView change_device_idx(nonnegative_int device_idx, - parallel_layer_guid_t layer, - MachineView machine_view); -MachineView change_projection(MachineSpecificationDimension projection, - parallel_layer_guid_t layer, - MachineView machine_view, - nonnegative_int dim); -} // namespace FlexFlow - -#endif diff --git a/lib/compiler/include/compiler/mcmc/mcmc_algorithm.h b/lib/compiler/include/compiler/mcmc/mcmc_algorithm.h index 53efa845cf..b17eaf3e16 100644 --- a/lib/compiler/include/compiler/mcmc/mcmc_algorithm.h +++ b/lib/compiler/include/compiler/mcmc/mcmc_algorithm.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_COMPILER_MCMC_ALGORITHM_H #include "compiler/cost_estimator/cost_estimator.h" +#include "compiler/mcmc/mcmc_search_config.dtg.h" #include "compiler/search_result.dtg.h" -#include "compiler/unity_algorithm/unity_search_config.dtg.h" #include "pcg/computation_graph.h" #include "pcg/machine_specification.dtg.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" @@ -15,7 +15,7 @@ namespace FlexFlow { SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, CostEstimator const &cost_estimator, MachineSpecification const &resources, - UnitySearchConfig const &search_config); + MCMCSearchConfig const &search_config); } // namespace FlexFlow diff --git a/lib/compiler/include/compiler/mcmc/mcmc_search_config.struct.toml b/lib/compiler/include/compiler/mcmc/mcmc_search_config.struct.toml new file mode 100644 index 0000000000..e96ced81cd --- /dev/null +++ b/lib/compiler/include/compiler/mcmc/mcmc_search_config.struct.toml @@ -0,0 +1,26 @@ +namespace = "FlexFlow" +name = "MCMCSearchConfig" +features = [ + "eq", + "hash", + "fmt", +] + +includes = [ +] + +[[fields]] +name = "temperature" +type = "float" + +[[fields]] +name = "num_iterations" +type = "int" + +[[fields]] +name = "num_mutations_per_iteration" +type = "int" + +[[fields]] +name = "max_num_ops" +type = "int" diff --git 
a/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc new file mode 100644 index 0000000000..36651fdc5d --- /dev/null +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc @@ -0,0 +1,189 @@ +#include "compiler/machine_mapping/machine_mapping_mutation_set.h" +#include "compiler/machine_mapping/allowed_machine_views.h" +#include "pcg/machine_view.h" +#include "pcg/operator_task_space.h" +#include "utils/containers/vector_of.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/random_utils.h" +#include "utils/vector.h" + +namespace FlexFlow { + +std::optional + get_naive_mapping(ParallelComputationGraph &pcg, + MachineSpecification const &resources) { + std::vector layers = topological_ordering(pcg); + std::unordered_map machine_views; + for (parallel_layer_guid_t layer : layers) { + OperatorTaskSpace task = get_operator_task_space(pcg, layer); + std::unordered_set allowed_machine_views = + get_allowed_machine_views(resources, task, DeviceType::GPU); + if (allowed_machine_views.empty()) { + return std::nullopt; + } + machine_views.insert({layer, *(allowed_machine_views.begin())}); + } + return MachineMapping{machine_views}; +} + +std::optional + get_random_mutation_notlazy(SearchResult mapped_pcg, + MachineSpecification const &resources, + DeviceType const &device_type) { + ParallelComputationGraph pcg = mapped_pcg.pcg; + std::vector layers = topological_ordering(pcg); + if (layers.size() == 0) { + return std::nullopt; + } + parallel_layer_guid_t random_layer = select_random(layers); + MachineMapping machine_mapping = mapped_pcg.machine_mapping; + MachineView machine_view = machine_mapping.machine_views.at(random_layer); + + int mutation_op = select_random(range(6)); + switch (mutation_op) { + case 0: { + machine_view = decrement_all_strides(machine_view); + break; + } + case 1: { + nonnegative_int rand_dim = select_random( + 
nonnegative_range(nonnegative_int{num_dims(machine_view)})); + machine_view = increment_stride(machine_view, rand_dim); + break; + } + case 2: { + nonnegative_int rand_node_idx = + select_random(nonnegative_range(resources.num_nodes)); + machine_view = change_node_idx(rand_node_idx, machine_view); + break; + } + case 3: { + if (device_type == DeviceType::GPU) { + nonnegative_int rand_device_idx = + select_random(nonnegative_range(resources.num_gpus_per_node)); + machine_view = change_device_idx(rand_device_idx, machine_view); + } else { + nonnegative_int rand_device_idx = + select_random(nonnegative_range(resources.num_cpus_per_node)); + machine_view = change_device_idx(rand_device_idx, machine_view); + } + break; + } + case 4: { + nonnegative_int rand_dim = select_random( + nonnegative_range(nonnegative_int{num_dims(machine_view)})); + machine_view = switch_projection(machine_view, rand_dim); + break; + } + case 5: { + // copy layer + parallel_layer_guid_t layer_to_copy = select_random(layers); + machine_view = machine_mapping.machine_views.at(layer_to_copy); + break; + } + } + OperatorTaskSpace task = get_operator_task_space(pcg, random_layer); + if (is_valid_machine_view(machine_view, task, resources)) { + // only apply it if valid + machine_mapping.machine_views.at(random_layer) = machine_view; + } + return machine_mapping; +} + +std::optional + get_random_mutation(SearchResult mapped_pcg, + MachineSpecification const &resources, + DeviceType const &device_type) { + ParallelComputationGraph pcg = mapped_pcg.pcg; + std::vector layers = topological_ordering(pcg); + if (layers.size() == 0) { + return std::nullopt; + } + parallel_layer_guid_t random_layer = layers.at(rand() % layers.size()); + + MachineMapping machine_mapping = mapped_pcg.machine_mapping; + MachineView machine_view = machine_mapping.machine_views.at(random_layer); + OperatorTaskSpace task = get_operator_task_space(pcg, random_layer); + + std::vector allowed_machine_views = + 
vector_of(get_allowed_machine_views(resources, task, DeviceType::GPU)); + MachineView random_new_machine_view = + allowed_machine_views.at(rand() % allowed_machine_views.size()); + + machine_mapping.machine_views.at(random_layer) = random_new_machine_view; + return machine_mapping; +} + +MachineView increment_stride(MachineView machine_view, nonnegative_int dim) { + std::vector strides = get_strides(machine_view); + nonnegative_int new_stride = + strides.at(dim.unwrap_nonnegative()).unwrapped + 1_n; + return change_stride(new_stride, machine_view, dim); +} + +MachineView decrement_all_strides(MachineView machine_view) { + std::vector strides = get_strides(machine_view); + for (nonnegative_int dim : + nonnegative_range(nonnegative_int{num_dims(machine_view)})) { + nonnegative_int old_stride = strides.at(dim.unwrap_nonnegative()).unwrapped; + if (old_stride >= 1_n) { + machine_view = + change_stride(nonnegative_int{old_stride.unwrap_nonnegative() - 1}, + machine_view, + dim); + } + } + return machine_view; +} + +MachineView change_stride(nonnegative_int stride, + MachineView machine_view, + nonnegative_int dim) { + std::vector strides = get_strides(machine_view); + strides.at(dim.unwrap_nonnegative()) = stride_t{stride}; + MachineView new_machine_view = + machine_view_from_strides_and_machine_spec_dimensions( + machine_view.start, strides, get_dimensions(machine_view)); + return new_machine_view; +} + +MachineView change_node_idx(nonnegative_int node_ix, MachineView machine_view) { + MachineView new_machine_view = + machine_view_from_strides_and_machine_spec_dimensions( + MachineSpaceCoordinate{node_ix, + machine_view.start.device_idx, + machine_view.start.device_type}, + get_strides(machine_view), + get_dimensions(machine_view)); + return new_machine_view; +} + +MachineView change_device_idx(nonnegative_int device_idx, + MachineView machine_view) { + MachineView new_machine_view = + machine_view_from_strides_and_machine_spec_dimensions( + 
MachineSpaceCoordinate{machine_view.start.node_idx, + device_idx, + machine_view.start.device_type}, + get_strides(machine_view), + get_dimensions(machine_view)); + return new_machine_view; +} + +MachineView switch_projection(MachineView machine_view, nonnegative_int dim) { + std::vector dims = + get_dimensions(machine_view); + MachineSpecificationDimension projection = dims.at(dim.unwrap_nonnegative()); + if (projection == MachineSpecificationDimension::INTER_NODE) { + dims.at(dim.unwrap_nonnegative()) = + MachineSpecificationDimension::INTRA_NODE; + } else { + dims.at(dim.unwrap_nonnegative()) = + MachineSpecificationDimension::INTER_NODE; + } + MachineView new_machine_view = + machine_view_from_strides_and_machine_spec_dimensions( + machine_view.start, get_strides(machine_view), dims); + return new_machine_view; +} +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/mcmc/machine_mapping_mutation_set.cc b/lib/compiler/src/compiler/mcmc/machine_mapping_mutation_set.cc deleted file mode 100644 index d67e4cb592..0000000000 --- a/lib/compiler/src/compiler/mcmc/machine_mapping_mutation_set.cc +++ /dev/null @@ -1,110 +0,0 @@ -#include "compiler/mcmc/machine_mapping_mutation_set.h" -#include "compiler/machine_mapping/allowed_machine_views.h" -#include "pcg/machine_view.h" -#include "pcg/operator_task_space.h" -#include "utils/containers/vector_of.h" -#include "utils/vector.h" - -namespace FlexFlow { - -bool mutation_is_allowed(ParallelComputationGraph &pcg, - parallel_layer_guid_t layer, - MachineSpecification const &resources, - MachineView machine_view) { - OperatorTaskSpace task = get_operator_task_space(pcg, layer); - std::unordered_set allowed_machine_views = - get_allowed_machine_views(resources, task, DeviceType::GPU); - return allowed_machine_views.count(machine_view); -} - -std::vector - get_possible_mutations(SearchResult mapped_pcg, - MachineSpecification const &resources) { - //each mutation only changes one layer at a time - 
ParallelComputationGraph pcg = mapped_pcg.pcg; - std::vector layers = topological_ordering(pcg); - std::vector machine_mappings; - for (parallel_layer_guid_t layer : layers) { - MachineMapping original_mapping = mapped_pcg.machine_mapping; - MachineView machine_view = original_mapping.machine_views.at(layer); - OperatorTaskSpace task = get_operator_task_space(pcg, layer); - std::vector allowed_machine_views = - vector_of(get_allowed_machine_views(resources, task, DeviceType::GPU)); - - std::vector new_machine_mappings = - transform(allowed_machine_views, [&](MachineView machine_views) { - MachineMapping original_mapping = mapped_pcg.machine_mapping; - original_mapping.machine_views.at(layer) = machine_views; - return original_mapping; - }); - machine_mappings = concat(machine_mappings, new_machine_mappings); - } - return machine_mappings; -} - -MachineMapping permute_layers(std::vector layers, - MachineMapping mapping) { - NOT_IMPLEMENTED(); -} - -MachineMapping copy_layer(parallel_layer_guid_t source, - parallel_layer_guid_t destination, - MachineMapping mapping) { - std::unordered_map machine_views = - mapping.machine_views; - MachineView machine_view_to_copy = machine_views.at(source); - machine_views.try_emplace(destination, machine_view_to_copy); - return MachineMapping{machine_views}; -} - -MachineView change_stride(nonnegative_int stride, - parallel_layer_guid_t layer, - MachineView machine_view, - nonnegative_int dim) { - std::vector strides = get_strides(machine_view); - strides.at(dim.unwrap_nonnegative()) = stride_t{stride}; - MachineView new_machine_view = - machine_view_from_strides_and_machine_spec_dimensions( - machine_view.start, strides, get_dimensions(machine_view)); - return new_machine_view; -} - -MachineView change_node_idx(nonnegative_int node_ix, - parallel_layer_guid_t layer, - MachineView machine_view) { - MachineView new_machine_view = - machine_view_from_strides_and_machine_spec_dimensions( - MachineSpaceCoordinate{node_ix, - 
machine_view.start.device_idx, - machine_view.start.device_type}, - get_strides(machine_view), - get_dimensions(machine_view)); - return new_machine_view; -} - -MachineView change_device_idx(nonnegative_int device_idx, - parallel_layer_guid_t layer, - MachineView machine_view) { - MachineView new_machine_view = - machine_view_from_strides_and_machine_spec_dimensions( - MachineSpaceCoordinate{machine_view.start.node_idx, - device_idx, - machine_view.start.device_type}, - get_strides(machine_view), - get_dimensions(machine_view)); - return new_machine_view; -} - -MachineView change_projection(MachineSpecificationDimension projection, - parallel_layer_guid_t layer, - MachineView machine_view, - nonnegative_int dim) { - std::vector dims = - get_dimensions(machine_view); - dims.at(dim.unwrap_nonnegative()) = projection; - MachineView new_machine_view = - machine_view_from_strides_and_machine_spec_dimensions( - machine_view.start, get_strides(machine_view), dims); - return new_machine_view; -} -} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc b/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc index 025fb586c6..6553823252 100644 --- a/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc +++ b/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc @@ -1,224 +1,22 @@ #include "compiler/mcmc/mcmc_algorithm.h" -#include "compiler/machine_mapping/allowed_machine_views.h" -#include "compiler/mcmc/machine_mapping_mutation_set.h" +#include "compiler/machine_mapping/machine_mapping_mutation_set.h" #include "compiler/mcmc/mcmc_graph_optimize_state.h" #include "compiler/task_graph_simulator/task_simulator.h" -#include "pcg/operator_task_space.h" -#include "pcg/parallel_computation_graph/parallel_computation_graph.h" -#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h" -#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" -#include "substitutions/apply_substitution/apply_substitution.h" -#include 
"substitutions/apply_substitution/evaluate_substitution_output.h" -#include "substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h" -#include "substitutions/open_parallel_tensor_guid_t.h" +#include "substitutions/apply_substitution/apply_substitution_and_update_machine_mapping.h" #include "substitutions/pcg_pattern.h" #include "substitutions/pcg_pattern_match.h" -#include "substitutions/sub_parallel_computation_graph.h" -#include "substitutions/sub_parallel_computation_graph_data.dtg.h" -#include "substitutions/sub_parallel_computation_graph_edge.h" #include "substitutions/substitution.h" #include "substitutions/unity_substitution_set.h" -#include "utils/containers/keys.h" -#include "utils/containers/merge_maps.h" -#include "utils/containers/restrict_keys.h" -#include "utils/containers/set_minus.h" -#include "utils/containers/transform.h" -#include "utils/containers/values.h" -#include "utils/deduplicated_priority_queue.h" -#include "utils/full_binary_tree/binary_tree_path.h" -#include "utils/graph/node/algorithms.h" #include "utils/optional.h" -#include "utils/graph/labelled_open_dataflow_graph/algorithms/as_dot.h" - +#include "utils/random_utils.h" namespace FlexFlow { -std::optional - get_naive_mapping(ParallelComputationGraph &pcg, - MachineSpecification const &resources) { - std::vector layers = topological_ordering(pcg); - std::unordered_map machine_views; - for (parallel_layer_guid_t layer : layers) { - OperatorTaskSpace task = get_operator_task_space(pcg, layer); - std::unordered_set allowed_machine_views = - get_allowed_machine_views(resources, task, DeviceType::GPU); - if (allowed_machine_views.empty()) { - return std::nullopt; - } - machine_views.insert({layer, *(allowed_machine_views.begin())}); - } - return MachineMapping{machine_views}; -} - -SearchResult apply_substitution_and_update_machine_mapping( - SearchResult const &mapped_pcg, - Substitution const &sub, - PCGPatternMatch const &match) { - SubParallelComputationGraph spcg = 
sub_pcg_from_full_pcg(mapped_pcg.pcg); - - auto substitution_output_result = - evaluate_substitution_output(spcg, sub, match); - SubParallelComputationGraph substitution_output_graph = - substitution_output_result.first; - OutputExprToResultSubPCGMapping output_expr_to_result_sub_pcg_mapping = - substitution_output_result.second; - - SubParallelComputationGraphData output_graph_data = - get_sub_pcg_data(substitution_output_graph); - SubParallelComputationGraphData pre_data = get_sub_pcg_data(spcg); - - std::unordered_set pre_nodes = - keys(pre_data.node_data); - std::unordered_set matched_nodes = - unordered_set_of(values(match.node_assignment)); - std::unordered_set post_nodes_from_original_graph = - set_minus(pre_nodes, matched_nodes); - - std::unordered_map machine_views = - mapped_pcg.machine_mapping.machine_views; - - std::unordered_set substituted_machine_views = - transform(matched_nodes, [&](parallel_layer_guid_t const &node) { - return machine_views.at(node); - }); - - std::unordered_map post_node_data = - [&] { - std::unordered_map - post_node_data_from_orig = restrict_keys( - pre_data.node_data, post_nodes_from_original_graph); - std::unordered_map - post_node_data_from_sub = output_graph_data.node_data; - - // just taking the first substituted machine view, not sure if this - // is fine - for (auto [layer, attrs] : post_node_data_from_sub) { - machine_views.try_emplace(layer, *substituted_machine_views.begin()); - } - - return merge_disjoint_maps(post_node_data_from_orig, - post_node_data_from_sub); - }(); - - std::unordered_set post_edges = [&] { - std::unordered_set post_edges_from_orig = - filter(pre_data.edges, [&](SubParallelComputationGraphEdge const &e) { - if (e.raw_edge.has()) { - return true; - } else { - DataflowEdge dfe = e.raw_edge.get(); - parallel_layer_guid_t src = parallel_layer_guid_t{dfe.src.node}; - parallel_layer_guid_t dst = parallel_layer_guid_t{dfe.dst.node}; - return !(contains(matched_nodes, src) || - contains(matched_nodes, 
dst)); - } - }); - - std::unordered_set post_edges_from_sub = - filter(output_graph_data.edges, - [&](SubParallelComputationGraphEdge const &e) { - return !e.raw_edge.has(); - }); - - bidict - output_orig_pattern_mapping = get_output_mapping_for_pcg_pattern_match( - match, sub.pcg_pattern, spcg); - bidict - output_post_outexpr_mapping = get_output_graph_expr_output_mapping( - output_expr_to_result_sub_pcg_mapping, - sub.output_graph_expr, - substitution_output_graph); - - std::unordered_set incoming_to_sub_edges; - for (auto const &[pattern_input, base_graph_tensor] : - match.input_assignment) { - OutputGraphExprInput output_expr_input = - sub.inputs_mapping.at_l(pattern_input); - input_parallel_tensor_guid_t output_graph_input = - output_expr_to_result_sub_pcg_mapping.input_mapping.at_r( - output_expr_input); - std::unordered_set uses = get_parallel_tensor_uses( - substitution_output_graph, - open_parallel_tensor_guid_from_input(output_graph_input)); - for (parallel_tensor_use_t const &use : uses) { - SubParallelComputationGraphEdge new_edge = - subpcg_edge_from_tensor_and_use(base_graph_tensor, use); - incoming_to_sub_edges.insert(new_edge); - } - } - - std::unordered_set outgoing_from_sub_edges; - for (ParallelComputationGraphEdge const &outgoing_edge : - get_subgraph_outgoing_edges(spcg, matched_nodes)) { - parallel_tensor_guid_t original_tensor = - get_parallel_tensor(outgoing_edge); - PatternNodeOutput pattern_tensor = - output_orig_pattern_mapping.at_r(original_tensor); - OutputGraphExprNodeOutput output_graph_tensor = - sub.outputs_mapping.at_l(pattern_tensor); - parallel_tensor_guid_t new_tensor = - output_post_outexpr_mapping.at_r(output_graph_tensor); - - SubParallelComputationGraphEdge new_edge = - subpcg_edge_from_tensor_and_dst( - new_tensor, - get_dst_layer(outgoing_edge), - get_dst_layer_input_idx(outgoing_edge)); - outgoing_from_sub_edges.insert(new_edge); - } - - return set_union(std::vector{ - post_edges_from_orig, - post_edges_from_sub, - 
incoming_to_sub_edges, - outgoing_from_sub_edges, - }); - }(); - - std::unordered_set post_inputs = - pre_data.inputs; - - std::unordered_map - post_value_data = [&] { - std::unordered_map - post_value_data_from_orig = filter_keys( - pre_data.value_data, [&](open_parallel_tensor_guid_t const &t) { - return visit_open_parallel_tensor_guid( - t, - overload{ - [&](parallel_tensor_guid_t const &t) { - return contains(post_nodes_from_original_graph, - get_source_layer(t)); - }, - [](input_parallel_tensor_guid_t const &) { - return true; - }, - }); - }); - - std::unordered_map - post_value_data_from_sub = output_graph_data.value_data; - return merge_disjoint_maps(post_value_data_from_orig, - post_value_data_from_sub); - }(); - - SubParallelComputationGraphData post_data = SubParallelComputationGraphData{ - post_node_data, - post_edges, - post_inputs, - post_value_data, - }; - - return SearchResult{ - pcg_from_sub_pcg_by_dropping_inputs(sub_pcg_from_graph_data(post_data)), - MachineMapping{machine_views}}; -} - std::vector all_pcgs_obtained_by_applying_a_substitution( SearchResult const &mapped_pcg, std::vector const &substitutions) { std::vector results; - //currently not functional + // currently not functional /*SubParallelComputationGraph subpcg = sub_pcg_from_full_pcg(mapped_pcg.pcg); for (Substitution const &substitution : substitutions) { for (PCGPatternMatch const &pattern_match : @@ -232,13 +30,16 @@ std::vector all_pcgs_obtained_by_applying_a_substitution( return results; } +bool mcmc_accept(int delta, float temperature) { + return delta < 0 || randf() < exp(-delta / temperature); +} + SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, CostEstimator const &cost_estimator, MachineSpecification const &resources, - UnitySearchConfig const &search_config) { + MCMCSearchConfig const &search_config) { std::vector substitutions = get_substitution_set(resources); - DeduplicatedPriorityQueue candidates; std::optional naive_mapping = get_naive_mapping(pcg, 
resources); @@ -246,31 +47,20 @@ SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, throw std::runtime_error("Failed to find any solutions"); } - // multiply runtime by -1 to make it minheap instead of maxheap - MCMCOptimizeState best_state = MCMCOptimizeState{ + MCMCOptimizeState current_state = MCMCOptimizeState{ SearchResult{pcg, naive_mapping.value()}, - -1 * task_simulator_estimate_forward_pass_time( - pcg, cost_estimator, naive_mapping.value(), resources)}; + task_simulator_estimate_forward_pass_time( + pcg, cost_estimator, naive_mapping.value(), resources)}; - candidates.push(best_state); + MCMCOptimizeState best_state = current_state; - for (int iteration = 0; - !candidates.empty() && iteration < search_config.budget; + for (int iteration = 0; iteration < search_config.num_iterations; ++iteration) { - MCMCOptimizeState current_state = candidates.top(); - candidates.pop(); SearchResult current_mapped_pcg = current_state.mapped_pcg; - float current_estimate = current_state.runtime * -1; - float best_estimate = best_state.runtime * -1; + float best_estimate = best_state.runtime; - if (current_estimate < best_estimate) { - best_state = current_state; - } else if (current_estimate > best_estimate * search_config.alpha) { - continue; - } - - for (SearchResult const &new_mapped_pcg : + /*for (SearchResult const &new_mapped_pcg : all_pcgs_obtained_by_applying_a_substitution(current_mapped_pcg, substitutions)) { float new_estimate = task_simulator_estimate_forward_pass_time( @@ -284,21 +74,36 @@ SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, search_config.max_num_ops) { candidates.push(MCMCOptimizeState{new_mapped_pcg, -1 * new_estimate}); } - } - - for (MachineMapping const &new_machine_mapping : - get_possible_mutations(current_mapped_pcg, resources)) { + }*/ + + std::optional new_machine_mapping = + get_random_mutation(current_mapped_pcg, resources); + for (int searched_mutations = 0; + searched_mutations < 
search_config.num_mutations_per_iteration; + searched_mutations++) { + if (new_machine_mapping == std::nullopt) { + break; + } float new_estimate = task_simulator_estimate_forward_pass_time(current_mapped_pcg.pcg, cost_estimator, - new_machine_mapping, + new_machine_mapping.value(), resources); - if (new_estimate <= search_config.threshold) { - candidates.push( - MCMCOptimizeState{SearchResult{current_mapped_pcg.pcg, new_machine_mapping}, -1 * new_estimate}); + float runtime_delta = new_estimate - best_estimate; + + if (mcmc_accept(runtime_delta, search_config.temperature)) { + current_state = MCMCOptimizeState{ + SearchResult{current_mapped_pcg.pcg, new_machine_mapping.value()}, + new_estimate}; + if (runtime_delta < 0) { + best_state = current_state; + } } + + new_machine_mapping = get_random_mutation(current_mapped_pcg, resources); } } + return best_state.mapped_pcg; } diff --git a/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc b/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc index a7ffa8e5e0..7cde75cecf 100644 --- a/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc +++ b/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc @@ -1,5 +1,6 @@ #include "compiler/mcmc/mcmc_algorithm.h" #include "../cost_estimator_for_test.h" +#include "compiler/task_graph_simulator/task_simulator.h" #include "doctest/doctest.h" #include "op-attrs/parallel_tensor_dims.h" #include "op-attrs/parallel_tensor_shape.dtg.h" @@ -9,7 +10,6 @@ #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" #include "pcg/pcg_from_computation_graph.h" #include "utils/integer_conversions.h" -#include "compiler/task_graph_simulator/task_simulator.h" using namespace FlexFlow; @@ -19,18 +19,17 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_tensor_shape = TensorShape{ TensorDims{ - FFOrdered{nonnegative_int{32}, - nonnegative_int{64}}, + FFOrdered{32_n, 64_n}, }, DataType::FLOAT, }; tensor_guid_t t = b.create_input(input_tensor_shape, 
CreateGrad::YES); t = b.dense(t, - /*outDim=*/nonnegative_int{16}, + /*outDim=*/16_n, /*activation=*/std::nullopt); t = b.gelu(t); t = b.dense(t, - /*outDim=*/nonnegative_int{12}, + /*outDim=*/12_n, /*activation=*/std::nullopt, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, @@ -38,7 +37,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*bias_initializer=*/std::nullopt); t = b.relu(t); t = b.dense(t, - /*outDim=*/nonnegative_int{8}, + /*outDim=*/8_n, /*activation=*/Activation::RELU); return b.computation_graph; }(); @@ -50,33 +49,33 @@ TEST_SUITE(FF_TEST_SUITE) { return OpCostMetrics{ /*forward_runtime=*/1.0, /*backward_runtime=*/2.0, - /*memory=*/nonnegative_int{1}, + /*memory=*/1_n, }; }, [](TensorSetMovement const &) { return 1.0; }); MachineSpecification full_machine_spec = MachineSpecification{ - /*num_nodes=*/nonnegative_int{2}, - /*num_cpus_per_node=*/nonnegative_int{1}, - /*num_gpus_per_node=*/nonnegative_int{1}, + /*num_nodes=*/2_n, + /*num_cpus_per_node=*/1_n, + /*num_gpus_per_node=*/1_n, /*inter_node_bandwidth=*/1, /*intra_node_bandwidth=*/1, }; - UnitySearchConfig search_config = UnitySearchConfig{ - /*alpha=*/1.2, - /*budget=*/10, - /*threshold=*/30.0, + MCMCSearchConfig search_config = MCMCSearchConfig{ + /*temperature=*/1.0, + /*num_iterations=*/100, + /*num_mutations_per_iteration=*/10, /*max_num_ops=*/100, }; SearchResult result = mcmc_graph_optimize( pcg, cost_estimator, full_machine_spec, search_config); + float runtime = task_simulator_estimate_forward_pass_time( + result.pcg, cost_estimator, result.machine_mapping, full_machine_spec); + std::cout << runtime << std::endl; - CHECK(task_simulator_estimate_forward_pass_time(result.pcg, - cost_estimator, - result.machine_mapping, - full_machine_spec) < 16); - + CHECK(runtime < 16); + CHECK(false); } } diff --git a/lib/substitutions/include/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping.h 
b/lib/substitutions/include/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping.h new file mode 100644 index 0000000000..b08ca57851 --- /dev/null +++ b/lib/substitutions/include/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping.h @@ -0,0 +1,32 @@ +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_APPLY_SUBSTITUTION_AND_UPDATE_MACHINE_MAPPING_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_APPLY_SUBSTITUTION_AND_UPDATE_MACHINE_MAPPING_H + +#include "compiler/search_result.dtg.h" +#include "substitutions/pcg_pattern_match.dtg.h" +#include "substitutions/sub_parallel_computation_graph.dtg.h" +#include "substitutions/substitution.dtg.h" + +namespace FlexFlow { +/** + * @brief Applies \p substitution to \p mapped_pcg at the location specified by + * \p match, returning the resulting SearchResult (mapped pcg) + * + * @param mapped_pcg + * @param substitution + * @param match The location at which to apply substitution. This location in + * sub_pcg should match substitution's PCGPattern. Likely created by running + * FlexFlow::find_pattern_matches(PCGPattern const &, + * SubParallelComputationGraph const &). 
+ * @return SearchResult A mapped pcg similar to mapped_pcg, but with + * the subgraph of the pcg specified by match replaced with the result of the + * output expression of substitution and the machine mapping updated to account + * for the new output + */ +SearchResult apply_substitution_and_update_machine_mapping( + SearchResult const &mapped_pcg, + Substitution const &sub, + PCGPatternMatch const &match); + +} // namespace FlexFlow + +#endif diff --git a/lib/substitutions/src/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping b/lib/substitutions/src/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping new file mode 100644 index 0000000000..1721ee26d8 --- /dev/null +++ b/lib/substitutions/src/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping @@ -0,0 +1,185 @@ +#include "substitutions/apply_substitution/apply_substitution_and_update_machine_mapping.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h" +#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" +#include "substitutions/apply_substitution/evaluate_substitution_output.h" +#include "substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h" +#include "substitutions/open_parallel_tensor_guid_t.h" +#include "substitutions/pcg_pattern_match.h" +#include "substitutions/sub_parallel_computation_graph.h" +#include "substitutions/sub_parallel_computation_graph_data.dtg.h" +#include "substitutions/sub_parallel_computation_graph_edge.h" +#include "utils/containers/keys.h" +#include "utils/containers/merge_maps.h" +#include "utils/containers/restrict_keys.h" +#include "utils/containers/set_minus.h" +#include "utils/containers/values.h" + +namespace FlexFlow { + +SearchResult apply_substitution_and_update_machine_mapping( + SearchResult const &mapped_pcg, + Substitution const &sub, + PCGPatternMatch const &match) { + SubParallelComputationGraph spcg = 
sub_pcg_from_full_pcg(mapped_pcg.pcg); + + auto substitution_output_result = + evaluate_substitution_output(spcg, sub, match); + SubParallelComputationGraph substitution_output_graph = + substitution_output_result.first; + OutputExprToResultSubPCGMapping output_expr_to_result_sub_pcg_mapping = + substitution_output_result.second; + + SubParallelComputationGraphData output_graph_data = + get_sub_pcg_data(substitution_output_graph); + SubParallelComputationGraphData pre_data = get_sub_pcg_data(spcg); + + std::unordered_set pre_nodes = + keys(pre_data.node_data); + std::unordered_set matched_nodes = + unordered_set_of(values(match.node_assignment)); + std::unordered_set post_nodes_from_original_graph = + set_minus(pre_nodes, matched_nodes); + + std::unordered_map machine_views = + mapped_pcg.machine_mapping.machine_views; + + std::unordered_set substituted_machine_views = + transform(matched_nodes, [&](parallel_layer_guid_t const &node) { + return machine_views.at(node); + }); + + std::unordered_map post_node_data = + [&] { + std::unordered_map + post_node_data_from_orig = restrict_keys( + pre_data.node_data, post_nodes_from_original_graph); + std::unordered_map + post_node_data_from_sub = output_graph_data.node_data; + + // just taking the first substituted machine view, not sure if this + // is fine + for (auto [layer, attrs] : post_node_data_from_sub) { + machine_views.try_emplace(layer, *substituted_machine_views.begin()); + } + + return merge_disjoint_maps(post_node_data_from_orig, + post_node_data_from_sub); + }(); + + std::unordered_set post_edges = [&] { + std::unordered_set post_edges_from_orig = + filter(pre_data.edges, [&](SubParallelComputationGraphEdge const &e) { + if (e.raw_edge.has()) { + return true; + } else { + DataflowEdge dfe = e.raw_edge.get(); + parallel_layer_guid_t src = parallel_layer_guid_t{dfe.src.node}; + parallel_layer_guid_t dst = parallel_layer_guid_t{dfe.dst.node}; + return !(contains(matched_nodes, src) || + contains(matched_nodes, 
dst)); + } + }); + + std::unordered_set post_edges_from_sub = + filter(output_graph_data.edges, + [&](SubParallelComputationGraphEdge const &e) { + return !e.raw_edge.has(); + }); + + bidict + output_orig_pattern_mapping = get_output_mapping_for_pcg_pattern_match( + match, sub.pcg_pattern, spcg); + bidict + output_post_outexpr_mapping = get_output_graph_expr_output_mapping( + output_expr_to_result_sub_pcg_mapping, + sub.output_graph_expr, + substitution_output_graph); + + std::unordered_set incoming_to_sub_edges; + for (auto const &[pattern_input, base_graph_tensor] : + match.input_assignment) { + OutputGraphExprInput output_expr_input = + sub.inputs_mapping.at_l(pattern_input); + input_parallel_tensor_guid_t output_graph_input = + output_expr_to_result_sub_pcg_mapping.input_mapping.at_r( + output_expr_input); + std::unordered_set uses = get_parallel_tensor_uses( + substitution_output_graph, + open_parallel_tensor_guid_from_input(output_graph_input)); + for (parallel_tensor_use_t const &use : uses) { + SubParallelComputationGraphEdge new_edge = + subpcg_edge_from_tensor_and_use(base_graph_tensor, use); + incoming_to_sub_edges.insert(new_edge); + } + } + + std::unordered_set outgoing_from_sub_edges; + for (ParallelComputationGraphEdge const &outgoing_edge : + get_subgraph_outgoing_edges(spcg, matched_nodes)) { + parallel_tensor_guid_t original_tensor = + get_parallel_tensor(outgoing_edge); + PatternNodeOutput pattern_tensor = + output_orig_pattern_mapping.at_r(original_tensor); + OutputGraphExprNodeOutput output_graph_tensor = + sub.outputs_mapping.at_l(pattern_tensor); + parallel_tensor_guid_t new_tensor = + output_post_outexpr_mapping.at_r(output_graph_tensor); + + SubParallelComputationGraphEdge new_edge = + subpcg_edge_from_tensor_and_dst( + new_tensor, + get_dst_layer(outgoing_edge), + get_dst_layer_input_idx(outgoing_edge)); + outgoing_from_sub_edges.insert(new_edge); + } + + return set_union(std::vector{ + post_edges_from_orig, + post_edges_from_sub, + 
incoming_to_sub_edges, + outgoing_from_sub_edges, + }); + }(); + + std::unordered_set post_inputs = + pre_data.inputs; + + std::unordered_map + post_value_data = [&] { + std::unordered_map + post_value_data_from_orig = filter_keys( + pre_data.value_data, [&](open_parallel_tensor_guid_t const &t) { + return visit_open_parallel_tensor_guid( + t, + overload{ + [&](parallel_tensor_guid_t const &t) { + return contains(post_nodes_from_original_graph, + get_source_layer(t)); + }, + [](input_parallel_tensor_guid_t const &) { + return true; + }, + }); + }); + + std::unordered_map + post_value_data_from_sub = output_graph_data.value_data; + return merge_disjoint_maps(post_value_data_from_orig, + post_value_data_from_sub); + }(); + + SubParallelComputationGraphData post_data = SubParallelComputationGraphData{ + post_node_data, + post_edges, + post_inputs, + post_value_data, + }; + + return SearchResult{ + pcg_from_sub_pcg_by_dropping_inputs(sub_pcg_from_graph_data(post_data)), + MachineMapping{machine_views}}; +} + +} // namespace FlexFlow + +#endif \ No newline at end of file diff --git a/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc b/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc index cfb34aac3a..d73764dc3e 100644 --- a/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc +++ b/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc @@ -97,8 +97,8 @@ bool pattern_matches_subgraph_under( &full_graph_values_to_subgraph_inputs, UnlabelledDataflowGraphPatternMatch const &match, MatchAdditionalCriterion const &additional_criterion) { - std::cout << "gamer" << std::endl; - std::cout << get_open_dataflow_values(pattern.raw_graph) << std::endl; + std::cout << "gamer" << std::endl; + std::cout << get_open_dataflow_values(pattern.raw_graph) << std::endl; SubgraphConcreteFromPattern concrete_from_pattern{ match, full_graph_values_to_subgraph_inputs}; @@ -135,18 +135,18 @@ bool pattern_matches_subgraph_under( } 
std::cout << "later later mid of pattern amtches subgrpah under" << std::endl; - for (PatternValue const &pattern_value : get_values(pattern)) { std::cout << "dfjsahdfkiasjhdfkasjhdfkasdjhdfbgk awerhurgvt " << std::endl; std::cout << get_open_dataflow_values(pattern.raw_graph) << std::endl; std::cout << pattern_value << std::endl; if (!additional_criterion.value_criterion( pattern_value, concrete_from_pattern(pattern_value))) { - std::cout << "dfjsahdfkiasjhdfkasjhdfkasdjhdfbgk awerhurgvtfwewefewfewf " << std::endl; + std::cout << "dfjsahdfkiasjhdfkasjhdfkasdjhdfbgk awerhurgvtfwewefewfewf " + << std::endl; return false; } } - std::cout << "end of pattern amtches subgrpah under" << std::endl; + std::cout << "end of pattern amtches subgrpah under" << std::endl; return true; } @@ -156,19 +156,19 @@ bool unlabelled_pattern_does_match( OpenDataflowGraphView const &graph, UnlabelledDataflowGraphPatternMatch const &match, MatchAdditionalCriterion const &additional_criterion) { - std::cout << "unlabelled_pattern_does_match" << std::endl; + std::cout << "unlabelled_pattern_does_match" << std::endl; OpenDataflowSubgraphResult subgraph_result = subgraph_matched(graph, match); OpenDataflowGraphView matched_subgraph = subgraph_result.graph; assert(left_entries(match.node_assignment) == get_nodes(pattern)); assert(right_entries(match.node_assignment) == get_nodes(matched_subgraph)); - std::cout << "middle of" << std::endl; - std::cout << get_open_dataflow_values(pattern.raw_graph) << std::endl; - std::cout << left_entries(match.node_assignment) << std::endl; - std::cout << right_entries(match.node_assignment) << std::endl; - std::cout << get_nodes(pattern) << std::endl; - std::cout << get_nodes(matched_subgraph) << std::endl; + std::cout << "middle of" << std::endl; + std::cout << get_open_dataflow_values(pattern.raw_graph) << std::endl; + std::cout << left_entries(match.node_assignment) << std::endl; + std::cout << right_entries(match.node_assignment) << std::endl; + std::cout 
<< get_nodes(pattern) << std::endl; + std::cout << get_nodes(matched_subgraph) << std::endl; MatchAdditionalCriterion through_subgraph_operation = MatchAdditionalCriterion{ @@ -176,24 +176,24 @@ bool unlabelled_pattern_does_match( [&](PatternValue const &pv, OpenDataflowValue const &v) { return v.visit(overload{ [&](DataflowOutput const &) { - //std::cout << "whefihweoifhewfi" < #include -float randf() { +inline float randf() { return static_cast(std::rand()) / static_cast(RAND_MAX); } From 355fe3f29518f875b5ef00ff13285eab5c51c892 Mon Sep 17 00:00:00 2001 From: Victor Li Date: Thu, 3 Apr 2025 01:58:52 -0700 Subject: [PATCH 04/11] Adding substitutions to MCMC (not quite working yet) --- ..._substitution_and_update_machine_mapping.h | 0 .../machine_mapping_mutation_set.h | 5 +- .../mcmc/mcmc_search_config.struct.toml | 5 + ...ubstitution_and_update_machine_mapping.cc} | 4 +- .../machine_mapping_mutation_set.cc | 24 ++-- .../src/compiler/mcmc/mcmc_algorithm.cc | 107 +++++++++--------- .../test/src/compiler/mcmc/mcmc_algorithm.cc | 12 +- .../include/substitutions/pcg_pattern.h | 4 + .../substitutions/unity_substitution_set.h | 2 + .../operator_pattern/satisfies_constraint.cc | 1 - .../src/substitutions/pcg_pattern.cc | 12 ++ .../substitutions/unity_substitution_set.cc | 10 ++ .../unlabelled/pattern_matching.cc | 23 ---- 13 files changed, 108 insertions(+), 101 deletions(-) rename lib/{substitutions/include/substitutions/apply_substitution => compiler/include/compiler/machine_mapping}/apply_substitution_and_update_machine_mapping.h (100%) rename lib/{substitutions/src/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping => compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc} (98%) diff --git a/lib/substitutions/include/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h similarity index 
100% rename from lib/substitutions/include/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping.h rename to lib/compiler/include/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h index 443ab06f02..6dfefec7d1 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h @@ -7,14 +7,15 @@ namespace FlexFlow { std::optional get_naive_mapping(ParallelComputationGraph &pcg, - MachineSpecification const &resources); + MachineSpecification const &resources, + DeviceType const &device_type); std::vector get_possible_mutations(SearchResult mapped_pcg, MachineSpecification const &resource); std::optional get_random_mutation(SearchResult mapped_pcg, MachineSpecification const &resource, - DeviceType const &device_type = DeviceType::GPU); + DeviceType const &device_type); MachineView increment_stride(MachineView machine_view, nonnegative_int dim); MachineView decrement_all_strides(MachineView machine_view); MachineView change_stride(nonnegative_int stride, diff --git a/lib/compiler/include/compiler/mcmc/mcmc_search_config.struct.toml b/lib/compiler/include/compiler/mcmc/mcmc_search_config.struct.toml index e96ced81cd..6bc5d98be7 100644 --- a/lib/compiler/include/compiler/mcmc/mcmc_search_config.struct.toml +++ b/lib/compiler/include/compiler/mcmc/mcmc_search_config.struct.toml @@ -7,6 +7,7 @@ features = [ ] includes = [ + "pcg/device_type.dtg.h" ] [[fields]] @@ -24,3 +25,7 @@ type = "int" [[fields]] name = "max_num_ops" type = "int" + +[[fields]] +name = "device_type" +type = "::FlexFlow::DeviceType" \ No newline at end of file diff --git a/lib/substitutions/src/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping 
b/lib/compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc similarity index 98% rename from lib/substitutions/src/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping rename to lib/compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc index 1721ee26d8..411ee67145 100644 --- a/lib/substitutions/src/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping +++ b/lib/compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc @@ -1,4 +1,4 @@ -#include "substitutions/apply_substitution/apply_substitution_and_update_machine_mapping.h" +#include "compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h" #include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h" #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" #include "substitutions/apply_substitution/evaluate_substitution_output.h" @@ -181,5 +181,3 @@ SearchResult apply_substitution_and_update_machine_mapping( } } // namespace FlexFlow - -#endif \ No newline at end of file diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc index 36651fdc5d..7f7a54d07a 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc @@ -11,7 +11,8 @@ namespace FlexFlow { std::optional get_naive_mapping(ParallelComputationGraph &pcg, - MachineSpecification const &resources) { + MachineSpecification const &resources, + DeviceType const &device_type) { std::vector layers = topological_ordering(pcg); std::unordered_map machine_views; for (parallel_layer_guid_t layer : layers) { @@ -27,9 +28,9 @@ std::optional } std::optional - get_random_mutation_notlazy(SearchResult mapped_pcg, - MachineSpecification const &resources, - DeviceType const 
&device_type) { + get_random_mutation(SearchResult mapped_pcg, + MachineSpecification const &resources, + DeviceType const &device_type) { ParallelComputationGraph pcg = mapped_pcg.pcg; std::vector layers = topological_ordering(pcg); if (layers.size() == 0) { @@ -90,25 +91,26 @@ std::optional return machine_mapping; } +// "lazy" version just picks a random available machine view for a random layer std::optional - get_random_mutation(SearchResult mapped_pcg, - MachineSpecification const &resources, - DeviceType const &device_type) { + get_random_mutation_lazy(SearchResult mapped_pcg, + MachineSpecification const &resources, + DeviceType const &device_type) { ParallelComputationGraph pcg = mapped_pcg.pcg; std::vector layers = topological_ordering(pcg); if (layers.size() == 0) { return std::nullopt; } - parallel_layer_guid_t random_layer = layers.at(rand() % layers.size()); + parallel_layer_guid_t random_layer = select_random(layers); + ; MachineMapping machine_mapping = mapped_pcg.machine_mapping; MachineView machine_view = machine_mapping.machine_views.at(random_layer); OperatorTaskSpace task = get_operator_task_space(pcg, random_layer); std::vector allowed_machine_views = - vector_of(get_allowed_machine_views(resources, task, DeviceType::GPU)); - MachineView random_new_machine_view = - allowed_machine_views.at(rand() % allowed_machine_views.size()); + vector_of(get_allowed_machine_views(resources, task, device_type)); + MachineView random_new_machine_view = select_random(allowed_machine_views); machine_mapping.machine_views.at(random_layer) = random_new_machine_view; return machine_mapping; diff --git a/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc b/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc index 6553823252..f8ef392eee 100644 --- a/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc +++ b/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc @@ -1,8 +1,8 @@ #include "compiler/mcmc/mcmc_algorithm.h" +#include 
"compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h" #include "compiler/machine_mapping/machine_mapping_mutation_set.h" #include "compiler/mcmc/mcmc_graph_optimize_state.h" #include "compiler/task_graph_simulator/task_simulator.h" -#include "substitutions/apply_substitution/apply_substitution_and_update_machine_mapping.h" #include "substitutions/pcg_pattern.h" #include "substitutions/pcg_pattern_match.h" #include "substitutions/substitution.h" @@ -12,28 +12,28 @@ namespace FlexFlow { -std::vector all_pcgs_obtained_by_applying_a_substitution( - SearchResult const &mapped_pcg, - std::vector const &substitutions) { - std::vector results; - // currently not functional - /*SubParallelComputationGraph subpcg = sub_pcg_from_full_pcg(mapped_pcg.pcg); - for (Substitution const &substitution : substitutions) { - for (PCGPatternMatch const &pattern_match : - find_pattern_matches(substitution.pcg_pattern, subpcg)) { - SearchResult mapped_pcg_from_substitution = - apply_substitution_and_update_machine_mapping( - mapped_pcg, substitution, pattern_match); - results.push_back(mapped_pcg_from_substitution); - } - }*/ - return results; -} - bool mcmc_accept(int delta, float temperature) { return delta < 0 || randf() < exp(-delta / temperature); } +void modify_graph_state(MCMCOptimizeState &best_state, + MCMCOptimizeState ¤t_state, + SearchResult candidate, + CostEstimator const &cost_estimator, + MachineSpecification const &resources, + MCMCSearchConfig const &search_config) { + float best_estimate = best_state.runtime; + float new_estimate = task_simulator_estimate_forward_pass_time( + candidate.pcg, cost_estimator, candidate.machine_mapping, resources); + float runtime_delta = new_estimate - best_estimate; + if (mcmc_accept(runtime_delta, search_config.temperature)) { + current_state = MCMCOptimizeState{candidate, new_estimate}; + if (runtime_delta < 0) { + best_state = current_state; + } + } +} + SearchResult mcmc_graph_optimize(ParallelComputationGraph 
&pcg, CostEstimator const &cost_estimator, MachineSpecification const &resources, @@ -42,7 +42,7 @@ SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, std::vector substitutions = get_substitution_set(resources); std::optional naive_mapping = - get_naive_mapping(pcg, resources); + get_naive_mapping(pcg, resources, search_config.device_type); if (naive_mapping == std::nullopt) { throw std::runtime_error("Failed to find any solutions"); } @@ -58,49 +58,46 @@ SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, ++iteration) { SearchResult current_mapped_pcg = current_state.mapped_pcg; - float best_estimate = best_state.runtime; - /*for (SearchResult const &new_mapped_pcg : - all_pcgs_obtained_by_applying_a_substitution(current_mapped_pcg, - substitutions)) { - float new_estimate = task_simulator_estimate_forward_pass_time( - new_mapped_pcg.pcg, - cost_estimator, - new_mapped_pcg.machine_mapping, - resources); - - if (new_estimate <= search_config.threshold && - get_nodes(new_mapped_pcg.pcg.raw_graph).size() <= - search_config.max_num_ops) { - candidates.push(MCMCOptimizeState{new_mapped_pcg, -1 * new_estimate}); - } - }*/ - - std::optional new_machine_mapping = - get_random_mutation(current_mapped_pcg, resources); - for (int searched_mutations = 0; + std::optional new_machine_mapping = get_random_mutation( + current_mapped_pcg, resources, search_config.device_type); + for (int searched_mutations = 1; searched_mutations < search_config.num_mutations_per_iteration; searched_mutations++) { if (new_machine_mapping == std::nullopt) { break; } - float new_estimate = - task_simulator_estimate_forward_pass_time(current_mapped_pcg.pcg, - cost_estimator, - new_machine_mapping.value(), - resources); - float runtime_delta = new_estimate - best_estimate; + modify_graph_state( + best_state, + current_state, + SearchResult{current_mapped_pcg.pcg, new_machine_mapping.value()}, + cost_estimator, + resources, + search_config); - if (mcmc_accept(runtime_delta, 
search_config.temperature)) { - current_state = MCMCOptimizeState{ - SearchResult{current_mapped_pcg.pcg, new_machine_mapping.value()}, - new_estimate}; - if (runtime_delta < 0) { - best_state = current_state; - } - } + new_machine_mapping = get_random_mutation( + current_mapped_pcg, resources, search_config.device_type); + } - new_machine_mapping = get_random_mutation(current_mapped_pcg, resources); + std::optional random_substitution = + get_random_substitution(resources); + if (random_substitution != std::nullopt) { + std::optional pattern_match = get_random_pattern_match( + random_substitution.value().pcg_pattern, + sub_pcg_from_full_pcg(current_mapped_pcg.pcg)); + if (pattern_match != std::nullopt) { + SearchResult new_mapped_pcg = + apply_substitution_and_update_machine_mapping( + current_mapped_pcg, + random_substitution.value(), + pattern_match.value()); + modify_graph_state(best_state, + current_state, + new_mapped_pcg, + cost_estimator, + resources, + search_config); + } } } diff --git a/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc b/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc index 7cde75cecf..7aad8b098d 100644 --- a/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc +++ b/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc @@ -62,12 +62,12 @@ TEST_SUITE(FF_TEST_SUITE) { /*intra_node_bandwidth=*/1, }; - MCMCSearchConfig search_config = MCMCSearchConfig{ - /*temperature=*/1.0, - /*num_iterations=*/100, - /*num_mutations_per_iteration=*/10, - /*max_num_ops=*/100, - }; + MCMCSearchConfig search_config = + MCMCSearchConfig{/*temperature=*/1.0, + /*num_iterations=*/5, + /*num_mutations_per_iteration=*/10, + /*max_num_ops=*/100, + /*device_type=*/DeviceType::GPU}; SearchResult result = mcmc_graph_optimize( pcg, cost_estimator, full_machine_spec, search_config); diff --git a/lib/substitutions/include/substitutions/pcg_pattern.h b/lib/substitutions/include/substitutions/pcg_pattern.h index f0962b15c2..5005a0b51c 100644 --- 
a/lib/substitutions/include/substitutions/pcg_pattern.h +++ b/lib/substitutions/include/substitutions/pcg_pattern.h @@ -12,6 +12,10 @@ namespace FlexFlow { std::unordered_set get_nodes(PCGPattern const &); +std::optional + get_random_pattern_match(PCGPattern const &pattern, + SubParallelComputationGraph const &pcg); + /** * @brief Find all locations in \p pcg that match \p pattern */ diff --git a/lib/substitutions/include/substitutions/unity_substitution_set.h b/lib/substitutions/include/substitutions/unity_substitution_set.h index 183f76ac8a..959ba3da2c 100644 --- a/lib/substitutions/include/substitutions/unity_substitution_set.h +++ b/lib/substitutions/include/substitutions/unity_substitution_set.h @@ -6,6 +6,8 @@ #include "utils/fmt/vector.h" namespace FlexFlow { +std::optional + get_random_substitution(MachineSpecification const &resources); std::vector get_substitution_set(MachineSpecification const &resources); diff --git a/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc b/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc index a95db6fbb4..f39b771364 100644 --- a/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc +++ b/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc @@ -13,7 +13,6 @@ bool operator_satisfies_constraint( return false; } - // std::cout << constraint.constraint_type << std::endl; switch (constraint.constraint_type) { case ConstraintType::EQUAL: return expr_val.value() == constraint.attribute_value; diff --git a/lib/substitutions/src/substitutions/pcg_pattern.cc b/lib/substitutions/src/substitutions/pcg_pattern.cc index a0af875848..fbc181a0f9 100644 --- a/lib/substitutions/src/substitutions/pcg_pattern.cc +++ b/lib/substitutions/src/substitutions/pcg_pattern.cc @@ -11,6 +11,7 @@ #include "utils/graph/node/algorithms.h" #include "utils/graph/open_dataflow_graph/algorithms/get_inputs.h" #include 
"utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.h" +#include "utils/random_utils.h" namespace FlexFlow { @@ -37,6 +38,17 @@ static MatchAdditionalCriterion }}; } +std::optional + get_random_pattern_match(PCGPattern const &pattern, + SubParallelComputationGraph const &pcg) { + std::vector pattern_matches = + find_pattern_matches(pattern, pcg); + if (pattern_matches.empty()) { + return std::nullopt; + } + return select_random(pattern_matches); +} + std::vector find_pattern_matches(PCGPattern const &pattern, SubParallelComputationGraph const &pcg) { diff --git a/lib/substitutions/src/substitutions/unity_substitution_set.cc b/lib/substitutions/src/substitutions/unity_substitution_set.cc index 4b00cdd95f..c8d9266978 100644 --- a/lib/substitutions/src/substitutions/unity_substitution_set.cc +++ b/lib/substitutions/src/substitutions/unity_substitution_set.cc @@ -7,9 +7,19 @@ #include "utils/containers/get_only.h" #include "utils/nonnegative_int/nonnegative_int.h" #include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/random_utils.h" namespace FlexFlow { +std::optional + get_random_substitution(MachineSpecification const &resources) { + std::vector substitutions = get_substitution_set(resources); + if (substitutions.empty()) { + return std::nullopt; + } + return select_random(substitutions); +} + std::vector get_substitution_set(MachineSpecification const &resources) { std::vector substitutions; diff --git a/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc b/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc index d73764dc3e..304bb8cf46 100644 --- a/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc +++ b/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc @@ -97,8 +97,6 @@ bool pattern_matches_subgraph_under( &full_graph_values_to_subgraph_inputs, UnlabelledDataflowGraphPatternMatch const &match, MatchAdditionalCriterion const &additional_criterion) { - std::cout << 
"gamer" << std::endl; - std::cout << get_open_dataflow_values(pattern.raw_graph) << std::endl; SubgraphConcreteFromPattern concrete_from_pattern{ match, full_graph_values_to_subgraph_inputs}; @@ -133,20 +131,13 @@ bool pattern_matches_subgraph_under( if (concrete_values != concrete_values_from_match) { return false; } - std::cout << "later later mid of pattern amtches subgrpah under" << std::endl; for (PatternValue const &pattern_value : get_values(pattern)) { - std::cout << "dfjsahdfkiasjhdfkasjhdfkasdjhdfbgk awerhurgvt " << std::endl; - std::cout << get_open_dataflow_values(pattern.raw_graph) << std::endl; - std::cout << pattern_value << std::endl; if (!additional_criterion.value_criterion( pattern_value, concrete_from_pattern(pattern_value))) { - std::cout << "dfjsahdfkiasjhdfkasjhdfkasdjhdfbgk awerhurgvtfwewefewfewf " - << std::endl; return false; } } - std::cout << "end of pattern amtches subgrpah under" << std::endl; return true; } @@ -156,19 +147,12 @@ bool unlabelled_pattern_does_match( OpenDataflowGraphView const &graph, UnlabelledDataflowGraphPatternMatch const &match, MatchAdditionalCriterion const &additional_criterion) { - std::cout << "unlabelled_pattern_does_match" << std::endl; OpenDataflowSubgraphResult subgraph_result = subgraph_matched(graph, match); OpenDataflowGraphView matched_subgraph = subgraph_result.graph; assert(left_entries(match.node_assignment) == get_nodes(pattern)); assert(right_entries(match.node_assignment) == get_nodes(matched_subgraph)); - std::cout << "middle of" << std::endl; - std::cout << get_open_dataflow_values(pattern.raw_graph) << std::endl; - std::cout << left_entries(match.node_assignment) << std::endl; - std::cout << right_entries(match.node_assignment) << std::endl; - std::cout << get_nodes(pattern) << std::endl; - std::cout << get_nodes(matched_subgraph) << std::endl; MatchAdditionalCriterion through_subgraph_operation = MatchAdditionalCriterion{ @@ -176,24 +160,17 @@ bool unlabelled_pattern_does_match( 
[&](PatternValue const &pv, OpenDataflowValue const &v) { return v.visit(overload{ [&](DataflowOutput const &) { - // std::cout << "whefihweoifhewfi" < Date: Fri, 4 Apr 2025 14:47:02 -0700 Subject: [PATCH 05/11] Added generic MCMC --- .../compiler/mcmc/generic_mcmc_algorithm.h | 60 +++++++++++++++++++ .../mcmc/generic_mcmc_config.struct.toml | 19 ++++++ .../compiler/mcmc/generic_mcmc_state.h | 27 +++++++++ .../compiler/mcmc/generic_mcmc_algorithm.cc | 1 + .../src/compiler/mcmc/generic_mcmc_state.cc | 12 ++++ .../compiler/mcmc/generic_mcmc_algorithm.cc | 32 ++++++++++ 6 files changed, 151 insertions(+) create mode 100644 lib/compiler/include/compiler/mcmc/generic_mcmc_algorithm.h create mode 100644 lib/compiler/include/compiler/mcmc/generic_mcmc_config.struct.toml create mode 100644 lib/compiler/include/compiler/mcmc/generic_mcmc_state.h create mode 100644 lib/compiler/src/compiler/mcmc/generic_mcmc_algorithm.cc create mode 100644 lib/compiler/src/compiler/mcmc/generic_mcmc_state.cc create mode 100644 lib/compiler/test/src/compiler/mcmc/generic_mcmc_algorithm.cc diff --git a/lib/compiler/include/compiler/mcmc/generic_mcmc_algorithm.h b/lib/compiler/include/compiler/mcmc/generic_mcmc_algorithm.h new file mode 100644 index 0000000000..bea0061d47 --- /dev/null +++ b/lib/compiler/include/compiler/mcmc/generic_mcmc_algorithm.h @@ -0,0 +1,60 @@ +#ifndef _FLEXFLOW_COMPILER_MCMC_GENERIC_MCMC_ALGORITHM_H +#define _FLEXFLOW_COMPILER_MCMC_GENERIC_MCMC_ALGORITHM_H + +#include "compiler/mcmc/generic_mcmc_config.dtg.h" +#include "compiler/mcmc/generic_mcmc_state.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/random_utils.h" +#include + +namespace FlexFlow { + +bool mcmc_accept(float delta, float temperature) { + return delta < 0 || (randf() < exp(-delta / temperature)); +} + +template +void modify_state(Generic_MCMC_state &best_state, + Generic_MCMC_state ¤t_state, + State candidate, + ScoringFunc scorer, + float temperature) { + float best_estimate = 
best_state.get_score(); + float new_estimate = scorer(candidate); + float runtime_delta = new_estimate - best_estimate; + if (mcmc_accept(runtime_delta, temperature)) { + current_state = Generic_MCMC_state(candidate, new_estimate); + if (runtime_delta < 0) { + best_state = current_state; + } + } +} + +// GeneratingFunc : State -> nn_int -> std::optional +// ScoringFunc : State -> float + +template +Generic_MCMC_state + minimize_score(State const &starting_state, + GeneratingFunc const &generator, + ScoringFunc const &scorer, + GenericMCMCConfig const &search_config) { + using MCMCState = Generic_MCMC_state; + MCMCState best_state = MCMCState(starting_state, scorer(starting_state)); + MCMCState current_state = best_state; + for (nonnegative_int i : nonnegative_range(search_config.num_iterations)) { + std::optional candidate = generator(current_state.get_state(), i); + if (candidate != std::nullopt) { + modify_state(best_state, + current_state, + candidate.value(), + scorer, + search_config.temperature); + } + } + return best_state; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/mcmc/generic_mcmc_config.struct.toml b/lib/compiler/include/compiler/mcmc/generic_mcmc_config.struct.toml new file mode 100644 index 0000000000..e11c84f0bd --- /dev/null +++ b/lib/compiler/include/compiler/mcmc/generic_mcmc_config.struct.toml @@ -0,0 +1,19 @@ +namespace = "FlexFlow" +name = "GenericMCMCConfig" +features = [ + "eq", + "hash", + "fmt", +] + +includes = [ + "utils/nonnegative_int/nonnegative_int.h" +] + +[[fields]] +name = "temperature" +type = "float" + +[[fields]] +name = "num_iterations" +type = "::FlexFlow::nonnegative_int" \ No newline at end of file diff --git a/lib/compiler/include/compiler/mcmc/generic_mcmc_state.h b/lib/compiler/include/compiler/mcmc/generic_mcmc_state.h new file mode 100644 index 0000000000..6a6aada32b --- /dev/null +++ b/lib/compiler/include/compiler/mcmc/generic_mcmc_state.h @@ -0,0 +1,27 @@ +#ifndef 
_FLEXFLOW_COMPILER_MCMC_GENERIC_MCMC_STATE_H +#define _FLEXFLOW_COMPILER_MCMC_GENERIC_MCMC_STATE_H +#include "utils/nonnegative_int/nonnegative_int.h" + +namespace FlexFlow { + +template +struct Generic_MCMC_state { +public: + Generic_MCMC_state(State const &state, Score const &score) + : state(state), score(score) {} + + State const &get_state() const { + return state; + } + Score const &get_score() const { + return score; + } + +private: + State state; + Score score; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/src/compiler/mcmc/generic_mcmc_algorithm.cc b/lib/compiler/src/compiler/mcmc/generic_mcmc_algorithm.cc new file mode 100644 index 0000000000..1bf4f5c2b7 --- /dev/null +++ b/lib/compiler/src/compiler/mcmc/generic_mcmc_algorithm.cc @@ -0,0 +1 @@ +#include "compiler/mcmc/generic_mcmc_algorithm.h" diff --git a/lib/compiler/src/compiler/mcmc/generic_mcmc_state.cc b/lib/compiler/src/compiler/mcmc/generic_mcmc_state.cc new file mode 100644 index 0000000000..6aa4dd5eff --- /dev/null +++ b/lib/compiler/src/compiler/mcmc/generic_mcmc_state.cc @@ -0,0 +1,12 @@ +#include "compiler/mcmc/generic_mcmc_state.h" +#include "utils/archetypes/ordered_value_type.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { +using State = value_type<0>; +using Score = ordered_value_type<1>; + +template struct Generic_MCMC_state; +template struct Generic_MCMC_state; + +} // namespace FlexFlow diff --git a/lib/compiler/test/src/compiler/mcmc/generic_mcmc_algorithm.cc b/lib/compiler/test/src/compiler/mcmc/generic_mcmc_algorithm.cc new file mode 100644 index 0000000000..0a175933cf --- /dev/null +++ b/lib/compiler/test/src/compiler/mcmc/generic_mcmc_algorithm.cc @@ -0,0 +1,32 @@ +#include "compiler/mcmc/generic_mcmc_algorithm.h" +#include "doctest/doctest.h" + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("generic_mcmc_algorithm") { + float starting_state = 0.1; + auto generating_func = [](float x, + nonnegative_int i) -> 
std::optional { + float new_x = x + (randf() - 0.5) / (i.unwrap_nonnegative() + 1); + if (new_x < 0) { + return std::nullopt; + } + if (new_x > 1) { + return std::nullopt; + } + return new_x; + }; + auto scoring_func = [](float x) { return (x - 0.5) * (x - 0.5); }; + GenericMCMCConfig config = GenericMCMCConfig{/*temperature*/ 1.0, + /*num_iterations*/ 10_n}; + Generic_MCMC_state result = + minimize_score(starting_state, generating_func, scoring_func, config); + float answer = result.get_state(); + float error = result.get_score(); + CHECK(answer > 0.49); + CHECK(answer < 0.51); + CHECK(error >= 0); + CHECK(error < 0.01); + } +} From 2f186c393cdbdd4e60b7c4d0571da0870c9eff98 Mon Sep 17 00:00:00 2001 From: Victor Li Date: Fri, 4 Apr 2025 18:07:06 -0700 Subject: [PATCH 06/11] Refactor MCMC to fit the generic --- .../compiler/mcmc/generic_mcmc_algorithm.h | 31 +++-- .../compiler/mcmc/mcmc_graph_optimize_state.h | 35 ------ ...cmc_algorithm.h => mcmc_over_mapped_pcg.h} | 8 +- ...> mcmc_over_mapped_pcg_config.struct.toml} | 15 +-- .../src/compiler/mcmc/mcmc_algorithm.cc | 107 ------------------ .../mcmc/mcmc_graph_optimize_state.cc | 84 -------------- .../src/compiler/mcmc/mcmc_over_mapped_pcg.cc | 73 ++++++++++++ .../compiler/mcmc/generic_mcmc_algorithm.cc | 4 +- .../test/src/compiler/mcmc/mcmc_algorithm.cc | 13 +-- 9 files changed, 105 insertions(+), 265 deletions(-) delete mode 100644 lib/compiler/include/compiler/mcmc/mcmc_graph_optimize_state.h rename lib/compiler/include/compiler/mcmc/{mcmc_algorithm.h => mcmc_over_mapped_pcg.h} (71%) rename lib/compiler/include/compiler/mcmc/{mcmc_search_config.struct.toml => mcmc_over_mapped_pcg_config.struct.toml} (53%) delete mode 100644 lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc delete mode 100644 lib/compiler/src/compiler/mcmc/mcmc_graph_optimize_state.cc create mode 100644 lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc diff --git a/lib/compiler/include/compiler/mcmc/generic_mcmc_algorithm.h 
b/lib/compiler/include/compiler/mcmc/generic_mcmc_algorithm.h index bea0061d47..a27ecbc8f4 100644 --- a/lib/compiler/include/compiler/mcmc/generic_mcmc_algorithm.h +++ b/lib/compiler/include/compiler/mcmc/generic_mcmc_algorithm.h @@ -9,22 +9,19 @@ namespace FlexFlow { -bool mcmc_accept(float delta, float temperature) { - return delta < 0 || (randf() < exp(-delta / temperature)); -} - template -void modify_state(Generic_MCMC_state &best_state, - Generic_MCMC_state ¤t_state, - State candidate, - ScoringFunc scorer, - float temperature) { +void modify_state_for_minimization( + Generic_MCMC_state &best_state, + Generic_MCMC_state ¤t_state, + State candidate, + ScoringFunc scorer, + float temperature) { float best_estimate = best_state.get_score(); float new_estimate = scorer(candidate); - float runtime_delta = new_estimate - best_estimate; - if (mcmc_accept(runtime_delta, temperature)) { + float delta = new_estimate - best_estimate; + if (delta < 0 || (randf() < exp(-delta / temperature))) { current_state = Generic_MCMC_state(candidate, new_estimate); - if (runtime_delta < 0) { + if (delta < 0) { best_state = current_state; } } @@ -45,11 +42,11 @@ Generic_MCMC_state for (nonnegative_int i : nonnegative_range(search_config.num_iterations)) { std::optional candidate = generator(current_state.get_state(), i); if (candidate != std::nullopt) { - modify_state(best_state, - current_state, - candidate.value(), - scorer, - search_config.temperature); + modify_state_for_minimization(best_state, + current_state, + candidate.value(), + scorer, + search_config.temperature); } } return best_state; diff --git a/lib/compiler/include/compiler/mcmc/mcmc_graph_optimize_state.h b/lib/compiler/include/compiler/mcmc/mcmc_graph_optimize_state.h deleted file mode 100644 index 3306af123a..0000000000 --- a/lib/compiler/include/compiler/mcmc/mcmc_graph_optimize_state.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef _FLEXFLOW_COMPILER_MCMC_ALGORITHM_STATE_H -#define 
_FLEXFLOW_COMPILER_MCMC_ALGORITHM_STATE_H - -#include "compiler/search_result.h" -#include "pcg/parallel_computation_graph/parallel_computation_graph.h" - -namespace FlexFlow { - -struct MCMCOptimizeState { - MCMCOptimizeState() = delete; - explicit MCMCOptimizeState(SearchResult const &mapped_pcg, float runtime); - - SearchResult mapped_pcg; - float runtime; - - bool operator==(MCMCOptimizeState const &other) const; - bool operator!=(MCMCOptimizeState const &other) const; - bool operator<(MCMCOptimizeState const &other) const; -}; - -std::string format_as(MCMCOptimizeState const &); -std::ostream &operator<<(std::ostream &, MCMCOptimizeState const &); - -} // namespace FlexFlow - -namespace std { - -template <> -struct hash<::FlexFlow::MCMCOptimizeState> { - size_t operator()(::FlexFlow::MCMCOptimizeState const &) const; -}; - -} // namespace std - -#endif diff --git a/lib/compiler/include/compiler/mcmc/mcmc_algorithm.h b/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg.h similarity index 71% rename from lib/compiler/include/compiler/mcmc/mcmc_algorithm.h rename to lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg.h index b17eaf3e16..c2d8737184 100644 --- a/lib/compiler/include/compiler/mcmc/mcmc_algorithm.h +++ b/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg.h @@ -1,8 +1,8 @@ -#ifndef _FLEXFLOW_COMPILER_MCMC_ALGORITHM_H -#define _FLEXFLOW_COMPILER_MCMC_ALGORITHM_H +#ifndef _FLEXFLOW_COMPILER_MCMC_OVER_MAPPED_PCG_H +#define _FLEXFLOW_COMPILER_MCMC_OVER_MAPPED_PCG_H #include "compiler/cost_estimator/cost_estimator.h" -#include "compiler/mcmc/mcmc_search_config.dtg.h" +#include "compiler/mcmc/mcmc_over_mapped_pcg_config.dtg.h" #include "compiler/search_result.dtg.h" #include "pcg/computation_graph.h" #include "pcg/machine_specification.dtg.h" @@ -15,7 +15,7 @@ namespace FlexFlow { SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, CostEstimator const &cost_estimator, MachineSpecification const &resources, - MCMCSearchConfig const 
&search_config); + MCMCOverMappedPCGConfig const &search_config); } // namespace FlexFlow diff --git a/lib/compiler/include/compiler/mcmc/mcmc_search_config.struct.toml b/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg_config.struct.toml similarity index 53% rename from lib/compiler/include/compiler/mcmc/mcmc_search_config.struct.toml rename to lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg_config.struct.toml index 6bc5d98be7..e1548a581e 100644 --- a/lib/compiler/include/compiler/mcmc/mcmc_search_config.struct.toml +++ b/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg_config.struct.toml @@ -1,5 +1,5 @@ namespace = "FlexFlow" -name = "MCMCSearchConfig" +name = "MCMCOverMappedPCGConfig" features = [ "eq", "hash", @@ -7,7 +7,8 @@ features = [ ] includes = [ - "pcg/device_type.dtg.h" + "pcg/device_type.dtg.h", + "utils/nonnegative_int/nonnegative_int.h" ] [[fields]] @@ -16,15 +17,11 @@ type = "float" [[fields]] name = "num_iterations" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] -name = "num_mutations_per_iteration" -type = "int" - -[[fields]] -name = "max_num_ops" -type = "int" +name = "substitution_interval" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "device_type" diff --git a/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc b/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc deleted file mode 100644 index f8ef392eee..0000000000 --- a/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc +++ /dev/null @@ -1,107 +0,0 @@ -#include "compiler/mcmc/mcmc_algorithm.h" -#include "compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h" -#include "compiler/machine_mapping/machine_mapping_mutation_set.h" -#include "compiler/mcmc/mcmc_graph_optimize_state.h" -#include "compiler/task_graph_simulator/task_simulator.h" -#include "substitutions/pcg_pattern.h" -#include "substitutions/pcg_pattern_match.h" -#include "substitutions/substitution.h" -#include "substitutions/unity_substitution_set.h" -#include 
"utils/optional.h" -#include "utils/random_utils.h" - -namespace FlexFlow { - -bool mcmc_accept(int delta, float temperature) { - return delta < 0 || randf() < exp(-delta / temperature); -} - -void modify_graph_state(MCMCOptimizeState &best_state, - MCMCOptimizeState ¤t_state, - SearchResult candidate, - CostEstimator const &cost_estimator, - MachineSpecification const &resources, - MCMCSearchConfig const &search_config) { - float best_estimate = best_state.runtime; - float new_estimate = task_simulator_estimate_forward_pass_time( - candidate.pcg, cost_estimator, candidate.machine_mapping, resources); - float runtime_delta = new_estimate - best_estimate; - if (mcmc_accept(runtime_delta, search_config.temperature)) { - current_state = MCMCOptimizeState{candidate, new_estimate}; - if (runtime_delta < 0) { - best_state = current_state; - } - } -} - -SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, - CostEstimator const &cost_estimator, - MachineSpecification const &resources, - MCMCSearchConfig const &search_config) { - - std::vector substitutions = get_substitution_set(resources); - - std::optional naive_mapping = - get_naive_mapping(pcg, resources, search_config.device_type); - if (naive_mapping == std::nullopt) { - throw std::runtime_error("Failed to find any solutions"); - } - - MCMCOptimizeState current_state = MCMCOptimizeState{ - SearchResult{pcg, naive_mapping.value()}, - task_simulator_estimate_forward_pass_time( - pcg, cost_estimator, naive_mapping.value(), resources)}; - - MCMCOptimizeState best_state = current_state; - - for (int iteration = 0; iteration < search_config.num_iterations; - ++iteration) { - - SearchResult current_mapped_pcg = current_state.mapped_pcg; - - std::optional new_machine_mapping = get_random_mutation( - current_mapped_pcg, resources, search_config.device_type); - for (int searched_mutations = 1; - searched_mutations < search_config.num_mutations_per_iteration; - searched_mutations++) { - if (new_machine_mapping == 
std::nullopt) { - break; - } - modify_graph_state( - best_state, - current_state, - SearchResult{current_mapped_pcg.pcg, new_machine_mapping.value()}, - cost_estimator, - resources, - search_config); - - new_machine_mapping = get_random_mutation( - current_mapped_pcg, resources, search_config.device_type); - } - - std::optional random_substitution = - get_random_substitution(resources); - if (random_substitution != std::nullopt) { - std::optional pattern_match = get_random_pattern_match( - random_substitution.value().pcg_pattern, - sub_pcg_from_full_pcg(current_mapped_pcg.pcg)); - if (pattern_match != std::nullopt) { - SearchResult new_mapped_pcg = - apply_substitution_and_update_machine_mapping( - current_mapped_pcg, - random_substitution.value(), - pattern_match.value()); - modify_graph_state(best_state, - current_state, - new_mapped_pcg, - cost_estimator, - resources, - search_config); - } - } - } - - return best_state.mapped_pcg; -} - -} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/mcmc/mcmc_graph_optimize_state.cc b/lib/compiler/src/compiler/mcmc/mcmc_graph_optimize_state.cc deleted file mode 100644 index 2556a50b4d..0000000000 --- a/lib/compiler/src/compiler/mcmc/mcmc_graph_optimize_state.cc +++ /dev/null @@ -1,84 +0,0 @@ -#include "compiler/mcmc/mcmc_graph_optimize_state.h" -#include "pcg/machine_view.h" -#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" - -namespace FlexFlow { - -MCMCOptimizeState::MCMCOptimizeState(SearchResult const &mapped_pcg, - float runtime) - : mapped_pcg(mapped_pcg), runtime(runtime) {} - -bool MCMCOptimizeState::operator==(MCMCOptimizeState const &other) const { - return pcgs_are_isomorphic(mapped_pcg.pcg, other.mapped_pcg.pcg) && - mapped_pcg.machine_mapping == other.mapped_pcg.machine_mapping && - runtime == other.runtime; -} - -bool MCMCOptimizeState::operator!=(MCMCOptimizeState const &other) const { - return !(*this == other); -} - -bool MCMCOptimizeState::operator<(MCMCOptimizeState const 
&other) const { - return runtime < other.runtime; -} - -std::string format_as(MCMCOptimizeState const &r) { - return fmt::format("", - as_dot(r.mapped_pcg.pcg), - r.mapped_pcg.machine_mapping, - r.runtime); -} - -std::ostream &operator<<(std::ostream &s, MCMCOptimizeState const &st) { - return (s << fmt::to_string(st)); -} -} // namespace FlexFlow - -namespace std { - -size_t hash<::FlexFlow::MCMCOptimizeState>::operator()( - ::FlexFlow::MCMCOptimizeState const &state) const { - ::FlexFlow::ParallelComputationGraph pcg = state.mapped_pcg.pcg; - ::FlexFlow::MachineMapping machine_mapping = state.mapped_pcg.machine_mapping; - size_t seed = 0; - ::FlexFlow::hash_combine(seed, state.runtime); - std::vector<::FlexFlow::parallel_layer_guid_t> layers = - topological_ordering(pcg); - ::FlexFlow::hash_combine(seed, layers.size()); - for (::FlexFlow::parallel_layer_guid_t const &layer : layers) { - ::FlexFlow::hash_combine(seed, get_parallel_layer_attrs(pcg, layer)); - std::vector<::FlexFlow::parallel_tensor_guid_t> inputs = - get_incoming_tensors(pcg, layer); - ::FlexFlow::hash_combine(seed, inputs.size()); - for (::FlexFlow::parallel_tensor_guid_t input : inputs) { - for (size_t i = 0; i < layers.size(); ++i) { - if (get_source_layer(input) == layers.at(i)) { - ::FlexFlow::hash_combine(seed, i); - break; - } - } - } - ::FlexFlow::MachineView machine_view = - machine_mapping.machine_views.at(layer); - ::FlexFlow::hash_combine(seed, machine_view.start.node_idx); - ::FlexFlow::hash_combine(seed, machine_view.start.device_idx); - if (get_device_type(machine_view) == ::FlexFlow::DeviceType::CPU) { - ::FlexFlow::hash_combine(seed, 0); - } else { - ::FlexFlow::hash_combine(seed, 1); - } - for (::FlexFlow::MachineViewDimension dimension : machine_view.dimensions) { - ::FlexFlow::hash_combine(seed, dimension.stride.unwrapped); - if (dimension.projection == - ::FlexFlow::MachineSpecificationDimension::INTRA_NODE) { - ::FlexFlow::hash_combine(seed, 0); - } else { - 
::FlexFlow::hash_combine(seed, 1); - } - } - } - - return seed; -} - -} // namespace std diff --git a/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc b/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc new file mode 100644 index 0000000000..47ecc2479f --- /dev/null +++ b/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc @@ -0,0 +1,73 @@ +#include "compiler/mcmc/mcmc_over_mapped_pcg.h" +#include "compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h" +#include "compiler/machine_mapping/machine_mapping_mutation_set.h" +#include "compiler/mcmc/generic_mcmc_algorithm.h" +#include "compiler/task_graph_simulator/task_simulator.h" +#include "substitutions/pcg_pattern.h" +#include "substitutions/pcg_pattern_match.h" +#include "substitutions/unity_substitution_set.h" +#include "utils/optional.h" + +namespace FlexFlow { + +SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, + CostEstimator const &cost_estimator, + MachineSpecification const &resources, + MCMCOverMappedPCGConfig const &search_config) { + + std::vector substitutions = get_substitution_set(resources); + + std::optional naive_mapping = + get_naive_mapping(pcg, resources, search_config.device_type); + if (naive_mapping == std::nullopt) { + throw std::runtime_error("Failed to find any solutions"); + } + + SearchResult starting_state = SearchResult{pcg, naive_mapping.value()}; + + auto generating_func = [&](SearchResult mapped_pcg, + nonnegative_int i) -> std::optional { + if (i.unwrap_nonnegative() % + search_config.substitution_interval.unwrap_nonnegative() == + 0) { + // substitutions every (substitution_interval) iterations + std::optional random_substitution = + get_random_substitution(resources); + if (random_substitution != std::nullopt) { + std::optional pattern_match = + get_random_pattern_match(random_substitution.value().pcg_pattern, + sub_pcg_from_full_pcg(mapped_pcg.pcg)); + if (pattern_match != std::nullopt) { + std::cout << "HELLO" << std::endl; + 
return apply_substitution_and_update_machine_mapping( + mapped_pcg, random_substitution.value(), pattern_match.value()); + } + } + return std::nullopt; + } else { + // machine mapping mutations otherwise + std::optional new_machine_mapping = + get_random_mutation(mapped_pcg, resources, search_config.device_type); + if (new_machine_mapping == std::nullopt) { + return std::nullopt; + } + return SearchResult{mapped_pcg.pcg, new_machine_mapping.value()}; + } + }; + + auto scoring_func = [&](SearchResult mapped_pcg) -> float { + return task_simulator_estimate_forward_pass_time( + mapped_pcg.pcg, cost_estimator, mapped_pcg.machine_mapping, resources); + }; + + GenericMCMCConfig config = + GenericMCMCConfig{/*temperature*/ search_config.temperature, + /*num_iterations*/ search_config.num_iterations}; + + Generic_MCMC_state result = + minimize_score(starting_state, generating_func, scoring_func, config); + + return result.get_state(); +} + +} // namespace FlexFlow diff --git a/lib/compiler/test/src/compiler/mcmc/generic_mcmc_algorithm.cc b/lib/compiler/test/src/compiler/mcmc/generic_mcmc_algorithm.cc index 0a175933cf..ba6faa93c4 100644 --- a/lib/compiler/test/src/compiler/mcmc/generic_mcmc_algorithm.cc +++ b/lib/compiler/test/src/compiler/mcmc/generic_mcmc_algorithm.cc @@ -18,8 +18,8 @@ TEST_SUITE(FF_TEST_SUITE) { return new_x; }; auto scoring_func = [](float x) { return (x - 0.5) * (x - 0.5); }; - GenericMCMCConfig config = GenericMCMCConfig{/*temperature*/ 1.0, - /*num_iterations*/ 10_n}; + GenericMCMCConfig config = GenericMCMCConfig{/*temperature=*/1.0, + /*num_iterations=*/10_n}; Generic_MCMC_state result = minimize_score(starting_state, generating_func, scoring_func, config); float answer = result.get_state(); diff --git a/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc b/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc index 7aad8b098d..5c469c4301 100644 --- a/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc +++ 
b/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc @@ -1,5 +1,5 @@ -#include "compiler/mcmc/mcmc_algorithm.h" #include "../cost_estimator_for_test.h" +#include "compiler/mcmc/mcmc_over_mapped_pcg.h" #include "compiler/task_graph_simulator/task_simulator.h" #include "doctest/doctest.h" #include "op-attrs/parallel_tensor_dims.h" @@ -62,12 +62,11 @@ TEST_SUITE(FF_TEST_SUITE) { /*intra_node_bandwidth=*/1, }; - MCMCSearchConfig search_config = - MCMCSearchConfig{/*temperature=*/1.0, - /*num_iterations=*/5, - /*num_mutations_per_iteration=*/10, - /*max_num_ops=*/100, - /*device_type=*/DeviceType::GPU}; + MCMCOverMappedPCGConfig search_config = + MCMCOverMappedPCGConfig{/*temperature=*/1.0, + /*num_iterations=*/100_n, + /*substitution_interval=*/100_n, + /*device_type=*/DeviceType::GPU}; SearchResult result = mcmc_graph_optimize( pcg, cost_estimator, full_machine_spec, search_config); From d0480f4151f7da33da3cd6af261391e309721fe5 Mon Sep 17 00:00:00 2001 From: Dylan Lim <72822184+dylanllim@users.noreply.github.com> Date: Fri, 2 May 2025 03:47:30 -0700 Subject: [PATCH 07/11] CPU Kernel Tests (#1439) * test_utils refactor, local_cpu_allocator * test utils modification, cast, reverse, and replicate cpu kernels * combine kernel * combine kernels .h file * Implementations for methods for machine_views and associated modules (#1429) * initial commit for machine view adjacent modules * Formatting * Tests for new machine_view.cc functions * formatting * Minor Test correction * formatting * PR fixes * PR Fixes --------- Co-authored-by: Pietro Max Marsella * test utils logic cleanup, reverse cpu_kernel pedagogical implmentation, other minor fixes * cpu_kernel's refactor, generic tensor accessor indexing * accessor.h formatting * mk_runtime_error formatting * reverse_kernels include * test_utils refactor and clarity * formatting * comment removal reverse_kernels * Issue #1435, tests for managed stream and handle * #1435 formatting * #1409 issue, change datatype for linear 
kernels away from void * * R & W accessor changes, minimize code bloat * code formatting and refactor * issue #1502 & issue #1540 * format check * branch merge and test fixes * build issues * Add AWS linux AMI to runs-on for testing (#1589) * Pin runs-on images (#1590) * GPU CI Fix (Pin runs-on GPU image) (#1588) * Debug * Change to base DL AMI * Print disk usage * Run nvidia-smi * Remove excess cuda installs in base ami * Re-enable freeing space in GPU CI * Try updating nix-develop version * Check what happens if you just enter the non-nixGL environment * Try switching AMIs * Try to remove the module stuff * Move to lockshaw/develop-action * Try pointing at a fixed commit * Update nix-develop action * Update nix-develop action to use BASH_FUNC filtering * Remove all the /usr/local/cuda entries * Switch back to gpu-ci env * Update the cuda arch * Try out the new runs-on gpu image * Move over to pinned runs-on image * Remove a bunch more unnecessary stuff in image to get back disk space * Try using an emphemeral store * Try mounting * Fix bug * Try sudo * Move nix into _work * Rollback all unnecessary changes * Re-enable waiting on cpu-ci * Merge substitution-builder (#1575) * Start on pcg builder * Add tests and some implementation for pcg builder * Add pcg tests, make dtgen constructors explicit to fix bug * Add remainder of PCG tests * Fix build issues in local-execution * Format * Address Reyna comments, add topological_order function for PCG * Pre multidigraph refactor * Removing visitable from sp code * Add open dataflow graph, start to replace pcg dataflow graph * Start refactoring substitutions * Add utility functions to support pattern matching * Pre-refactor inputs * Fix proj url * Get back to substitutions, now with unordered graph inputs * Get substitutions building * substitutions-tests now builds * Fix bug in filter, pass some initial substitution tests * Add tests for fmt::to_string, fix some substitutions bugs * Pass initial unit tests for 
find_pattern_matches * Start on unit tests for pcg pattern * Pass initial test for find_pattern_matches * Fix small build issue in tests * Format * Sync tests in CI with tests in proj * Fix minor build errors in kernels and local-execution * Format * Remove outdated code * More outdated code removal * More cleanup, add test for sp decomposition * Pull apart containers.h * More sp testing and fixes * Break up graph algorithms.h * Pre- full SP algo commit * Add initial implementation and tests for cbc decomposition and inverse line graph * Pass test for get_inverse_line_graph * Add new multidigraph * Fix get_inverse_line_graph to return a MultiDiGraph instead of a DiGraph * Add tests for parallel and series reduction finding * Add really rough implementation of valdez sp decomposition * Fix local-execution build * Add implementations and tests for applying series/parallel reductions * Format * Clean up sp decomposition interface and tests * Format * Add comments for top-level substitutions functions, add proj doxygen support * Start sketching out substitutions code * Fix build errors * Add ability to permute node ids * Cleanup and start to test new substitutions code * Add test case for evaluate_substitution_output * Add naive isomorphism detection code * Add graph inputs to open dataflow graph isomorphism * Add input permutation to evaluate_substitution_output * Fix permute_node_ids * Add test for permute_input_ids * Migrate over to mutable implementation of apply_substitution * Add fast isomorphism checking and an initial implementation of full substitution logic * Pass initial full substitutions test * Cleanup old isomorphism checking code * Fix post-merge bugs * Fix broken pcg builder test * Format * Reorganize code and remove some outdated code pre-code-review * Format * Restarting work on this after working on export-model-arch * Adding in some a simple function to get the currently available substritutions * nonnegative_int additions, code cleanup, etc. 
* A bunch more moving over to nonnegative_int * Even more nonnegative_int updating * Fix build * Fix failing tests * Format * Format --------- Co-authored-by: Colin Unger Co-authored-by: Victor Li * test_utils refactor, local_cpu_allocator * test utils modification, cast, reverse, and replicate cpu kernels * combine kernel * test utils logic cleanup, reverse cpu_kernel pedagogical implmentation, other minor fixes * cpu_kernel's refactor, generic tensor accessor indexing * test_utils refactor and clarity * R & W accessor changes, minimize code bloat * issue #1502 & issue #1540 * branch merge and test fixes * merge * build after merge * kernel issues * managed stream / handle test case fix * test_utils update, kernel/ops refactor * Review fixes * Update doctest includes in kernels * More PR review * Try using rhel package-based nixgl * Format * Update proj with test command fixes * Attempt to fix gpu CI * Use custom AMI in GPU CI * Fix proj bug in cpu-ci * Try including run id * Temporarily allow gpu ci to run regardless for testing purposes * Try using official ubuntu ami in gpu ci * Try out new ami * Change to use new flexflow-gpu-ci AMI * Fix bugs in GPU tests and restore GPU CI gating * Format * Fix bug in accessor formatting test cases * Bugfixes and updated proj * Fix all cpu tests * Format * Add improved test failure output for replicate cpu vs gpu tests * Continue debugging replicate cuda testcases * Format * Fix incorrect tensor size in replicate kernel tests * Transpose replicate backward cpu kernel * Try flipping output dimensions in replica cuda kernel test * Update proj --------- Co-authored-by: Marsella8 <45826022+Marsella8@users.noreply.github.com> Co-authored-by: Pietro Max Marsella Co-authored-by: Colin Unger Co-authored-by: Victor Li <32348970+victorli2002@users.noreply.github.com> Co-authored-by: Victor Li --- .flake/pkgs/fccf/default.nix | 54 ++ .flake/pkgs/fccf/fix-argparse-include.patch | 13 + .flake/pkgs/fccf/json-package-name.patch | 12 + 
.github/runs-on.yml | 19 - .github/workflows/tests.yml | 4 +- .proj.toml | 60 +- cmake/flexflow-utils.cmake | 7 +- flake.lock | 6 +- flake.nix | 3 + lib/kernels/CMakeLists.txt | 4 +- lib/kernels/include/kernels/accessor.h | 223 ++++--- lib/kernels/include/kernels/allocation.h | 9 +- .../include/kernels/array_coord.struct.toml | 19 + lib/kernels/include/kernels/array_shape.h | 31 +- .../include/kernels/attention_kernels.h | 7 +- .../include/kernels/batch_matmul_kernels.h | 10 +- .../include/kernels/batch_norm_kernels.h | 15 +- lib/kernels/include/kernels/cast_kernels.h | 22 +- .../include/kernels/cast_kernels_cpu.h | 17 + lib/kernels/include/kernels/combine_kernels.h | 10 +- .../include/kernels/combine_kernels_cpu.h | 17 + lib/kernels/include/kernels/concat_kernels.h | 10 +- lib/kernels/include/kernels/conv_2d_kernels.h | 12 +- .../include/kernels/copy_tensor_accessor.h | 27 + .../include/kernels/datatype_dispatch.h | 13 +- lib/kernels/include/kernels/dropout_kernels.h | 8 +- .../include/kernels/element_binary_kernels.h | 8 +- .../include/kernels/element_unary_kernels.h | 14 +- .../include/kernels/embedding_kernels.h | 14 +- lib/kernels/include/kernels/ff_handle.h | 2 +- lib/kernels/include/kernels/flat_kernels.h | 15 +- .../kernels/format_accessor_contents.h | 13 + lib/kernels/include/kernels/gather_kernels.h | 10 +- .../include/kernels/layer_norm_kernels.h | 8 +- lib/kernels/include/kernels/legion_dim.h | 24 +- .../kernels/legion_ordered/legion_ordered.h | 197 ++++++ .../include/kernels/legion_ordered/slice.h | 24 + .../kernels/legion_ordered/transform.h | 17 + lib/kernels/include/kernels/linear_kernels.h | 30 +- .../include/kernels}/local_cpu_allocator.h | 7 + .../include/kernels/local_cuda_allocator.h | 2 + .../include/kernels/managed_ff_stream.h | 5 +- .../kernels/managed_per_device_ff_handle.h | 8 +- lib/kernels/include/kernels/metrics_kernels.h | 29 +- lib/kernels/include/kernels/nccl.h | 8 +- .../include/kernels/optimizer_kernels.h | 19 +- 
.../include/kernels/partition_kernels.h | 12 +- .../kernels}/per_device_op_state.variant.toml | 0 lib/kernels/include/kernels/pool_2d_kernels.h | 14 +- lib/kernels/include/kernels/profiling.h | 2 +- lib/kernels/include/kernels/reduce_kernels.h | 12 +- .../include/kernels/reduction_kernels.h | 14 +- .../include/kernels/replicate_kernels.h | 12 +- .../include/kernels/replicate_kernels_cpu.h | 18 + lib/kernels/include/kernels/reshape_kernels.h | 12 +- lib/kernels/include/kernels/reverse_kernels.h | 29 +- .../include/kernels/reverse_kernels_cpu.h | 20 + .../include/kernels/reverse_kernels_params.h | 16 + .../reverse_kernels_params.struct.toml | 28 + lib/kernels/include/kernels/softmax_kernels.h | 10 +- lib/kernels/include/kernels/split_kernels.h | 11 +- lib/kernels/include/kernels/topk_kernels.h | 8 +- .../include/kernels/transpose_kernels.h | 12 +- lib/kernels/src/accessor.cc | 192 ------ lib/kernels/src/allocation.cc | 21 - lib/kernels/src/cpu/ops/cast_kernels.cc | 51 ++ lib/kernels/src/cpu/ops/combine_kernels.cc | 39 ++ .../src/cpu/{ => ops}/initializer_kernels.cc | 0 lib/kernels/src/cpu/ops/replicate_kernels.cc | 51 ++ lib/kernels/src/cpu/ops/reverse_kernels.cc | 46 ++ lib/kernels/src/cuda/cuda_helper.cu | 14 +- lib/kernels/src/cuda/embedding_kernels.cu | 567 ++++++++++++++---- lib/kernels/src/cuda/loss_function_kernels.cu | 2 +- lib/kernels/src/cuda/metrics_functions.cu | 96 +-- lib/kernels/src/cuda/ops/attention_kernels.cu | 2 +- .../src/cuda/ops/batch_matmul_kernels.cu | 2 +- .../src/cuda/ops/batch_norm_kernels.cu | 6 +- lib/kernels/src/cuda/ops/cast_kernels.cu | 24 +- lib/kernels/src/cuda/ops/combine_kernels.cu | 2 +- lib/kernels/src/cuda/ops/concat_kernels.cu | 92 +-- lib/kernels/src/cuda/ops/conv_2d_kernels.cu | 6 +- lib/kernels/src/cuda/ops/dropout_kernels.cu | 2 +- .../src/cuda/ops/element_binary_kernels.cu | 2 +- .../src/cuda/ops/element_unary_kernels.cu | 20 +- lib/kernels/src/cuda/ops/flat_kernels.cu | 6 +- lib/kernels/src/cuda/ops/gather_kernels.cu | 26 
+- lib/kernels/src/cuda/ops/linear_kernels.cu | 78 +-- lib/kernels/src/cuda/ops/partition_kernels.cu | 12 +- lib/kernels/src/cuda/ops/pool_2d_kernels.cu | 8 +- lib/kernels/src/cuda/ops/reduce_kernels.cu | 2 +- lib/kernels/src/cuda/ops/reduction_kernels.cu | 12 +- lib/kernels/src/cuda/ops/replicate_kernels.cu | 15 +- lib/kernels/src/cuda/ops/reshape_kernels.cu | 12 +- lib/kernels/src/cuda/ops/reverse_kernels.cu | 78 ++- lib/kernels/src/cuda/ops/softmax_kernels.cu | 4 +- lib/kernels/src/cuda/ops/split_kernels.cu | 2 +- lib/kernels/src/cuda/ops/topk_kernels.cu | 2 +- lib/kernels/src/cuda/ops/transpose_kernels.cu | 8 +- lib/kernels/src/cuda/optimizer_kernel.cu | 216 ------- lib/kernels/src/cuda/optimizer_kernels.cu | 205 +++++++ lib/kernels/src/hip/embedding_kernels.cpp | 32 +- lib/kernels/src/hip/loss_function_kernels.cpp | 2 +- lib/kernels/src/hip/ops/attention_kernels.cpp | 2 +- .../src/hip/ops/batch_matmul_kernels.cpp | 2 +- .../src/hip/ops/batch_norm_kernels.cpp | 2 +- lib/kernels/src/hip/ops/cast_kernels.cpp | 2 +- lib/kernels/src/hip/ops/combine_kernels.cpp | 2 +- lib/kernels/src/hip/ops/concat_kernels.cpp | 2 +- lib/kernels/src/hip/ops/conv_2d_kernels.h | 2 +- lib/kernels/src/hip/ops/dropout_kernels.cpp | 2 +- .../src/hip/ops/element_binary_kernels.cpp | 2 +- .../src/hip/ops/element_unary_kernels.cpp | 2 +- lib/kernels/src/hip/ops/flat_kernels.cpp | 2 +- lib/kernels/src/hip/ops/gather_kernels.cpp | 2 +- lib/kernels/src/hip/ops/partition_kernels.cpp | 2 +- lib/kernels/src/hip/ops/pool_2d_kernels.cpp | 2 +- lib/kernels/src/hip/ops/reduce_kernels.cpp | 2 +- lib/kernels/src/hip/ops/replicate_kernels.cpp | 2 +- lib/kernels/src/hip/ops/reshape_kernels.cpp | 2 +- lib/kernels/src/hip/ops/reverse_kernels.cpp | 2 +- lib/kernels/src/hip/ops/softmax_kernels.cpp | 2 +- lib/kernels/src/hip/ops/split_kernels.cpp | 2 +- lib/kernels/src/hip/ops/topk_kernels.cpp | 2 +- lib/kernels/src/hip/ops/transpose_kernels.cpp | 2 +- lib/kernels/src/{ => internal}/device.cc | 2 +- 
lib/kernels/src/{ => internal}/device.h | 4 +- lib/kernels/src/kernels/accessor.cc | 249 ++++++++ lib/kernels/src/kernels/allocation.cc | 38 ++ lib/kernels/src/{ => kernels}/array_shape.cc | 87 ++- .../src/kernels/copy_tensor_accessor.cc | 66 ++ .../src/kernels/format_accessor_contents.cc | 184 ++++++ lib/kernels/src/{ => kernels}/legion_dim.cc | 5 + .../kernels/legion_ordered/legion_ordered.cc | 10 + .../src/kernels/legion_ordered/slice.cc | 12 + .../src/kernels/legion_ordered/transform.cc | 12 + .../src/kernels}/local_cpu_allocator.cc | 21 +- .../src/{ => kernels}/local_cuda_allocator.cc | 20 +- .../src/kernels/reverse_kernels_params.cc | 30 + lib/kernels/src/managed_ff_stream.cc | 20 +- .../src/managed_per_device_ff_handle.cc | 38 +- lib/kernels/test/CMakeLists.txt | 1 + .../test/src/cpu/ops/replicate_kernels.cc | 57 ++ .../test/src/cpu/ops/reverse_kernels.cc | 206 +++++++ lib/kernels/test/src/internal/test_utils.cc | 392 ++++++++++++ lib/kernels/test/src/internal/test_utils.h | 78 +++ lib/kernels/test/src/kernels/accessor.cc | 73 +++ lib/kernels/test/src/kernels/array_shape.cc | 49 ++ .../src/kernels/format_accessor_contents.cc | 94 +++ lib/kernels/test/src/kernels/legion_dim.cc | 32 + .../kernels/legion_ordered/legion_ordered.cc | 12 + .../test/src/kernels/legion_ordered/slice.cc | 30 + .../src/kernels/legion_ordered/transform.cc | 36 ++ lib/kernels/test/src/test_attention_kernel.cc | 44 +- .../test/src/test_batch_matmul_kernel.cc | 28 +- .../test/src/test_batch_norm_kernel.cc | 68 +-- lib/kernels/test/src/test_cast_kernel.cc | 102 ++-- lib/kernels/test/src/test_combine_kernel.cc | 93 ++- lib/kernels/test/src/test_concat_kernel.cc | 139 +++-- lib/kernels/test/src/test_cuda.cc | 6 +- lib/kernels/test/src/test_dropout.cc | 31 +- lib/kernels/test/src/test_flat_kernel.cc | 48 +- lib/kernels/test/src/test_gather_kernels.cc | 134 +++-- .../test/src/test_layer_norm_kernels.cc | 37 +- .../test/src/test_managed_ff_stream.cc | 107 ++++ 
.../src/test_managed_per_device_ff_handle.cc | 37 ++ lib/kernels/test/src/test_partition_kernel.cc | 51 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 42 +- lib/kernels/test/src/test_reduction_kernel.cc | 48 +- lib/kernels/test/src/test_replicate_kernel.cc | 157 ++++- lib/kernels/test/src/test_reshape_kernel.cc | 43 +- lib/kernels/test/src/test_reverse_kernels.cc | 137 +++-- lib/kernels/test/src/test_softmax_kernel.cc | 33 +- lib/kernels/test/src/test_split_kernel.cc | 25 +- lib/kernels/test/src/test_transpose_kernel.cc | 40 +- lib/kernels/test/src/test_utils.cc | 106 ---- lib/kernels/test/src/test_utils.h | 72 --- .../local-execution/per_device_op_state.h | 2 +- .../local-execution/task_argument_accessor.h | 2 +- .../local-execution/tracked_allocator.h | 3 + .../src/local_task_argument_accessor.cc | 7 +- lib/local-execution/src/ops/batch_norm.cc | 4 +- lib/local-execution/src/ops/cast.cc | 8 +- lib/local-execution/src/ops/conv_2d.cc | 6 +- lib/local-execution/src/ops/element_unary.cc | 16 +- lib/local-execution/src/ops/flat.cc | 6 +- lib/local-execution/src/ops/linear.cc | 39 +- lib/local-execution/src/ops/pool_2d.cc | 10 +- lib/local-execution/src/ops/reduction.cc | 6 +- lib/local-execution/src/ops/repartition.cc | 4 +- lib/local-execution/src/ops/replicate.cc | 2 +- lib/local-execution/src/ops/reshape.cc | 4 +- lib/local-execution/src/ops/reverse.cc | 48 +- lib/local-execution/src/ops/softmax.cc | 2 +- lib/local-execution/src/ops/transpose.cc | 4 +- ...device_state.cc => per_device_op_state.cc} | 0 lib/local-execution/src/tracked_allocator.cc | 7 +- .../test/src/test_local_cost_estimator.cc | 113 ++-- .../test/src/test_local_slots_backing.cc | 22 +- .../test/src/test_local_task_arg_accessor.cc | 2 +- .../include/op-attrs/aggregate_op.enum.toml | 3 +- .../include/op-attrs/datatype_value.h | 16 + .../op-attrs/dim_ordered/dim_ordered.h | 199 +----- .../include/op-attrs/dim_ordered/slice.h | 45 +- .../{dim_ordered => ff_ordered}/concat.h | 2 +- .../{dim_ordered 
=> ff_ordered}/enumerate.h | 2 +- .../include/op-attrs/ff_ordered/ff_ordered.h | 228 +++++++ .../ff_ordered_from_map.h | 4 +- .../ff_ordered_of.h | 2 +- .../{dim_ordered => ff_ordered}/get_idxs.h | 2 +- .../include/op-attrs/ff_ordered/slice.h | 49 ++ .../include/op-attrs/ff_ordered/transform.h | 17 + .../include/op-attrs/ff_ordered/zip.h | 18 + .../op-attrs/ops/transpose_attrs.struct.toml | 2 +- .../parallel_tensor_dim_degrees.struct.toml | 2 +- .../op-attrs/parallel_tensor_dims.struct.toml | 2 +- lib/op-attrs/include/op-attrs/tensor_dims.h | 2 +- .../include/op-attrs/tensor_dims.struct.toml | 2 +- lib/op-attrs/include/op-attrs/tensor_shape.h | 2 +- lib/op-attrs/src/op-attrs/datatype_value.cc | 25 + .../src/op-attrs/dim_ordered/concat.cc | 1 - .../src/op-attrs/dim_ordered/enumerate.cc | 1 - .../dim_ordered/ff_ordered_from_map.cc | 1 - .../src/op-attrs/dim_ordered/ff_ordered_of.cc | 1 - .../src/op-attrs/dim_ordered/get_idxs.cc | 1 - .../src/op-attrs/dim_ordered/slice.cc | 25 - .../src/op-attrs/dim_ordered/transform.cc | 1 + .../src/op-attrs/ff_ordered/enumerate.cc | 10 + .../src/op-attrs/ff_ordered/ff_ordered.cc | 14 + .../ff_ordered/ff_ordered_from_map.cc | 13 + .../src/op-attrs/ff_ordered/get_idxs.cc | 10 + lib/op-attrs/src/op-attrs/ff_ordered/slice.cc | 24 + .../src/op-attrs/ff_ordered/transform.cc | 12 + lib/op-attrs/src/op-attrs/ff_ordered/zip.cc | 12 + lib/op-attrs/src/op-attrs/ops/batch_norm.cc | 4 +- lib/op-attrs/src/op-attrs/ops/concat.cc | 4 +- lib/op-attrs/src/op-attrs/ops/embedding.cc | 6 +- lib/op-attrs/src/op-attrs/ops/flat.cc | 4 +- lib/op-attrs/src/op-attrs/ops/layer_norm.cc | 4 +- lib/op-attrs/src/op-attrs/ops/linear.cc | 13 +- .../src/op-attrs/parallel_tensor_dims.cc | 4 +- lib/op-attrs/src/op-attrs/tensor_dims.cc | 6 +- lib/op-attrs/src/op-attrs/tensor_shape.cc | 2 +- .../test/src/op-attrs/datatype_value.cc | 68 +++ .../src/op-attrs/dim_ordered/dim_ordered.cc | 4 - .../{dim_ordered => ff_ordered}/concat.cc | 2 +- .../{dim_ordered => 
ff_ordered}/enumerate.cc | 2 +- .../src/op-attrs/ff_ordered/ff_ordered.cc | 11 + .../ff_ordered_from_map.cc | 2 +- .../{dim_ordered => ff_ordered}/slice.cc | 19 +- .../test/src/op-attrs/ff_ordered/transform.cc | 35 ++ .../test/src/op-attrs/ff_ordered/zip.cc | 38 ++ lib/pcg/include/pcg/metric.enum.toml | 26 + lib/pcg/include/pcg/metric_attrs.h | 28 + lib/pcg/src/pcg/metric_attrs.cc | 38 ++ .../generate_weight_transform.cc | 2 +- lib/runtime/src/metrics_functions.cc | 33 - lib/runtime/src/metrics_functions.h | 63 +- lib/runtime/src/ops/embedding.cc | 4 +- .../utils/containers/{subvec.h => slice.h} | 16 +- .../include/utils/containers/zip_strict.h | 14 +- lib/utils/include/utils/exception.h | 1 + lib/utils/include/utils/indent.h | 12 + .../include/utils/stack_vector/stack_vector.h | 29 +- lib/utils/src/utils/containers/slice.cc | 3 + lib/utils/src/utils/containers/subvec.cc | 1 - .../full_binary_tree/binary_tree_path.cc | 4 +- .../graph/series_parallel/series_reduction.cc | 4 +- lib/utils/src/utils/indent.cc | 17 + .../src/utils/stack_vector/stack_vector.cc | 4 +- .../include/test/utils/doctest/check_kv.h | 12 + lib/utils/test/common/src/main.cc | 17 +- .../common/src/test/utils/doctest/check_kv.cc | 17 + .../utils/containers/{subvec.cc => slice.cc} | 24 +- lib/utils/test/src/utils/indent.cc | 66 ++ .../src/utils/stack_vector/stack_vector.cc | 85 +++ 275 files changed, 6048 insertions(+), 2570 deletions(-) create mode 100644 .flake/pkgs/fccf/default.nix create mode 100644 .flake/pkgs/fccf/fix-argparse-include.patch create mode 100644 .flake/pkgs/fccf/json-package-name.patch create mode 100644 lib/kernels/include/kernels/array_coord.struct.toml create mode 100644 lib/kernels/include/kernels/cast_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/combine_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/copy_tensor_accessor.h create mode 100644 lib/kernels/include/kernels/format_accessor_contents.h create mode 100644 
lib/kernels/include/kernels/legion_ordered/legion_ordered.h create mode 100644 lib/kernels/include/kernels/legion_ordered/slice.h create mode 100644 lib/kernels/include/kernels/legion_ordered/transform.h rename lib/{local-execution/include/local-execution => kernels/include/kernels}/local_cpu_allocator.h (74%) rename lib/{local-execution/include/local-execution => kernels/include/kernels}/per_device_op_state.variant.toml (100%) create mode 100644 lib/kernels/include/kernels/replicate_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/reverse_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/reverse_kernels_params.h create mode 100644 lib/kernels/include/kernels/reverse_kernels_params.struct.toml delete mode 100644 lib/kernels/src/accessor.cc delete mode 100644 lib/kernels/src/allocation.cc create mode 100644 lib/kernels/src/cpu/ops/cast_kernels.cc create mode 100644 lib/kernels/src/cpu/ops/combine_kernels.cc rename lib/kernels/src/cpu/{ => ops}/initializer_kernels.cc (100%) create mode 100644 lib/kernels/src/cpu/ops/replicate_kernels.cc create mode 100644 lib/kernels/src/cpu/ops/reverse_kernels.cc delete mode 100644 lib/kernels/src/cuda/optimizer_kernel.cu create mode 100644 lib/kernels/src/cuda/optimizer_kernels.cu rename lib/kernels/src/{ => internal}/device.cc (97%) rename lib/kernels/src/{ => internal}/device.h (98%) create mode 100644 lib/kernels/src/kernels/accessor.cc create mode 100644 lib/kernels/src/kernels/allocation.cc rename lib/kernels/src/{ => kernels}/array_shape.cc (51%) create mode 100644 lib/kernels/src/kernels/copy_tensor_accessor.cc create mode 100644 lib/kernels/src/kernels/format_accessor_contents.cc rename lib/kernels/src/{ => kernels}/legion_dim.cc (78%) create mode 100644 lib/kernels/src/kernels/legion_ordered/legion_ordered.cc create mode 100644 lib/kernels/src/kernels/legion_ordered/slice.cc create mode 100644 lib/kernels/src/kernels/legion_ordered/transform.cc rename lib/{local-execution/src => 
kernels/src/kernels}/local_cpu_allocator.cc (52%) rename lib/kernels/src/{ => kernels}/local_cuda_allocator.cc (59%) create mode 100644 lib/kernels/src/kernels/reverse_kernels_params.cc create mode 100644 lib/kernels/test/src/cpu/ops/replicate_kernels.cc create mode 100644 lib/kernels/test/src/cpu/ops/reverse_kernels.cc create mode 100644 lib/kernels/test/src/internal/test_utils.cc create mode 100644 lib/kernels/test/src/internal/test_utils.h create mode 100644 lib/kernels/test/src/kernels/accessor.cc create mode 100644 lib/kernels/test/src/kernels/array_shape.cc create mode 100644 lib/kernels/test/src/kernels/format_accessor_contents.cc create mode 100644 lib/kernels/test/src/kernels/legion_dim.cc create mode 100644 lib/kernels/test/src/kernels/legion_ordered/legion_ordered.cc create mode 100644 lib/kernels/test/src/kernels/legion_ordered/slice.cc create mode 100644 lib/kernels/test/src/kernels/legion_ordered/transform.cc create mode 100644 lib/kernels/test/src/test_managed_ff_stream.cc create mode 100644 lib/kernels/test/src/test_managed_per_device_ff_handle.cc delete mode 100644 lib/kernels/test/src/test_utils.cc delete mode 100644 lib/kernels/test/src/test_utils.h rename lib/local-execution/src/{per_device_state.cc => per_device_op_state.cc} (100%) create mode 100644 lib/op-attrs/include/op-attrs/datatype_value.h rename lib/op-attrs/include/op-attrs/{dim_ordered => ff_ordered}/concat.h (95%) rename lib/op-attrs/include/op-attrs/{dim_ordered => ff_ordered}/enumerate.h (95%) create mode 100644 lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h rename lib/op-attrs/include/op-attrs/{dim_ordered => ff_ordered}/ff_ordered_from_map.h (88%) rename lib/op-attrs/include/op-attrs/{dim_ordered => ff_ordered}/ff_ordered_of.h (88%) rename lib/op-attrs/include/op-attrs/{dim_ordered => ff_ordered}/get_idxs.h (91%) create mode 100644 lib/op-attrs/include/op-attrs/ff_ordered/slice.h create mode 100644 lib/op-attrs/include/op-attrs/ff_ordered/transform.h create mode 100644 
lib/op-attrs/include/op-attrs/ff_ordered/zip.h create mode 100644 lib/op-attrs/src/op-attrs/datatype_value.cc delete mode 100644 lib/op-attrs/src/op-attrs/dim_ordered/concat.cc delete mode 100644 lib/op-attrs/src/op-attrs/dim_ordered/enumerate.cc delete mode 100644 lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_from_map.cc delete mode 100644 lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_of.cc delete mode 100644 lib/op-attrs/src/op-attrs/dim_ordered/get_idxs.cc create mode 100644 lib/op-attrs/src/op-attrs/dim_ordered/transform.cc create mode 100644 lib/op-attrs/src/op-attrs/ff_ordered/enumerate.cc create mode 100644 lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered.cc create mode 100644 lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered_from_map.cc create mode 100644 lib/op-attrs/src/op-attrs/ff_ordered/get_idxs.cc create mode 100644 lib/op-attrs/src/op-attrs/ff_ordered/slice.cc create mode 100644 lib/op-attrs/src/op-attrs/ff_ordered/transform.cc create mode 100644 lib/op-attrs/src/op-attrs/ff_ordered/zip.cc create mode 100644 lib/op-attrs/test/src/op-attrs/datatype_value.cc rename lib/op-attrs/test/src/op-attrs/{dim_ordered => ff_ordered}/concat.cc (97%) rename lib/op-attrs/test/src/op-attrs/{dim_ordered => ff_ordered}/enumerate.cc (92%) create mode 100644 lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered.cc rename lib/op-attrs/test/src/op-attrs/{dim_ordered => ff_ordered}/ff_ordered_from_map.cc (96%) rename lib/op-attrs/test/src/op-attrs/{dim_ordered => ff_ordered}/slice.cc (79%) create mode 100644 lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc create mode 100644 lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc create mode 100644 lib/pcg/include/pcg/metric.enum.toml create mode 100644 lib/pcg/include/pcg/metric_attrs.h create mode 100644 lib/pcg/src/pcg/metric_attrs.cc rename lib/utils/include/utils/containers/{subvec.h => slice.h} (69%) create mode 100644 lib/utils/include/utils/indent.h create mode 100644 lib/utils/src/utils/containers/slice.cc delete 
mode 100644 lib/utils/src/utils/containers/subvec.cc create mode 100644 lib/utils/src/utils/indent.cc create mode 100644 lib/utils/test/common/include/test/utils/doctest/check_kv.h create mode 100644 lib/utils/test/common/src/test/utils/doctest/check_kv.cc rename lib/utils/test/src/utils/containers/{subvec.cc => slice.cc} (69%) create mode 100644 lib/utils/test/src/utils/indent.cc diff --git a/.flake/pkgs/fccf/default.nix b/.flake/pkgs/fccf/default.nix new file mode 100644 index 0000000000..f792b8606c --- /dev/null +++ b/.flake/pkgs/fccf/default.nix @@ -0,0 +1,54 @@ +{ fetchFromGitHub +, stdenv +, cmake +, pkg-config +, libclang +, libllvm +, lib +, zlib +, argparse +, nlohmann_json +, fmt +}: + +stdenv.mkDerivation rec { + pname = "fccf"; + version = "03d373fc65e2d7ceeac441ba4bbddfdc25618dff"; + + src = fetchFromGitHub { + owner = "p-ranav"; + repo = "fccf"; + rev = version; + sha256 = "sha256-3NdPon5ZfjoGFFgBlb0rzRnfWgSopvAc5Gls2NWHaOE="; + }; + + nativeBuildInputs = [ + cmake + pkg-config + ]; + + buildInputs = [ + libclang + libllvm + zlib + argparse + nlohmann_json + fmt + ]; + + patches = [ + ./json-package-name.patch + ./fix-argparse-include.patch + ]; + + cmakeFlags = [ + "-DCMAKE_BUILD_TYPE=Release" + "-DFETCHCONTENT_TRY_FIND_PACKAGE_MODE=ALWAYS" + ]; + + meta = with lib; { + description = "A command-line tool that quickly searches through C/C++ source code in a directory based on a search string and prints relevant code snippets that match the query"; + homepage = "https://github.com/p-ranav/fccf"; + license = licenses.mit; + }; +} diff --git a/.flake/pkgs/fccf/fix-argparse-include.patch b/.flake/pkgs/fccf/fix-argparse-include.patch new file mode 100644 index 0000000000..2cb648c1bf --- /dev/null +++ b/.flake/pkgs/fccf/fix-argparse-include.patch @@ -0,0 +1,13 @@ +diff --git a/source/main.cpp b/source/main.cpp +index 7e131d3..6c05d89 100644 +--- a/source/main.cpp ++++ b/source/main.cpp +@@ -6,7 +6,7 @@ + #include + #include + +-#include ++#include + 
#include + #include "searcher.hpp" + #include diff --git a/.flake/pkgs/fccf/json-package-name.patch b/.flake/pkgs/fccf/json-package-name.patch new file mode 100644 index 0000000000..51f6a012cf --- /dev/null +++ b/.flake/pkgs/fccf/json-package-name.patch @@ -0,0 +1,12 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 20bcbbf..923075f 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -48,6 +48,7 @@ FetchContent_MakeAvailable(fmt) + + FetchContent_Declare(json + URL https://github.com/nlohmann/json/releases/download/v3.10.5/json.tar.xz ++ FIND_PACKAGE_ARGS NAMES nlohmann_json + ) + FetchContent_MakeAvailable(json) + diff --git a/.github/runs-on.yml b/.github/runs-on.yml index a4fff33536..5033e69d65 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -1,23 +1,4 @@ images: - runs-on-gpu-pinned: - platform: "linux" - arch: "x64" - owner: "135269210855" # runs-on - # to find, go to - # https://us-east-2.console.aws.amazon.com/ec2/home?region=us-east-2#Images:visibility=public-images;search=:runs-on;v=3;$case=tags:false%5C,client:false;$regex=tags:false%5C,client:false - name: "runs-on-v2.2-ubuntu22-gpu-x64-20250220122045" - - runs-on-cpu-pinned: - platform: "linux" - arch: "x64" - owner: "135269210855" # runs-on - name: "runs-on-v2.2-ubuntu22-full-x64-20250220122045" - - official-ubuntu-ami: - platform: "linux" - arch: "x64" - ami: "ami-0a60b027285c0d4c5" - flexflow-gpu-ci: platform: "linux" arch: "x64" diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9d98fb07dd..799e3069a9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -57,9 +57,9 @@ jobs: name: GPU unit tests needs: cpu-ci runs-on: - - runs-on + - runs-on=${{ github.run_id }} - family=g4dn.xlarge - - image=runs-on-gpu-pinned + - image=flexflow-gpu-ci strategy: max-parallel: 1 diff --git a/.proj.toml b/.proj.toml index a06fb53c3a..8eed6166cd 100644 --- a/.proj.toml +++ b/.proj.toml @@ -2,57 +2,81 @@ project_name = "flexflow" testsuite_macro = 
"FF_TEST_SUITE" namespace_name = "FlexFlow" header_extension = ".h" +cuda_launch_cmd = [ + "nixGL", + "--", +] [targets.utils] type = "lib" -tests = true -benchmarks = true +has-cpu-only-tests = true +has-cpu-only-benchmarks = true +has-cuda-tests = false +has-cuda-benchmarks = false [targets.op-attrs] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = false +has-cuda-benchmarks = false [targets.kernels] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = true +has-cuda-benchmarks = false [targets.pcg] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = false +has-cuda-benchmarks = false [targets.substitutions] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = false +has-cuda-benchmarks = false [targets.compiler] type = "lib" -tests = true -benchmarks = true +has-cpu-only-tests = true +has-cpu-only-benchmarks = true +has-cuda-tests = false +has-cuda-benchmarks = false [targets.substitution-generator] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = false +has-cuda-benchmarks = false [targets.local-execution] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = false +has-cuda-benchmarks = false [targets.models] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = false +has-cuda-benchmarks = false [targets.export-model-arch] type = "bin" +cuda = false [targets.substitution-to-dot] type = "bin" +cuda = false # default_build_targets = [ # "utils", diff --git a/cmake/flexflow-utils.cmake b/cmake/flexflow-utils.cmake index 478ebda318..ef5d6d9d11 100644 --- 
a/cmake/flexflow-utils.cmake +++ b/cmake/flexflow-utils.cmake @@ -126,11 +126,16 @@ function(ff_add_test_executable) ${FF_TEST_EXEC_NAME} ${SRC}) + target_include_directories( + ${FF_TEST_EXEC_NAME} + PRIVATE + ${FF_TEST_EXEC_PRIVATE_INCLUDE}) + target_link_libraries( ${FF_TEST_EXEC_NAME} ${FF_TEST_EXEC_DEPS}) - target_compile_definitions(${FF_TEST_EXEC_NAME} PRIVATE FF_TEST_SUITE="${FF_TEST_EXEC_NAME}" FF_CUDA_TEST_SUITE="cuda-${FF_TEST_EXEC_NAME}") + target_compile_definitions(${FF_TEST_EXEC_NAME} PRIVATE FF_TEST_SUITE="cpu-${FF_TEST_EXEC_NAME}" FF_CUDA_TEST_SUITE="cuda-${FF_TEST_EXEC_NAME}") define_ff_vars(${FF_TEST_EXEC_NAME}) ff_set_cxx_properties(${FF_TEST_EXEC_NAME}) diff --git a/flake.lock b/flake.lock index c991232013..ff6e797d51 100644 --- a/flake.lock +++ b/flake.lock @@ -66,11 +66,11 @@ ] }, "locked": { - "lastModified": 1741679698, - "narHash": "sha256-poSOQS/2qImAo/PgRu37pHdOrwAsZEyC8PMM3evFLX4=", + "lastModified": 1746157536, + "narHash": "sha256-g4Hx/05+Ce3hl8OS1zm4pY/+ThD1blWKmcaPsohSX5Y=", "owner": "lockshaw", "repo": "proj", - "rev": "0de983ff66abea4703f73988d29fc807e2b0a9bd", + "rev": "5871bc7b7fb9d7d7f14c8bca6c50a0cf2e75834d", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 77a6c61b7d..5fa48fa3fd 100644 --- a/flake.nix +++ b/flake.nix @@ -59,6 +59,7 @@ bencher-cli = pkgs.callPackage ./.flake/pkgs/bencher-cli.nix { }; ffdb = pkgs.callPackage ./.flake/pkgs/ffdb { inherit proj; }; hpp2plantuml = pkgs.python3Packages.callPackage ./.flake/pkgs/hpp2plantuml.nix { }; + fccf = pkgs.callPackage ./.flake/pkgs/fccf { }; rapidcheckFull = pkgs.symlinkJoin { name = "rapidcheckFull"; paths = (with pkgs; [ rapidcheck.out rapidcheck.dev ]); @@ -162,6 +163,7 @@ ruff jq gh + expect ]) (with pkgs.python3Packages; [ gitpython @@ -179,6 +181,7 @@ (with self.packages.${system}; [ ffdb hpp2plantuml + fccf ]) ]; }; diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt index 8ccd7c1011..f5d88f102f 100644 --- 
a/lib/kernels/CMakeLists.txt +++ b/lib/kernels/CMakeLists.txt @@ -7,8 +7,7 @@ file(GLOB_RECURSE SRC CONFIGURE_DEPENDS LIST_DIRECTORIES False src/*.cc - src/cuda/cuda_helper.cu - src/cuda/ops/*.cu + src/cuda/*.cu ) add_library( @@ -30,6 +29,7 @@ target_link_libraries( cudnn nccl utils + pcg ) define_ff_vars(${project_target}) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 39da65c3be..f9bef91b25 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -1,25 +1,88 @@ #ifndef _FLEXFLOW_KERNELS_ACCESSOR_H #define _FLEXFLOW_KERNELS_ACCESSOR_H -#include "array_shape.h" -#include "device.h" +#include "kernels/array_shape.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" -#include "utils/exception.h" +#include "pcg/device_type.dtg.h" +#include "utils/containers/transform.h" #include "utils/required.h" +#include namespace FlexFlow { +nonnegative_int + calculate_accessor_offset(LegionOrdered const &, + ArrayShape const &); + +class GenericTensorAccessorR { +public: + template + typename data_type_enum_to_class
::type const *get() const { + ASSERT(this->data_type == DT, "Invalid datatype requested"); + + return static_cast const *>(this->ptr); + } + + int32_t const *get_int32_ptr() const; + int64_t const *get_int64_ptr() const; + float const *get_float_ptr() const; + double const *get_double_ptr() const; + half const *get_half_ptr() const; + + GenericTensorAccessorR() = delete; + + GenericTensorAccessorR(DataType data_type, + ArrayShape const &shape, + void const *ptr, + DeviceType device_type); + + bool operator==(GenericTensorAccessorR const &) const; + bool operator!=(GenericTensorAccessorR const &) const; + + template + real_type_t
const &at(FFOrdered const &indices) const { + return this->at
(legion_ordered_from_ff_ordered(indices)); + } + + template + real_type_t
const & + at(LegionOrdered const &indices) const { + ASSERT(this->device_type == DeviceType::CPU, + "GenericTensorAccessorR::at() requires CPU-allocated tensor"); + ASSERT(this->data_type == DT, "Invalid datatype requested"); + + using T = real_type_t
; + T const *data_ptr = static_cast(this->ptr); + nonnegative_int offset = calculate_accessor_offset(indices, this->shape); + return data_ptr[offset.unwrap_nonnegative()]; + } + +public: + DataType data_type; + ArrayShape shape; + void const *ptr; + DeviceType device_type; + +private: + std::tuple + tie() const; +}; + +std::string format_as(GenericTensorAccessorR const &); +std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); + class GenericTensorAccessorW { public: template typename data_type_enum_to_class
::type *get() const { - if (this->data_type == DT) { - return static_cast *>(this->ptr); - } else { - throw mk_runtime_error(fmt::format( - "Invalid access data type ({} != {})", this->data_type, DT)); - } + ASSERT(this->data_type == DT, "Invalid datatype requested"); + + return static_cast *>(this->ptr); } int32_t *get_int32_ptr() const; @@ -28,76 +91,76 @@ class GenericTensorAccessorW { double *get_double_ptr() const; half *get_half_ptr() const; -public: - DataType data_type; - ArrayShape shape; - req ptr; -}; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorW, - data_type, - shape, - ptr); + GenericTensorAccessorW() = delete; -std::string format_as(GenericTensorAccessorW const &); -std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); + GenericTensorAccessorW(DataType data_type, + ArrayShape const &shape, + void *ptr, + DeviceType device_type); + + bool operator==(GenericTensorAccessorW const &) const; + bool operator!=(GenericTensorAccessorW const &) const; + + operator GenericTensorAccessorR() const; -class GenericTensorAccessorR { -public: template - typename data_type_enum_to_class
::type const *get() const { - if (this->data_type == DT) { - return static_cast const *>(this->ptr); - } else { - throw mk_runtime_error(fmt::format( - "Invalid access data type ({} != {})", this->data_type, DT)); - } + real_type_t
&at(FFOrdered const &indices) { + return this->at
(legion_ordered_from_ff_ordered(indices)); } - int32_t const *get_int32_ptr() const; - int64_t const *get_int64_ptr() const; - float const *get_float_ptr() const; - double const *get_double_ptr() const; - half const *get_half_ptr() const; + template + real_type_t
&at(LegionOrdered const &indices) { + ASSERT(this->device_type == DeviceType::CPU, + "GenericTensorAccessorW::at() requires CPU-allocated tensor"); + ASSERT(this->data_type == DT, "Invalid datatype requested"); + + using T = real_type_t
; + T *data_ptr = static_cast(this->ptr); + nonnegative_int offset = calculate_accessor_offset(indices, this->shape); + return data_ptr[offset.unwrap_nonnegative()]; + } + + template + real_type_t
const &at(FFOrdered const &indices) const { + return this->at
(legion_ordered_from_ff_ordered(indices)); + } + + template + real_type_t
&at(LegionOrdered const &indices) const { + ASSERT(this->device_type == DeviceType::CPU, + "GenericTensorAccessorW::at() requires CPU-allocated tensor"); + ASSERT(this->data_type == DT, "Invalid datatype requested"); + + using T = real_type_t
; + T const *data_ptr = static_cast(this->ptr); + nonnegative_int offset = calculate_accessor_offset(indices, this->shape); + return data_ptr[offset]; + } public: DataType data_type; ArrayShape shape; - req ptr; + void *ptr; + DeviceType device_type; + +private: + std::tuple + tie() const; }; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorR, - data_type, - shape, - ptr); - -std::string format_as(GenericTensorAccessorR const &); -std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); -int32_t *get_int32_ptr(GenericTensorAccessorW const &); -int64_t *get_int64_ptr(GenericTensorAccessorW const &); -float *get_float_ptr(GenericTensorAccessorW const &); -double *get_double_ptr(GenericTensorAccessorW const &); -half *get_half_ptr(GenericTensorAccessorW const &); -std::vector - get_int32_ptrs(std::vector const &); -std::vector - get_int64_ptrs(std::vector const &); -std::vector - get_float_ptrs(std::vector const &); -std::vector - get_double_ptrs(std::vector const &); -std::vector get_half_ptrs(std::vector const &); +std::string format_as(GenericTensorAccessorW const &); +std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); static_assert(is_fmtable const &>::value, ""); template typename data_type_enum_to_class
::type * get(GenericTensorAccessorW const &a) { - if (a.data_type == DT) { - return static_cast *>(a.ptr); - } else { - throw mk_runtime_error( - fmt::format("Invalid access data type ({} != {})", a.data_type, DT)); - } + ASSERT(a.data_type == DT, "Invalid datatype requested"); + return static_cast *>(a.ptr); } template @@ -113,12 +176,8 @@ std::vector *> template typename data_type_enum_to_class
::type const * get(GenericTensorAccessorR const &a) { - if (a.data_type == DT) { - return static_cast const *>(a.ptr); - } else { - throw mk_runtime_error( - fmt::format("Invalid access data type ({} != {})", a.data_type, DT)); - } + ASSERT(a.data_type == DT, "Invalid datatype requested"); + return static_cast const *>(a.ptr); } int32_t const *get_int32_ptr(GenericTensorAccessorR const &); @@ -137,6 +196,21 @@ std::vector std::vector get_half_ptrs(std::vector const &); +int32_t *get_int32_ptr(GenericTensorAccessorW const &); +int64_t *get_int64_ptr(GenericTensorAccessorW const &); +float *get_float_ptr(GenericTensorAccessorW const &); +double *get_double_ptr(GenericTensorAccessorW const &); +half *get_half_ptr(GenericTensorAccessorW const &); +std::vector + get_int32_ptrs(std::vector const &); +std::vector + get_int64_ptrs(std::vector const &); +std::vector + get_float_ptrs(std::vector const &); +std::vector + get_double_ptrs(std::vector const &); +std::vector get_half_ptrs(std::vector const &); + template std::vector const *> get(std::vector const &accs) { @@ -150,12 +224,8 @@ std::vector const *> GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &write_accessor); -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2); - -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype); +bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2); bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, ArrayShape const &expected_shape, @@ -163,8 +233,9 @@ bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, std::pair get_shape_and_datatype(GenericTensorAccessorR const &accessor); -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor); + +void copy_accessor_data_to_l_from_r(GenericTensorAccessorW 
&dst_accessor, + GenericTensorAccessorR const &src_accessor); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h index 6500899394..39bad6599c 100644 --- a/lib/kernels/include/kernels/allocation.h +++ b/lib/kernels/include/kernels/allocation.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_ALLOCATION_H #define _FLEXFLOW_KERNELS_ALLOCATION_H -#include "accessor.h" +#include "kernels/accessor.h" #include #include @@ -11,6 +11,8 @@ struct IAllocator { virtual void *allocate(size_t) = 0; virtual void deallocate(void *) = 0; + virtual DeviceType get_allocation_device_type() const = 0; + virtual ~IAllocator() = default; }; @@ -18,9 +20,14 @@ struct Allocator { Allocator() = delete; GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape); + void deallocate_tensor(GenericTensorAccessorW const &); + void deallocate_tensor(GenericTensorAccessorR const &); + void *allocate(size_t mem_size); void deallocate(void *ptr); + DeviceType get_allocation_device_type() const; + template static typename std::enable_if::value, Allocator>::type diff --git a/lib/kernels/include/kernels/array_coord.struct.toml b/lib/kernels/include/kernels/array_coord.struct.toml new file mode 100644 index 0000000000..8ce121f2bf --- /dev/null +++ b/lib/kernels/include/kernels/array_coord.struct.toml @@ -0,0 +1,19 @@ +namespace = "FlexFlow" +name = "ArrayCoord" +features = [ + "eq", + "ord", + "hash", + "fmt", + "rapidcheck", + "json", +] + +includes = [ + "op-attrs/ff_ordered/ff_ordered.h", + "utils/nonnegative_int/nonnegative_int.h" +] + +[[fields]] +name = "ff_ordered" +type = "::FlexFlow::FFOrdered<::FlexFlow::nonnegative_int>" diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index 57498ee466..25ef8116f2 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_ARRAY_SHAPE_H #define 
_FLEXFLOW_KERNELS_ARRAY_SHAPE_H +#include "kernels/array_coord.dtg.h" #include "kernels/legion_dim.h" #include "op-attrs/tensor_shape.dtg.h" #include "utils/nonnegative_int/nonnegative_int.h" @@ -15,9 +16,7 @@ namespace FlexFlow { struct ArrayShape { public: ArrayShape() = delete; - ArrayShape(nonnegative_int *dims, nonnegative_int num_dims); - ArrayShape(TensorShape const &shape); - ArrayShape(std::vector const &); + explicit ArrayShape(LegionOrdered const &dims); /** * @brief Alias of ArrayShape::num_elements for compatibility with @@ -46,24 +45,40 @@ struct ArrayShape { std::optional at_maybe(legion_dim_t) const; std::optional at_maybe(ff_dim_t) const; - ArrayShape - sub_shape(std::optional> start, - std::optional> end) const; + ArrayShape sub_shape(ff_dim_t const &start, + std::optional const &end) const; + + ArrayShape sub_shape(legion_dim_t const &start, + std::optional const &end) const; public: LegionOrdered dims; private: std::tuple tie() const; + + friend ::std::hash; }; +std::string format_as(ArrayShape const &); +std::ostream &operator<<(std::ostream &, ArrayShape const &); + nonnegative_int get_volume(ArrayShape const &); +ArrayShape array_shape_from_tensor_shape(TensorShape const &); TensorShape get_tensor_shape(ArrayShape const &, DataType); -std::string format_as(ArrayShape const &); -std::ostream &operator<<(std::ostream &, ArrayShape const &); +std::unordered_set get_array_coord_set(ArrayShape const &); } // namespace FlexFlow +namespace std { + +template <> +struct hash<::FlexFlow::ArrayShape> { + size_t operator()(::FlexFlow::ArrayShape const &) const; +}; + +} // namespace std + #endif diff --git a/lib/kernels/include/kernels/attention_kernels.h b/lib/kernels/include/kernels/attention_kernels.h index eb5a1b8198..b3c77d3430 100644 --- a/lib/kernels/include/kernels/attention_kernels.h +++ b/lib/kernels/include/kernels/attention_kernels.h @@ -1,7 +1,6 @@ #ifndef _FLEXFLOW_OPS_KERNELS_ATTENTION_KERNELS_H #define 
_FLEXFLOW_OPS_KERNELS_ATTENTION_KERNELS_H -#include "device.h" #include "kernels/allocation.h" #include "kernels/device.h" #include "kernels/ff_handle.h" @@ -64,8 +63,7 @@ FF_VISITABLE_STRUCT_NO_EQ(MHAPerDeviceState, std::string format_as(MHAPerDeviceState const &x); std::ostream &operator<<(std::ostream &s, MHAPerDeviceState const &x); -namespace Kernels { -namespace MultiHeadAttention { +namespace Kernels::MultiHeadAttention { MHAPerDeviceState init_kernel(PerDeviceFFHandle const &, Allocator &, @@ -105,8 +103,7 @@ void backward_kernel(ffStream_t stream, void cleanup_kernel(Allocator &allocator, MHAPerDeviceState const &device_state); -} // namespace MultiHeadAttention -} // namespace Kernels +} // namespace Kernels::MultiHeadAttention } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/batch_matmul_kernels.h b/lib/kernels/include/kernels/batch_matmul_kernels.h index bfd72647b0..8b67f564d2 100644 --- a/lib/kernels/include/kernels/batch_matmul_kernels.h +++ b/lib/kernels/include/kernels/batch_matmul_kernels.h @@ -1,13 +1,11 @@ #ifndef _FLEXFLOW_OPS_KERNELS_BATCH_MATMUL_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_BATCH_MATMUL_KERNELS_H -#include "device.h" #include "kernels/allocation.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" -namespace FlexFlow { -namespace Kernels { -namespace BatchMatmul { +namespace FlexFlow::Kernels::BatchMatmul { void forward_kernel(ffStream_t stream, PerDeviceFFHandle const &handle, @@ -35,8 +33,6 @@ void backward_kernel(ffStream_t stream, int k, int batch); -} // namespace BatchMatmul -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::BatchMatmul #endif diff --git a/lib/kernels/include/kernels/batch_norm_kernels.h b/lib/kernels/include/kernels/batch_norm_kernels.h index f2ca17f429..9bb2753a12 100644 --- a/lib/kernels/include/kernels/batch_norm_kernels.h +++ b/lib/kernels/include/kernels/batch_norm_kernels.h @@ -1,15 +1,13 @@ #ifndef _FLEXFLOW_KERNELS_BATCH_NORM_KERNELS_H 
#define _FLEXFLOW_KERNELS_BATCH_NORM_KERNELS_H -#include "device.h" #include "kernels/allocation.h" #include "kernels/batch_norm_per_device_state.dtg.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include -namespace FlexFlow { -namespace Kernels { -namespace BatchNorm { +namespace FlexFlow::Kernels::BatchNorm { BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, Allocator allocator, @@ -29,9 +27,9 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, BatchNormPerDeviceState const &per_device_state, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, float *input_grad_ptr, float const *scale_ptr, float *scale_grad_ptr, @@ -46,8 +44,5 @@ void cleanup_kernel(Allocator allocator, bool relu, float *runningMean); -} // namespace BatchNorm -} // namespace Kernels -} // namespace FlexFlow - +} // namespace FlexFlow::Kernels::BatchNorm #endif diff --git a/lib/kernels/include/kernels/cast_kernels.h b/lib/kernels/include/kernels/cast_kernels.h index 96f9aadd52..5ec4cb3975 100644 --- a/lib/kernels/include/kernels/cast_kernels.h +++ b/lib/kernels/include/kernels/cast_kernels.h @@ -1,29 +1,19 @@ #ifndef _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_H -#include "device.h" #include "kernels/accessor.h" -#include "kernels/ff_handle.h" -#include "op-attrs/activation.dtg.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Cast { +namespace FlexFlow::Kernels::Cast { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorR const &output, + 
GenericTensorAccessorW const &input); -} // namespace Cast -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Cast #endif diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h new file mode 100644 index 0000000000..343ba253d9 --- /dev/null +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Cast { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); + +} // namespace FlexFlow::Kernels::Cast + +#endif diff --git a/lib/kernels/include/kernels/combine_kernels.h b/lib/kernels/include/kernels/combine_kernels.h index eb263e0734..c87465a01f 100644 --- a/lib/kernels/include/kernels/combine_kernels.h +++ b/lib/kernels/include/kernels/combine_kernels.h @@ -1,12 +1,10 @@ #ifndef _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Combine { +namespace FlexFlow::Kernels::Combine { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, @@ -16,8 +14,6 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad); -} // namespace Combine -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Combine #endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H diff --git a/lib/kernels/include/kernels/combine_kernels_cpu.h b/lib/kernels/include/kernels/combine_kernels_cpu.h new file mode 100644 index 0000000000..75fdd56498 --- /dev/null +++ 
b/lib/kernels/include/kernels/combine_kernels_cpu.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Combine { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); + +} // namespace FlexFlow::Kernels::Combine + +#endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/concat_kernels.h b/lib/kernels/include/kernels/concat_kernels.h index a44affc1f2..1e3c55bf59 100644 --- a/lib/kernels/include/kernels/concat_kernels.h +++ b/lib/kernels/include/kernels/concat_kernels.h @@ -1,12 +1,10 @@ #ifndef _FLEXFLOW_OPS_KERNELS_CONCAT_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_CONCAT_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Concat { +namespace FlexFlow::Kernels::Concat { void forward_kernel(ffStream_t stream, GenericTensorAccessorW const &output, @@ -18,8 +16,6 @@ void backward_kernel(ffStream_t stream, std::vector const &input_grads, ff_dim_t axis); -} // namespace Concat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Concat #endif diff --git a/lib/kernels/include/kernels/conv_2d_kernels.h b/lib/kernels/include/kernels/conv_2d_kernels.h index cfc64f963d..3b7c0672df 100644 --- a/lib/kernels/include/kernels/conv_2d_kernels.h +++ b/lib/kernels/include/kernels/conv_2d_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include "op-attrs/activation.dtg.h" #include "utils/visitable.h" @@ -34,8 +34,7 @@ 
FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Conv2DPerDeviceState, bwdFilterAlgo, bwdDataAlgo); -namespace Kernels { -namespace Conv2D { +namespace Kernels::Conv2D { Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, std::optional activation, @@ -61,17 +60,16 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *filter_ptr, float *filter_grad_ptr, float *bias_grad_ptr, std::optional activation); -} // namespace Conv2D -} // namespace Kernels +} // namespace Kernels::Conv2D } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H diff --git a/lib/kernels/include/kernels/copy_tensor_accessor.h b/lib/kernels/include/kernels/copy_tensor_accessor.h new file mode 100644 index 0000000000..81fd59dafb --- /dev/null +++ b/lib/kernels/include/kernels/copy_tensor_accessor.h @@ -0,0 +1,27 @@ +#ifndef _FLEXFLOW_KERNELS_COPY_TENSOR_ACCESSOR_H +#define _FLEXFLOW_KERNELS_COPY_TENSOR_ACCESSOR_H + +#include "kernels/accessor.h" +#include "kernels/allocation.h" + +namespace FlexFlow { + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator); + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator); + +GenericTensorAccessorR + copy_tensor_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &, + Allocator &cpu_allocator); + +GenericTensorAccessorW + copy_tensor_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &, + Allocator &cpu_allocator); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/datatype_dispatch.h b/lib/kernels/include/kernels/datatype_dispatch.h index e83fc3325d..50ca66a820 100644 --- a/lib/kernels/include/kernels/datatype_dispatch.h +++ 
b/lib/kernels/include/kernels/datatype_dispatch.h @@ -1,7 +1,8 @@ #ifndef _FLEXFLOW_KERNELS_DATATYPE_DISPATCH_H #define _FLEXFLOW_KERNELS_DATATYPE_DISPATCH_H -#include "accessor.h" +#include "op-attrs/datatype.h" +#include "utils/exception.h" namespace FlexFlow { @@ -33,7 +34,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(Args... args) const { + Out operator()(Args &&...args) const { return F
{}(std::forward(args)...); } }; @@ -41,7 +42,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(DataType data_type, Args... args) { + Out operator()(DataType data_type, Args &&...args) { return dispatch(data_type, std::forward(args)...); } }; @@ -54,13 +55,13 @@ struct DataTypeDispatch2 { template struct OutputType { template - void operator()(Args... args) const { + void operator()(Args &&...args) const { F{}(std::forward(args)...); } }; template - void operator()(DataType output_type, Args... args) const { + void operator()(DataType output_type, Args &&...args) const { dispatch(output_type, std::forward(args)...); } }; @@ -68,7 +69,7 @@ struct DataTypeDispatch2 { template void operator()(DataType input_data_type, DataType output_data_type, - Args... args) { + Args &&...args) { dispatch( input_data_type, output_data_type, std::forward(args)...); } diff --git a/lib/kernels/include/kernels/dropout_kernels.h b/lib/kernels/include/kernels/dropout_kernels.h index c0e503be5b..2cc6dd60a3 100644 --- a/lib/kernels/include/kernels/dropout_kernels.h +++ b/lib/kernels/include/kernels/dropout_kernels.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H -#include "device.h" #include "kernels/allocation.h" #include "kernels/array_shape.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include @@ -31,8 +31,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(DropoutPerDeviceState, reserveSpaceSize, dropoutStateSize); -namespace Kernels { -namespace Dropout { +namespace Kernels::Dropout { DropoutPerDeviceState init_kernel(PerDeviceFFHandle handle, float rate, @@ -56,8 +55,7 @@ void cleanup_kernel(Allocator allocator, ffDropoutDescriptor_t dropoutDesc, void *dropoutStates); -} // namespace Dropout -} // namespace Kernels +} // namespace Kernels::Dropout } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H diff --git 
a/lib/kernels/include/kernels/element_binary_kernels.h b/lib/kernels/include/kernels/element_binary_kernels.h index 41447e98e6..fd596f2ccf 100644 --- a/lib/kernels/include/kernels/element_binary_kernels.h +++ b/lib/kernels/include/kernels/element_binary_kernels.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_OPS_KERNELS_ELEMENT_BINARY_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_ELEMENT_BINARY_KERNELS_H -#include "device.h" #include "ff_handle.h" #include "kernels/array_shape.h" +#include "kernels/device.h" #include "op-attrs/datatype.h" #include "op-attrs/operator_type.h" @@ -26,8 +26,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(ElementBinaryPerDeviceState, opDesc, reduceAddDesc); -namespace Kernels { -namespace ElementBinary { +namespace Kernels::ElementBinary { ElementBinaryPerDeviceState init_kernel(PerDeviceFFHandle handle, OperatorType op_type, @@ -58,8 +57,7 @@ void backward_kernel(ffStream_t stream, bool broadcast_inputRHS, PerDeviceFFHandle handle); -} // namespace ElementBinary -} // namespace Kernels +} // namespace Kernels::ElementBinary } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/element_unary_kernels.h b/lib/kernels/include/kernels/element_unary_kernels.h index 8c6864b2d9..0257b3b4a6 100644 --- a/lib/kernels/include/kernels/element_unary_kernels.h +++ b/lib/kernels/include/kernels/element_unary_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_ELEMENT_UNARY_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_ELEMENT_UNARY_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include "op-attrs/ops/element_unary.h" #include @@ -19,8 +19,7 @@ FF_VISITABLE_STRUCT_NO_EQ(ElementUnaryPerDeviceState, outputTensor, actiDesc); -namespace Kernels { -namespace ElementUnary { +namespace Kernels::ElementUnary { ElementUnaryPerDeviceState init_kernel(ArrayShape const &input_shape, ArrayShape const &output_shape, @@ -37,13 +36,12 @@ void backward_kernel(ffStream_t stream, 
ElementUnaryPerDeviceState const &device_state, ElementUnaryAttrs const &attrs, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad); -} // namespace ElementUnary -} // namespace Kernels +} // namespace Kernels::ElementUnary } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/embedding_kernels.h b/lib/kernels/include/kernels/embedding_kernels.h index 06582ca1d5..f51a730314 100644 --- a/lib/kernels/include/kernels/embedding_kernels.h +++ b/lib/kernels/include/kernels/embedding_kernels.h @@ -1,13 +1,11 @@ #ifndef _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "op-attrs/ops/embedding.h" -namespace FlexFlow { -namespace Kernels { -namespace Embedding { +namespace FlexFlow::Kernels::Embedding { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, @@ -19,11 +17,11 @@ void forward_kernel(ffStream_t stream, int out_dim, int batch_size); void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, - DataType input_data_type, DataType output_data_type, + DataType input_data_type, std::optional aggr, int in_dim, int out_dim, @@ -35,8 +33,6 @@ void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p); template __global__ void rand_generate_int(TD *ptr, size_t size, TD p); -} // namespace Embedding -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Embedding #endif // _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H diff 
--git a/lib/kernels/include/kernels/ff_handle.h b/lib/kernels/include/kernels/ff_handle.h index 179ce41cbf..31b3296a98 100644 --- a/lib/kernels/include/kernels/ff_handle.h +++ b/lib/kernels/include/kernels/ff_handle.h @@ -5,7 +5,7 @@ #include #endif -#include "device.h" +#include "kernels/device.h" #include "utils/visitable.h" namespace FlexFlow { diff --git a/lib/kernels/include/kernels/flat_kernels.h b/lib/kernels/include/kernels/flat_kernels.h index 3e600c48de..b2b1164f92 100644 --- a/lib/kernels/include/kernels/flat_kernels.h +++ b/lib/kernels/include/kernels/flat_kernels.h @@ -1,23 +1,20 @@ #ifndef _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Flat { +namespace FlexFlow::Kernels::Flat { void forward_kernel(ffStream_t stream, GenericTensorAccessorR input, float *output_ptr); + void backward_kernel(ffStream_t stream, GenericTensorAccessorR input, - float *input_grad_ptr, - float const *output_grad_ptr); + float const *output_grad_ptr, + float *input_grad_ptr); -} // namespace Flat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Flat #endif // _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H diff --git a/lib/kernels/include/kernels/format_accessor_contents.h b/lib/kernels/include/kernels/format_accessor_contents.h new file mode 100644 index 0000000000..b50cffbbef --- /dev/null +++ b/lib/kernels/include/kernels/format_accessor_contents.h @@ -0,0 +1,13 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FORMAT_ACCESSOR_CONTENTS_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FORMAT_ACCESSOR_CONTENTS_H + +#include "kernels/accessor.h" + +namespace FlexFlow { + +std::string format_accessor_r_contents(GenericTensorAccessorR const &); +std::string format_accessor_w_contents(GenericTensorAccessorW const &); + +} // namespace FlexFlow + +#endif diff --git 
a/lib/kernels/include/kernels/gather_kernels.h b/lib/kernels/include/kernels/gather_kernels.h index 13bf4b898a..8cbc7e457e 100644 --- a/lib/kernels/include/kernels/gather_kernels.h +++ b/lib/kernels/include/kernels/gather_kernels.h @@ -15,23 +15,21 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GatherPerDeviceState, handle, legion_dim); -namespace Kernels { -namespace Gather { +namespace Kernels::Gather { void forward_kernel(ffStream_t stream, - GatherPerDeviceState const &m, + GatherPerDeviceState const &per_device_state, GenericTensorAccessorR const &input, GenericTensorAccessorR const &index, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GatherPerDeviceState const &m, + GatherPerDeviceState const &per_device_state, GenericTensorAccessorR const &output_grad, GenericTensorAccessorR const &index, GenericTensorAccessorW const &input_grad); -} // namespace Gather -} // namespace Kernels +} // namespace Kernels::Gather } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/layer_norm_kernels.h b/lib/kernels/include/kernels/layer_norm_kernels.h index be13d32879..10cf2fb14b 100644 --- a/lib/kernels/include/kernels/layer_norm_kernels.h +++ b/lib/kernels/include/kernels/layer_norm_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H -#include "device.h" #include "kernels/allocation.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" namespace FlexFlow { @@ -30,8 +30,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LayerNormPerDeviceState, bias, data_type); -namespace Kernels { -namespace LayerNorm { +namespace Kernels::LayerNorm { // todo: this may have some problem. 
LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &handle, @@ -57,8 +56,7 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorW const &gamma_grad, GenericTensorAccessorW const &beta_grad); -} // namespace LayerNorm -} // namespace Kernels +} // namespace Kernels::LayerNorm } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H diff --git a/lib/kernels/include/kernels/legion_dim.h b/lib/kernels/include/kernels/legion_dim.h index 7b9b9c455c..947bbd00bb 100644 --- a/lib/kernels/include/kernels/legion_dim.h +++ b/lib/kernels/include/kernels/legion_dim.h @@ -2,7 +2,13 @@ #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LEGION_DIM_H #include "kernels/legion_dim_t.dtg.h" -#include "op-attrs/dim_ordered/dim_ordered.h" +#include "kernels/legion_ordered/legion_ordered.h" +#include "op-attrs/ff_dim_t.dtg.h" +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "utils/containers/set_of.h" +#include "utils/containers/transform.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { @@ -11,7 +17,10 @@ legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value); legion_dim_t legion_dim_from_ff_dim(ff_dim_t, nonnegative_int num_dimensions); template -using LegionOrdered = DimOrdered; +std::set key_range(LegionOrdered const &d) { + return transform(set_of(nonnegative_range(num_elements(d))), + [](nonnegative_int i) { return legion_dim_t{i}; }); +} template FFOrdered @@ -25,17 +34,6 @@ LegionOrdered return LegionOrdered(ff_ordered.rbegin(), ff_ordered.rend()); } -template -std::string format_as(LegionOrdered const &v) { - std::vector as_vec(v.cbegin(), v.cend()); - return fmt::format("", as_vec); -} - -template -std::ostream &operator<<(std::ostream &s, LegionOrdered const &v) { - return (s << fmt::to_string(v)); -} - } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/legion_ordered/legion_ordered.h 
b/lib/kernels/include/kernels/legion_ordered/legion_ordered.h new file mode 100644 index 0000000000..ad8b3bad6d --- /dev/null +++ b/lib/kernels/include/kernels/legion_ordered/legion_ordered.h @@ -0,0 +1,197 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_LEGION_ORDERED_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_LEGION_ORDERED_H + +#include "kernels/legion_dim_t.dtg.h" +#include "utils/fmt/vector.h" +#include "utils/stack_vector/stack_vector.h" + +namespace FlexFlow { + +template +struct LegionOrdered { + LegionOrdered() {} + + LegionOrdered(std::initializer_list const &l) + : contents(l.begin(), l.end()) {} + + LegionOrdered(std::vector const &contents) + : contents(contents.begin(), contents.end()) {} + + template + LegionOrdered(It begin, It end) : contents(begin, end) {} + + template + LegionOrdered(stack_vector const &contents) + : contents(contents.begin(), contents.end()) {} + + T const &at(legion_dim_t idx) const { + int raw = idx.value.unwrap_nonnegative(); + return this->contents.at(raw); + } + + T &at(legion_dim_t idx) { + int raw = idx.value.unwrap_nonnegative(); + return this->contents.at(raw); + } + + T const &operator[](legion_dim_t idx) const { + return this->at(idx); + } + + T &operator[](legion_dim_t idx) { + return this->at(idx); + } + + bool idx_is_valid(legion_dim_t const &idx) const { + int raw = idx.value.unwrap_nonnegative(); + return raw < this->contents.size(); + } + + bool operator==(LegionOrdered const &other) const { + return this->contents == other.contents; + } + + bool operator!=(LegionOrdered const &other) const { + return this->contents != other.contents; + } + + using iterator = typename stack_vector::iterator; + using const_iterator = + typename stack_vector::const_iterator; + using reverse_iterator = + typename stack_vector::reverse_iterator; + using const_reverse_iterator = + typename stack_vector::const_reverse_iterator; + using value_type = T; + using pointer = value_type *; + using 
const_pointer = value_type const *; + using reference = value_type &; + using const_reference = value_type const &; + + iterator begin() { + return this->contents.begin(); + } + + const_iterator begin() const { + return this->cbegin(); + } + + const_iterator cbegin() const { + return this->contents.cbegin(); + } + + iterator end() { + return this->contents.end(); + } + + const_iterator end() const { + return this->cend(); + } + + const_iterator cend() const { + return this->contents.cend(); + } + + reverse_iterator rbegin() { + return this->contents.rbegin(); + } + + const_reverse_iterator rbegin() const { + return this->crbegin(); + } + + const_reverse_iterator crbegin() const { + return this->contents.crbegin(); + } + + reverse_iterator rend() { + return this->contents.rend(); + } + + const_reverse_iterator rend() const { + return this->crend(); + } + + const_reverse_iterator crend() const { + return this->contents.crend(); + } + + size_t size() const { + return this->contents.size(); + } + + size_t empty() const { + return this->contents.empty(); + } + + size_t num_dims() const { + return this->size(); + } + + friend struct ::std::hash; + +private: + stack_vector contents; +}; + +template +auto operator<(LegionOrdered const &lhs, LegionOrdered const &rhs) + -> std::enable_if_t, bool> { + return std::lexicographical_compare( + lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend()); +} + +template +std::string format_as(LegionOrdered const &v) { + std::vector as_vec(v.cbegin(), v.cend()); + return fmt::format("", as_vec); +} + +template +std::ostream &operator<<(std::ostream &s, LegionOrdered const &v) { + return (s << fmt::to_string(v)); +} + +} // namespace FlexFlow + +namespace nlohmann { +template +struct adl_serializer<::FlexFlow::LegionOrdered> { + static ::FlexFlow::LegionOrdered from_json(nlohmann::json const &j) { + return {j.template get>()}; + } + + static void to_json(nlohmann::json &j, + ::FlexFlow::LegionOrdered const &x) { + j = std::vector{x.cbegin(), 
x.cend()}; + } +}; +} // namespace nlohmann + +namespace std { + +template +struct hash<::FlexFlow::LegionOrdered> { + size_t operator()(::FlexFlow::LegionOrdered const &t) const { + static_assert(::FlexFlow::is_hashable::value, + "Elements must be hashable"); + + return get_std_hash(t.contents); + } +}; + +} // namespace std + +namespace rc { + +template +struct Arbitrary<::FlexFlow::LegionOrdered> { + static Gen<::FlexFlow::LegionOrdered> arbitrary() { + return gen::construct<::FlexFlow::LegionOrdered>( + gen::arbitrary<::FlexFlow::stack_vector>()); + } +}; + +} // namespace rc + +#endif diff --git a/lib/kernels/include/kernels/legion_ordered/slice.h b/lib/kernels/include/kernels/legion_ordered/slice.h new file mode 100644 index 0000000000..6980c0d9ec --- /dev/null +++ b/lib/kernels/include/kernels/legion_ordered/slice.h @@ -0,0 +1,24 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_SLICE_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_SLICE_H + +#include "kernels/legion_ordered/legion_ordered.h" +#include "utils/containers/slice.h" +#include "utils/containers/transform.h" +#include "utils/containers/vector_of.h" + +namespace FlexFlow { + +template +LegionOrdered slice(LegionOrdered const &d, + legion_dim_t const &start, + std::optional const &end) { + int raw_start = start.value.unwrap_nonnegative(); + std::optional raw_end = transform( + end, [](legion_dim_t const &i) { return i.value.unwrap_nonnegative(); }); + + return LegionOrdered{slice(vector_of(d), raw_start, raw_end)}; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/legion_ordered/transform.h b/lib/kernels/include/kernels/legion_ordered/transform.h new file mode 100644 index 0000000000..55cc1ff1ea --- /dev/null +++ b/lib/kernels/include/kernels/legion_ordered/transform.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_TRANSFORM_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_TRANSFORM_H + 
+#include "kernels/legion_ordered/legion_ordered.h" +#include "utils/containers/vector_of.h" +#include "utils/containers/vector_transform.h" + +namespace FlexFlow { + +template > +LegionOrdered transform(LegionOrdered const &d, F &&f) { + return LegionOrdered{vector_transform(vector_of(d), f)}; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/linear_kernels.h b/lib/kernels/include/kernels/linear_kernels.h index 3128e39fd0..21d84c2567 100644 --- a/lib/kernels/include/kernels/linear_kernels.h +++ b/lib/kernels/include/kernels/linear_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H -#include "device.h" #include "ff_handle.h" +#include "kernels/device.h" #include "op-attrs/datatype.h" #include "op-attrs/ops/linear_attrs.dtg.h" @@ -33,8 +33,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LinearPerDeviceState, weight_type, output_type); -namespace Kernels { -namespace Linear { +namespace Kernels::Linear { LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, float *one_ptr, @@ -51,29 +50,28 @@ bool use_activation(Activation activation); void forward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *output_ptr, - void const *filter_ptr, - void const *bias_ptr, + float const *input_ptr, + float *output_ptr, + float const *filter_ptr, + float const *bias_ptr, int in_dim, int out_dim, int batch_size); void backward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, - void const *output_ptr, - void *output_grad_ptr, - void const *kernel_ptr, - void *kernel_grad_ptr, - void *bias_ptr, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_grad_ptr, int in_dim, int out_dim, int batch_size); -} // namespace Linear -} // namespace Kernels +} // namespace 
Kernels::Linear } // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/local_cpu_allocator.h b/lib/kernels/include/kernels/local_cpu_allocator.h similarity index 74% rename from lib/local-execution/include/local-execution/local_cpu_allocator.h rename to lib/kernels/include/kernels/local_cpu_allocator.h index d1e81facf2..9653dcf00e 100644 --- a/lib/local-execution/include/local-execution/local_cpu_allocator.h +++ b/lib/kernels/include/kernels/local_cpu_allocator.h @@ -1,3 +1,6 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LOCAL_CPU_ALLOCATOR_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LOCAL_CPU_ALLOCATOR_H + #include "kernels/allocation.h" #include @@ -12,6 +15,8 @@ struct LocalCPUAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + DeviceType get_allocation_device_type() const override; + private: std::unordered_map> ptrs; }; @@ -20,3 +25,5 @@ CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalCPUAllocator); Allocator create_local_cpu_memory_allocator(); } // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/local_cuda_allocator.h b/lib/kernels/include/kernels/local_cuda_allocator.h index 18a4b6e78a..b8e0540974 100644 --- a/lib/kernels/include/kernels/local_cuda_allocator.h +++ b/lib/kernels/include/kernels/local_cuda_allocator.h @@ -12,6 +12,8 @@ struct LocalCudaAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + DeviceType get_allocation_device_type() const override; + private: std::unordered_set ptrs; }; diff --git a/lib/kernels/include/kernels/managed_ff_stream.h b/lib/kernels/include/kernels/managed_ff_stream.h index 2f690b2eb3..576edb0ffa 100644 --- a/lib/kernels/include/kernels/managed_ff_stream.h +++ b/lib/kernels/include/kernels/managed_ff_stream.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_MANAGED_FF_STREAM_H #define _FLEXFLOW_KERNELS_MANAGED_FF_STREAM_H -#include "device.h" +#include "kernels/device.h" namespace 
FlexFlow { @@ -19,6 +19,9 @@ struct ManagedFFStream { ffStream_t const &raw_stream() const; +private: + void cleanup(); + private: ffStream_t *stream; }; diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h index 0a83a5eecb..9bd9370685 100644 --- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h +++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h @@ -7,7 +7,10 @@ namespace FlexFlow { struct ManagedPerDeviceFFHandle { public: - ManagedPerDeviceFFHandle(); + ManagedPerDeviceFFHandle() = delete; + + ManagedPerDeviceFFHandle(size_t workSpaceSize, + bool allowTensorOpMathConversion); ManagedPerDeviceFFHandle(ManagedPerDeviceFFHandle const &) = delete; ManagedPerDeviceFFHandle & @@ -21,6 +24,9 @@ struct ManagedPerDeviceFFHandle { PerDeviceFFHandle const &raw_handle() const; +private: + void cleanup(); + private: PerDeviceFFHandle *handle; }; diff --git a/lib/kernels/include/kernels/metrics_kernels.h b/lib/kernels/include/kernels/metrics_kernels.h index e4660808b9..430608db55 100644 --- a/lib/kernels/include/kernels/metrics_kernels.h +++ b/lib/kernels/include/kernels/metrics_kernels.h @@ -1,25 +1,24 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H -#include "perf_metrics.h" +#include "kernels/perf_metrics.h" +#include "pcg/metric_attrs.h" namespace FlexFlow { -void update_metrics_sparse_label_kernel(ffStream_t, - MetricsAttrs const &, - float const *logit_ptr, - int const *label_ptr, - int num_samples, - int num_classes, - PerfMetrics &perf_zc); -void update_metrics_label_kernel(ffStream_t, - MetricsAttrs const &, - float const *logit_ptr, - float const *label_ptr, - int num_samples, - int num_classes, - PerfMetrics &perf_zc); +void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, + int const *label_ptr, + MetricsAttrs const &me, + int num_effective_samples, + int num_classes, + 
PerfMetrics &perf_zc); +void update_metrics_label_kernel_wrapper(float const *logit_ptr, + float const *label_ptr, + MetricsAttrs const &me, + int num_samples, + int num_classes, + PerfMetrics &perf_zc); } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/nccl.h b/lib/kernels/include/kernels/nccl.h index b8a6784676..042911d172 100644 --- a/lib/kernels/include/kernels/nccl.h +++ b/lib/kernels/include/kernels/nccl.h @@ -23,15 +23,11 @@ struct ncclUniqueId {}; struct ncclComm_t {}; #endif -namespace FlexFlow { -namespace Kernels { -namespace NCCL { +namespace FlexFlow::Kernels::NCCL { ncclUniqueId generate_unique_id(); ncclComm_t create_comm(ncclUniqueId const &, int num_ranks, int my_rank); -} // namespace NCCL -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::NCCL #endif diff --git a/lib/kernels/include/kernels/optimizer_kernels.h b/lib/kernels/include/kernels/optimizer_kernels.h index 9ca6bf8e2b..d552831c78 100644 --- a/lib/kernels/include/kernels/optimizer_kernels.h +++ b/lib/kernels/include/kernels/optimizer_kernels.h @@ -1,7 +1,8 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H -#include "device.h" +#include "kernels/device.h" +#include "kernels/ff_handle.h" namespace FlexFlow { @@ -16,15 +17,18 @@ void sgd_ps_update_task_gpu(ffStream_t, float *weight_ptr, float *sgd_v_ptr); +#ifdef FF_USE_NCCL void sgd_nccl_update_task_gpu(ffStream_t, float lr, float momentum, bool nesterov, - float weight_decay PerDeviceFFHandle const &, + float weight_decay, + PerDeviceFFHandle const &, float const *weight_grad_ptr, size_t size, float *weight_ptr, float *sgd_v_ptr); +#endif void adam_ps_update_task_gpu(ffStream_t, float alpha_t, @@ -33,9 +37,11 @@ void adam_ps_update_task_gpu(ffStream_t, float weight_decay, float epsilon, float const *weight_grad_ptr, - float *adam_m_ptr, + size_t size, + int num_replicas, + float *weight_ptr, float *adam_v_ptr, 
- float *weight_ptr); + float *adam_m_ptr); void adam_nccl_update_task_gpu(ffStream_t, float alpha_t, @@ -45,9 +51,10 @@ void adam_nccl_update_task_gpu(ffStream_t, float epsilon, PerDeviceFFHandle const &, float const *weight_grad_ptr, - float *adam_m_ptr, + size_t size, + float *weight_ptr, float *adam_v_ptr, - float *weight_ptr); + float *adam_m_ptr); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/partition_kernels.h b/lib/kernels/include/kernels/partition_kernels.h index 64ef1a1352..aa3a7a1ef7 100644 --- a/lib/kernels/include/kernels/partition_kernels.h +++ b/lib/kernels/include/kernels/partition_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" namespace FlexFlow { @@ -13,8 +13,7 @@ struct RepartitionPerDeviceState { FF_VISITABLE_STRUCT_NO_EQ(RepartitionPerDeviceState, handle, data_type); -namespace Kernels { -namespace Repartition { +namespace Kernels::Repartition { RepartitionPerDeviceState init_kernel(PerDeviceFFHandle const &handle, DataType data_type); @@ -26,11 +25,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, RepartitionPerDeviceState const &m, - GenericTensorAccessorW const &output_grad, - GenericTensorAccessorR const &input_grad); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); -} // namespace Repartition -} // namespace Kernels +} // namespace Kernels::Repartition } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H diff --git a/lib/local-execution/include/local-execution/per_device_op_state.variant.toml b/lib/kernels/include/kernels/per_device_op_state.variant.toml similarity index 100% rename from lib/local-execution/include/local-execution/per_device_op_state.variant.toml rename to lib/kernels/include/kernels/per_device_op_state.variant.toml diff --git 
a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h index 798c0507f8..76aa07d0a4 100644 --- a/lib/kernels/include/kernels/pool_2d_kernels.h +++ b/lib/kernels/include/kernels/pool_2d_kernels.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H -#include "device.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include "op-attrs/activation.dtg.h" #include "op-attrs/ops/pool_2d.h" @@ -25,8 +25,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Pool2DPerDeviceState, poolDesc, relu); -namespace Kernels { -namespace Pool2D { +namespace Kernels::Pool2D { Pool2DPerDeviceState init_kernel(PerDeviceFFHandle handle, std::optional activation, @@ -70,13 +69,12 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Pool2DPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, void const *output_ptr, - void const *output_grad_ptr); + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr); -} // namespace Pool2D -} // namespace Kernels +} // namespace Kernels::Pool2D } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H diff --git a/lib/kernels/include/kernels/profiling.h b/lib/kernels/include/kernels/profiling.h index 655d540685..7c4145c426 100644 --- a/lib/kernels/include/kernels/profiling.h +++ b/lib/kernels/include/kernels/profiling.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_PROFILING_H #define _FLEXFLOW_KERNELS_PROFILING_H -#include "device.h" +#include "kernels/device.h" #include "kernels/profiling_settings.dtg.h" #include "utils/visitable.h" diff --git a/lib/kernels/include/kernels/reduce_kernels.h b/lib/kernels/include/kernels/reduce_kernels.h index 4287472875..10e8e4393b 100644 --- a/lib/kernels/include/kernels/reduce_kernels.h +++ b/lib/kernels/include/kernels/reduce_kernels.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H #define 
_FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H -#include "array_shape.h" -#include "device.h" -#include "ff_handle.h" +#include "kernels/array_shape.h" +#include "kernels/device.h" +#include "kernels/ff_handle.h" #include "op-attrs/operator_type.dtg.h" namespace FlexFlow { @@ -25,8 +25,7 @@ FF_VISITABLE_STRUCT(ReducePerDeviceState, op_type, reduction_size); -namespace Kernels { -namespace Reduce { +namespace Kernels::Reduce { ReducePerDeviceState init_kernel(PerDeviceFFHandle const &, OperatorType const &, @@ -43,8 +42,7 @@ void backward_kernel(ffStream_t stream, ReducePerDeviceState const &m, float const *output_grad_ptr, float *input_grad_ptr); -} // namespace Reduce -} // namespace Kernels +} // namespace Kernels::Reduce } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H diff --git a/lib/kernels/include/kernels/reduction_kernels.h b/lib/kernels/include/kernels/reduction_kernels.h index fb3baf215c..08f73cd9ab 100644 --- a/lib/kernels/include/kernels/reduction_kernels.h +++ b/lib/kernels/include/kernels/reduction_kernels.h @@ -1,12 +1,10 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Reduction { +namespace FlexFlow::Kernels::Reduction { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, @@ -14,11 +12,9 @@ void forward_kernel(ffStream_t stream, size_t num_replicas); void backward_kernel(ffStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); -} // namespace Reduction -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Reduction #endif // _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H diff --git a/lib/kernels/include/kernels/replicate_kernels.h b/lib/kernels/include/kernels/replicate_kernels.h 
index 409fc81f44..0b113868ee 100644 --- a/lib/kernels/include/kernels/replicate_kernels.h +++ b/lib/kernels/include/kernels/replicate_kernels.h @@ -1,24 +1,20 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Replicate { +namespace FlexFlow::Kernels::Replicate { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas); -} // namespace Replicate -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Replicate #endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h new file mode 100644 index 0000000000..2a2eaa5eb6 --- /dev/null +++ b/lib/kernels/include/kernels/replicate_kernels_cpu.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Replicate { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, + size_t num_replicas); + +} // namespace FlexFlow::Kernels::Replicate + +#endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index a83caa6bea..88c11d2fb0 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -1,8 +1,8 @@ #ifndef 
_FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "utils/required_core.h" namespace FlexFlow { @@ -13,8 +13,7 @@ struct ReshapePerDeviceState { FF_VISITABLE_STRUCT(ReshapePerDeviceState, data_type); -namespace Kernels { -namespace Reshape { +namespace Kernels::Reshape { ReshapePerDeviceState init_kernel(DataType data_type); @@ -25,11 +24,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, ReshapePerDeviceState const &per_device_state, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); -} // namespace Reshape -} // namespace Kernels +} // namespace Kernels::Reshape } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H diff --git a/lib/kernels/include/kernels/reverse_kernels.h b/lib/kernels/include/kernels/reverse_kernels.h index 42a83ae219..768707175c 100644 --- a/lib/kernels/include/kernels/reverse_kernels.h +++ b/lib/kernels/include/kernels/reverse_kernels.h @@ -1,30 +1,21 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H -#include "device.h" +#include "kernels/device.h" +#include "kernels/reverse_kernels_cpu.h" -namespace FlexFlow { -namespace Kernels { -namespace Reverse { +namespace FlexFlow::Kernels::Reverse { void forward_kernel(ffStream_t stream, - float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t output_size); + GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, + ReverseAttrs const &); void backward_kernel(ffStream_t stream, - float const *out_grad_ptr, - float *in_grad_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t input_size); + GenericTensorAccessorR const 
&output_accessor, + GenericTensorAccessorW &input_accessor, + ReverseAttrs const &); -} // namespace Reverse -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Reverse #endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h new file mode 100644 index 0000000000..ec82000f8f --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h @@ -0,0 +1,20 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" +#include "op-attrs/ops/reverse_attrs.dtg.h" + +namespace FlexFlow::Kernels::Reverse { + +void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, + ReverseAttrs const &); + +void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, + GenericTensorAccessorW &input_accessor, + ReverseAttrs const &); + +} // namespace FlexFlow::Kernels::Reverse + +#endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/reverse_kernels_params.h b/lib/kernels/include/kernels/reverse_kernels_params.h new file mode 100644 index 0000000000..766d70b915 --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_params.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REVERSE_KERNELS_PARAMS_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REVERSE_KERNELS_PARAMS_H + +#include "kernels/array_shape.h" +#include "kernels/reverse_kernels_params.dtg.h" +#include "op-attrs/ops/reverse_attrs.dtg.h" + +namespace FlexFlow { + +ReverseKernelsParams + compute_reverse_kernels_params(ArrayShape const &output_shape, + ReverseAttrs const &attrs); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/reverse_kernels_params.struct.toml b/lib/kernels/include/kernels/reverse_kernels_params.struct.toml 
new file mode 100644 index 0000000000..a5dbd750bc --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_params.struct.toml @@ -0,0 +1,28 @@ +namespace = "FlexFlow" +name = "ReverseKernelsParams" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + +[[fields]] +name = "num_out_blks" +type = "::FlexFlow::nonnegative_int" + +[[fields]] +name = "reverse_dim_size" +type = "::FlexFlow::nonnegative_int" + +[[fields]] +name = "in_blk_size" +type = "::FlexFlow::nonnegative_int" + +[[fields]] +name = "out_size" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/kernels/include/kernels/softmax_kernels.h b/lib/kernels/include/kernels/softmax_kernels.h index 061230ec52..60101578e3 100644 --- a/lib/kernels/include/kernels/softmax_kernels.h +++ b/lib/kernels/include/kernels/softmax_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_SOFTMAX_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_SOFTMAX_KERNELS_H -#include "device.h" #include "ff_handle.h" +#include "kernels/device.h" namespace FlexFlow { @@ -15,8 +15,7 @@ struct SoftmaxPerDeviceState { FF_VISITABLE_STRUCT(SoftmaxPerDeviceState, handle, inputTensor, dim); -namespace Kernels { -namespace Softmax { +namespace Kernels::Softmax { SoftmaxPerDeviceState init_kernel(PerDeviceFFHandle const &handle, int dim, @@ -31,12 +30,11 @@ void forward_kernel(ffStream_t stream, float *output_ptr); void backward_kernel(ffStream_t stream, - float *input_grad_ptr, float const *output_grad_ptr, + float *input_grad_ptr, size_t num_elements); -} // namespace Softmax -} // namespace Kernels +} // namespace Kernels::Softmax } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/split_kernels.h b/lib/kernels/include/kernels/split_kernels.h index 36434d4be8..3b580f94be 100644 --- a/lib/kernels/include/kernels/split_kernels.h +++ b/lib/kernels/include/kernels/split_kernels.h @@ -1,12 +1,9 @@ #ifndef _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H #define 
_FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H -#include "device.h" +#include "kernels/device.h" -namespace FlexFlow { - -namespace Kernels { -namespace Split { +namespace FlexFlow::Kernels::Split { void forward_kernel(ffStream_t stream, float **out_ptrs, float const *in_ptr, @@ -22,8 +19,6 @@ void backward_kernel(ffStream_t stream, coord_t num_blks, int numOutputs); -} // namespace Split -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Split #endif // _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H diff --git a/lib/kernels/include/kernels/topk_kernels.h b/lib/kernels/include/kernels/topk_kernels.h index ae1c739f6c..085594d57f 100644 --- a/lib/kernels/include/kernels/topk_kernels.h +++ b/lib/kernels/include/kernels/topk_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H -#include "device.h" #include "kernels/allocation.h" +#include "kernels/device.h" namespace FlexFlow { @@ -12,8 +12,7 @@ struct TopKPerDeviceState { FF_VISITABLE_STRUCT(TopKPerDeviceState, sorted); -namespace Kernels { -namespace TopK { +namespace Kernels::TopK { TopKPerDeviceState init_kernel(bool sorted); @@ -35,8 +34,7 @@ void backward_kernel(ffStream_t stream, int length, int k); -} // namespace TopK -} // namespace Kernels +} // namespace Kernels::TopK } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H diff --git a/lib/kernels/include/kernels/transpose_kernels.h b/lib/kernels/include/kernels/transpose_kernels.h index 0f1cc2ae61..776370dcbd 100644 --- a/lib/kernels/include/kernels/transpose_kernels.h +++ b/lib/kernels/include/kernels/transpose_kernels.h @@ -1,15 +1,14 @@ #ifndef _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "op-attrs/ops/transpose_attrs.dtg.h" #include namespace FlexFlow { -namespace Kernels { -namespace Transpose { +namespace Kernels::Transpose { 
void forward_kernel(cudaStream_t stream, TransposeAttrs const &attrs, @@ -18,11 +17,10 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, TransposeAttrs const &attrs, - GenericTensorAccessorW const &in_grad, - GenericTensorAccessorR const &out_grad); + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad); -} // namespace Transpose -} // namespace Kernels +} // namespace Kernels::Transpose } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc deleted file mode 100644 index 27b7eb390d..0000000000 --- a/lib/kernels/src/accessor.cc +++ /dev/null @@ -1,192 +0,0 @@ -#include "kernels/accessor.h" - -namespace FlexFlow { - -int32_t *GenericTensorAccessorW::get_int32_ptr() const { - return this->get(); -} - -int64_t *GenericTensorAccessorW::get_int64_ptr() const { - return this->get(); -} - -float *GenericTensorAccessorW::get_float_ptr() const { - return this->get(); -} - -double *GenericTensorAccessorW::get_double_ptr() const { - return this->get(); -} - -half *GenericTensorAccessorW::get_half_ptr() const { - return this->get(); -} - -std::string format_as(GenericTensorAccessorW const &a) { - return fmt::format("", - a.data_type, - a.shape, - a.ptr); -} - -std::ostream &operator<<(std::ostream &s, GenericTensorAccessorW const &a) { - return (s << fmt::to_string(a)); -} - -int32_t const *GenericTensorAccessorR::get_int32_ptr() const { - return this->get(); -} - -int64_t const *GenericTensorAccessorR::get_int64_ptr() const { - return this->get(); -} - -float const *GenericTensorAccessorR::get_float_ptr() const { - return this->get(); -} - -double const *GenericTensorAccessorR::get_double_ptr() const { - return this->get(); -} - -half const *GenericTensorAccessorR::get_half_ptr() const { - return get(); -} - -std::string format_as(GenericTensorAccessorR const &a) { - return fmt::format("", - a.data_type, - a.shape, - 
a.ptr); -} - -std::ostream &operator<<(std::ostream &s, GenericTensorAccessorR const &a) { - return (s << fmt::to_string(a)); -} - -int32_t *get_int32_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -int64_t *get_int64_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -float *get_float_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -double *get_double_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -half *get_half_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -std::vector - get_int32_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_int64_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_float_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_double_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_half_ptrs(std::vector const &a) { - return get(a); -} - -int32_t const *get_int32_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -int64_t const *get_int64_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -float const *get_float_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -double const *get_double_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -half const *get_half_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -std::vector - get_int32_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_int64_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_float_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_double_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_half_ptrs(std::vector const &a) { - return get(a); -} - -GenericTensorAccessorR read_only_accessor_from_write_accessor( - GenericTensorAccessorW const &writable) { - return GenericTensorAccessorR{ - writable.data_type, writable.shape, req(writable.ptr)}; -} - -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - 
GenericTensorAccessorW const &acc2) { - return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; -} - -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - -bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - -std::pair - get_shape_and_datatype(GenericTensorAccessorR const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); -} - -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); -} - -} // namespace FlexFlow diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc deleted file mode 100644 index d666592e77..0000000000 --- a/lib/kernels/src/allocation.cc +++ /dev/null @@ -1,21 +0,0 @@ -#include "kernels/allocation.h" -#include "op-attrs/tensor_shape.h" - -namespace FlexFlow { - -void *Allocator::allocate(size_t mem_size) { - return this->i_allocator->allocate(mem_size); -} - -void Allocator::deallocate(void *ptr) { - this->i_allocator->deallocate(ptr); -} - -GenericTensorAccessorW - Allocator::allocate_tensor(TensorShape const &tensor_shape) { - void *ptr = - this->allocate(get_size_in_bytes(tensor_shape).unwrap_nonnegative()); - return {tensor_shape.data_type, tensor_shape, ptr}; -} - -} // namespace FlexFlow diff --git a/lib/kernels/src/cpu/ops/cast_kernels.cc b/lib/kernels/src/cpu/ops/cast_kernels.cc new file mode 100644 index 0000000000..cdd57b8947 --- /dev/null +++ b/lib/kernels/src/cpu/ops/cast_kernels.cc @@ -0,0 +1,51 @@ +#include "kernels/cast_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow::Kernels::Cast { + +template +void 
cpu_cast_forward(IDT const *input, ODT *output, size_t volume) { + for (size_t i = 0; i < volume; ++i) { + output[i] = static_cast(input[i]); + } +} + +template +void cpu_cast_backward(IDT const *input, ODT *output, size_t volume, ODT beta) { + for (size_t i = 0; i < volume; i++) { + output[i] = static_cast(input[i]) + beta * output[i]; + } +} + +template +struct CPUForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + size_t volume = input.shape.get_volume().unwrap_nonnegative(); + cpu_cast_forward(input.get(), output.get(), volume); + } +}; + +template +struct CPUBackwardKernel { + void operator()(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + size_t volume = output.shape.get_volume().unwrap_nonnegative(); + cpu_cast_backward( + output.get(), input.get(), volume, cast_to(1.0f)); + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch2{}( + input.data_type, output.data_type, input, output); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch2{}( + output.data_type, input.data_type, output, input); +} + +} // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/src/cpu/ops/combine_kernels.cc b/lib/kernels/src/cpu/ops/combine_kernels.cc new file mode 100644 index 0000000000..577984f21a --- /dev/null +++ b/lib/kernels/src/cpu/ops/combine_kernels.cc @@ -0,0 +1,39 @@ +#include "kernels/combine_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow::Kernels::Combine { + +template +struct CPUForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + memcpy(output.get
(), + input.get
(), + input.shape.get_volume().unwrap_nonnegative() * + size_of_datatype(DT).unwrap_nonnegative()); + } +}; + +template +struct CPUBackwardKernel { + void operator()(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { + size_t num_elements = output_grad.shape.get_volume().unwrap_nonnegative(); + for (int i = 0; i < num_elements; ++i) { + input_grad.get
()[i] += output_grad.get
()[i]; + } + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch1{}(input.data_type, input, output); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { + DataTypeDispatch1{}( + input_grad.data_type, output_grad, input_grad); +} + +} // namespace FlexFlow::Kernels::Combine diff --git a/lib/kernels/src/cpu/initializer_kernels.cc b/lib/kernels/src/cpu/ops/initializer_kernels.cc similarity index 100% rename from lib/kernels/src/cpu/initializer_kernels.cc rename to lib/kernels/src/cpu/ops/initializer_kernels.cc diff --git a/lib/kernels/src/cpu/ops/replicate_kernels.cc b/lib/kernels/src/cpu/ops/replicate_kernels.cc new file mode 100644 index 0000000000..798a4ea8c7 --- /dev/null +++ b/lib/kernels/src/cpu/ops/replicate_kernels.cc @@ -0,0 +1,51 @@ +#include "kernels/datatype_dispatch.h" +#include "kernels/replicate_kernels_cpu.h" + +namespace FlexFlow::Kernels::Replicate { + +template +struct CPUForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output) { + memcpy(output.get
(), + input.get
(), + input.shape.num_elements().unwrap_nonnegative() * + size_of_datatype(DT).unwrap_nonnegative()); + } +}; + +template +struct CPUBackwardKernel { + void operator()(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, + nonnegative_int num_elements, + nonnegative_int num_replicas) { + using T = real_type_t
; + + for (nonnegative_int i : nonnegative_range(num_elements)) { + T cur_sum = 0; + for (nonnegative_int replica_idx : nonnegative_range(num_replicas)) { + cur_sum += output.at
(LegionOrdered{replica_idx, i}); + } + input.at
(LegionOrdered{i}) = cur_sum; + } + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output) { + DataTypeDispatch1{}(input.data_type, input, output); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, + size_t num_replicas) { + nonnegative_int num_elements = input.shape.num_elements(); + DataTypeDispatch1{}(input.data_type, + output, + input, + num_elements, + nonnegative_int{num_replicas}); +} + +} // namespace FlexFlow::Kernels::Replicate diff --git a/lib/kernels/src/cpu/ops/reverse_kernels.cc b/lib/kernels/src/cpu/ops/reverse_kernels.cc new file mode 100644 index 0000000000..4d9eb8cc09 --- /dev/null +++ b/lib/kernels/src/cpu/ops/reverse_kernels.cc @@ -0,0 +1,46 @@ +#include "kernels/datatype_dispatch.h" +#include "kernels/reverse_kernels_cpu.h" +#include + +namespace FlexFlow::Kernels::Reverse { + +template +struct CPUReverseForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + ReverseAttrs const &attrs) { + nonnegative_int reverse_axis_size = input.shape.at(attrs.axis); + + for (ArrayCoord const &input_coord : get_array_coord_set(input.shape)) { + nonnegative_int input_reverse_axis_coord = + input_coord.ff_ordered.at(attrs.axis); + + ArrayCoord output_coord = input_coord; + output_coord.ff_ordered.at(attrs.axis) = + nonnegative_int{reverse_axis_size.unwrap_nonnegative() - + input_reverse_axis_coord.unwrap_nonnegative() - 1}; + + output.at
(output_coord.ff_ordered) = + input.at
(input_coord.ff_ordered); + } + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, + ReverseAttrs const &attrs) { + + DataTypeDispatch1{}( + input_accessor.data_type, input_accessor, output_accessor, attrs); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad_accessor, + GenericTensorAccessorW &input_grad_accessor, + ReverseAttrs const &attrs) { + DataTypeDispatch1{}(output_grad_accessor.data_type, + output_grad_accessor, + input_grad_accessor, + attrs); +} + +} // namespace FlexFlow::Kernels::Reverse diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu index 66388c0ec8..86b2d8a437 100644 --- a/lib/kernels/src/cuda/cuda_helper.cu +++ b/lib/kernels/src/cuda/cuda_helper.cu @@ -1,4 +1,4 @@ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "utils/containers/reversed.h" @@ -29,13 +29,13 @@ cudaError_t get_legion_stream(cudaStream_t *stream) { #error "Unknown device, please make sure if CUDA is enabled" #endif -__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) { +__global__ void scale_kernel(float *ptr, size_t size, float a, float b) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = (b - a) * ptr[i] + a; } } -__global__ void ones_kernel(float *ptr, coord_t size) { +__global__ void ones_kernel(float *ptr, size_t size) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = 1.0f; } @@ -49,7 +49,7 @@ __global__ void assign_kernel(DT *ptr, size_t size, DT value) { } template -__global__ void copy_kernel(DT *dst, const DT *src, coord_t size) { +__global__ void copy_kernel(DT *dst, const DT *src, size_t size) { CUDA_KERNEL_LOOP(i, size) { dst[i] = src[i]; } @@ -281,11 +281,11 @@ template __global__ void add_kernel(bool *dst, bool const *src, unsigned long size); template __global__ void - copy_kernel(float *dst, float const *src, coord_t size); + copy_kernel(float *dst, float const *src, size_t size); template 
__global__ void - copy_kernel(int32_t *dst, int32_t const *src, coord_t size); + copy_kernel(int32_t *dst, int32_t const *src, size_t size); template __global__ void - copy_kernel(int64_t *dst, int64_t const *src, coord_t size); + copy_kernel(int64_t *dst, int64_t const *src, size_t size); template __global__ void apply_add_with_scale(float *data_ptr, float const *grad_ptr, diff --git a/lib/kernels/src/cuda/embedding_kernels.cu b/lib/kernels/src/cuda/embedding_kernels.cu index e6a614ba70..cb84f0e777 100644 --- a/lib/kernels/src/cuda/embedding_kernels.cu +++ b/lib/kernels/src/cuda/embedding_kernels.cu @@ -13,16 +13,15 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/embedding_kernels.h" -namespace FlexFlow { -namespace Kernels { -namespace Embedding { +namespace FlexFlow::Kernels::Embedding { void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); // Randomly initialize the intput tensor to avoid out of index range issues rand_generate_int<<>>( @@ -31,36 +30,14 @@ void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p) { cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); // Randomly initialize the intput tensor to avoid out of index range issues rand_generate_int<<>>( ptr, size, p); } -template -__global__ void embed_forward_no_aggr( - TI const *input, TD *output, TD const *embed, int out_dim, int batch_size); -template -__global__ void embed_forward_with_aggr(TI const *input, - TD *output, - TD const *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr); -template -__global__ void embed_backward_no_aggr( - TI const *input, TD const *output, TD *embed, int out_dim, int batch_size); -template -__global__ void embed_backward_with_aggr(TI const *input, - TD const *output, - 
TD *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr); - -template +template __global__ void embed_forward_no_aggr(int32_t const *input, TD *output, TD const *embed, @@ -75,7 +52,7 @@ __global__ void embed_forward_no_aggr(int32_t const *input, } } -template +template __global__ void embed_forward_no_aggr(int64_t const *input, TD *output, TD const *embed, @@ -90,14 +67,14 @@ __global__ void embed_forward_no_aggr(int64_t const *input, } } -template +template __global__ void embed_forward_with_aggr(int32_t const *input, TD *output, TD const *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { output[i] = 0; @@ -115,14 +92,14 @@ __global__ void embed_forward_with_aggr(int32_t const *input, } } -template +template __global__ void embed_forward_with_aggr(int64_t const *input, TD *output, TD const *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { output[i] = 0; @@ -140,7 +117,7 @@ __global__ void embed_forward_with_aggr(int64_t const *input, } } -template +template __global__ void embed_backward_no_aggr(int32_t const *input, TD const *output, TD *embed, @@ -154,7 +131,7 @@ __global__ void embed_backward_no_aggr(int32_t const *input, } } -template +template __global__ void embed_backward_no_aggr(int64_t const *input, TD const *output, TD *embed, @@ -171,11 +148,11 @@ __global__ void embed_backward_no_aggr(int64_t const *input, // Specialization for half type template <> -__global__ void embed_backward_no_aggr(int32_t const *input, - half const *output, - half *embed, - int out_dim, - int batch_size) { +__global__ void embed_backward_no_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; @@ 
-192,11 +169,11 @@ __global__ void embed_backward_no_aggr(int32_t const *input, } template <> -__global__ void embed_backward_no_aggr(int64_t const *input, - half const *output, - half *embed, - int out_dim, - int batch_size) { +__global__ void embed_backward_no_aggr(int64_t const *input, + half const *output, + half *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; @@ -212,14 +189,14 @@ __global__ void embed_backward_no_aggr(int64_t const *input, } } -template +template __global__ void embed_backward_with_aggr(int32_t const *input, TD const *output, TD *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -238,14 +215,14 @@ __global__ void embed_backward_with_aggr(int32_t const *input, } } -template +template __global__ void embed_backward_with_aggr(int64_t const *input, TD const *output, TD *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -267,14 +244,13 @@ __global__ void embed_backward_with_aggr(int64_t const *input, // Specialization for half type template <> -__global__ void - embed_backward_with_aggr(int32_t const *input, - half const *output, - half *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr) { +__global__ void embed_backward_with_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -301,14 +277,13 @@ __global__ void } template <> -__global__ void - embed_backward_with_aggr(int64_t const *input, - half const *output, - half *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional 
aggr) { +__global__ void embed_backward_with_aggr(int64_t const *input, + half const *output, + half *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -351,35 +326,229 @@ struct ForwardKernel { int in_dim, int out_dim, int batch_size) { - assert(input.data_type == DataType::INT32 || - input.data_type == DataType::INT64); - assert(weight.data_type == DataType::HALF || - weight.data_type == DataType::FLOAT || - weight.data_type == DataType::DOUBLE); + throw mk_runtime_error(fmt::format( + "Invalid type combination: input type {} and output type {}", TI, TD)); + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { if (!aggr.has_value()) { - embed_forward_no_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight.get(), + stream>>>(input.get(), + output.get(), + weight.get(), out_dim, batch_size); } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - embed_forward_with_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight.get(), + stream>>>(input.get(), + output.get(), + 
weight.get(), out_dim, in_dim, batch_size, - aggr); + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t 
stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); } } }; @@ -388,39 +557,229 @@ template struct BackwardKernel { void operator()(cudaStream_t stream, std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + throw mk_runtime_error(fmt::format( + "Invalid type combination: input type {} and output type {}", TI, TD)); + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + 
weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const 
&weight_grad, int in_dim, int out_dim, int batch_size) { - assert(input.data_type == DataType::INT32 || - input.data_type == DataType::INT64); - assert(output.data_type == DataType::HALF || - output.data_type == DataType::FLOAT || - output.data_type == DataType::DOUBLE); if (!aggr.has_value()) { - embed_backward_no_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight_grad.get(), + stream>>>(input.get(), + output.get(), + weight_grad.get(), out_dim, batch_size); } else { - embed_backward_with_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight_grad.get(), + stream>>>(input.get(), + output.get(), + weight_grad.get(), out_dim, in_dim, batch_size, - aggr); + aggr.value()); } } }; @@ -448,27 +807,25 @@ void forward_kernel(ffStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorR const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, - DataType input_data_type, DataType output_data_type, + DataType input_data_type, std::optional aggr, int in_dim, int out_dim, int batch_size) { - DataTypeDispatch2{}(input_data_type, - output_data_type, + DataTypeDispatch2{}(output_data_type, + input_data_type, stream, aggr, - input, output, + input, weight_grad, in_dim, out_dim, batch_size); } -} // namespace Embedding -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Embedding diff --git a/lib/kernels/src/cuda/loss_function_kernels.cu b/lib/kernels/src/cuda/loss_function_kernels.cu index 6c22efda21..2fccf4b48f 100644 --- a/lib/kernels/src/cuda/loss_function_kernels.cu +++ b/lib/kernels/src/cuda/loss_function_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/loss_function_kernels.h" namespace FlexFlow { diff --git a/lib/kernels/src/cuda/metrics_functions.cu b/lib/kernels/src/cuda/metrics_functions.cu index 2e037eb472..54ecd076f6 100644 --- a/lib/kernels/src/cuda/metrics_functions.cu +++ b/lib/kernels/src/cuda/metrics_functions.cu @@ -13,17 +13,42 @@ * limitations under the License. */ -#include "flexflow/model.h" -#include "flexflow/utils/cuda_helper.h" +#include "internal/device.h" +#include "kernels/metrics_kernels.h" +#include "kernels/perf_metrics.h" +#include "pcg/metric_attrs.h" namespace FlexFlow { +struct CUDAPerfMetrics { + int train_all; + int train_correct; + float cce_loss; + float sparse_cce_loss; + float mse_loss; + float rmse_loss; + float mae_loss; + double start_time; + double current_time; + + CUDAPerfMetrics() = delete; + CUDAPerfMetrics(PerfMetrics const &perf) + : train_all(perf.train_all), + train_correct(perf.train_correct.value_or(-1)), + cce_loss(perf.cce_loss.value_or(-1)), + sparse_cce_loss(perf.sparse_cce_loss.value_or(-1)), + mse_loss(perf.mse_loss.value_or(-1)), + rmse_loss(perf.rmse_loss.value_or(-1)), + mae_loss(perf.mae_loss.value_or(-1)), start_time(perf.start_time), + current_time(perf.current_time) {} +}; + float const LOG_MIN_VALUE = 0.00000001f; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, - PerfMetrics *perf, - const Metrics metrics, + CUDAPerfMetrics *perf, + const MetricsAttrs metrics, int num_samples, int num_classes) { CUDA_KERNEL_LOOP(b, num_samples) { @@ -72,8 +97,8 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, __global__ void update_metrics_label_kernel(float const *logits, float const *labels, - PerfMetrics *perf, - const Metrics metrics, + CUDAPerfMetrics *perf, + const MetricsAttrs metrics, int num_samples, int num_classes) { CUDA_KERNEL_LOOP(b, num_samples) { @@ -136,17 +161,17 @@ __global__ void 
update_metrics_label_kernel(float const *logits, } } -void Metrics::update_metrics_sparse_label_kernel_wrapper( - float const *logit_ptr, - int const *label_ptr, - Metrics const *me, - int num_effective_samples, - int num_classes, - PerfMetrics &perf_zc) { - PerfMetrics *perf; - checkCUDA(cudaMalloc(&perf, sizeof(PerfMetrics))); - checkCUDA( - cudaMemcpy(perf, &perf_zc, sizeof(PerfMetrics), cudaMemcpyHostToDevice)); +void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, + int const *label_ptr, + MetricsAttrs const &me, + int num_effective_samples, + int num_classes, + PerfMetrics &perf_zc) { + CUDAPerfMetrics perf(perf_zc); + CUDAPerfMetrics *perf_cuda; + checkCUDA(cudaMalloc(&perf_cuda, sizeof(CUDAPerfMetrics))); + checkCUDA(cudaMemcpy( + perf_cuda, &perf, sizeof(CUDAPerfMetrics), cudaMemcpyHostToDevice)); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -154,32 +179,33 @@ void Metrics::update_metrics_sparse_label_kernel_wrapper( CUDA_NUM_THREADS, 0, stream>>>( - logit_ptr, label_ptr, perf, *me, num_effective_samples, num_classes); + logit_ptr, label_ptr, perf_cuda, me, num_effective_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); - checkCUDA( - cudaMemcpy(&perf_zc, perf, sizeof(PerfMetrics), cudaMemcpyDeviceToHost)); - checkCUDA(cudaFree(perf)); + checkCUDA(cudaMemcpy( + &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); + checkCUDA(cudaFree(perf_cuda)); } -void Metrics::update_metrics_label_kernel_wrapper(float const *logit_ptr, - float const *label_ptr, - Metrics const *me, - int num_samples, - int num_classes, - PerfMetrics &perf_zc) { - PerfMetrics *perf; - checkCUDA(cudaMalloc(&perf, sizeof(PerfMetrics))); - checkCUDA( - cudaMemcpy(perf, &perf_zc, sizeof(PerfMetrics), cudaMemcpyHostToDevice)); +void update_metrics_label_kernel_wrapper(float const *logit_ptr, + float const *label_ptr, + MetricsAttrs const &me, + int num_samples, + int num_classes, + PerfMetrics &perf_zc) { + CUDAPerfMetrics 
perf(perf_zc); + CUDAPerfMetrics *perf_cuda; + checkCUDA(cudaMalloc(&perf_cuda, sizeof(CUDAPerfMetrics))); + checkCUDA(cudaMemcpy( + perf_cuda, &perf, sizeof(CUDAPerfMetrics), cudaMemcpyHostToDevice)); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); update_metrics_label_kernel<<>>( - logit_ptr, label_ptr, perf, *me, num_samples, num_classes); + logit_ptr, label_ptr, perf_cuda, me, num_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); - checkCUDA( - cudaMemcpy(&perf_zc, perf, sizeof(PerfMetrics), cudaMemcpyDeviceToHost)); - checkCUDA(cudaFree(perf)); + checkCUDA(cudaMemcpy( + &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); + checkCUDA(cudaFree(perf_cuda)); } }; // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/attention_kernels.cu b/lib/kernels/src/cuda/ops/attention_kernels.cu index 38c32ad9e4..e5bdb6f21d 100644 --- a/lib/kernels/src/cuda/ops/attention_kernels.cu +++ b/lib/kernels/src/cuda/ops/attention_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/attention_kernels.h" #include "kernels/device.h" diff --git a/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu b/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu index eb23514c5f..348eed9f0c 100644 --- a/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu +++ b/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/batch_matmul_kernels.h" namespace FlexFlow { diff --git a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu index 4e153a028e..ceb3a1b3d9 100644 --- a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu +++ b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/allocation.h" #include "kernels/batch_norm_kernels.h" #include "kernels/ff_handle.h" @@ -53,9 +53,9 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, BatchNormPerDeviceState const &m, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, float *input_grad_ptr, float const *scale_ptr, float *scale_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/cast_kernels.cu b/lib/kernels/src/cuda/ops/cast_kernels.cu index fe7aec68b9..f3ea6db660 100644 --- a/lib/kernels/src/cuda/ops/cast_kernels.cu +++ b/lib/kernels/src/cuda/ops/cast_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/cast_kernels.h" #include "kernels/datatype_dispatch.h" @@ -50,30 +50,26 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - size_t volume = input.shape.get_volume().unwrap_nonnegative(); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + size_t volume = output.shape.get_volume().unwrap_nonnegative(); cast_backward<<>>( - input.get(), output.get(), volume, cast_to(1.0f)); + output.get(), input.get(), volume, cast_to(1.0f)); } }; void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorW const &output) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + input.data_type, output.data_type, stream, input, output); } void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorR const &output, + 
GenericTensorAccessorW const &input) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + output.data_type, input.data_type, stream, output, input); } } // namespace Cast diff --git a/lib/kernels/src/cuda/ops/combine_kernels.cu b/lib/kernels/src/cuda/ops/combine_kernels.cu index 7cc67ceed8..08cc343fd2 100644 --- a/lib/kernels/src/cuda/ops/combine_kernels.cu +++ b/lib/kernels/src/cuda/ops/combine_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include "kernels/combine_kernels.h" #include "kernels/datatype_dispatch.h" diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu index 2715ff16e9..37dbbe12f8 100644 --- a/lib/kernels/src/cuda/ops/concat_kernels.cu +++ b/lib/kernels/src/cuda/ops/concat_kernels.cu @@ -13,50 +13,58 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/concat_kernels.h" #include -namespace FlexFlow { -namespace Kernels { -namespace Concat { +namespace FlexFlow::Kernels::Concat { void calc_blk_size(size_t &num_blocks, size_t &blk_size, ArrayShape const &shape, ff_dim_t axis) { - blk_size = shape.sub_shape(legion_dim_t{0_n}, axis) + legion_dim_t legion_axis = legion_dim_from_ff_dim(axis, shape.num_dims()); + assert(legion_axis.value < shape.num_dims()); + if (legion_axis.value == 0_n) { + legion_axis.value = 1_n; + } + blk_size = shape.sub_shape(legion_dim_t{0_n}, legion_axis) .num_elements() .unwrap_nonnegative(); - num_blocks = - shape.sub_shape(axis, std::nullopt).num_elements().unwrap_nonnegative(); + num_blocks = shape.sub_shape(legion_axis, std::nullopt) + .num_elements() + .unwrap_nonnegative(); } void forward_kernel(cudaStream_t stream, GenericTensorAccessorW const &output, std::vector const &inputs, ff_dim_t axis) { - size_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS]; - int num_inputs = 
inputs.size(); - assert(num_inputs <= MAX_NUM_INPUTS); + assert(inputs.size() <= MAX_NUM_INPUTS); + size_t num_blocks = 1, output_blk_size = 1; calc_blk_size(num_blocks, output_blk_size, output.shape, axis); - for (int i = 0; i < num_inputs; i++) { - size_t input_num_blocks = 1; - calc_blk_size(input_num_blocks, input_blk_sizes[i], inputs[i].shape, axis); - assert(input_num_blocks == num_blocks); - } - off_t offset = 0; - for (int i = 0; i < num_inputs; i++) { - copy_with_stride<<>>(output.get_float_ptr() + offset, - inputs[i].get_float_ptr(), - num_blocks, + input.get_float_ptr(), + blocks_to_copy, output_blk_size, - input_blk_sizes[i]); - offset += input_blk_sizes[i]; + input_blk_size); + + offset += (output_blk_size == input_blk_size) + ? input_blk_size * input_num_blocks + : input_blk_size; } } @@ -64,32 +72,32 @@ void backward_kernel(cudaStream_t stream, GenericTensorAccessorR const &output_grad, std::vector const &input_grads, ff_dim_t axis) { - size_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS]; - int num_inputs = input_grads.size(); - assert(num_inputs <= MAX_NUM_INPUTS); - + assert(input_grads.size() <= MAX_NUM_INPUTS); + size_t num_blocks = 1, output_blk_size = 1; calc_blk_size(num_blocks, output_blk_size, output_grad.shape, axis); - for (int i = 0; i < num_inputs; i++) { - ArrayShape shape = input_grads[i].shape; - size_t input_num_blocks = 1; - calc_blk_size(input_num_blocks, input_blk_sizes[i], shape, axis); - assert(input_num_blocks == num_blocks); - } - off_t offset = 0; - for (int i = 0; i < num_inputs; i++) { - add_with_stride<<>>(input_grads[i].get_float_ptr(), + stream>>>(input_grad.get_float_ptr(), output_grad.get_float_ptr() + offset, - num_blocks, - input_blk_sizes[i], + blocks_to_add, + input_blk_size, output_blk_size); - offset += input_blk_sizes[i]; + + offset += (output_blk_size == input_blk_size) + ? 
input_blk_size * input_num_blocks + : input_blk_size; } } -} // namespace Concat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Concat diff --git a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu index dac55539d2..16db62a57f 100644 --- a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu @@ -1,4 +1,4 @@ -#include "device.h" +#include "internal/device.h" #include "kernels/conv_2d_kernels.h" namespace FlexFlow { @@ -313,10 +313,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *filter_ptr, float *filter_grad_ptr, float *bias_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/dropout_kernels.cu b/lib/kernels/src/cuda/ops/dropout_kernels.cu index adf0cd8e89..c5fa56bc78 100644 --- a/lib/kernels/src/cuda/ops/dropout_kernels.cu +++ b/lib/kernels/src/cuda/ops/dropout_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/dropout_kernels.h" #include "kernels/ff_handle.h" diff --git a/lib/kernels/src/cuda/ops/element_binary_kernels.cu b/lib/kernels/src/cuda/ops/element_binary_kernels.cu index 44273a323f..3a4a77b3dd 100644 --- a/lib/kernels/src/cuda/ops/element_binary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_binary_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/element_binary_kernels.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" diff --git a/lib/kernels/src/cuda/ops/element_unary_kernels.cu b/lib/kernels/src/cuda/ops/element_unary_kernels.cu index 056c80ecf6..218e74b939 100644 --- a/lib/kernels/src/cuda/ops/element_unary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_unary_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/element_unary_kernels.h" #include "op-attrs/get_op_type.h" @@ -290,10 +290,10 @@ struct BackwardKernel { OperatorType op_type, std::optional scalar, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { checkCUDNN(cudnnSetStream(handle.dnn, stream)); if (use_cudnn(op_type)) { @@ -356,20 +356,20 @@ void backward_kernel(ffStream_t stream, ElementUnaryPerDeviceState const &device_state, ElementUnaryAttrs const &attrs, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { DataTypeDispatch1{}(input.data_type, stream, device_state, get_op_type(attrs), attrs.scalar, handle, - input, - input_grad, output, - output_grad); + output_grad, + input, + input_grad); } } // namespace ElementUnary diff --git a/lib/kernels/src/cuda/ops/flat_kernels.cu b/lib/kernels/src/cuda/ops/flat_kernels.cu index 973d05f596..594a183ff0 100644 --- 
a/lib/kernels/src/cuda/ops/flat_kernels.cu +++ b/lib/kernels/src/cuda/ops/flat_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include "kernels/flat_kernels.h" @@ -35,8 +35,8 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, GenericTensorAccessorR input, - float *input_grad_ptr, - float const *output_grad_ptr) { + float const *output_grad_ptr, + float *input_grad_ptr) { float alpha = 1.0f; apply_add_with_scale diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index 31c1bac217..19e495a540 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -13,14 +13,12 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/device.h" #include "kernels/gather_kernels.h" -namespace FlexFlow { -namespace Kernels { -namespace Gather { +namespace FlexFlow::Kernels::Gather { template __global__ void gather_forward(float const *input, @@ -125,11 +123,15 @@ void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &index, GenericTensorAccessorW const &output) { checkCUDA(get_legion_stream(&stream)); - coord_t stride = - output.shape.sub_shape(std::nullopt, add_to_legion_dim(m.legion_dim, 1)) + output.shape + .sub_shape(legion_dim_t{0_n}, add_to_legion_dim(m.legion_dim, 1)) .num_elements() .unwrap_nonnegative(); + if (m.legion_dim.value == 0_n) { + stride = 1; + } + coord_t output_dim_size = output.shape.at(m.legion_dim).unwrap_nonnegative(); coord_t input_dim_size = input.shape.at(m.legion_dim).unwrap_nonnegative(); @@ -157,9 +159,13 @@ void backward_kernel(ffStream_t stream, coord_t stride = output_grad.shape - .sub_shape(std::nullopt, add_to_legion_dim(m.legion_dim, 1)) - .get_volume() + .sub_shape(legion_dim_t{0_n}, add_to_legion_dim(m.legion_dim, 
1)) + .num_elements() .unwrap_nonnegative(); + if (m.legion_dim.value == 0_n) { + stride = 1; + } + coord_t output_dim_size = output_grad.shape.at(m.legion_dim).unwrap_nonnegative(); coord_t input_dim_size = @@ -180,6 +186,4 @@ void backward_kernel(ffStream_t stream, output_dim_size); } -} // namespace Gather -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Gather diff --git a/lib/kernels/src/cuda/ops/linear_kernels.cu b/lib/kernels/src/cuda/ops/linear_kernels.cu index ca51f0d216..02bda55828 100644 --- a/lib/kernels/src/cuda/ops/linear_kernels.cu +++ b/lib/kernels/src/cuda/ops/linear_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/allocation.h" #include "kernels/linear_kernels.h" #include "utils/integer_conversions.h" @@ -108,10 +108,10 @@ LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, void forward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *output_ptr, - void const *weight_ptr, - void const *bias_ptr, + float const *input_ptr, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, int in_dim, int out_dim, int batch_size) { @@ -135,14 +135,14 @@ void forward_kernel(cudaStream_t stream, batch_size, in_dim, &alpha, - weight_ptr, + static_cast(weight_ptr), weight_type, in_dim, - input_ptr, + static_cast(input_ptr), input_type, in_dim, &beta, - output_ptr, + static_cast(output_ptr), output_type, out_dim, compute_type, @@ -156,14 +156,14 @@ void forward_kernel(cudaStream_t stream, batch_size, 1, &alpha, - bias_ptr, + static_cast(bias_ptr), weight_type, 1, - m.one_ptr, + static_cast(m.one_ptr), CUDA_R_32F, 1, &alpha, - output_ptr, + static_cast(output_ptr), output_type, out_dim, compute_type, @@ -174,10 +174,10 @@ void forward_kernel(cudaStream_t stream, m.actiDesc, &alpha, m.outputTensor, - output_ptr, + static_cast(output_ptr), &beta, m.outputTensor, - output_ptr)); + 
static_cast(output_ptr))); } else if (m.activation == Activation::GELU) { size_t elements = size_t_from_int(out_dim) * size_t_from_int(batch_size); constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) @@ -191,13 +191,13 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, - void const *output_ptr, - void *output_grad_ptr, - void const *kernel_ptr, - void *kernel_grad_ptr, - void *bias_grad_ptr, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_grad_ptr, int in_dim, int out_dim, int batch_size) { @@ -216,11 +216,17 @@ void backward_kernel(cudaStream_t stream, int output_size = out_dim * batch_size; if (m.activation.has_value()) { if (m.activation == Activation::RELU) { - relu_backward_kernel( - m.output_type, output_grad_ptr, output_ptr, output_size, stream); + relu_backward_kernel(m.output_type, + static_cast(output_grad_ptr), + static_cast(output_ptr), + output_size, + stream); } else if (m.activation == Activation::SIGMOID) { - sigmoid_backward_kernel( - m.output_type, output_grad_ptr, output_ptr, output_size, stream); + sigmoid_backward_kernel(m.output_type, + static_cast(output_grad_ptr), + static_cast(output_ptr), + output_size, + stream); } else { // TODO: only support relu and sigmoid for now assert(false && "Unsupported activation for Linear"); @@ -235,14 +241,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - input_ptr, + static_cast(input_ptr), input_type, in_dim, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - kernel_grad_ptr, + static_cast(kernel_grad_ptr), weight_type, in_dim, compute_type, @@ -261,12 +267,12 @@ void backward_kernel(cudaStream_t stream, in_dim, out_dim, &alpha, - (float *)kernel_grad_ptr, + kernel_grad_ptr, in_dim, &lambda, - (float 
*)kernel_ptr, + kernel_ptr, in_dim, - (float *)kernel_grad_ptr, + kernel_grad_ptr, in_dim)); } else { assert(false && "Only L2 regularization is supported"); @@ -284,14 +290,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - m.one_ptr, + static_cast(m.one_ptr), CUDA_R_32F, 1, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - bias_grad_ptr, + static_cast(bias_grad_ptr), weight_type, 1, compute_type, @@ -307,14 +313,14 @@ void backward_kernel(cudaStream_t stream, batch_size, out_dim, &alpha, - kernel_ptr, + static_cast(kernel_ptr), weight_type, in_dim, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - input_grad_ptr, + static_cast(input_grad_ptr), input_type, in_dim, compute_type, diff --git a/lib/kernels/src/cuda/ops/partition_kernels.cu b/lib/kernels/src/cuda/ops/partition_kernels.cu index 2831562f58..b8dfac5204 100644 --- a/lib/kernels/src/cuda/ops/partition_kernels.cu +++ b/lib/kernels/src/cuda/ops/partition_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/partition_kernels.h" @@ -40,8 +40,8 @@ template struct BackwardKernel { void operator()(cudaStream_t stream, RepartitionPerDeviceState const &m, - GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { add_kernel> <<{}( - m.data_type, stream, m, input_grad, output_grad); + m.data_type, stream, m, output_grad, input_grad); } } // namespace Repartition diff --git a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu index 51fa29d289..e8ea3f64c2 100644 --- a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/pool_2d_kernels.h" namespace FlexFlow { @@ -112,10 +112,10 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, Pool2DPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, void const *output_ptr, - void const *output_grad_ptr) { + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); diff --git a/lib/kernels/src/cuda/ops/reduce_kernels.cu b/lib/kernels/src/cuda/ops/reduce_kernels.cu index 02a89da807..563bbae21d 100644 --- a/lib/kernels/src/cuda/ops/reduce_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduce_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/reduce_kernels.h" namespace FlexFlow { diff --git a/lib/kernels/src/cuda/ops/reduction_kernels.cu b/lib/kernels/src/cuda/ops/reduction_kernels.cu index 5d95a3766a..d9c09b082d 100644 --- a/lib/kernels/src/cuda/ops/reduction_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduction_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/reduction_kernels.h" @@ -55,8 +55,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { checkCUDA(cudaMemcpyAsync(input.get(), output.get(), input.shape.num_elements().unwrap_nonnegative() * @@ -75,9 +75,9 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { - DataTypeDispatch1{}(input.data_type, stream, input, output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch1{}(output.data_type, stream, output, input); } } // namespace Reduction diff --git a/lib/kernels/src/cuda/ops/replicate_kernels.cu b/lib/kernels/src/cuda/ops/replicate_kernels.cu index 4706f38fd4..4685fd7a2d 100644 --- a/lib/kernels/src/cuda/ops/replicate_kernels.cu +++ b/lib/kernels/src/cuda/ops/replicate_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/replicate_kernels.h" @@ -22,8 +22,8 @@ namespace Kernels { namespace Replicate { template -__global__ void replicate_backward_kernel(T *input_ptr, - T const *output_ptr, +__global__ void replicate_backward_kernel(T const *output_ptr, + T *input_ptr, size_t num_elements, size_t num_replicas) { CUDA_KERNEL_LOOP(i, num_elements) { @@ -38,7 +38,6 @@ struct ForwardKernel { void operator()(cudaStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - checkCUDA(cudaMemcpyAsync((void *)output.get(), (void *)input.get(), input.shape.num_elements().unwrap_nonnegative() * @@ -51,15 +50,15 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas) { size_t total_elements = input.shape.num_elements().unwrap_nonnegative() * num_replicas; replicate_backward_kernel> <<>>( - input.get(), output.get(), + input.get(), input.shape.num_elements().unwrap_nonnegative(), num_replicas); } @@ -72,11 +71,11 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas) { DataTypeDispatch1{}( - input.data_type, stream, input, output, num_replicas); + input.data_type, stream, output, input, num_replicas); } } // namespace Replicate diff --git a/lib/kernels/src/cuda/ops/reshape_kernels.cu b/lib/kernels/src/cuda/ops/reshape_kernels.cu index c5a289ce6b..a6a390b38e 100644 --- a/lib/kernels/src/cuda/ops/reshape_kernels.cu +++ b/lib/kernels/src/cuda/ops/reshape_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/reshape_kernels.h" @@ -43,8 +43,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { float alpha = 1.0f; apply_add_with_scale> <<{}(m.data_type, stream, input, output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch1{}(m.data_type, stream, output, input); } } // namespace Reshape diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index 8391a499df..582aa02386 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -13,13 +13,11 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/reverse_kernels.h" +#include "kernels/reverse_kernels_params.h" -namespace FlexFlow { - -namespace Kernels { -namespace Reverse { +namespace FlexFlow::Kernels::Reverse { __global__ void reverse_forward_kernel(float const *in_ptr, float *out_ptr, @@ -27,23 +25,24 @@ __global__ void reverse_forward_kernel(float const *in_ptr, coord_t reverse_dim_size, coord_t in_blk_size) { CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { + coord_t out_idx = i; coord_t blk_idx = i / (reverse_dim_size * in_blk_size); i = i - blk_idx * (reverse_dim_size * in_blk_size); coord_t reverse_dim_idx = i / in_blk_size; i = i - reverse_dim_idx * in_blk_size; coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + i; - out_ptr[i] = in_ptr[in_idx]; + out_ptr[out_idx] = in_ptr[in_idx]; } } -void forward_kernel(cudaStream_t stream, - float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t 
reverse_dim_size, - coord_t in_blk_size, - coord_t output_size) { +static void forward_kernel_internal(cudaStream_t stream, + float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t output_size) { reverse_forward_kernel<< 0.0f) { - V[i] = V[i] * momentum + gt; - if (nesterov) { - gt = gt + momentum * V[i]; - } else { - gt = V[i]; - } - } - W[i] -= lr * gt; - } -} - -__host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op, - float const *w_grad_ptr, - size_t size, - int num_replicas, - float *w_ptr, - float *v_ptr) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - // Step 1: Gather gradients in the first replica - for (int i = 1; i < num_replicas; i++) { - float const *src = w_grad_ptr + i * size; - apply_add_with_scale - <<>>( - (float *)w_grad_ptr, src, size, 1.0f); - } - // checkCUDA(cudaDeviceSynchronize()); - // Step 2: SGD update - sgd_update<<>>( - size, - op->lr, - op->weight_decay, - op->momentum, - op->nesterov, - w_grad_ptr, - v_ptr, - w_ptr); - // checkCUDA(cudaDeviceSynchronize()); -} - -#ifdef FF_USE_NCCL -__host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, - PerDeviceOpState const *meta, - float const *w_grad_ptr, - size_t size, - float *w_ptr, - float *v_ptr) { - // Use NCCL to sync gradients - // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr); - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - ncclSum, - meta->handle.ncclComm, - stream)); - // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr); - // print_tensor((float*)w_grad_ptr, 16, "[After ncclAllReduce]"); - - // Step 2: SGD update - sgd_update<<>>( - size, - op->lr, - op->weight_decay, - op->momentum, - op->nesterov, - w_grad_ptr, - v_ptr, - w_ptr); - // checkCUDA(cudaDeviceSynchronize()); -} -#endif - -// 
================================================================== -// Adam Optimizer -// ================================================================== -__global__ void - add_kernel(int count, float scale, float const *src, float *dst) { - CUDA_KERNEL_LOOP(i, count) { - dst[i] += src[i] * scale; - } -} - -__global__ void scale_kernel(int count, float a, float b, float *ptr) { - CUDA_KERNEL_LOOP(i, count) { - ptr[i] = (b - a) * ptr[i] + a; - } -} - -__global__ void adam_update(int count, - float alpha_t, - float beta1, - float beta2, - float weight_decay, - float epsilon, - float const *WGrad, - float *M, - float *V, - float *W) { - // Reference for weight decay - // https://www.fast.ai/2018/07/02/adam-weight-decay/ - CUDA_KERNEL_LOOP(i, count) { - // W[i] -= weight_decay * alpha_t * W[i]; - // float gt = WGrad[i]; - float gt = WGrad[i] + weight_decay * W[i]; - float mt = beta1 * M[i] + (1 - beta1) * gt; - float vt = beta2 * V[i] + (1 - beta2) * gt * gt; - M[i] = mt; - V[i] = vt; - W[i] -= alpha_t * mt / (sqrt(vt) + epsilon); - } -} - -__host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, - float const *w_grad_ptr, - size_t size, - int num_replicas, - float *w_ptr, - float *v_ptr, - float *m_ptr) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - // Step 1: Gather gradients in the first replica - for (int i = 1; i < num_replicas; i++) { - float const *src = w_grad_ptr + i * size; - add_kernel<<>>( - size, 1.0f, src, (float *)w_grad_ptr); - } - // checkCUDA(cudaDeviceSynchronize()); - // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", - // op->alpha, op->alpha_t, op->weight_decay); - // Step 2: Adam update - adam_update<<>>( - size, - op->alpha_t, - op->beta1, - op->beta2, - op->weight_decay, - op->epsilon, - w_grad_ptr, - m_ptr, - v_ptr, - w_ptr); - // checkCUDA(cudaDeviceSynchronize()); -} - -#ifdef FF_USE_NCCL -__host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, - PerDeviceOpState 
const *meta, - float const *w_grad_ptr, - size_t size, - float *w_ptr, - float *v_ptr, - float *m_ptr) { - // Use NCCL to sync gradients - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - ncclSum, - meta->handle.ncclComm, - stream)); - // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", - // op->alpha, op->alpha_t, op->weight_decay); - // Step 2: Adam update - adam_update<<>>( - size, - op->alpha_t, - op->beta1, - op->beta2, - op->weight_decay, - op->epsilon, - w_grad_ptr, - m_ptr, - v_ptr, - w_ptr); - // checkCUDA(cudaDeviceSynchronize()); -} -#endif - -} // namespace FlexFlow diff --git a/lib/kernels/src/cuda/optimizer_kernels.cu b/lib/kernels/src/cuda/optimizer_kernels.cu new file mode 100644 index 0000000000..fe817876ce --- /dev/null +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -0,0 +1,205 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "internal/device.h" +#include "kernels/nccl.h" +#include "kernels/optimizer_kernels.h" +#include "utils/exception.h" + +namespace FlexFlow { + +__global__ void sgd_update(size_t count, + float lr, + float weight_decay, + float momentum, + bool nesterov, + float const *WGrad, + float *V, + float *W) { + // Refernce https://pytorch.org/docs/stable/_modules/torch/optim/sgd.html#SGD + CUDA_KERNEL_LOOP(i, count) { + float gt = WGrad[i] + weight_decay * W[i]; + if (momentum > 0.0f) { + V[i] = V[i] * momentum + gt; + if (nesterov) { + gt = gt + momentum * V[i]; + } else { + gt = V[i]; + } + } + W[i] -= lr * gt; + } +} + +__host__ void sgd_ps_update_task_gpu(ffStream_t stream, + float lr, + float momentum, + bool nesterov, + float weight_decay, + float const *weight_grad_ptr, + size_t size, + int num_replicas, + float *weight_ptr, + float *sgd_v_ptr) { + // Step 1: Gather gradients in the first replica + for (int i = 1; i < num_replicas; i++) { + float const *src = weight_grad_ptr + i * size; + apply_add_with_scale + <<>>( + (float *)weight_grad_ptr, src, size, 1.0f); + } + + // Step 2: SGD update + sgd_update<<>>(size, + lr, + weight_decay, + momentum, + nesterov, + weight_grad_ptr, + sgd_v_ptr, + weight_ptr); +} + +#ifdef FF_USE_NCCL +__host__ void sgd_nccl_update_task_gpu(ffStream_t stream, + float lr, + float momentum, + bool nesterov, + float weight_decay, + PerDeviceFFHandle const &handle, + float const *w_grad_ptr, + size_t size, + float *w_ptr, + float *v_ptr) { + // Step 1: Use NCCL to sync gradients + ncclComm_t comm = handle.ncclComm; + checkNCCL(ncclAllReduce( + w_grad_ptr, (float *)w_grad_ptr, size, ncclFloat, ncclSum, comm, stream)); + + // Step 2: SGD update + sgd_update<<>>( + size, lr, weight_decay, momentum, nesterov, w_grad_ptr, v_ptr, w_ptr); +} +#endif + +// ================================================================== +// Adam Optimizer +// ================================================================== +__global__ void + 
add_kernel(int count, float scale, float const *src, float *dst) { + CUDA_KERNEL_LOOP(i, count) { + dst[i] += src[i] * scale; + } +} + +__global__ void scale_kernel(int count, float a, float b, float *ptr) { + CUDA_KERNEL_LOOP(i, count) { + ptr[i] = (b - a) * ptr[i] + a; + } +} + +__global__ void adam_update(int count, + float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + float const *WGrad, + float *M, + float *V, + float *W) { + // Reference for weight decay + // https://www.fast.ai/2018/07/02/adam-weight-decay/ + CUDA_KERNEL_LOOP(i, count) { + // W[i] -= weight_decay * alpha_t * W[i]; + // float gt = WGrad[i]; + float gt = WGrad[i] + weight_decay * W[i]; + float mt = beta1 * M[i] + (1 - beta1) * gt; + float vt = beta2 * V[i] + (1 - beta2) * gt * gt; + M[i] = mt; + V[i] = vt; + W[i] -= alpha_t * mt / (sqrt(vt) + epsilon); + } +} + +__host__ void adam_ps_update_task_gpu(ffStream_t stream, + float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + float const *w_grad_ptr, + size_t size, + int num_replicas, + float *w_ptr, + float *v_ptr, + float *m_ptr) { + // Step 1: Gather gradients in the first replica + for (int i = 1; i < num_replicas; i++) { + float const *src = w_grad_ptr + i * size; + add_kernel<<>>( + (float *)w_grad_ptr, src, size); + } + + // Step 2: Adam update + adam_update<<>>(size, + alpha_t, + beta1, + beta2, + weight_decay, + epsilon, + w_grad_ptr, + m_ptr, + v_ptr, + w_ptr); +} + +#ifdef FF_USE_NCCL +__host__ void nccl_update_task_gpu(ffStream_t stream, + float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + PerDeviceFFHandle const &handle, + float const *w_grad_ptr, + size_t size, + float *w_ptr, + float *v_ptr, + float *m_ptr) { + // Step 1: Use NCCL to sync gradients + checkNCCL(ncclAllReduce(w_grad_ptr, + (float *)w_grad_ptr, + size, + ncclFloat, + ncclSum, + handle.ncclComm, + stream)); + + // Step 2: Adam update + adam_update<<>>(size, + alpha_t, + 
beta1, + beta2, + weight_decay, + epsilon, + w_grad_ptr, + m_ptr, + v_ptr, + w_ptr); +} +#endif + +} // namespace FlexFlow diff --git a/lib/kernels/src/hip/embedding_kernels.cpp b/lib/kernels/src/hip/embedding_kernels.cpp index 7ca3149f2f..aefe53cc46 100644 --- a/lib/kernels/src/hip/embedding_kernels.cpp +++ b/lib/kernels/src/hip/embedding_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/embedding_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include @@ -364,8 +364,8 @@ struct ForwardKernel { weight.data_type == DataType::FLOAT || weight.data_type == DataType::DOUBLE); - if (aggr == AggregateOp::NONE) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_no_aggr), + if (aggr == AggregateOp::AVG || aggr == AggregateOp::SUM) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_with_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -374,10 +374,11 @@ struct ForwardKernel { output.get(), weight.get(), out_dim, - batch_size); + in_dim, + batch_size, + aggr); } else { - assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_with_aggr), + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_no_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -386,9 +387,7 @@ struct ForwardKernel { output.get(), weight.get(), out_dim, - in_dim, - batch_size, - aggr); + batch_size); } } } @@ -408,8 +407,9 @@ struct BackwardKernel { assert(output.data_type == DataType::HALF || output.data_type == DataType::FLOAT || output.data_type == DataType::DOUBLE); - if (aggr == AggregateOp::NONE) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_no_aggr), + + if (aggr == AggregateOp::AVG || aggr == AggregateOp::SUM) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_with_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -418,9 +418,11 @@ struct BackwardKernel { output.get(), weight_grad.get(), out_dim, - batch_size); + in_dim, 
+ batch_size, + aggr); } else { - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_with_aggr), + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_no_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -429,9 +431,7 @@ struct BackwardKernel { output.get(), weight_grad.get(), out_dim, - in_dim, - batch_size, - aggr); + batch_size); } } } diff --git a/lib/kernels/src/hip/loss_function_kernels.cpp b/lib/kernels/src/hip/loss_function_kernels.cpp index e82b5c96d5..05068f1bd0 100644 --- a/lib/kernels/src/hip/loss_function_kernels.cpp +++ b/lib/kernels/src/hip/loss_function_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/loss_function_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/attention_kernels.cpp b/lib/kernels/src/hip/ops/attention_kernels.cpp index 005cef30d1..b374ead305 100644 --- a/lib/kernels/src/hip/ops/attention_kernels.cpp +++ b/lib/kernels/src/hip/ops/attention_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/attention_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/batch_matmul_kernels.cpp b/lib/kernels/src/hip/ops/batch_matmul_kernels.cpp index c4b3be823f..6d9ae8a268 100644 --- a/lib/kernels/src/hip/ops/batch_matmul_kernels.cpp +++ b/lib/kernels/src/hip/ops/batch_matmul_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/batch_matmul_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/batch_norm_kernels.cpp b/lib/kernels/src/hip/ops/batch_norm_kernels.cpp index 8e94b462cd..764a3e0b58 100644 --- a/lib/kernels/src/hip/ops/batch_norm_kernels.cpp +++ b/lib/kernels/src/hip/ops/batch_norm_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/batch_norm_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/allocation.h" #include "kernels/ff_handle.h" #include diff --git 
a/lib/kernels/src/hip/ops/cast_kernels.cpp b/lib/kernels/src/hip/ops/cast_kernels.cpp index fa0c37ffa1..1035657c04 100644 --- a/lib/kernels/src/hip/ops/cast_kernels.cpp +++ b/lib/kernels/src/hip/ops/cast_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/cast_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/combine_kernels.cpp b/lib/kernels/src/hip/ops/combine_kernels.cpp index aa01f02276..f1e0422747 100644 --- a/lib/kernels/src/hip/ops/combine_kernels.cpp +++ b/lib/kernels/src/hip/ops/combine_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/combine_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/concat_kernels.cpp b/lib/kernels/src/hip/ops/concat_kernels.cpp index aa38be739b..a215d67942 100644 --- a/lib/kernels/src/hip/ops/concat_kernels.cpp +++ b/lib/kernels/src/hip/ops/concat_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/concat_kernels.h" -#include "device.h" +#include "internal/device.h" #include #include diff --git a/lib/kernels/src/hip/ops/conv_2d_kernels.h b/lib/kernels/src/hip/ops/conv_2d_kernels.h index bcf015d561..76a73ab08c 100644 --- a/lib/kernels/src/hip/ops/conv_2d_kernels.h +++ b/lib/kernels/src/hip/ops/conv_2d_kernels.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_HIP_CONV_2D_KERNELS_H #define _FLEXFLOW_KERNELS_HIP_CONV_2D_KERNELS_H -#include "device.h" +#include "kernels/device.h" namespace FlexFlow { namespace Kernels { diff --git a/lib/kernels/src/hip/ops/dropout_kernels.cpp b/lib/kernels/src/hip/ops/dropout_kernels.cpp index baaf8e6902..d85c0ae054 100644 --- a/lib/kernels/src/hip/ops/dropout_kernels.cpp +++ b/lib/kernels/src/hip/ops/dropout_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/dropout_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/ff_handle.h" #include diff --git 
a/lib/kernels/src/hip/ops/element_binary_kernels.cpp b/lib/kernels/src/hip/ops/element_binary_kernels.cpp index bc66bbff2f..9e0452b09b 100644 --- a/lib/kernels/src/hip/ops/element_binary_kernels.cpp +++ b/lib/kernels/src/hip/ops/element_binary_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/element_binary_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" #include "op-attrs/operator_type.dtg.h" diff --git a/lib/kernels/src/hip/ops/element_unary_kernels.cpp b/lib/kernels/src/hip/ops/element_unary_kernels.cpp index f4b0ccb82d..163f13a6da 100644 --- a/lib/kernels/src/hip/ops/element_unary_kernels.cpp +++ b/lib/kernels/src/hip/ops/element_unary_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/element_unary_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "op-attrs/get_op_type.h" #include diff --git a/lib/kernels/src/hip/ops/flat_kernels.cpp b/lib/kernels/src/hip/ops/flat_kernels.cpp index 763fb9e322..dedfb4b9a9 100644 --- a/lib/kernels/src/hip/ops/flat_kernels.cpp +++ b/lib/kernels/src/hip/ops/flat_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/flat_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include diff --git a/lib/kernels/src/hip/ops/gather_kernels.cpp b/lib/kernels/src/hip/ops/gather_kernels.cpp index 17c0014e98..6e9e4c6a2c 100644 --- a/lib/kernels/src/hip/ops/gather_kernels.cpp +++ b/lib/kernels/src/hip/ops/gather_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/gather_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/partition_kernels.cpp b/lib/kernels/src/hip/ops/partition_kernels.cpp index 4591247faa..26748a7e45 100644 --- a/lib/kernels/src/hip/ops/partition_kernels.cpp +++ b/lib/kernels/src/hip/ops/partition_kernels.cpp @@ -14,7 +14,7 @@ */ #include 
"kernels/partition_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/pool_2d_kernels.cpp b/lib/kernels/src/hip/ops/pool_2d_kernels.cpp index ed942c105c..7e5ae2ab80 100644 --- a/lib/kernels/src/hip/ops/pool_2d_kernels.cpp +++ b/lib/kernels/src/hip/ops/pool_2d_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/pool_2d_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/reduce_kernels.cpp b/lib/kernels/src/hip/ops/reduce_kernels.cpp index 468543dd5b..c0bcc84d48 100644 --- a/lib/kernels/src/hip/ops/reduce_kernels.cpp +++ b/lib/kernels/src/hip/ops/reduce_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/reduce_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/replicate_kernels.cpp b/lib/kernels/src/hip/ops/replicate_kernels.cpp index 8d27bb1908..ee7bf701c0 100644 --- a/lib/kernels/src/hip/ops/replicate_kernels.cpp +++ b/lib/kernels/src/hip/ops/replicate_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/replicate_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/reshape_kernels.cpp b/lib/kernels/src/hip/ops/reshape_kernels.cpp index 47978a5f4a..810b929e24 100644 --- a/lib/kernels/src/hip/ops/reshape_kernels.cpp +++ b/lib/kernels/src/hip/ops/reshape_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/reshape_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/reverse_kernels.cpp b/lib/kernels/src/hip/ops/reverse_kernels.cpp index 03e97245bf..a56ff3540a 100644 --- a/lib/kernels/src/hip/ops/reverse_kernels.cpp +++ b/lib/kernels/src/hip/ops/reverse_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/reverse_kernels.h" -#include "device.h" 
+#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/softmax_kernels.cpp b/lib/kernels/src/hip/ops/softmax_kernels.cpp index 3a8f2813b7..610675850b 100644 --- a/lib/kernels/src/hip/ops/softmax_kernels.cpp +++ b/lib/kernels/src/hip/ops/softmax_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/softmax_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/split_kernels.cpp b/lib/kernels/src/hip/ops/split_kernels.cpp index 5599ae6d6f..3034b633a6 100644 --- a/lib/kernels/src/hip/ops/split_kernels.cpp +++ b/lib/kernels/src/hip/ops/split_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/split_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/topk_kernels.cpp b/lib/kernels/src/hip/ops/topk_kernels.cpp index f085c5831f..777d9edffa 100644 --- a/lib/kernels/src/hip/ops/topk_kernels.cpp +++ b/lib/kernels/src/hip/ops/topk_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/topk_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/transpose_kernels.cpp b/lib/kernels/src/hip/ops/transpose_kernels.cpp index ef9dd58c63..c5122f34bf 100644 --- a/lib/kernels/src/hip/ops/transpose_kernels.cpp +++ b/lib/kernels/src/hip/ops/transpose_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/transpose_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include "utils/exception.h" #include diff --git a/lib/kernels/src/device.cc b/lib/kernels/src/internal/device.cc similarity index 97% rename from lib/kernels/src/device.cc rename to lib/kernels/src/internal/device.cc index f46099c79a..eb3d229c2a 100644 --- a/lib/kernels/src/device.cc +++ b/lib/kernels/src/internal/device.cc @@ -1,4 +1,4 @@ -#include "device.h" +#include "internal/device.h" namespace FlexFlow { diff --git 
a/lib/kernels/src/device.h b/lib/kernels/src/internal/device.h similarity index 98% rename from lib/kernels/src/device.h rename to lib/kernels/src/internal/device.h index ceff2f92ff..226c7ad174 100644 --- a/lib/kernels/src/device.h +++ b/lib/kernels/src/internal/device.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_KERNELS_SRC_DEVICE_H -#define _FLEXFLOW_KERNELS_SRC_DEVICE_H +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_INTERNAL_DEVICE_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_INTERNAL_DEVICE_H #include "kernels/array_shape.h" #include "kernels/device.h" diff --git a/lib/kernels/src/kernels/accessor.cc b/lib/kernels/src/kernels/accessor.cc new file mode 100644 index 0000000000..b5042f77a0 --- /dev/null +++ b/lib/kernels/src/kernels/accessor.cc @@ -0,0 +1,249 @@ +#include "kernels/accessor.h" +#include "kernels/allocation.h" +#include "kernels/datatype_dispatch.h" +#include "utils/containers/reversed.h" +#include "utils/containers/vector_of.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include + +namespace FlexFlow { + +nonnegative_int + calculate_accessor_offset(LegionOrdered const &indices, + ArrayShape const &shape) { + ASSERT(indices.size() == shape.num_dims(), + "Number of indices does not match the number of dimensions"); + + nonnegative_int offset = 0_n; + nonnegative_int multiplier = 1_n; + + for (legion_dim_t dim : reversed(vector_of(key_range(shape.dims)))) { + ASSERT(indices.at(dim) < shape.at(legion_dim_t{dim}), + "Out of bounds access", + dim); + + offset += indices.at(dim) * multiplier; + multiplier *= shape.at(legion_dim_t{dim}); + } + + return offset; +} + +void copy_accessor_data_to_l_from_r( + GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor) { + size_t num_bytes = + dst_accessor.shape.get_volume().unwrap_nonnegative() * + size_of_datatype(dst_accessor.data_type).unwrap_nonnegative(); + + DeviceType dst_device_type = dst_accessor.device_type; + DeviceType src_device_type = src_accessor.device_type; + + if 
(src_device_type == DeviceType::CPU && + dst_device_type == DeviceType::CPU) { + memcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes); + } else if (src_device_type == DeviceType::CPU && + dst_device_type == DeviceType::GPU) { + checkCUDA(cudaMemcpy( + dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyHostToDevice)); + } else if (src_device_type == DeviceType::GPU && + dst_device_type == DeviceType::CPU) { + checkCUDA(cudaMemcpy( + dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost)); + } else { + assert(src_device_type == DeviceType::GPU); + assert(dst_device_type == DeviceType::GPU); + checkCUDA(cudaMemcpy(dst_accessor.ptr, + src_accessor.ptr, + num_bytes, + cudaMemcpyDeviceToDevice)); + } +} + +GenericTensorAccessorW::operator GenericTensorAccessorR() const { + return read_only_accessor_from_write_accessor(*this); +} + +GenericTensorAccessorW::GenericTensorAccessorW( + DataType data_type, + ArrayShape const &shape, + void *ptr, + DeviceType device_type = DeviceType::GPU) + : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {} + +std::tuple + GenericTensorAccessorW::tie() const { + return std::tie(this->data_type, this->shape, this->ptr, this->device_type); +} + +bool GenericTensorAccessorW::operator==( + GenericTensorAccessorW const &other) const { + return this->tie() == other.tie(); +} + +bool GenericTensorAccessorW::operator!=( + GenericTensorAccessorW const &other) const { + return this->tie() != other.tie(); +} + +int32_t *GenericTensorAccessorW::get_int32_ptr() const { + return this->get(); +} + +int64_t *GenericTensorAccessorW::get_int64_ptr() const { + return this->get(); +} + +float *GenericTensorAccessorW::get_float_ptr() const { + return this->get(); +} + +double *GenericTensorAccessorW::get_double_ptr() const { + return this->get(); +} + +half *GenericTensorAccessorW::get_half_ptr() const { + return this->get(); +} + +std::string format_as(GenericTensorAccessorW const &a) { + return fmt::format("", + 
a.data_type, + a.shape, + a.ptr); +} + +std::ostream &operator<<(std::ostream &s, GenericTensorAccessorW const &a) { + return (s << fmt::to_string(a)); +} + +GenericTensorAccessorR::GenericTensorAccessorR( + DataType data_type, + ArrayShape const &shape, + void const *ptr, + DeviceType device_type = DeviceType::GPU) + : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {} + +std::tuple + GenericTensorAccessorR::tie() const { + return std::tie(this->data_type, this->shape, this->ptr, this->device_type); +} + +bool GenericTensorAccessorR::operator==( + GenericTensorAccessorR const &other) const { + return this->tie() == other.tie(); +} + +bool GenericTensorAccessorR::operator!=( + GenericTensorAccessorR const &other) const { + return this->tie() != other.tie(); +} + +int32_t const *GenericTensorAccessorR::get_int32_ptr() const { + return this->get(); +} + +int64_t const *GenericTensorAccessorR::get_int64_ptr() const { + return this->get(); +} + +float const *GenericTensorAccessorR::get_float_ptr() const { + return this->get(); +} + +double const *GenericTensorAccessorR::get_double_ptr() const { + return this->get(); +} + +half const *GenericTensorAccessorR::get_half_ptr() const { + return get(); +} + +std::string format_as(GenericTensorAccessorR const &a) { + return fmt::format("", + a.data_type, + a.shape, + a.ptr); +} + +std::ostream &operator<<(std::ostream &s, GenericTensorAccessorR const &a) { + return (s << fmt::to_string(a)); +} + +int32_t const *get_int32_ptr(GenericTensorAccessorR const &a) { + return get(a); +} + +int64_t const *get_int64_ptr(GenericTensorAccessorR const &a) { + return get(a); +} + +float const *get_float_ptr(GenericTensorAccessorR const &a) { + return get(a); +} + +double const *get_double_ptr(GenericTensorAccessorR const &a) { + return get(a); +} + +half const *get_half_ptr(GenericTensorAccessorR const &a) { + return get(a); +} + +std::vector + get_int32_ptrs(std::vector const &a) { + return get(a); +} + +std::vector 
+ get_int64_ptrs(std::vector const &a) { + return get(a); +} + +std::vector + get_float_ptrs(std::vector const &a) { + return get(a); +} + +std::vector + get_double_ptrs(std::vector const &a) { + return get(a); +} + +std::vector + get_half_ptrs(std::vector const &a) { + return get(a); +} + +GenericTensorAccessorR read_only_accessor_from_write_accessor( + GenericTensorAccessorW const &writable) { + return GenericTensorAccessorR{writable.data_type, + writable.shape, + req(writable.ptr), + writable.device_type}; +} + +bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2) { + return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; +} + +bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, + ArrayShape const &expected_shape, + DataType const &expected_dtype) { + return accessor.shape == expected_shape && + accessor.data_type == expected_dtype; +} + +std::pair + get_shape_and_datatype(GenericTensorAccessorR const &accessor) { + return std::make_pair(accessor.shape, accessor.data_type); +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/allocation.cc b/lib/kernels/src/kernels/allocation.cc new file mode 100644 index 0000000000..b9f253bcff --- /dev/null +++ b/lib/kernels/src/kernels/allocation.cc @@ -0,0 +1,38 @@ +#include "kernels/allocation.h" +#include "op-attrs/tensor_shape.h" + +namespace FlexFlow { + +void *Allocator::allocate(size_t mem_size) { + return this->i_allocator->allocate(mem_size); +} + +void Allocator::deallocate(void *ptr) { + this->i_allocator->deallocate(ptr); +} + +DeviceType Allocator::get_allocation_device_type() const { + return this->i_allocator->get_allocation_device_type(); +} + +GenericTensorAccessorW + Allocator::allocate_tensor(TensorShape const &tensor_shape) { + void *ptr = + this->allocate(get_size_in_bytes(tensor_shape).unwrap_nonnegative()); + return GenericTensorAccessorW{ + tensor_shape.data_type, + 
array_shape_from_tensor_shape(tensor_shape), + ptr, + this->get_allocation_device_type(), + }; +} + +void Allocator::deallocate_tensor(GenericTensorAccessorW const &t) { + this->deallocate(t.ptr); +} + +void Allocator::deallocate_tensor(GenericTensorAccessorR const &t) { + this->deallocate(const_cast(t.ptr)); +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/kernels/array_shape.cc similarity index 51% rename from lib/kernels/src/array_shape.cc rename to lib/kernels/src/kernels/array_shape.cc index 243185ada4..34a53c1bb3 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/kernels/array_shape.cc @@ -1,23 +1,20 @@ #include "kernels/array_shape.h" +#include "kernels/legion_ordered/slice.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" +#include "op-attrs/ff_ordered/slice.h" +#include "utils/containers/cartesian_product.h" #include "utils/containers/product.h" #include "utils/containers/reversed.h" +#include "utils/containers/transform.h" +#include "utils/containers/unordered_set_of.h" #include "utils/containers/vector_of.h" +#include "utils/hash/tuple.h" +#include "utils/hash/vector.h" #include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { -static LegionOrdered - legion_dims_from_ff_dims(FFOrdered const &ff_ordered) { - return LegionOrdered{reversed(vector_of(ff_ordered))}; -} - -ArrayShape::ArrayShape(nonnegative_int *_dims, nonnegative_int num_dims) - : dims(_dims, _dims + num_dims.unwrap_nonnegative()) {} - -ArrayShape::ArrayShape(TensorShape const &shape) - : dims(legion_dims_from_ff_dims(shape.dims.ff_ordered)) {} - -ArrayShape::ArrayShape(std::vector const &input_dims) +ArrayShape::ArrayShape(LegionOrdered const &input_dims) : dims(input_dims) {} nonnegative_int ArrayShape::get_volume() const { @@ -59,10 +56,19 @@ bool ArrayShape::operator!=(ArrayShape const &other) const { return this->tie() != other.tie(); } -ArrayShape ArrayShape::sub_shape( - std::optional> start, - std::optional> end) 
const { - NOT_IMPLEMENTED(); +ArrayShape + ArrayShape::sub_shape(ff_dim_t const &start, + std::optional const &maybe_end) const { + FFOrdered ff_ordered_dims = + ff_ordered_from_legion_ordered(this->dims); + FFOrdered sliced = slice(ff_ordered_dims, start, maybe_end); + return ArrayShape{legion_ordered_from_ff_ordered(sliced)}; +} + +ArrayShape + ArrayShape::sub_shape(legion_dim_t const &start, + std::optional const &maybe_end) const { + return ArrayShape{slice(this->dims, start, maybe_end)}; } std::optional ArrayShape::at_maybe(legion_dim_t index) const { @@ -81,15 +87,6 @@ std::tuple const &> ArrayShape::tie() const { return std::tie(this->dims); } -nonnegative_int get_volume(ArrayShape const &shape) { - return shape.get_volume(); -} - -TensorShape get_tensor_shape(ArrayShape const &shape, DataType dtype) { - return TensorShape{TensorDims{ff_ordered_from_legion_ordered(shape.dims)}, - dtype}; -} - std::string format_as(ArrayShape const &x) { std::ostringstream oss; oss << " get_array_coord_set(ArrayShape const &shape) { + std::vector> per_dim_ranges = + transform(vector_of(ff_ordered_from_legion_ordered(shape.dims)), + [](nonnegative_int dim_size) -> std::vector { + return nonnegative_range(dim_size); + }); + + std::unordered_set> raw_points = + unordered_set_of(cartesian_product(per_dim_ranges)); + + return transform(raw_points, + [](std::vector const &raw_point) { + return ArrayCoord{ff_ordered_of(raw_point)}; + }); +} + } // namespace FlexFlow + +namespace std { + +using namespace FlexFlow; + +size_t hash::operator()(ArrayShape const &s) const { + return get_std_hash(s.tie()); +} + +} // namespace std diff --git a/lib/kernels/src/kernels/copy_tensor_accessor.cc b/lib/kernels/src/kernels/copy_tensor_accessor.cc new file mode 100644 index 0000000000..d8619d8ce6 --- /dev/null +++ b/lib/kernels/src/kernels/copy_tensor_accessor.cc @@ -0,0 +1,66 @@ +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow { + +template 
+struct CopyTensorAccessorW { + GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return dst_accessor; + } +}; + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, allocator); +} + +template +struct CopyTensorAccessorR { + GenericTensorAccessorR operator()(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return read_only_accessor_from_write_accessor(dst_accessor); + } +}; + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, allocator); +} + +GenericTensorAccessorR copy_tensor_accessor_r_to_cpu_if_necessary( + GenericTensorAccessorR const &accessor, Allocator &cpu_allocator) { + if (accessor.device_type == DeviceType::GPU) { + return copy_tensor_accessor_r(accessor, cpu_allocator); + } else { + return accessor; + } +} + +GenericTensorAccessorW copy_tensor_accessor_w_to_cpu_if_necessary( + GenericTensorAccessorW const &accessor, Allocator &cpu_allocator) { + if (accessor.device_type == DeviceType::GPU) { + return copy_tensor_accessor_w(accessor, cpu_allocator); + } else { + return accessor; + } +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/format_accessor_contents.cc b/lib/kernels/src/kernels/format_accessor_contents.cc new file mode 100644 index 
0000000000..1b8ab35d89 --- /dev/null +++ b/lib/kernels/src/kernels/format_accessor_contents.cc @@ -0,0 +1,184 @@ +#include "kernels/format_accessor_contents.h" +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" +#include "kernels/local_cpu_allocator.h" +#include "utils/indent.h" +#include + +namespace FlexFlow { + +template +struct Print1DCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor, + std::ostream &stream) { + ASSERT(accessor.device_type == DeviceType::CPU); + nonnegative_int dims = accessor.shape.num_dims(); + ASSERT(dims == 1_n); + + nonnegative_int ncols = accessor.shape.at(ff_dim_t{0_n}); + + stream << "[" + << join_strings(nonnegative_range(ncols), + " ", + [&](nonnegative_int col_idx) -> std::string { + return fmt::to_string( + accessor.at
(FFOrdered{col_idx})); + }) + << "]"; + } +}; + +static std::string + format_1d_accessor_r_contents(GenericTensorAccessorR const &accessor) { + ASSERT(accessor.device_type == DeviceType::CPU); + ASSERT(accessor.shape.num_dims() == 1_n); + + std::ostringstream oss; + DataTypeDispatch1{}(accessor.data_type, accessor, oss); + return oss.str(); +} + +template +struct Print2DCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor, + std::ostream &stream) { + ASSERT(accessor.device_type == DeviceType::CPU); + nonnegative_int dims = accessor.shape.num_dims(); + ASSERT(dims == 2_n); + nonnegative_int dim0_size = accessor.shape.at(ff_dim_t{0_n}); + nonnegative_int dim1_size = accessor.shape.at(ff_dim_t{1_n}); + + auto render_1d = [&](nonnegative_int dim0_idx) -> std::string { + return "[" + + join_strings(nonnegative_range(dim1_size), + " ", + [&](nonnegative_int dim1_idx) -> std::string { + return fmt::to_string( + accessor.at
(FFOrdered{dim0_idx, dim1_idx})); + }) + + "]"; + }; + + stream << "[\n" + << indent( + join_strings(nonnegative_range(dim0_size), "\n", render_1d)) + << "\n]"; + } +}; + +static std::string + format_2d_accessor_r_contents(GenericTensorAccessorR const &accessor) { + ASSERT(accessor.device_type == DeviceType::CPU); + ASSERT(accessor.shape.num_dims() == 2_n); + + std::ostringstream oss; + DataTypeDispatch1{}(accessor.data_type, accessor, oss); + return oss.str(); +} + +template +struct Print3DCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor, + std::ostream &stream) { + ASSERT(accessor.device_type == DeviceType::CPU); + nonnegative_int dims = accessor.shape.num_dims(); + ASSERT(dims == 3_n); + + nonnegative_int dim0_size = accessor.shape.at(ff_dim_t{0_n}); + nonnegative_int dim1_size = accessor.shape.at(ff_dim_t{1_n}); + nonnegative_int dim2_size = accessor.shape.at(ff_dim_t{2_n}); + + auto render_1d = [&](nonnegative_int dim0_idx, + nonnegative_int dim1_idx) -> std::string { + return "[" + + join_strings(nonnegative_range(dim2_size), + " ", + [&](nonnegative_int dim2_idx) -> std::string { + return fmt::to_string(accessor.at
( + FFOrdered{dim0_idx, dim1_idx, dim2_idx})); + }) + + "]"; + }; + + auto render_2d = [&](nonnegative_int dim0_idx) -> std::string { + return "[\n" + + indent(join_strings(nonnegative_range(dim1_size), + "\n", + [&](nonnegative_int dim1_idx) -> std::string { + return render_1d(dim0_idx, dim1_idx); + })) + + "\n]"; + }; + + stream << "[\n" + << indent( + join_strings(nonnegative_range(dim0_size), "\n", render_2d)) + << "\n]"; + } +}; + +static std::string + format_3d_accessor_r_contents(GenericTensorAccessorR const &accessor) { + ASSERT(accessor.device_type == DeviceType::CPU); + ASSERT(accessor.shape.num_dims() == 3_n); + + std::ostringstream oss; + DataTypeDispatch1{}(accessor.data_type, accessor, oss); + return oss.str(); +} + +static std::string + format_1d_accessor_w_contents(GenericTensorAccessorW const &accessor) { + return format_1d_accessor_r_contents( + read_only_accessor_from_write_accessor(accessor)); +} + +static std::string + format_2d_accessor_w_contents(GenericTensorAccessorW const &accessor) { + return format_2d_accessor_r_contents( + read_only_accessor_from_write_accessor(accessor)); +} + +static std::string + format_3d_accessor_w_contents(GenericTensorAccessorW const &accessor) { + return format_3d_accessor_r_contents( + read_only_accessor_from_write_accessor(accessor)); +} + +std::string format_accessor_r_contents(GenericTensorAccessorR const &accessor) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor = + copy_tensor_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator); + + int num_dims = accessor.shape.num_dims().unwrap_nonnegative(); + switch (num_dims) { + case 1: + return format_1d_accessor_r_contents(accessor); + case 2: + return format_2d_accessor_r_contents(accessor); + case 3: + return format_3d_accessor_r_contents(accessor); + default: + PANIC("Unhandled accessor dimensionality", num_dims); + } +} + +std::string format_accessor_w_contents(GenericTensorAccessorW const &accessor) 
{ + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW cpu_accessor = + copy_tensor_accessor_w_to_cpu_if_necessary(accessor, cpu_allocator); + + int num_dims = cpu_accessor.shape.num_dims().unwrap_nonnegative(); + switch (num_dims) { + case 1: + return format_1d_accessor_w_contents(cpu_accessor); + case 2: + return format_2d_accessor_w_contents(cpu_accessor); + case 3: + return format_3d_accessor_w_contents(cpu_accessor); + default: + PANIC("Unhandled accessor dimensionality", num_dims); + } +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/kernels/legion_dim.cc similarity index 78% rename from lib/kernels/src/legion_dim.cc rename to lib/kernels/src/kernels/legion_dim.cc index bbb15c5636..f3482b1d9b 100644 --- a/lib/kernels/src/legion_dim.cc +++ b/lib/kernels/src/kernels/legion_dim.cc @@ -1,7 +1,11 @@ #include "kernels/legion_dim.h" +#include "utils/archetypes/value_type.h" namespace FlexFlow { +using T = value_type<0>; +template std::set key_range(LegionOrdered const &); + legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value) { return legion_dim_t{ nonnegative_int{legion_dim.value.unwrap_nonnegative() + value}}; @@ -11,6 +15,7 @@ legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, nonnegative_int num_dimensions) { return legion_dim_t{nonnegative_int{num_dimensions.unwrap_nonnegative() - ff_dim.value.unwrap_nonnegative() - 1}}; + ; } } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/legion_ordered/legion_ordered.cc b/lib/kernels/src/kernels/legion_ordered/legion_ordered.cc new file mode 100644 index 0000000000..8af44173b0 --- /dev/null +++ b/lib/kernels/src/kernels/legion_ordered/legion_ordered.cc @@ -0,0 +1,10 @@ +#include "kernels/legion_ordered/legion_ordered.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template struct LegionOrdered; + +} // namespace FlexFlow diff --git 
a/lib/kernels/src/kernels/legion_ordered/slice.cc b/lib/kernels/src/kernels/legion_ordered/slice.cc new file mode 100644 index 0000000000..69fcf570aa --- /dev/null +++ b/lib/kernels/src/kernels/legion_ordered/slice.cc @@ -0,0 +1,12 @@ +#include "kernels/legion_ordered/slice.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template LegionOrdered slice(LegionOrdered const &, + legion_dim_t const &, + std::optional const &); + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/legion_ordered/transform.cc b/lib/kernels/src/kernels/legion_ordered/transform.cc new file mode 100644 index 0000000000..d9fb38198e --- /dev/null +++ b/lib/kernels/src/kernels/legion_ordered/transform.cc @@ -0,0 +1,12 @@ +#include "kernels/legion_ordered/transform.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; +using Out = value_type<1>; +using F = std::function; + +template LegionOrdered transform(LegionOrdered const &, F &&); + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local_cpu_allocator.cc b/lib/kernels/src/kernels/local_cpu_allocator.cc similarity index 52% rename from lib/local-execution/src/local_cpu_allocator.cc rename to lib/kernels/src/kernels/local_cpu_allocator.cc index 4ca5f987a8..738d1abf27 100644 --- a/lib/local-execution/src/local_cpu_allocator.cc +++ b/lib/kernels/src/kernels/local_cpu_allocator.cc @@ -1,20 +1,27 @@ -#include "local-execution/local_cpu_allocator.h" +#include "kernels/local_cpu_allocator.h" +#include "kernels/device.h" #include "utils/containers/contains_key.h" +#include +#include namespace FlexFlow { void *LocalCPUAllocator::allocate(size_t requested_memory_size) { void *ptr = malloc(requested_memory_size); + ASSERT(ptr != nullptr); this->ptrs.insert({ptr, std::unique_ptr(ptr, free)}); return ptr; } void LocalCPUAllocator::deallocate(void *ptr) { - if (contains_key(this->ptrs, ptr)) { - this->ptrs.erase(ptr); - } else { - throw 
std::runtime_error( - "Deallocating a pointer that was not allocated by this Allocator"); - } + ASSERT(contains_key(this->ptrs, ptr), + "Deallocating a pointer that was not allocated by this Allocator"); + + free(ptr); + this->ptrs.erase(ptr); +} + +DeviceType LocalCPUAllocator::get_allocation_device_type() const { + return DeviceType::CPU; } Allocator create_local_cpu_memory_allocator() { diff --git a/lib/kernels/src/local_cuda_allocator.cc b/lib/kernels/src/kernels/local_cuda_allocator.cc similarity index 59% rename from lib/kernels/src/local_cuda_allocator.cc rename to lib/kernels/src/kernels/local_cuda_allocator.cc index cdcfb017a0..1b081517bf 100644 --- a/lib/kernels/src/local_cuda_allocator.cc +++ b/lib/kernels/src/kernels/local_cuda_allocator.cc @@ -1,6 +1,7 @@ #include "kernels/local_cuda_allocator.h" #include "kernels/device.h" #include "utils/containers/contains.h" +#include namespace FlexFlow { void *LocalCudaAllocator::allocate(size_t requested_memory_size) { @@ -11,13 +12,15 @@ void *LocalCudaAllocator::allocate(size_t requested_memory_size) { } void LocalCudaAllocator::deallocate(void *ptr) { - if (contains(this->ptrs, ptr)) { - checkCUDA(cudaFree(ptr)); - this->ptrs.erase(ptr); - } else { - throw std::runtime_error( - "Deallocating a pointer that was not allocated by this Allocator"); - } + ASSERT(contains(this->ptrs, ptr), + "Deallocating a pointer that was not allocated by this Allocator"); + + checkCUDA(cudaFree(ptr)); + this->ptrs.erase(ptr); +} + +DeviceType LocalCudaAllocator::get_allocation_device_type() const { + return DeviceType::GPU; } LocalCudaAllocator::~LocalCudaAllocator() { @@ -27,7 +30,8 @@ LocalCudaAllocator::~LocalCudaAllocator() { } Allocator create_local_cuda_memory_allocator() { - return Allocator::create(); + Allocator allocator = Allocator::create(); + return allocator; } } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/reverse_kernels_params.cc b/lib/kernels/src/kernels/reverse_kernels_params.cc new file mode 
100644 index 0000000000..c647181872 --- /dev/null +++ b/lib/kernels/src/kernels/reverse_kernels_params.cc @@ -0,0 +1,30 @@ +#include "kernels/reverse_kernels_params.h" + +namespace FlexFlow { + +ReverseKernelsParams + compute_reverse_kernels_params(ArrayShape const &output_shape, + ReverseAttrs const &attrs) { + auto axis = attrs.axis; + nonnegative_int in_blk_size = 1_n; + nonnegative_int reverse_dim_size = 1_n; + nonnegative_int num_out_blks = 1_n; + for (nonnegative_int i : nonnegative_range(output_shape.get_dim())) { + if (i < axis.value) { + in_blk_size *= output_shape.at(ff_dim_t{i}); + } else if (i == axis.value) { + reverse_dim_size = output_shape.at(ff_dim_t{i}); + } else { + num_out_blks *= output_shape.at(ff_dim_t{i}); + } + } + + return ReverseKernelsParams{ + num_out_blks, + reverse_dim_size, + in_blk_size, + output_shape.get_volume(), + }; +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/managed_ff_stream.cc b/lib/kernels/src/managed_ff_stream.cc index 7385b6cc3e..f0348aa91c 100644 --- a/lib/kernels/src/managed_ff_stream.cc +++ b/lib/kernels/src/managed_ff_stream.cc @@ -1,28 +1,36 @@ #include "kernels/managed_ff_stream.h" +#include "utils/exception.h" namespace FlexFlow { ManagedFFStream::ManagedFFStream() : stream(new ffStream_t) { - checkCUDA(cudaStreamCreate(stream)); + checkCUDA(cudaStreamCreate(this->stream)); } ManagedFFStream::ManagedFFStream(ManagedFFStream &&other) noexcept : stream(std::exchange(other.stream, nullptr)) {} ManagedFFStream &ManagedFFStream::operator=(ManagedFFStream &&other) noexcept { - std::swap(this->stream, other.stream); + if (this != &other) { + this->cleanup(); + this->stream = std::exchange(other.stream, nullptr); + } return *this; } ManagedFFStream::~ManagedFFStream() { - if (stream != nullptr) { - checkCUDA(cudaStreamDestroy(*stream)); - delete stream; + this->cleanup(); +} + +void ManagedFFStream::cleanup() { + if (this->stream != nullptr) { + checkCUDA(cudaStreamDestroy(*this->stream)); + delete 
this->stream; } } ffStream_t const &ManagedFFStream::raw_stream() const { - return *stream; + return *this->stream; } } // namespace FlexFlow diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index c050e887b6..ea26d2350c 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -1,16 +1,17 @@ #include "kernels/managed_per_device_ff_handle.h" -#include "device.h" +#include "internal/device.h" namespace FlexFlow { -ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle() { - handle = new PerDeviceFFHandle; - handle->workSpaceSize = 1024 * 1024; - handle->allowTensorOpMathConversion = true; - - checkCUDNN(cudnnCreate(&handle->dnn)); - checkCUBLAS(cublasCreate(&handle->blas)); - checkCUDA(cudaMalloc(&handle->workSpace, handle->workSpaceSize)); +ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( + size_t workSpaceSize, bool allowTensorOpMathConversion) { + this->handle = new PerDeviceFFHandle{}; + this->handle->workSpaceSize = workSpaceSize; + this->handle->allowTensorOpMathConversion = allowTensorOpMathConversion; + + checkCUDNN(cudnnCreate(&this->handle->dnn)); + checkCUBLAS(cublasCreate(&this->handle->blas)); + checkCUDA(cudaMalloc(&this->handle->workSpace, this->handle->workSpaceSize)); } ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( @@ -19,16 +20,23 @@ ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( ManagedPerDeviceFFHandle &ManagedPerDeviceFFHandle::operator=( ManagedPerDeviceFFHandle &&other) noexcept { - std::swap(this->handle, other.handle); + if (this != &other) { + this->cleanup(); + this->handle = std::exchange(other.handle, nullptr); + } return *this; } ManagedPerDeviceFFHandle::~ManagedPerDeviceFFHandle() { - if (handle != nullptr) { - checkCUDNN(cudnnDestroy(handle->dnn)); - checkCUBLAS(cublasDestroy(handle->blas)); - checkCUDA(cudaFree(handle->workSpace)); - delete handle; + this->cleanup(); +} + +void 
ManagedPerDeviceFFHandle::cleanup() { + if (this->handle != nullptr) { + checkCUDNN(cudnnDestroy(this->handle->dnn)); + checkCUBLAS(cublasDestroy(this->handle->blas)); + checkCUDA(cudaFree(this->handle->workSpace)); + delete this->handle; } } diff --git a/lib/kernels/test/CMakeLists.txt b/lib/kernels/test/CMakeLists.txt index 00da2d0d70..066cb96753 100644 --- a/lib/kernels/test/CMakeLists.txt +++ b/lib/kernels/test/CMakeLists.txt @@ -14,6 +14,7 @@ ff_add_test_executable( cudnn cudart cublas + pcg ) set(FF_TEST_EXEC_NAME "kernels-tests") diff --git a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc new file mode 100644 index 0000000000..8630dcd8cd --- /dev/null +++ b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc @@ -0,0 +1,57 @@ +#include "internal/test_utils.h" +#include "kernels/format_accessor_contents.h" +#include "kernels/replicate_kernels_cpu.h" +#include "test/utils/doctest/check_kv.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Replicate::cpu_forward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = + create_1d_accessor_r_with_contents({1, 3, 2}, cpu_allocator); + + TensorShape result_shape = TensorShape{ + TensorDims{FFOrdered{3_n}}, + DataType::FLOAT, + }; + GenericTensorAccessorW result = + create_zero_filled_accessor_w(result_shape, cpu_allocator); + + GenericTensorAccessorR correct = input; + + Kernels::Replicate::cpu_forward_kernel(input, result); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + TEST_CASE("Replicate::cpu_backward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR output = create_2d_accessor_r_with_contents( + { + {1, 2, 3}, + {4, 3, 3}, + {1, 3, 5}, + }, + cpu_allocator); + + GenericTensorAccessorR correct = create_1d_accessor_r_with_contents( + {1 + 2 + 3, 4 + 3 
+ 3, 1 + 3 + 5}, cpu_allocator); + + TensorShape result_shape = TensorShape{ + TensorDims{FFOrdered{3_n}}, + DataType::FLOAT, + }; + GenericTensorAccessorW result = + create_zero_filled_accessor_w(result_shape, cpu_allocator); + Kernels::Replicate::cpu_backward_kernel(output, result, 3); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } +} diff --git a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc new file mode 100644 index 0000000000..db0016cb0b --- /dev/null +++ b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc @@ -0,0 +1,206 @@ +#include "internal/test_utils.h" +#include "kernels/format_accessor_contents.h" +#include "kernels/reverse_kernels_cpu.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Reverse::cpu_forward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = create_3d_accessor_r_with_contents( + { + { + {1, 3, 2}, + {4, 2, 1}, + }, + { + {3, 3, 6}, + {2, 1, 5}, + }, + }, + cpu_allocator); + + GenericTensorAccessorW result = create_zero_filled_accessor_w( + TensorShape{ + TensorDims{FFOrdered{2_n, 2_n, 3_n}}, + DataType::FLOAT, + }, + cpu_allocator); + + SUBCASE("axis = ff_dim_t{0}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{0_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {3, 3, 6}, + {2, 1, 5}, + }, + { + {1, 3, 2}, + {4, 2, 1}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + SUBCASE("axis = ff_dim_t{1}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{1_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {4, 2, 1}, + {1, 3, 2}, + }, + { + {2, 1, 5}, + {3, 3, 6}, + }, + 
}, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + SUBCASE("axis = ff_dim_t{2}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{2_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {2, 3, 1}, + {1, 2, 4}, + }, + { + {6, 3, 3}, + {5, 1, 2}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + } + + TEST_CASE("Reverse::cpu_backward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = create_3d_accessor_r_with_contents( + { + { + {1, 3, 2}, + {4, 2, 1}, + }, + { + {3, 3, 6}, + {2, 1, 5}, + }, + }, + cpu_allocator); + + GenericTensorAccessorW result = create_zero_filled_accessor_w( + TensorShape{ + TensorDims{FFOrdered{2_n, 2_n, 3_n}}, + DataType::FLOAT, + }, + cpu_allocator); + + SUBCASE("axis = ff_dim_t{0}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{0_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {3, 3, 6}, + {2, 1, 5}, + }, + { + {1, 3, 2}, + {4, 2, 1}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + SUBCASE("axis = ff_dim_t{1}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{1_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {4, 2, 1}, + {1, 3, 2}, + }, + { + {2, 1, 5}, + {3, 3, 6}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + 
SUBCASE("axis = ff_dim_t{2}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{2_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {2, 3, 1}, + {1, 2, 4}, + }, + { + {6, 3, 3}, + {5, 1, 2}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + } +} diff --git a/lib/kernels/test/src/internal/test_utils.cc b/lib/kernels/test/src/internal/test_utils.cc new file mode 100644 index 0000000000..0f34a6aa06 --- /dev/null +++ b/lib/kernels/test/src/internal/test_utils.cc @@ -0,0 +1,392 @@ +#include "internal/test_utils.h" +#include "op-attrs/tensor_shape.h" +#include "utils/containers/require_all_same1.h" +#include "utils/join_strings.h" +#include + +namespace FlexFlow { + +GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW result_accessor = allocator.allocate_tensor(shape); + fill_with_zeros(result_accessor); + return result_accessor; +} + +GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW accessor = + create_zero_filled_accessor_w(shape, allocator); + return read_only_accessor_from_write_accessor(accessor); +} + +GenericTensorAccessorW + create_1d_accessor_w_with_contents(std::vector const &contents, + Allocator &allocator) { + nonnegative_int ncols = num_elements(contents); + ASSERT(ncols > 0); + + TensorShape shape = TensorShape{ + TensorDims{FFOrdered{ncols}}, + DataType::FLOAT, + }; + + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); + + for (nonnegative_int col_idx : nonnegative_range(ncols)) { + cpu_accessor.at(FFOrdered{col_idx}) = + contents.at(col_idx.unwrap_nonnegative()); + } + + GenericTensorAccessorW result = 
allocator.allocate_tensor(shape); + copy_accessor_data_to_l_from_r( + result, read_only_accessor_from_write_accessor(cpu_accessor)); + + return result; +} + +GenericTensorAccessorW create_2d_accessor_w_with_contents( + std::vector> const &contents, Allocator &allocator) { + nonnegative_int nrows = num_elements(contents); + ASSERT(nrows > 0); + + nonnegative_int ncols = throw_if_unexpected( + require_all_same1(transform(contents, [](std::vector const &row) { + return num_elements(row); + }))); + ASSERT(ncols > 0); + + TensorShape shape = TensorShape{ + TensorDims{FFOrdered{nrows, ncols}}, + DataType::FLOAT, + }; + + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); + + for (nonnegative_int row_idx : nonnegative_range(nrows)) { + for (nonnegative_int col_idx : nonnegative_range(ncols)) { + cpu_accessor.at(FFOrdered{row_idx, col_idx}) = + contents.at(row_idx.unwrap_nonnegative()) + .at(col_idx.unwrap_nonnegative()); + } + } + + GenericTensorAccessorW result = allocator.allocate_tensor(shape); + copy_accessor_data_to_l_from_r( + result, read_only_accessor_from_write_accessor(cpu_accessor)); + + return result; +} + +GenericTensorAccessorW create_3d_accessor_w_with_contents( + std::vector>> const &contents, + Allocator &allocator) { + nonnegative_int dim0_size = num_elements(contents); + ASSERT(dim0_size > 0); + + nonnegative_int dim1_size = throw_if_unexpected(require_all_same1( + transform(contents, [](std::vector> const &m) { + return num_elements(m); + }))); + ASSERT(dim1_size > 0); + + nonnegative_int dim2_size = throw_if_unexpected(require_all_same1( + transform(contents, [](std::vector> const &m) { + return throw_if_unexpected( + require_all_same1(transform(m, [](std::vector const &vec) { + return num_elements(vec); + }))); + }))); + ASSERT(dim2_size > 0); + + TensorShape shape = TensorShape{ + TensorDims{FFOrdered{dim0_size, dim1_size, dim2_size}}, + DataType::FLOAT, + }; + 
+ Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); + + for (nonnegative_int dim0_idx : nonnegative_range(dim0_size)) { + for (nonnegative_int dim1_idx : nonnegative_range(dim1_size)) { + for (nonnegative_int dim2_idx : nonnegative_range(dim2_size)) { + cpu_accessor.at( + FFOrdered{dim0_idx, dim1_idx, dim2_idx}) = + contents.at(dim0_idx.unwrap_nonnegative()) + .at(dim1_idx.unwrap_nonnegative()) + .at(dim2_idx.unwrap_nonnegative()); + } + } + } + + GenericTensorAccessorW result = allocator.allocate_tensor(shape); + copy_accessor_data_to_l_from_r( + result, read_only_accessor_from_write_accessor(cpu_accessor)); + + return result; +} + +GenericTensorAccessorW create_4d_accessor_w_with_contents( + std::vector>>> const &contents, + Allocator &allocator) { + nonnegative_int dim0_size = num_elements(contents); + ASSERT(dim0_size > 0); + + nonnegative_int dim1_size = throw_if_unexpected(require_all_same1(transform( + contents, [](std::vector>> const &t) { + return num_elements(t); + }))); + ASSERT(dim1_size > 0); + + nonnegative_int dim2_size = throw_if_unexpected(require_all_same1(transform( + contents, [](std::vector>> const &m) { + return throw_if_unexpected(require_all_same1( + transform(m, [](std::vector> const &vec) { + return num_elements(vec); + }))); + }))); + ASSERT(dim2_size > 0); + + nonnegative_int dim3_size = throw_if_unexpected(require_all_same1(transform( + contents, [](std::vector>> const &t) { + return throw_if_unexpected(require_all_same1( + transform(t, [](std::vector> const &mat) { + return throw_if_unexpected(require_all_same1( + transform(mat, [](std::vector const &vec) { + return num_elements(vec); + }))); + }))); + }))); + ASSERT(dim3_size > 0); + + TensorShape shape = TensorShape{ + TensorDims{FFOrdered{dim0_size, dim1_size, dim2_size, dim3_size}}, + DataType::FLOAT, + }; + + GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); + + for 
(nonnegative_int dim0_idx : nonnegative_range(dim0_size)) { + for (nonnegative_int dim1_idx : nonnegative_range(dim1_size)) { + for (nonnegative_int dim2_idx : nonnegative_range(dim2_size)) { + for (nonnegative_int dim3_idx : nonnegative_range(dim3_size)) { + accessor.at( + FFOrdered{dim0_idx, dim1_idx, dim2_idx, dim3_idx}) = + contents.at(dim0_idx.unwrap_nonnegative()) + .at(dim1_idx.unwrap_nonnegative()) + .at(dim2_idx.unwrap_nonnegative()) + .at(dim3_idx.unwrap_nonnegative()); + } + } + } + } + + return accessor; +} + +GenericTensorAccessorR + create_1d_accessor_r_with_contents(std::vector const &contents, + Allocator &allocator) { + return read_only_accessor_from_write_accessor( + create_1d_accessor_w_with_contents(contents, allocator)); +} + +GenericTensorAccessorR create_2d_accessor_r_with_contents( + std::vector> const &contents, Allocator &allocator) { + return read_only_accessor_from_write_accessor( + create_2d_accessor_w_with_contents(contents, allocator)); +} + +GenericTensorAccessorR create_3d_accessor_r_with_contents( + std::vector>> const &contents, + Allocator &allocator) { + return read_only_accessor_from_write_accessor( + create_3d_accessor_w_with_contents(contents, allocator)); +} + +GenericTensorAccessorR create_4d_accessor_r_with_contents( + std::vector>>> const &contents, + Allocator &allocator) { + return read_only_accessor_from_write_accessor( + create_4d_accessor_w_with_contents(contents, allocator)); +} + +template +struct CreateRandomFilledAccessorW { + GenericTensorAccessorW operator()(TensorShape const &shape, + Allocator &allocator) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape); + + using T = real_type_t
; + T *data_ptr = src_accessor.get
(); + + std::random_device rd; + std::mt19937 gen(rd()); + size_t num_elements = get_num_elements(shape).unwrap_nonnegative(); + if constexpr (std::is_same::value) { + std::bernoulli_distribution dist(0.5); + for (size_t i = 0; i < num_elements; i++) { + data_ptr[i] = dist(gen); + } + } else if constexpr (std::is_floating_point::value) { + std::uniform_real_distribution dist(-1.0, 1.0); + for (size_t i = 0; i < num_elements; i++) { + data_ptr[i] = dist(gen); + } + } else if constexpr (std::is_integral::value) { + std::uniform_int_distribution dist(0, 99); + for (size_t i = 0; i < num_elements; i++) { + data_ptr[i] = dist(gen); + } + } + + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return dst_accessor; + } +}; + +GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, + Allocator &allocator) { + return DataTypeDispatch1{}( + shape.data_type, shape, allocator); +} + +GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW accessor = + create_random_filled_accessor_w(shape, allocator); + + return read_only_accessor_from_write_accessor(accessor); +} + +template +struct FillWithZeros { + void operator()(GenericTensorAccessorW const &accessor) { + using T = real_type_t
; + + if (accessor.device_type == DeviceType::CPU) { + memset(accessor.ptr, + 0, + accessor.shape.get_volume().unwrap_nonnegative() * sizeof(T)); + } else { + checkCUDA(cudaMemset(accessor.ptr, + 0, + accessor.shape.get_volume().unwrap_nonnegative() * + sizeof(T))); + } + } +}; + +void fill_with_zeros(GenericTensorAccessorW const &accessor) { + DataTypeDispatch1{}(accessor.data_type, accessor); +} + +template +struct CPUAccessorRContainsNonZero { + bool operator()(GenericTensorAccessorR const &accessor) { + using T = real_type_t
; + + T const *data_ptr = accessor.get
(); + + int volume = accessor.shape.num_elements().unwrap_nonnegative(); + for (size_t i = 0; i < volume; i++) { + if (data_ptr[i] != 0) { + return true; + } + } + + return false; + } +}; + +bool contains_non_zero(GenericTensorAccessorR const &accessor) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor = + copy_tensor_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator); + return DataTypeDispatch1{}( + cpu_accessor.data_type, cpu_accessor); +} + +template +struct AccessorsAreEqual { + bool operator()(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor_a = + copy_tensor_accessor_r_to_cpu_if_necessary(accessor_a, cpu_allocator); + GenericTensorAccessorR cpu_accessor_b = + copy_tensor_accessor_r_to_cpu_if_necessary(accessor_b, cpu_allocator); + + using T = real_type_t
; + T const *a_data_ptr = cpu_accessor_a.get
(); + T const *b_data_ptr = cpu_accessor_b.get
(); + + int volume = accessor_a.shape.num_elements().unwrap_nonnegative(); + for (size_t i = 0; i < volume; i++) { + if (a_data_ptr[i] != b_data_ptr[i]) { + return false; + } + } + + return true; + } +}; + +bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b) { + ASSERT(accessor_a.shape == accessor_b.shape, + "accessors_are_equal expects accessors to have the same shape"); + + return DataTypeDispatch1{}( + accessor_a.data_type, accessor_a, accessor_b); +} + +template +struct CreateFilledAccessorW { + GenericTensorAccessorW operator()(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + using T = real_type_t
; + if (!val.template has()) { + throw mk_runtime_error("create_filed_accessor expected data type of " + "shape and passed-in value to match"); + } + + auto unwrapped_value = val.get(); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape); + + T *data_ptr = src_accessor.get
(); + + int volume = dst_accessor.shape.num_elements().unwrap_nonnegative(); + for (size_t i = 0; i < volume; i++) { + data_ptr[i] = unwrapped_value; + } + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + return dst_accessor; + } +}; + +GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + + return DataTypeDispatch1{}( + shape.data_type, shape, allocator, val); +} + +GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + GenericTensorAccessorW w_accessor = + create_filled_accessor_w(shape, allocator, val); + return read_only_accessor_from_write_accessor(w_accessor); +} +} // namespace FlexFlow diff --git a/lib/kernels/test/src/internal/test_utils.h b/lib/kernels/test/src/internal/test_utils.h new file mode 100644 index 0000000000..a4fc9b88c8 --- /dev/null +++ b/lib/kernels/test/src/internal/test_utils.h @@ -0,0 +1,78 @@ +#ifndef _FLEXFLOW_KERNELS_TEST_SRC_INTERNAL_TEST_UTILS_H +#define _FLEXFLOW_KERNELS_TEST_SRC_INTERNAL_TEST_UTILS_H + +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" +#include "kernels/device.h" +#include "kernels/local_cpu_allocator.h" +#include "kernels/local_cuda_allocator.h" +#include "kernels/managed_ff_stream.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "op-attrs/datatype.h" +#include "op-attrs/datatype_value.dtg.h" +#include +#include +#include +#include + +namespace FlexFlow { + +GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, + Allocator &allocator); + +GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, + Allocator &allocator); + +GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, + Allocator &allocator); + +GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape, + Allocator &allocator); + +GenericTensorAccessorW + 
create_1d_accessor_w_with_contents(std::vector const &contents, + Allocator &allocator); +GenericTensorAccessorR + create_1d_accessor_r_with_contents(std::vector const &contents, + Allocator &allocator); + +GenericTensorAccessorW create_2d_accessor_w_with_contents( + std::vector> const &contents, Allocator &allocator); +GenericTensorAccessorR create_2d_accessor_r_with_contents( + std::vector> const &contents, Allocator &allocator); + +GenericTensorAccessorW create_3d_accessor_w_with_contents( + std::vector>> const &contents, + Allocator &allocator); +GenericTensorAccessorR create_3d_accessor_r_with_contents( + std::vector>> const &contents, + Allocator &allocator); + +GenericTensorAccessorW create_4d_accessor_w_with_contents( + std::vector>>> const &contents, + Allocator &allocator); +GenericTensorAccessorR create_4d_accessor_r_with_contents( + std::vector>>> const &contents, + Allocator &allocator); + +bool contains_non_zero(GenericTensorAccessorR const &accessor); + +void fill_with_zeros(GenericTensorAccessorW const &accessor); + +void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor, + std::ostream &stream); + +bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b); + +GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val); + +GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/test/src/kernels/accessor.cc b/lib/kernels/test/src/kernels/accessor.cc new file mode 100644 index 0000000000..98f8471212 --- /dev/null +++ b/lib/kernels/test/src/kernels/accessor.cc @@ -0,0 +1,73 @@ +#include "kernels/accessor.h" +#include "internal/test_utils.h" +#include "kernels/local_cpu_allocator.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("calculate_accessor_offset") { + 
SUBCASE("one dimension") { + std::vector indices = {4_n}; + ArrayShape shape = ArrayShape{ + std::vector{ + 13_n, + }, + }; + + nonnegative_int result = calculate_accessor_offset(indices, shape); + nonnegative_int correct = 4_n; + + CHECK(result == correct); + } + + SUBCASE("multiple dimensions") { + std::vector indices = {2_n, 4_n}; + ArrayShape shape = ArrayShape{ + std::vector{ + 6_n, + 5_n, + }, + }; + + nonnegative_int result = calculate_accessor_offset(indices, shape); + nonnegative_int correct = 2_n * 5_n + 4_n; + + CHECK(result == correct); + } + + SUBCASE("zero dimensions") { + std::vector indices = {}; + ArrayShape shape = ArrayShape{std::vector{}}; + + nonnegative_int result = calculate_accessor_offset(indices, shape); + nonnegative_int correct = 0_n; + + CHECK(result == correct); + } + + SUBCASE("index and shape dimensions do not match") { + std::vector indices = {1_n, 2_n, 4_n}; + ArrayShape shape = ArrayShape{ + std::vector{ + 6_n, + 5_n, + }, + }; + + CHECK_THROWS(calculate_accessor_offset(indices, shape)); + } + + SUBCASE("out of bounds index") { + std::vector indices = {2_n, 5_n}; + ArrayShape shape = ArrayShape{ + std::vector{ + 6_n, + 5_n, + }, + }; + + CHECK_THROWS(calculate_accessor_offset(indices, shape)); + } + } +} diff --git a/lib/kernels/test/src/kernels/array_shape.cc b/lib/kernels/test/src/kernels/array_shape.cc new file mode 100644 index 0000000000..1fb4c0b541 --- /dev/null +++ b/lib/kernels/test/src/kernels/array_shape.cc @@ -0,0 +1,49 @@ +#include "kernels/array_shape.h" +#include "test/utils/doctest/fmt/unordered_set.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("get_array_coord_set") { + SUBCASE("ArrayShape is not empty") { + ArrayShape input = ArrayShape{ + LegionOrdered{2_n, 1_n, 3_n}, + }; + + std::unordered_set result = get_array_coord_set(input); + std::unordered_set correct = { + ArrayCoord{FFOrdered{0_n, 0_n, 0_n}}, + ArrayCoord{FFOrdered{0_n, 0_n, 1_n}}, + ArrayCoord{FFOrdered{1_n, 
0_n, 0_n}}, + ArrayCoord{FFOrdered{1_n, 0_n, 1_n}}, + ArrayCoord{FFOrdered{2_n, 0_n, 0_n}}, + ArrayCoord{FFOrdered{2_n, 0_n, 1_n}}, + }; + + CHECK(result == correct); + } + + SUBCASE("ArrayShape has a dimension of size zero") { + ArrayShape input = ArrayShape{ + LegionOrdered{2_n, 0_n, 3_n}, + }; + + std::unordered_set result = get_array_coord_set(input); + std::unordered_set correct = {}; + + CHECK(result == correct); + } + + SUBCASE("ArrayShape is zero-dimensional") { + ArrayShape input = ArrayShape{LegionOrdered{}}; + + std::unordered_set result = get_array_coord_set(input); + std::unordered_set correct = { + ArrayCoord{FFOrdered{}}, + }; + + CHECK(result == correct); + } + } +} diff --git a/lib/kernels/test/src/kernels/format_accessor_contents.cc b/lib/kernels/test/src/kernels/format_accessor_contents.cc new file mode 100644 index 0000000000..915a84c335 --- /dev/null +++ b/lib/kernels/test/src/kernels/format_accessor_contents.cc @@ -0,0 +1,94 @@ +#include "kernels/format_accessor_contents.h" +#include "internal/test_utils.h" +#include "kernels/local_cpu_allocator.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("format_accessor_r_contents(GenericTensorAccessorR)") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("accessor is 1d") { + GenericTensorAccessorR accessor = + create_1d_accessor_r_with_contents({1, 2, 3, 2}, cpu_allocator); + + std::string correct = "[1 2 3 2]"; + + std::string result = format_accessor_r_contents(accessor); + + CHECK(result == correct); + } + + SUBCASE("accessor is 2d") { + GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents( + { + {1, 2, 3, 5}, + {4, 3, 3, 2}, + {1, 1, 5, 8}, + }, + cpu_allocator); + + std::string correct = "[\n" + " [1 2 3 5]\n" + " [4 3 3 2]\n" + " [1 1 5 8]\n" + "]"; + + std::string result = format_accessor_r_contents(accessor); + + CHECK(result == correct); + } + + SUBCASE("accessor is 3d") { + GenericTensorAccessorR accessor = 
create_3d_accessor_r_with_contents( + { + { + {1, 2, 3, 6}, + {4, 3, 3, 9}, + {1, 1, 5, 1}, + }, + { + {4, 1, 8, 7}, + {9, 4, 2, 4}, + {1, 0, 0, 6}, + }, + { + {2, 1, 1, 9}, + {1, 3, 6, 2}, + {1, 9, 8, 9}, + }, + }, + cpu_allocator); + + std::string correct = "[\n" + " [\n" + " [1 2 3 6]\n" + " [4 3 3 9]\n" + " [1 1 5 1]\n" + " ]\n" + " [\n" + " [4 1 8 7]\n" + " [9 4 2 4]\n" + " [1 0 0 6]\n" + " ]\n" + " [\n" + " [2 1 1 9]\n" + " [1 3 6 2]\n" + " [1 9 8 9]\n" + " ]\n" + "]"; + + std::string result = format_accessor_r_contents(accessor); + + CHECK(result == correct); + } + + SUBCASE("accessor is some other dimension") { + GenericTensorAccessorR accessor = + create_4d_accessor_r_with_contents({{{{5}}}}, cpu_allocator); + + CHECK_THROWS(format_accessor_r_contents(accessor)); + } + } +} diff --git a/lib/kernels/test/src/kernels/legion_dim.cc b/lib/kernels/test/src/kernels/legion_dim.cc new file mode 100644 index 0000000000..34822ed1c3 --- /dev/null +++ b/lib/kernels/test/src/kernels/legion_dim.cc @@ -0,0 +1,32 @@ +#include "kernels/legion_dim.h" +#include "test/utils/doctest/fmt/set.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("key_range(LegionOrdered)") { + SUBCASE("input is non-empty") { + LegionOrdered input = {5, 3, 2, 3}; + + std::set result = key_range(input); + std::set correct = { + legion_dim_t{0_n}, + legion_dim_t{1_n}, + legion_dim_t{2_n}, + legion_dim_t{3_n}, + }; + + CHECK(result == correct); + } + + SUBCASE("input is empty") { + LegionOrdered input = {}; + + std::set result = key_range(input); + std::set correct = {}; + + CHECK(result == correct); + } + } +} diff --git a/lib/kernels/test/src/kernels/legion_ordered/legion_ordered.cc b/lib/kernels/test/src/kernels/legion_ordered/legion_ordered.cc new file mode 100644 index 0000000000..4b50cad735 --- /dev/null +++ b/lib/kernels/test/src/kernels/legion_ordered/legion_ordered.cc @@ -0,0 +1,12 @@ +#include "kernels/legion_ordered/legion_ordered.h" +#include 
"test/utils/rapidcheck.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE_TEMPLATE( + "Arbitrary> with T=", T, int, double, char) { + RC_SUBCASE([](LegionOrdered) {}); + } +} diff --git a/lib/kernels/test/src/kernels/legion_ordered/slice.cc b/lib/kernels/test/src/kernels/legion_ordered/slice.cc new file mode 100644 index 0000000000..d0211d270e --- /dev/null +++ b/lib/kernels/test/src/kernels/legion_ordered/slice.cc @@ -0,0 +1,30 @@ +#include "kernels/legion_ordered/slice.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("slice(LegionOrdered, ..., ...") { + LegionOrdered d = LegionOrdered{ + 1, + 2, + 3, + 4, + }; + SUBCASE("legion_dim_t, legion_dim_t") { + LegionOrdered result = slice(d, + legion_dim_t{nonnegative_int{1}}, + legion_dim_t{nonnegative_int{3}}); + LegionOrdered correct = LegionOrdered{2, 3}; + + CHECK(result == correct); + } + SUBCASE("legion_dim_t, std::nullopt_t") { + LegionOrdered result = + slice(d, legion_dim_t{nonnegative_int{1}}, std::nullopt); + LegionOrdered correct = LegionOrdered{2, 3, 4}; + + CHECK(result == correct); + } + } +} diff --git a/lib/kernels/test/src/kernels/legion_ordered/transform.cc b/lib/kernels/test/src/kernels/legion_ordered/transform.cc new file mode 100644 index 0000000000..759507264f --- /dev/null +++ b/lib/kernels/test/src/kernels/legion_ordered/transform.cc @@ -0,0 +1,36 @@ +#include "kernels/legion_ordered/transform.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("transform(LegionOrdered, F)") { + SUBCASE("input is empty") { + LegionOrdered input = {}; + + LegionOrdered result = + transform(input, [](std::string const &) -> int { + CHECK(false); + return 0; + }); + LegionOrdered correct = {}; + + CHECK(result == correct); + } + + SUBCASE("input is not empty") { + LegionOrdered input = {2, 1, 2, 5}; + + LegionOrdered result = + transform(input, [](int x) { return fmt::to_string(x); }); + 
LegionOrdered correct = LegionOrdered{ + "2", + "1", + "2", + "5", + }; + + CHECK(result == correct); + } + } +} diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index 64264f6c39..9064ae4824 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -1,10 +1,10 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/attention_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test multi-head attention kernel") { nonnegative_int num_samples = 10_n; nonnegative_int num_heads = 4_n; @@ -19,7 +19,9 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int kvSeqLength = 20_n; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -39,16 +41,26 @@ TEST_SUITE(FF_TEST_SUITE) { /*kvSeqLength=*/kvSeqLength.unwrap_nonnegative(), /*add_bias_kv=*/false); - TensorShape query_shape = make_float_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, qSize}); - TensorShape key_shape = make_float_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, kSize}); - TensorShape value_shape = make_float_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, vSize}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, oProjSize}); - TensorShape weight_shape = make_float_tensor_shape_from_legion_dims( - {nonnegative_int{state.weightSize}}); + TensorShape query_shape = TensorShape{ + TensorDims{FFOrdered{qoSeqLength, num_samples, qSize}}, + DataType::FLOAT, + }; + TensorShape key_shape = TensorShape{ + TensorDims{FFOrdered{kvSeqLength, num_samples, kSize}}, + DataType::FLOAT, + }; + TensorShape value_shape 
= TensorShape{ + TensorDims{FFOrdered{kvSeqLength, num_samples, vSize}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{qoSeqLength, num_samples, oProjSize}}, + DataType::FLOAT, + }; + TensorShape weight_shape = TensorShape{ + TensorDims{FFOrdered{nonnegative_int{state.weightSize}}}, + DataType::FLOAT, + }; GenericTensorAccessorW query_accessor = create_random_filled_accessor_w(query_shape, allocator); @@ -72,9 +84,7 @@ TEST_SUITE(FF_TEST_SUITE) { weight_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index cacd5b60fb..5f63b48198 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -1,10 +1,10 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/batch_matmul_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test BatchMatmul Kernel") { nonnegative_int m = 10_n; nonnegative_int n = 10_n; @@ -15,16 +15,24 @@ TEST_SUITE(FF_TEST_SUITE) { int seq_length = -1; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape_a = - make_float_tensor_shape_from_legion_dims({m, k, batch}); - TensorShape input_shape_b = - make_float_tensor_shape_from_legion_dims({k, n, batch}); - TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({m, n, batch}); + 
TensorShape input_shape_a = TensorShape{ + TensorDims{FFOrdered{batch, k, m}}, + DataType::FLOAT, + }; + TensorShape input_shape_b = TensorShape{ + TensorDims{FFOrdered{batch, n, k}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{batch, n, m}}, + DataType::FLOAT, + }; GenericTensorAccessorW a_accessor = create_random_filled_accessor_w(input_shape_a, allocator); diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index b4c43cf1d8..903ad8cc43 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -1,10 +1,11 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/batch_norm_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test BatchNorm Kernel") { nonnegative_int output_n = 1_n; nonnegative_int output_c = 10_n; @@ -12,7 +13,9 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int output_w = 10_n; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -26,25 +29,33 @@ TEST_SUITE(FF_TEST_SUITE) { /*output_w=*/output_w.unwrap_nonnegative(), /*relu=*/true); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape scale_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape bias_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); + TensorShape input_shape = TensorShape{ + 
TensorDims{FFOrdered{output_n, output_c, output_h, output_w}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{output_n, output_c, output_h, output_w}}, + DataType::FLOAT, + }; + TensorShape scale_shape = TensorShape{ + TensorDims{FFOrdered{output_n, output_c, output_h, output_w}}, + DataType::FLOAT, + }; + TensorShape bias_shape = TensorShape{ + TensorDims{FFOrdered{output_n, output_c, output_h, output_w}}, + DataType::FLOAT, + }; GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); - GenericTensorAccessorW scale_accessor = - create_filled_accessor_w(scale_shape, allocator, 1.0f); + GenericTensorAccessorW scale_accessor = create_filled_accessor_w( + scale_shape, allocator, make_float_data_type_value(1)); SUBCASE("forward_kernel") { - GenericTensorAccessorW bias_accessor = - create_filled_accessor_w(bias_shape, allocator, 0.0f); + GenericTensorAccessorW bias_accessor = create_filled_accessor_w( + bias_shape, allocator, make_float_data_type_value(0)); Kernels::BatchNorm::forward_kernel( /*stream=*/managed_stream.raw_stream(), @@ -54,10 +65,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*scale_ptr=*/scale_accessor.get_float_ptr(), /*bias_ptr=*/bias_accessor.get_float_ptr()); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { @@ -73,9 +81,9 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::BatchNorm::backward_kernel( /*stream=*/managed_stream.raw_stream(), /*per_device_state=*/state, - /*input_ptr=*/input_accessor.get_float_ptr(), - /*output_grad_ptr=*/output_grad_accessor.get_float_ptr(), /*output_ptr=*/output_accessor.get_float_ptr(), + /*output_grad_ptr=*/output_grad_accessor.get_float_ptr(), + 
/*input_ptr=*/input_accessor.get_float_ptr(), /*input_grad_ptr=*/input_grad_accessor.get_float_ptr(), /*scale_ptr=*/scale_accessor.get_float_ptr(), /*scale_grad_ptr=*/scale_grad_accessor.get_float_ptr(), @@ -83,19 +91,9 @@ TEST_SUITE(FF_TEST_SUITE) { /*numElements=*/ input_accessor.shape.num_elements().unwrap_nonnegative()); - std::vector host_input_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - std::vector host_scale_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(scale_grad_accessor)); - std::vector host_bias_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(bias_grad_accessor)); - - CHECK(contains_non_zero(host_input_grad_data)); - CHECK(contains_non_zero(host_scale_grad_data)); - CHECK(contains_non_zero(host_bias_grad_data)); + CHECK(contains_non_zero(input_grad_accessor)); + CHECK(contains_non_zero(scale_grad_accessor)); + CHECK(contains_non_zero(bias_grad_accessor)); } Kernels::BatchNorm::cleanup_kernel(allocator, diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 0e0769014d..0c41fe12ac 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -1,56 +1,86 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/cast_kernels.h" -#include "test_utils.h" -#include +#include "kernels/cast_kernels_cpu.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Call Cast Forward and Backward Kernels") { ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100_n, 100_n}); - TensorShape output_shape = - make_double_tensor_shape_from_legion_dims({100_n, 100_n}); - - GenericTensorAccessorW output_accessor = - 
create_random_filled_accessor_w(output_shape, allocator); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{100_n, 100_n}}, + DataType::DOUBLE, + }; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - - Kernels::Cast::forward_kernel(managed_stream.raw_stream(), - input_accessor, - output_accessor, - DataType::FLOAT, - DataType::DOUBLE); + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); - std::vector host_double_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), input_accessor, output_accessor); - CHECK(contains_non_zero(host_double_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { + GenericTensorAccessorR grad_output_accessor = + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW grad_input_accessor = - allocator.allocate_tensor(input_shape); - - Kernels::Cast::backward_kernel( - managed_stream.raw_stream(), - read_only_accessor_from_write_accessor(output_accessor), - grad_input_accessor, - DataType::DOUBLE, - DataType::FLOAT); - - std::vector host_grad_float_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(grad_input_accessor)); - CHECK(contains_non_zero(host_grad_float_data)); + create_zero_filled_accessor_w(input_shape, allocator); + + Kernels::Cast::backward_kernel(managed_stream.raw_stream(), + grad_output_accessor, + grad_input_accessor); + + CHECK(contains_non_zero(grad_input_accessor)); + } + } + + TEST_CASE("Check Cast Forward Kernel against CPU Kernel") { + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = 
create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 2_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 2_n}}, + DataType::DOUBLE, + }; + + // Only calling forward kernel as backward kernel is exactly the same + SUBCASE("forward_kernel") { + // Run GPU Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // Run CPU Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Cast::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 2b6b9bf589..2040dcbd5d 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -1,39 +1,39 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/combine_kernels.h" -#include "test_utils.h" +#include "kernels/combine_kernels_cpu.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test combine kernel") { - ManagedPerDeviceFFHandle managed_handle{}; +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Call Combine Forward and Backward Kernels") { + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; 
Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100_n, 100_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n, 100_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Combine::forward_kernel( managed_stream.raw_stream(), input_accessor, output_accessor); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -41,9 +41,66 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor, input_grad_accessor); - std::vector host_input_grad = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad)); + CHECK(contains_non_zero(input_grad_accessor)); + } + } + + TEST_CASE("Check Combine Forward Kernel against CPU Kernel") { + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{5_n, 5_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = input_shape; + + SUBCASE("forward_kernel") { + // Run GPU Combine 
Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(output_shape); + + Kernels::Combine::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // Run CPU Combine Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + cpu_allocator.allocate_tensor(output_shape); + + Kernels::Combine::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); + } + + SUBCASE("backward_kernel") { + // Run GPU Combine Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Combine::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor_gpu, + input_grad_accessor_gpu); + + // Run CPU Combine Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); + + Kernels::Combine::cpu_backward_kernel(output_grad_accessor_cpu, + input_grad_accessor_cpu); + + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 215e599716..c2df907917 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -1,56 +1,113 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/concat_kernels.h" -#include "test_utils.h" #include 
"utils/containers/repeat.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test concat kernel forward and backward") { - nonnegative_int num_inputs = 3_n; - nonnegative_int size_per_input = 100_n; - ff_dim_t concat_axis = ff_dim_t{0_n}; - - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({size_per_input}); - TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({size_per_input, num_inputs}); - Allocator allocator = create_local_cuda_memory_allocator(); + const nonnegative_int num_inputs = 4_n; + SUBCASE("forward_kernel") { - std::vector input_accessors = - repeat(num_inputs, [&]() { - return read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - }); - GenericTensorAccessorW output_accessor = - allocator.allocate_tensor(output_shape); - - Kernels::Concat::forward_kernel(managed_stream.raw_stream(), - output_accessor, - input_accessors, - concat_axis); - - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - CHECK(contains_non_zero(host_output_data)); + auto run_forward_test = [&](nonnegative_int input_rows, + nonnegative_int input_cols, + TensorShape output_shape, + ff_dim_t concat_axis) { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{input_rows, input_cols}}, + DataType::FLOAT, + }; + + std::vector input_accessors = + repeat(num_inputs, [&]() { + return create_random_filled_accessor_r(input_shape, allocator); + }); + + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); + + Kernels::Concat::forward_kernel(managed_stream.raw_stream(), + output_accessor, + input_accessors, + concat_axis); + + 
CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test forward concat, axis = 0") { + nonnegative_int input_rows = 2_n; + nonnegative_int input_cols = 4_n; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{num_inputs * input_rows, input_cols}}, + DataType::FLOAT, + }; + run_forward_test(input_rows, input_cols, output_shape, ff_dim_t{0_n}); + } + + SUBCASE("test forward concat, axis = 1") { + nonnegative_int input_rows = 4_n; + nonnegative_int input_cols = 2_n; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{input_rows, num_inputs * input_cols}}, + DataType::FLOAT, + }; + run_forward_test(input_rows, input_cols, output_shape, ff_dim_t{1_n}); + } } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); - std::vector input_grad_accessors = repeat( - num_inputs, [&]() { return allocator.allocate_tensor(input_shape); }); - Kernels::Concat::backward_kernel(managed_stream.raw_stream(), - output_grad_accessor, - input_grad_accessors, - concat_axis); + auto run_backward_test = [&](nonnegative_int input_rows, + nonnegative_int input_cols, + TensorShape output_shape, + ff_dim_t concat_axis) { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{input_rows, input_cols}}, + DataType::FLOAT, + }; + + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); + + std::vector input_grad_accessors = + repeat(num_inputs, [&]() { + return create_zero_filled_accessor_w(input_shape, allocator); + }); + + Kernels::Concat::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor, + input_grad_accessors, + concat_axis); + + for (auto &accessor : input_grad_accessors) { + CHECK(contains_non_zero(accessor)); + } + }; + + SUBCASE("test backward concat, axis = 0") { + nonnegative_int input_rows = 2_n; + nonnegative_int input_cols = 4_n; + TensorShape 
output_shape = TensorShape{ + TensorDims{FFOrdered{num_inputs * input_rows, input_cols}}, + DataType::FLOAT, + }; + run_backward_test(input_rows, input_cols, output_shape, ff_dim_t{0_n}); + } + + SUBCASE("test backward concat, axis = 1") { + nonnegative_int input_rows = 4_n; + nonnegative_int input_cols = 2_n; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{input_rows, num_inputs * input_cols}}, + DataType::FLOAT, + }; + run_backward_test(input_rows, input_cols, output_shape, ff_dim_t{1_n}); + } } } } diff --git a/lib/kernels/test/src/test_cuda.cc b/lib/kernels/test/src/test_cuda.cc index ed5852bc31..de3215cf2d 100644 --- a/lib/kernels/test/src/test_cuda.cc +++ b/lib/kernels/test/src/test_cuda.cc @@ -1,10 +1,10 @@ -#include "doctest/doctest.h" -#include "test_utils.h" +#include "internal/test_utils.h" +#include #include namespace FlexFlow { -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test CUDA") { int deviceCount = 0; diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 86f8f2102b..409b06d9a9 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -1,38 +1,37 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/dropout_kernels.h" -#include "test_utils.h" #include "utils/containers/count.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Dropout Kernels") { unsigned long long seed = 12345; float dropout_rate = 0.1; ArrayShape shape = ArrayShape{ - std::vector{10_n, 10_n}, + std::vector{10_n, 10_n}, }; - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10_n, 10_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 10_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle 
managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); DropoutPerDeviceState state = Kernels::Dropout::init_kernel( managed_handle.raw_handle(), dropout_rate, seed, shape, allocator); - auto get_zero_count = [](std::vector const &data) { - return count(data, [](float x) { return x == 0.0f; }); - }; - SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -41,11 +40,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output_accessor = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - CHECK(contains_non_zero(host_output_accessor)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 83f7f0445e..f8a3abdb98 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -1,21 +1,27 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/flat_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Flat Kernel") { Allocator allocator = create_local_cuda_memory_allocator(); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape input_shape = TensorShape{ + 
TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 2.0f)); + read_only_accessor_from_write_accessor(create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(2))); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -25,33 +31,21 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor, output_accessor.get_float_ptr()); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 2.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 0.0f); - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(0)); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(1)); Kernels::Flat::backward_kernel(managed_stream.raw_stream(), input_accessor, - input_grad_accessor.get_float_ptr(), - output_grad_accessor.get_float_ptr()); - - std::vector backward_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor.get_float_ptr(), + input_grad_accessor.get_float_ptr()); - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(backward_output_data == expected_output_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git 
a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 1a8cf5f82a..f0be809475 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -1,61 +1,107 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/gather_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { + +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Gather Forward and Backward Kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - Allocator allocator = create_local_cuda_memory_allocator(); GatherPerDeviceState state = {managed_handle.raw_handle(), - legion_dim_t{2_n}}; + legion_dim_t{0_n}}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50_n}); + SUBCASE("forward_kernel") { + auto run_forward_test = [&](TensorShape input_shape, + TensorShape index_shape, + TensorShape output_shape) { + GenericTensorAccessorR input_accessor = + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); - GenericTensorAccessorR index_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + Kernels::Gather::forward_kernel(managed_stream.raw_stream(), + state, + input_accessor, + index_accessor, + output_accessor); - SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - GenericTensorAccessorW output_accessor = - 
allocator.allocate_tensor(output_shape); - - Kernels::Gather::forward_kernel(managed_stream.raw_stream(), - state, - input_accessor, - index_accessor, - output_accessor); - - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test gather forward, 2D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 20_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 20_n}}, + DataType::FLOAT, + }; + run_forward_test(input_shape, index_shape, output_shape); + } + + SUBCASE("test gather forward, 1D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::FLOAT, + }; + run_forward_test(input_shape, index_shape, output_shape); + } } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); - GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); - - Kernels::Gather::backward_kernel(managed_stream.raw_stream(), - state, - output_grad_accessor, - index_accessor, - input_grad_accessor); - - std::vector host_input_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad_data)); + auto run_backward_test = [&](TensorShape input_shape, + TensorShape index_shape, + TensorShape output_shape) { + GenericTensorAccessorR output_grad_accessor = + 
create_random_filled_accessor_r(output_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW input_grad_accessor = + allocator.allocate_tensor(input_shape); + + Kernels::Gather::backward_kernel(managed_stream.raw_stream(), + state, + output_grad_accessor, + index_accessor, + input_grad_accessor); + CHECK(contains_non_zero(input_grad_accessor)); + }; + + SUBCASE("test gather backward, 2D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 25_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 25_n}}, + DataType::FLOAT, + }; + run_backward_test(input_shape, index_shape, output_shape); + } } } } diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 5386c1d943..02a95ba58a 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -1,23 +1,30 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/layer_norm_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test LayerNorm Forward and Backward Kernel") { nonnegative_int batch_size = 10_n; nonnegative_int feature_size = 10_n; float epsilon = 1e-5f; bool elementwise_affine = true; - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({batch_size, feature_size}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, feature_size}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; - TensorShape feature_shape = - make_float_tensor_shape_from_legion_dims({feature_size}); + TensorShape feature_shape = TensorShape{ + 
TensorDims{FFOrdered{feature_size}}, + DataType::FLOAT, + }; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -31,16 +38,15 @@ TEST_SUITE(FF_TEST_SUITE) { epsilon); GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - GenericTensorAccessorW gamma_accessor = - create_filled_accessor_w(feature_shape, allocator, 1.0f); + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorW gamma_accessor = create_filled_accessor_w( + feature_shape, allocator, make_float_data_type_value(1)); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - GenericTensorAccessorW beta_accessor = - create_filled_accessor_w(feature_shape, allocator, 0.0f); + GenericTensorAccessorW beta_accessor = create_filled_accessor_w( + feature_shape, allocator, make_float_data_type_value(0)); Kernels::LayerNorm::forward_kernel(managed_stream.raw_stream(), state, @@ -52,8 +58,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW gamma_grad_accessor = diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc new file mode 100644 index 0000000000..fb5920adcc --- /dev/null +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -0,0 +1,107 @@ +#include "internal/test_utils.h" +#include "kernels/gather_kernels.h" +#include + +using namespace 
::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Test ManagedFFStream") { + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + ManagedFFStream managed_stream{}; + Allocator allocator = create_local_cuda_memory_allocator(); + + GatherPerDeviceState state = {managed_handle.raw_handle(), + legion_dim_t{0_n}}; + + SUBCASE("forward_kernel") { + auto run_forward_test = [&](TensorShape const &input_shape, + TensorShape const &index_shape, + TensorShape const &output_shape) { + GenericTensorAccessorR input_accessor = + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); + + Kernels::Gather::forward_kernel(/*stream=*/managed_stream.raw_stream(), + /*per_device_state=*/state, + /*input=*/input_accessor, + /*index=*/index_accessor, + /*output=*/output_accessor); + + CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test gather forward, 2D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 20_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 20_n}}, + DataType::FLOAT, + }; + run_forward_test(input_shape, index_shape, output_shape); + } + + SUBCASE("test gather forward, 1D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::FLOAT, + }; + run_forward_test(input_shape, index_shape, output_shape); + } + } + + SUBCASE("backward_kernel") { + auto run_backward_test = [&](TensorShape const &input_shape, + 
TensorShape const &index_shape, + TensorShape const &output_shape) { + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW input_grad_accessor = + allocator.allocate_tensor(input_shape); + + Kernels::Gather::backward_kernel(/*stream=*/managed_stream.raw_stream(), + /*per_device_state=*/state, + /*output_grad=*/output_grad_accessor, + /*index=*/index_accessor, + /*input_grad=*/input_grad_accessor); + CHECK(contains_non_zero(input_grad_accessor)); + }; + + SUBCASE("test gather backward, 2D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 25_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 25_n}}, + DataType::FLOAT, + }; + run_backward_test(input_shape, index_shape, output_shape); + } + } + } +} diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc new file mode 100644 index 0000000000..fc67764cdb --- /dev/null +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -0,0 +1,37 @@ +#include "kernels/managed_per_device_ff_handle.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Test ManagedPerDeviceFFHandle") { + ManagedPerDeviceFFHandle base_handle{/*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); + + SUBCASE("constructor") { + CHECK(base_handle.raw_handle().workSpaceSize == 1024 * 1024); + CHECK(base_handle.raw_handle().allowTensorOpMathConversion == true); + } + + SUBCASE("move constructor") { + ManagedPerDeviceFFHandle new_handle(std::move(base_handle)); + CHECK(&new_handle.raw_handle() == 
base_handle_ptr); + } + + SUBCASE("move assignment operator") { + SUBCASE("move assign to other") { + ManagedPerDeviceFFHandle new_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + new_handle = std::move(base_handle); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } + + SUBCASE("move assign to self") { + base_handle = std::move(base_handle); + CHECK(&base_handle.raw_handle() == base_handle_ptr); + } + } + } +} diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 4fd1b53210..5452266dad 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -1,12 +1,15 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/partition_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Partition Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -14,48 +17,36 @@ TEST_SUITE(FF_TEST_SUITE) { RepartitionPerDeviceState state = Kernels::Repartition::init_kernel( managed_handle.raw_handle(), DataType::FLOAT); - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10_n, 10_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 10_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + GenericTensorAccessorR input_accessor = create_filled_accessor_r( + input_shape, allocator, make_float_data_type_value(1)); 
GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Repartition::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 2.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(1)); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(2)); Kernels::Repartition::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); - - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor, + input_grad_accessor); - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 3.0f); - CHECK(host_grad_input_data == expected_grad_input_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 62b61707c6..f2ada8387e 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -1,9 +1,10 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/pool_2d_kernels.h" -#include "test_utils.h" +#include 
"op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Pool2D Forward and Backward Kernel") { nonnegative_int input_w = 10_n; nonnegative_int input_h = 10_n; @@ -22,7 +23,9 @@ TEST_SUITE(FF_TEST_SUITE) { PoolOp pool_type = PoolOp::MAX; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -46,10 +49,14 @@ TEST_SUITE(FF_TEST_SUITE) { /*stride_w=*/stride_w.unwrap_nonnegative(), /*pool_type=*/pool_type); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {input_w, input_h, input_c, input_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {output_w, output_h, output_c, output_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{input_n, input_c, input_h, input_w}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{output_n, input_c, output_h, output_w}}, + DataType::FLOAT, + }; GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); @@ -62,28 +69,23 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.ptr, output_accessor.ptr); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 1.0f); + GenericTensorAccessorW output_grad_accessor = create_filled_accessor_w( + output_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); 
Kernels::Pool2D::backward_kernel(managed_stream.raw_stream(), state, - input_accessor.ptr, - input_grad_accessor.ptr, output_accessor.ptr, - output_grad_accessor.ptr); + output_grad_accessor.ptr, + input_accessor.ptr, + input_grad_accessor.ptr); - std::vector host_input_grad = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 04a3817b84..e13b149769 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -1,27 +1,33 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/reduction_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Reduction Forward and Backward Kernel") { std::size_t num_replicas = 5; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {10_n, 10_n, 10_n, 10_n, 10_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 10_n, 10_n, 10_n, 10_n}}, + DataType::FLOAT, + }; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { - TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({10_n}); + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::FLOAT, + }; GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); 
GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -30,30 +36,22 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor, num_replicas); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { TensorShape output_shape = input_shape; - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); Kernels::Reduction::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor, - output_grad_accessor); - - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - std::vector host_grad_data = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(host_grad_data == expected_grad_input_data); + output_grad_accessor, + input_grad_accessor); + + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index fa726898f2..83a9a992f7 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -1,55 +1,150 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" +#include "kernels/format_accessor_contents.h" #include "kernels/replicate_kernels.h" -#include "test_utils.h" +#include "kernels/replicate_kernels_cpu.h" +#include "test/utils/doctest/check_kv.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test Replicate Kernel") { + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + 
TEST_CASE("Call Replicate Forward and Backward Kernels") { nonnegative_int num_replicas = 10_n; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); - TensorShape output_shape = input_shape; + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{3_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{3_n}}, + DataType::FLOAT, + }; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - Allocator allocator = create_local_cuda_memory_allocator(); + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); - GenericTensorAccessorW output_accessor = - allocator.allocate_tensor(output_shape); + GenericTensorAccessorR input = + create_1d_accessor_r_with_contents({1, 3, 2}, gpu_allocator); + + GenericTensorAccessorW output = + gpu_allocator.allocate_tensor(output_shape); Kernels::Replicate::forward_kernel( - managed_stream.raw_stream(), input_accessor, output_accessor); + managed_stream.raw_stream(), input, output); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); + GenericTensorAccessorR correct = input; - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK_MESSAGE(accessors_are_equal(output, correct), + check_kv("output", format_accessor_w_contents(output))); } SUBCASE("backward_kernel") { - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); - GenericTensorAccessorR 
output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + GenericTensorAccessorR output_grad = create_2d_accessor_r_with_contents( + { + {1, 2, 3}, + {4, 3, 3}, + {1, 3, 5}, + }, + gpu_allocator); + + GenericTensorAccessorR correct = create_1d_accessor_r_with_contents( + {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator); + + GenericTensorAccessorW input_grad = + gpu_allocator.allocate_tensor(input_shape); Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor, - output_grad_accessor, + output_grad, + input_grad, num_replicas.unwrap_nonnegative()); - std::vector check_aggregated_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(check_aggregated_data)); + CHECK_MESSAGE( + accessors_are_equal(input_grad, correct), + check_kv("input_grad", format_accessor_w_contents(input_grad))); + } + } + + TEST_CASE("Check Replicate Forward and Backward Kernel against CPU Kernel") { + nonnegative_int num_replicas = 2_n; + + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{5_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{5_n, num_replicas}}, + DataType::FLOAT, + }; + + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("forward_kernel") { + // Run GPU Replicate Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Replicate::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // 
Run CPU Replicate Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK_MESSAGE( + accessors_are_equal(output_accessor_gpu, output_accessor_cpu), + check_kv("input", format_accessor_r_contents(input_accessor_cpu)), + check_kv("gpu", format_accessor_w_contents(output_accessor_gpu)), + check_kv("cpu", format_accessor_w_contents(output_accessor_cpu))); + } + + SUBCASE("backward_kernel") { + // Run GPU Replicate Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor_gpu, + input_grad_accessor_gpu, + num_replicas.unwrap_nonnegative()); + + // Run CPU Replicate Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); + + Kernels::Replicate::cpu_backward_kernel( + output_grad_accessor_cpu, + input_grad_accessor_cpu, + num_replicas.unwrap_nonnegative()); + + CHECK_MESSAGE( + accessors_are_equal(input_grad_accessor_gpu, input_grad_accessor_cpu), + check_kv("output_grad", + format_accessor_r_contents(output_grad_accessor_cpu)), + check_kv("gpu", format_accessor_w_contents(input_grad_accessor_gpu)), + check_kv("cpu", format_accessor_w_contents(input_grad_accessor_cpu))); } } } diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index d329a347b3..66c6bf849b 100644 --- 
a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -1,16 +1,21 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/reshape_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Reshape Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; ReshapePerDeviceState state = @@ -18,42 +23,28 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Reshape::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - 
create_filled_accessor_w(input_shape, allocator, 2.0f); + allocator.allocate_tensor(input_shape); Kernels::Reshape::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); - - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor, + input_grad_accessor); - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 3.0f); - CHECK(host_grad_input_data == expected_grad_input_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 9c8475f6d6..6e12c48ac3 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -1,63 +1,124 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/reverse_kernels.h" -#include "test_utils.h" +#include "kernels/reverse_kernels_cpu.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Call Reverse Forward and Backward Kernels") { - nonnegative_int reverse_dim_size = 10_n; - nonnegative_int in_blk_size = 10_n; - nonnegative_int num_out_blks = 1_n; - - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{1_n, 10_n, 10_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{0_n}, + }; + SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - 
read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + read_only_accessor_from_write_accessor(create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(1))); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Reverse::forward_kernel( - managed_stream.raw_stream(), - input_accessor.get_float_ptr(), - output_accessor.get_float_ptr(), - num_out_blks.unwrap_nonnegative(), - reverse_dim_size.unwrap_nonnegative(), - in_blk_size.unwrap_nonnegative(), - input_accessor.shape.num_elements().unwrap_nonnegative()); - - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(check_output_data)); + managed_stream.raw_stream(), input_accessor, output_accessor, attrs); + + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); - - Kernels::Reverse::backward_kernel( - managed_stream.raw_stream(), - output_grad_accessor.get_float_ptr(), - input_grad_accessor.get_float_ptr(), - num_out_blks.unwrap_nonnegative(), - reverse_dim_size.unwrap_nonnegative(), - in_blk_size.unwrap_nonnegative(), - input_grad_accessor.shape.num_elements().unwrap_nonnegative()); - - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_grad_input_data)); + allocator.allocate_tensor(input_shape); + + Kernels::Reverse::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor, + input_grad_accessor, + attrs); + + CHECK(contains_non_zero(input_grad_accessor)); + } + } + + TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") { + TensorShape input_shape = 
TensorShape{
+        TensorDims{FFOrdered{1_n, 4_n, 3_n}},
+        DataType::FLOAT,
+    };
+    TensorShape output_shape = input_shape;
+
+    ManagedPerDeviceFFHandle managed_handle{
+        /*workSpaceSize=*/1024 * 1024,
+        /*allowTensorOpMathConversion=*/true};
+    ManagedFFStream managed_stream{};
+
+    Allocator gpu_allocator = create_local_cuda_memory_allocator();
+    Allocator cpu_allocator = create_local_cpu_memory_allocator();
+
+    ReverseAttrs attrs = ReverseAttrs{
+        /*axis=*/ff_dim_t{0_n},
+    };
+
+    SUBCASE("forward_kernel") {
+      // Run GPU Reverse Forward Kernel
+      GenericTensorAccessorR input_accessor_gpu =
+          create_random_filled_accessor_r(input_shape, gpu_allocator);
+      GenericTensorAccessorW output_accessor_gpu =
+          create_zero_filled_accessor_w(output_shape, gpu_allocator);
+
+      Kernels::Reverse::forward_kernel(managed_stream.raw_stream(),
+                                       input_accessor_gpu,
+                                       output_accessor_gpu,
+                                       attrs);
+
+      // Run CPU Reverse Forward Kernel
+      GenericTensorAccessorR input_accessor_cpu =
+          copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator);
+      GenericTensorAccessorW output_accessor_cpu =
+          create_zero_filled_accessor_w(output_shape, cpu_allocator);
+
+      Kernels::Reverse::cpu_forward_kernel(
+          input_accessor_cpu, output_accessor_cpu, attrs);
+
+      CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu));
+    }
+
+    SUBCASE("backward_kernel") {
+      // Run GPU Reverse Backward Kernel
+      GenericTensorAccessorR output_grad_accessor_gpu =
+          create_random_filled_accessor_r(output_shape, gpu_allocator);
+
+      GenericTensorAccessorW input_grad_accessor_gpu =
+          create_zero_filled_accessor_w(input_shape, gpu_allocator);
+
+      Kernels::Reverse::backward_kernel(managed_stream.raw_stream(),
+                                        output_grad_accessor_gpu,
+                                        input_grad_accessor_gpu,
+                                        attrs);
+
+      // Run CPU Reverse Backward Kernel
+      GenericTensorAccessorR output_grad_accessor_cpu =
+          copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator);
+      GenericTensorAccessorW input_grad_accessor_cpu =
+          create_zero_filled_accessor_w(input_shape, cpu_allocator); 
+ + Kernels::Reverse::cpu_backward_kernel( + output_grad_accessor_cpu, input_grad_accessor_cpu, attrs); + + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index c9eaa76b86..904cca2d3e 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -1,10 +1,10 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/softmax_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Softmax Kernel Operations") { nonnegative_int input_n = 1_n; nonnegative_int input_c = 1_n; @@ -12,12 +12,17 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int input_w = 100_n; nonnegative_int channels = 100_n; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; SoftmaxPerDeviceState state = @@ -40,30 +45,22 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 1.0f); + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); 
GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); Kernels::Softmax::backward_kernel( managed_stream.raw_stream(), - input_grad_accessor.get_float_ptr(), output_grad_accessor.get_float_ptr(), + input_grad_accessor.get_float_ptr(), output_grad_accessor.shape.num_elements().unwrap_nonnegative()); - std::vector expected_input_grad_data = std::vector( - input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - std::vector host_input_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(host_input_grad_data == expected_input_grad_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index ea0d280f68..44e8f42f76 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -1,24 +1,33 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/split_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" #include "utils/containers/repeat.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Split Forward and Backward Kernel") { nonnegative_int num_outputs = 2_n; coord_t out_blk_sizes[] = {50, 50}; coord_t in_blk_size = 100; coord_t num_blks = 1; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + 
TensorDims{FFOrdered{50_n}}, + DataType::FLOAT, + }; SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = @@ -47,8 +56,8 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_ptrs[i] = output_grad_accessor.get_float_ptr(); } - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 0.0f); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(0)); Kernels::Split::backward_kernel(managed_stream.raw_stream(), input_grad_accessor.get_float_ptr(), diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index 02d99c86a1..3c15661396 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -1,58 +1,54 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/transpose_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Transpose Kernel Operations") { TransposeAttrs attrs = TransposeAttrs{ - FFOrdered{ - ff_dim_t{0_n}, + FFOrdered{ ff_dim_t{1_n}, + ff_dim_t{0_n}, }, }; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10_n, 10_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 10_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = 
allocator.allocate_tensor(output_shape); Kernels::Transpose::forward_kernel( managed_stream.raw_stream(), attrs, input_accessor, output_accessor); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), attrs, - input_grad_accessor, - output_grad_accessor); + output_grad_accessor, + input_grad_accessor); - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_grad_input_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc deleted file mode 100644 index 903b666fa9..0000000000 --- a/lib/kernels/test/src/test_utils.cc +++ /dev/null @@ -1,106 +0,0 @@ -#include "test_utils.h" - -GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume); - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution dist(-1.0f, 1.0f); - - for (auto &val : host_data) { - val = dist(gen); - } - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * 
sizeof(float), - cudaMemcpyHostToDevice)); - } - - return accessor; -} - -GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - float val, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume, val); - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); - } - - return accessor; -} - -GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume); - - for (size_t i = 0; i < volume; i++) { - host_data[i] = i; - } - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); - } - - return accessor; -} - -void fill_tensor_accessor_w(GenericTensorAccessorW accessor, - float val, - bool cpu_fill) { - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume, val); - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); - } -} - -TensorShape - make_float_tensor_shape_from_legion_dims(FFOrdered dims) { - return TensorShape{ - TensorDims{ - dims, - }, - DataType::FLOAT, - }; -} - -TensorShape - make_double_tensor_shape_from_legion_dims(FFOrdered dims) { - return TensorShape{ - TensorDims{ - dims, - }, - DataType::DOUBLE, - }; 
-} diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h deleted file mode 100644 index 08f0f382fb..0000000000 --- a/lib/kernels/test/src/test_utils.h +++ /dev/null @@ -1,72 +0,0 @@ -#ifndef _FLEXFLOW_KERNELS_TEST_UTILS -#define _FLEXFLOW_KERNELS_TEST_UTILS - -#include "kernels/device.h" -#include "kernels/local_cuda_allocator.h" -#include "kernels/managed_ff_stream.h" -#include "kernels/managed_per_device_ff_handle.h" -#include -#include -#include -#include -#include - -using namespace FlexFlow; - -GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill = false); - -GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - float val, - bool cpu_fill = false); - -GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill = false); - -void fill_tensor_accessor_w(GenericTensorAccessorW accessor, - float val, - bool cpu_fill = false); - -TensorShape - make_float_tensor_shape_from_legion_dims(FFOrdered dims); - -TensorShape - make_double_tensor_shape_from_legion_dims(FFOrdered dims); - -template -std::vector load_data_to_host_from_device(GenericTensorAccessorR accessor) { - int volume = accessor.shape.get_volume(); - - std::vector local_data(volume); - checkCUDA(cudaMemcpy(local_data.data(), - accessor.ptr, - local_data.size() * sizeof(T), - cudaMemcpyDeviceToHost)); - return local_data; -} - -template -bool contains_non_zero(std::vector &data) { - return !all_of( - data.begin(), data.end(), [](T const &val) { return val == 0; }); -} - -// Specialize doctest's StringMaker for std::vector -template <> -struct doctest::StringMaker> { - static doctest::String convert(std::vector const &vec) { - std::ostringstream oss; - for (size_t i = 0; i < vec.size(); ++i) { - oss << vec[i]; - if (i != vec.size() - 1) { - oss << ", "; - } - } - return doctest::String(("[" + oss.str() + 
"]").c_str()); - } -}; - -#endif diff --git a/lib/local-execution/include/local-execution/per_device_op_state.h b/lib/local-execution/include/local-execution/per_device_op_state.h index 1edd5b6360..f1f357a86e 100644 --- a/lib/local-execution/include/local-execution/per_device_op_state.h +++ b/lib/local-execution/include/local-execution/per_device_op_state.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H #define _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H +#include "kernels/per_device_op_state.dtg.h" #include "local-execution/device_specific_device_states.dtg.h" -#include "local-execution/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 54c8dfc5f1..48584588e3 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H +#include "kernels/per_device_op_state.dtg.h" #include "local-execution/device_specific.h" #include "local-execution/itask_argument_accessor.h" -#include "local-execution/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h index 731e04fdc8..f697337c52 100644 --- a/lib/local-execution/include/local-execution/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/tracked_allocator.h @@ -13,6 +13,9 @@ struct TrackedAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + + DeviceType get_allocation_device_type() const override; + size_t get_current_mem_usage(); private: diff --git a/lib/local-execution/src/local_task_argument_accessor.cc 
b/lib/local-execution/src/local_task_argument_accessor.cc index 54eca7e514..5d099c6b46 100644 --- a/lib/local-execution/src/local_task_argument_accessor.cc +++ b/lib/local-execution/src/local_task_argument_accessor.cc @@ -24,8 +24,8 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( auto tensor_backing = std::get( this->tensor_slots_backing.at(slot_grad_pair)); if (priv == Permissions::RO) { - GenericTensorAccessorR readonly_tensor_backing = { - tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}; + GenericTensorAccessorR readonly_tensor_backing = + read_only_accessor_from_write_accessor(tensor_backing); return readonly_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { return tensor_backing; @@ -33,6 +33,7 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( throw mk_runtime_error(fmt::format("Unhandled privilege mode {}", priv)); } } + VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( slot_id_t slot, Permissions priv, IsGrad is_grad) const { SlotGradId slot_grad_pair = SlotGradId{slot, is_grad}; @@ -43,7 +44,7 @@ VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( for (GenericTensorAccessorW const &tensor_backing : variadic_tensor_backing) { readonly_variadic_tensor_backing.push_back( - {tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}); + read_only_accessor_from_write_accessor(tensor_backing)); } return readonly_variadic_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc index 1df6da8d8e..5cf8742918 100644 --- a/lib/local-execution/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -134,9 +134,9 @@ static std::optional profiling, "[BatchNorm] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - output_grad.get_float_ptr(), output.get_float_ptr(), + 
output_grad.get_float_ptr(), + input.get_float_ptr(), input_grad.get_float_ptr(), scale.get_float_ptr(), scale_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc index 3e7baf49a9..e9adf88422 100644 --- a/lib/local-execution/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -54,9 +54,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { profiling, "[Cast] forward_time = {:.2lf}ms\n", input, - output, - input.data_type, - attrs.dtype); + output); } static std::optional @@ -73,9 +71,7 @@ static std::optional profiling, "[Cast] forward_time = {:.2lf}ms\n", input_grad, - output_grad, - input.data_type, - attrs.dtype); + output_grad); } TaskImplFunction get_cast_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc index bb1504a3f5..55ff354483 100644 --- a/lib/local-execution/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -107,8 +107,8 @@ static std::optional acc.get_argument(PER_DEVICE_STATE); auto attrs = acc.get_argument(ATTRS); - auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); auto filter = acc.get_tensor(FILTER); auto input_grad = acc.get_tensor_grad(INPUT); @@ -120,10 +120,10 @@ static std::optional profiling, "[Conv2d] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), filter.get_float_ptr(), filter_grad.get_float_ptr(), bias_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc index c5ff9199f3..311b8e7924 100644 --- a/lib/local-execution/src/ops/element_unary.cc +++ b/lib/local-execution/src/ops/element_unary.cc @@ -58,8 +58,10 @@ static DeviceSpecificDeviceStates ParallelTensorShape output_shape = 
throw_if_unexpected(get_output_shape(attrs, input_shape)); - ElementUnaryPerDeviceState per_device_state = init_kernel( - get_piece_shape(input_shape), get_piece_shape(output_shape), attrs); + ElementUnaryPerDeviceState per_device_state = + init_kernel(array_shape_from_tensor_shape(get_piece_shape(input_shape)), + array_shape_from_tensor_shape(get_piece_shape(output_shape)), + attrs); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; @@ -88,10 +90,10 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor_grad(INPUT); auto const &attrs = acc.get_argument(ATTRS); auto handle = acc.get_argument(HANDLE); @@ -106,10 +108,10 @@ static std::optional per_device_state, attrs, handle, - input, - input_grad, output, - output_grad); + output_grad, + input, + input_grad); } TaskImplFunction get_element_unary_init_task_impl() { diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc index 0f872b5d50..af6fc16272 100644 --- a/lib/local-execution/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -40,15 +40,15 @@ static std::optional ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); return profile(backward_kernel, profiling, "[Flat] backward_time = {:.2lf}ms\n", input, - input_grad.get_float_ptr(), - output_grad.get_float_ptr()); + output_grad.get_float_ptr(), + input_grad.get_float_ptr()); } TaskImplFunction get_flat_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/linear.cc 
b/lib/local-execution/src/ops/linear.cc index 6f0901e66a..9641cdbd4a 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -26,9 +26,9 @@ OpTaskInvocation init(LinearAttrs const &attrs) { binding.bind_arg(HANDLE, ff_handle()); binding.bind_arg(ATTRS, attrs); - binding.bind(INPUT, input_tensor(0)); // input - binding.bind(WEIGHT, weight_tensor(0)); // weight - binding.bind(OUTPUT, output_tensor(0)); // output + binding.bind(INPUT, input_tensor(0)); + binding.bind(WEIGHT, weight_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); return {task_id_t::LINEAR_INIT_TASK_ID, binding}; } @@ -36,11 +36,11 @@ OpTaskInvocation init(LinearAttrs const &attrs) { OpTaskInvocation forward(LinearAttrs const &attrs) { OpTaskBinding binding; - binding.bind(INPUT, input_tensor(0)); // input - binding.bind(WEIGHT, weight_tensor(0)); // weight - binding.bind(OUTPUT, output_tensor(0)); // output + binding.bind(INPUT, input_tensor(0)); + binding.bind(WEIGHT, weight_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); if (attrs.use_bias) { - binding.bind(BIAS, weight_tensor(1)); // bias + binding.bind(BIAS, weight_tensor(1)); } binding.bind_arg(PROFILING, profiling_settings()); @@ -124,20 +124,21 @@ static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); - auto output = acc.get_tensor(OUTPUT); - auto bias = acc.get_tensor(BIAS); + auto output = acc.get_tensor(OUTPUT); auto input_grad = acc.get_tensor_grad(INPUT); auto weight_grad = acc.get_tensor_grad(WEIGHT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); + auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto attrs = acc.get_argument(ATTRS); - float const *bias_ptr = NULL; + float *bias_grad_ptr = NULL; if (attrs.use_bias) { - bias_ptr = bias.get_float_ptr(); + auto bias_grad = 
acc.get_tensor_grad(BIAS); + bias_grad_ptr = bias_grad.get_float_ptr(); } nonnegative_int in_dim = input.shape.at(ff_dim_t{0_n}); @@ -148,13 +149,13 @@ static std::optional profiling, "[Linear] backward_time = {:.2lf}ms\n", per_device_state, - (void *)input.get_float_ptr(), - (void *)input_grad.get_float_ptr(), - (void *)output.get_float_ptr(), - (void *)output_grad.get_float_ptr(), - (void *)weight.get_float_ptr(), - (void *)weight_grad.get_float_ptr(), - (void *)bias_ptr, + output.get_float_ptr(), + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + bias_grad_ptr, in_dim.unwrap_nonnegative(), out_dim.unwrap_nonnegative(), batch_size.unwrap_nonnegative()); diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc index fb0635efba..f85874dc0a 100644 --- a/lib/local-execution/src/ops/pool_2d.cc +++ b/lib/local-execution/src/ops/pool_2d.cc @@ -115,19 +115,19 @@ static std::optional Pool2DPerDeviceState state = acc.get_argument(PER_DEVICE_STATE); - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor(INPUT); return profile(backward_kernel, profiling, "[Pool2D] backward_time = {:.2lf}ms\n", state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), - output_grad.get_float_ptr()); + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr()); } TaskImplFunction get_pool_2d_init_task_impl() { diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc index ee1a7c6c4e..b07d9fe965 100644 --- a/lib/local-execution/src/ops/reduction.cc +++ b/lib/local-execution/src/ops/reduction.cc @@ -63,13 +63,13 @@ static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { 
ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); return profile(backward_kernel, profiling, "[Reduction] backward_time = {:.2lf}ms\n", - input_grad, - output_grad); + output_grad, + input_grad); } TaskImplFunction get_reduction_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/repartition.cc b/lib/local-execution/src/ops/repartition.cc index 6c0c813c8d..7b6e9fe2f6 100644 --- a/lib/local-execution/src/ops/repartition.cc +++ b/lib/local-execution/src/ops/repartition.cc @@ -85,8 +85,8 @@ static std::optional ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto output_grad = acc.get_tensor_grad(INPUT); + auto input_grad = acc.get_tensor_grad(OUTPUT); return profile(backward_kernel, profiling, diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc index d3ada35d93..99aeb913ba 100644 --- a/lib/local-execution/src/ops/replicate.cc +++ b/lib/local-execution/src/ops/replicate.cc @@ -66,8 +66,8 @@ static std::optional return profile(backward_kernel, profiling, "[replicate] backward_time = {:.2lf}ms\n", - input_grad, output_grad, + input_grad, attrs.replicate_degree.unwrap_nonnegative()); } diff --git a/lib/local-execution/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc index fc3a75607d..e382b2668e 100644 --- a/lib/local-execution/src/ops/reshape.cc +++ b/lib/local-execution/src/ops/reshape.cc @@ -86,8 +86,8 @@ static std::optional profiling, "[Reshape] backward time = {:.2lf}ms\n", per_device_state, - input_grad, - output_grad); + output_grad, + input_grad); } TaskImplFunction get_reshape_init_task_impl() { diff --git a/lib/local-execution/src/ops/reverse.cc 
b/lib/local-execution/src/ops/reverse.cc index ddd47d355d..00f56c6892 100644 --- a/lib/local-execution/src/ops/reverse.cc +++ b/lib/local-execution/src/ops/reverse.cc @@ -48,30 +48,12 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); - nonnegative_int output_size = output.shape.get_volume(); - auto axis = attrs.axis; - nonnegative_int in_blk_size = 1_n; - nonnegative_int reverse_dim_size = 1_n; - nonnegative_int num_out_blks = 1_n; - for (nonnegative_int i : nonnegative_range(output.shape.get_dim())) { - if (i < axis.value) { - in_blk_size *= output.shape.at(ff_dim_t{i}); - } else if (i == axis.value) { - reverse_dim_size = output.shape.at(ff_dim_t{i}); - } else { - num_out_blks *= output.shape.at(ff_dim_t{i}); - } - } - return profile(forward_kernel, profiling, "[reverse] forward_time = {:.2lf}ms\n", - input.get_float_ptr(), - output.get_float_ptr(), - num_out_blks.unwrap_nonnegative(), - reverse_dim_size.unwrap_nonnegative(), - in_blk_size.unwrap_nonnegative(), - output_size.unwrap_nonnegative()); + input, + output, + attrs); } static std::optional @@ -81,30 +63,12 @@ static std::optional auto output_grad = acc.get_tensor_grad(OUTPUT); auto attrs = acc.get_argument(ATTRS); - int axis = input_grad.shape.num_dims().unwrap_nonnegative() - - attrs.axis.value.unwrap_nonnegative() - 1; - nonnegative_int in_blk_size = 1_n; - nonnegative_int reverse_dim_size = 1_n; - nonnegative_int num_out_blks = 1_n; - for (nonnegative_int i : nonnegative_range(input_grad.shape.get_dim())) { - if (i < axis) { - in_blk_size *= input_grad.shape.at(ff_dim_t{i}); - } else if (i == axis) { - reverse_dim_size = input_grad.shape.at(ff_dim_t{i}); - } else { - num_out_blks *= input_grad.shape.at(ff_dim_t{i}); - } - } - return profile(backward_kernel, profiling, "[reverse] backward_time = {:.2lf}ms\n", - output_grad.get_float_ptr(), - input_grad.get_float_ptr(), - 
num_out_blks.unwrap_nonnegative(), - reverse_dim_size.unwrap_nonnegative(), - in_blk_size.unwrap_nonnegative(), - input_grad.shape.get_volume().unwrap_nonnegative()); + output_grad, + input_grad, + attrs); } TaskImplFunction get_reverse_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc index 0e94422c5f..e008098e05 100644 --- a/lib/local-execution/src/ops/softmax.cc +++ b/lib/local-execution/src/ops/softmax.cc @@ -106,8 +106,8 @@ static std::optional return profile(backward_kernel, profiling, "[SoftMax] backward_time = {:.2lf}ms\n", - input_grad.get_float_ptr(), output_grad.get_float_ptr(), + input_grad.get_float_ptr(), output_grad.shape.get_volume().unwrap_nonnegative()); } diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc index 4146836b9a..1859bb0ccc 100644 --- a/lib/local-execution/src/ops/transpose.cc +++ b/lib/local-execution/src/ops/transpose.cc @@ -67,8 +67,8 @@ static std::optional profiling, "[Transpose] Backward_time = {:.2lf} [ms]", attrs, - input_grad, - output_grad); + output_grad, + input_grad); } OpTaskInvocation backward(TransposeAttrs const &attrs) { diff --git a/lib/local-execution/src/per_device_state.cc b/lib/local-execution/src/per_device_op_state.cc similarity index 100% rename from lib/local-execution/src/per_device_state.cc rename to lib/local-execution/src/per_device_op_state.cc diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index e6c3a11711..ed181aea32 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -23,8 +23,13 @@ size_t TrackedAllocator::get_current_mem_usage() { return this->current_mem_usage; } +DeviceType TrackedAllocator::get_allocation_device_type() const { + return this->allocator.get_allocation_device_type(); +} + Allocator get_tracked_memory_allocator(Allocator const &base_allocator) { - return 
Allocator::create(base_allocator); + Allocator allocator = Allocator::create(base_allocator); + return allocator; } } // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index da3af6e3ad..9f8b4092c1 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -12,68 +12,71 @@ // TEST_SUITE(FF_CUDA_TEST_SUITE) { // TEST_CASE("Local Cost Estimator") { // // local backing initialization -// ManagedPerDeviceFFHandle managed_handle{}; +// ManagedPerDeviceFFHandle managed_handle{ +// /*workSpaceSize=*/1024 * 1024, +// /*allowTensorOpMathConversion=*/true}; -// RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ -// DeviceSpecific::create(managed_handle.raw_handle()), -// EnableProfiling::YES, -// ProfilingSettings{/*warmup_iters=*/0, -// /*measure_iters=*/1}}; +// RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ +// DeviceSpecific::create(managed_handle.raw_handle()), +// EnableProfiling::YES, +// ProfilingSettings{/*warmup_iters=*/0, +// /*measure_iters=*/1}}; -// LocalCostEstimator cost_estimator = -// LocalCostEstimator{runtime_arg_config}; +// LocalCostEstimator cost_estimator = +// LocalCostEstimator{runtime_arg_config}; -// SUBCASE("Estimate cost -- Attention Op") { -// int embed_dim = 32; -// int num_heads = 10; -// MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ -// /*embed_dim=*/embed_dim, -// /*num_heads=*/num_heads, -// /*kdim=*/embed_dim, -// /*vdim=*/embed_dim, -// /*dropout=*/0.0, -// /*bias=*/true, -// /*add_bias_kv=*/false, -// /*add_zero_attn=*/false, -// }; +// SUBCASE("Estimate cost -- Attention Op") { +// int embed_dim = 32; +// int num_heads = 10; +// MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ +// /*embed_dim=*/embed_dim, +// /*num_heads=*/num_heads, +// /*kdim=*/embed_dim, +// /*vdim=*/embed_dim, +// /*dropout=*/0.0, +// /*bias=*/true, +// 
/*add_bias_kv=*/false, +// /*add_zero_attn=*/false, +// }; -// size_t batch_size = 40; -// size_t seq_len = 48; -// size_t feature_size = 36; +// size_t batch_size = 40; +// size_t seq_len = 48; +// size_t feature_size = 36; -// DataType dtype = DataType::FLOAT; -// ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ -// TensorDims{FFOrdered{batch_size, seq_len, feature_size}}, -// DataType::FLOAT, -// }); +// DataType dtype = DataType::FLOAT; +// ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ +// TensorDims{FFOrdered{batch_size, seq_len, +// feature_size}}, DataType::FLOAT, +// }); -// ParallelTensorShape weights_shape = throw_if_unexpected( -// get_weights_shape(attrs, inputs_shape, inputs_shape, -// inputs_shape)); -// ParallelTensorAttrs weight_attrs = -// ParallelTensorAttrs{weights_shape, -// /*sync_type=*/std::nullopt, -// /*initializer=*/std::nullopt, -// CreateGrad::YES}; +// ParallelTensorShape weights_shape = throw_if_unexpected( +// get_weights_shape(attrs, inputs_shape, inputs_shape, +// inputs_shape)); +// ParallelTensorAttrs weight_attrs = +// ParallelTensorAttrs{weights_shape, +// /*sync_type=*/std::nullopt, +// /*initializer=*/std::nullopt, +// CreateGrad::YES}; -// ParallelTensorShape output_shape = throw_if_unexpected( -// get_output_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); -// ParallelTensorAttrs output_attrs = -// ParallelTensorAttrs{output_shape, -// /*sync_type=*/std::nullopt, -// /*initializer=*/std::nullopt, -// CreateGrad::YES}; +// ParallelTensorShape output_shape = throw_if_unexpected( +// get_output_shape(attrs, inputs_shape, inputs_shape, +// inputs_shape)); +// ParallelTensorAttrs output_attrs = +// ParallelTensorAttrs{output_shape, +// /*sync_type=*/std::nullopt, +// /*initializer=*/std::nullopt, +// CreateGrad::YES}; -// CostDetails result = cost_estimator.estimate_cost( -// PCGOperatorAttrs{attrs}, -// std::vector{ -// inputs_shape, inputs_shape, inputs_shape}, -// 
std::vector{weight_attrs}, -// std::vector{output_attrs}, -// make_1d_machine_view(gpu_id_t{0}, gpu_id_t{1})); +// CostDetails result = cost_estimator.estimate_cost( +// PCGOperatorAttrs{attrs}, +// std::vector{ +// inputs_shape, inputs_shape, inputs_shape}, +// std::vector{weight_attrs}, +// std::vector{output_attrs}, +// make_1d_machine_view(gpu_id_t{0}, gpu_id_t{1})); -// CHECK(result.total_elapsed_time > 0); -// CHECK(result.total_mem_usage > 0); +// CHECK(result.total_elapsed_time > 0); +// CHECK(result.total_mem_usage > 0); +// } +// } // } -// } -// } diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc index dffb19398c..e55d1eddf5 100644 --- a/lib/local-execution/test/src/test_local_slots_backing.cc +++ b/lib/local-execution/test/src/test_local_slots_backing.cc @@ -1,6 +1,6 @@ #include "kernels/attention_kernels.h" +#include "kernels/local_cpu_allocator.h" #include "local-execution/local_cost_estimator.h" -#include "local-execution/local_cpu_allocator.h" #include "local-execution/local_slots_backing.h" #include "op-attrs/ops/attention.h" #include "op-attrs/parallel_tensor_shape.h" @@ -106,24 +106,24 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( query_guid, local_slots_backing.gradient_tensor_mapping); - std::pair correct = {ArrayShape{query_shape}, - dtype}; + std::pair correct = { + array_shape_from_tensor_shape(query_shape), dtype}; CHECK(result == correct); } SUBCASE("Key grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( key_guid, local_slots_backing.gradient_tensor_mapping); - std::pair correct = {ArrayShape{key_shape}, - dtype}; + std::pair correct = { + array_shape_from_tensor_shape(key_shape), dtype}; CHECK(result == correct); } SUBCASE("Value grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( value_guid, local_slots_backing.gradient_tensor_mapping); - std::pair 
correct = {ArrayShape{value_shape}, - dtype}; + std::pair correct = { + array_shape_from_tensor_shape(value_shape), dtype}; CHECK(result == correct); } } @@ -135,9 +135,9 @@ TEST_SUITE(FF_TEST_SUITE) { get_result_shape_and_dtype_for_tensor_guid_and_map( output_guid, local_slots_backing.tensor_mapping); std::pair correct = { - ArrayShape{ + array_shape_from_tensor_shape( get_tensor_attrs(cg_builder.computation_graph, output_guid) - .shape}, + .shape), dtype}; CHECK(result == correct); } @@ -146,9 +146,9 @@ TEST_SUITE(FF_TEST_SUITE) { get_result_shape_and_dtype_for_tensor_guid_and_map( output_guid, local_slots_backing.gradient_tensor_mapping); std::pair correct = { - ArrayShape{ + array_shape_from_tensor_shape( get_tensor_attrs(cg_builder.computation_graph, output_guid) - .shape}, + .shape), dtype}; CHECK(result == correct); } diff --git a/lib/local-execution/test/src/test_local_task_arg_accessor.cc b/lib/local-execution/test/src/test_local_task_arg_accessor.cc index 0fab0f6a60..a39bb229e2 100644 --- a/lib/local-execution/test/src/test_local_task_arg_accessor.cc +++ b/lib/local-execution/test/src/test_local_task_arg_accessor.cc @@ -1,5 +1,5 @@ #include "doctest/doctest.h" -#include "local-execution/local_cpu_allocator.h" +#include "kernels/local_cpu_allocator.h" #include "local-execution/local_task_argument_accessor.h" #include "local-execution/task_signature_impl.h" #include "utils/fmt/variant.h" diff --git a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml index 27aa50f38f..09ee99915d 100644 --- a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml +++ b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml @@ -10,5 +10,6 @@ features = [ [[values]] name = "SUM" -[[value]] +[[values]] name = "AVG" + diff --git a/lib/op-attrs/include/op-attrs/datatype_value.h b/lib/op-attrs/include/op-attrs/datatype_value.h new file mode 100644 index 0000000000..723e69bddd --- /dev/null +++ 
b/lib/op-attrs/include/op-attrs/datatype_value.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H + +#include "op-attrs/datatype_value.dtg.h" + +namespace FlexFlow { + +DataTypeValue make_float_data_type_value(float value); +DataTypeValue make_double_data_type_value(double value); +DataTypeValue make_int32_data_type_value(int32_t value); +DataTypeValue make_int64_data_type_value(int64_t value); +DataTypeValue make_bool_data_type_value(bool value); + +} // namespace FlexFlow + +#endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h index f2355289dc..5c47745209 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h +++ b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h @@ -17,13 +17,9 @@ struct DimOrdered { DimOrdered(std::initializer_list const &l) : contents(l.begin(), l.end()) {} - /* template ::value>::type> */ DimOrdered(std::vector const &contents) : contents(contents.begin(), contents.end()) {} - /* template ::value>::type> */ template DimOrdered(It begin, It end) : contents(begin, end) {} @@ -62,10 +58,6 @@ struct DimOrdered { return this->contents != other.contents; } - bool operator<(DimOrdered const &other) const { - return this->contents < other.contents; - } - using iterator = typename stack_vector::iterator; using const_iterator = typename stack_vector::const_iterator; @@ -116,7 +108,7 @@ struct DimOrdered { } reverse_iterator rend() { - return this->contents.crend(); + return this->contents.rend(); } const_reverse_iterator rend() const { @@ -145,195 +137,26 @@ struct DimOrdered { stack_vector contents; }; -template -struct DimOrdered { - DimOrdered() {} - - DimOrdered(std::initializer_list const &l) - : contents(l.begin(), l.end()) {} - - DimOrdered(std::vector const &contents) - : 
contents(contents.begin(), contents.end()) {} - - template - DimOrdered(It begin, It end) : contents(begin, end) {} - - template - DimOrdered(stack_vector const &contents) - : contents(contents.begin(), contents.end()) {} - - T const &at(ff_dim_t idx) const { - int raw = idx.value.unwrap_nonnegative(); - return this->contents.at(raw); - } - - T const &at(relative_ff_dim_t idx) const { - int raw = idx.value; - if (raw < 0) { - raw = this->contents.size() + raw; - } - return this->contents.at(raw); - } - - T &at(ff_dim_t idx) { - int raw = idx.value.unwrap_nonnegative(); - return this->contents.at(raw); - } - - T &at(relative_ff_dim_t idx) { - int raw = idx.value; - if (raw < 0) { - raw = this->contents.size() + raw; - } - return this->contents.at(raw); - } - - T const &operator[](ff_dim_t idx) const { - return this->at(idx); - } - - T const &operator[](relative_ff_dim_t idx) const { - return this->at(idx); - } - - T &operator[](ff_dim_t idx) { - return this->at(idx); - } - - T &operator[](relative_ff_dim_t idx) { - return this->at(idx); - } - - bool idx_is_valid(ff_dim_t const &idx) const { - int raw = idx.value.unwrap_nonnegative(); - return raw < this->contents.size(); - } - - bool idx_is_valid(relative_ff_dim_t const &idx) const { - int raw = idx.value; - if (raw < 0) { - raw = this->contents.size() + raw; - } - return (raw >= 0 && raw < this->contents.size()); - } - - bool operator==(DimOrdered const &other) const { - return this->contents == other.contents; - } - - bool operator!=(DimOrdered const &other) const { - return this->contents != other.contents; - } - - bool operator<(DimOrdered const &other) const { - return this->contents < other.contents; - } - - using iterator = typename stack_vector::iterator; - using const_iterator = - typename stack_vector::const_iterator; - using reverse_iterator = - typename stack_vector::reverse_iterator; - using const_reverse_iterator = - typename stack_vector::const_reverse_iterator; - using value_type = T; - using pointer 
= value_type *; - using const_pointer = value_type const *; - using reference = value_type &; - using const_reference = value_type const &; - - iterator begin() { - return this->contents.begin(); - } - - const_iterator begin() const { - return this->cbegin(); - } - - const_iterator cbegin() const { - return this->contents.cbegin(); - } - - iterator end() { - return this->contents.end(); - } - - const_iterator end() const { - return this->cend(); - } - - const_iterator cend() const { - return this->contents.cend(); - } - - reverse_iterator rbegin() { - return this->contents.rbegin(); - } - - const_reverse_iterator rbegin() const { - return this->crbegin(); - } - - const_reverse_iterator crbegin() const { - return this->contents.crbegin(); - } - - reverse_iterator rend() { - return this->contents.crend(); - } - - const_reverse_iterator rend() const { - return this->crend(); - } - - const_reverse_iterator crend() const { - return this->contents.crend(); - } - - size_t size() const { - return this->contents.size(); - } - - size_t empty() const { - return this->contents.empty(); - } - - size_t num_dims() const { - return this->size(); - } - - friend struct ::std::hash; - -private: - stack_vector contents; -}; - -template -using FFOrdered = DimOrdered; +template +auto operator<(DimOrdered const &lhs, DimOrdered const &rhs) + -> std::enable_if_t, bool> { + return std::lexicographical_compare( + lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend()); +} -template -std::string format_as(FFOrdered const &v) { +template +std::string format_as(DimOrdered const &v) { std::vector as_vec(v.cbegin(), v.cend()); return fmt::format("", as_vec); } -template -std::ostream &operator<<(std::ostream &s, FFOrdered const &v) { +template +std::ostream &operator<<(std::ostream &s, DimOrdered const &v) { return (s << fmt::to_string(v)); } } // namespace FlexFlow -/* template */ -/* void to_json(json &j, DimOrdered const &x) { */ -/* /1* j = std::vector{x.cbegin(), x.cend()}; *1/ */ -/* } */ - 
-/* template */ -/* void from_json(json const &j, DimOrdered &x) { */ -/* /1* x = DimOrdered{j.template get>()}; *1/ */ -/* } */ - namespace nlohmann { template struct adl_serializer<::FlexFlow::DimOrdered> { diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/slice.h b/lib/op-attrs/include/op-attrs/dim_ordered/slice.h index 166916dd44..76526447be 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/slice.h +++ b/lib/op-attrs/include/op-attrs/dim_ordered/slice.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_SLICE_H #include "op-attrs/dim_ordered/dim_ordered.h" -#include "utils/containers/subvec.h" +#include "utils/containers/slice.h" #include "utils/containers/transform.h" #include "utils/containers/vector_of.h" #include "utils/optional.h" @@ -18,35 +18,8 @@ DimOrdered nonoverloaded_slice(DimOrdered const &d, }; return DimOrdered{ - subvec(vector_of(d), to_raw_idx(start), to_raw_idx(end))}; + slice(vector_of(d), to_raw_idx(start), to_raw_idx(end))}; } - -template -FFOrdered ff_dim_t_nonoverloaded_slice(FFOrdered const &d, - std::optional const &start, - std::optional const &end) { - auto to_raw_idx = - [](std::optional const &idx) -> std::optional { - return transform( - idx, [](ff_dim_t const &i) { return i.value.unwrap_nonnegative(); }); - }; - - return FFOrdered{subvec(vector_of(d), to_raw_idx(start), to_raw_idx(end))}; -} - -template -FFOrdered relative_ff_dim_t_nonoverloaded_slice( - FFOrdered const &d, - std::optional const &start, - std::optional const &end) { - auto to_raw_idx = - [](std::optional const &idx) -> std::optional { - return transform(idx, [](relative_ff_dim_t const &i) { return i.value; }); - }; - - return FFOrdered{subvec(vector_of(d), to_raw_idx(start), to_raw_idx(end))}; -} - template DimOrdered slice(DimOrdered const &d, std::optional const &start = std::nullopt, @@ -54,20 +27,6 @@ DimOrdered slice(DimOrdered const &d, return ff_dim_t_nonoverloaded_slice(d, start, end); } -template -FFOrdered 
slice(FFOrdered const &d, - std::optional const &start = std::nullopt, - std::optional const &end = std::nullopt) { - return ff_dim_t_nonoverloaded_slice(d, start, end); -} - -template -FFOrdered slice(FFOrdered const &d, - std::optional const &start = std::nullopt, - std::optional const &end = std::nullopt) { - return relative_ff_dim_t_nonoverloaded_slice(d, start, end); -} - } // namespace FlexFlow #endif diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/concat.h b/lib/op-attrs/include/op-attrs/ff_ordered/concat.h similarity index 95% rename from lib/op-attrs/include/op-attrs/dim_ordered/concat.h rename to lib/op-attrs/include/op-attrs/ff_ordered/concat.h index 9b9eaf9b93..a5faed2b36 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/concat.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/concat.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_CONCAT_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_CONCAT_H -#include "op-attrs/dim_ordered/dim_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered.h" #include "utils/containers/concat_vectors.h" #include "utils/containers/transform.h" diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/enumerate.h b/lib/op-attrs/include/op-attrs/ff_ordered/enumerate.h similarity index 95% rename from lib/op-attrs/include/op-attrs/dim_ordered/enumerate.h rename to lib/op-attrs/include/op-attrs/ff_ordered/enumerate.h index 9e4271a1ff..bc8636615c 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/enumerate.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/enumerate.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_ENUMERATE_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_ENUMERATE_H -#include "op-attrs/dim_ordered/dim_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered.h" #include "utils/bidict/bidict.h" #include "utils/containers/count.h" diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h 
b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h new file mode 100644 index 0000000000..92ed211c31 --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h @@ -0,0 +1,228 @@ +#ifndef _FLEXFLOW_OPATTRS_INCLUDE_OPATTRS_DIM_ORDERED_FF_ORDERED_H +#define _FLEXFLOW_OPATTRS_INCLUDE_OPATTRS_DIM_ORDERED_FF_ORDERED_H + +#include "op-attrs/ff_dim_t.dtg.h" +#include "op-attrs/relative_ff_dim_t.dtg.h" +#include "utils/fmt/vector.h" +#include "utils/stack_vector/stack_vector.h" + +namespace FlexFlow { + +template +struct FFOrdered { + FFOrdered() {} + + FFOrdered(std::initializer_list const &l) : contents(l.begin(), l.end()) {} + + FFOrdered(std::vector const &contents) + : contents(contents.begin(), contents.end()) {} + + template + FFOrdered(It begin, It end) : contents(begin, end) {} + + template + FFOrdered(stack_vector const &contents) + : contents(contents.begin(), contents.end()) {} + + T const &at(ff_dim_t idx) const { + int raw = idx.value.unwrap_nonnegative(); + return this->contents.at(raw); + } + + T const &at(relative_ff_dim_t idx) const { + int raw = idx.value; + if (raw < 0) { + raw = this->contents.size() + raw; + } + return this->contents.at(raw); + } + + T &at(ff_dim_t idx) { + int raw = idx.value.unwrap_nonnegative(); + return this->contents.at(raw); + } + + T &at(relative_ff_dim_t idx) { + int raw = idx.value; + if (raw < 0) { + raw = this->contents.size() + raw; + } + return this->contents.at(raw); + } + + T const &operator[](ff_dim_t idx) const { + return this->at(idx); + } + + T const &operator[](relative_ff_dim_t idx) const { + return this->at(idx); + } + + T &operator[](ff_dim_t idx) { + return this->at(idx); + } + + T &operator[](relative_ff_dim_t idx) { + return this->at(idx); + } + + bool idx_is_valid(ff_dim_t const &idx) const { + int raw = idx.value.unwrap_nonnegative(); + return raw < this->contents.size(); + } + + bool idx_is_valid(relative_ff_dim_t const &idx) const { + int raw = idx.value; + if (raw < 0) { + raw = 
this->contents.size() + raw; + } + return (raw >= 0 && raw < this->contents.size()); + } + + bool operator==(FFOrdered const &other) const { + return this->contents == other.contents; + } + + bool operator!=(FFOrdered const &other) const { + return this->contents != other.contents; + } + + using iterator = typename stack_vector::iterator; + using const_iterator = + typename stack_vector::const_iterator; + using reverse_iterator = + typename stack_vector::reverse_iterator; + using const_reverse_iterator = + typename stack_vector::const_reverse_iterator; + using value_type = T; + using pointer = value_type *; + using const_pointer = value_type const *; + using reference = value_type &; + using const_reference = value_type const &; + + iterator begin() { + return this->contents.begin(); + } + + const_iterator begin() const { + return this->cbegin(); + } + + const_iterator cbegin() const { + return this->contents.cbegin(); + } + + iterator end() { + return this->contents.end(); + } + + const_iterator end() const { + return this->cend(); + } + + const_iterator cend() const { + return this->contents.cend(); + } + + reverse_iterator rbegin() { + return this->contents.rbegin(); + } + + const_reverse_iterator rbegin() const { + return this->crbegin(); + } + + const_reverse_iterator crbegin() const { + return this->contents.crbegin(); + } + + reverse_iterator rend() { + return this->contents.rend(); + } + + const_reverse_iterator rend() const { + return this->crend(); + } + + const_reverse_iterator crend() const { + return this->contents.crend(); + } + + size_t size() const { + return this->contents.size(); + } + + size_t empty() const { + return this->contents.empty(); + } + + size_t num_dims() const { + return this->size(); + } + + friend struct ::std::hash; + +private: + stack_vector contents; +}; + +template +auto operator<(FFOrdered const &lhs, FFOrdered const &rhs) + -> std::enable_if_t, bool> { + return std::lexicographical_compare( + lhs.cbegin(), lhs.cend(), 
rhs.cbegin(), rhs.cend()); +} + +template +std::string format_as(FFOrdered const &v) { + std::vector as_vec(v.cbegin(), v.cend()); + return fmt::format("", as_vec); +} + +template +std::ostream &operator<<(std::ostream &s, FFOrdered const &v) { + return (s << fmt::to_string(v)); +} + +} // namespace FlexFlow + +namespace nlohmann { +template +struct adl_serializer<::FlexFlow::FFOrdered> { + static ::FlexFlow::FFOrdered from_json(nlohmann::json const &j) { + return {j.template get>()}; + } + + static void to_json(nlohmann::json &j, ::FlexFlow::FFOrdered const &x) { + j = std::vector{x.cbegin(), x.cend()}; + } +}; +} // namespace nlohmann + +namespace std { + +template +struct hash<::FlexFlow::FFOrdered> { + size_t operator()(::FlexFlow::FFOrdered const &t) const { + static_assert(::FlexFlow::is_hashable::value, + "Elements must be hashable"); + + return get_std_hash(t.contents); + } +}; + +} // namespace std + +namespace rc { + +template +struct Arbitrary<::FlexFlow::FFOrdered> { + static Gen<::FlexFlow::FFOrdered> arbitrary() { + return gen::construct<::FlexFlow::FFOrdered>( + gen::arbitrary<::FlexFlow::stack_vector>()); + } +}; + +} // namespace rc + +#endif diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_from_map.h b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_from_map.h similarity index 88% rename from lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_from_map.h rename to lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_from_map.h index f8f49233ec..9232afddfb 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_from_map.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_from_map.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_FROM_MAP_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_FROM_MAP_H -#include "op-attrs/dim_ordered/dim_ordered.h" -#include "op-attrs/dim_ordered/ff_ordered_of.h" #include "op-attrs/ff_dim_t.h" +#include 
"op-attrs/ff_ordered/ff_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" namespace FlexFlow { diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_of.h b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_of.h similarity index 88% rename from lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_of.h rename to lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_of.h index 8cc1bf3a51..ace60b7e3d 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_of.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_of.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_OF_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_OF_H -#include "op-attrs/dim_ordered/dim_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered.h" namespace FlexFlow { diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/get_idxs.h b/lib/op-attrs/include/op-attrs/ff_ordered/get_idxs.h similarity index 91% rename from lib/op-attrs/include/op-attrs/dim_ordered/get_idxs.h rename to lib/op-attrs/include/op-attrs/ff_ordered/get_idxs.h index 4e7f8530a4..5ff390d3fe 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/get_idxs.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/get_idxs.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_GET_IDXS_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_GET_IDXS_H -#include "op-attrs/dim_ordered/dim_ordered.h" #include "op-attrs/ff_dim_t.h" +#include "op-attrs/ff_ordered/ff_ordered.h" #include "utils/containers/count.h" #include "utils/containers/transform.h" diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/slice.h b/lib/op-attrs/include/op-attrs/ff_ordered/slice.h new file mode 100644 index 0000000000..79217c4cc3 --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_ordered/slice.h @@ -0,0 +1,49 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_SLICE_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_SLICE_H 
+ +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "utils/containers/slice.h" +#include "utils/containers/transform.h" +#include "utils/containers/vector_of.h" + +namespace FlexFlow { + +template +FFOrdered ff_dim_t_nonoverloaded_slice(FFOrdered const &d, + ff_dim_t const &start, + std::optional const &end) { + int raw_start = start.value.unwrap_nonnegative(); + std::optional raw_end = transform( + end, [](ff_dim_t const &i) { return i.value.unwrap_nonnegative(); }); + return FFOrdered{slice(vector_of(d), raw_start, raw_end)}; +} + +template +FFOrdered relative_ff_dim_t_nonoverloaded_slice( + FFOrdered const &d, + relative_ff_dim_t const &start, + std::optional const &end) { + int raw_start = start.value; + std::optional raw_end = + transform(end, [](relative_ff_dim_t const &i) { return i.value; }); + + return FFOrdered{slice(vector_of(d), raw_start, raw_end)}; +} + +template +FFOrdered slice(FFOrdered const &d, + ff_dim_t const &start = ff_dim_t{0_n}, + std::optional const &end = std::nullopt) { + return ff_dim_t_nonoverloaded_slice(d, start, end); +} + +template +FFOrdered slice(FFOrdered const &d, + relative_ff_dim_t const &start = relative_ff_dim_t{0}, + std::optional const &end = std::nullopt) { + return relative_ff_dim_t_nonoverloaded_slice(d, start, end); +} + +} // namespace FlexFlow + +#endif diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/transform.h b/lib/op-attrs/include/op-attrs/ff_ordered/transform.h new file mode 100644 index 0000000000..3a8eeb9ecf --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_ordered/transform.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_TRANSFORM_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_TRANSFORM_H + +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "utils/containers/vector_of.h" +#include "utils/containers/vector_transform.h" + +namespace FlexFlow { + +template > +FFOrdered transform(FFOrdered const &d, F &&f) { + return 
FFOrdered{vector_transform(vector_of(d), f)}; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/zip.h b/lib/op-attrs/include/op-attrs/ff_ordered/zip.h new file mode 100644 index 0000000000..fe207740f7 --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_ordered/zip.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_ZIP_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_ZIP_H + +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "utils/containers/vector_of.h" +#include "utils/containers/zip.h" + +namespace FlexFlow { + +template +FFOrdered> zip(FFOrdered const &lhs, + FFOrdered const &rhs) { + return FFOrdered>{zip(vector_of(lhs), vector_of(rhs))}; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml index b1c5f60382..50756f095b 100644 --- a/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml @@ -12,7 +12,7 @@ features = [ includes = [ "op-attrs/ff_dim_t.h", "op-attrs/ff_dim_t.dtg.h", - "op-attrs/dim_ordered/dim_ordered.h", + "op-attrs/ff_ordered/ff_ordered.h", ] [[fields]] diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml b/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml index be3a95eec8..d68ef02ec1 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml @@ -12,7 +12,7 @@ features = [ includes = [ "op-attrs/parallel_tensor_shape/sum_degree.dtg.h", "op-attrs/parallel_tensor_shape/discard_copy_degree.dtg.h", - "op-attrs/dim_ordered/dim_ordered.h", + "op-attrs/ff_ordered/ff_ordered.h", "utils/nonnegative_int/nonnegative_int.h", ] diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml 
b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml index f24fa12309..d2f8758377 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml @@ -10,7 +10,7 @@ features = [ ] includes = [ - "op-attrs/dim_ordered/dim_ordered.h", + "op-attrs/ff_ordered/ff_ordered.h", "op-attrs/shard_parallel_dim.dtg.h", "op-attrs/replica_parallel_dim_set.dtg.h", "", diff --git a/lib/op-attrs/include/op-attrs/tensor_dims.h b/lib/op-attrs/include/op-attrs/tensor_dims.h index 97f3432c2f..ba35295e09 100644 --- a/lib/op-attrs/include/op-attrs/tensor_dims.h +++ b/lib/op-attrs/include/op-attrs/tensor_dims.h @@ -19,7 +19,7 @@ std::optional get_broadcast_target_dims(std::unordered_set const &); TensorDims slice_tensor_dims(TensorDims const &, - std::optional const &start, + relative_ff_dim_t const &start, std::optional const &stop); } // namespace FlexFlow diff --git a/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml b/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml index e86b866fd6..8c6d1098cc 100644 --- a/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml +++ b/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml @@ -10,7 +10,7 @@ features = [ ] includes = [ - "op-attrs/dim_ordered/dim_ordered.h", + "op-attrs/ff_ordered/ff_ordered.h", "utils/nonnegative_int/nonnegative_int.h", ] diff --git a/lib/op-attrs/include/op-attrs/tensor_shape.h b/lib/op-attrs/include/op-attrs/tensor_shape.h index a3cd8bfd9a..298ea04638 100644 --- a/lib/op-attrs/include/op-attrs/tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/tensor_shape.h @@ -12,7 +12,7 @@ nonnegative_int get_num_elements(TensorShape const &); nonnegative_int get_size_in_bytes(TensorShape const &); TensorShape slice_tensor_shape(TensorShape const &, - std::optional const &start, + relative_ff_dim_t const &start, std::optional const &stop); } // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/datatype_value.cc 
b/lib/op-attrs/src/op-attrs/datatype_value.cc new file mode 100644 index 0000000000..4604ef0b4e --- /dev/null +++ b/lib/op-attrs/src/op-attrs/datatype_value.cc @@ -0,0 +1,25 @@ +#include "op-attrs/datatype_value.h" + +namespace FlexFlow { + +DataTypeValue make_float_data_type_value(float value) { + return DataTypeValue{value}; +} + +DataTypeValue make_double_data_type_value(double value) { + return DataTypeValue{value}; +} + +DataTypeValue make_int32_data_type_value(int32_t value) { + return DataTypeValue{value}; +} + +DataTypeValue make_int64_data_type_value(int64_t value) { + return DataTypeValue{value}; +} + +DataTypeValue make_bool_data_type_value(bool value) { + return DataTypeValue{value}; +} + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/concat.cc b/lib/op-attrs/src/op-attrs/dim_ordered/concat.cc deleted file mode 100644 index cb29f708a3..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/concat.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/concat.h" diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/enumerate.cc b/lib/op-attrs/src/op-attrs/dim_ordered/enumerate.cc deleted file mode 100644 index 6edd5485af..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/enumerate.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/enumerate.h" diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_from_map.cc b/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_from_map.cc deleted file mode 100644 index 2de88f38c8..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_from_map.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/ff_ordered_from_map.h" diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_of.cc b/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_of.cc deleted file mode 100644 index 8e5c2fd38a..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_of.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/ff_ordered_of.h" diff --git 
a/lib/op-attrs/src/op-attrs/dim_ordered/get_idxs.cc b/lib/op-attrs/src/op-attrs/dim_ordered/get_idxs.cc deleted file mode 100644 index 175ae8d4bd..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/get_idxs.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/get_idxs.h" diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc b/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc index 75ab1a32aa..8c3dbd7bbc 100644 --- a/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc +++ b/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc @@ -1,26 +1 @@ #include "op-attrs/dim_ordered/slice.h" -#include "utils/archetypes/value_type.h" - -namespace FlexFlow { - -using T = value_type<0>; - -template FFOrdered - ff_dim_t_nonoverloaded_slice(FFOrdered const &d, - std::optional const &start, - std::optional const &end); - -template FFOrdered relative_ff_dim_t_nonoverloaded_slice( - FFOrdered const &d, - std::optional const &start, - std::optional const &end); - -template FFOrdered slice(FFOrdered const &d, - std::optional const &start, - std::optional const &end); - -template FFOrdered slice(FFOrdered const &d, - std::optional const &start, - std::optional const &end); - -} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/transform.cc b/lib/op-attrs/src/op-attrs/dim_ordered/transform.cc new file mode 100644 index 0000000000..73683eba94 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/dim_ordered/transform.cc @@ -0,0 +1 @@ +#include "op-attrs/dim_ordered/transform.h" diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/enumerate.cc b/lib/op-attrs/src/op-attrs/ff_ordered/enumerate.cc new file mode 100644 index 0000000000..e06c144149 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/enumerate.cc @@ -0,0 +1,10 @@ +#include "op-attrs/ff_ordered/enumerate.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template std::map enumerate(FFOrdered const &); + +} // namespace FlexFlow diff --git 
a/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered.cc b/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered.cc new file mode 100644 index 0000000000..1420586809 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered.cc @@ -0,0 +1,14 @@ +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template struct FFOrdered; + +template std::string format_as(FFOrdered const &); + +template std::ostream &operator<<(std::ostream &, FFOrdered const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered_from_map.cc b/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered_from_map.cc new file mode 100644 index 0000000000..e39fedb858 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered_from_map.cc @@ -0,0 +1,13 @@ +#include "op-attrs/ff_ordered/ff_ordered_from_map.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template FFOrdered ff_ordered_from_map(std::map const &); + +template FFOrdered + ff_ordered_from_map(std::unordered_map const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/get_idxs.cc b/lib/op-attrs/src/op-attrs/ff_ordered/get_idxs.cc new file mode 100644 index 0000000000..3da15bebba --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/get_idxs.cc @@ -0,0 +1,10 @@ +#include "op-attrs/ff_ordered/get_idxs.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template std::vector get_idxs(FFOrdered const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/slice.cc b/lib/op-attrs/src/op-attrs/ff_ordered/slice.cc new file mode 100644 index 0000000000..059fd811cd --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/slice.cc @@ -0,0 +1,24 @@ +#include "op-attrs/ff_ordered/slice.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + 
+template FFOrdered ff_dim_t_nonoverloaded_slice( + FFOrdered const &, ff_dim_t const &, std::optional const &); + +template FFOrdered relative_ff_dim_t_nonoverloaded_slice( + FFOrdered const &, + relative_ff_dim_t const &, + std::optional const &); + +template FFOrdered slice(FFOrdered const &, + ff_dim_t const &, + std::optional const &); + +template FFOrdered slice(FFOrdered const &, + relative_ff_dim_t const &, + std::optional const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/transform.cc b/lib/op-attrs/src/op-attrs/ff_ordered/transform.cc new file mode 100644 index 0000000000..74bf4895a3 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/transform.cc @@ -0,0 +1,12 @@ +#include "op-attrs/ff_ordered/transform.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; +using Out = value_type<1>; +using F = std::function; + +template FFOrdered transform(FFOrdered const &, F &&); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/zip.cc b/lib/op-attrs/src/op-attrs/ff_ordered/zip.cc new file mode 100644 index 0000000000..dc715ea97c --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/zip.cc @@ -0,0 +1,12 @@ +#include "op-attrs/ff_ordered/zip.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T1 = value_type<0>; +using T2 = value_type<1>; + +template FFOrdered> zip(FFOrdered const &, + FFOrdered const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc index d4763ef004..ddd92bd417 100644 --- a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc @@ -1,6 +1,6 @@ #include "op-attrs/ops/batch_norm.h" -#include "op-attrs/dim_ordered/concat.h" -#include "op-attrs/dim_ordered/slice.h" +#include "op-attrs/ff_ordered/concat.h" +#include "op-attrs/ff_ordered/slice.h" #include "op-attrs/parallel_tensor_shape.h" #include 
"op-attrs/tensor_shape.h" #include "utils/containers/any_of.h" diff --git a/lib/op-attrs/src/op-attrs/ops/concat.cc b/lib/op-attrs/src/op-attrs/ops/concat.cc index fc42241ef2..bf0ba553e4 100644 --- a/lib/op-attrs/src/op-attrs/ops/concat.cc +++ b/lib/op-attrs/src/op-attrs/ops/concat.cc @@ -1,6 +1,6 @@ #include "op-attrs/ops/concat.h" -#include "op-attrs/dim_ordered/enumerate.h" -#include "op-attrs/dim_ordered/ff_ordered_from_map.h" +#include "op-attrs/ff_ordered/enumerate.h" +#include "op-attrs/ff_ordered/ff_ordered_from_map.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_dims.h" #include "op-attrs/tensor_shape.h" diff --git a/lib/op-attrs/src/op-attrs/ops/embedding.cc b/lib/op-attrs/src/op-attrs/ops/embedding.cc index 4dc602646b..5b5b91a8e7 100644 --- a/lib/op-attrs/src/op-attrs/ops/embedding.cc +++ b/lib/op-attrs/src/op-attrs/ops/embedding.cc @@ -1,8 +1,10 @@ #include "op-attrs/ops/embedding.h" -#include "op-attrs/dim_ordered/slice.h" -#include "op-attrs/dim_ordered/transform.h" +#include "op-attrs/ff_ordered/slice.h" +#include "op-attrs/ff_ordered/transform.h" +#include "op-attrs/ops/embedding_attrs.dtg.h" #include "op-attrs/parallel_tensor_dims.h" #include "utils/containers/product.h" +#include "utils/fmt/optional.h" #include "utils/integer_conversions.h" namespace FlexFlow { diff --git a/lib/op-attrs/src/op-attrs/ops/flat.cc b/lib/op-attrs/src/op-attrs/ops/flat.cc index 8ed12167b3..b4eeda76ab 100644 --- a/lib/op-attrs/src/op-attrs/ops/flat.cc +++ b/lib/op-attrs/src/op-attrs/ops/flat.cc @@ -1,6 +1,6 @@ #include "op-attrs/ops/flat.h" -#include "op-attrs/dim_ordered/concat.h" -#include "op-attrs/dim_ordered/slice.h" +#include "op-attrs/ff_ordered/concat.h" +#include "op-attrs/ff_ordered/slice.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_dims.h" #include "utils/containers/any_of.h" diff --git a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc index 00c6bb5e9b..c9798368e2 
100644 --- a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc @@ -1,6 +1,6 @@ #include "op-attrs/ops/layer_norm.h" -#include "op-attrs/dim_ordered/ff_ordered_of.h" -#include "op-attrs/dim_ordered/get_idxs.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" +#include "op-attrs/ff_ordered/get_idxs.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/all_of.h" diff --git a/lib/op-attrs/src/op-attrs/ops/linear.cc b/lib/op-attrs/src/op-attrs/ops/linear.cc index fb26113613..bee9d0cf4f 100644 --- a/lib/op-attrs/src/op-attrs/ops/linear.cc +++ b/lib/op-attrs/src/op-attrs/ops/linear.cc @@ -1,11 +1,12 @@ #include "op-attrs/ops/linear.h" -#include "op-attrs/dim_ordered/slice.h" -#include "op-attrs/dim_ordered/transform.h" +#include "op-attrs/ff_ordered/slice.h" +#include "op-attrs/ff_ordered/transform.h" #include "op-attrs/initializers/kaiming_initializer_mode.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/product.h" #include "utils/expected.h" +#include "utils/fmt/optional.h" #include "utils/integer_conversions.h" namespace FlexFlow { @@ -101,7 +102,7 @@ tl::expected SumDegree sum_degree = SumDegree{1_n}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{ get_sum_degree(input) * product(slice(ff_ordered_shard_degrees(input), - std::nullopt, + relative_ff_dim_t{0}, relative_ff_dim_t{-1}))}; FFOrdered shard_degrees = FFOrdered{ shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree, @@ -126,8 +127,10 @@ tl::expected SumDegree sum_degree = SumDegree{get_sum_degree(input) * shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{product(slice( - ff_ordered_shard_degrees(input), std::nullopt, relative_ff_dim_t{-1}))}; + DiscardCopyDegree discard_copy_degree = + DiscardCopyDegree{product(slice(ff_ordered_shard_degrees(input), + relative_ff_dim_t{0}, + 
relative_ff_dim_t{-1}))}; FFOrdered shard_degrees = FFOrdered{get_discard_copy_degree(input)}; diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc index 7a8f91e498..3f2245b2dc 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc @@ -1,6 +1,6 @@ #include "op-attrs/parallel_tensor_dims.h" -#include "op-attrs/dim_ordered/transform.h" -#include "op-attrs/dim_ordered/zip.h" +#include "op-attrs/ff_ordered/transform.h" +#include "op-attrs/ff_ordered/zip.h" #include "op-attrs/replica_parallel_dim.h" #include "op-attrs/replica_parallel_dim_set.h" #include "op-attrs/shard_parallel_dim.h" diff --git a/lib/op-attrs/src/op-attrs/tensor_dims.cc b/lib/op-attrs/src/op-attrs/tensor_dims.cc index 8d0592eab7..760278297c 100644 --- a/lib/op-attrs/src/op-attrs/tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/tensor_dims.cc @@ -1,6 +1,6 @@ #include "op-attrs/tensor_dims.h" -#include "op-attrs/dim_ordered/slice.h" -#include "op-attrs/dim_ordered/zip.h" +#include "op-attrs/ff_ordered/slice.h" +#include "op-attrs/ff_ordered/zip.h" #include "op-attrs/replica_parallel_dim_set.h" #include "op-attrs/shard_parallel_dim.dtg.h" #include "utils/containers/all_of.h" @@ -67,7 +67,7 @@ std::optional } TensorDims slice_tensor_dims(TensorDims const &dims, - std::optional const &start, + relative_ff_dim_t const &start, std::optional const &stop) { return TensorDims{ slice(dims.ff_ordered, start, stop), diff --git a/lib/op-attrs/src/op-attrs/tensor_shape.cc b/lib/op-attrs/src/op-attrs/tensor_shape.cc index 04b18794f1..afc14af54c 100644 --- a/lib/op-attrs/src/op-attrs/tensor_shape.cc +++ b/lib/op-attrs/src/op-attrs/tensor_shape.cc @@ -29,7 +29,7 @@ nonnegative_int get_size_in_bytes(TensorShape const &s) { } TensorShape slice_tensor_shape(TensorShape const &shape, - std::optional const &start, + relative_ff_dim_t const &start, std::optional const &stop) { return TensorShape{ 
slice_tensor_dims(shape.dims, start, stop), diff --git a/lib/op-attrs/test/src/op-attrs/datatype_value.cc b/lib/op-attrs/test/src/op-attrs/datatype_value.cc new file mode 100644 index 0000000000..9b0e90b601 --- /dev/null +++ b/lib/op-attrs/test/src/op-attrs/datatype_value.cc @@ -0,0 +1,68 @@ +#include "op-attrs/datatype_value.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("test make_data_type_value") { + SUBCASE("make_float_data_type_value") { + float value = 1.0f; + DataTypeValue data_type_value = make_float_data_type_value(value); + + CHECK(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK(data_type_value.get() == value); + } + + SUBCASE("make_double_data_type_value") { + double value = 2.71828; + DataTypeValue data_type_value = make_double_data_type_value(value); + + CHECK(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK(data_type_value.get() == value); + } + + SUBCASE("make_int32_data_type_value") { + int32_t value = -42; + DataTypeValue data_type_value = make_int32_data_type_value(value); + + CHECK(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK(data_type_value.get() == value); + } + + SUBCASE("make_int64_data_type_value") { + int64_t value = 1LL << 40; + DataTypeValue data_type_value = make_int64_data_type_value(value); + + CHECK(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK(data_type_value.get() == value); + } + + SUBCASE("make_bool_data_type_value") { + bool value = true; + 
DataTypeValue data_type_value = make_bool_data_type_value(value); + + CHECK(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK(data_type_value.get() == value); + } + } +} diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/dim_ordered.cc b/lib/op-attrs/test/src/op-attrs/dim_ordered/dim_ordered.cc index d7901a0c53..a5a261da25 100644 --- a/lib/op-attrs/test/src/op-attrs/dim_ordered/dim_ordered.cc +++ b/lib/op-attrs/test/src/op-attrs/dim_ordered/dim_ordered.cc @@ -10,8 +10,4 @@ TEST_SUITE(FF_TEST_SUITE) { "Arbitrary> with T=", T, int, double, char) { RC_SUBCASE([](DimOrdered) {}); } - - TEST_CASE_TEMPLATE("Arbitrary> with T=", T, int, double, char) { - RC_SUBCASE([](FFOrdered) {}); - } } diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/concat.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc similarity index 97% rename from lib/op-attrs/test/src/op-attrs/dim_ordered/concat.cc rename to lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc index 2ac641cfc2..d8e04124bc 100644 --- a/lib/op-attrs/test/src/op-attrs/dim_ordered/concat.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc @@ -1,4 +1,4 @@ -#include "op-attrs/dim_ordered/concat.h" +#include "op-attrs/ff_ordered/concat.h" #include using namespace ::FlexFlow; diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/enumerate.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc similarity index 92% rename from lib/op-attrs/test/src/op-attrs/dim_ordered/enumerate.cc rename to lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc index bf4c33d65a..e1a94e72c3 100644 --- a/lib/op-attrs/test/src/op-attrs/dim_ordered/enumerate.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc @@ -1,4 +1,4 @@ -#include "op-attrs/dim_ordered/enumerate.h" +#include "op-attrs/ff_ordered/enumerate.h" #include "test/utils/doctest/fmt/map.h" #include diff --git 
a/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered.cc new file mode 100644 index 0000000000..b0812ba9d6 --- /dev/null +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered.cc @@ -0,0 +1,11 @@ +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "test/utils/rapidcheck.h" +#include + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE_TEMPLATE("Arbitrary> with T=", T, int, double, char) { + RC_SUBCASE([](FFOrdered) {}); + } +} diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/ff_ordered_from_map.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc similarity index 96% rename from lib/op-attrs/test/src/op-attrs/dim_ordered/ff_ordered_from_map.cc rename to lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc index bba989920e..73036d5662 100644 --- a/lib/op-attrs/test/src/op-attrs/dim_ordered/ff_ordered_from_map.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc @@ -1,4 +1,4 @@ -#include "op-attrs/dim_ordered/ff_ordered_from_map.h" +#include "op-attrs/ff_ordered/ff_ordered_from_map.h" #include using namespace ::FlexFlow; diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/slice.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/slice.cc similarity index 79% rename from lib/op-attrs/test/src/op-attrs/dim_ordered/slice.cc rename to lib/op-attrs/test/src/op-attrs/ff_ordered/slice.cc index b2fddd058e..2f1dfecd65 100644 --- a/lib/op-attrs/test/src/op-attrs/dim_ordered/slice.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/slice.cc @@ -1,4 +1,4 @@ -#include "op-attrs/dim_ordered/slice.h" +#include "op-attrs/ff_ordered/slice.h" #include using namespace ::FlexFlow; @@ -25,13 +25,6 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - SUBCASE("std::nullopt_t, ff_dim_t") { - FFOrdered result = - slice(d, std::nullopt, ff_dim_t{nonnegative_int{3}}); - FFOrdered correct = FFOrdered{1, 2, 3}; - - CHECK(result == correct); - } 
SUBCASE("relative_ff_dim_t, relative_ff_dim_t") { FFOrdered result = slice(d, relative_ff_dim_t{1}, relative_ff_dim_t{-1}); @@ -45,12 +38,6 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - SUBCASE("std::nullopt_t, relative_ff_dim_t") { - FFOrdered result = slice(d, std::nullopt, relative_ff_dim_t{-1}); - FFOrdered correct = FFOrdered{1, 2, 3}; - - CHECK(result == correct); - } SUBCASE("start index = stop index") { FFOrdered result = slice(d, relative_ff_dim_t{1}, relative_ff_dim_t{1}); @@ -86,10 +73,10 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK_THROWS(slice(d, relative_ff_dim_t{10}, std::nullopt)); } SUBCASE("stop index out of bounds (too low)") { - CHECK_THROWS(slice(d, std::nullopt, relative_ff_dim_t{-10})); + CHECK_THROWS(slice(d, relative_ff_dim_t{0}, relative_ff_dim_t{-10})); } SUBCASE("stop index out of bounds (too high)") { - CHECK_THROWS(slice(d, std::nullopt, relative_ff_dim_t{10})); + CHECK_THROWS(slice(d, relative_ff_dim_t{0}, relative_ff_dim_t{10})); } } } diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc new file mode 100644 index 0000000000..4bf189ec77 --- /dev/null +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc @@ -0,0 +1,35 @@ +#include "op-attrs/ff_ordered/transform.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("transform(FFOrdered, F)") { + SUBCASE("input is empty") { + FFOrdered input = {}; + + FFOrdered result = transform(input, [](std::string const &) -> int { + CHECK(false); + return 0; + }); + FFOrdered correct = {}; + + CHECK(result == correct); + } + + SUBCASE("input is not empty") { + FFOrdered input = {2, 1, 2, 5}; + + FFOrdered result = + transform(input, [](int x) { return fmt::to_string(x); }); + FFOrdered correct = FFOrdered{ + "2", + "1", + "2", + "5", + }; + + CHECK(result == correct); + } + } +} diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc 
b/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc new file mode 100644 index 0000000000..19167cd0ff --- /dev/null +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc @@ -0,0 +1,38 @@ +#include "op-attrs/ff_ordered/zip.h" +#include "test/utils/doctest/fmt/pair.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("zip(FFOrdered, FFOrdered)") { + FFOrdered lhs_input = {9, 9, 8, 9}; + FFOrdered rhs_input = {"m", "m", "k", "l", "m"}; + + SUBCASE("lhs is longer") { + FFOrdered> result = zip(lhs_input, rhs_input); + + FFOrdered> correct = { + {9, "m"}, + {9, "m"}, + {8, "k"}, + {9, "l"}, + }; + + CHECK(result == correct); + } + + SUBCASE("rhs is longer") { + FFOrdered> result = zip(rhs_input, lhs_input); + + FFOrdered> correct = { + {"m", 9}, + {"m", 9}, + {"k", 8}, + {"l", 9}, + }; + + CHECK(result == correct); + } + } +} diff --git a/lib/pcg/include/pcg/metric.enum.toml b/lib/pcg/include/pcg/metric.enum.toml new file mode 100644 index 0000000000..ebb2323203 --- /dev/null +++ b/lib/pcg/include/pcg/metric.enum.toml @@ -0,0 +1,26 @@ +namespace = "FlexFlow" +name = "Metric" +features = [ + "hash", + "json", + "rapidcheck", + "fmt", +] + +[[values]] +name = "ACCURACY" + +[[values]] +name = "CATEGORICAL_CROSSENTROPY" + +[[values]] +name = "SPARSE_CATEGORICAL_CROSSENTROPY" + +[[values]] +name = "MEAN_SQUARED_ERROR" + +[[values]] +name = "ROOT_MEAN_SQUARED_ERROR" + +[[values]] +name = "MEAN_ABSOLUTE_ERROR" diff --git a/lib/pcg/include/pcg/metric_attrs.h b/lib/pcg/include/pcg/metric_attrs.h new file mode 100644 index 0000000000..343c2154dd --- /dev/null +++ b/lib/pcg/include/pcg/metric_attrs.h @@ -0,0 +1,28 @@ +#ifndef _FF_METRICS_H_ +#define _FF_METRICS_H_ + +#include "op-attrs/ops/loss_functions/loss_functions.h" +#include "pcg/metric.dtg.h" +#include "utils/fmt.h" +#include + +namespace FlexFlow { + +class MetricsAttrs { +public: + MetricsAttrs() = delete; + MetricsAttrs(LossFunction, std::unordered_set const &); + +public: + LossFunction 
loss_type; + bool measure_accuracy; + bool measure_categorical_crossentropy; + bool measure_sparse_categorical_crossentropy; + bool measure_mean_squared_error; + bool measure_root_mean_squared_error; + bool measure_mean_absolute_error; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/pcg/src/pcg/metric_attrs.cc b/lib/pcg/src/pcg/metric_attrs.cc new file mode 100644 index 0000000000..9a93e75350 --- /dev/null +++ b/lib/pcg/src/pcg/metric_attrs.cc @@ -0,0 +1,38 @@ +#include "pcg/metric_attrs.h" + +namespace FlexFlow { +MetricsAttrs::MetricsAttrs(LossFunction _loss_type, + std::unordered_set const &metrics) + : loss_type(_loss_type), measure_accuracy(false), + measure_categorical_crossentropy(false), + measure_sparse_categorical_crossentropy(false), + measure_mean_squared_error(false), measure_root_mean_squared_error(false), + measure_mean_absolute_error(false) { + for (Metric const &m : metrics) { + switch (m) { + case Metric::ACCURACY: + measure_accuracy = true; + continue; + case Metric::CATEGORICAL_CROSSENTROPY: + measure_categorical_crossentropy = true; + continue; + case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: + measure_sparse_categorical_crossentropy = true; + continue; + case Metric::MEAN_SQUARED_ERROR: + measure_mean_squared_error = true; + continue; + case Metric::ROOT_MEAN_SQUARED_ERROR: + measure_root_mean_squared_error = true; + continue; + case Metric::MEAN_ABSOLUTE_ERROR: + measure_mean_absolute_error = true; + continue; + default: + throw mk_runtime_error(fmt::format( + "Initializing MetricsAttrs with unrecogonized metrics type {}", m)); + } + } +} + +} // namespace FlexFlow diff --git a/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc b/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc index 2cf149f78a..940024c9b6 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc @@ -1,5 +1,5 @@ #include 
"pcg/parallel_computation_graph/generate_weight_transform.h" -#include "op-attrs/dim_ordered/enumerate.h" +#include "op-attrs/ff_ordered/enumerate.h" #include "op-attrs/parallel_tensor_shape.h" namespace FlexFlow { diff --git a/lib/runtime/src/metrics_functions.cc b/lib/runtime/src/metrics_functions.cc index feb6e704b2..33e15baed2 100644 --- a/lib/runtime/src/metrics_functions.cc +++ b/lib/runtime/src/metrics_functions.cc @@ -25,39 +25,6 @@ namespace FlexFlow { LegionRuntime::Logger::Category log_metrics("metrics"); -MetricsAttrs::MetricsAttrs(LossFunction _loss_type, - std::vector const &metrics) - : loss_type(_loss_type), measure_accuracy(false), - measure_categorical_crossentropy(false), - measure_sparse_categorical_crossentropy(false), - measure_mean_squared_error(false), measure_root_mean_squared_error(false), - measure_mean_absolute_error(false) { - for (Metric const &m : metrics) { - switch (m) { - case Metric::ACCURACY: - measure_accuracy = true; - continue; - case Metric::CATEGORICAL_CROSSENTROPY: - measure_categorical_crossentropy = true; - continue; - case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: - measure_sparse_categorical_crossentropy = true; - continue; - case Metric::MEAN_SQUARED_ERROR: - measure_mean_squared_error = true; - continue; - case Metric::ROOT_MEAN_SQUARED_ERROR: - measure_root_mean_squared_error = true; - continue; - case Metric::MEAN_ABSOLUTE_ERROR: - measure_mean_absolute_error = true; - continue; - default: - throw mk_runtime_error("Unrecogonized metrics type {}", m); - } - } -} - enum Slots { LOGIT, LABEL, diff --git a/lib/runtime/src/metrics_functions.h b/lib/runtime/src/metrics_functions.h index fbb0b633bf..73dc3bbc51 100644 --- a/lib/runtime/src/metrics_functions.h +++ b/lib/runtime/src/metrics_functions.h @@ -16,38 +16,13 @@ #ifndef _FF_METRICS_FUNCTIONS_H_ #define _FF_METRICS_FUNCTIONS_H_ +#include "kernels/metric.h" #include "kernels/perf_metrics.h" #include "legion.h" -#include "op-attrs/ops/loss_functions.h" #include 
"task_spec/task_invocation.h" -#include "utils/fmt.h" namespace FlexFlow { -enum class Metric { - ACCURACY, - CATEGORICAL_CROSSENTROPY, - SPARSE_CATEGORICAL_CROSSENTROPY, - MEAN_SQUARED_ERROR, - ROOT_MEAN_SQUARED_ERROR, - MEAN_ABSOLUTE_ERROR, -}; - -class MetricsAttrs { -public: - MetricsAttrs() = delete; - MetricsAttrs(LossFunction, std::vector const &); - -public: - LossFunction loss_type; - bool measure_accuracy; - bool measure_categorical_crossentropy; - bool measure_sparse_categorical_crossentropy; - bool measure_mean_squared_error; - bool measure_root_mean_squared_error; - bool measure_mean_absolute_error; -}; - TypedIndexTaskInvocation compute_metrics(MetricsAttrs const &, parallel_tensor_guid_t const &logit, @@ -79,40 +54,4 @@ VISITABLE_STRUCT(::FlexFlow::MetricsAttrs, measure_root_mean_squared_error, measure_mean_absolute_error); -namespace fmt { - -template <> -struct formatter<::FlexFlow::Metric> : formatter { - template - auto format(::FlexFlow::Metric m, FormatContext &ctx) const - -> decltype(ctx.out()) { - using namespace FlexFlow; - - string_view name = "unknown"; - switch (m) { - case Metric::ACCURACY: - name = "Accuracy"; - break; - case Metric::CATEGORICAL_CROSSENTROPY: - name = "CategoricalCrossEntropy"; - break; - case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: - name = "SparseCategoricalCrossEntropy"; - break; - case Metric::MEAN_SQUARED_ERROR: - name = "MeanSquaredError"; - break; - case Metric::ROOT_MEAN_SQUARED_ERROR: - name = "RootMeanSquaredError"; - break; - case Metric::MEAN_ABSOLUTE_ERROR: - name = "MeanAbsoluteError"; - break; - } - return formatter::format(name, ctx); - } -}; - -} // namespace fmt - #endif diff --git a/lib/runtime/src/ops/embedding.cc b/lib/runtime/src/ops/embedding.cc index 253fd3cb4f..83e7c15460 100644 --- a/lib/runtime/src/ops/embedding.cc +++ b/lib/runtime/src/ops/embedding.cc @@ -77,11 +77,11 @@ static std::optional return profile(backward_kernel, profiling, "[Embedding] backward_time = {:.2lf}ms\n", - input, 
output, + input, weight_grad, - input.data_type, output.data_type, + input.data_type, attrs.aggr, input.shape.get_dim(), output.shape.get_dim(), diff --git a/lib/utils/include/utils/containers/subvec.h b/lib/utils/include/utils/containers/slice.h similarity index 69% rename from lib/utils/include/utils/containers/subvec.h rename to lib/utils/include/utils/containers/slice.h index c89e9227de..a82fb383b5 100644 --- a/lib/utils/include/utils/containers/subvec.h +++ b/lib/utils/include/utils/containers/slice.h @@ -9,9 +9,9 @@ namespace FlexFlow { template -std::vector subvec(std::vector const &v, - std::optional const &maybe_start, - std::optional const &maybe_end) { +std::vector slice(std::vector const &v, + int const &maybe_start, + std::optional const &maybe_end) { auto begin_iter = v.cbegin(); auto end_iter = v.cend(); @@ -22,15 +22,13 @@ std::vector subvec(std::vector const &v, if (idx < 0) { new_idx = size + idx; } - if (new_idx < 0 || new_idx > size) { - throw mk_runtime_error("Index {} is out of bounds for array {}"); - } + + ASSERT(new_idx >= 0, "Index out of bounds"); + ASSERT(new_idx <= size, "Index out of bounds"); return new_idx; }; - if (maybe_start.has_value()) { - begin_iter += resolve_loc(maybe_start.value()); - } + begin_iter += resolve_loc(maybe_start); if (maybe_end.has_value()) { end_iter = v.cbegin() + resolve_loc(maybe_end.value()); diff --git a/lib/utils/include/utils/containers/zip_strict.h b/lib/utils/include/utils/containers/zip_strict.h index 64049042d4..5606fccff1 100644 --- a/lib/utils/include/utils/containers/zip_strict.h +++ b/lib/utils/include/utils/containers/zip_strict.h @@ -4,21 +4,17 @@ #include "utils/containers/zip.h" #include "utils/exception.h" #include "utils/fmt/vector.h" +#include namespace FlexFlow { template std::vector> zip_strict(std::vector const &lhs, std::vector const &rhs) { - if (lhs.size() != rhs.size()) { - throw mk_runtime_error( - fmt::format("zip_strict requires lhs and rhs to have the same length, " - "but 
received lhs={} (length {}), rhs={} (length {})", - lhs, - lhs.size(), - rhs, - rhs.size())); - } + ASSERT(lhs.size() == rhs.size(), + "zip_strict requires lhs and rhs to have the same length", + lhs, + rhs); return zip(lhs, rhs); } diff --git a/lib/utils/include/utils/exception.h b/lib/utils/include/utils/exception.h index 080cbb3611..f95eb8a38d 100644 --- a/lib/utils/include/utils/exception.h +++ b/lib/utils/include/utils/exception.h @@ -3,6 +3,7 @@ #include "utils/fmt.h" #include +#include #include #include diff --git a/lib/utils/include/utils/indent.h b/lib/utils/include/utils/indent.h new file mode 100644 index 0000000000..eccbd34cfc --- /dev/null +++ b/lib/utils/include/utils/indent.h @@ -0,0 +1,12 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_INDENT_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_INDENT_H + +#include + +namespace FlexFlow { + +std::string indent(std::string const &, int indent_size = 2); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/stack_vector/stack_vector.h b/lib/utils/include/utils/stack_vector/stack_vector.h index 5d4d6eaad3..64d005a10e 100644 --- a/lib/utils/include/utils/stack_vector/stack_vector.h +++ b/lib/utils/include/utils/stack_vector/stack_vector.h @@ -272,18 +272,6 @@ struct stack_vector { return !(*this == other); } - bool operator<(stack_vector const &other) const { - for (std::size_t i = 0; i < std::min(this->m_size, other.m_size); i++) { - if (this->at(i) < other.at(i)) { - return true; - } else if (this->at(i) > other.at(i)) { - return false; - } - } - - return (this->m_size < other.m_size); - } - std::size_t size() const { return this->m_size; } @@ -305,17 +293,16 @@ struct stack_vector { private: std::size_t m_size = 0; std::array contents; - - static_assert( - implies, is_equal_comparable>::value, - ""); - static_assert( - implies, is_neq_comparable>::value, - ""); - static_assert( - implies, is_lt_comparable>::value, ""); }; +template +auto operator<(stack_vector const &lhs, + stack_vector 
const &rhs) + -> std::enable_if_t, bool> { + return std::lexicographical_compare( + lhs.begin(), lhs.end(), rhs.begin(), rhs.end()); +} + template std::ostream &operator<<(std::ostream &s, stack_vector const &v) { return s << fmt::to_string(v); diff --git a/lib/utils/src/utils/containers/slice.cc b/lib/utils/src/utils/containers/slice.cc new file mode 100644 index 0000000000..f960c21881 --- /dev/null +++ b/lib/utils/src/utils/containers/slice.cc @@ -0,0 +1,3 @@ +#include "utils/containers/slice.h" + +namespace FlexFlow {} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/subvec.cc b/lib/utils/src/utils/containers/subvec.cc deleted file mode 100644 index 93c7de31c5..0000000000 --- a/lib/utils/src/utils/containers/subvec.cc +++ /dev/null @@ -1 +0,0 @@ -#include "utils/containers/subvec.h" diff --git a/lib/utils/src/utils/full_binary_tree/binary_tree_path.cc b/lib/utils/src/utils/full_binary_tree/binary_tree_path.cc index 8445a2721a..8aed06ae01 100644 --- a/lib/utils/src/utils/full_binary_tree/binary_tree_path.cc +++ b/lib/utils/src/utils/full_binary_tree/binary_tree_path.cc @@ -1,5 +1,5 @@ #include "utils/full_binary_tree/binary_tree_path.h" -#include "utils/containers/subvec.h" +#include "utils/containers/slice.h" namespace FlexFlow { @@ -27,7 +27,7 @@ BinaryTreePathEntry binary_tree_path_get_top_level(BinaryTreePath const &p) { BinaryTreePath binary_tree_path_get_non_top_level(BinaryTreePath const &p) { return BinaryTreePath{ - subvec(p.entries, 1, std::nullopt), + slice(p.entries, 1, std::nullopt), }; } diff --git a/lib/utils/src/utils/graph/series_parallel/series_reduction.cc b/lib/utils/src/utils/graph/series_parallel/series_reduction.cc index 5b9b592444..459e61be71 100644 --- a/lib/utils/src/utils/graph/series_parallel/series_reduction.cc +++ b/lib/utils/src/utils/graph/series_parallel/series_reduction.cc @@ -3,7 +3,7 @@ #include "utils/containers/contains_key.h" #include "utils/containers/get_only.h" #include "utils/containers/require_same.h" 
-#include "utils/containers/subvec.h" +#include "utils/containers/slice.h" #include "utils/containers/unordered_set_of.h" #include "utils/containers/values.h" #include "utils/graph/digraph/algorithms/get_predecessors.h" @@ -103,7 +103,7 @@ MultiDiEdge Node last = g.get_multidiedge_dst(reduction.edges.back()); std::vector internal_nodes; - for (MultiDiEdge const &e : subvec(reduction.edges, std::nullopt, -1)) { + for (MultiDiEdge const &e : slice(reduction.edges, 0, -1)) { internal_nodes.push_back(g.get_multidiedge_dst(e)); } diff --git a/lib/utils/src/utils/indent.cc b/lib/utils/src/utils/indent.cc new file mode 100644 index 0000000000..2761ad1878 --- /dev/null +++ b/lib/utils/src/utils/indent.cc @@ -0,0 +1,17 @@ +#include "utils/indent.h" +#include "utils/containers/flatmap.h" + +namespace FlexFlow { + +std::string indent(std::string const &s, int indent_size) { + std::string indent_str(indent_size, ' '); + return indent_str + flatmap(s, [&](char c) -> std::string { + if (c == '\n') { + return "\n" + indent_str; + } else { + return std::string{c}; + }; + }); +} + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/stack_vector/stack_vector.cc b/lib/utils/src/utils/stack_vector/stack_vector.cc index d4fb849412..e2009d74d3 100644 --- a/lib/utils/src/utils/stack_vector/stack_vector.cc +++ b/lib/utils/src/utils/stack_vector/stack_vector.cc @@ -1,9 +1,9 @@ #include "utils/stack_vector/stack_vector.h" -#include "utils/archetypes/ordered_value_type.h" +#include "utils/archetypes/value_type.h" namespace FlexFlow { -using T = ordered_value_type<0>; +using T = value_type<0>; template struct stack_vector; template struct stack_vector; diff --git a/lib/utils/test/common/include/test/utils/doctest/check_kv.h b/lib/utils/test/common/include/test/utils/doctest/check_kv.h new file mode 100644 index 0000000000..6449b8ac87 --- /dev/null +++ b/lib/utils/test/common/include/test/utils/doctest/check_kv.h @@ -0,0 +1,12 @@ +#ifndef 
_FLEXFLOW_LIB_UTILS_TEST_COMMON_INCLUDE_TEST_UTILS_DOCTEST_CHECK_KV_H +#define _FLEXFLOW_LIB_UTILS_TEST_COMMON_INCLUDE_TEST_UTILS_DOCTEST_CHECK_KV_H + +#include + +namespace FlexFlow { + +std::string check_kv(std::string const &k, std::string const &v); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/test/common/src/main.cc b/lib/utils/test/common/src/main.cc index 9522fa7fdb..6df2d925b7 100644 --- a/lib/utils/test/common/src/main.cc +++ b/lib/utils/test/common/src/main.cc @@ -1,2 +1,15 @@ -#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN -#include "doctest/doctest.h" +#define DOCTEST_CONFIG_IMPLEMENT +#include + +#include +#include + +void libassert_throw_exception_handler(libassert::assertion_info const &info) { + throw std::runtime_error("Assertion failed:\n" + info.to_string()); +} + +int main(int argc, char **argv) { + libassert::set_failure_handler(libassert_throw_exception_handler); + + return doctest::Context(argc, argv).run(); +} diff --git a/lib/utils/test/common/src/test/utils/doctest/check_kv.cc b/lib/utils/test/common/src/test/utils/doctest/check_kv.cc new file mode 100644 index 0000000000..d3c1ee335e --- /dev/null +++ b/lib/utils/test/common/src/test/utils/doctest/check_kv.cc @@ -0,0 +1,17 @@ +#include "test/utils/doctest/check_kv.h" +#include "utils/indent.h" +#include + +namespace FlexFlow { + +std::string check_kv(std::string const &k, std::string const &v) { + std::ostringstream oss; + + oss << std::endl + << indent(k + "=", /*indent_size=*/4) << std::endl + << indent(v, /*indent_size=*/6); + + return oss.str(); +} + +} // namespace FlexFlow diff --git a/lib/utils/test/src/utils/containers/subvec.cc b/lib/utils/test/src/utils/containers/slice.cc similarity index 69% rename from lib/utils/test/src/utils/containers/subvec.cc rename to lib/utils/test/src/utils/containers/slice.cc index 610fc55b5a..4e4d840bfe 100644 --- a/lib/utils/test/src/utils/containers/subvec.cc +++ b/lib/utils/test/src/utils/containers/slice.cc @@ -1,4 +1,4 @@ -#include 
"utils/containers/subvec.h" +#include "utils/containers/slice.h" #include "test/utils/doctest/fmt/vector.h" #include #include @@ -6,57 +6,57 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("subvec") { + TEST_CASE("slice") { std::vector v = {1, 2, 3, 4, 5}; - SUBCASE("Basic subvector") { - auto result = subvec(v, 1, 4); + SUBCASE("Basic slice") { + auto result = slice(v, 1, 4); std::vector correct = {2, 3, 4}; CHECK(result == correct); } SUBCASE("From beginning to index") { - auto result = subvec(v, std::nullopt, 3); + auto result = slice(v, 0, 3); std::vector correct = {1, 2, 3}; CHECK(result == correct); } SUBCASE("From index to end") { - auto result = subvec(v, 2, std::nullopt); + auto result = slice(v, 2, std::nullopt); std::vector correct = {3, 4, 5}; CHECK(result == correct); } SUBCASE("All of the vector") { - auto result = subvec(v, std::nullopt, std::nullopt); + auto result = slice(v, 0, std::nullopt); std::vector correct = {1, 2, 3, 4, 5}; CHECK(result == correct); } SUBCASE("Start greater than end") { - auto result = subvec(v, 3, 1); + auto result = slice(v, 3, 1); std::vector correct = {}; CHECK(result == correct); } SUBCASE("Start equal to end") { - auto result = subvec(v, 3, 3); + auto result = slice(v, 3, 3); std::vector correct = {}; CHECK(result == correct); } SUBCASE("Negative indices") { - auto result = subvec(v, -3, -1); + auto result = slice(v, -3, -1); std::vector correct = {3, 4}; CHECK(result == correct); } SUBCASE("Upper index is out of bounds by 1") { - CHECK_THROWS(subvec(v, 2, 6)); + CHECK_THROWS(slice(v, 2, 6)); } SUBCASE("Lower index is out of bounds by 1") { - CHECK_THROWS(subvec(v, -6, 2)); + CHECK_THROWS(slice(v, -6, 2)); } } } diff --git a/lib/utils/test/src/utils/indent.cc b/lib/utils/test/src/utils/indent.cc new file mode 100644 index 0000000000..b137253fae --- /dev/null +++ b/lib/utils/test/src/utils/indent.cc @@ -0,0 +1,66 @@ +#include "utils/indent.h" +#include + +using namespace ::FlexFlow; + 
+TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("indent") { + SUBCASE("string is empty") { + std::string input = ""; + + std::string result = indent(input); + std::string correct = " "; + + CHECK(result == correct); + } + + SUBCASE("string is one line") { + std::string input = "hello world"; + std::string result = indent(input); + std::string correct = " hello world"; + + CHECK(result == correct); + } + + SUBCASE("string has multiple lines") { + std::string input = "\n" + "a b\n" + "c d\n" + "e f\n" + "g\n"; + + std::string result = indent(input); + std::string correct = " \n" + " a b\n" + " c d\n" + " e f\n" + " g\n" + " "; + + CHECK(result == correct); + } + + SUBCASE("leading and trailing whitespace is preserved") { + std::string input = " a b \n" + "c d e\n" + " "; + + std::string result = indent(input); + std::string correct = " a b \n" + " c d e\n" + " "; + + CHECK(result == correct); + } + + SUBCASE("allows custom indent size") { + std::string input = "hello\nworld"; + + std::string result = indent(input, /*indent_size=*/4); + std::string correct = " hello\n" + " world"; + + CHECK(result == correct); + } + } +} diff --git a/lib/utils/test/src/utils/stack_vector/stack_vector.cc b/lib/utils/test/src/utils/stack_vector/stack_vector.cc index c36de733b6..6eb2cc0d88 100644 --- a/lib/utils/test/src/utils/stack_vector/stack_vector.cc +++ b/lib/utils/test/src/utils/stack_vector/stack_vector.cc @@ -1,12 +1,97 @@ #include "utils/stack_vector/stack_vector.h" #include "test/utils/doctest/fmt/vector.h" #include "test/utils/rapidcheck.h" +#include "utils/archetypes/value_type.h" #include #include using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("operator<(stack_vector, stack_vector)") { + constexpr std::size_t MAXSIZE = 5; + + SUBCASE("T is ordered") { + SUBCASE("inputs are the same") { + std::vector input = {2, 1, 2, 3}; + + bool result = (input < input); + bool correct = false; + + CHECK(result == correct); + } + + SUBCASE("lhs is strict prefix of rhs") { + 
std::vector lhs = {2, 1, 2}; + std::vector rhs = {2, 1, 2, 3}; + + bool result = (lhs < rhs); + bool correct = true; + + CHECK(result == correct); + } + + SUBCASE("lhs is empty") { + std::vector lhs = {}; + std::vector rhs = {2, 1, 2, 3}; + + bool result = (lhs < rhs); + bool correct = true; + + CHECK(result == correct); + } + + SUBCASE("lhs has a smaller element first") { + std::vector lhs = {2, 1, 0, 3}; + std::vector rhs = {2, 1, 2}; + + bool result = (lhs < rhs); + bool correct = true; + + CHECK(result == correct); + } + + // from the definition of a strict total order, i.e., + // https://en.wikipedia.org/w/index.php?title=Total_order&oldid=1278541072#Strict_and_non-strict_total_orders + RC_SUBCASE("operator< is irreflexive", + [](stack_vector const &input) { + RC_ASSERT(!(input < input)); + }); + + RC_SUBCASE("operator< is asymmetric", + [](stack_vector const &lhs, + stack_vector const &rhs) { + RC_PRE(lhs != rhs); + + RC_ASSERT((lhs < rhs) == !(rhs < lhs)); + }); + + RC_SUBCASE("operator< is transitive", + [](stack_vector const &a, + stack_vector const &b, + stack_vector const &c) { + RC_PRE(a < b); + RC_PRE(b < c); + + RC_ASSERT(a < c); + }); + + RC_SUBCASE("operator< is connected", + [](stack_vector const &lhs, + stack_vector const &rhs) { + RC_PRE(lhs != rhs); + + RC_ASSERT((lhs < rhs) || (rhs < lhs)); + }); + } + + SUBCASE("T is not ordered") { + bool result = is_lt_comparable_v, MAXSIZE>>; + + CHECK_FALSE(result); + } + } + TEST_CASE_TEMPLATE( "stack_vector::push_back", T, int, double, char) { constexpr std::size_t MAXSIZE = 5; From fd3d7f1df7c78989a49fc00a74ce9367f716aaf6 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Fri, 2 May 2025 16:11:20 -0700 Subject: [PATCH 08/11] Add section on EditorConfig to setup guide in CONTRIBUTING.md (#1612) * Add section on EditorConfig to setup guide in CONTRIBUTING.md * Update workflow badges in README --- CONTRIBUTING.md | 14 +++++++++++--- README.md | 3 +-- 2 files changed, 12 insertions(+), 5 deletions(-) diff 
--git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1a1b3c9bee..f52ec68c0c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -83,6 +83,15 @@ Total Test time (real) = 8.64 sec If you don't, or if you see any tests failing, please double check that you have followed the instructions above. If you have and are still encountering an issue, please [contact us](#contact-us) with a detailed description of your platform and the commands you have run. +### EditorConfig + +FlexFlow Train uses [EditorConfig](https://editorconfig.org/) to ensure consistent low-level details (indentation settings, character encoding, etc.) across different editors. +The EditorConfig file for FlexFlow Train can be found in [`.editorconfig`](./.editorconfig). +If you are using vim, emacs, or another editor with built-in EditorConfig support (a full list of editors with built-in EditorConfig support can be found [here](https://editorconfig.org/#pre-installed)) +the configuration will be detected and applied without you needing to do anything. +If you are using an editor not on this list, you will need to install a corresponding [EditorConfig plugin](https://editorconfig.org/#editor-plugins). +**If you are using vscode, you should install [this plugin](https://marketplace.visualstudio.com/items?itemName=EditorConfig.EditorConfig).** + ### GPU setup If you are developing on a machine with one or more CUDA GPUs, you can also run the tests that require a GPU by entering the `gpu` devshell instead of the `default` devshell: @@ -227,9 +236,8 @@ The bulk of the FlexFlow source code is stored in the following folders: We currently implement CI testing using Github Workflows. Each workflow is defined by its corresponding YAML file in the [.github/workflows](.github/workflows) folder of the repo. We currently have the following workflows: -1. [`tests`](./.github/workflows/per-lib-check.yml): Builds and runs GPU and non-GPU unit tests for all of the code under `lib` and `bin`. 
Also uploads coverage numbers to [codecov.io](https://app.codecov.io/gh/flexflow/flexflow-train). -2. [`clang-format-check.yml`](./.github/workflows/clang-format-check.yml): ensures that the source code is properly formatted using `clang-format`. To format your code locally, run `proj format` (see [here](#building-testing-etc) for more information on `proj`). -4. [`shell-check.yml`](./.github/workflows/shell-check.yml): runs shellcheck on all bash scripts in the repo. +1. [`tests.yml`](./.github/workflows/tests.yml): Builds and runs GPU and non-GPU unit tests for all of the code under `lib` and `bin`. Uploads coverage numbers to [codecov.io](https://app.codecov.io/gh/flexflow/flexflow-train). Also ensures that the source code is properly formatted using `clang-format`. To format your code locally, run `proj format` (see [here](#building-testing-etc) for more information on `proj`). +2. [`shell-check.yml`](./.github/workflows/shell-check.yml): runs shellcheck on all bash scripts in the repo. GPU machines for CI are managed using [runs-on](https://runs-on.com/). 
diff --git a/README.md b/README.md index 0d56bc46e0..f181c4ad96 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,5 @@ # FlexFlow Train -[![clang-format Check](https://github.com/flexflow/flexflow-train/actions/workflows/clang-format-check.yml/badge.svg?branch=master)](https://github.com/flexflow/flexflow-train/actions/workflows/clang-format-check.yml) -[![per-lib-checks](https://github.com/flexflow/flexflow-train/actions/workflows/per-lib-check.yml/badge.svg)](https://github.com/flexflow/flexflow-train/actions/workflows/per-lib-check.yml) +[![tests](https://github.com/flexflow/flexflow-train/actions/workflows/tests.yml/badge.svg)](https://github.com/flexflow/flexflow-train/actions/workflows/tests.yml) [![shell-check](https://github.com/flexflow/flexflow-train/actions/workflows/shell-check.yml/badge.svg)](https://github.com/flexflow/flexflow-train/actions/workflows/shell-check.yml) [![Documentation Status](https://readthedocs.org/projects/flexflow/badge/?version=latest)](https://flexflow.readthedocs.io/en/latest/?badge=latest) From 651ba943fd623285a0df0c0081c9dd26f20041ab Mon Sep 17 00:00:00 2001 From: Victor Li Date: Sat, 17 May 2025 21:43:42 -0700 Subject: [PATCH 09/11] Slight refactoring --- lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc | 1 - .../compiler/mcmc/{mcmc_algorithm.cc => mcmc_over_mapped_pcg.cc} | 0 2 files changed, 1 deletion(-) rename lib/compiler/test/src/compiler/mcmc/{mcmc_algorithm.cc => mcmc_over_mapped_pcg.cc} (100%) diff --git a/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc b/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc index 47ecc2479f..75ef4d08a6 100644 --- a/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc +++ b/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc @@ -38,7 +38,6 @@ SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, get_random_pattern_match(random_substitution.value().pcg_pattern, sub_pcg_from_full_pcg(mapped_pcg.pcg)); if (pattern_match != std::nullopt) { - std::cout << 
"HELLO" << std::endl; return apply_substitution_and_update_machine_mapping( mapped_pcg, random_substitution.value(), pattern_match.value()); } diff --git a/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc b/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc similarity index 100% rename from lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc rename to lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc From e3cf79aa41262d4a1a5a916078a3b7eee76d0bf2 Mon Sep 17 00:00:00 2001 From: Victor Li Date: Sun, 18 May 2025 01:20:05 -0700 Subject: [PATCH 10/11] Updating MCMC to work with substitutions --- .../machine_mapping_mutation_set.h | 13 +- .../src/compiler/graph_optimize_result.cc | 15 -- ...substitution_and_update_machine_mapping.cc | 19 ++- .../machine_mapping_mutation_set.cc | 139 ------------------ .../src/compiler/mcmc/mcmc_over_mapped_pcg.cc | 1 + lib/compiler/src/compiler/search_result.cc | 2 +- .../src/compiler/mcmc/mcmc_over_mapped_pcg.cc | 5 +- .../apply_substitution/apply_substitution.cc | 2 - 8 files changed, 21 insertions(+), 175 deletions(-) delete mode 100644 lib/compiler/src/compiler/graph_optimize_result.cc diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h index 6dfefec7d1..796c94b371 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h @@ -9,22 +9,11 @@ std::optional get_naive_mapping(ParallelComputationGraph &pcg, MachineSpecification const &resources, DeviceType const &device_type); -std::vector - get_possible_mutations(SearchResult mapped_pcg, - MachineSpecification const &resource); + std::optional get_random_mutation(SearchResult mapped_pcg, MachineSpecification const &resource, DeviceType const &device_type); -MachineView increment_stride(MachineView machine_view, nonnegative_int dim); 
-MachineView decrement_all_strides(MachineView machine_view); -MachineView change_stride(nonnegative_int stride, - MachineView machine_view, - nonnegative_int dim); -MachineView change_node_idx(nonnegative_int node_ix, MachineView machine_view); -MachineView change_device_idx(nonnegative_int device_idx, - MachineView machine_view); -MachineView switch_projection(MachineView machine_view, nonnegative_int dim); } // namespace FlexFlow #endif diff --git a/lib/compiler/src/compiler/graph_optimize_result.cc b/lib/compiler/src/compiler/graph_optimize_result.cc deleted file mode 100644 index 33243a226d..0000000000 --- a/lib/compiler/src/compiler/graph_optimize_result.cc +++ /dev/null @@ -1,15 +0,0 @@ -#include "compiler/search_result.h" - -namespace FlexFlow { - -std::string format_as(SearchResult const &r) { - return fmt::format("", - as_dot(r.pcg), - r.machine_mapping); -} - -std::ostream &operator<<(std::ostream &s, SearchResult const &r) { - return (s << fmt::to_string(r)); -} - -} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc index 411ee67145..1276a63893 100644 --- a/lib/compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc +++ b/lib/compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc @@ -2,6 +2,7 @@ #include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h" #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" #include "substitutions/apply_substitution/evaluate_substitution_output.h" +#include "substitutions/apply_substitution/apply_substitution.h" #include "substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h" #include "substitutions/open_parallel_tensor_guid_t.h" #include "substitutions/pcg_pattern_match.h" @@ -13,6 +14,7 @@ #include "utils/containers/restrict_keys.h" #include 
"utils/containers/set_minus.h" #include "utils/containers/values.h" +#include "utils/containers/is_subseteq_of.h" namespace FlexFlow { @@ -47,6 +49,7 @@ SearchResult apply_substitution_and_update_machine_mapping( transform(matched_nodes, [&](parallel_layer_guid_t const &node) { return machine_views.at(node); }); + MachineView first_substituted_machine_view = *substituted_machine_views.begin(); std::unordered_map post_node_data = [&] { @@ -56,10 +59,8 @@ SearchResult apply_substitution_and_update_machine_mapping( std::unordered_map post_node_data_from_sub = output_graph_data.node_data; - // just taking the first substituted machine view, not sure if this - // is fine for (auto [layer, attrs] : post_node_data_from_sub) { - machine_views.try_emplace(layer, *substituted_machine_views.begin()); + machine_views.insert_or_assign(layer, first_substituted_machine_view); } return merge_disjoint_maps(post_node_data_from_orig, @@ -175,6 +176,18 @@ SearchResult apply_substitution_and_update_machine_mapping( post_value_data, }; + assert(is_subseteq_of(keys(post_node_data), keys(machine_views))); + + for (auto it = machine_views.begin(); it != machine_views.end(); ) { + if (post_node_data.find(it->first) == post_node_data.end()) { + it = machine_views.erase(it); + } else { + ++it; + } + } + + assert(keys(post_node_data) == keys(machine_views)); + return SearchResult{ pcg_from_sub_pcg_by_dropping_inputs(sub_pcg_from_graph_data(post_data)), MachineMapping{machine_views}}; diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc index 7f7a54d07a..3688385e2f 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc @@ -29,71 +29,6 @@ std::optional std::optional get_random_mutation(SearchResult mapped_pcg, - MachineSpecification const &resources, - DeviceType const 
&device_type) { - ParallelComputationGraph pcg = mapped_pcg.pcg; - std::vector layers = topological_ordering(pcg); - if (layers.size() == 0) { - return std::nullopt; - } - parallel_layer_guid_t random_layer = select_random(layers); - MachineMapping machine_mapping = mapped_pcg.machine_mapping; - MachineView machine_view = machine_mapping.machine_views.at(random_layer); - - int mutation_op = select_random(range(6)); - switch (mutation_op) { - case 0: { - machine_view = decrement_all_strides(machine_view); - break; - } - case 1: { - nonnegative_int rand_dim = select_random( - nonnegative_range(nonnegative_int{num_dims(machine_view)})); - machine_view = increment_stride(machine_view, rand_dim); - break; - } - case 2: { - nonnegative_int rand_node_idx = - select_random(nonnegative_range(resources.num_nodes)); - machine_view = change_node_idx(rand_node_idx, machine_view); - break; - } - case 3: { - if (device_type == DeviceType::GPU) { - nonnegative_int rand_device_idx = - select_random(nonnegative_range(resources.num_gpus_per_node)); - machine_view = change_device_idx(rand_device_idx, machine_view); - } else { - nonnegative_int rand_device_idx = - select_random(nonnegative_range(resources.num_cpus_per_node)); - machine_view = change_device_idx(rand_device_idx, machine_view); - } - break; - } - case 4: { - nonnegative_int rand_dim = select_random( - nonnegative_range(nonnegative_int{num_dims(machine_view)})); - machine_view = switch_projection(machine_view, rand_dim); - break; - } - case 5: { - // copy layer - parallel_layer_guid_t layer_to_copy = select_random(layers); - machine_view = machine_mapping.machine_views.at(layer_to_copy); - break; - } - } - OperatorTaskSpace task = get_operator_task_space(pcg, random_layer); - if (is_valid_machine_view(machine_view, task, resources)) { - // only apply it if valid - machine_mapping.machine_views.at(random_layer) = machine_view; - } - return machine_mapping; -} - -// "lazy" version just picks a random available machine view 
for a random layer -std::optional - get_random_mutation_lazy(SearchResult mapped_pcg, MachineSpecification const &resources, DeviceType const &device_type) { ParallelComputationGraph pcg = mapped_pcg.pcg; @@ -102,7 +37,6 @@ std::optional return std::nullopt; } parallel_layer_guid_t random_layer = select_random(layers); - ; MachineMapping machine_mapping = mapped_pcg.machine_mapping; MachineView machine_view = machine_mapping.machine_views.at(random_layer); @@ -115,77 +49,4 @@ std::optional machine_mapping.machine_views.at(random_layer) = random_new_machine_view; return machine_mapping; } - -MachineView increment_stride(MachineView machine_view, nonnegative_int dim) { - std::vector strides = get_strides(machine_view); - nonnegative_int new_stride = - strides.at(dim.unwrap_nonnegative()).unwrapped + 1_n; - return change_stride(new_stride, machine_view, dim); -} - -MachineView decrement_all_strides(MachineView machine_view) { - std::vector strides = get_strides(machine_view); - for (nonnegative_int dim : - nonnegative_range(nonnegative_int{num_dims(machine_view)})) { - nonnegative_int old_stride = strides.at(dim.unwrap_nonnegative()).unwrapped; - if (old_stride >= 1_n) { - machine_view = - change_stride(nonnegative_int{old_stride.unwrap_nonnegative() - 1}, - machine_view, - dim); - } - } - return machine_view; -} - -MachineView change_stride(nonnegative_int stride, - MachineView machine_view, - nonnegative_int dim) { - std::vector strides = get_strides(machine_view); - strides.at(dim.unwrap_nonnegative()) = stride_t{stride}; - MachineView new_machine_view = - machine_view_from_strides_and_machine_spec_dimensions( - machine_view.start, strides, get_dimensions(machine_view)); - return new_machine_view; -} - -MachineView change_node_idx(nonnegative_int node_ix, MachineView machine_view) { - MachineView new_machine_view = - machine_view_from_strides_and_machine_spec_dimensions( - MachineSpaceCoordinate{node_ix, - machine_view.start.device_idx, - 
machine_view.start.device_type}, - get_strides(machine_view), - get_dimensions(machine_view)); - return new_machine_view; -} - -MachineView change_device_idx(nonnegative_int device_idx, - MachineView machine_view) { - MachineView new_machine_view = - machine_view_from_strides_and_machine_spec_dimensions( - MachineSpaceCoordinate{machine_view.start.node_idx, - device_idx, - machine_view.start.device_type}, - get_strides(machine_view), - get_dimensions(machine_view)); - return new_machine_view; -} - -MachineView switch_projection(MachineView machine_view, nonnegative_int dim) { - std::vector dims = - get_dimensions(machine_view); - MachineSpecificationDimension projection = dims.at(dim.unwrap_nonnegative()); - if (projection == MachineSpecificationDimension::INTER_NODE) { - dims.at(dim.unwrap_nonnegative()) = - MachineSpecificationDimension::INTRA_NODE; - } else { - dims.at(dim.unwrap_nonnegative()) = - MachineSpecificationDimension::INTER_NODE; - } - MachineView new_machine_view = - machine_view_from_strides_and_machine_spec_dimensions( - machine_view.start, get_strides(machine_view), dims); - return new_machine_view; -} } // namespace FlexFlow diff --git a/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc b/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc index 75ef4d08a6..452cf3baa3 100644 --- a/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc +++ b/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc @@ -6,6 +6,7 @@ #include "substitutions/pcg_pattern.h" #include "substitutions/pcg_pattern_match.h" #include "substitutions/unity_substitution_set.h" +#include "compiler/search_result.h" #include "utils/optional.h" namespace FlexFlow { diff --git a/lib/compiler/src/compiler/search_result.cc b/lib/compiler/src/compiler/search_result.cc index 33243a226d..0afc10723a 100644 --- a/lib/compiler/src/compiler/search_result.cc +++ b/lib/compiler/src/compiler/search_result.cc @@ -3,7 +3,7 @@ namespace FlexFlow { std::string format_as(SearchResult const &r) { 
- return fmt::format("", + return fmt::format("", as_dot(r.pcg), r.machine_mapping); } diff --git a/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc b/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc index 5c469c4301..07feef073d 100644 --- a/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc +++ b/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc @@ -65,7 +65,7 @@ TEST_SUITE(FF_TEST_SUITE) { MCMCOverMappedPCGConfig search_config = MCMCOverMappedPCGConfig{/*temperature=*/1.0, /*num_iterations=*/100_n, - /*substitution_interval=*/100_n, + /*substitution_interval=*/5_n, /*device_type=*/DeviceType::GPU}; SearchResult result = mcmc_graph_optimize( @@ -74,7 +74,6 @@ TEST_SUITE(FF_TEST_SUITE) { result.pcg, cost_estimator, result.machine_mapping, full_machine_spec); std::cout << runtime << std::endl; - CHECK(runtime < 16); - CHECK(false); + CHECK(runtime < 12); } } diff --git a/lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc b/lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc index f1354264f8..61bfe15d7b 100644 --- a/lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc +++ b/lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc @@ -159,8 +159,6 @@ SubParallelComputationGraph post_value_data, }; - std::cout << as_dot(sub_pcg_from_graph_data(post_data)) << std::endl; - return sub_pcg_from_graph_data(post_data); } From 1af4a6b45c11e8ee836d7c3c0eb626d4f42147ff Mon Sep 17 00:00:00 2001 From: Victor Li Date: Sun, 18 May 2025 01:21:51 -0700 Subject: [PATCH 11/11] Fixing formatting --- .../machine_mapping/machine_mapping_mutation_set.h | 2 +- ...apply_substitution_and_update_machine_mapping.cc | 13 +++++++------ .../machine_mapping/machine_mapping_mutation_set.cc | 4 ++-- .../src/compiler/mcmc/mcmc_over_mapped_pcg.cc | 2 +- .../test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc | 2 +- 5 files changed, 12 insertions(+), 11 deletions(-) diff 
--git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h index 796c94b371..43af640e02 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h @@ -9,7 +9,7 @@ std::optional get_naive_mapping(ParallelComputationGraph &pcg, MachineSpecification const &resources, DeviceType const &device_type); - + std::optional get_random_mutation(SearchResult mapped_pcg, MachineSpecification const &resource, diff --git a/lib/compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc index 1276a63893..252384985b 100644 --- a/lib/compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc +++ b/lib/compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc @@ -1,20 +1,20 @@ #include "compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h" #include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h" #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" -#include "substitutions/apply_substitution/evaluate_substitution_output.h" #include "substitutions/apply_substitution/apply_substitution.h" +#include "substitutions/apply_substitution/evaluate_substitution_output.h" #include "substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h" #include "substitutions/open_parallel_tensor_guid_t.h" #include "substitutions/pcg_pattern_match.h" #include "substitutions/sub_parallel_computation_graph.h" #include "substitutions/sub_parallel_computation_graph_data.dtg.h" #include "substitutions/sub_parallel_computation_graph_edge.h" +#include "utils/containers/is_subseteq_of.h" #include "utils/containers/keys.h" #include "utils/containers/merge_maps.h" 
#include "utils/containers/restrict_keys.h" #include "utils/containers/set_minus.h" #include "utils/containers/values.h" -#include "utils/containers/is_subseteq_of.h" namespace FlexFlow { @@ -49,7 +49,8 @@ SearchResult apply_substitution_and_update_machine_mapping( transform(matched_nodes, [&](parallel_layer_guid_t const &node) { return machine_views.at(node); }); - MachineView first_substituted_machine_view = *substituted_machine_views.begin(); + MachineView first_substituted_machine_view = + *substituted_machine_views.begin(); std::unordered_map post_node_data = [&] { @@ -178,11 +179,11 @@ SearchResult apply_substitution_and_update_machine_mapping( assert(is_subseteq_of(keys(post_node_data), keys(machine_views))); - for (auto it = machine_views.begin(); it != machine_views.end(); ) { + for (auto it = machine_views.begin(); it != machine_views.end();) { if (post_node_data.find(it->first) == post_node_data.end()) { - it = machine_views.erase(it); + it = machine_views.erase(it); } else { - ++it; + ++it; } } diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc index 3688385e2f..15648eab74 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc @@ -29,8 +29,8 @@ std::optional std::optional get_random_mutation(SearchResult mapped_pcg, - MachineSpecification const &resources, - DeviceType const &device_type) { + MachineSpecification const &resources, + DeviceType const &device_type) { ParallelComputationGraph pcg = mapped_pcg.pcg; std::vector layers = topological_ordering(pcg); if (layers.size() == 0) { diff --git a/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc b/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc index 452cf3baa3..ab7769679e 100644 --- a/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc +++ 
b/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc @@ -2,11 +2,11 @@ #include "compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h" #include "compiler/machine_mapping/machine_mapping_mutation_set.h" #include "compiler/mcmc/generic_mcmc_algorithm.h" +#include "compiler/search_result.h" #include "compiler/task_graph_simulator/task_simulator.h" #include "substitutions/pcg_pattern.h" #include "substitutions/pcg_pattern_match.h" #include "substitutions/unity_substitution_set.h" -#include "compiler/search_result.h" #include "utils/optional.h" namespace FlexFlow { diff --git a/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc b/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc index 07feef073d..7d74d897e4 100644 --- a/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc +++ b/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc @@ -1,5 +1,5 @@ -#include "../cost_estimator_for_test.h" #include "compiler/mcmc/mcmc_over_mapped_pcg.h" +#include "../cost_estimator_for_test.h" #include "compiler/task_graph_simulator/task_simulator.h" #include "doctest/doctest.h" #include "op-attrs/parallel_tensor_dims.h"