diff --git a/.envrc b/.envrc new file mode 100644 index 0000000000..2797f0f929 --- /dev/null +++ b/.envrc @@ -0,0 +1,3 @@ +source_up_if_exists + +use flake diff --git a/.flake/pkgs/fccf/default.nix b/.flake/pkgs/fccf/default.nix new file mode 100644 index 0000000000..f792b8606c --- /dev/null +++ b/.flake/pkgs/fccf/default.nix @@ -0,0 +1,54 @@ +{ fetchFromGitHub +, stdenv +, cmake +, pkg-config +, libclang +, libllvm +, lib +, zlib +, argparse +, nlohmann_json +, fmt +}: + +stdenv.mkDerivation rec { + pname = "fccf"; + version = "03d373fc65e2d7ceeac441ba4bbddfdc25618dff"; + + src = fetchFromGitHub { + owner = "p-ranav"; + repo = "fccf"; + rev = version; + sha256 = "sha256-3NdPon5ZfjoGFFgBlb0rzRnfWgSopvAc5Gls2NWHaOE="; + }; + + nativeBuildInputs = [ + cmake + pkg-config + ]; + + buildInputs = [ + libclang + libllvm + zlib + argparse + nlohmann_json + fmt + ]; + + patches = [ + ./json-package-name.patch + ./fix-argparse-include.patch + ]; + + cmakeFlags = [ + "-DCMAKE_BUILD_TYPE=Release" + "-DFETCHCONTENT_TRY_FIND_PACKAGE_MODE=ALWAYS" + ]; + + meta = with lib; { + description = "A command-line tool that quickly searches through C/C++ source code in a directory based on a search string and prints relevant code snippets that match the query"; + homepage = "https://github.com/p-ranav/fccf"; + license = licenses.mit; + }; +} diff --git a/.flake/pkgs/fccf/fix-argparse-include.patch b/.flake/pkgs/fccf/fix-argparse-include.patch new file mode 100644 index 0000000000..2cb648c1bf --- /dev/null +++ b/.flake/pkgs/fccf/fix-argparse-include.patch @@ -0,0 +1,13 @@ +diff --git a/source/main.cpp b/source/main.cpp +index 7e131d3..6c05d89 100644 +--- a/source/main.cpp ++++ b/source/main.cpp +@@ -6,7 +6,7 @@ + #include + #include + +-#include ++#include + #include + #include "searcher.hpp" + #include diff --git a/.flake/pkgs/fccf/json-package-name.patch b/.flake/pkgs/fccf/json-package-name.patch new file mode 100644 index 0000000000..51f6a012cf --- /dev/null +++ b/.flake/pkgs/fccf/json-package-name.patch @@ -0,0 +1,12 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 20bcbbf..923075f 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -48,6 +48,7 @@ FetchContent_MakeAvailable(fmt) + + FetchContent_Declare(json + URL https://github.com/nlohmann/json/releases/download/v3.10.5/json.tar.xz ++ FIND_PACKAGE_ARGS NAMES nlohmann_json + ) + FetchContent_MakeAvailable(json) + diff --git a/.github/runs-on.yml b/.github/runs-on.yml index a4fff33536..5033e69d65 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -1,23 +1,4 @@ images: - runs-on-gpu-pinned: - platform: "linux" - arch: "x64" - owner: "135269210855" # runs-on - # to find, go to - # https://us-east-2.console.aws.amazon.com/ec2/home?region=us-east-2#Images:visibility=public-images;search=:runs-on;v=3;$case=tags:false%5C,client:false;$regex=tags:false%5C,client:false - name: "runs-on-v2.2-ubuntu22-gpu-x64-20250220122045" - - runs-on-cpu-pinned: - platform: "linux" - arch: "x64" - owner: "135269210855" # runs-on - name: "runs-on-v2.2-ubuntu22-full-x64-20250220122045" - - official-ubuntu-ami: - platform: "linux" - arch: "x64" - ami: "ami-0a60b027285c0d4c5" - flexflow-gpu-ci: platform: "linux" arch: "x64" diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9d98fb07dd..799e3069a9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -57,9 +57,9 @@ jobs: name: GPU unit tests needs: cpu-ci runs-on: - - runs-on + - runs-on=${{ github.run_id }} - family=g4dn.xlarge - - image=runs-on-gpu-pinned + - 
image=flexflow-gpu-ci
     strategy:
       max-parallel: 1
diff --git a/.proj.toml b/.proj.toml
index a06fb53c3a..8eed6166cd 100644
--- a/.proj.toml
+++ b/.proj.toml
@@ -2,57 +2,81 @@ project_name = "flexflow"
 testsuite_macro = "FF_TEST_SUITE"
 namespace_name = "FlexFlow"
 header_extension = ".h"
+cuda_launch_cmd = [
+  "nixGL",
+  "--",
+]
 
 [targets.utils]
 type = "lib"
-tests = true
-benchmarks = true
+has-cpu-only-tests = true
+has-cpu-only-benchmarks = true
+has-cuda-tests = false
+has-cuda-benchmarks = false
 
 [targets.op-attrs]
 type = "lib"
-tests = true
-benchmarks = false
+has-cpu-only-tests = true
+has-cpu-only-benchmarks = false
+has-cuda-tests = false
+has-cuda-benchmarks = false
 
 [targets.kernels]
 type = "lib"
-tests = true
-benchmarks = false
+has-cpu-only-tests = true
+has-cpu-only-benchmarks = false
+has-cuda-tests = true
+has-cuda-benchmarks = false
 
 [targets.pcg]
 type = "lib"
-tests = true
-benchmarks = false
+has-cpu-only-tests = true
+has-cpu-only-benchmarks = false
+has-cuda-tests = false
+has-cuda-benchmarks = false
 
 [targets.substitutions]
 type = "lib"
-tests = true
-benchmarks = false
+has-cpu-only-tests = true
+has-cpu-only-benchmarks = false
+has-cuda-tests = false
+has-cuda-benchmarks = false
 
 [targets.compiler]
 type = "lib"
-tests = true
-benchmarks = true
+has-cpu-only-tests = true
+has-cpu-only-benchmarks = true
+has-cuda-tests = false
+has-cuda-benchmarks = false
 
 [targets.substitution-generator]
 type = "lib"
-tests = true
-benchmarks = false
+has-cpu-only-tests = true
+has-cpu-only-benchmarks = false
+has-cuda-tests = false
+has-cuda-benchmarks = false
 
 [targets.local-execution]
 type = "lib"
-tests = true
-benchmarks = false
+has-cpu-only-tests = true
+has-cpu-only-benchmarks = false
+has-cuda-tests = false
+has-cuda-benchmarks = false
 
 [targets.models]
 type = "lib"
-tests = true
-benchmarks = false
+has-cpu-only-tests = true
+has-cpu-only-benchmarks = false
+has-cuda-tests = false
+has-cuda-benchmarks = false
 
 [targets.export-model-arch]
 type = "bin"
+cuda = false
 
 [targets.substitution-to-dot]
 type = "bin"
+cuda = false
 
 # default_build_targets = [
 #   "utils",
diff --git a/.vimrc b/.vimrc
new file mode 100644
index 0000000000..4c8a8a8279
--- /dev/null
+++ b/.vimrc
@@ -0,0 +1,8 @@
+" example search path configuration
+set path=lib/runtime/**,lib/**
+
+" set build target
+" let g:target = "pcg"
+
+" set test target
+" let g:test_target = "utils-test"
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 1a1b3c9bee..f52ec68c0c 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -83,6 +83,15 @@ Total Test time (real) = 8.64 sec
 
 If you don't, or if you see any tests failing, please double check that you have followed the instructions above. If you have and are still encountering an issue, please [contact us](#contact-us) with a detailed description of your platform and the commands you have run.
 
+### EditorConfig
+
+FlexFlow Train uses [EditorConfig](https://editorconfig.org/) to ensure consistent low-level details (indentation settings, character encoding, etc.) across different editors.
+The EditorConfig file for FlexFlow Train can be found in [`.editorconfig`](./.editorconfig).
+If you are using vim, emacs, or another editor with built-in EditorConfig support (the full list of such editors is [here](https://editorconfig.org/#pre-installed)), the configuration will be detected and applied without you needing to do anything.
+If you are using an editor not on this list, you will need to install the corresponding [EditorConfig plugin](https://editorconfig.org/#editor-plugins).
+**If you are using VS Code, you should install [this plugin](https://marketplace.visualstudio.com/items?itemName=EditorConfig.EditorConfig).**
+
 ### GPU setup
 
 If you are developing on a machine with one or more CUDA GPUs, you can also run the tests that require a GPU by entering the `gpu` devshell instead of the `default` devshell:
@@ -227,9 +236,8 @@ The bulk of the FlexFlow source code is stored in the following folders:
 
 We currently implement CI testing using Github Workflows. Each workflow is defined by its corresponding YAML file in the [.github/workflows](.github/workflows) folder of the repo. We currently have the following workflows:
 
-1. [`tests`](./.github/workflows/per-lib-check.yml): Builds and runs GPU and non-GPU unit tests for all of the code under `lib` and `bin`. Also uploads coverage numbers to [codecov.io](https://app.codecov.io/gh/flexflow/flexflow-train).
-2. [`clang-format-check.yml`](./.github/workflows/clang-format-check.yml): ensures that the source code is properly formatted using `clang-format`. To format your code locally, run `proj format` (see [here](#building-testing-etc) for more information on `proj`).
-4. [`shell-check.yml`](./.github/workflows/shell-check.yml): runs shellcheck on all bash scripts in the repo.
+1. [`tests.yml`](./.github/workflows/tests.yml): Builds and runs GPU and non-GPU unit tests for all of the code under `lib` and `bin`. Uploads coverage numbers to [codecov.io](https://app.codecov.io/gh/flexflow/flexflow-train). Also ensures that the source code is properly formatted using `clang-format`. To format your code locally, run `proj format` (see [here](#building-testing-etc) for more information on `proj`).
+2. [`shell-check.yml`](./.github/workflows/shell-check.yml): runs shellcheck on all bash scripts in the repo.
 
 GPU machines for CI are managed using [runs-on](https://runs-on.com/).
diff --git a/README.md b/README.md index 0d56bc46e0..f181c4ad96 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,5 @@ # FlexFlow Train -[![clang-format Check](https://github.com/flexflow/flexflow-train/actions/workflows/clang-format-check.yml/badge.svg?branch=master)](https://github.com/flexflow/flexflow-train/actions/workflows/clang-format-check.yml) -[![per-lib-checks](https://github.com/flexflow/flexflow-train/actions/workflows/per-lib-check.yml/badge.svg)](https://github.com/flexflow/flexflow-train/actions/workflows/per-lib-check.yml) +[![tests](https://github.com/flexflow/flexflow-train/actions/workflows/tests.yml/badge.svg)](https://github.com/flexflow/flexflow-train/actions/workflows/tests.yml) [![shell-check](https://github.com/flexflow/flexflow-train/actions/workflows/shell-check.yml/badge.svg)](https://github.com/flexflow/flexflow-train/actions/workflows/shell-check.yml) [![Documentation Status](https://readthedocs.org/projects/flexflow/badge/?version=latest)](https://flexflow.readthedocs.io/en/latest/?badge=latest) diff --git a/cmake/flexflow-utils.cmake b/cmake/flexflow-utils.cmake index 478ebda318..ef5d6d9d11 100644 --- a/cmake/flexflow-utils.cmake +++ b/cmake/flexflow-utils.cmake @@ -126,11 +126,16 @@ function(ff_add_test_executable) ${FF_TEST_EXEC_NAME} ${SRC}) + target_include_directories( + ${FF_TEST_EXEC_NAME} + PRIVATE + ${FF_TEST_EXEC_PRIVATE_INCLUDE}) + target_link_libraries( ${FF_TEST_EXEC_NAME} ${FF_TEST_EXEC_DEPS}) - target_compile_definitions(${FF_TEST_EXEC_NAME} PRIVATE FF_TEST_SUITE="${FF_TEST_EXEC_NAME}" FF_CUDA_TEST_SUITE="cuda-${FF_TEST_EXEC_NAME}") + target_compile_definitions(${FF_TEST_EXEC_NAME} PRIVATE FF_TEST_SUITE="cpu-${FF_TEST_EXEC_NAME}" FF_CUDA_TEST_SUITE="cuda-${FF_TEST_EXEC_NAME}") define_ff_vars(${FF_TEST_EXEC_NAME}) ff_set_cxx_properties(${FF_TEST_EXEC_NAME}) diff --git a/flake.lock b/flake.lock index c991232013..ff6e797d51 100644 --- a/flake.lock +++ b/flake.lock @@ -66,11 +66,11 @@ ] }, "locked": { - "lastModified": 1741679698, - "narHash": "sha256-poSOQS/2qImAo/PgRu37pHdOrwAsZEyC8PMM3evFLX4=", + "lastModified": 1746157536, + "narHash": "sha256-g4Hx/05+Ce3hl8OS1zm4pY/+ThD1blWKmcaPsohSX5Y=", "owner": "lockshaw", "repo": "proj", - "rev": "0de983ff66abea4703f73988d29fc807e2b0a9bd", + "rev": "5871bc7b7fb9d7d7f14c8bca6c50a0cf2e75834d", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 77a6c61b7d..5fa48fa3fd 100644 --- a/flake.nix +++ b/flake.nix @@ -59,6 +59,7 @@ bencher-cli = pkgs.callPackage ./.flake/pkgs/bencher-cli.nix { }; ffdb = pkgs.callPackage ./.flake/pkgs/ffdb { inherit proj; }; hpp2plantuml = pkgs.python3Packages.callPackage ./.flake/pkgs/hpp2plantuml.nix { }; + fccf = pkgs.callPackage ./.flake/pkgs/fccf { }; rapidcheckFull = pkgs.symlinkJoin { name = "rapidcheckFull"; paths = (with pkgs; [ rapidcheck.out rapidcheck.dev ]); @@ -162,6 +163,7 @@ ruff jq gh + expect ]) (with pkgs.python3Packages; [ gitpython @@ -179,6 +181,7 @@ (with self.packages.${system}; [ ffdb hpp2plantuml + fccf ]) ]; }; diff --git a/lib/compiler/include/compiler/algorithm_config.variant.toml b/lib/compiler/include/compiler/algorithm_config.variant.toml new file mode 100644 index 0000000000..4e58104875 --- /dev/null +++ b/lib/compiler/include/compiler/algorithm_config.variant.toml @@ -0,0 +1,18 @@ +namespace = "FlexFlow" +name = "AlgorithmConfig" +features = [ + "eq", + "hash", + "fmt", +] + +includes = [ + "compiler/data_parallelism/data_parallelism_config.dtg.h", + "compiler/unity_algorithm/unity_search_config.dtg.h", +] + +[[values]] +type 
= "::FlexFlow::DataParallelismConfig" + +[[values]] +type = "::FlexFlow::UnitySearchConfig" diff --git a/lib/compiler/include/compiler/compiler.h b/lib/compiler/include/compiler/compiler.h index 178ab19a53..8697c06beb 100644 --- a/lib/compiler/include/compiler/compiler.h +++ b/lib/compiler/include/compiler/compiler.h @@ -1,42 +1,22 @@ #ifndef _FLEXFLOW_COMPILER_COMPILER_H #define _FLEXFLOW_COMPILER_COMPILER_H -#include "pcg/cost_values.h" -#include "pcg/machine_view.h" -#include "pcg/parallel_computation_graph/parallel_computation_graph.h" -#include "pcg/tensor_mapping.h" +#include "compiler/algorithm_config.dtg.h" +#include "compiler/cost_estimator/cost_estimator.h" +#include "compiler/search_result.dtg.h" +#include "pcg/machine_specification.dtg.h" namespace FlexFlow { enum class SearchAlgorithm { DATA_PARALLEL, -}; - -using SearchAlgorithmConfig = std::variant<>; -using SearchSolution = std::variant<>; - -struct SearchResult { - ParallelComputationGraph pcg; - TensorMapping tensor_mapping; - SearchSolution solution; - CostValues cost_values; + UNITY, }; SearchResult optimize(ComputationGraph const &, MachineSpecification const &, CostEstimator const &, - SearchAlgorithm, - optional const &); - -// struct SearchSolution { -// LabelledMultiDiGraph optimized_pcg; -// std::unordered_map device_assignments; -// /* std::unordered_map> tensor_mappings; */ -// }; -// -// SearchSolution run_data_parallelize(ComputationGraph const &, -// MachineSpecification const &); + AlgorithmConfig const &); } // namespace FlexFlow diff --git a/lib/compiler/include/compiler/data_parallelism/data_parallelism_config.struct.toml b/lib/compiler/include/compiler/data_parallelism/data_parallelism_config.struct.toml new file mode 100644 index 0000000000..68512fa473 --- /dev/null +++ b/lib/compiler/include/compiler/data_parallelism/data_parallelism_config.struct.toml @@ -0,0 +1,14 @@ +namespace = "FlexFlow" +name = "DataParallelismConfig" +features = [ + "eq", + "hash", + "fmt", +] + +includes = [ +] + +[[fields]] +name = "degree" +type = "int" diff --git a/lib/compiler/include/compiler/graph_optimize_result.struct.toml b/lib/compiler/include/compiler/graph_optimize_result.struct.toml deleted file mode 100644 index 22f29cbd59..0000000000 --- a/lib/compiler/include/compiler/graph_optimize_result.struct.toml +++ /dev/null @@ -1,16 +0,0 @@ -namespace = "FlexFlow" -name = "GraphOptimizeResult" -features = [ ] - -includes = [ - "compiler/machine_mapping/machine_mapping.dtg.h", - "pcg/parallel_computation_graph/parallel_computation_graph.h" -] - -[[fields]] -name = "pcg" -type = "::FlexFlow::ParallelComputationGraph" - -[[fields]] -name = "machine_mapping" -type = "::FlexFlow::MachineMapping" diff --git a/lib/compiler/include/compiler/allowed_machine_views.h b/lib/compiler/include/compiler/machine_mapping/allowed_machine_views.h similarity index 100% rename from lib/compiler/include/compiler/allowed_machine_views.h rename to lib/compiler/include/compiler/machine_mapping/allowed_machine_views.h diff --git a/lib/compiler/include/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h new file mode 100644 index 0000000000..b08ca57851 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h @@ -0,0 +1,32 @@ +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_APPLY_SUBSTITUTION_AND_UPDATE_MACHINE_MAPPING_H +#define 
_FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_APPLY_SUBSTITUTION_AND_UPDATE_MACHINE_MAPPING_H
+
+#include "compiler/search_result.dtg.h"
+#include "substitutions/pcg_pattern_match.dtg.h"
+#include "substitutions/sub_parallel_computation_graph.dtg.h"
+#include "substitutions/substitution.dtg.h"
+
+namespace FlexFlow {
+/**
+ * @brief Applies \p sub to \p mapped_pcg at the location specified by
+ * \p match, returning the resulting SearchResult (mapped pcg)
+ *
+ * @param mapped_pcg
+ * @param sub
+ * @param match The location at which to apply \p sub. This location in
+ * \p mapped_pcg should match \p sub's PCGPattern. Likely created by running
+ * FlexFlow::find_pattern_matches(PCGPattern const &,
+ * SubParallelComputationGraph const &).
+ * @return SearchResult A mapped pcg similar to \p mapped_pcg, but with the
+ * subgraph of the pcg specified by \p match replaced with the result of the
+ * output expression of \p sub, and with the machine mapping updated to cover
+ * the newly created layers
+ */
+SearchResult apply_substitution_and_update_machine_mapping(
+    SearchResult const &mapped_pcg,
+    Substitution const &sub,
+    PCGPatternMatch const &match);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping.h
index 7375cde985..796225637e 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping.h
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping.h
@@ -2,6 +2,8 @@
 #define _FLEXFLOW_COMPILER_MACHINE_MAPPING_H
 
 #include "compiler/machine_mapping/machine_mapping.dtg.h"
+#include "compiler/machine_mapping/machine_mapping_result.h"
+#include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.dtg.h"
 #include "pcg/device_id_t.dtg.h"
 #include "pcg/machine_specification.dtg.h"
 #include "pcg/operator_task_space.dtg.h"
@@ -14,6 +16,13 @@ MachineMapping combine_disjoint_mappings(MachineMapping const &,
 
 bool nodes_are_disjoint(MachineMapping const &m1, MachineMapping const &m2);
 
+parallel_layer_guid_t
+    get_layer_from_path(PCGBinarySPDecomposition const &sp_decomposition,
+                        BinaryTreePath const &path);
+
+std::optional<MachineMapping> get_machine_mapping_from_machine_mapping_result(
+    PCGBinarySPDecomposition const &, MachineMappingResult const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h
new file mode 100644
index 0000000000..43af640e02
--- /dev/null
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h
@@ -0,0 +1,19 @@
+#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MCMC_MACHINE_MAPPING_MUTATION_SET_H
+#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MCMC_MACHINE_MAPPING_MUTATION_SET_H
+
+#include "compiler/machine_mapping/machine_mapping.h"
+#include "compiler/search_result.dtg.h"
+
+namespace FlexFlow {
+std::optional<MachineMapping>
+    get_naive_mapping(ParallelComputationGraph &pcg,
+                      MachineSpecification const &resources,
+                      DeviceType const &device_type);
+
+std::optional<MachineMapping>
+    get_random_mutation(SearchResult mapped_pcg,
+                        MachineSpecification const &resources,
+                        DeviceType const &device_type);
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h
index 68d02aaa54..168ba6c3d5 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h
@@ -9,6 +9,9 @@
 
 namespace FlexFlow {
 
+bool is_valid_machine_mapping_problem_tree(
+    MachineMappingProblemTree const &problem_tree);
+
 MachineMappingProblemTree
     get_machine_mapping_problem_tree(ParallelComputationGraph const &pcg,
                                      PCGBinarySPDecomposition const &sp);
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h
index 29e9e7c90b..3d1dc91d24 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h
@@ -4,6 +4,7 @@
 #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_parallel_split.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_series_split.dtg.h"
+#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.dtg.h"
 #include "utils/full_binary_tree/binary_tree_path.dtg.h"
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree_implementation.dtg.h"
 #include "utils/graph/series_parallel/sp_decomposition_tree_node_type.dtg.h"
@@ -27,6 +28,9 @@ std::optional<MachineMappingProblemTree>
     mm_problem_tree_get_subtree_at_path(MachineMappingProblemTree const &,
                                         BinaryTreePath const &);
 
+std::string as_dot(MachineMappingProblemTree const &);
+void debug_print_dot(MachineMappingProblemTree const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml
index fe76683eb7..7493c68387 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml
@@ -11,6 +11,7 @@ includes = [
   "op-attrs/parallel_tensor_shape.dtg.h",
   "<vector>",
   "pcg/machine_view.dtg.h",
+  "pcg/operator_task_space.dtg.h",
 ]
 
 src_includes = [
@@ -34,3 +35,6 @@ type = "std::vector<::FlexFlow::ParallelTensorShape>"
 name = "output_shapes"
 type = "std::vector<::FlexFlow::ParallelTensorShape>"
 
+[[fields]]
+name = "op_task_space"
+type = "::FlexFlow::OperatorTaskSpace"
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h
index b21fea5f24..db2f4e6f0d 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h
@@ -31,6 +31,8 @@ FeasibleMachineMappingResult require_feasible(MachineMappingResult const &);
     make_singleton_machine_mapping_result(float runtime,
                                           MachineView const &machine_view);
 
+[[nodiscard]] float get_runtime_cost(MachineMappingResult const &mm_result);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/compiler/include/compiler/mcmc/generic_mcmc_algorithm.h b/lib/compiler/include/compiler/mcmc/generic_mcmc_algorithm.h
new file mode 100644
index 0000000000..a27ecbc8f4
--- /dev/null
+++ b/lib/compiler/include/compiler/mcmc/generic_mcmc_algorithm.h
@@ -0,0 +1,57 @@
+#ifndef _FLEXFLOW_COMPILER_MCMC_GENERIC_MCMC_ALGORITHM_H
+#define _FLEXFLOW_COMPILER_MCMC_GENERIC_MCMC_ALGORITHM_H
+
+#include "compiler/mcmc/generic_mcmc_config.dtg.h"
+#include "compiler/mcmc/generic_mcmc_state.h"
+#include "utils/nonnegative_int/nonnegative_range.h"
+#include "utils/random_utils.h"
+#include <cmath>
+
+namespace FlexFlow {
+
+template <typename State, typename ScoringFunc>
+void modify_state_for_minimization(
+    Generic_MCMC_state<State, float> &best_state,
+    Generic_MCMC_state<State, float> &current_state,
+    State candidate,
+    ScoringFunc scorer,
+    float temperature) {
+  float best_estimate = best_state.get_score();
+  float new_estimate = scorer(candidate);
+  float delta = new_estimate - best_estimate;
+  if (delta < 0 || (randf() < exp(-delta / temperature))) {
+    current_state = Generic_MCMC_state<State, float>(candidate, new_estimate);
+    if (delta < 0) {
+      best_state = current_state;
+    }
+  }
+}
+
+// GeneratingFunc : State -> nn_int -> std::optional<State>
+// ScoringFunc : State -> float
+
+template <typename State, typename GeneratingFunc, typename ScoringFunc>
+Generic_MCMC_state<State, float>
+    minimize_score(State const &starting_state,
+                   GeneratingFunc const &generator,
+                   ScoringFunc const &scorer,
+                   GenericMCMCConfig const &search_config) {
+  using MCMCState = Generic_MCMC_state<State, float>;
+  MCMCState best_state = MCMCState(starting_state, scorer(starting_state));
+  MCMCState current_state = best_state;
+  for (nonnegative_int i : nonnegative_range(search_config.num_iterations)) {
+    std::optional<State> candidate = generator(current_state.get_state(), i);
+    if (candidate != std::nullopt) {
+      modify_state_for_minimization(best_state,
+                                    current_state,
+                                    candidate.value(),
+                                    scorer,
+                                    search_config.temperature);
+    }
+  }
+  return best_state;
+}
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/mcmc/generic_mcmc_config.struct.toml b/lib/compiler/include/compiler/mcmc/generic_mcmc_config.struct.toml
new file mode 100644
index 0000000000..e11c84f0bd
--- /dev/null
+++ b/lib/compiler/include/compiler/mcmc/generic_mcmc_config.struct.toml
@@ -0,0 +1,19 @@
+namespace = "FlexFlow"
+name = "GenericMCMCConfig"
+features = [
+  "eq",
+  "hash",
+  "fmt",
+]
+
+includes = [
+  "utils/nonnegative_int/nonnegative_int.h"
+]
+
+[[fields]]
+name = "temperature"
+type = "float"
+
+[[fields]]
+name = "num_iterations"
+type = "::FlexFlow::nonnegative_int"
\ No newline at end of file
diff --git a/lib/compiler/include/compiler/mcmc/generic_mcmc_state.h b/lib/compiler/include/compiler/mcmc/generic_mcmc_state.h
new file mode 100644
index 0000000000..6a6aada32b
--- /dev/null
+++ b/lib/compiler/include/compiler/mcmc/generic_mcmc_state.h
@@ -0,0 +1,27 @@
+#ifndef _FLEXFLOW_COMPILER_MCMC_GENERIC_MCMC_STATE_H
+#define _FLEXFLOW_COMPILER_MCMC_GENERIC_MCMC_STATE_H
+#include "utils/nonnegative_int/nonnegative_int.h"
+
+namespace FlexFlow {
+
+template <typename State, typename Score>
+struct Generic_MCMC_state {
+public:
+  Generic_MCMC_state(State const &state, Score const &score)
+      : state(state), score(score) {}
+
+  State const &get_state() const {
+    return state;
+  }
+  Score const &get_score() const {
+    return score;
+  }
+
+private:
+  State state;
+  Score score;
+};
+
+} // namespace FlexFlow
+
+#endif
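[Editor's note] The `GeneratingFunc`/`ScoringFunc` contract above is easiest to see with a toy instantiation. The following sketch is not part of the patch: the random-walk generator and quadratic scorer are invented for illustration, and it assumes the dtgen-generated `GenericMCMCConfig` constructor takes its fields in declaration order (`randf()` comes from `utils/random_utils.h`, which the header already includes):

```cpp
#include "compiler/mcmc/generic_mcmc_algorithm.h"

using namespace FlexFlow;

int main() {
  // Generator: propose a +/-1 random step; never returns nullopt here.
  auto generator = [](int x, nonnegative_int /*iter*/) -> std::optional<int> {
    return x + (randf() < 0.5 ? -1 : 1);
  };
  // Scorer: (x - 7)^2, so the search should settle near x == 7.
  auto scorer = [](int x) -> float {
    return static_cast<float>((x - 7) * (x - 7));
  };

  GenericMCMCConfig config = GenericMCMCConfig{
      /*temperature=*/1.0f,
      /*num_iterations=*/nonnegative_int{1000},
  };

  Generic_MCMC_state<int, float> best =
      minimize_score(/*starting_state=*/0, generator, scorer, config);
  return static_cast<int>(best.get_score());
}
```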
diff --git a/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg.h b/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg.h
new file mode 100644
index 0000000000..c2d8737184
--- /dev/null
+++ b/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg.h
@@ -0,0 +1,22 @@
+#ifndef _FLEXFLOW_COMPILER_MCMC_OVER_MAPPED_PCG_H
+#define _FLEXFLOW_COMPILER_MCMC_OVER_MAPPED_PCG_H
+
+#include "compiler/cost_estimator/cost_estimator.h"
+#include "compiler/mcmc/mcmc_over_mapped_pcg_config.dtg.h"
+#include "compiler/search_result.dtg.h"
+#include "pcg/computation_graph.h"
+#include "pcg/machine_specification.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
+#include "substitutions/sub_parallel_computation_graph.h"
+#include "substitutions/substitution.h"
+
+namespace FlexFlow {
+
+SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg,
+                                 CostEstimator const &cost_estimator,
+                                 MachineSpecification const &resources,
+                                 MCMCOverMappedPCGConfig const &search_config);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg_config.struct.toml b/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg_config.struct.toml
new file mode 100644
index 0000000000..e1548a581e
--- /dev/null
+++ b/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg_config.struct.toml
@@ -0,0 +1,28 @@
+namespace = "FlexFlow"
+name = "MCMCOverMappedPCGConfig"
+features = [
+  "eq",
+  "hash",
+  "fmt",
+]
+
+includes = [
+  "pcg/device_type.dtg.h",
+  "utils/nonnegative_int/nonnegative_int.h"
+]
+
+[[fields]]
+name = "temperature"
+type = "float"
+
+[[fields]]
+name = "num_iterations"
+type = "::FlexFlow::nonnegative_int"
+
+[[fields]]
+name = "substitution_interval"
+type = "::FlexFlow::nonnegative_int"
+
+[[fields]]
+name = "device_type"
+type = "::FlexFlow::DeviceType"
\ No newline at end of file
diff --git a/lib/compiler/include/compiler/graph_optimize_result.h b/lib/compiler/include/compiler/search_result.h
similarity index 54%
rename from lib/compiler/include/compiler/graph_optimize_result.h
rename to lib/compiler/include/compiler/search_result.h
index f3843e2a93..197b36e9ea 100644
--- a/lib/compiler/include/compiler/graph_optimize_result.h
+++ b/lib/compiler/include/compiler/search_result.h
@@ -1,12 +1,12 @@
 #ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_GRAPH_OPTIMIZE_RESULT_H
 #define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_GRAPH_OPTIMIZE_RESULT_H
 
-#include "compiler/graph_optimize_result.dtg.h"
+#include "compiler/search_result.dtg.h"
 
 namespace FlexFlow {
 
-std::string format_as(GraphOptimizeResult const &);
-std::ostream &operator<<(std::ostream &, GraphOptimizeResult const &);
+std::string format_as(SearchResult const &);
+std::ostream &operator<<(std::ostream &, SearchResult const &);
 
 } // namespace FlexFlow
 
diff --git a/lib/compiler/include/compiler/search_result.struct.toml b/lib/compiler/include/compiler/search_result.struct.toml
new file mode 100644
index 0000000000..120d182c75
--- /dev/null
+++ b/lib/compiler/include/compiler/search_result.struct.toml
@@ -0,0 +1,17 @@
+namespace = "FlexFlow"
+name = "SearchResult"
+features = [
+]
+
+includes = [
+  "pcg/parallel_computation_graph/parallel_computation_graph.h",
+  "compiler/machine_mapping/machine_mapping.h",
+]
+
+[[fields]]
+name = "pcg"
+type = "::FlexFlow::ParallelComputationGraph"
+
+[[fields]]
+name = "machine_mapping"
+type = "::FlexFlow::MachineMapping"
diff --git a/lib/compiler/include/compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h 
b/lib/compiler/include/compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h index d43edaa79d..bb7459c767 100644 --- a/lib/compiler/include/compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h +++ b/lib/compiler/include/compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h @@ -1,6 +1,8 @@ #ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_SERIES_PARALLEL_GET_PCG_BALANCED_BINARY_SP_DECOMPOSITION_H #define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_SERIES_PARALLEL_GET_PCG_BALANCED_BINARY_SP_DECOMPOSITION_H +#include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.dtg.h" + namespace FlexFlow { std::optional diff --git a/lib/compiler/include/compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h b/lib/compiler/include/compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h index 86fa1a59aa..e4fd841787 100644 --- a/lib/compiler/include/compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h +++ b/lib/compiler/include/compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h @@ -27,6 +27,10 @@ std::optional std::unordered_multiset get_parallel_layers(PCGBinarySPDecomposition const &); +PCGBinarySPDecomposition + pcg_binary_sp_decomposition_from_binary_sp_decomposition_tree( + BinarySPDecompositionTree const &); + SPDecompositionTreeNodeType get_node_type(PCGBinarySPDecomposition const &); std::unordered_set diff --git a/lib/compiler/include/compiler/unity_algorithm.h b/lib/compiler/include/compiler/unity_algorithm.h deleted file mode 100644 index 232f2b9563..0000000000 --- a/lib/compiler/include/compiler/unity_algorithm.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _FLEXFLOW_COMPILER_UNITY_ALGORITHM_H -#define _FLEXFLOW_COMPILER_UNITY_ALGORITHM_H - -#include "compiler/cost_estimator/cost_estimator.h" -#include "compiler/graph_optimize_result.dtg.h" -#include "optimizer_config.dtg.h" -#include "pcg/computation_graph.h" -#include "pcg/machine_specification.dtg.h" -#include "substitutions/sub_parallel_computation_graph.h" - -namespace FlexFlow { - -GraphOptimizeResult graph_optimize( - ParallelComputationGraph &pcg, - CostEstimator const &cost_estimator, - MachineSpecification const &resources, - std::function( - ParallelLayerAttrs const &, MachineSpecification const &)> const - &allowed_machine_views, - OptimizerConfig const &opt_config); - -} // namespace FlexFlow - -#endif diff --git a/lib/compiler/include/compiler/graph_optimize_state.h b/lib/compiler/include/compiler/unity_algorithm/graph_optimize_state.h similarity index 63% rename from lib/compiler/include/compiler/graph_optimize_state.h rename to lib/compiler/include/compiler/unity_algorithm/graph_optimize_state.h index 404111ff8b..9f609f3118 100644 --- a/lib/compiler/include/compiler/graph_optimize_state.h +++ b/lib/compiler/include/compiler/unity_algorithm/graph_optimize_state.h @@ -1,16 +1,17 @@ -#ifndef _FLEXFLOW_COMPILER_MCMC_STATE_H -#define _FLEXFLOW_COMPILER_MCMC_STATE_H +#ifndef _FLEXFLOW_COMPILER_UNITY_ALGORITHM_STATE_H +#define _FLEXFLOW_COMPILER_UNITY_ALGORITHM_STATE_H -#include "compiler/graph_optimize_result.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" namespace FlexFlow { struct GraphOptimizeState { - explicit GraphOptimizeState(GraphOptimizeResult const &graph_optimize_result, + GraphOptimizeState() = delete; + explicit GraphOptimizeState(ParallelComputationGraph const &pcg, float runtime); - GraphOptimizeResult graph_optimize_result; - float runtime; + ParallelComputationGraph pcg; + float runtime_with_optimal_mm; bool 
operator==(GraphOptimizeState const &other) const; bool operator!=(GraphOptimizeState const &other) const; diff --git a/lib/compiler/include/compiler/unity_algorithm/unity_algorithm.h b/lib/compiler/include/compiler/unity_algorithm/unity_algorithm.h new file mode 100644 index 0000000000..618e764f80 --- /dev/null +++ b/lib/compiler/include/compiler/unity_algorithm/unity_algorithm.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_COMPILER_UNITY_ALGORITHM_H +#define _FLEXFLOW_COMPILER_UNITY_ALGORITHM_H + +#include "compiler/cost_estimator/cost_estimator.h" +#include "compiler/search_result.dtg.h" +#include "compiler/unity_algorithm/unity_search_config.dtg.h" +#include "pcg/machine_specification.dtg.h" +#include "substitutions/substitution.h" + +namespace FlexFlow { + +SearchResult graph_optimize(ParallelComputationGraph &pcg, + CostEstimator const &cost_estimator, + MachineSpecification const &resources, + UnitySearchConfig const &search_config); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/optimizer_config.struct.toml b/lib/compiler/include/compiler/unity_algorithm/unity_search_config.struct.toml similarity index 90% rename from lib/compiler/include/compiler/optimizer_config.struct.toml rename to lib/compiler/include/compiler/unity_algorithm/unity_search_config.struct.toml index b7f4f71e9c..9ec22cf916 100644 --- a/lib/compiler/include/compiler/optimizer_config.struct.toml +++ b/lib/compiler/include/compiler/unity_algorithm/unity_search_config.struct.toml @@ -1,5 +1,5 @@ namespace = "FlexFlow" -name = "OptimizerConfig" +name = "UnitySearchConfig" features = [ "eq", "hash", diff --git a/lib/compiler/src/compiler/compiler.cc b/lib/compiler/src/compiler/compiler.cc new file mode 100644 index 0000000000..a58651f01a --- /dev/null +++ b/lib/compiler/src/compiler/compiler.cc @@ -0,0 +1,26 @@ +#include "compiler/compiler.h" +#include "compiler/unity_algorithm/unity_algorithm.h" +#include "pcg/pcg_from_computation_graph.h" +#include "utils/overload.h" + +namespace FlexFlow { + +SearchResult optimize(ComputationGraph const &computation_graph, + MachineSpecification const &machine_specification, + CostEstimator const &cost_estimator, + AlgorithmConfig const &search_config) { + return search_config.visit(overload{ + [&](DataParallelismConfig const &config) -> SearchResult { + throw std::runtime_error( + "Data parallel search algorithm is not implemented yet"); + }, + [&](UnitySearchConfig const &config) { + ParallelComputationGraph pcg = + pcg_from_computation_graph(computation_graph); + return graph_optimize( + pcg, cost_estimator, machine_specification, config); + }, + }); +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/graph_optimize_result.cc b/lib/compiler/src/compiler/graph_optimize_result.cc deleted file mode 100644 index f48c119603..0000000000 --- a/lib/compiler/src/compiler/graph_optimize_result.cc +++ /dev/null @@ -1,15 +0,0 @@ -#include "compiler/graph_optimize_result.h" - -namespace FlexFlow { - -std::string format_as(GraphOptimizeResult const &r) { - return fmt::format("", - as_dot(r.pcg), - r.machine_mapping); -} - -std::ostream &operator<<(std::ostream &s, GraphOptimizeResult const &r) { - return (s << fmt::to_string(r)); -} - -} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/graph_optimize_state.cc b/lib/compiler/src/compiler/graph_optimize_state.cc deleted file mode 100644 index 1091b92866..0000000000 --- a/lib/compiler/src/compiler/graph_optimize_state.cc +++ /dev/null @@ -1,96 +0,0 @@ -#include "compiler/graph_optimize_state.h" 
-#include "compiler/graph_optimize_result.h" -#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" - -namespace FlexFlow { - -GraphOptimizeState::GraphOptimizeState( - GraphOptimizeResult const &graph_optimize_result, float runtime) - : graph_optimize_result(graph_optimize_result), runtime(runtime) {} - -bool GraphOptimizeState::operator==(GraphOptimizeState const &other) const { - // Note(@wmdi): This is a hack to implement a partially correct homomorphism - // check. Switch to the homomorphism check used in substitutions right after - // https://github.com/flexflow/FlexFlow/pull/1471 is merged. - auto layers1 = topological_ordering(graph_optimize_result.pcg); - auto layers2 = topological_ordering(other.graph_optimize_result.pcg); - if (layers1.size() != layers2.size()) { - return false; - } - std::unordered_map mapping; - for (size_t i = 0; i < layers1.size(); ++i) { - if (get_parallel_layer_attrs(graph_optimize_result.pcg, layers1[i]) != - get_parallel_layer_attrs(other.graph_optimize_result.pcg, layers2[i])) { - return false; - } - auto inputs1 = get_incoming_tensors(graph_optimize_result.pcg, layers1[i]); - auto inputs2 = - get_incoming_tensors(other.graph_optimize_result.pcg, layers2[i]); - if (inputs1.size() != inputs2.size()) { - return false; - } - for (size_t j = 0; j < inputs1.size(); ++j) { - if (inputs1[j] != mapping.at(inputs2[j])) { - return false; - } - } - auto outputs1 = get_layer_outputs(graph_optimize_result.pcg, layers1[i]); - auto outputs2 = - get_layer_outputs(other.graph_optimize_result.pcg, layers2[i]); - if (outputs1.size() != outputs2.size()) { - return false; - } - for (size_t j = 0; j < outputs1.size(); ++j) { - mapping.emplace(outputs2[j], outputs1[j]); - } - } - return true; -} - -bool GraphOptimizeState::operator!=(GraphOptimizeState const &other) const { - return !(*this == other); -} - -bool GraphOptimizeState::operator<(GraphOptimizeState const &other) const { - return runtime < other.runtime; -} - -std::string format_as(GraphOptimizeState const &st) { - return fmt::format("", - st.graph_optimize_result, - st.runtime); -} - -std::ostream &operator<<(std::ostream &s, GraphOptimizeState const &st) { - return (s << fmt::to_string(st)); -} - -} // namespace FlexFlow - -namespace std { - -size_t hash<::FlexFlow::GraphOptimizeState>::operator()( - ::FlexFlow::GraphOptimizeState const &state) const { - // TODO(@wmdi): Eventually it might be good to use a proper graph hash like - // https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash.html#networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash - size_t seed = 0; - auto layers = topological_ordering(state.graph_optimize_result.pcg); - ::FlexFlow::hash_combine(seed, layers.size()); - for (auto layer : layers) { - ::FlexFlow::hash_combine( - seed, get_parallel_layer_attrs(state.graph_optimize_result.pcg, layer)); - auto inputs = get_incoming_tensors(state.graph_optimize_result.pcg, layer); - ::FlexFlow::hash_combine(seed, inputs.size()); - for (auto input : inputs) { - for (size_t i = 0; i < layers.size(); ++i) { - if (get_source_layer(input) == layers[i]) { - ::FlexFlow::hash_combine(seed, i); - break; - } - } - } - } - return seed; -} - -} // namespace std diff --git a/lib/compiler/src/compiler/allowed_machine_views.cc b/lib/compiler/src/compiler/machine_mapping/allowed_machine_views.cc similarity index 79% rename from lib/compiler/src/compiler/allowed_machine_views.cc rename to 
lib/compiler/src/compiler/machine_mapping/allowed_machine_views.cc
index 6f86d1d82a..b4df1451ca 100644
--- a/lib/compiler/src/compiler/allowed_machine_views.cc
+++ b/lib/compiler/src/compiler/machine_mapping/allowed_machine_views.cc
@@ -1,4 +1,4 @@
-#include "compiler/allowed_machine_views.h"
+#include "compiler/machine_mapping/allowed_machine_views.h"
 #include "pcg/machine_specification.h"
 #include "pcg/machine_view.h"
 #include "pcg/multi_dimensional_stride.dtg.h"
@@ -57,6 +57,8 @@ static std::unordered_set<MachineView>
         product(transform(tensor_dims, [](nonnegative_int num_devices) {
           return nonnegative_int{num_devices.unwrap_nonnegative() - 1};
         }));
+    min_num_devices_with_full_stride_volume =
+        std::max(min_num_devices_with_full_stride_volume, 1_n);
     return ceildiv(total_devices, min_num_devices_with_full_stride_volume);
   };
 
@@ -66,13 +68,19 @@ static std::unordered_set<MachineView>
     nonnegative_int max_stride_upper_bound =
         get_max_stride_upper_bound(tensor_dims, total_devices);
 
-    std::vector<stride_t> single_stride_range =
-        transform(nonnegative_range(1_n, max_stride_upper_bound + 1_n),
-                  [](nonnegative_int stride) { return stride_t{stride}; });
+    std::vector<std::vector<stride_t>> stride_options =
+        transform(tensor_dims, [&](nonnegative_int dim_size) {
+          if (dim_size != 1_n) {
+            return transform(
+                nonnegative_range(1_n, max_stride_upper_bound + 1_n),
+                [](nonnegative_int stride) { return stride_t{stride}; });
+          } else {
+            return std::vector{stride_t{1_n}};
+          }
+        });
+
     std::unordered_multiset<std::vector<stride_t>> raw_stride_vectors =
-        cartesian_product(
-            repeat_element(/*num_times=*/num_elements(tensor_dims),
-                           /*element=*/single_stride_range));
+        cartesian_product(stride_options);
     std::unordered_multiset<MultiDimensionalStride> strides =
         transform(raw_stride_vectors, [](auto const &stride_vec) {
           return MultiDimensionalStride{stride_vec};
@@ -94,10 +102,18 @@ static std::unordered_set<MachineView>
   };
 
   auto candidate_dimensions = [](OperatorTaskSpace const &task) {
-    std::unordered_set<MachineSpecificationDimension> options = {
-        MachineSpecificationDimension::INTER_NODE,
-        MachineSpecificationDimension::INTRA_NODE};
-    return get_all_permutations_with_repetition(options, num_dims(task));
+    std::vector<std::vector<MachineSpecificationDimension>> dimension_options =
+        transform(task.degrees, [](nonnegative_int dim_size) {
+          if (dim_size == 1_n) {
+            return std::vector{
+                MachineSpecificationDimension::INTRA_NODE};
+          } else {
+            return std::vector{
+                MachineSpecificationDimension::INTER_NODE,
+                MachineSpecificationDimension::INTRA_NODE};
+          }
+        });
+    return cartesian_product(dimension_options);
   };
 
   std::vector<nonnegative_int> tensor_dims = task.degrees;
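[Editor's note] The effect of the two hunks above is that task-space dimensions of degree 1 contribute exactly one stride option and one machine-specification-dimension option, so the cartesian products of candidates shrink accordingly. A standalone counting sketch of this idea, using plain `std::` types rather than the FlexFlow helpers (purely illustrative):

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// Stand-in for the per-dimension option lists built above: each inner
// vector holds the candidate values for one task-space dimension.
std::size_t num_combinations(std::vector<std::vector<int>> const &options) {
  std::size_t n = 1;
  for (auto const &opts : options) {
    n *= opts.size();
  }
  return n;
}

int main() {
  std::vector<int> full = {1, 2, 3}; // e.g. three candidate strides
  std::vector<int> pinned = {1};     // degree-1 dims get a single option

  // degrees = {4, 4}: both dimensions enumerate all strides.
  std::cout << num_combinations({full, full}) << "\n"; // prints 9
  // degrees = {4, 1}: the second dimension is pinned.
  std::cout << num_combinations({full, pinned}) << "\n"; // prints 3
}
```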
+#include "substitutions/sub_parallel_computation_graph_edge.h" +#include "utils/containers/is_subseteq_of.h" +#include "utils/containers/keys.h" +#include "utils/containers/merge_maps.h" +#include "utils/containers/restrict_keys.h" +#include "utils/containers/set_minus.h" +#include "utils/containers/values.h" + +namespace FlexFlow { + +SearchResult apply_substitution_and_update_machine_mapping( + SearchResult const &mapped_pcg, + Substitution const &sub, + PCGPatternMatch const &match) { + SubParallelComputationGraph spcg = sub_pcg_from_full_pcg(mapped_pcg.pcg); + + auto substitution_output_result = + evaluate_substitution_output(spcg, sub, match); + SubParallelComputationGraph substitution_output_graph = + substitution_output_result.first; + OutputExprToResultSubPCGMapping output_expr_to_result_sub_pcg_mapping = + substitution_output_result.second; + + SubParallelComputationGraphData output_graph_data = + get_sub_pcg_data(substitution_output_graph); + SubParallelComputationGraphData pre_data = get_sub_pcg_data(spcg); + + std::unordered_set pre_nodes = + keys(pre_data.node_data); + std::unordered_set matched_nodes = + unordered_set_of(values(match.node_assignment)); + std::unordered_set post_nodes_from_original_graph = + set_minus(pre_nodes, matched_nodes); + + std::unordered_map machine_views = + mapped_pcg.machine_mapping.machine_views; + + std::unordered_set substituted_machine_views = + transform(matched_nodes, [&](parallel_layer_guid_t const &node) { + return machine_views.at(node); + }); + MachineView first_substituted_machine_view = + *substituted_machine_views.begin(); + + std::unordered_map post_node_data = + [&] { + std::unordered_map + post_node_data_from_orig = restrict_keys( + pre_data.node_data, post_nodes_from_original_graph); + std::unordered_map + post_node_data_from_sub = output_graph_data.node_data; + + for (auto [layer, attrs] : post_node_data_from_sub) { + machine_views.insert_or_assign(layer, first_substituted_machine_view); + } + + return merge_disjoint_maps(post_node_data_from_orig, + post_node_data_from_sub); + }(); + + std::unordered_set post_edges = [&] { + std::unordered_set post_edges_from_orig = + filter(pre_data.edges, [&](SubParallelComputationGraphEdge const &e) { + if (e.raw_edge.has()) { + return true; + } else { + DataflowEdge dfe = e.raw_edge.get(); + parallel_layer_guid_t src = parallel_layer_guid_t{dfe.src.node}; + parallel_layer_guid_t dst = parallel_layer_guid_t{dfe.dst.node}; + return !(contains(matched_nodes, src) || + contains(matched_nodes, dst)); + } + }); + + std::unordered_set post_edges_from_sub = + filter(output_graph_data.edges, + [&](SubParallelComputationGraphEdge const &e) { + return !e.raw_edge.has(); + }); + + bidict + output_orig_pattern_mapping = get_output_mapping_for_pcg_pattern_match( + match, sub.pcg_pattern, spcg); + bidict + output_post_outexpr_mapping = get_output_graph_expr_output_mapping( + output_expr_to_result_sub_pcg_mapping, + sub.output_graph_expr, + substitution_output_graph); + + std::unordered_set incoming_to_sub_edges; + for (auto const &[pattern_input, base_graph_tensor] : + match.input_assignment) { + OutputGraphExprInput output_expr_input = + sub.inputs_mapping.at_l(pattern_input); + input_parallel_tensor_guid_t output_graph_input = + output_expr_to_result_sub_pcg_mapping.input_mapping.at_r( + output_expr_input); + std::unordered_set uses = get_parallel_tensor_uses( + substitution_output_graph, + open_parallel_tensor_guid_from_input(output_graph_input)); + for (parallel_tensor_use_t const &use : uses) { + 
+        SubParallelComputationGraphEdge new_edge =
+            subpcg_edge_from_tensor_and_use(base_graph_tensor, use);
+        incoming_to_sub_edges.insert(new_edge);
+      }
+    }
+
+    std::unordered_set<SubParallelComputationGraphEdge> outgoing_from_sub_edges;
+    for (ParallelComputationGraphEdge const &outgoing_edge :
+         get_subgraph_outgoing_edges(spcg, matched_nodes)) {
+      parallel_tensor_guid_t original_tensor =
+          get_parallel_tensor(outgoing_edge);
+      PatternNodeOutput pattern_tensor =
+          output_orig_pattern_mapping.at_r(original_tensor);
+      OutputGraphExprNodeOutput output_graph_tensor =
+          sub.outputs_mapping.at_l(pattern_tensor);
+      parallel_tensor_guid_t new_tensor =
+          output_post_outexpr_mapping.at_r(output_graph_tensor);
+
+      SubParallelComputationGraphEdge new_edge =
+          subpcg_edge_from_tensor_and_dst(
+              new_tensor,
+              get_dst_layer(outgoing_edge),
+              get_dst_layer_input_idx(outgoing_edge));
+      outgoing_from_sub_edges.insert(new_edge);
+    }
+
+    return set_union(std::vector{
+        post_edges_from_orig,
+        post_edges_from_sub,
+        incoming_to_sub_edges,
+        outgoing_from_sub_edges,
+    });
+  }();
+
+  std::unordered_set<input_parallel_tensor_guid_t> post_inputs =
+      pre_data.inputs;
+
+  std::unordered_map<open_parallel_tensor_guid_t, ParallelTensorAttrs>
+      post_value_data = [&] {
+        std::unordered_map<open_parallel_tensor_guid_t, ParallelTensorAttrs>
+            post_value_data_from_orig = filter_keys(
+                pre_data.value_data, [&](open_parallel_tensor_guid_t const &t) {
+                  return visit_open_parallel_tensor_guid(
+                      t,
+                      overload{
+                          [&](parallel_tensor_guid_t const &t) {
+                            return contains(post_nodes_from_original_graph,
+                                            get_source_layer(t));
+                          },
+                          [](input_parallel_tensor_guid_t const &) {
+                            return true;
+                          },
+                      });
+                });
+
+        std::unordered_map<open_parallel_tensor_guid_t, ParallelTensorAttrs>
+            post_value_data_from_sub = output_graph_data.value_data;
+        return merge_disjoint_maps(post_value_data_from_orig,
+                                   post_value_data_from_sub);
+      }();
+
+  SubParallelComputationGraphData post_data = SubParallelComputationGraphData{
+      post_node_data,
+      post_edges,
+      post_inputs,
+      post_value_data,
+  };
+
+  assert(is_subseteq_of(keys(post_node_data), keys(machine_views)));
+
+  for (auto it = machine_views.begin(); it != machine_views.end();) {
+    if (post_node_data.find(it->first) == post_node_data.end()) {
+      it = machine_views.erase(it);
+    } else {
+      ++it;
+    }
+  }
+
+  assert(keys(post_node_data) == keys(machine_views));
+
+  return SearchResult{
+      pcg_from_sub_pcg_by_dropping_inputs(sub_pcg_from_graph_data(post_data)),
+      MachineMapping{machine_views}};
+}
+
+} // namespace FlexFlow
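[Editor's note] A minimal sketch of how this function is meant to be driven (illustrative only; it assumes, per the doc comment in the header, that `find_pattern_matches(PCGPattern const &, SubParallelComputationGraph const &)` produces the candidate `PCGPatternMatch`es, and that it returns them in a `std::vector`):

```cpp
#include "compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h"
#include "substitutions/pcg_pattern.h"
#include "substitutions/sub_parallel_computation_graph.h"
#include <vector>

namespace FlexFlow {

// Apply `sub` at the first location where its pattern matches, or return
// the input unchanged if there is no match.
SearchResult apply_at_first_match(SearchResult const &mapped_pcg,
                                  Substitution const &sub) {
  SubParallelComputationGraph spcg = sub_pcg_from_full_pcg(mapped_pcg.pcg);
  std::vector<PCGPatternMatch> matches =
      find_pattern_matches(sub.pcg_pattern, spcg);
  if (matches.empty()) {
    return mapped_pcg;
  }
  return apply_substitution_and_update_machine_mapping(
      mapped_pcg, sub, matches.front());
}

} // namespace FlexFlow
```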
diff --git a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
index 49d528e4ab..0743301e8f 100644
--- a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
+++ b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
@@ -16,9 +16,13 @@
 #include "pcg/machine_view.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph.h"
 #include "utils/containers/contains.h"
+#include "utils/containers/contains_key.h"
 #include "utils/containers/flatmap.h"
 #include "utils/containers/generate_map.h"
 #include "utils/containers/get_all_assignments.h"
+#include "utils/containers/keys.h"
+#include "utils/containers/merge_maps.h"
+#include "utils/containers/set_minus.h"
 #include "utils/containers/unordered_set_of.h"
 #include "utils/exception.h"
 #include "utils/overload.h"
@@ -80,17 +84,23 @@ MachineMappingResult
                       &parallel_split_transformation) {
 
   auto get_boundary_machine_view_assignments =
-      [&](std::unordered_set<BinaryTreePath> const &boundary_layers)
+      [&](std::unordered_set<BinaryTreePath> const &boundary_layers,
+          MachineMappingProblemTree const &t,
+          BinaryTreePathEntry const &prefix)
       -> std::unordered_set<ParallelLayerGuidObliviousMachineMapping> {
+    std::unordered_set<BinaryTreePath> unconstrained_boundary_layers =
+        set_minus(boundary_layers,
+                  keys(restrict_to_child(constraints, prefix).machine_views));
+
     std::unordered_map<BinaryTreePath, std::unordered_set<MachineView>>
         allowed = generate_map(
-            boundary_layers,
+            unconstrained_boundary_layers,
             [&](BinaryTreePath const &l) -> std::unordered_set<MachineView> {
+              MachineMappingProblemTree subtree_at_path =
+                  expect(mm_problem_tree_get_subtree_at_path(t, l),
+                         "Failed to get subtree at path");
               UnmappedOpCostEstimateKey leaf =
-                  mm_problem_tree_get_subtree_at_path(
-                      MachineMappingProblemTree{series_split}, l)
-                      .value()
-                      .get<UnmappedOpCostEstimateKey>();
+                  subtree_at_path.get<UnmappedOpCostEstimateKey>();
               return context.allowed_machine_views(leaf, resources);
             });
     return transform(
@@ -138,24 +148,37 @@ MachineMappingResult
 
   for (ParallelLayerGuidObliviousMachineMapping const
            &assigned_pre_machine_views :
-       get_boundary_machine_view_assignments(get_src_layers(tensor_movement))) {
+       get_boundary_machine_view_assignments(get_src_layers(tensor_movement),
+                                             series_split.get_left_child(),
+                                             BinaryTreePathEntry::LEFT_CHILD)) {
 
     MachineMappingResult pre_result =
         eval_pre_boundary_mapping(assigned_pre_machine_views);
 
+    if (is_infeasible(pre_result)) {
+      continue;
+    }
+
     for (ParallelLayerGuidObliviousMachineMapping const
              &assigned_post_machine_views :
          get_boundary_machine_view_assignments(
-             get_dst_layers(tensor_movement))) {
+             get_dst_layers(tensor_movement),
+             series_split.get_right_child(),
+             BinaryTreePathEntry::RIGHT_CHILD)) {
       MachineMappingResult post_result =
           eval_post_boundary_mapping(assigned_post_machine_views);
+
+      if (is_infeasible(post_result)) {
+        continue;
+      }
+
       TensorSetMovement comm_across_split =
           concretize_abstracted_tensor_set_movement(
               tensor_movement,
-              /*pre_mapping=*/assigned_pre_machine_views,
-              /*post_mapping=*/assigned_post_machine_views);
+              /*pre_mapping=*/pre_result.raw_result.value().machine_mapping,
+              /*post_mapping=*/post_result.raw_result.value().machine_mapping);
+
       float cost_across_split =
           context.cost_estimator.estimate_cost(comm_across_split);
diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
index 82c8274808..07bde820e9 100644
--- a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
+++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
@@ -1,7 +1,16 @@
 #include "compiler/machine_mapping/machine_mapping.h"
+#include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h"
+#include "pcg/machine_specification.h"
+#include "pcg/machine_view.h"
+#include "pcg/operator_task_space.dtg.h"
+#include "pcg/operator_task_space.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
 #include "utils/containers/are_disjoint.h"
 #include "utils/containers/keys.h"
+#include "utils/containers/map_keys.h"
 #include "utils/containers/merge_maps.h"
+#include "utils/containers/transform.h"
+#include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_subtree_at_path.h"
 
 namespace FlexFlow {
 
@@ -15,4 +24,39 @@ bool nodes_are_disjoint(MachineMapping const &m1, MachineMapping const &m2) {
   return are_disjoint(keys(m1.machine_views), keys(m2.machine_views));
 }
 
+parallel_layer_guid_t
+    get_layer_from_path(PCGBinarySPDecomposition const &sp_decomposition,
+                        BinaryTreePath const &path) {
+  std::optional<PCGBinarySPDecomposition> subtree_optional =
+      get_subtree_at_path(
+          sp_decomposition, generic_impl_for_pcg_sp_tree(), path);
+
+  if (!subtree_optional.has_value()) {
+    throw std::runtime_error(fmt::format("Invalid tree path {}", path));
+  }
+
+  PCGBinarySPDecomposition subtree = subtree_optional.value();
+  if (!subtree.is_leaf()) {
+    throw std::runtime_error(
+        fmt::format("Invalid tree path to a leaf: found {} instead", subtree));
+  }
+  return subtree.require_leaf();
+}
+
+std::optional<MachineMapping> get_machine_mapping_from_machine_mapping_result(
+    PCGBinarySPDecomposition const &sp_decomposition,
+    MachineMappingResult const &mm_result) {
+
+  return transform(
+      mm_result.raw_result,
+      [&](FeasibleMachineMappingResult const &feasible_mm_result) {
+        return MachineMapping{
+            map_keys(feasible_mm_result.machine_mapping.raw_mapping,
+                     [&](BinaryTreePath const &path) {
+                       return get_layer_from_path(sp_decomposition, path);
+                     }),
+        };
+      });
+}
+
 } // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc
new file mode 100644
index 0000000000..15648eab74
--- /dev/null
+++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc
@@ -0,0 +1,52 @@
+#include "compiler/machine_mapping/machine_mapping_mutation_set.h"
+#include "compiler/machine_mapping/allowed_machine_views.h"
+#include "pcg/machine_view.h"
+#include "pcg/operator_task_space.h"
+#include "utils/containers/vector_of.h"
+#include "utils/nonnegative_int/nonnegative_range.h"
+#include "utils/random_utils.h"
+#include "utils/vector.h"
+
+namespace FlexFlow {
+
+std::optional<MachineMapping>
+    get_naive_mapping(ParallelComputationGraph &pcg,
+                      MachineSpecification const &resources,
+                      DeviceType const &device_type) {
+  std::vector<parallel_layer_guid_t> layers = topological_ordering(pcg);
+  std::unordered_map<parallel_layer_guid_t, MachineView> machine_views;
+  for (parallel_layer_guid_t layer : layers) {
+    OperatorTaskSpace task = get_operator_task_space(pcg, layer);
+    std::unordered_set<MachineView> allowed_machine_views =
+        get_allowed_machine_views(resources, task, device_type);
+    if (allowed_machine_views.empty()) {
+      return std::nullopt;
+    }
+    machine_views.insert({layer, *(allowed_machine_views.begin())});
+  }
+  return MachineMapping{machine_views};
+}
+
+std::optional<MachineMapping>
+    get_random_mutation(SearchResult mapped_pcg,
+                        MachineSpecification const &resources,
+                        DeviceType const &device_type) {
+  ParallelComputationGraph pcg = mapped_pcg.pcg;
+  std::vector<parallel_layer_guid_t> layers = topological_ordering(pcg);
+  if (layers.size() == 0) {
+    return std::nullopt;
+  }
+  parallel_layer_guid_t random_layer = select_random(layers);
+
+  MachineMapping machine_mapping = mapped_pcg.machine_mapping;
+  MachineView machine_view = machine_mapping.machine_views.at(random_layer);
+  OperatorTaskSpace task = get_operator_task_space(pcg, random_layer);
+
+  std::vector<MachineView> allowed_machine_views =
+      vector_of(get_allowed_machine_views(resources, task, device_type));
+  MachineView random_new_machine_view = select_random(allowed_machine_views);
+
+  machine_mapping.machine_views.at(random_layer) = random_new_machine_view;
+  return machine_mapping;
+}
+} // namespace FlexFlow
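[Editor's note] Together, `get_naive_mapping` and `get_random_mutation` give a seed state and a transition step for the MCMC loop. A hedged sketch of how they compose (illustrative; the real wiring presumably lives in `mcmc_over_mapped_pcg.cc`, which is not part of this diff):

```cpp
#include "compiler/machine_mapping/machine_mapping_mutation_set.h"

namespace FlexFlow {

// Seed with the naive mapping, then take a single random mutation step.
// Error handling is minimal; a real caller would loop and feed candidates
// into minimize_score from generic_mcmc_algorithm.h.
std::optional<SearchResult>
    seed_and_mutate_once(ParallelComputationGraph &pcg,
                         MachineSpecification const &resources,
                         DeviceType device_type) {
  std::optional<MachineMapping> naive =
      get_naive_mapping(pcg, resources, device_type);
  if (!naive.has_value()) {
    return std::nullopt;
  }

  SearchResult seed = SearchResult{pcg, naive.value()};
  std::optional<MachineMapping> mutated =
      get_random_mutation(seed, resources, device_type);
  if (!mutated.has_value()) {
    return std::nullopt;
  }
  return SearchResult{pcg, mutated.value()};
}

} // namespace FlexFlow
```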
+#include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h" #include "compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" #include "compiler/machine_mapping/transitive_reduced_pcg.h" #include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "utils/containers/all_of.h" #include "utils/overload.h" namespace FlexFlow { +bool is_valid_machine_mapping_problem_tree( + MachineMappingProblemTree const &problem_tree) { + return problem_tree.visit(overload{ + [&](MMProblemTreeSeriesSplit const &series_split) { + AbstractedTensorSetMovement tensor_movement = + series_split.tensor_set_movement; + + auto contains_paths = + [](MachineMappingProblemTree const &t, + std::unordered_set const &paths) { + return all_of(paths, [&](BinaryTreePath const &p) { + return mm_problem_tree_get_subtree_at_path(t, p).has_value(); + }); + }; + + return contains_paths(series_split.get_left_child(), + get_src_layers(tensor_movement)) && + contains_paths(series_split.get_right_child(), + get_dst_layers(tensor_movement)) && + is_valid_machine_mapping_problem_tree( + series_split.get_left_child()) && + is_valid_machine_mapping_problem_tree( + series_split.get_right_child()); + }, + [&](MMProblemTreeParallelSplit const ¶llel_split) { + return is_valid_machine_mapping_problem_tree( + parallel_split.get_left_child()) && + is_valid_machine_mapping_problem_tree( + parallel_split.get_right_child()); + }, + [&](UnmappedOpCostEstimateKey const &leaf) { return true; }, + }); +} + MachineMappingProblemTree get_machine_mapping_problem_tree( ParallelComputationGraph const &pcg, PCGBinarySPDecomposition const &sp_decomposition_tree) { @@ -23,31 +59,43 @@ MachineMappingProblemTree get_machine_mapping_problem_tree( [&](PCGBinarySeriesSplit const &series) { AbstractedTensorSetMovement tensor_movement = get_abstracted_tensor_set_movement_across_split(tr_pcg, series); - return MachineMappingProblemTree{ + MachineMappingProblemTree result = MachineMappingProblemTree{ MMProblemTreeSeriesSplit{ /*tensor_set_movement=*/tensor_movement, /*lhs=*/to_problem_tree(series.get_left_child()), /*rhs=*/to_problem_tree(series.get_right_child()), }, }; + assert(is_valid_machine_mapping_problem_tree(result)); + return result; }, [&](PCGBinaryParallelSplit const ¶llel) { - return MachineMappingProblemTree{ + MachineMappingProblemTree result = MachineMappingProblemTree{ MMProblemTreeParallelSplit{ to_problem_tree(parallel.get_left_child()), to_problem_tree(parallel.get_right_child()), }, }; + assert(is_valid_machine_mapping_problem_tree(result)); + return result; }, [&](parallel_layer_guid_t const &leaf) { - return MachineMappingProblemTree{ + MachineMappingProblemTree result = MachineMappingProblemTree{ get_unmapped_op_cost_estimate_key_for_layer(pcg, leaf), }; + assert(is_valid_machine_mapping_problem_tree(result)); + return result; }, }); }; - return to_problem_tree(sp_decomposition_tree); + MachineMappingProblemTree mm_tree = to_problem_tree(sp_decomposition_tree); + + if (!is_valid_machine_mapping_problem_tree(mm_tree)) { + throw std::runtime_error("Invalid machine mapping problem tree generated"); + } + + return mm_tree; } } // namespace FlexFlow diff --git 
a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.cc index 1e39a7be19..7834938e41 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.cc @@ -1,4 +1,6 @@ #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" +#include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h" +#include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/as_dot.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_all_leaf_paths.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_leaves.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_subtree_at_path.h" @@ -88,4 +90,54 @@ std::optional tree, generic_binary_sp_impl_for_mm_problem_tree(), path); } +std::string as_dot(MachineMappingProblemTree const &tree) { + std::function + get_series_label = + [](MMProblemTreeSeriesSplit const &series) -> std::string { + auto path_as_dot = [](BinaryTreePath const &path) -> std::string { + return "(" + + join_strings(path.entries, + ", ", + [](BinaryTreePathEntry const &entry) -> std::string { + if (entry == BinaryTreePathEntry::LEFT_CHILD) { + return "l"; + } else { + assert(entry == BinaryTreePathEntry::RIGHT_CHILD); + return "r"; + } + }) + + ")"; + }; + + auto path_set_as_dot = + [&](std::unordered_set const &path_set) -> std::string { + return "(" + join_strings(path_set, ", ", path_as_dot) + ")"; + }; + + return fmt::format( + "srcs={} dsts={}", + path_set_as_dot(get_src_layers(series.tensor_set_movement)), + path_set_as_dot(get_dst_layers(series.tensor_set_movement))); + }; + + std::function + get_parallel_label = + [](MMProblemTreeParallelSplit const ¶llel) -> std::string { + return "P"; + }; + + std::function get_leaf_label = + [](UnmappedOpCostEstimateKey const &leaf) -> std::string { return ""; }; + + return as_dot(tree, + generic_binary_sp_impl_for_mm_problem_tree(), + get_series_label, + get_parallel_label, + get_leaf_label); +} + +void debug_print_dot(MachineMappingProblemTree const &tree) { + std::cout << as_dot(tree) << std::endl; +} + } // namespace FlexFlow diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc index 990b287f8b..b6d701cb98 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc @@ -1,4 +1,5 @@ #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "pcg/operator_task_space.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.h" #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h" @@ -18,6 +19,8 @@ UnmappedOpCostEstimateKey get_unmapped_op_cost_estimate_key_for_layer( transform(get_incoming_weights(pcg, layer), get_tensor_shape), /*output_shapes=*/ 
transform(get_layer_outputs(pcg, layer), get_tensor_shape), + /*op_task_space=*/ + get_operator_task_space(pcg, layer), }; } diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc index 3409f7f871..031b7f7fc5 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc @@ -135,4 +135,12 @@ MachineMappingResult }; } +float get_runtime_cost(MachineMappingResult const &mm_result) { + if (mm_result.raw_result == std::nullopt) { + return std::numeric_limits::infinity(); + } else { + return mm_result.raw_result.value().runtime; + } +} + } // namespace FlexFlow diff --git a/lib/compiler/src/compiler/mcmc/generic_mcmc_algorithm.cc b/lib/compiler/src/compiler/mcmc/generic_mcmc_algorithm.cc new file mode 100644 index 0000000000..1bf4f5c2b7 --- /dev/null +++ b/lib/compiler/src/compiler/mcmc/generic_mcmc_algorithm.cc @@ -0,0 +1 @@ +#include "compiler/mcmc/generic_mcmc_algorithm.h" diff --git a/lib/compiler/src/compiler/mcmc/generic_mcmc_state.cc b/lib/compiler/src/compiler/mcmc/generic_mcmc_state.cc new file mode 100644 index 0000000000..6aa4dd5eff --- /dev/null +++ b/lib/compiler/src/compiler/mcmc/generic_mcmc_state.cc @@ -0,0 +1,12 @@ +#include "compiler/mcmc/generic_mcmc_state.h" +#include "utils/archetypes/ordered_value_type.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { +using State = value_type<0>; +using Score = ordered_value_type<1>; + +template struct Generic_MCMC_state; +template struct Generic_MCMC_state; + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc b/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc new file mode 100644 index 0000000000..ab7769679e --- /dev/null +++ b/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc @@ -0,0 +1,73 @@ +#include "compiler/mcmc/mcmc_over_mapped_pcg.h" +#include "compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h" +#include "compiler/machine_mapping/machine_mapping_mutation_set.h" +#include "compiler/mcmc/generic_mcmc_algorithm.h" +#include "compiler/search_result.h" +#include "compiler/task_graph_simulator/task_simulator.h" +#include "substitutions/pcg_pattern.h" +#include "substitutions/pcg_pattern_match.h" +#include "substitutions/unity_substitution_set.h" +#include "utils/optional.h" + +namespace FlexFlow { + +SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, + CostEstimator const &cost_estimator, + MachineSpecification const &resources, + MCMCOverMappedPCGConfig const &search_config) { + + std::vector substitutions = get_substitution_set(resources); + + std::optional naive_mapping = + get_naive_mapping(pcg, resources, search_config.device_type); + if (naive_mapping == std::nullopt) { + throw std::runtime_error("Failed to find any solutions"); + } + + SearchResult starting_state = SearchResult{pcg, naive_mapping.value()}; + + auto generating_func = [&](SearchResult mapped_pcg, + nonnegative_int i) -> std::optional { + if (i.unwrap_nonnegative() % + search_config.substitution_interval.unwrap_nonnegative() == + 0) { + // substitutions every (substitution_interval) iterations + std::optional random_substitution = + get_random_substitution(resources); + if (random_substitution != std::nullopt) { + std::optional pattern_match = + get_random_pattern_match(random_substitution.value().pcg_pattern, + sub_pcg_from_full_pcg(mapped_pcg.pcg)); + if (pattern_match != 
std::nullopt) { + return apply_substitution_and_update_machine_mapping( + mapped_pcg, random_substitution.value(), pattern_match.value()); + } + } + return std::nullopt; + } else { + // machine mapping mutations otherwise + std::optional new_machine_mapping = + get_random_mutation(mapped_pcg, resources, search_config.device_type); + if (new_machine_mapping == std::nullopt) { + return std::nullopt; + } + return SearchResult{mapped_pcg.pcg, new_machine_mapping.value()}; + } + }; + + auto scoring_func = [&](SearchResult mapped_pcg) -> float { + return task_simulator_estimate_forward_pass_time( + mapped_pcg.pcg, cost_estimator, mapped_pcg.machine_mapping, resources); + }; + + GenericMCMCConfig config = + GenericMCMCConfig{/*temperature*/ search_config.temperature, + /*num_iterations*/ search_config.num_iterations}; + + Generic_MCMC_state result = + minimize_score(starting_state, generating_func, scoring_func, config); + + return result.get_state(); +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/search_result.cc b/lib/compiler/src/compiler/search_result.cc new file mode 100644 index 0000000000..0afc10723a --- /dev/null +++ b/lib/compiler/src/compiler/search_result.cc @@ -0,0 +1,15 @@ +#include "compiler/search_result.h" + +namespace FlexFlow { + +std::string format_as(SearchResult const &r) { + return fmt::format("", + as_dot(r.pcg), + r.machine_mapping); +} + +std::ostream &operator<<(std::ostream &s, SearchResult const &r) { + return (s << fmt::to_string(r)); +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/series_parallel/pcg/pcg_binary_sp_decomposition.cc b/lib/compiler/src/compiler/series_parallel/pcg/pcg_binary_sp_decomposition.cc index 5eb993c6ef..7b4670c608 100644 --- a/lib/compiler/src/compiler/series_parallel/pcg/pcg_binary_sp_decomposition.cc +++ b/lib/compiler/src/compiler/series_parallel/pcg/pcg_binary_sp_decomposition.cc @@ -1,7 +1,10 @@ #include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h" +#include "compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.h" +#include "compiler/series_parallel/pcg/pcg_binary_parallel_split.h" #include "compiler/series_parallel/pcg/pcg_binary_series_split.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/find_paths_to_leaf.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_leaves.h" +#include "utils/graph/series_parallel/binary_sp_decomposition_tree/left_associative_binary_sp_tree_from_nary.h" #include "utils/overload.h" namespace FlexFlow { @@ -82,8 +85,63 @@ BinarySPDecompositionTree } std::optional - get_pcg_balanced_binary_sp_decomposition(ParallelComputationGraph const &) { - NOT_IMPLEMENTED(); + get_pcg_balanced_binary_sp_decomposition( + ParallelComputationGraph const &pcg) { + SeriesParallelDecomposition sp_decomp = + expect(get_pcg_series_parallel_decomposition(pcg), + "Failed to get SP decomposition of PCG"); + BinarySPDecompositionTree binary_sp_tree = + left_associative_binary_sp_tree_from_nary(sp_decomp); + return pcg_binary_sp_decomposition_from_binary_sp_decomposition_tree( + binary_sp_tree); +} + +PCGBinarySeriesSplit pcg_binary_series_split_from_binary_series_split( + BinarySeriesSplit const &split) { + return PCGBinarySeriesSplit{ + pcg_binary_sp_decomposition_from_binary_sp_decomposition_tree( + split.get_left_child()), + pcg_binary_sp_decomposition_from_binary_sp_decomposition_tree( + split.get_right_child()), + }; +} + +PCGBinaryParallelSplit 
pcg_binary_parallel_split_from_binary_parallel_split( + BinaryParallelSplit const &split) { + return PCGBinaryParallelSplit{ + pcg_binary_sp_decomposition_from_binary_sp_decomposition_tree( + split.get_left_child()), + pcg_binary_sp_decomposition_from_binary_sp_decomposition_tree( + split.get_right_child()), + }; +} + +PCGBinarySPDecomposition + pcg_binary_sp_decomposition_from_binary_sp_decomposition_tree( + BinarySPDecompositionTree const &sp_tree) { + + return sp_tree.visit(overload{ + [](BinarySeriesSplit const &series) -> PCGBinarySPDecomposition { + return PCGBinarySPDecomposition{ + pcg_binary_series_split_from_binary_series_split(series), + }; + }, + [](BinaryParallelSplit const ¶llel) -> PCGBinarySPDecomposition { + return PCGBinarySPDecomposition{ + PCGBinaryParallelSplit{ + pcg_binary_sp_decomposition_from_binary_sp_decomposition_tree( + parallel.get_left_child()), + pcg_binary_sp_decomposition_from_binary_sp_decomposition_tree( + parallel.get_right_child()), + }, + }; + }, + [](Node const &node) -> PCGBinarySPDecomposition { + return PCGBinarySPDecomposition{ + parallel_layer_guid_t{node}, + }; + }, + }); } std::unordered_multiset diff --git a/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc b/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc new file mode 100644 index 0000000000..22e319321b --- /dev/null +++ b/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc @@ -0,0 +1,61 @@ +#include "compiler/unity_algorithm/graph_optimize_state.h" +#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" + +namespace FlexFlow { + +GraphOptimizeState::GraphOptimizeState(ParallelComputationGraph const &pcg, + float runtime_with_optimal_mm) + : pcg(pcg), runtime_with_optimal_mm(runtime_with_optimal_mm) {} + +bool GraphOptimizeState::operator==(GraphOptimizeState const &other) const { + return pcgs_are_isomorphic(pcg, other.pcg); +} + +bool GraphOptimizeState::operator!=(GraphOptimizeState const &other) const { + return !(*this == other); +} + +bool GraphOptimizeState::operator<(GraphOptimizeState const &other) const { + return runtime_with_optimal_mm < other.runtime_with_optimal_mm; +} + +std::string format_as(GraphOptimizeState const &st) { + return fmt::format("", + as_dot(st.pcg), + st.runtime_with_optimal_mm); +} + +std::ostream &operator<<(std::ostream &s, GraphOptimizeState const &st) { + return (s << fmt::to_string(st)); +} + +} // namespace FlexFlow + +namespace std { + +size_t hash<::FlexFlow::GraphOptimizeState>::operator()( + ::FlexFlow::GraphOptimizeState const &state) const { + // TODO(@wmdi): Eventually it might be good to use a proper graph hash like + // https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash.html#networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash + size_t seed = 0; + std::vector<::FlexFlow::parallel_layer_guid_t> layers = + topological_ordering(state.pcg); + ::FlexFlow::hash_combine(seed, layers.size()); + for (::FlexFlow::parallel_layer_guid_t const &layer : layers) { + ::FlexFlow::hash_combine(seed, get_parallel_layer_attrs(state.pcg, layer)); + std::vector<::FlexFlow::parallel_tensor_guid_t> inputs = + get_incoming_tensors(state.pcg, layer); + ::FlexFlow::hash_combine(seed, inputs.size()); + for (::FlexFlow::parallel_tensor_guid_t input : inputs) { + for (size_t i = 0; i < layers.size(); ++i) { + if (get_source_layer(input) == layers.at(i)) { + ::FlexFlow::hash_combine(seed, i); + break; + } + } + } + } + 
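+  // Note (added for exposition, not part of the original patch): the loop
+  // above encodes each edge by the topological index of its source layer,
+  // so the hash value depends on the order topological_ordering returns.
+  // Since operator== treats isomorphic PCGs as equal and std::hash requires
+  // equal values to hash equally, this implicitly assumes isomorphic PCGs
+  // enumerate their layers in the same order; the Weisfeiler-Lehman graph
+  // hash mentioned in the TODO above would remove that assumption.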
return seed; +} + +} // namespace std diff --git a/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc b/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc new file mode 100644 index 0000000000..caaefbfdbf --- /dev/null +++ b/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc @@ -0,0 +1,138 @@ +#include "compiler/unity_algorithm/unity_algorithm.h" +#include "compiler/machine_mapping/allowed_machine_views.h" +#include "compiler/machine_mapping/get_optimal_machine_mapping.h" +#include "compiler/machine_mapping/machine_mapping.h" +#include "compiler/machine_mapping/machine_mapping_cache.h" +#include "compiler/machine_mapping/machine_mapping_constraints.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "compiler/machine_mapping/machine_mapping_result.h" +#include "compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h" +#include "compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.h" +#include "compiler/unity_algorithm/graph_optimize_state.h" +#include "pcg/machine_specification.dtg.h" +#include "pcg/operator_task_space.h" +#include "substitutions/apply_substitution/apply_substitution.h" +#include "substitutions/pcg_pattern.h" +#include "substitutions/sub_parallel_computation_graph.h" +#include "substitutions/substitution.h" +#include "substitutions/unity_substitution_set.h" +#include "utils/containers/generate_map.h" +#include "utils/deduplicated_priority_queue.h" +#include "utils/graph/node/algorithms.h" +#include "utils/optional.h" + +namespace FlexFlow { + +/* + * Applies a substitution to all possible positions in PCG + */ +std::vector + all_pcgs_obtained_by_applying_a_substitution( + ParallelComputationGraph const &pcg, + std::vector const &substitutions) { + std::vector results; + SubParallelComputationGraph subpcg = sub_pcg_from_full_pcg(pcg); + for (Substitution const &substitution : substitutions) { + for (PCGPatternMatch const &pattern_match : + find_pattern_matches(substitution.pcg_pattern, subpcg)) { + SubParallelComputationGraph subpcg_from_substitution = + apply_substitution(subpcg, substitution, pattern_match); + results.push_back( + pcg_from_sub_pcg_by_dropping_inputs(subpcg_from_substitution)); + } + } + return results; +} + +SearchResult graph_optimize(ParallelComputationGraph &pcg, + CostEstimator const &cost_estimator, + MachineSpecification const &resources, + UnitySearchConfig const &search_config) { + + std::vector substitutions = get_substitution_set(resources); + + MachineMappingCache cached_subgraph_costs = empty_machine_mapping_cache(); + DeduplicatedPriorityQueue candidates; + + MachineMappingContext context = MachineMappingContext{ + /*cost_estimator=*/cost_estimator, + /*allowed_machine_views=*/ + [&](UnmappedOpCostEstimateKey const &key, + MachineSpecification const &resources) + -> std::unordered_set { + return get_allowed_machine_views( + resources, key.op_task_space, DeviceType::GPU); + }, + }; + + auto optimize_pcg = [&](ParallelComputationGraph const &pcg) + -> std::pair> { + PCGBinarySPDecomposition sp_decomp = + expect(get_pcg_balanced_binary_sp_decomposition(pcg), + "Failed to get SP decomposition of PCG"); + + MachineMappingProblemTree problem_tree = + get_machine_mapping_problem_tree(pcg, sp_decomp); + MachineMappingConstraints 
constraints = + get_unconstrained_solution_for_layers(get_all_leaf_paths(problem_tree)); + + MachineMappingResult mm_result = get_optimal_machine_mapping( + cached_subgraph_costs, context, problem_tree, resources, constraints); + + return { + GraphOptimizeState{ + /*pcg=*/pcg, + /*runtime_with_optimal_mm=*/get_runtime_cost(mm_result), + }, + get_machine_mapping_from_machine_mapping_result(sp_decomp, mm_result), + }; + }; + + GraphOptimizeState best_state = optimize_pcg(pcg).first; + candidates.push(best_state); + + for (int iteration = 0; + !candidates.empty() && iteration < search_config.budget; + ++iteration) { + GraphOptimizeState current_state = candidates.top(); + candidates.pop(); + + if (current_state < best_state) { + best_state = current_state; + } else if (current_state.runtime_with_optimal_mm > + best_state.runtime_with_optimal_mm * search_config.alpha) { + continue; + } + + for (ParallelComputationGraph const &new_pcg : + all_pcgs_obtained_by_applying_a_substitution(current_state.pcg, + substitutions)) { + std::optional new_pcg_optimize_result = + optimize_pcg(new_pcg).first; + if (new_pcg_optimize_result == std::nullopt) { + continue; + } + GraphOptimizeState new_state = new_pcg_optimize_result.value(); + if (new_state.runtime_with_optimal_mm <= search_config.threshold && + get_nodes(new_pcg.raw_graph).size() <= search_config.max_num_ops) { + candidates.push(new_state); + } + } + } + + std::optional best_mapping = + optimize_pcg(best_state.pcg).second; + + if (best_mapping == std::nullopt) { + throw std::runtime_error("Failed to find any solutions"); + } + + return SearchResult{ + /*pcg=*/best_state.pcg, + /*machine_mapping=*/best_mapping.value(), + }; +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/unity_algorithm.cc b/lib/compiler/src/unity_algorithm.cc deleted file mode 100644 index 86a211c535..0000000000 --- a/lib/compiler/src/unity_algorithm.cc +++ /dev/null @@ -1,93 +0,0 @@ -#include "compiler/unity_algorithm.h" -#include "compiler/graph_optimize_state.h" -#include "compiler/machine_mapping/get_optimal_machine_mapping.h" -#include "pcg/machine_specification.dtg.h" -#include "substitutions/substitution.h" -#include "utils/deduplicated_priority_queue.h" -#include "utils/graph/node/algorithms.h" -namespace FlexFlow { - -/* - * Gets all substitutions applicable to a PCG - */ -std::vector - get_all_applicable_substitutions(ParallelComputationGraph const &pcg) { - NOT_IMPLEMENTED(); -} - -/* - * Applies a substitution to all possible positions in PCG - */ -std::vector - apply_substitution(ParallelComputationGraph const &pcg, - Substitution const &) { - NOT_IMPLEMENTED(); -} - -GraphOptimizeResult graph_optimize( - ParallelComputationGraph &pcg, - CostEstimator const &cost_estimator, - MachineSpecification const &resources, - std::function( - ParallelLayerAttrs const &, MachineSpecification const &)> const - &allowed_machine_views, - OptimizerConfig const &opt_config) { - NOT_IMPLEMENTED(); - - // std::vector substitutions = - // get_all_applicable_substitutions(pcg); - // - // MachineMappingCache cached_subgraph_costs; - // DeduplicatedPriorityQueue candidates; - // - // MachineMappingResult original_pcg_cost = - // get_optimal_machine_mapping(pcg, - // allowed_machine_views, - // cost_estimator, - // resources, - // cached_subgraph_costs); - // - // GraphOptimizeState initial_state = { - // GraphOptimizeResult(pcg, original_pcg_cost.machine_mapping), - // original_pcg_cost.runtime}; - // - // GraphOptimizeState best_state = initial_state; - // 
candidates.push(initial_state); - // - // for (int iteration = 0; !candidates.empty() && iteration < - // opt_config.budget; - // ++iteration) { - // GraphOptimizeState current_state = candidates.top(); - // candidates.pop(); - // - // if (current_state.runtime < best_state.runtime) { - // best_state = current_state; - // } else if (current_state.runtime > best_state.runtime * opt_config.alpha) - // { - // continue; - // } - // - // for (Substitution const &substitution : substitutions) { - // for (ParallelComputationGraph const &new_pcg : apply_substitution( - // current_state.graph_optimize_result.pcg, substitution)) { - // MachineMappingResult new_pcg_cost = - // get_optimal_machine_mapping(new_pcg, - // allowed_machine_views, - // cost_estimator, - // resources, - // cached_subgraph_costs); - // GraphOptimizeState new_state{ - // GraphOptimizeResult(new_pcg, new_pcg_cost.machine_mapping), - // new_pcg_cost.runtime}; - // if (new_pcg_cost.runtime <= opt_config.threshold && - // get_nodes(new_pcg.raw_graph).size() <= opt_config.max_num_ops) { - // candidates.push(new_state); - // } - // } - // } - // } - - // return best_state.graph_optimize_result; -} - -} // namespace FlexFlow diff --git a/lib/compiler/test/src/allowed_machine_views.cc b/lib/compiler/test/src/allowed_machine_views.cc deleted file mode 100644 index 817cc80700..0000000000 --- a/lib/compiler/test/src/allowed_machine_views.cc +++ /dev/null @@ -1,110 +0,0 @@ -#include "compiler/allowed_machine_views.h" -#include "doctest/doctest.h" -#include "utils/containers/extend.h" -#include "utils/containers/range.h" -#include "utils/containers/transform.h" -#include "utils/containers/unordered_set_of.h" -#include "utils/containers/zip.h" -#include "utils/fmt/unordered_set.h" - -using namespace FlexFlow; - -TEST_SUITE(FF_TEST_SUITE) { - - TEST_CASE("get_allowed_machine_views") { - - SUBCASE("1 degree of parallelism") { - MachineSpecification ms = MachineSpecification{ - /*num_nodes=*/1_n, - /*num_cpus_per_node=*/5_n, - /*num_gpus_per_node=*/5_n, - /*inter_node_bandwidth=*/0, - /*intra_node_bandwidth=*/0, - }; - - OperatorTaskSpace task = OperatorTaskSpace{{3_n}}; - - std::unordered_set correct = { - MachineView{ - MachineSpaceCoordinate{ - /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{1_n}, - MachineSpecificationDimension::INTRA_NODE}}, - }, - - MachineView{ - MachineSpaceCoordinate{ - /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{1_n}, - MachineSpecificationDimension::INTRA_NODE}}, - }, - MachineView{ - MachineSpaceCoordinate{ - /*node_idx=*/0_n, /*device_idx=*/2_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{1_n}, - MachineSpecificationDimension::INTRA_NODE}}, - }, - MachineView{ - MachineSpaceCoordinate{ - /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{2_n}, - MachineSpecificationDimension::INTRA_NODE}}, - }, - }; - - std::unordered_set result = - get_allowed_machine_views(ms, task, DeviceType::GPU); - - CHECK(correct == result); - } - - SUBCASE("2 degrees of parallelism") { - - MachineSpecification ms = MachineSpecification{ - /*num_nodes=*/3_n, - /*num_cpus_per_node=*/3_n, - /*num_gpus_per_node=*/3_n, - /*inter_node_bandwidth=*/0, - /*intra_node_bandwidth=*/0, - }; - OperatorTaskSpace task = OperatorTaskSpace{{2_n, 3_n}}; - - auto make_2d_view = [&](nonnegative_int start_node_idx, - nonnegative_int start_device_idx, - nonnegative_int stride1, - nonnegative_int stride2, - 
MachineSpecificationDimension m1, - MachineSpecificationDimension m2) { - return MachineView{ - MachineSpaceCoordinate{ - start_node_idx, start_device_idx, DeviceType::GPU}, - {MachineViewDimension{stride_t{stride1}, m1}, - MachineViewDimension{stride_t{stride2}, m2}}, - }; - }; - - auto intra = MachineSpecificationDimension::INTRA_NODE; - auto inter = MachineSpecificationDimension::INTER_NODE; - std::unordered_set correct = { - make_2d_view( - 0_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, inter, intra), - make_2d_view( - 1_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, inter, intra), - make_2d_view( - 0_n, 0_n, /*stride1=*/2_n, /*stride2=*/1_n, inter, intra), - - make_2d_view( - 0_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, intra, inter), - make_2d_view( - 0_n, 1_n, /*stride1=*/1_n, /*stride2=*/1_n, intra, inter), - make_2d_view( - 0_n, 0_n, /*stride1=*/2_n, /*stride2=*/1_n, intra, inter), - }; - - std::unordered_set result = - get_allowed_machine_views(ms, task, DeviceType::GPU); - - CHECK(correct == result); - } - } -} diff --git a/lib/compiler/test/src/compiler/machine_mapping/allowed_machine_views.cc b/lib/compiler/test/src/compiler/machine_mapping/allowed_machine_views.cc new file mode 100644 index 0000000000..f176621a18 --- /dev/null +++ b/lib/compiler/test/src/compiler/machine_mapping/allowed_machine_views.cc @@ -0,0 +1,156 @@ +#include "compiler/machine_mapping/allowed_machine_views.h" +#include "doctest/doctest.h" +#include "utils/containers/extend.h" +#include "utils/containers/range.h" +#include "utils/containers/transform.h" +#include "utils/containers/unordered_set_of.h" +#include "utils/containers/zip.h" +#include "utils/fmt/unordered_set.h" + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + + TEST_CASE("get_allowed_machine_views") { + + auto make_2d_view = [&](nonnegative_int start_node_idx, + nonnegative_int start_device_idx, + nonnegative_int stride_1, + nonnegative_int stride_2, + MachineSpecificationDimension m1, + MachineSpecificationDimension m2) { + return MachineView{ + MachineSpaceCoordinate{ + start_node_idx, start_device_idx, DeviceType::GPU}, + {MachineViewDimension{stride_t{stride_1}, m1}, + MachineViewDimension{stride_t{stride_2}, m2}}, + }; + }; + auto intra = MachineSpecificationDimension::INTRA_NODE; + auto inter = MachineSpecificationDimension::INTER_NODE; + + SUBCASE("1 degree of parallelism") { + MachineSpecification ms = MachineSpecification{ + /*num_nodes=*/1_n, + /*num_cpus_per_node=*/5_n, + /*num_gpus_per_node=*/5_n, + /*inter_node_bandwidth=*/0, + /*intra_node_bandwidth=*/0, + }; + + OperatorTaskSpace task = OperatorTaskSpace{{3_n}}; + + std::unordered_set correct = { + MachineView{ + MachineSpaceCoordinate{ + /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, + MachineSpecificationDimension::INTRA_NODE}}, + }, + + MachineView{ + MachineSpaceCoordinate{ + /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, + MachineSpecificationDimension::INTRA_NODE}}, + }, + MachineView{ + MachineSpaceCoordinate{ + /*node_idx=*/0_n, /*device_idx=*/2_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, + MachineSpecificationDimension::INTRA_NODE}}, + }, + MachineView{ + MachineSpaceCoordinate{ + /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{2_n}, + MachineSpecificationDimension::INTRA_NODE}}, + }, + }; + + std::unordered_set result = + get_allowed_machine_views(ms, task, DeviceType::GPU); + + CHECK(correct == result); + } + + 
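+    // Note (added for exposition, not part of the original patch): the four
+    // views above are exactly the placements of a 3-task space on one 5-GPU
+    // node: stride-1 starts at device 0, 1, or 2 (occupying {0,1,2},
+    // {1,2,3}, {2,3,4}) and a stride-2 start at device 0 (occupying
+    // {0,2,4}); a stride-2 start at device 1 would need device 5, which
+    // does not exist.
+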
SUBCASE("2 degrees of parallelism") { + + MachineSpecification ms = MachineSpecification{ + /*num_nodes=*/3_n, + /*num_cpus_per_node=*/3_n, + /*num_gpus_per_node=*/3_n, + /*inter_node_bandwidth=*/0, + /*intra_node_bandwidth=*/0, + }; + OperatorTaskSpace task = OperatorTaskSpace{{2_n, 3_n}}; + + std::unordered_set correct = { + make_2d_view( + 0_n, 0_n, /*stride_1=*/1_n, /*stride_2=*/1_n, inter, intra), + make_2d_view( + 1_n, 0_n, /*stride_1=*/1_n, /*stride_2=*/1_n, inter, intra), + make_2d_view( + 0_n, 0_n, /*stride_1=*/2_n, /*stride_2=*/1_n, inter, intra), + + make_2d_view( + 0_n, 0_n, /*stride_1=*/1_n, /*stride_2=*/1_n, intra, inter), + make_2d_view( + 0_n, 1_n, /*stride_1=*/1_n, /*stride_2=*/1_n, intra, inter), + make_2d_view( + 0_n, 0_n, /*stride_1=*/2_n, /*stride_2=*/1_n, intra, inter), + }; + + std::unordered_set result = + get_allowed_machine_views(ms, task, DeviceType::GPU); + + CHECK(correct == result); + } + + SUBCASE("2D operator task space, dimensions (1,1)") { + MachineSpecification full_machine_spec = MachineSpecification{ + /*num_nodes=*/nonnegative_int{2}, + /*num_cpus_per_node=*/nonnegative_int{1}, + /*num_gpus_per_node=*/nonnegative_int{1}, + /*inter_node_bandwidth=*/1, + /*intra_node_bandwidth=*/1, + }; + OperatorTaskSpace task = OperatorTaskSpace{{1_n, 1_n}}; + + std::unordered_set result = + get_allowed_machine_views(full_machine_spec, task, DeviceType::GPU); + + std::unordered_set correct = { + make_2d_view( + 0_n, 0_n, /*stride_1=*/1_n, /*stride_2=*/1_n, intra, intra), + make_2d_view( + 1_n, 0_n, /*stride_1=*/1_n, /*stride_2=*/1_n, intra, intra)}; + CHECK(correct == result); + } + + SUBCASE("2D operator task space, dimensions (2,1)") { + MachineSpecification full_machine_spec = MachineSpecification{ + /*num_nodes=*/nonnegative_int{2}, + /*num_cpus_per_node=*/nonnegative_int{2}, + /*num_gpus_per_node=*/nonnegative_int{2}, + /*inter_node_bandwidth=*/1, + /*intra_node_bandwidth=*/1, + }; + OperatorTaskSpace task = OperatorTaskSpace{{1_n, 2_n}}; + + std::unordered_set result = + get_allowed_machine_views(full_machine_spec, task, DeviceType::GPU); + + std::unordered_set correct = { + make_2d_view( + 0_n, 0_n, /*stride_1=*/1_n, /*stride_2=*/1_n, intra, intra), + make_2d_view( + 0_n, 0_n, /*stride_1=*/1_n, /*stride_2=*/1_n, intra, inter), + make_2d_view( + 1_n, 0_n, /*stride_1=*/1_n, /*stride_2=*/1_n, intra, intra), + make_2d_view( + 0_n, 1_n, /*stride_1=*/1_n, /*stride_2=*/1_n, intra, inter)}; + CHECK(correct == result); + } + } +} diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index e506dea1d7..a45227011c 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -109,11 +109,14 @@ TEST_SUITE(FF_TEST_SUITE) { DataType::FLOAT, }; + OperatorTaskSpace fake_op_task_space = OperatorTaskSpace{{}}; + UnmappedOpCostEstimateKey k1 = UnmappedOpCostEstimateKey{ /*op_attrs=*/PCGOperatorAttrs{InputAttrs{tensor_shape}}, /*input_shapes=*/{}, /*weight_shapes=*/{}, /*output_shapes=*/{}, + /*op_task_space=*/fake_op_task_space, }; UnmappedOpCostEstimateKey k2 = UnmappedOpCostEstimateKey{ @@ -126,6 +129,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*input_shapes=*/{}, /*weight_shapes=*/{}, /*output_shapes=*/{}, + /*op_task_space=*/fake_op_task_space, }; ParallelTensorShape par_tensor_shape = lift_to_parallel(tensor_shape); diff --git 
a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc index 048f1ddcac..9059950742 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc @@ -1,8 +1,15 @@ #include "compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" +#include "compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h" #include "op-attrs/parallel_tensor_shape.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/operator_task_space.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" +#include "pcg/pcg_from_computation_graph.h" +#include "utils/containers/extend.h" #include "utils/containers/get_only.h" +#include "utils/containers/vector_of.h" #include using namespace ::FlexFlow; @@ -90,6 +97,14 @@ TEST_SUITE(FF_TEST_SUITE) { PCGOperatorAttrs input_attrs = PCGOperatorAttrs{InputAttrs{input_shape}}; + auto make_operator_task_space = [&](ParallelTensorShape const &shape) { + std::vector degrees; + extend(degrees, vector_of(ff_ordered_shard_degrees(shape))); + degrees.push_back(get_sum_degree(shape)); + degrees.push_back(get_discard_copy_degree(shape)); + return OperatorTaskSpace{degrees}; + }; + auto make_input_key = [&](ParallelTensorShape const ¶llel_tensor_shape) { return UnmappedOpCostEstimateKey{ @@ -97,6 +112,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*input_shapes=*/{}, /*weight_shapes=*/{}, /*output_shapes=*/{parallel_tensor_shape}, + /*op_task_space=*/make_operator_task_space(parallel_tensor_shape), }; }; @@ -143,11 +159,15 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_layer_guid_t relu_layer = relu_added.parallel_layer; parallel_tensor_guid_t relu_output = get_only(relu_added.outputs); + OperatorTaskSpace relu_task_space = + get_operator_task_space(pcg, relu_layer); + UnmappedOpCostEstimateKey relu_key = UnmappedOpCostEstimateKey{ /*op_attrs=*/relu_attrs, /*input_shapes=*/{par_input_shape}, /*weight_shapes=*/{}, /*output_shapes=*/{relu_output_shape}, + /*op_task_space=*/relu_task_space, }; PCGBinarySPDecomposition sp_decomposition = pcg_make_series( @@ -228,11 +248,14 @@ TEST_SUITE(FF_TEST_SUITE) { {input1_tensor, input2_tensor}, {}); parallel_layer_guid_t ew_op_layer = ew_op_added.parallel_layer; + OperatorTaskSpace ew_op_task_space = + get_operator_task_space(pcg, ew_op_layer); UnmappedOpCostEstimateKey ew_op_key = UnmappedOpCostEstimateKey{ /*op_attrs=*/ew_op_attrs, /*input_shapes=*/{par_input_shape, par_input_shape}, /*weight_shapes=*/{}, /*output_shapes=*/{ew_op_output_shape}, + /*op_task_space=*/ew_op_task_space, }; PCGBinarySPDecomposition sp_decomposition = @@ -280,4 +303,43 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } } + + TEST_CASE("from pcg") { + ComputationGraph cg = [&] { + ComputationGraphBuilder b; + TensorShape input_tensor_shape = TensorShape{ + TensorDims{ + FFOrdered{nonnegative_int{32}, + nonnegative_int{64}}, + }, + DataType::FLOAT, + }; + tensor_guid_t t = b.create_input(input_tensor_shape, CreateGrad::YES); + t = b.dense(t, + /*outDim=*/nonnegative_int{16}, + 
/*activation=*/std::nullopt); + t = b.gelu(t); + t = b.dense(t, + /*outDim=*/nonnegative_int{12}, + /*activation=*/std::nullopt, + /*use_bias=*/false, + /*data_type=*/DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt); + t = b.relu(t); + t = b.dense(t, + /*outDim=*/nonnegative_int{8}, + /*activation=*/Activation::RELU); + return b.computation_graph; + }(); + + ParallelComputationGraph pcg = pcg_from_computation_graph(cg); + + PCGBinarySPDecomposition sp_decomp = + expect(get_pcg_balanced_binary_sp_decomposition(pcg), + "Failed to get SP decomposition of PCG"); + + MachineMappingProblemTree problem_tree = + get_machine_mapping_problem_tree(pcg, sp_decomp); + } } diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc index 8ae1ebe753..f049f4b288 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc @@ -99,6 +99,7 @@ TEST_SUITE(FF_TEST_SUITE) { } }; + OperatorTaskSpace fake_op_task_space = OperatorTaskSpace{{}}; TensorShape tensor_shape = TensorShape{ TensorDims{ FFOrdered{ @@ -116,6 +117,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*input_shapes=*/{}, /*weight_shapes=*/{}, /*output_shapes=*/{}, + /*op_task_space=*/fake_op_task_space, }; UnmappedOpCostEstimateKey k2 = UnmappedOpCostEstimateKey{ @@ -128,6 +130,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*input_shapes=*/{}, /*weight_shapes=*/{}, /*output_shapes=*/{}, + /*op_task_space=*/fake_op_task_space, }; AbstractedTensorSetMovement movement1 = AbstractedTensorSetMovement{{ diff --git a/lib/compiler/test/src/compiler/mcmc/generic_mcmc_algorithm.cc b/lib/compiler/test/src/compiler/mcmc/generic_mcmc_algorithm.cc new file mode 100644 index 0000000000..ba6faa93c4 --- /dev/null +++ b/lib/compiler/test/src/compiler/mcmc/generic_mcmc_algorithm.cc @@ -0,0 +1,32 @@ +#include "compiler/mcmc/generic_mcmc_algorithm.h" +#include "doctest/doctest.h" + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("generic_mcmc_algorithm") { + float starting_state = 0.1; + auto generating_func = [](float x, + nonnegative_int i) -> std::optional { + float new_x = x + (randf() - 0.5) / (i.unwrap_nonnegative() + 1); + if (new_x < 0) { + return std::nullopt; + } + if (new_x > 1) { + return std::nullopt; + } + return new_x; + }; + auto scoring_func = [](float x) { return (x - 0.5) * (x - 0.5); }; + GenericMCMCConfig config = GenericMCMCConfig{/*temperature=*/1.0, + /*num_iterations=*/10_n}; + Generic_MCMC_state result = + minimize_score(starting_state, generating_func, scoring_func, config); + float answer = result.get_state(); + float error = result.get_score(); + CHECK(answer > 0.49); + CHECK(answer < 0.51); + CHECK(error >= 0); + CHECK(error < 0.01); + } +} diff --git a/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc b/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc new file mode 100644 index 0000000000..7d74d897e4 --- /dev/null +++ b/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc @@ -0,0 +1,79 @@ +#include "compiler/mcmc/mcmc_over_mapped_pcg.h" +#include "../cost_estimator_for_test.h" +#include "compiler/task_graph_simulator/task_simulator.h" +#include "doctest/doctest.h" +#include "op-attrs/parallel_tensor_dims.h" +#include 
"op-attrs/parallel_tensor_shape.dtg.h" +#include "op-attrs/replica_type.dtg.h" +#include "op-attrs/shard_parallel_dim.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" +#include "pcg/pcg_from_computation_graph.h" +#include "utils/integer_conversions.h" + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("mcmc_graph_optimize") { + ComputationGraph cg = [&] { + ComputationGraphBuilder b; + TensorShape input_tensor_shape = TensorShape{ + TensorDims{ + FFOrdered{32_n, 64_n}, + }, + DataType::FLOAT, + }; + tensor_guid_t t = b.create_input(input_tensor_shape, CreateGrad::YES); + t = b.dense(t, + /*outDim=*/16_n, + /*activation=*/std::nullopt); + t = b.gelu(t); + t = b.dense(t, + /*outDim=*/12_n, + /*activation=*/std::nullopt, + /*use_bias=*/false, + /*data_type=*/DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt); + t = b.relu(t); + t = b.dense(t, + /*outDim=*/8_n, + /*activation=*/Activation::RELU); + return b.computation_graph; + }(); + + ParallelComputationGraph pcg = pcg_from_computation_graph(cg); + + CostEstimator cost_estimator = make_fake_cost_estimator( + [](OpCostEstimateKey const &k) { + return OpCostMetrics{ + /*forward_runtime=*/1.0, + /*backward_runtime=*/2.0, + /*memory=*/1_n, + }; + }, + [](TensorSetMovement const &) { return 1.0; }); + + MachineSpecification full_machine_spec = MachineSpecification{ + /*num_nodes=*/2_n, + /*num_cpus_per_node=*/1_n, + /*num_gpus_per_node=*/1_n, + /*inter_node_bandwidth=*/1, + /*intra_node_bandwidth=*/1, + }; + + MCMCOverMappedPCGConfig search_config = + MCMCOverMappedPCGConfig{/*temperature=*/1.0, + /*num_iterations=*/100_n, + /*substitution_interval=*/5_n, + /*device_type=*/DeviceType::GPU}; + + SearchResult result = mcmc_graph_optimize( + pcg, cost_estimator, full_machine_spec, search_config); + float runtime = task_simulator_estimate_forward_pass_time( + result.pcg, cost_estimator, result.machine_mapping, full_machine_spec); + std::cout << runtime << std::endl; + + CHECK(runtime < 12); + } +} diff --git a/lib/compiler/test/src/graph_optimize_state.cc b/lib/compiler/test/src/compiler/unity_algorithm/graph_optimize_state.cc similarity index 68% rename from lib/compiler/test/src/graph_optimize_state.cc rename to lib/compiler/test/src/compiler/unity_algorithm/graph_optimize_state.cc index 5c00ce1558..3b146be93f 100644 --- a/lib/compiler/test/src/graph_optimize_state.cc +++ b/lib/compiler/test/src/compiler/unity_algorithm/graph_optimize_state.cc @@ -1,4 +1,4 @@ -#include "compiler/graph_optimize_state.h" +#include "compiler/unity_algorithm/graph_optimize_state.h" #include "doctest/doctest.h" #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" @@ -15,24 +15,6 @@ TEST_SUITE(FF_TEST_SUITE) { }, DataType::FLOAT, }; - // ParallelTensorShape input_shape = - // ParallelTensorShape{ParallelTensorDims{ - // FFOrdered{ - // ShardParallelDim{32_n, 2_n}, - // ShardParallelDim{16_n, 1_n}, - // }, - // ReplicaParallelDimSet{ - // SumDegree{1_n}, - // DiscardCopyDegree{1_n}, - // }, - // }, - // DataType::FLOAT}; - - // `machine_mapping` is determined by the PCG and the device mapping - // algorithm, and `runtime` is determined by the PCG and the device mapping, - // so their values here do not matter. 
- std::unordered_map empty_machine_views; - MachineMapping empty_machine_mapping(empty_machine_views); InitializerAttrs zero_init = InitializerAttrs{ZeroInitializerAttrs{}}; @@ -70,13 +52,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph pcg2 = create_pcg(); GraphOptimizeState state1 = GraphOptimizeState{ - GraphOptimizeResult{pcg1, empty_machine_mapping}, - 0, + pcg1, + .0, }; - GraphOptimizeState state2 = GraphOptimizeState{ - GraphOptimizeResult{pcg2, empty_machine_mapping}, - 0, + pcg2, + .0, }; CHECK(state1 == state2); @@ -100,16 +81,30 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph pcg_ = builder_.pcg; GraphOptimizeState state1 = GraphOptimizeState{ - GraphOptimizeResult{pcg1, empty_machine_mapping}, - 0, + pcg1, + .0, }; GraphOptimizeState state_ = GraphOptimizeState{ - GraphOptimizeResult{pcg_, empty_machine_mapping}, - 0, + pcg_, + .0, }; CHECK_FALSE(state1 == state_); } } + + TEST_CASE("GraphOptimizeState::operator<") { + ParallelComputationGraph pcg1 = empty_parallel_computation_graph(); + ParallelComputationGraph pcg2 = empty_parallel_computation_graph(); + GraphOptimizeState state1 = GraphOptimizeState{ + pcg1, + 1.0, + }; + GraphOptimizeState state2 = GraphOptimizeState{ + pcg2, + 2.0, + }; + CHECK(state1 < state2); + } } diff --git a/lib/compiler/test/src/compiler/unity_algorithm/unity_algorithm.cc b/lib/compiler/test/src/compiler/unity_algorithm/unity_algorithm.cc new file mode 100644 index 0000000000..4ca23710e2 --- /dev/null +++ b/lib/compiler/test/src/compiler/unity_algorithm/unity_algorithm.cc @@ -0,0 +1,77 @@ +#include "compiler/unity_algorithm/unity_algorithm.h" +#include "../cost_estimator_for_test.h" +#include "doctest/doctest.h" +#include "op-attrs/parallel_tensor_dims.h" +#include "op-attrs/parallel_tensor_shape.dtg.h" +#include "op-attrs/replica_type.dtg.h" +#include "op-attrs/shard_parallel_dim.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" +#include "pcg/pcg_from_computation_graph.h" +#include "utils/integer_conversions.h" + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("graph_optimize") { + ComputationGraph cg = [&] { + ComputationGraphBuilder b; + TensorShape input_tensor_shape = TensorShape{ + TensorDims{ + FFOrdered{nonnegative_int{32}, + nonnegative_int{64}}, + }, + DataType::FLOAT, + }; + tensor_guid_t t = b.create_input(input_tensor_shape, CreateGrad::YES); + t = b.dense(t, + /*outDim=*/nonnegative_int{16}, + /*activation=*/std::nullopt); + t = b.gelu(t); + t = b.dense(t, + /*outDim=*/nonnegative_int{12}, + /*activation=*/std::nullopt, + /*use_bias=*/false, + /*data_type=*/DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt); + t = b.relu(t); + t = b.dense(t, + /*outDim=*/nonnegative_int{8}, + /*activation=*/Activation::RELU); + return b.computation_graph; + }(); + + ParallelComputationGraph pcg = pcg_from_computation_graph(cg); + + CostEstimator cost_estimator = make_fake_cost_estimator( + [](OpCostEstimateKey const &k) { + return OpCostMetrics{ + /*forward_runtime=*/1.0, + /*backward_runtime=*/2.0, + /*memory=*/nonnegative_int{1}, + }; + }, + [](TensorSetMovement const &) { return 1.0; }); + + MachineSpecification full_machine_spec = MachineSpecification{ + /*num_nodes=*/nonnegative_int{2}, + /*num_cpus_per_node=*/nonnegative_int{1}, + /*num_gpus_per_node=*/nonnegative_int{1}, + /*inter_node_bandwidth=*/1, + /*intra_node_bandwidth=*/1, + }; + + UnitySearchConfig search_config = 
UnitySearchConfig{ + /*alpha=*/1.0, + /*budget=*/0, + /*threshold=*/1000.0, + /*max_num_ops=*/100, + }; + + SearchResult result = + graph_optimize(pcg, cost_estimator, full_machine_spec, search_config); + + // TODO: check the result + } +} diff --git a/lib/compiler/test/src/unity_algorithm.cc b/lib/compiler/test/src/unity_algorithm.cc deleted file mode 100644 index 8ff0978ea5..0000000000 --- a/lib/compiler/test/src/unity_algorithm.cc +++ /dev/null @@ -1,26 +0,0 @@ -#include "compiler/unity_algorithm.h" -#include "doctest/doctest.h" - -TEST_SUITE(FF_TEST_SUITE) { - // Rapidcheck does not work for now - // TEST_CASE("graph_optimize") { - // RC_SUBCASE([](ComputationGraph const &g, - // float alpha, - // int budget, - // float threshold, - // int max_num_ops) { - // Strategy s = graph_optimize( - // g, - // TestCostEstimator{}, - // MachineSpecification{1, 1, 4, 0.1, 0.2}, - // [](Operator const &, MachineSpecification const &) { - // return std::unordered_set{make_1d_machine_view(0, 1, - // 1)}; - // }, - // OptimizerConfig{alpha, budget, threshold, max_num_ops}); - // RC_ASSERT(get_nodes(s.pcg).size() > 0); - // RC_ASSERT(s.machine_mapping.runtime > 0); - // RC_ASSERT(keys(s.machine_mapping.machine_views) == get_nodes(s.pcg)); - // }); - // } -} diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt index 8ccd7c1011..f5d88f102f 100644 --- a/lib/kernels/CMakeLists.txt +++ b/lib/kernels/CMakeLists.txt @@ -7,8 +7,7 @@ file(GLOB_RECURSE SRC CONFIGURE_DEPENDS LIST_DIRECTORIES False src/*.cc - src/cuda/cuda_helper.cu - src/cuda/ops/*.cu + src/cuda/*.cu ) add_library( @@ -30,6 +29,7 @@ target_link_libraries( cudnn nccl utils + pcg ) define_ff_vars(${project_target}) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 39da65c3be..f9bef91b25 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -1,25 +1,88 @@ #ifndef _FLEXFLOW_KERNELS_ACCESSOR_H #define _FLEXFLOW_KERNELS_ACCESSOR_H -#include "array_shape.h" -#include "device.h" +#include "kernels/array_shape.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" -#include "utils/exception.h" +#include "pcg/device_type.dtg.h" +#include "utils/containers/transform.h" #include "utils/required.h" +#include namespace FlexFlow { +nonnegative_int + calculate_accessor_offset(LegionOrdered const &, + ArrayShape const &); + +class GenericTensorAccessorR { +public: + template + typename data_type_enum_to_class
<DT>::type const *get() const {
+    ASSERT(this->data_type == DT, "Invalid datatype requested");
+
+    return static_cast<real_type_t<DT> const *>(this->ptr);
+  }
+
+  int32_t const *get_int32_ptr() const;
+  int64_t const *get_int64_ptr() const;
+  float const *get_float_ptr() const;
+  double const *get_double_ptr() const;
+  half const *get_half_ptr() const;
+
+  GenericTensorAccessorR() = delete;
+
+  GenericTensorAccessorR(DataType data_type,
+                         ArrayShape const &shape,
+                         void const *ptr,
+                         DeviceType device_type);
+
+  bool operator==(GenericTensorAccessorR const &) const;
+  bool operator!=(GenericTensorAccessorR const &) const;
+
+  template <DataType DT>
+  real_type_t<DT> const &at(FFOrdered<nonnegative_int> const &indices) const {
+    return this->at<DT>(legion_ordered_from_ff_ordered(indices));
+  }
+
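+  // Note (added for exposition, not part of the original patch): assuming
+  // the usual row-major linearization, calculate_accessor_offset flattens a
+  // coordinate against the ArrayShape, e.g. for shape (2, 3) the coordinate
+  // (1, 2) maps to offset 1 * 3 + 2 = 5. The FFOrdered overload above only
+  // reorders the coordinate; the LegionOrdered overload below does the
+  // actual lookup.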
+  template <DataType DT>
+  real_type_t<DT> const &
+      at(LegionOrdered<nonnegative_int> const &indices) const {
+    ASSERT(this->device_type == DeviceType::CPU,
+           "GenericTensorAccessorR::at() requires CPU-allocated tensor");
+    ASSERT(this->data_type == DT, "Invalid datatype requested");
+
+    using T = real_type_t<DT>;
+    T const *data_ptr = static_cast<T const *>(this->ptr);
+    nonnegative_int offset = calculate_accessor_offset(indices, this->shape);
+    return data_ptr[offset.unwrap_nonnegative()];
+  }
+
+public:
+  DataType data_type;
+  ArrayShape shape;
+  void const *ptr;
+  DeviceType device_type;
+
+private:
+  std::tuple<DataType const &,
+             ArrayShape const &,
+             void const *const &,
+             DeviceType const &>
+      tie() const;
+};
+
+std::string format_as(GenericTensorAccessorR const &);
+std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &);
+
 class GenericTensorAccessorW {
 public:
   template <DataType DT>
   typename data_type_enum_to_class<DT>
::type *get() const {
-    if (this->data_type == DT) {
-      return static_cast<real_type_t<DT> *>(this->ptr);
-    } else {
-      throw mk_runtime_error(fmt::format(
-          "Invalid access data type ({} != {})", this->data_type, DT));
-    }
+    ASSERT(this->data_type == DT, "Invalid datatype requested");
+
+    return static_cast<real_type_t<DT> *>(this->ptr);
   }
 
   int32_t *get_int32_ptr() const;
@@ -28,76 +91,76 @@ class GenericTensorAccessorW {
   double *get_double_ptr() const;
   half *get_half_ptr() const;
 
-public:
-  DataType data_type;
-  ArrayShape shape;
-  req<void *> ptr;
-};
-FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorW,
-                                             data_type,
-                                             shape,
-                                             ptr);
+  GenericTensorAccessorW() = delete;
 
-std::string format_as(GenericTensorAccessorW const &);
-std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &);
+  GenericTensorAccessorW(DataType data_type,
+                         ArrayShape const &shape,
+                         void *ptr,
+                         DeviceType device_type);
+
+  bool operator==(GenericTensorAccessorW const &) const;
+  bool operator!=(GenericTensorAccessorW const &) const;
+
+  operator GenericTensorAccessorR() const;
 
-class GenericTensorAccessorR {
-public:
   template <DataType DT>
-  typename data_type_enum_to_class<DT>::type const *get() const {
-    if (this->data_type == DT) {
-      return static_cast<real_type_t<DT> const *>(this->ptr);
-    } else {
-      throw mk_runtime_error(fmt::format(
-          "Invalid access data type ({} != {})", this->data_type, DT));
-    }
+  real_type_t<DT> &at(FFOrdered<nonnegative_int> const &indices) {
+    return this->at<DT>(legion_ordered_from_ff_ordered(indices));
   }
 
-  int32_t const *get_int32_ptr() const;
-  int64_t const *get_int64_ptr() const;
-  float const *get_float_ptr() const;
-  double const *get_double_ptr() const;
-  half const *get_half_ptr() const;
+  template <DataType DT>
+  real_type_t<DT> &at(LegionOrdered<nonnegative_int> const &indices) {
+    ASSERT(this->device_type == DeviceType::CPU,
+           "GenericTensorAccessorW::at() requires CPU-allocated tensor");
+    ASSERT(this->data_type == DT, "Invalid datatype requested");
+
+    using T = real_type_t<DT>;
+    T *data_ptr = static_cast<T *>(this->ptr);
+    nonnegative_int offset = calculate_accessor_offset(indices, this->shape);
+    return data_ptr[offset.unwrap_nonnegative()];
+  }
+
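+  // Usage sketch (added for exposition, not part of the original patch;
+  // `shape` and `cpu_allocator` are hypothetical):
+  //
+  //   GenericTensorAccessorW acc = cpu_allocator.allocate_tensor(shape);
+  //   acc.at<DataType::FLOAT>(FFOrdered<nonnegative_int>{0_n, 2_n}) = 1.5f;
+  //
+  // Both ASSERTs above guard this path: element access is only defined for
+  // CPU-resident tensors and only for the accessor's actual data type.
+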
+  template <DataType DT>
+  real_type_t<DT> const &at(FFOrdered<nonnegative_int> const &indices) const {
+    return this->at<DT>(legion_ordered_from_ff_ordered(indices));
+  }
+
+  template <DataType DT>
+  real_type_t<DT> const &
+      at(LegionOrdered<nonnegative_int> const &indices) const {
+    ASSERT(this->device_type == DeviceType::CPU,
+           "GenericTensorAccessorW::at() requires CPU-allocated tensor");
+    ASSERT(this->data_type == DT, "Invalid datatype requested");
+
+    using T = real_type_t<DT>;
+    T const *data_ptr = static_cast<T const *>(this->ptr);
+    nonnegative_int offset = calculate_accessor_offset(indices, this->shape);
+    return data_ptr[offset.unwrap_nonnegative()];
+  }
 
 public:
   DataType data_type;
   ArrayShape shape;
-  req<void const *> ptr;
+  void *ptr;
+  DeviceType device_type;
+
+private:
+  std::tuple<DataType const &,
+             ArrayShape const &,
+             void *const &,
+             DeviceType const &>
+      tie() const;
 };
-FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorR,
-                                             data_type,
-                                             shape,
-                                             ptr);
-
-std::string format_as(GenericTensorAccessorR const &);
-std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &);
-int32_t *get_int32_ptr(GenericTensorAccessorW const &);
-int64_t *get_int64_ptr(GenericTensorAccessorW const &);
-float *get_float_ptr(GenericTensorAccessorW const &);
-double *get_double_ptr(GenericTensorAccessorW const &);
-half *get_half_ptr(GenericTensorAccessorW const &);
-std::vector<int32_t *>
-    get_int32_ptrs(std::vector<GenericTensorAccessorW> const &);
-std::vector<int64_t *>
-    get_int64_ptrs(std::vector<GenericTensorAccessorW> const &);
-std::vector<float *>
-    get_float_ptrs(std::vector<GenericTensorAccessorW> const &);
-std::vector<double *>
-    get_double_ptrs(std::vector<GenericTensorAccessorW> const &);
-std::vector<half *> get_half_ptrs(std::vector<GenericTensorAccessorW> const &);
+std::string format_as(GenericTensorAccessorW const &);
+std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &);
 
 static_assert(is_fmtable<req<DataType> const &>::value, "");
 
 template <DataType DT>
 typename data_type_enum_to_class<DT>
::type * get(GenericTensorAccessorW const &a) { - if (a.data_type == DT) { - return static_cast *>(a.ptr); - } else { - throw mk_runtime_error( - fmt::format("Invalid access data type ({} != {})", a.data_type, DT)); - } + ASSERT(a.data_type == DT, "Invalid datatype requested"); + return static_cast *>(a.ptr); } template @@ -113,12 +176,8 @@ std::vector *> template typename data_type_enum_to_class
::type const * get(GenericTensorAccessorR const &a) { - if (a.data_type == DT) { - return static_cast const *>(a.ptr); - } else { - throw mk_runtime_error( - fmt::format("Invalid access data type ({} != {})", a.data_type, DT)); - } + ASSERT(a.data_type == DT, "Invalid datatype requested"); + return static_cast const *>(a.ptr); } int32_t const *get_int32_ptr(GenericTensorAccessorR const &); @@ -137,6 +196,21 @@ std::vector std::vector get_half_ptrs(std::vector const &); +int32_t *get_int32_ptr(GenericTensorAccessorW const &); +int64_t *get_int64_ptr(GenericTensorAccessorW const &); +float *get_float_ptr(GenericTensorAccessorW const &); +double *get_double_ptr(GenericTensorAccessorW const &); +half *get_half_ptr(GenericTensorAccessorW const &); +std::vector + get_int32_ptrs(std::vector const &); +std::vector + get_int64_ptrs(std::vector const &); +std::vector + get_float_ptrs(std::vector const &); +std::vector + get_double_ptrs(std::vector const &); +std::vector get_half_ptrs(std::vector const &); + template std::vector const *> get(std::vector const &accs) { @@ -150,12 +224,8 @@ std::vector const *> GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &write_accessor); -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2); - -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype); +bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2); bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, ArrayShape const &expected_shape, @@ -163,8 +233,9 @@ bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, std::pair get_shape_and_datatype(GenericTensorAccessorR const &accessor); -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor); + +void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h index 6500899394..39bad6599c 100644 --- a/lib/kernels/include/kernels/allocation.h +++ b/lib/kernels/include/kernels/allocation.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_ALLOCATION_H #define _FLEXFLOW_KERNELS_ALLOCATION_H -#include "accessor.h" +#include "kernels/accessor.h" #include #include @@ -11,6 +11,8 @@ struct IAllocator { virtual void *allocate(size_t) = 0; virtual void deallocate(void *) = 0; + virtual DeviceType get_allocation_device_type() const = 0; + virtual ~IAllocator() = default; }; @@ -18,9 +20,14 @@ struct Allocator { Allocator() = delete; GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape); + void deallocate_tensor(GenericTensorAccessorW const &); + void deallocate_tensor(GenericTensorAccessorR const &); + void *allocate(size_t mem_size); void deallocate(void *ptr); + DeviceType get_allocation_device_type() const; + template static typename std::enable_if::value, Allocator>::type diff --git a/lib/kernels/include/kernels/array_coord.struct.toml b/lib/kernels/include/kernels/array_coord.struct.toml new file mode 100644 index 0000000000..8ce121f2bf --- /dev/null +++ b/lib/kernels/include/kernels/array_coord.struct.toml @@ -0,0 +1,19 @@ +namespace = "FlexFlow" +name = "ArrayCoord" +features = [ + "eq", + "ord", + "hash", + "fmt", + "rapidcheck", + "json", +] + +includes = [ + "op-attrs/ff_ordered/ff_ordered.h", + 
"utils/nonnegative_int/nonnegative_int.h" +] + +[[fields]] +name = "ff_ordered" +type = "::FlexFlow::FFOrdered<::FlexFlow::nonnegative_int>" diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index 57498ee466..25ef8116f2 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_ARRAY_SHAPE_H #define _FLEXFLOW_KERNELS_ARRAY_SHAPE_H +#include "kernels/array_coord.dtg.h" #include "kernels/legion_dim.h" #include "op-attrs/tensor_shape.dtg.h" #include "utils/nonnegative_int/nonnegative_int.h" @@ -15,9 +16,7 @@ namespace FlexFlow { struct ArrayShape { public: ArrayShape() = delete; - ArrayShape(nonnegative_int *dims, nonnegative_int num_dims); - ArrayShape(TensorShape const &shape); - ArrayShape(std::vector const &); + explicit ArrayShape(LegionOrdered const &dims); /** * @brief Alias of ArrayShape::num_elements for compatibility with @@ -46,24 +45,40 @@ struct ArrayShape { std::optional at_maybe(legion_dim_t) const; std::optional at_maybe(ff_dim_t) const; - ArrayShape - sub_shape(std::optional> start, - std::optional> end) const; + ArrayShape sub_shape(ff_dim_t const &start, + std::optional const &end) const; + + ArrayShape sub_shape(legion_dim_t const &start, + std::optional const &end) const; public: LegionOrdered dims; private: std::tuple tie() const; + + friend ::std::hash; }; +std::string format_as(ArrayShape const &); +std::ostream &operator<<(std::ostream &, ArrayShape const &); + nonnegative_int get_volume(ArrayShape const &); +ArrayShape array_shape_from_tensor_shape(TensorShape const &); TensorShape get_tensor_shape(ArrayShape const &, DataType); -std::string format_as(ArrayShape const &); -std::ostream &operator<<(std::ostream &, ArrayShape const &); +std::unordered_set get_array_coord_set(ArrayShape const &); } // namespace FlexFlow +namespace std { + +template <> +struct hash<::FlexFlow::ArrayShape> { + size_t operator()(::FlexFlow::ArrayShape const &) const; +}; + +} // namespace std + #endif diff --git a/lib/kernels/include/kernels/attention_kernels.h b/lib/kernels/include/kernels/attention_kernels.h index eb5a1b8198..b3c77d3430 100644 --- a/lib/kernels/include/kernels/attention_kernels.h +++ b/lib/kernels/include/kernels/attention_kernels.h @@ -1,7 +1,6 @@ #ifndef _FLEXFLOW_OPS_KERNELS_ATTENTION_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_ATTENTION_KERNELS_H -#include "device.h" #include "kernels/allocation.h" #include "kernels/device.h" #include "kernels/ff_handle.h" @@ -64,8 +63,7 @@ FF_VISITABLE_STRUCT_NO_EQ(MHAPerDeviceState, std::string format_as(MHAPerDeviceState const &x); std::ostream &operator<<(std::ostream &s, MHAPerDeviceState const &x); -namespace Kernels { -namespace MultiHeadAttention { +namespace Kernels::MultiHeadAttention { MHAPerDeviceState init_kernel(PerDeviceFFHandle const &, Allocator &, @@ -105,8 +103,7 @@ void backward_kernel(ffStream_t stream, void cleanup_kernel(Allocator &allocator, MHAPerDeviceState const &device_state); -} // namespace MultiHeadAttention -} // namespace Kernels +} // namespace Kernels::MultiHeadAttention } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/batch_matmul_kernels.h b/lib/kernels/include/kernels/batch_matmul_kernels.h index bfd72647b0..8b67f564d2 100644 --- a/lib/kernels/include/kernels/batch_matmul_kernels.h +++ b/lib/kernels/include/kernels/batch_matmul_kernels.h @@ -1,13 +1,11 @@ #ifndef _FLEXFLOW_OPS_KERNELS_BATCH_MATMUL_KERNELS_H #define 
_FLEXFLOW_OPS_KERNELS_BATCH_MATMUL_KERNELS_H -#include "device.h" #include "kernels/allocation.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" -namespace FlexFlow { -namespace Kernels { -namespace BatchMatmul { +namespace FlexFlow::Kernels::BatchMatmul { void forward_kernel(ffStream_t stream, PerDeviceFFHandle const &handle, @@ -35,8 +33,6 @@ void backward_kernel(ffStream_t stream, int k, int batch); -} // namespace BatchMatmul -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::BatchMatmul #endif diff --git a/lib/kernels/include/kernels/batch_norm_kernels.h b/lib/kernels/include/kernels/batch_norm_kernels.h index f2ca17f429..9bb2753a12 100644 --- a/lib/kernels/include/kernels/batch_norm_kernels.h +++ b/lib/kernels/include/kernels/batch_norm_kernels.h @@ -1,15 +1,13 @@ #ifndef _FLEXFLOW_KERNELS_BATCH_NORM_KERNELS_H #define _FLEXFLOW_KERNELS_BATCH_NORM_KERNELS_H -#include "device.h" #include "kernels/allocation.h" #include "kernels/batch_norm_per_device_state.dtg.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include -namespace FlexFlow { -namespace Kernels { -namespace BatchNorm { +namespace FlexFlow::Kernels::BatchNorm { BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, Allocator allocator, @@ -29,9 +27,9 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, BatchNormPerDeviceState const &per_device_state, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, float *input_grad_ptr, float const *scale_ptr, float *scale_grad_ptr, @@ -46,8 +44,5 @@ void cleanup_kernel(Allocator allocator, bool relu, float *runningMean); -} // namespace BatchNorm -} // namespace Kernels -} // namespace FlexFlow - +} // namespace FlexFlow::Kernels::BatchNorm #endif diff --git a/lib/kernels/include/kernels/cast_kernels.h b/lib/kernels/include/kernels/cast_kernels.h index 96f9aadd52..5ec4cb3975 100644 --- a/lib/kernels/include/kernels/cast_kernels.h +++ b/lib/kernels/include/kernels/cast_kernels.h @@ -1,29 +1,19 @@ #ifndef _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_H -#include "device.h" #include "kernels/accessor.h" -#include "kernels/ff_handle.h" -#include "op-attrs/activation.dtg.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Cast { +namespace FlexFlow::Kernels::Cast { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); -} // namespace Cast -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Cast #endif diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h new file mode 100644 index 0000000000..343ba253d9 --- /dev/null +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Cast { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const 
&output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); + +} // namespace FlexFlow::Kernels::Cast + +#endif diff --git a/lib/kernels/include/kernels/combine_kernels.h b/lib/kernels/include/kernels/combine_kernels.h index eb263e0734..c87465a01f 100644 --- a/lib/kernels/include/kernels/combine_kernels.h +++ b/lib/kernels/include/kernels/combine_kernels.h @@ -1,12 +1,10 @@ #ifndef _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Combine { +namespace FlexFlow::Kernels::Combine { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, @@ -16,8 +14,6 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad); -} // namespace Combine -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Combine #endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H diff --git a/lib/kernels/include/kernels/combine_kernels_cpu.h b/lib/kernels/include/kernels/combine_kernels_cpu.h new file mode 100644 index 0000000000..75fdd56498 --- /dev/null +++ b/lib/kernels/include/kernels/combine_kernels_cpu.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Combine { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); + +} // namespace FlexFlow::Kernels::Combine + +#endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/concat_kernels.h b/lib/kernels/include/kernels/concat_kernels.h index a44affc1f2..1e3c55bf59 100644 --- a/lib/kernels/include/kernels/concat_kernels.h +++ b/lib/kernels/include/kernels/concat_kernels.h @@ -1,12 +1,10 @@ #ifndef _FLEXFLOW_OPS_KERNELS_CONCAT_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_CONCAT_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Concat { +namespace FlexFlow::Kernels::Concat { void forward_kernel(ffStream_t stream, GenericTensorAccessorW const &output, @@ -18,8 +16,6 @@ void backward_kernel(ffStream_t stream, std::vector const &input_grads, ff_dim_t axis); -} // namespace Concat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Concat #endif diff --git a/lib/kernels/include/kernels/conv_2d_kernels.h b/lib/kernels/include/kernels/conv_2d_kernels.h index cfc64f963d..3b7c0672df 100644 --- a/lib/kernels/include/kernels/conv_2d_kernels.h +++ b/lib/kernels/include/kernels/conv_2d_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include "op-attrs/activation.dtg.h" #include "utils/visitable.h" @@ -34,8 +34,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Conv2DPerDeviceState, bwdFilterAlgo, bwdDataAlgo); -namespace Kernels { -namespace Conv2D { +namespace Kernels::Conv2D { Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, std::optional activation, @@ -61,17 +60,16 @@ void 
forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *filter_ptr, float *filter_grad_ptr, float *bias_grad_ptr, std::optional activation); -} // namespace Conv2D -} // namespace Kernels +} // namespace Kernels::Conv2D } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H diff --git a/lib/kernels/include/kernels/copy_tensor_accessor.h b/lib/kernels/include/kernels/copy_tensor_accessor.h new file mode 100644 index 0000000000..81fd59dafb --- /dev/null +++ b/lib/kernels/include/kernels/copy_tensor_accessor.h @@ -0,0 +1,27 @@ +#ifndef _FLEXFLOW_KERNELS_COPY_TENSOR_ACCESSOR_H +#define _FLEXFLOW_KERNELS_COPY_TENSOR_ACCESSOR_H + +#include "kernels/accessor.h" +#include "kernels/allocation.h" + +namespace FlexFlow { + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator); + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator); + +GenericTensorAccessorR + copy_tensor_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &, + Allocator &cpu_allocator); + +GenericTensorAccessorW + copy_tensor_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &, + Allocator &cpu_allocator); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/datatype_dispatch.h b/lib/kernels/include/kernels/datatype_dispatch.h index e83fc3325d..50ca66a820 100644 --- a/lib/kernels/include/kernels/datatype_dispatch.h +++ b/lib/kernels/include/kernels/datatype_dispatch.h @@ -1,7 +1,8 @@ #ifndef _FLEXFLOW_KERNELS_DATATYPE_DISPATCH_H #define _FLEXFLOW_KERNELS_DATATYPE_DISPATCH_H -#include "accessor.h" +#include "op-attrs/datatype.h" +#include "utils/exception.h" namespace FlexFlow { @@ -33,7 +34,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(Args... args) const { + Out operator()(Args &&...args) const { return F
{}(std::forward(args)...); } }; @@ -41,7 +42,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(DataType data_type, Args... args) { + Out operator()(DataType data_type, Args &&...args) { return dispatch(data_type, std::forward(args)...); } }; @@ -54,13 +55,13 @@ struct DataTypeDispatch2 { template struct OutputType { template - void operator()(Args... args) const { + void operator()(Args &&...args) const { F{}(std::forward(args)...); } }; template - void operator()(DataType output_type, Args... args) const { + void operator()(DataType output_type, Args &&...args) const { dispatch(output_type, std::forward(args)...); } }; @@ -68,7 +69,7 @@ struct DataTypeDispatch2 { template void operator()(DataType input_data_type, DataType output_data_type, - Args... args) { + Args &&...args) { dispatch( input_data_type, output_data_type, std::forward(args)...); } diff --git a/lib/kernels/include/kernels/dropout_kernels.h b/lib/kernels/include/kernels/dropout_kernels.h index c0e503be5b..2cc6dd60a3 100644 --- a/lib/kernels/include/kernels/dropout_kernels.h +++ b/lib/kernels/include/kernels/dropout_kernels.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H -#include "device.h" #include "kernels/allocation.h" #include "kernels/array_shape.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include @@ -31,8 +31,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(DropoutPerDeviceState, reserveSpaceSize, dropoutStateSize); -namespace Kernels { -namespace Dropout { +namespace Kernels::Dropout { DropoutPerDeviceState init_kernel(PerDeviceFFHandle handle, float rate, @@ -56,8 +55,7 @@ void cleanup_kernel(Allocator allocator, ffDropoutDescriptor_t dropoutDesc, void *dropoutStates); -} // namespace Dropout -} // namespace Kernels +} // namespace Kernels::Dropout } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H diff --git a/lib/kernels/include/kernels/element_binary_kernels.h b/lib/kernels/include/kernels/element_binary_kernels.h index 41447e98e6..fd596f2ccf 100644 --- a/lib/kernels/include/kernels/element_binary_kernels.h +++ b/lib/kernels/include/kernels/element_binary_kernels.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_OPS_KERNELS_ELEMENT_BINARY_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_ELEMENT_BINARY_KERNELS_H -#include "device.h" #include "ff_handle.h" #include "kernels/array_shape.h" +#include "kernels/device.h" #include "op-attrs/datatype.h" #include "op-attrs/operator_type.h" @@ -26,8 +26,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(ElementBinaryPerDeviceState, opDesc, reduceAddDesc); -namespace Kernels { -namespace ElementBinary { +namespace Kernels::ElementBinary { ElementBinaryPerDeviceState init_kernel(PerDeviceFFHandle handle, OperatorType op_type, @@ -58,8 +57,7 @@ void backward_kernel(ffStream_t stream, bool broadcast_inputRHS, PerDeviceFFHandle handle); -} // namespace ElementBinary -} // namespace Kernels +} // namespace Kernels::ElementBinary } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/element_unary_kernels.h b/lib/kernels/include/kernels/element_unary_kernels.h index 8c6864b2d9..0257b3b4a6 100644 --- a/lib/kernels/include/kernels/element_unary_kernels.h +++ b/lib/kernels/include/kernels/element_unary_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_ELEMENT_UNARY_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_ELEMENT_UNARY_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" 
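Note on the `DataTypeDispatch1`/`DataTypeDispatch2` hunk above: the argument packs change from by-value `Args... args` to forwarding references `Args &&...args`, matching the existing `std::forward<Args>(args)...` calls. A minimal self-contained sketch of the same runtime-to-compile-time dispatch pattern follows; the `DataType` enum and `PrintName` functor here are illustrative stand-ins, not the real FlexFlow types:

```cpp
#include <iostream>
#include <stdexcept>
#include <utility>

enum class DataType { FLOAT, INT32 };

// Map a runtime DataType onto a compile-time template parameter and
// perfectly forward the remaining arguments, as DataTypeDispatch1 does.
template <template <DataType> class F, typename... Args>
auto dispatch(DataType dt, Args &&...args) {
  switch (dt) {
    case DataType::FLOAT:
      return F<DataType::FLOAT>{}(std::forward<Args>(args)...);
    case DataType::INT32:
      return F<DataType::INT32>{}(std::forward<Args>(args)...);
  }
  throw std::runtime_error("unhandled DataType");
}

template <DataType DT>
struct PrintName {
  void operator()(char const *prefix) const {
    std::cout << prefix << (DT == DataType::FLOAT ? "float" : "int32") << '\n';
  }
};

int main() {
  dispatch<PrintName>(DataType::FLOAT, "dispatched as: ");
  dispatch<PrintName>(DataType::INT32, "dispatched as: ");
}
```

Forwarding references avoid a copy per dispatched argument: with the old by-value packs, passing an accessor through a two-level dispatch copied it at every level.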
#include "op-attrs/ops/element_unary.h" #include @@ -19,8 +19,7 @@ FF_VISITABLE_STRUCT_NO_EQ(ElementUnaryPerDeviceState, outputTensor, actiDesc); -namespace Kernels { -namespace ElementUnary { +namespace Kernels::ElementUnary { ElementUnaryPerDeviceState init_kernel(ArrayShape const &input_shape, ArrayShape const &output_shape, @@ -37,13 +36,12 @@ void backward_kernel(ffStream_t stream, ElementUnaryPerDeviceState const &device_state, ElementUnaryAttrs const &attrs, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad); -} // namespace ElementUnary -} // namespace Kernels +} // namespace Kernels::ElementUnary } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/embedding_kernels.h b/lib/kernels/include/kernels/embedding_kernels.h index 06582ca1d5..f51a730314 100644 --- a/lib/kernels/include/kernels/embedding_kernels.h +++ b/lib/kernels/include/kernels/embedding_kernels.h @@ -1,13 +1,11 @@ #ifndef _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "op-attrs/ops/embedding.h" -namespace FlexFlow { -namespace Kernels { -namespace Embedding { +namespace FlexFlow::Kernels::Embedding { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, @@ -19,11 +17,11 @@ void forward_kernel(ffStream_t stream, int out_dim, int batch_size); void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, - DataType input_data_type, DataType output_data_type, + DataType input_data_type, std::optional aggr, int in_dim, int out_dim, @@ -35,8 +33,6 @@ void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p); template __global__ void rand_generate_int(TD *ptr, size_t size, TD p); -} // namespace Embedding -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Embedding #endif // _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H diff --git a/lib/kernels/include/kernels/ff_handle.h b/lib/kernels/include/kernels/ff_handle.h index 179ce41cbf..31b3296a98 100644 --- a/lib/kernels/include/kernels/ff_handle.h +++ b/lib/kernels/include/kernels/ff_handle.h @@ -5,7 +5,7 @@ #include #endif -#include "device.h" +#include "kernels/device.h" #include "utils/visitable.h" namespace FlexFlow { diff --git a/lib/kernels/include/kernels/flat_kernels.h b/lib/kernels/include/kernels/flat_kernels.h index 3e600c48de..b2b1164f92 100644 --- a/lib/kernels/include/kernels/flat_kernels.h +++ b/lib/kernels/include/kernels/flat_kernels.h @@ -1,23 +1,20 @@ #ifndef _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Flat { +namespace FlexFlow::Kernels::Flat { void forward_kernel(ffStream_t stream, GenericTensorAccessorR input, float *output_ptr); + void backward_kernel(ffStream_t stream, GenericTensorAccessorR input, - float *input_grad_ptr, - float const *output_grad_ptr); + float const *output_grad_ptr, + float *input_grad_ptr); -} // namespace 
Flat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Flat #endif // _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H diff --git a/lib/kernels/include/kernels/format_accessor_contents.h b/lib/kernels/include/kernels/format_accessor_contents.h new file mode 100644 index 0000000000..b50cffbbef --- /dev/null +++ b/lib/kernels/include/kernels/format_accessor_contents.h @@ -0,0 +1,13 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FORMAT_ACCESSOR_CONTENTS_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FORMAT_ACCESSOR_CONTENTS_H + +#include "kernels/accessor.h" + +namespace FlexFlow { + +std::string format_accessor_r_contents(GenericTensorAccessorR const &); +std::string format_accessor_w_contents(GenericTensorAccessorW const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/gather_kernels.h b/lib/kernels/include/kernels/gather_kernels.h index 13bf4b898a..8cbc7e457e 100644 --- a/lib/kernels/include/kernels/gather_kernels.h +++ b/lib/kernels/include/kernels/gather_kernels.h @@ -15,23 +15,21 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GatherPerDeviceState, handle, legion_dim); -namespace Kernels { -namespace Gather { +namespace Kernels::Gather { void forward_kernel(ffStream_t stream, - GatherPerDeviceState const &m, + GatherPerDeviceState const &per_device_state, GenericTensorAccessorR const &input, GenericTensorAccessorR const &index, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GatherPerDeviceState const &m, + GatherPerDeviceState const &per_device_state, GenericTensorAccessorR const &output_grad, GenericTensorAccessorR const &index, GenericTensorAccessorW const &input_grad); -} // namespace Gather -} // namespace Kernels +} // namespace Kernels::Gather } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/layer_norm_kernels.h b/lib/kernels/include/kernels/layer_norm_kernels.h index be13d32879..10cf2fb14b 100644 --- a/lib/kernels/include/kernels/layer_norm_kernels.h +++ b/lib/kernels/include/kernels/layer_norm_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H -#include "device.h" #include "kernels/allocation.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" namespace FlexFlow { @@ -30,8 +30,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LayerNormPerDeviceState, bias, data_type); -namespace Kernels { -namespace LayerNorm { +namespace Kernels::LayerNorm { // todo: this may have some problem. 
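A recurring mechanical change in this PR is collapsing the old three-level `namespace FlexFlow { namespace Kernels { namespace X {` nesting into C++17 nested namespace definitions. The two spellings declare exactly the same entities; a quick illustration with a hypothetical `Demo` kernel namespace:

```cpp
// Before: pre-C++17 nesting, as these headers were originally written.
namespace FlexFlow {
namespace Kernels {
namespace Demo {
void forward_kernel();
} // namespace Demo
} // namespace Kernels
} // namespace FlexFlow

// After: C++17 nested namespace definition, as used throughout this PR.
// Declares the same entity: FlexFlow::Kernels::Demo::forward_kernel.
namespace FlexFlow::Kernels::Demo {
void forward_kernel();
} // namespace FlexFlow::Kernels::Demo
```

Headers that also declare per-device state structs directly in `FlexFlow` keep the outer namespace open and collapse only the inner two (e.g. `namespace Kernels::LayerNorm`), which is why both forms appear in this diff.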
LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &handle, @@ -57,8 +56,7 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorW const &gamma_grad, GenericTensorAccessorW const &beta_grad); -} // namespace LayerNorm -} // namespace Kernels +} // namespace Kernels::LayerNorm } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H diff --git a/lib/kernels/include/kernels/legion_dim.h b/lib/kernels/include/kernels/legion_dim.h index 7b9b9c455c..947bbd00bb 100644 --- a/lib/kernels/include/kernels/legion_dim.h +++ b/lib/kernels/include/kernels/legion_dim.h @@ -2,7 +2,13 @@ #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LEGION_DIM_H #include "kernels/legion_dim_t.dtg.h" -#include "op-attrs/dim_ordered/dim_ordered.h" +#include "kernels/legion_ordered/legion_ordered.h" +#include "op-attrs/ff_dim_t.dtg.h" +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "utils/containers/set_of.h" +#include "utils/containers/transform.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { @@ -11,7 +17,10 @@ legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value); legion_dim_t legion_dim_from_ff_dim(ff_dim_t, nonnegative_int num_dimensions); template -using LegionOrdered = DimOrdered; +std::set key_range(LegionOrdered const &d) { + return transform(set_of(nonnegative_range(num_elements(d))), + [](nonnegative_int i) { return legion_dim_t{i}; }); +} template FFOrdered @@ -25,17 +34,6 @@ LegionOrdered return LegionOrdered(ff_ordered.rbegin(), ff_ordered.rend()); } -template -std::string format_as(LegionOrdered const &v) { - std::vector as_vec(v.cbegin(), v.cend()); - return fmt::format("", as_vec); -} - -template -std::ostream &operator<<(std::ostream &s, LegionOrdered const &v) { - return (s << fmt::to_string(v)); -} - } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/legion_ordered/legion_ordered.h b/lib/kernels/include/kernels/legion_ordered/legion_ordered.h new file mode 100644 index 0000000000..ad8b3bad6d --- /dev/null +++ b/lib/kernels/include/kernels/legion_ordered/legion_ordered.h @@ -0,0 +1,197 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_LEGION_ORDERED_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_LEGION_ORDERED_H + +#include "kernels/legion_dim_t.dtg.h" +#include "utils/fmt/vector.h" +#include "utils/stack_vector/stack_vector.h" + +namespace FlexFlow { + +template +struct LegionOrdered { + LegionOrdered() {} + + LegionOrdered(std::initializer_list const &l) + : contents(l.begin(), l.end()) {} + + LegionOrdered(std::vector const &contents) + : contents(contents.begin(), contents.end()) {} + + template + LegionOrdered(It begin, It end) : contents(begin, end) {} + + template + LegionOrdered(stack_vector const &contents) + : contents(contents.begin(), contents.end()) {} + + T const &at(legion_dim_t idx) const { + int raw = idx.value.unwrap_nonnegative(); + return this->contents.at(raw); + } + + T &at(legion_dim_t idx) { + int raw = idx.value.unwrap_nonnegative(); + return this->contents.at(raw); + } + + T const &operator[](legion_dim_t idx) const { + return this->at(idx); + } + + T &operator[](legion_dim_t idx) { + return this->at(idx); + } + + bool idx_is_valid(legion_dim_t const &idx) const { + int raw = idx.value.unwrap_nonnegative(); + return raw < this->contents.size(); + } + + bool operator==(LegionOrdered const &other) const { + return this->contents == other.contents; + } + + bool operator!=(LegionOrdered const &other) 
const { + return this->contents != other.contents; + } + + using iterator = typename stack_vector::iterator; + using const_iterator = + typename stack_vector::const_iterator; + using reverse_iterator = + typename stack_vector::reverse_iterator; + using const_reverse_iterator = + typename stack_vector::const_reverse_iterator; + using value_type = T; + using pointer = value_type *; + using const_pointer = value_type const *; + using reference = value_type &; + using const_reference = value_type const &; + + iterator begin() { + return this->contents.begin(); + } + + const_iterator begin() const { + return this->cbegin(); + } + + const_iterator cbegin() const { + return this->contents.cbegin(); + } + + iterator end() { + return this->contents.end(); + } + + const_iterator end() const { + return this->cend(); + } + + const_iterator cend() const { + return this->contents.cend(); + } + + reverse_iterator rbegin() { + return this->contents.rbegin(); + } + + const_reverse_iterator rbegin() const { + return this->crbegin(); + } + + const_reverse_iterator crbegin() const { + return this->contents.crbegin(); + } + + reverse_iterator rend() { + return this->contents.rend(); + } + + const_reverse_iterator rend() const { + return this->crend(); + } + + const_reverse_iterator crend() const { + return this->contents.crend(); + } + + size_t size() const { + return this->contents.size(); + } + + size_t empty() const { + return this->contents.empty(); + } + + size_t num_dims() const { + return this->size(); + } + + friend struct ::std::hash; + +private: + stack_vector contents; +}; + +template +auto operator<(LegionOrdered const &lhs, LegionOrdered const &rhs) + -> std::enable_if_t, bool> { + return std::lexicographical_compare( + lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend()); +} + +template +std::string format_as(LegionOrdered const &v) { + std::vector as_vec(v.cbegin(), v.cend()); + return fmt::format("", as_vec); +} + +template +std::ostream &operator<<(std::ostream &s, LegionOrdered const &v) { + return (s << fmt::to_string(v)); +} + +} // namespace FlexFlow + +namespace nlohmann { +template +struct adl_serializer<::FlexFlow::LegionOrdered> { + static ::FlexFlow::LegionOrdered from_json(nlohmann::json const &j) { + return {j.template get>()}; + } + + static void to_json(nlohmann::json &j, + ::FlexFlow::LegionOrdered const &x) { + j = std::vector{x.cbegin(), x.cend()}; + } +}; +} // namespace nlohmann + +namespace std { + +template +struct hash<::FlexFlow::LegionOrdered> { + size_t operator()(::FlexFlow::LegionOrdered const &t) const { + static_assert(::FlexFlow::is_hashable::value, + "Elements must be hashable"); + + return get_std_hash(t.contents); + } +}; + +} // namespace std + +namespace rc { + +template +struct Arbitrary<::FlexFlow::LegionOrdered> { + static Gen<::FlexFlow::LegionOrdered> arbitrary() { + return gen::construct<::FlexFlow::LegionOrdered>( + gen::arbitrary<::FlexFlow::stack_vector>()); + } +}; + +} // namespace rc + +#endif diff --git a/lib/kernels/include/kernels/legion_ordered/slice.h b/lib/kernels/include/kernels/legion_ordered/slice.h new file mode 100644 index 0000000000..6980c0d9ec --- /dev/null +++ b/lib/kernels/include/kernels/legion_ordered/slice.h @@ -0,0 +1,24 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_SLICE_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_SLICE_H + +#include "kernels/legion_ordered/legion_ordered.h" +#include "utils/containers/slice.h" +#include "utils/containers/transform.h" +#include 
"utils/containers/vector_of.h" + +namespace FlexFlow { + +template +LegionOrdered slice(LegionOrdered const &d, + legion_dim_t const &start, + std::optional const &end) { + int raw_start = start.value.unwrap_nonnegative(); + std::optional raw_end = transform( + end, [](legion_dim_t const &i) { return i.value.unwrap_nonnegative(); }); + + return LegionOrdered{slice(vector_of(d), raw_start, raw_end)}; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/legion_ordered/transform.h b/lib/kernels/include/kernels/legion_ordered/transform.h new file mode 100644 index 0000000000..55cc1ff1ea --- /dev/null +++ b/lib/kernels/include/kernels/legion_ordered/transform.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_TRANSFORM_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_TRANSFORM_H + +#include "kernels/legion_ordered/legion_ordered.h" +#include "utils/containers/vector_of.h" +#include "utils/containers/vector_transform.h" + +namespace FlexFlow { + +template > +LegionOrdered transform(LegionOrdered const &d, F &&f) { + return LegionOrdered{vector_transform(vector_of(d), f)}; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/linear_kernels.h b/lib/kernels/include/kernels/linear_kernels.h index 3128e39fd0..21d84c2567 100644 --- a/lib/kernels/include/kernels/linear_kernels.h +++ b/lib/kernels/include/kernels/linear_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H -#include "device.h" #include "ff_handle.h" +#include "kernels/device.h" #include "op-attrs/datatype.h" #include "op-attrs/ops/linear_attrs.dtg.h" @@ -33,8 +33,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LinearPerDeviceState, weight_type, output_type); -namespace Kernels { -namespace Linear { +namespace Kernels::Linear { LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, float *one_ptr, @@ -51,29 +50,28 @@ bool use_activation(Activation activation); void forward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *output_ptr, - void const *filter_ptr, - void const *bias_ptr, + float const *input_ptr, + float *output_ptr, + float const *filter_ptr, + float const *bias_ptr, int in_dim, int out_dim, int batch_size); void backward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, - void const *output_ptr, - void *output_grad_ptr, - void const *kernel_ptr, - void *kernel_grad_ptr, - void *bias_ptr, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_grad_ptr, int in_dim, int out_dim, int batch_size); -} // namespace Linear -} // namespace Kernels +} // namespace Kernels::Linear } // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/local_cpu_allocator.h b/lib/kernels/include/kernels/local_cpu_allocator.h similarity index 74% rename from lib/local-execution/include/local-execution/local_cpu_allocator.h rename to lib/kernels/include/kernels/local_cpu_allocator.h index d1e81facf2..9653dcf00e 100644 --- a/lib/local-execution/include/local-execution/local_cpu_allocator.h +++ b/lib/kernels/include/kernels/local_cpu_allocator.h @@ -1,3 +1,6 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LOCAL_CPU_ALLOCATOR_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LOCAL_CPU_ALLOCATOR_H + #include "kernels/allocation.h" #include @@ 
-12,6 +15,8 @@ struct LocalCPUAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + DeviceType get_allocation_device_type() const override; + private: std::unordered_map> ptrs; }; @@ -20,3 +25,5 @@ CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalCPUAllocator); Allocator create_local_cpu_memory_allocator(); } // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/local_cuda_allocator.h b/lib/kernels/include/kernels/local_cuda_allocator.h index 18a4b6e78a..b8e0540974 100644 --- a/lib/kernels/include/kernels/local_cuda_allocator.h +++ b/lib/kernels/include/kernels/local_cuda_allocator.h @@ -12,6 +12,8 @@ struct LocalCudaAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + DeviceType get_allocation_device_type() const override; + private: std::unordered_set ptrs; }; diff --git a/lib/kernels/include/kernels/managed_ff_stream.h b/lib/kernels/include/kernels/managed_ff_stream.h index 2f690b2eb3..576edb0ffa 100644 --- a/lib/kernels/include/kernels/managed_ff_stream.h +++ b/lib/kernels/include/kernels/managed_ff_stream.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_MANAGED_FF_STREAM_H #define _FLEXFLOW_KERNELS_MANAGED_FF_STREAM_H -#include "device.h" +#include "kernels/device.h" namespace FlexFlow { @@ -19,6 +19,9 @@ struct ManagedFFStream { ffStream_t const &raw_stream() const; +private: + void cleanup(); + private: ffStream_t *stream; }; diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h index 0a83a5eecb..9bd9370685 100644 --- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h +++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h @@ -7,7 +7,10 @@ namespace FlexFlow { struct ManagedPerDeviceFFHandle { public: - ManagedPerDeviceFFHandle(); + ManagedPerDeviceFFHandle() = delete; + + ManagedPerDeviceFFHandle(size_t workSpaceSize, + bool allowTensorOpMathConversion); ManagedPerDeviceFFHandle(ManagedPerDeviceFFHandle const &) = delete; ManagedPerDeviceFFHandle & @@ -21,6 +24,9 @@ struct ManagedPerDeviceFFHandle { PerDeviceFFHandle const &raw_handle() const; +private: + void cleanup(); + private: PerDeviceFFHandle *handle; }; diff --git a/lib/kernels/include/kernels/metrics_kernels.h b/lib/kernels/include/kernels/metrics_kernels.h index e4660808b9..430608db55 100644 --- a/lib/kernels/include/kernels/metrics_kernels.h +++ b/lib/kernels/include/kernels/metrics_kernels.h @@ -1,25 +1,24 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H -#include "perf_metrics.h" +#include "kernels/perf_metrics.h" +#include "pcg/metric_attrs.h" namespace FlexFlow { -void update_metrics_sparse_label_kernel(ffStream_t, - MetricsAttrs const &, - float const *logit_ptr, - int const *label_ptr, - int num_samples, - int num_classes, - PerfMetrics &perf_zc); -void update_metrics_label_kernel(ffStream_t, - MetricsAttrs const &, - float const *logit_ptr, - float const *label_ptr, - int num_samples, - int num_classes, - PerfMetrics &perf_zc); +void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, + int const *label_ptr, + MetricsAttrs const &me, + int num_effective_samples, + int num_classes, + PerfMetrics &perf_zc); +void update_metrics_label_kernel_wrapper(float const *logit_ptr, + float const *label_ptr, + MetricsAttrs const &me, + int num_samples, + int num_classes, + PerfMetrics &perf_zc); } // namespace FlexFlow #endif diff --git 
a/lib/kernels/include/kernels/nccl.h b/lib/kernels/include/kernels/nccl.h index b8a6784676..042911d172 100644 --- a/lib/kernels/include/kernels/nccl.h +++ b/lib/kernels/include/kernels/nccl.h @@ -23,15 +23,11 @@ struct ncclUniqueId {}; struct ncclComm_t {}; #endif -namespace FlexFlow { -namespace Kernels { -namespace NCCL { +namespace FlexFlow::Kernels::NCCL { ncclUniqueId generate_unique_id(); ncclComm_t create_comm(ncclUniqueId const &, int num_ranks, int my_rank); -} // namespace NCCL -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::NCCL #endif diff --git a/lib/kernels/include/kernels/optimizer_kernels.h b/lib/kernels/include/kernels/optimizer_kernels.h index 9ca6bf8e2b..d552831c78 100644 --- a/lib/kernels/include/kernels/optimizer_kernels.h +++ b/lib/kernels/include/kernels/optimizer_kernels.h @@ -1,7 +1,8 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H -#include "device.h" +#include "kernels/device.h" +#include "kernels/ff_handle.h" namespace FlexFlow { @@ -16,15 +17,18 @@ void sgd_ps_update_task_gpu(ffStream_t, float *weight_ptr, float *sgd_v_ptr); +#ifdef FF_USE_NCCL void sgd_nccl_update_task_gpu(ffStream_t, float lr, float momentum, bool nesterov, - float weight_decay PerDeviceFFHandle const &, + float weight_decay, + PerDeviceFFHandle const &, float const *weight_grad_ptr, size_t size, float *weight_ptr, float *sgd_v_ptr); +#endif void adam_ps_update_task_gpu(ffStream_t, float alpha_t, @@ -33,9 +37,11 @@ void adam_ps_update_task_gpu(ffStream_t, float weight_decay, float epsilon, float const *weight_grad_ptr, - float *adam_m_ptr, + size_t size, + int num_replicas, + float *weight_ptr, float *adam_v_ptr, - float *weight_ptr); + float *adam_m_ptr); void adam_nccl_update_task_gpu(ffStream_t, float alpha_t, @@ -45,9 +51,10 @@ void adam_nccl_update_task_gpu(ffStream_t, float epsilon, PerDeviceFFHandle const &, float const *weight_grad_ptr, - float *adam_m_ptr, + size_t size, + float *weight_ptr, float *adam_v_ptr, - float *weight_ptr); + float *adam_m_ptr); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/partition_kernels.h b/lib/kernels/include/kernels/partition_kernels.h index 64ef1a1352..aa3a7a1ef7 100644 --- a/lib/kernels/include/kernels/partition_kernels.h +++ b/lib/kernels/include/kernels/partition_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" namespace FlexFlow { @@ -13,8 +13,7 @@ struct RepartitionPerDeviceState { FF_VISITABLE_STRUCT_NO_EQ(RepartitionPerDeviceState, handle, data_type); -namespace Kernels { -namespace Repartition { +namespace Kernels::Repartition { RepartitionPerDeviceState init_kernel(PerDeviceFFHandle const &handle, DataType data_type); @@ -26,11 +25,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, RepartitionPerDeviceState const &m, - GenericTensorAccessorW const &output_grad, - GenericTensorAccessorR const &input_grad); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); -} // namespace Repartition -} // namespace Kernels +} // namespace Kernels::Repartition } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H diff --git a/lib/local-execution/include/local-execution/per_device_op_state.variant.toml b/lib/kernels/include/kernels/per_device_op_state.variant.toml similarity index 
100% rename from lib/local-execution/include/local-execution/per_device_op_state.variant.toml rename to lib/kernels/include/kernels/per_device_op_state.variant.toml diff --git a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h index 798c0507f8..76aa07d0a4 100644 --- a/lib/kernels/include/kernels/pool_2d_kernels.h +++ b/lib/kernels/include/kernels/pool_2d_kernels.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H -#include "device.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include "op-attrs/activation.dtg.h" #include "op-attrs/ops/pool_2d.h" @@ -25,8 +25,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Pool2DPerDeviceState, poolDesc, relu); -namespace Kernels { -namespace Pool2D { +namespace Kernels::Pool2D { Pool2DPerDeviceState init_kernel(PerDeviceFFHandle handle, std::optional activation, @@ -70,13 +69,12 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Pool2DPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, void const *output_ptr, - void const *output_grad_ptr); + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr); -} // namespace Pool2D -} // namespace Kernels +} // namespace Kernels::Pool2D } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H diff --git a/lib/kernels/include/kernels/profiling.h b/lib/kernels/include/kernels/profiling.h index 655d540685..7c4145c426 100644 --- a/lib/kernels/include/kernels/profiling.h +++ b/lib/kernels/include/kernels/profiling.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_PROFILING_H #define _FLEXFLOW_KERNELS_PROFILING_H -#include "device.h" +#include "kernels/device.h" #include "kernels/profiling_settings.dtg.h" #include "utils/visitable.h" diff --git a/lib/kernels/include/kernels/reduce_kernels.h b/lib/kernels/include/kernels/reduce_kernels.h index 4287472875..10e8e4393b 100644 --- a/lib/kernels/include/kernels/reduce_kernels.h +++ b/lib/kernels/include/kernels/reduce_kernels.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H -#include "array_shape.h" -#include "device.h" -#include "ff_handle.h" +#include "kernels/array_shape.h" +#include "kernels/device.h" +#include "kernels/ff_handle.h" #include "op-attrs/operator_type.dtg.h" namespace FlexFlow { @@ -25,8 +25,7 @@ FF_VISITABLE_STRUCT(ReducePerDeviceState, op_type, reduction_size); -namespace Kernels { -namespace Reduce { +namespace Kernels::Reduce { ReducePerDeviceState init_kernel(PerDeviceFFHandle const &, OperatorType const &, @@ -43,8 +42,7 @@ void backward_kernel(ffStream_t stream, ReducePerDeviceState const &m, float const *output_grad_ptr, float *input_grad_ptr); -} // namespace Reduce -} // namespace Kernels +} // namespace Kernels::Reduce } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H diff --git a/lib/kernels/include/kernels/reduction_kernels.h b/lib/kernels/include/kernels/reduction_kernels.h index fb3baf215c..08f73cd9ab 100644 --- a/lib/kernels/include/kernels/reduction_kernels.h +++ b/lib/kernels/include/kernels/reduction_kernels.h @@ -1,12 +1,10 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Reduction { +namespace FlexFlow::Kernels::Reduction { void forward_kernel(ffStream_t stream, 
GenericTensorAccessorR const &input, @@ -14,11 +12,9 @@ void forward_kernel(ffStream_t stream, size_t num_replicas); void backward_kernel(ffStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); -} // namespace Reduction -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Reduction #endif // _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H diff --git a/lib/kernels/include/kernels/replicate_kernels.h b/lib/kernels/include/kernels/replicate_kernels.h index 409fc81f44..0b113868ee 100644 --- a/lib/kernels/include/kernels/replicate_kernels.h +++ b/lib/kernels/include/kernels/replicate_kernels.h @@ -1,24 +1,20 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Replicate { +namespace FlexFlow::Kernels::Replicate { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas); -} // namespace Replicate -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Replicate #endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h new file mode 100644 index 0000000000..2a2eaa5eb6 --- /dev/null +++ b/lib/kernels/include/kernels/replicate_kernels_cpu.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Replicate { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, + size_t num_replicas); + +} // namespace FlexFlow::Kernels::Replicate + +#endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index a83caa6bea..88c11d2fb0 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "utils/required_core.h" namespace FlexFlow { @@ -13,8 +13,7 @@ struct ReshapePerDeviceState { FF_VISITABLE_STRUCT(ReshapePerDeviceState, data_type); -namespace Kernels { -namespace Reshape { +namespace Kernels::Reshape { ReshapePerDeviceState init_kernel(DataType data_type); @@ -25,11 +24,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, ReshapePerDeviceState const &per_device_state, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); -} // namespace Reshape -} // namespace Kernels +} // namespace Kernels::Reshape } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H diff --git 
a/lib/kernels/include/kernels/reverse_kernels.h b/lib/kernels/include/kernels/reverse_kernels.h index 42a83ae219..768707175c 100644 --- a/lib/kernels/include/kernels/reverse_kernels.h +++ b/lib/kernels/include/kernels/reverse_kernels.h @@ -1,30 +1,21 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H -#include "device.h" +#include "kernels/device.h" +#include "kernels/reverse_kernels_cpu.h" -namespace FlexFlow { -namespace Kernels { -namespace Reverse { +namespace FlexFlow::Kernels::Reverse { void forward_kernel(ffStream_t stream, - float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t output_size); + GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, + ReverseAttrs const &); void backward_kernel(ffStream_t stream, - float const *out_grad_ptr, - float *in_grad_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t input_size); + GenericTensorAccessorR const &output_accessor, + GenericTensorAccessorW &input_accessor, + ReverseAttrs const &); -} // namespace Reverse -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Reverse #endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h new file mode 100644 index 0000000000..ec82000f8f --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h @@ -0,0 +1,20 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" +#include "op-attrs/ops/reverse_attrs.dtg.h" + +namespace FlexFlow::Kernels::Reverse { + +void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, + ReverseAttrs const &); + +void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, + GenericTensorAccessorW &input_accessor, + ReverseAttrs const &); + +} // namespace FlexFlow::Kernels::Reverse + +#endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/reverse_kernels_params.h b/lib/kernels/include/kernels/reverse_kernels_params.h new file mode 100644 index 0000000000..766d70b915 --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_params.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REVERSE_KERNELS_PARAMS_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REVERSE_KERNELS_PARAMS_H + +#include "kernels/array_shape.h" +#include "kernels/reverse_kernels_params.dtg.h" +#include "op-attrs/ops/reverse_attrs.dtg.h" + +namespace FlexFlow { + +ReverseKernelsParams + compute_reverse_kernels_params(ArrayShape const &output_shape, + ReverseAttrs const &attrs); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/reverse_kernels_params.struct.toml b/lib/kernels/include/kernels/reverse_kernels_params.struct.toml new file mode 100644 index 0000000000..a5dbd750bc --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_params.struct.toml @@ -0,0 +1,28 @@ +namespace = "FlexFlow" +name = "ReverseKernelsParams" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + +[[fields]] +name = "num_out_blks" +type = "::FlexFlow::nonnegative_int" + +[[fields]] +name = "reverse_dim_size" +type = "::FlexFlow::nonnegative_int" + +[[fields]] 
+name = "in_blk_size" +type = "::FlexFlow::nonnegative_int" + +[[fields]] +name = "out_size" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/kernels/include/kernels/softmax_kernels.h b/lib/kernels/include/kernels/softmax_kernels.h index 061230ec52..60101578e3 100644 --- a/lib/kernels/include/kernels/softmax_kernels.h +++ b/lib/kernels/include/kernels/softmax_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_SOFTMAX_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_SOFTMAX_KERNELS_H -#include "device.h" #include "ff_handle.h" +#include "kernels/device.h" namespace FlexFlow { @@ -15,8 +15,7 @@ struct SoftmaxPerDeviceState { FF_VISITABLE_STRUCT(SoftmaxPerDeviceState, handle, inputTensor, dim); -namespace Kernels { -namespace Softmax { +namespace Kernels::Softmax { SoftmaxPerDeviceState init_kernel(PerDeviceFFHandle const &handle, int dim, @@ -31,12 +30,11 @@ void forward_kernel(ffStream_t stream, float *output_ptr); void backward_kernel(ffStream_t stream, - float *input_grad_ptr, float const *output_grad_ptr, + float *input_grad_ptr, size_t num_elements); -} // namespace Softmax -} // namespace Kernels +} // namespace Kernels::Softmax } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/split_kernels.h b/lib/kernels/include/kernels/split_kernels.h index 36434d4be8..3b580f94be 100644 --- a/lib/kernels/include/kernels/split_kernels.h +++ b/lib/kernels/include/kernels/split_kernels.h @@ -1,12 +1,9 @@ #ifndef _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H -#include "device.h" +#include "kernels/device.h" -namespace FlexFlow { - -namespace Kernels { -namespace Split { +namespace FlexFlow::Kernels::Split { void forward_kernel(ffStream_t stream, float **out_ptrs, float const *in_ptr, @@ -22,8 +19,6 @@ void backward_kernel(ffStream_t stream, coord_t num_blks, int numOutputs); -} // namespace Split -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Split #endif // _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H diff --git a/lib/kernels/include/kernels/topk_kernels.h b/lib/kernels/include/kernels/topk_kernels.h index ae1c739f6c..085594d57f 100644 --- a/lib/kernels/include/kernels/topk_kernels.h +++ b/lib/kernels/include/kernels/topk_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H -#include "device.h" #include "kernels/allocation.h" +#include "kernels/device.h" namespace FlexFlow { @@ -12,8 +12,7 @@ struct TopKPerDeviceState { FF_VISITABLE_STRUCT(TopKPerDeviceState, sorted); -namespace Kernels { -namespace TopK { +namespace Kernels::TopK { TopKPerDeviceState init_kernel(bool sorted); @@ -35,8 +34,7 @@ void backward_kernel(ffStream_t stream, int length, int k); -} // namespace TopK -} // namespace Kernels +} // namespace Kernels::TopK } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H diff --git a/lib/kernels/include/kernels/transpose_kernels.h b/lib/kernels/include/kernels/transpose_kernels.h index 0f1cc2ae61..776370dcbd 100644 --- a/lib/kernels/include/kernels/transpose_kernels.h +++ b/lib/kernels/include/kernels/transpose_kernels.h @@ -1,15 +1,14 @@ #ifndef _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "op-attrs/ops/transpose_attrs.dtg.h" #include namespace FlexFlow { -namespace Kernels { -namespace Transpose { +namespace Kernels::Transpose { void forward_kernel(cudaStream_t stream, TransposeAttrs 
const &attrs, @@ -18,11 +17,10 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, TransposeAttrs const &attrs, - GenericTensorAccessorW const &in_grad, - GenericTensorAccessorR const &out_grad); + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad); -} // namespace Transpose -} // namespace Kernels +} // namespace Kernels::Transpose } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc deleted file mode 100644 index 27b7eb390d..0000000000 --- a/lib/kernels/src/accessor.cc +++ /dev/null @@ -1,192 +0,0 @@ -#include "kernels/accessor.h" - -namespace FlexFlow { - -int32_t *GenericTensorAccessorW::get_int32_ptr() const { - return this->get(); -} - -int64_t *GenericTensorAccessorW::get_int64_ptr() const { - return this->get(); -} - -float *GenericTensorAccessorW::get_float_ptr() const { - return this->get(); -} - -double *GenericTensorAccessorW::get_double_ptr() const { - return this->get(); -} - -half *GenericTensorAccessorW::get_half_ptr() const { - return this->get(); -} - -std::string format_as(GenericTensorAccessorW const &a) { - return fmt::format("", - a.data_type, - a.shape, - a.ptr); -} - -std::ostream &operator<<(std::ostream &s, GenericTensorAccessorW const &a) { - return (s << fmt::to_string(a)); -} - -int32_t const *GenericTensorAccessorR::get_int32_ptr() const { - return this->get(); -} - -int64_t const *GenericTensorAccessorR::get_int64_ptr() const { - return this->get(); -} - -float const *GenericTensorAccessorR::get_float_ptr() const { - return this->get(); -} - -double const *GenericTensorAccessorR::get_double_ptr() const { - return this->get(); -} - -half const *GenericTensorAccessorR::get_half_ptr() const { - return get(); -} - -std::string format_as(GenericTensorAccessorR const &a) { - return fmt::format("", - a.data_type, - a.shape, - a.ptr); -} - -std::ostream &operator<<(std::ostream &s, GenericTensorAccessorR const &a) { - return (s << fmt::to_string(a)); -} - -int32_t *get_int32_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -int64_t *get_int64_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -float *get_float_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -double *get_double_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -half *get_half_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -std::vector - get_int32_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_int64_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_float_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_double_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_half_ptrs(std::vector const &a) { - return get(a); -} - -int32_t const *get_int32_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -int64_t const *get_int64_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -float const *get_float_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -double const *get_double_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -half const *get_half_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -std::vector - get_int32_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_int64_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_float_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_double_ptrs(std::vector const 
&a) { - return get(a); -} - -std::vector - get_half_ptrs(std::vector const &a) { - return get(a); -} - -GenericTensorAccessorR read_only_accessor_from_write_accessor( - GenericTensorAccessorW const &writable) { - return GenericTensorAccessorR{ - writable.data_type, writable.shape, req(writable.ptr)}; -} - -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2) { - return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; -} - -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - -bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - -std::pair - get_shape_and_datatype(GenericTensorAccessorR const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); -} - -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); -} - -} // namespace FlexFlow diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc deleted file mode 100644 index d666592e77..0000000000 --- a/lib/kernels/src/allocation.cc +++ /dev/null @@ -1,21 +0,0 @@ -#include "kernels/allocation.h" -#include "op-attrs/tensor_shape.h" - -namespace FlexFlow { - -void *Allocator::allocate(size_t mem_size) { - return this->i_allocator->allocate(mem_size); -} - -void Allocator::deallocate(void *ptr) { - this->i_allocator->deallocate(ptr); -} - -GenericTensorAccessorW - Allocator::allocate_tensor(TensorShape const &tensor_shape) { - void *ptr = - this->allocate(get_size_in_bytes(tensor_shape).unwrap_nonnegative()); - return {tensor_shape.data_type, tensor_shape, ptr}; -} - -} // namespace FlexFlow diff --git a/lib/kernels/src/cpu/ops/cast_kernels.cc b/lib/kernels/src/cpu/ops/cast_kernels.cc new file mode 100644 index 0000000000..cdd57b8947 --- /dev/null +++ b/lib/kernels/src/cpu/ops/cast_kernels.cc @@ -0,0 +1,51 @@ +#include "kernels/cast_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow::Kernels::Cast { + +template +void cpu_cast_forward(IDT const *input, ODT *output, size_t volume) { + for (size_t i = 0; i < volume; ++i) { + output[i] = static_cast(input[i]); + } +} + +template +void cpu_cast_backward(IDT const *input, ODT *output, size_t volume, ODT beta) { + for (size_t i = 0; i < volume; i++) { + output[i] = static_cast(input[i]) + beta * output[i]; + } +} + +template +struct CPUForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + size_t volume = input.shape.get_volume().unwrap_nonnegative(); + cpu_cast_forward(input.get(), output.get(), volume); + } +}; + +template +struct CPUBackwardKernel { + void operator()(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + size_t volume = output.shape.get_volume().unwrap_nonnegative(); + cpu_cast_backward( + output.get(), input.get(), volume, cast_to(1.0f)); + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch2{}( + input.data_type, output.data_type, input, output); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + 
DataTypeDispatch2{}( + output.data_type, input.data_type, output, input); +} + +} // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/src/cpu/ops/combine_kernels.cc b/lib/kernels/src/cpu/ops/combine_kernels.cc new file mode 100644 index 0000000000..577984f21a --- /dev/null +++ b/lib/kernels/src/cpu/ops/combine_kernels.cc @@ -0,0 +1,39 @@ +#include "kernels/combine_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow::Kernels::Combine { + +template +struct CPUForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + memcpy(output.get
<DT>(), + input.get<DT>
(), + input.shape.get_volume().unwrap_nonnegative() * + size_of_datatype(DT).unwrap_nonnegative()); + } +}; + +template <DataType DT> +struct CPUBackwardKernel { + void operator()(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { + size_t num_elements = output_grad.shape.get_volume().unwrap_nonnegative(); + for (size_t i = 0; i < num_elements; ++i) { + input_grad.get<DT>
()[i] += output_grad.get<DT>
()[i]; + } + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch1<CPUForwardKernel>{}(input.data_type, input, output); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { + DataTypeDispatch1<CPUBackwardKernel>{}( + input_grad.data_type, output_grad, input_grad); +} + +} // namespace FlexFlow::Kernels::Combine diff --git a/lib/kernels/src/cpu/initializer_kernels.cc b/lib/kernels/src/cpu/ops/initializer_kernels.cc similarity index 100% rename from lib/kernels/src/cpu/initializer_kernels.cc rename to lib/kernels/src/cpu/ops/initializer_kernels.cc diff --git a/lib/kernels/src/cpu/ops/replicate_kernels.cc b/lib/kernels/src/cpu/ops/replicate_kernels.cc new file mode 100644 index 0000000000..798a4ea8c7 --- /dev/null +++ b/lib/kernels/src/cpu/ops/replicate_kernels.cc @@ -0,0 +1,51 @@ +#include "kernels/datatype_dispatch.h" +#include "kernels/replicate_kernels_cpu.h" + +namespace FlexFlow::Kernels::Replicate { + +template <DataType DT> +struct CPUForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output) { + memcpy(output.get<DT>
(), + input.get<DT>
(), + input.shape.num_elements().unwrap_nonnegative() * + size_of_datatype(DT).unwrap_nonnegative()); + } +}; + +template <DataType DT> +struct CPUBackwardKernel { + void operator()(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, + nonnegative_int num_elements, + nonnegative_int num_replicas) { + using T = real_type_t<DT>
; + for (nonnegative_int i : nonnegative_range(num_elements)) { + T cur_sum = 0; + for (nonnegative_int replica_idx : nonnegative_range(num_replicas)) { + cur_sum += output.at<DT>
(LegionOrdered{replica_idx, i}); + } + input.at<DT>
(LegionOrdered{i}) = cur_sum; + } + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output) { + DataTypeDispatch1<CPUForwardKernel>{}(input.data_type, input, output); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, + size_t num_replicas) { + nonnegative_int num_elements = input.shape.num_elements(); + DataTypeDispatch1<CPUBackwardKernel>{}(input.data_type, + output, + input, + num_elements, + nonnegative_int{num_replicas}); +} + +} // namespace FlexFlow::Kernels::Replicate diff --git a/lib/kernels/src/cpu/ops/reverse_kernels.cc b/lib/kernels/src/cpu/ops/reverse_kernels.cc new file mode 100644 index 0000000000..4d9eb8cc09 --- /dev/null +++ b/lib/kernels/src/cpu/ops/reverse_kernels.cc @@ -0,0 +1,46 @@ +#include "kernels/datatype_dispatch.h" +#include "kernels/reverse_kernels_cpu.h" +#include + +namespace FlexFlow::Kernels::Reverse { + +template <DataType DT> +struct CPUReverseForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + ReverseAttrs const &attrs) { + nonnegative_int reverse_axis_size = input.shape.at(attrs.axis); + + for (ArrayCoord const &input_coord : get_array_coord_set(input.shape)) { + nonnegative_int input_reverse_axis_coord = + input_coord.ff_ordered.at(attrs.axis); + + ArrayCoord output_coord = input_coord; + output_coord.ff_ordered.at(attrs.axis) = + nonnegative_int{reverse_axis_size.unwrap_nonnegative() - + input_reverse_axis_coord.unwrap_nonnegative() - 1}; + + output.at<DT>
(output_coord.ff_ordered) = + input.at<DT>
(input_coord.ff_ordered); + } + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, + ReverseAttrs const &attrs) { + + DataTypeDispatch1{}( + input_accessor.data_type, input_accessor, output_accessor, attrs); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad_accessor, + GenericTensorAccessorW &input_grad_accessor, + ReverseAttrs const &attrs) { + DataTypeDispatch1{}(output_grad_accessor.data_type, + output_grad_accessor, + input_grad_accessor, + attrs); +} + +} // namespace FlexFlow::Kernels::Reverse diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu index 66388c0ec8..86b2d8a437 100644 --- a/lib/kernels/src/cuda/cuda_helper.cu +++ b/lib/kernels/src/cuda/cuda_helper.cu @@ -1,4 +1,4 @@ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "utils/containers/reversed.h" @@ -29,13 +29,13 @@ cudaError_t get_legion_stream(cudaStream_t *stream) { #error "Unknown device, please make sure if CUDA is enabled" #endif -__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) { +__global__ void scale_kernel(float *ptr, size_t size, float a, float b) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = (b - a) * ptr[i] + a; } } -__global__ void ones_kernel(float *ptr, coord_t size) { +__global__ void ones_kernel(float *ptr, size_t size) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = 1.0f; } @@ -49,7 +49,7 @@ __global__ void assign_kernel(DT *ptr, size_t size, DT value) { } template -__global__ void copy_kernel(DT *dst, const DT *src, coord_t size) { +__global__ void copy_kernel(DT *dst, const DT *src, size_t size) { CUDA_KERNEL_LOOP(i, size) { dst[i] = src[i]; } @@ -281,11 +281,11 @@ template __global__ void add_kernel(bool *dst, bool const *src, unsigned long size); template __global__ void - copy_kernel(float *dst, float const *src, coord_t size); + copy_kernel(float *dst, float const *src, size_t size); template __global__ void - copy_kernel(int32_t *dst, int32_t const *src, coord_t size); + copy_kernel(int32_t *dst, int32_t const *src, size_t size); template __global__ void - copy_kernel(int64_t *dst, int64_t const *src, coord_t size); + copy_kernel(int64_t *dst, int64_t const *src, size_t size); template __global__ void apply_add_with_scale(float *data_ptr, float const *grad_ptr, diff --git a/lib/kernels/src/cuda/embedding_kernels.cu b/lib/kernels/src/cuda/embedding_kernels.cu index e6a614ba70..cb84f0e777 100644 --- a/lib/kernels/src/cuda/embedding_kernels.cu +++ b/lib/kernels/src/cuda/embedding_kernels.cu @@ -13,16 +13,15 @@ * limitations under the License. 
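Note on the coord_t-to-size_t migration in cuda_helper.cu above: these helpers are all built on the grid-stride-loop idiom, so widening the index type only changes the loop bound, not the traversal. A minimal sketch of the idiom, assuming a CUDA_KERNEL_LOOP macro of the usual shape (the real definition lives in internal/device.h and may differ in detail):

// Hypothetical expansion of CUDA_KERNEL_LOOP; sketch only.
#define CUDA_KERNEL_LOOP(i, n)                                      \
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);   \
       i += (size_t)blockDim.x * gridDim.x)

// scale_kernel from above under that expansion: every thread strides
// through the buffer applying the affine map x -> a + (b - a) * x.
__global__ void scale_kernel_sketch(float *ptr, size_t size, float a, float b) {
  CUDA_KERNEL_LOOP(i, size) {
    ptr[i] = (b - a) * ptr[i] + a;
  }
}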
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/embedding_kernels.h" -namespace FlexFlow { -namespace Kernels { -namespace Embedding { +namespace FlexFlow::Kernels::Embedding { void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); // Randomly initialize the intput tensor to avoid out of index range issues rand_generate_int<<>>( @@ -31,36 +30,14 @@ void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p) { cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); // Randomly initialize the intput tensor to avoid out of index range issues rand_generate_int<<>>( ptr, size, p); } -template -__global__ void embed_forward_no_aggr( - TI const *input, TD *output, TD const *embed, int out_dim, int batch_size); -template -__global__ void embed_forward_with_aggr(TI const *input, - TD *output, - TD const *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr); -template -__global__ void embed_backward_no_aggr( - TI const *input, TD const *output, TD *embed, int out_dim, int batch_size); -template -__global__ void embed_backward_with_aggr(TI const *input, - TD const *output, - TD *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr); - -template +template __global__ void embed_forward_no_aggr(int32_t const *input, TD *output, TD const *embed, @@ -75,7 +52,7 @@ __global__ void embed_forward_no_aggr(int32_t const *input, } } -template +template __global__ void embed_forward_no_aggr(int64_t const *input, TD *output, TD const *embed, @@ -90,14 +67,14 @@ __global__ void embed_forward_no_aggr(int64_t const *input, } } -template +template __global__ void embed_forward_with_aggr(int32_t const *input, TD *output, TD const *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { output[i] = 0; @@ -115,14 +92,14 @@ __global__ void embed_forward_with_aggr(int32_t const *input, } } -template +template __global__ void embed_forward_with_aggr(int64_t const *input, TD *output, TD const *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { output[i] = 0; @@ -140,7 +117,7 @@ __global__ void embed_forward_with_aggr(int64_t const *input, } } -template +template __global__ void embed_backward_no_aggr(int32_t const *input, TD const *output, TD *embed, @@ -154,7 +131,7 @@ __global__ void embed_backward_no_aggr(int32_t const *input, } } -template +template __global__ void embed_backward_no_aggr(int64_t const *input, TD const *output, TD *embed, @@ -171,11 +148,11 @@ __global__ void embed_backward_no_aggr(int64_t const *input, // Specialization for half type template <> -__global__ void embed_backward_no_aggr(int32_t const *input, - half const *output, - half *embed, - int out_dim, - int batch_size) { +__global__ void embed_backward_no_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; @@ -192,11 +169,11 @@ __global__ void embed_backward_no_aggr(int32_t const *input, } template <> -__global__ void embed_backward_no_aggr(int64_t const *input, - half const *output, - half *embed, - int out_dim, - int 
batch_size) { +__global__ void embed_backward_no_aggr(int64_t const *input, + half const *output, + half *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; @@ -212,14 +189,14 @@ __global__ void embed_backward_no_aggr(int64_t const *input, } } -template +template __global__ void embed_backward_with_aggr(int32_t const *input, TD const *output, TD *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -238,14 +215,14 @@ __global__ void embed_backward_with_aggr(int32_t const *input, } } -template +template __global__ void embed_backward_with_aggr(int64_t const *input, TD const *output, TD *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -267,14 +244,13 @@ __global__ void embed_backward_with_aggr(int64_t const *input, // Specialization for half type template <> -__global__ void - embed_backward_with_aggr(int32_t const *input, - half const *output, - half *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr) { +__global__ void embed_backward_with_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -301,14 +277,13 @@ __global__ void } template <> -__global__ void - embed_backward_with_aggr(int64_t const *input, - half const *output, - half *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr) { +__global__ void embed_backward_with_aggr(int64_t const *input, + half const *output, + half *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -351,35 +326,229 @@ struct ForwardKernel { int in_dim, int out_dim, int batch_size) { - assert(input.data_type == DataType::INT32 || - input.data_type == DataType::INT64); - assert(weight.data_type == DataType::HALF || - weight.data_type == DataType::FLOAT || - weight.data_type == DataType::DOUBLE); + throw mk_runtime_error(fmt::format( + "Invalid type combination: input type {} and output type {}", TI, TD)); + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { if (!aggr.has_value()) { - embed_forward_no_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight.get(), + stream>>>(input.get(), + output.get(), + weight.get(), out_dim, 
batch_size); } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - embed_forward_with_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight.get(), + stream>>>(input.get(), + output.get(), + weight.get(), out_dim, in_dim, batch_size, - aggr); + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); } } }; @@ -388,39 +557,229 @@ template struct BackwardKernel { void operator()(cudaStream_t stream, std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + throw mk_runtime_error(fmt::format( + "Invalid type combination: input type {} and output type {}", TI, TD)); + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + 
out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, int in_dim, int out_dim, int batch_size) { - assert(input.data_type == DataType::INT32 || - input.data_type == DataType::INT64); - assert(output.data_type == DataType::HALF || - output.data_type == DataType::FLOAT || - output.data_type == DataType::DOUBLE); if (!aggr.has_value()) { - embed_backward_no_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight_grad.get(), + stream>>>(input.get(), + output.get(), + weight_grad.get(), out_dim, batch_size); } else { - embed_backward_with_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight_grad.get(), + stream>>>(input.get(), + output.get(), + weight_grad.get(), out_dim, in_dim, batch_size, - aggr); + aggr.value()); } } }; @@ -448,27 +807,25 @@ void forward_kernel(ffStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorR const &input, GenericTensorAccessorR const 
&output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, - DataType input_data_type, DataType output_data_type, + DataType input_data_type, std::optional aggr, int in_dim, int out_dim, int batch_size) { - DataTypeDispatch2{}(input_data_type, - output_data_type, + DataTypeDispatch2{}(output_data_type, + input_data_type, stream, aggr, - input, output, + input, weight_grad, in_dim, out_dim, batch_size); } -} // namespace Embedding -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Embedding diff --git a/lib/kernels/src/cuda/loss_function_kernels.cu b/lib/kernels/src/cuda/loss_function_kernels.cu index 6c22efda21..2fccf4b48f 100644 --- a/lib/kernels/src/cuda/loss_function_kernels.cu +++ b/lib/kernels/src/cuda/loss_function_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/loss_function_kernels.h" namespace FlexFlow { diff --git a/lib/kernels/src/cuda/metrics_functions.cu b/lib/kernels/src/cuda/metrics_functions.cu index 2e037eb472..54ecd076f6 100644 --- a/lib/kernels/src/cuda/metrics_functions.cu +++ b/lib/kernels/src/cuda/metrics_functions.cu @@ -13,17 +13,42 @@ * limitations under the License. */ -#include "flexflow/model.h" -#include "flexflow/utils/cuda_helper.h" +#include "internal/device.h" +#include "kernels/metrics_kernels.h" +#include "kernels/perf_metrics.h" +#include "pcg/metric_attrs.h" namespace FlexFlow { +struct CUDAPerfMetrics { + int train_all; + int train_correct; + float cce_loss; + float sparse_cce_loss; + float mse_loss; + float rmse_loss; + float mae_loss; + double start_time; + double current_time; + + CUDAPerfMetrics() = delete; + CUDAPerfMetrics(PerfMetrics const &perf) + : train_all(perf.train_all), + train_correct(perf.train_correct.value_or(-1)), + cce_loss(perf.cce_loss.value_or(-1)), + sparse_cce_loss(perf.sparse_cce_loss.value_or(-1)), + mse_loss(perf.mse_loss.value_or(-1)), + rmse_loss(perf.rmse_loss.value_or(-1)), + mae_loss(perf.mae_loss.value_or(-1)), start_time(perf.start_time), + current_time(perf.current_time) {} +}; + float const LOG_MIN_VALUE = 0.00000001f; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, - PerfMetrics *perf, - const Metrics metrics, + CUDAPerfMetrics *perf, + const MetricsAttrs metrics, int num_samples, int num_classes) { CUDA_KERNEL_LOOP(b, num_samples) { @@ -72,8 +97,8 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, __global__ void update_metrics_label_kernel(float const *logits, float const *labels, - PerfMetrics *perf, - const Metrics metrics, + CUDAPerfMetrics *perf, + const MetricsAttrs metrics, int num_samples, int num_classes) { CUDA_KERNEL_LOOP(b, num_samples) { @@ -136,17 +161,17 @@ __global__ void update_metrics_label_kernel(float const *logits, } } -void Metrics::update_metrics_sparse_label_kernel_wrapper( - float const *logit_ptr, - int const *label_ptr, - Metrics const *me, - int num_effective_samples, - int num_classes, - PerfMetrics &perf_zc) { - PerfMetrics *perf; - checkCUDA(cudaMalloc(&perf, sizeof(PerfMetrics))); - checkCUDA( - cudaMemcpy(perf, &perf_zc, sizeof(PerfMetrics), cudaMemcpyHostToDevice)); +void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, + int const *label_ptr, + MetricsAttrs const &me, + int num_effective_samples, + int num_classes, + PerfMetrics &perf_zc) { + CUDAPerfMetrics perf(perf_zc); + CUDAPerfMetrics *perf_cuda; + checkCUDA(cudaMalloc(&perf_cuda, 
sizeof(CUDAPerfMetrics))); + checkCUDA(cudaMemcpy( + perf_cuda, &perf, sizeof(CUDAPerfMetrics), cudaMemcpyHostToDevice)); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -154,32 +179,33 @@ void Metrics::update_metrics_sparse_label_kernel_wrapper( CUDA_NUM_THREADS, 0, stream>>>( - logit_ptr, label_ptr, perf, *me, num_effective_samples, num_classes); + logit_ptr, label_ptr, perf_cuda, me, num_effective_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); - checkCUDA( - cudaMemcpy(&perf_zc, perf, sizeof(PerfMetrics), cudaMemcpyDeviceToHost)); - checkCUDA(cudaFree(perf)); + checkCUDA(cudaMemcpy( + &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); + checkCUDA(cudaFree(perf_cuda)); } -void Metrics::update_metrics_label_kernel_wrapper(float const *logit_ptr, - float const *label_ptr, - Metrics const *me, - int num_samples, - int num_classes, - PerfMetrics &perf_zc) { - PerfMetrics *perf; - checkCUDA(cudaMalloc(&perf, sizeof(PerfMetrics))); - checkCUDA( - cudaMemcpy(perf, &perf_zc, sizeof(PerfMetrics), cudaMemcpyHostToDevice)); +void update_metrics_label_kernel_wrapper(float const *logit_ptr, + float const *label_ptr, + MetricsAttrs const &me, + int num_samples, + int num_classes, + PerfMetrics &perf_zc) { + CUDAPerfMetrics perf(perf_zc); + CUDAPerfMetrics *perf_cuda; + checkCUDA(cudaMalloc(&perf_cuda, sizeof(CUDAPerfMetrics))); + checkCUDA(cudaMemcpy( + perf_cuda, &perf, sizeof(CUDAPerfMetrics), cudaMemcpyHostToDevice)); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); update_metrics_label_kernel<<>>( - logit_ptr, label_ptr, perf, *me, num_samples, num_classes); + logit_ptr, label_ptr, perf_cuda, me, num_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); - checkCUDA( - cudaMemcpy(&perf_zc, perf, sizeof(PerfMetrics), cudaMemcpyDeviceToHost)); - checkCUDA(cudaFree(perf)); + checkCUDA(cudaMemcpy( + &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); + checkCUDA(cudaFree(perf_cuda)); } }; // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/attention_kernels.cu b/lib/kernels/src/cuda/ops/attention_kernels.cu index 38c32ad9e4..e5bdb6f21d 100644 --- a/lib/kernels/src/cuda/ops/attention_kernels.cu +++ b/lib/kernels/src/cuda/ops/attention_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/attention_kernels.h" #include "kernels/device.h" diff --git a/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu b/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu index eb23514c5f..348eed9f0c 100644 --- a/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu +++ b/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/batch_matmul_kernels.h" namespace FlexFlow { diff --git a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu index 4e153a028e..ceb3a1b3d9 100644 --- a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu +++ b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
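The rewritten metrics wrappers above both follow one pattern: convert PerfMetrics (with its optional fields) into the POD CUDAPerfMetrics, round-trip it through device memory, and let the kernel update it in place. The pattern in isolation, with names taken from the wrappers above:

// Host-to-device round-trip for the metrics struct.
CUDAPerfMetrics perf(perf_zc); // optional<float> fields collapse to floats
CUDAPerfMetrics *perf_cuda;
checkCUDA(cudaMalloc(&perf_cuda, sizeof(CUDAPerfMetrics)));
checkCUDA(cudaMemcpy(perf_cuda, &perf, sizeof(CUDAPerfMetrics),
                     cudaMemcpyHostToDevice));
// ... kernel launch that updates *perf_cuda ...
checkCUDA(cudaStreamSynchronize(stream));
checkCUDA(cudaMemcpy(&perf, perf_cuda, sizeof(CUDAPerfMetrics),
                     cudaMemcpyDeviceToHost));
checkCUDA(cudaFree(perf_cuda));

One caveat worth flagging: the copy-back lands in the local CUDAPerfMetrics, so unless the results are converted back into the PerfMetrics &perf_zc parameter, callers will not observe the update.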
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/allocation.h" #include "kernels/batch_norm_kernels.h" #include "kernels/ff_handle.h" @@ -53,9 +53,9 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, BatchNormPerDeviceState const &m, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, float *input_grad_ptr, float const *scale_ptr, float *scale_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/cast_kernels.cu b/lib/kernels/src/cuda/ops/cast_kernels.cu index fe7aec68b9..f3ea6db660 100644 --- a/lib/kernels/src/cuda/ops/cast_kernels.cu +++ b/lib/kernels/src/cuda/ops/cast_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/cast_kernels.h" #include "kernels/datatype_dispatch.h" @@ -50,30 +50,26 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - size_t volume = input.shape.get_volume().unwrap_nonnegative(); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + size_t volume = output.shape.get_volume().unwrap_nonnegative(); cast_backward<<>>( - input.get(), output.get(), volume, cast_to(1.0f)); + output.get(), input.get(), volume, cast_to(1.0f)); } }; void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorW const &output) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + input.data_type, output.data_type, stream, input, output); } void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + output.data_type, input.data_type, stream, output, input); } } // namespace Cast diff --git a/lib/kernels/src/cuda/ops/combine_kernels.cu b/lib/kernels/src/cuda/ops/combine_kernels.cu index 7cc67ceed8..08cc343fd2 100644 --- a/lib/kernels/src/cuda/ops/combine_kernels.cu +++ b/lib/kernels/src/cuda/ops/combine_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include "kernels/combine_kernels.h" #include "kernels/datatype_dispatch.h" diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu index 2715ff16e9..37dbbe12f8 100644 --- a/lib/kernels/src/cuda/ops/concat_kernels.cu +++ b/lib/kernels/src/cuda/ops/concat_kernels.cu @@ -13,50 +13,58 @@ * limitations under the License. 
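cast_kernels.cu above drops the explicit DataType parameters: the element types now come from the accessors themselves, and backward takes (output, input) order like the other kernels in this change. A usage sketch, where run_cast is a hypothetical call site:

#include "kernels/cast_kernels.h"

// Hypothetical caller: no DataType arguments, the accessors carry them.
void run_cast(ffStream_t stream,
              FlexFlow::GenericTensorAccessorR const &input,    // e.g. FLOAT
              FlexFlow::GenericTensorAccessorW const &output) { // e.g. INT32
  FlexFlow::Kernels::Cast::forward_kernel(stream, input, output);
}
// Backward mirrors the new ordering, upstream gradient first:
//   Kernels::Cast::backward_kernel(stream, output_grad, input_grad);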
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/concat_kernels.h" #include -namespace FlexFlow { -namespace Kernels { -namespace Concat { +namespace FlexFlow::Kernels::Concat { void calc_blk_size(size_t &num_blocks, size_t &blk_size, ArrayShape const &shape, ff_dim_t axis) { - blk_size = shape.sub_shape(legion_dim_t{0_n}, axis) + legion_dim_t legion_axis = legion_dim_from_ff_dim(axis, shape.num_dims()); + assert(legion_axis.value < shape.num_dims()); + if (legion_axis.value == 0_n) { + legion_axis.value = 1_n; + } + blk_size = shape.sub_shape(legion_dim_t{0_n}, legion_axis) .num_elements() .unwrap_nonnegative(); - num_blocks = - shape.sub_shape(axis, std::nullopt).num_elements().unwrap_nonnegative(); + num_blocks = shape.sub_shape(legion_axis, std::nullopt) + .num_elements() + .unwrap_nonnegative(); } void forward_kernel(cudaStream_t stream, GenericTensorAccessorW const &output, std::vector const &inputs, ff_dim_t axis) { - size_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS]; - int num_inputs = inputs.size(); - assert(num_inputs <= MAX_NUM_INPUTS); + assert(inputs.size() <= MAX_NUM_INPUTS); + size_t num_blocks = 1, output_blk_size = 1; calc_blk_size(num_blocks, output_blk_size, output.shape, axis); - for (int i = 0; i < num_inputs; i++) { - size_t input_num_blocks = 1; - calc_blk_size(input_num_blocks, input_blk_sizes[i], inputs[i].shape, axis); - assert(input_num_blocks == num_blocks); - } - off_t offset = 0; - for (int i = 0; i < num_inputs; i++) { - copy_with_stride<<>>(output.get_float_ptr() + offset, - inputs[i].get_float_ptr(), - num_blocks, + input.get_float_ptr(), + blocks_to_copy, output_blk_size, - input_blk_sizes[i]); - offset += input_blk_sizes[i]; + input_blk_size); + + offset += (output_blk_size == input_blk_size) + ? input_blk_size * input_num_blocks + : input_blk_size; } } @@ -64,32 +72,32 @@ void backward_kernel(cudaStream_t stream, GenericTensorAccessorR const &output_grad, std::vector const &input_grads, ff_dim_t axis) { - size_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS]; - int num_inputs = input_grads.size(); - assert(num_inputs <= MAX_NUM_INPUTS); - + assert(input_grads.size() <= MAX_NUM_INPUTS); + size_t num_blocks = 1, output_blk_size = 1; calc_blk_size(num_blocks, output_blk_size, output_grad.shape, axis); - for (int i = 0; i < num_inputs; i++) { - ArrayShape shape = input_grads[i].shape; - size_t input_num_blocks = 1; - calc_blk_size(input_num_blocks, input_blk_sizes[i], shape, axis); - assert(input_num_blocks == num_blocks); - } - off_t offset = 0; - for (int i = 0; i < num_inputs; i++) { - add_with_stride<<>>(input_grads[i].get_float_ptr(), + stream>>>(input_grad.get_float_ptr(), output_grad.get_float_ptr() + offset, - num_blocks, - input_blk_sizes[i], + blocks_to_add, + input_blk_size, output_blk_size); - offset += input_blk_sizes[i]; + + offset += (output_blk_size == input_blk_size) + ? 
input_blk_size * input_num_blocks + : input_blk_size; } } -} // namespace Concat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Concat diff --git a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu index dac55539d2..16db62a57f 100644 --- a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu @@ -1,4 +1,4 @@ -#include "device.h" +#include "internal/device.h" #include "kernels/conv_2d_kernels.h" namespace FlexFlow { @@ -313,10 +313,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *filter_ptr, float *filter_grad_ptr, float *bias_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/dropout_kernels.cu b/lib/kernels/src/cuda/ops/dropout_kernels.cu index adf0cd8e89..c5fa56bc78 100644 --- a/lib/kernels/src/cuda/ops/dropout_kernels.cu +++ b/lib/kernels/src/cuda/ops/dropout_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/dropout_kernels.h" #include "kernels/ff_handle.h" diff --git a/lib/kernels/src/cuda/ops/element_binary_kernels.cu b/lib/kernels/src/cuda/ops/element_binary_kernels.cu index 44273a323f..3a4a77b3dd 100644 --- a/lib/kernels/src/cuda/ops/element_binary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_binary_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/element_binary_kernels.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" diff --git a/lib/kernels/src/cuda/ops/element_unary_kernels.cu b/lib/kernels/src/cuda/ops/element_unary_kernels.cu index 056c80ecf6..218e74b939 100644 --- a/lib/kernels/src/cuda/ops/element_unary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_unary_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
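The reworked concat calc_blk_size above first maps the ff_dim_t axis into legion (innermost-first) order and then splits the shape there: dims below the axis form one contiguous block, dims at and above it count blocks. A standalone sketch of that split over plain extents (calc_blk_size_sketch is hypothetical; the real function works on ArrayShape):

#include <cstddef>
#include <vector>

// legion_dims is innermost-first; legion_axis is the axis as computed
// above, after its axis-0 special case.
void calc_blk_size_sketch(size_t &num_blocks, size_t &blk_size,
                          std::vector<size_t> const &legion_dims,
                          size_t legion_axis) {
  blk_size = 1;
  for (size_t d = 0; d < legion_axis; d++) {
    blk_size *= legion_dims[d]; // contiguous elements within one block
  }
  num_blocks = 1;
  for (size_t d = legion_axis; d < legion_dims.size(); d++) {
    num_blocks *= legion_dims[d]; // how many such blocks exist
  }
}

For legion_dims = {4, 3, 2} and legion_axis = 1 this yields blk_size = 4 and num_blocks = 6.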
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/element_unary_kernels.h" #include "op-attrs/get_op_type.h" @@ -290,10 +290,10 @@ struct BackwardKernel { OperatorType op_type, std::optional scalar, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { checkCUDNN(cudnnSetStream(handle.dnn, stream)); if (use_cudnn(op_type)) { @@ -356,20 +356,20 @@ void backward_kernel(ffStream_t stream, ElementUnaryPerDeviceState const &device_state, ElementUnaryAttrs const &attrs, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { DataTypeDispatch1{}(input.data_type, stream, device_state, get_op_type(attrs), attrs.scalar, handle, - input, - input_grad, output, - output_grad); + output_grad, + input, + input_grad); } } // namespace ElementUnary diff --git a/lib/kernels/src/cuda/ops/flat_kernels.cu b/lib/kernels/src/cuda/ops/flat_kernels.cu index 973d05f596..594a183ff0 100644 --- a/lib/kernels/src/cuda/ops/flat_kernels.cu +++ b/lib/kernels/src/cuda/ops/flat_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include "kernels/flat_kernels.h" @@ -35,8 +35,8 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, GenericTensorAccessorR input, - float *input_grad_ptr, - float const *output_grad_ptr) { + float const *output_grad_ptr, + float *input_grad_ptr) { float alpha = 1.0f; apply_add_with_scale diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index 31c1bac217..19e495a540 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -13,14 +13,12 @@ * limitations under the License. 
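The element-unary and flat reorderings above follow the same convention applied throughout this patch: in backward signatures, each upstream (output) gradient is passed before the input gradient it produces, and forward values precede their gradients. Schematically, and only as a convention sketch rather than any single kernel's signature:

// Convention adopted across backward kernels in this change:
//   backward_kernel(stream, [state,] output..., output_grad...,
//                   input..., input_grad..., trailing sizes)
void backward_kernel(cudaStream_t stream,
                     float const *output_grad_ptr, // upstream gradient
                     float *input_grad_ptr,        // accumulated in place
                     size_t num_elements);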
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/device.h" #include "kernels/gather_kernels.h" -namespace FlexFlow { -namespace Kernels { -namespace Gather { +namespace FlexFlow::Kernels::Gather { template __global__ void gather_forward(float const *input, @@ -125,11 +123,15 @@ void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &index, GenericTensorAccessorW const &output) { checkCUDA(get_legion_stream(&stream)); - coord_t stride = - output.shape.sub_shape(std::nullopt, add_to_legion_dim(m.legion_dim, 1)) + output.shape + .sub_shape(legion_dim_t{0_n}, add_to_legion_dim(m.legion_dim, 1)) .num_elements() .unwrap_nonnegative(); + if (m.legion_dim.value == 0_n) { + stride = 1; + } + coord_t output_dim_size = output.shape.at(m.legion_dim).unwrap_nonnegative(); coord_t input_dim_size = input.shape.at(m.legion_dim).unwrap_nonnegative(); @@ -157,9 +159,13 @@ void backward_kernel(ffStream_t stream, coord_t stride = output_grad.shape - .sub_shape(std::nullopt, add_to_legion_dim(m.legion_dim, 1)) - .get_volume() + .sub_shape(legion_dim_t{0_n}, add_to_legion_dim(m.legion_dim, 1)) + .num_elements() .unwrap_nonnegative(); + if (m.legion_dim.value == 0_n) { + stride = 1; + } + coord_t output_dim_size = output_grad.shape.at(m.legion_dim).unwrap_nonnegative(); coord_t input_dim_size = @@ -180,6 +186,4 @@ void backward_kernel(ffStream_t stream, output_dim_size); } -} // namespace Gather -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Gather diff --git a/lib/kernels/src/cuda/ops/linear_kernels.cu b/lib/kernels/src/cuda/ops/linear_kernels.cu index ca51f0d216..02bda55828 100644 --- a/lib/kernels/src/cuda/ops/linear_kernels.cu +++ b/lib/kernels/src/cuda/ops/linear_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
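The gather stride above mirrors the old computation while making the legion-dim arithmetic explicit: the stride is taken as the volume of the sub-shape up to and including the gathered dim, with an explicit stride = 1 override when that dim is the innermost. A standalone mirror over plain extents (gather_stride_sketch is hypothetical):

#include <cstddef>
#include <vector>

// legion_dims is innermost-first; mirrors the computation in
// forward_kernel/backward_kernel above, including the override.
size_t gather_stride_sketch(std::vector<size_t> const &legion_dims,
                            size_t legion_dim) {
  if (legion_dim == 0) {
    return 1;
  }
  size_t stride = 1;
  for (size_t d = 0; d < legion_dim + 1; d++) { // sub_shape(0, legion_dim + 1)
    stride *= legion_dims[d];
  }
  return stride;
}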
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/allocation.h" #include "kernels/linear_kernels.h" #include "utils/integer_conversions.h" @@ -108,10 +108,10 @@ LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, void forward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *output_ptr, - void const *weight_ptr, - void const *bias_ptr, + float const *input_ptr, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, int in_dim, int out_dim, int batch_size) { @@ -135,14 +135,14 @@ void forward_kernel(cudaStream_t stream, batch_size, in_dim, &alpha, - weight_ptr, + static_cast(weight_ptr), weight_type, in_dim, - input_ptr, + static_cast(input_ptr), input_type, in_dim, &beta, - output_ptr, + static_cast(output_ptr), output_type, out_dim, compute_type, @@ -156,14 +156,14 @@ void forward_kernel(cudaStream_t stream, batch_size, 1, &alpha, - bias_ptr, + static_cast(bias_ptr), weight_type, 1, - m.one_ptr, + static_cast(m.one_ptr), CUDA_R_32F, 1, &alpha, - output_ptr, + static_cast(output_ptr), output_type, out_dim, compute_type, @@ -174,10 +174,10 @@ void forward_kernel(cudaStream_t stream, m.actiDesc, &alpha, m.outputTensor, - output_ptr, + static_cast(output_ptr), &beta, m.outputTensor, - output_ptr)); + static_cast(output_ptr))); } else if (m.activation == Activation::GELU) { size_t elements = size_t_from_int(out_dim) * size_t_from_int(batch_size); constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) @@ -191,13 +191,13 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, - void const *output_ptr, - void *output_grad_ptr, - void const *kernel_ptr, - void *kernel_grad_ptr, - void *bias_grad_ptr, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_grad_ptr, int in_dim, int out_dim, int batch_size) { @@ -216,11 +216,17 @@ void backward_kernel(cudaStream_t stream, int output_size = out_dim * batch_size; if (m.activation.has_value()) { if (m.activation == Activation::RELU) { - relu_backward_kernel( - m.output_type, output_grad_ptr, output_ptr, output_size, stream); + relu_backward_kernel(m.output_type, + static_cast(output_grad_ptr), + static_cast(output_ptr), + output_size, + stream); } else if (m.activation == Activation::SIGMOID) { - sigmoid_backward_kernel( - m.output_type, output_grad_ptr, output_ptr, output_size, stream); + sigmoid_backward_kernel(m.output_type, + static_cast(output_grad_ptr), + static_cast(output_ptr), + output_size, + stream); } else { // TODO: only support relu and sigmoid for now assert(false && "Unsupported activation for Linear"); @@ -235,14 +241,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - input_ptr, + static_cast(input_ptr), input_type, in_dim, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - kernel_grad_ptr, + static_cast(kernel_grad_ptr), weight_type, in_dim, compute_type, @@ -261,12 +267,12 @@ void backward_kernel(cudaStream_t stream, in_dim, out_dim, &alpha, - (float *)kernel_grad_ptr, + kernel_grad_ptr, in_dim, &lambda, - (float *)kernel_ptr, + kernel_ptr, in_dim, - (float *)kernel_grad_ptr, + kernel_grad_ptr, in_dim)); } else { assert(false && "Only L2 regularization is supported"); @@ -284,14 +290,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - 
m.one_ptr, + static_cast(m.one_ptr), CUDA_R_32F, 1, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - bias_grad_ptr, + static_cast(bias_grad_ptr), weight_type, 1, compute_type, @@ -307,14 +313,14 @@ void backward_kernel(cudaStream_t stream, batch_size, out_dim, &alpha, - kernel_ptr, + static_cast(kernel_ptr), weight_type, in_dim, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - input_grad_ptr, + static_cast(input_grad_ptr), input_type, in_dim, compute_type, diff --git a/lib/kernels/src/cuda/ops/partition_kernels.cu b/lib/kernels/src/cuda/ops/partition_kernels.cu index 2831562f58..b8dfac5204 100644 --- a/lib/kernels/src/cuda/ops/partition_kernels.cu +++ b/lib/kernels/src/cuda/ops/partition_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/partition_kernels.h" @@ -40,8 +40,8 @@ template struct BackwardKernel { void operator()(cudaStream_t stream, RepartitionPerDeviceState const &m, - GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { add_kernel> <<{}( - m.data_type, stream, m, input_grad, output_grad); + m.data_type, stream, m, output_grad, input_grad); } } // namespace Repartition diff --git a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu index 51fa29d289..e8ea3f64c2 100644 --- a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/pool_2d_kernels.h" namespace FlexFlow { @@ -112,10 +112,10 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, Pool2DPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, void const *output_ptr, - void const *output_grad_ptr) { + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); diff --git a/lib/kernels/src/cuda/ops/reduce_kernels.cu b/lib/kernels/src/cuda/ops/reduce_kernels.cu index 02a89da807..563bbae21d 100644 --- a/lib/kernels/src/cuda/ops/reduce_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduce_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/reduce_kernels.h" namespace FlexFlow { diff --git a/lib/kernels/src/cuda/ops/reduction_kernels.cu b/lib/kernels/src/cuda/ops/reduction_kernels.cu index 5d95a3766a..d9c09b082d 100644 --- a/lib/kernels/src/cuda/ops/reduction_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduction_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
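The linear-kernel change above narrows the public signatures from void * to float * and pushes type erasure down to the cuBLAS boundary, which is genuinely untyped: cublasGemmEx takes void const * buffers plus a cudaDataType tag per matrix. The casts therefore sit exactly at the call. The shape of the call in forward_kernel above, with m, alpha, beta, and the *_type locals from that context (error checking elided):

// Typed float buffers, cast to void only where cuBLAS requires it.
cublasGemmEx(m.handle.blas, CUBLAS_OP_T, CUBLAS_OP_N,
             out_dim, batch_size, in_dim,
             &alpha,
             static_cast<void const *>(weight_ptr), weight_type, in_dim,
             static_cast<void const *>(input_ptr), input_type, in_dim,
             &beta,
             static_cast<void *>(output_ptr), output_type, out_dim,
             compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP);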
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/reduction_kernels.h" @@ -55,8 +55,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { checkCUDA(cudaMemcpyAsync(input.get(), output.get(), input.shape.num_elements().unwrap_nonnegative() * @@ -75,9 +75,9 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { - DataTypeDispatch1{}(input.data_type, stream, input, output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch1{}(output.data_type, stream, output, input); } } // namespace Reduction diff --git a/lib/kernels/src/cuda/ops/replicate_kernels.cu b/lib/kernels/src/cuda/ops/replicate_kernels.cu index 4706f38fd4..4685fd7a2d 100644 --- a/lib/kernels/src/cuda/ops/replicate_kernels.cu +++ b/lib/kernels/src/cuda/ops/replicate_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/replicate_kernels.h" @@ -22,8 +22,8 @@ namespace Kernels { namespace Replicate { template -__global__ void replicate_backward_kernel(T *input_ptr, - T const *output_ptr, +__global__ void replicate_backward_kernel(T const *output_ptr, + T *input_ptr, size_t num_elements, size_t num_replicas) { CUDA_KERNEL_LOOP(i, num_elements) { @@ -38,7 +38,6 @@ struct ForwardKernel { void operator()(cudaStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - checkCUDA(cudaMemcpyAsync((void *)output.get(), (void *)input.get(), input.shape.num_elements().unwrap_nonnegative() * @@ -51,15 +50,15 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas) { size_t total_elements = input.shape.num_elements().unwrap_nonnegative() * num_replicas; replicate_backward_kernel> <<>>( - input.get(), output.get(), + input.get(), input.shape.num_elements().unwrap_nonnegative(), num_replicas); } @@ -72,11 +71,11 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas) { DataTypeDispatch1{}( - input.data_type, stream, input, output, num_replicas); + input.data_type, stream, output, input, num_replicas); } } // namespace Replicate diff --git a/lib/kernels/src/cuda/ops/reshape_kernels.cu b/lib/kernels/src/cuda/ops/reshape_kernels.cu index c5a289ce6b..a6a390b38e 100644 --- a/lib/kernels/src/cuda/ops/reshape_kernels.cu +++ b/lib/kernels/src/cuda/ops/reshape_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
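Replicate's backward above reduces across replicas into a single input gradient; only the pointer order changed, not the reduction. For reference, a CPU-equivalent of replicate_backward_kernel, assuming for the sketch a replica-major layout of the copies (the real kernels derive the indexing from the accessor shape):

#include <cstddef>

// Each input element accumulates its num_replicas copies from the output.
void replicate_backward_reference(float const *output_ptr, float *input_ptr,
                                  size_t num_elements, size_t num_replicas) {
  for (size_t i = 0; i < num_elements; i++) {
    for (size_t r = 0; r < num_replicas; r++) {
      input_ptr[i] += output_ptr[i + r * num_elements];
    }
  }
}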
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/reshape_kernels.h" @@ -43,8 +43,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { float alpha = 1.0f; apply_add_with_scale> <<{}(m.data_type, stream, input, output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch1{}(m.data_type, stream, output, input); } } // namespace Reshape diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index 8391a499df..582aa02386 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -13,13 +13,11 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/reverse_kernels.h" +#include "kernels/reverse_kernels_params.h" -namespace FlexFlow { - -namespace Kernels { -namespace Reverse { +namespace FlexFlow::Kernels::Reverse { __global__ void reverse_forward_kernel(float const *in_ptr, float *out_ptr, @@ -27,23 +25,24 @@ __global__ void reverse_forward_kernel(float const *in_ptr, coord_t reverse_dim_size, coord_t in_blk_size) { CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { + coord_t out_idx = i; coord_t blk_idx = i / (reverse_dim_size * in_blk_size); i = i - blk_idx * (reverse_dim_size * in_blk_size); coord_t reverse_dim_idx = i / in_blk_size; i = i - reverse_dim_idx * in_blk_size; coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + i; - out_ptr[i] = in_ptr[in_idx]; + out_ptr[out_idx] = in_ptr[in_idx]; } } -void forward_kernel(cudaStream_t stream, - float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t output_size) { +static void forward_kernel_internal(cudaStream_t stream, + float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t output_size) { reverse_forward_kernel<< 0.0f) { - V[i] = V[i] * momentum + gt; - if (nesterov) { - gt = gt + momentum * V[i]; - } else { - gt = V[i]; - } - } - W[i] -= lr * gt; - } -} - -__host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op, - float const *w_grad_ptr, - size_t size, - int num_replicas, - float *w_ptr, - float *v_ptr) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - // Step 1: Gather gradients in the first replica - for (int i = 1; i < num_replicas; i++) { - float const *src = w_grad_ptr + i * size; - apply_add_with_scale - <<>>( - (float *)w_grad_ptr, src, size, 1.0f); - } - // checkCUDA(cudaDeviceSynchronize()); - // Step 2: SGD update - sgd_update<<>>( - size, - op->lr, - op->weight_decay, - op->momentum, - op->nesterov, - w_grad_ptr, - v_ptr, - w_ptr); - // checkCUDA(cudaDeviceSynchronize()); -} - -#ifdef FF_USE_NCCL -__host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, - PerDeviceOpState const *meta, - float const *w_grad_ptr, - size_t size, - float *w_ptr, - float *v_ptr) { - // Use NCCL to sync gradients - // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr); - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - 
diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu
index 8391a499df..582aa02386 100644
--- a/lib/kernels/src/cuda/ops/reverse_kernels.cu
+++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu
@@ -13,13 +13,11 @@
  * limitations under the License.
  */
-#include "device.h"
+#include "internal/device.h"
 #include "kernels/reverse_kernels.h"
+#include "kernels/reverse_kernels_params.h"
-namespace FlexFlow {
-
-namespace Kernels {
-namespace Reverse {
+namespace FlexFlow::Kernels::Reverse {
 __global__ void reverse_forward_kernel(float const *in_ptr,
                                        float *out_ptr,
@@ -27,23 +25,24 @@ __global__ void reverse_forward_kernel(float const *in_ptr,
                                        coord_t reverse_dim_size,
                                        coord_t in_blk_size) {
   CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) {
+    coord_t out_idx = i;
     coord_t blk_idx = i / (reverse_dim_size * in_blk_size);
     i = i - blk_idx * (reverse_dim_size * in_blk_size);
     coord_t reverse_dim_idx = i / in_blk_size;
     i = i - reverse_dim_idx * in_blk_size;
     coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) +
                      (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + i;
-    out_ptr[i] = in_ptr[in_idx];
+    out_ptr[out_idx] = in_ptr[in_idx];
   }
 }
-void forward_kernel(cudaStream_t stream,
-                    float const *in_ptr,
-                    float *out_ptr,
-                    coord_t num_out_blks,
-                    coord_t reverse_dim_size,
-                    coord_t in_blk_size,
-                    coord_t output_size) {
+static void forward_kernel_internal(cudaStream_t stream,
+                                    float const *in_ptr,
+                                    float *out_ptr,
+                                    coord_t num_out_blks,
+                                    coord_t reverse_dim_size,
+                                    coord_t in_blk_size,
+                                    coord_t output_size) {
  reverse_forward_kernel<< 0.0f) {
-      V[i] = V[i] * momentum + gt;
-      if (nesterov) {
-        gt = gt + momentum * V[i];
-      } else {
-        gt = V[i];
-      }
-    }
-    W[i] -= lr * gt;
-  }
-}
-
-__host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op,
-                                               float const *w_grad_ptr,
-                                               size_t size,
-                                               int num_replicas,
-                                               float *w_ptr,
-                                               float *v_ptr) {
-  cudaStream_t stream;
-  checkCUDA(get_legion_stream(&stream));
-  // Step 1: Gather gradients in the first replica
-  for (int i = 1; i < num_replicas; i++) {
-    float const *src = w_grad_ptr + i * size;
-    apply_add_with_scale
-        <<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
-            (float *)w_grad_ptr, src, size, 1.0f);
-  }
-  // checkCUDA(cudaDeviceSynchronize());
-  // Step 2: SGD update
-  sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
-      size,
-      op->lr,
-      op->weight_decay,
-      op->momentum,
-      op->nesterov,
-      w_grad_ptr,
-      v_ptr,
-      w_ptr);
-  // checkCUDA(cudaDeviceSynchronize());
-}
-
-#ifdef FF_USE_NCCL
-__host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op,
-                                                 PerDeviceOpState const *meta,
-                                                 float const *w_grad_ptr,
-                                                 size_t size,
-                                                 float *w_ptr,
-                                                 float *v_ptr) {
-  // Use NCCL to sync gradients
-  // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr);
-  cudaStream_t stream;
-  checkCUDA(get_legion_stream(&stream));
-  checkNCCL(ncclAllReduce(w_grad_ptr,
-                          (float *)w_grad_ptr,
-                          size,
-                          ncclFloat,
-                          ncclSum,
-                          meta->handle.ncclComm,
-                          stream));
-  // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr);
-  // print_tensor((float*)w_grad_ptr, 16, "[After ncclAllReduce]");
-
-  // Step 2: SGD update
-  sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
-      size,
-      op->lr,
-      op->weight_decay,
-      op->momentum,
-      op->nesterov,
-      w_grad_ptr,
-      v_ptr,
-      w_ptr);
-  // checkCUDA(cudaDeviceSynchronize());
-}
-#endif
-
-// ==================================================================
-// Adam Optimizer
-// ==================================================================
-__global__ void
-    add_kernel(int count, float scale, float const *src, float *dst) {
-  CUDA_KERNEL_LOOP(i, count) {
-    dst[i] += src[i] * scale;
-  }
-}
-
-__global__ void scale_kernel(int count, float a, float b, float *ptr) {
-  CUDA_KERNEL_LOOP(i, count) {
-    ptr[i] = (b - a) * ptr[i] + a;
-  }
-}
-
-__global__ void adam_update(int count,
-                            float alpha_t,
-                            float beta1,
-                            float beta2,
-                            float weight_decay,
-                            float epsilon,
-                            float const *WGrad,
-                            float *M,
-                            float *V,
-                            float *W) {
-  // Reference for weight decay
-  // https://www.fast.ai/2018/07/02/adam-weight-decay/
-  CUDA_KERNEL_LOOP(i, count) {
-    // W[i] -= weight_decay * alpha_t * W[i];
-    // float gt = WGrad[i];
-    float gt = WGrad[i] + weight_decay * W[i];
-    float mt = beta1 * M[i] + (1 - beta1) * gt;
-    float vt = beta2 * V[i] + (1 - beta2) * gt * gt;
-    M[i] = mt;
-    V[i] = vt;
-    W[i] -= alpha_t * mt / (sqrt(vt) + epsilon);
-  }
-}
-
-__host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op,
-                                                float const *w_grad_ptr,
-                                                size_t size,
-                                                int num_replicas,
-                                                float *w_ptr,
-                                                float *v_ptr,
-                                                float *m_ptr) {
-  cudaStream_t stream;
-  checkCUDA(get_legion_stream(&stream));
-  // Step 1: Gather gradients in the first replica
-  for (int i = 1; i < num_replicas; i++) {
-    float const *src = w_grad_ptr + i * size;
-    add_kernel<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
-        size, 1.0f, src, (float *)w_grad_ptr);
-  }
-  // checkCUDA(cudaDeviceSynchronize());
-  // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n",
-  //         op->alpha, op->alpha_t, op->weight_decay);
-  // Step 2: Adam update
-  adam_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
-      size,
-      op->alpha_t,
-      op->beta1,
-      op->beta2,
-      op->weight_decay,
-      op->epsilon,
-      w_grad_ptr,
-      m_ptr,
-      v_ptr,
-      w_ptr);
-  // checkCUDA(cudaDeviceSynchronize());
-}
-
-#ifdef FF_USE_NCCL
-__host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op,
-                                                  PerDeviceOpState const *meta,
-                                                  float const *w_grad_ptr,
-                                                  size_t size,
-                                                  float *w_ptr,
-                                                  float *v_ptr,
-                                                  float *m_ptr) {
-  // Use NCCL to sync gradients
-  cudaStream_t stream;
-  checkCUDA(get_legion_stream(&stream));
-  checkNCCL(ncclAllReduce(w_grad_ptr,
-                          (float *)w_grad_ptr,
-                          size,
-                          ncclFloat,
-                          ncclSum,
-                          meta->handle.ncclComm,
-                          stream));
-  // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n",
-  //         op->alpha, op->alpha_t, op->weight_decay);
-  // Step 2: Adam update
-  adam_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
-      size,
-      op->alpha_t,
-      op->beta1,
-      op->beta2,
-      op->weight_decay,
-      op->epsilon,
-      w_grad_ptr,
-      m_ptr,
-      v_ptr,
-      w_ptr);
-  // checkCUDA(cudaDeviceSynchronize());
-}
-#endif
-
-} // namespace FlexFlow
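Note the API change here: the deleted file exposed the update steps as static members of SGDOptimizer/AdamOptimizer that read hyperparameters from op-> and fetched the stream from Legion, while the replacement below takes the stream, the hyperparameters, and (for the NCCL variants) the PerDeviceFFHandle as explicit arguments. A hypothetical call site for the new parameter-server SGD entry point (the surrounding variable names are illustrative, the signature is from the new file below):

// sketch only: weight_ptr, weight_grad_ptr, and sgd_v_ptr come from the
// caller's training state
sgd_ps_update_task_gpu(stream,
                       /*lr=*/0.01f,
                       /*momentum=*/0.9f,
                       /*nesterov=*/false,
                       /*weight_decay=*/1e-4f,
                       weight_grad_ptr,
                       size,
                       num_replicas,
                       weight_ptr,
                       sgd_v_ptr);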
diff --git a/lib/kernels/src/cuda/optimizer_kernels.cu b/lib/kernels/src/cuda/optimizer_kernels.cu
new file mode 100644
index 0000000000..fe817876ce
--- /dev/null
+++ b/lib/kernels/src/cuda/optimizer_kernels.cu
@@ -0,0 +1,205 @@
+/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "internal/device.h"
+#include "kernels/nccl.h"
+#include "kernels/optimizer_kernels.h"
+#include "utils/exception.h"
+
+namespace FlexFlow {
+
+__global__ void sgd_update(size_t count,
+                           float lr,
+                           float weight_decay,
+                           float momentum,
+                           bool nesterov,
+                           float const *WGrad,
+                           float *V,
+                           float *W) {
+  // Reference: https://pytorch.org/docs/stable/_modules/torch/optim/sgd.html#SGD
+  CUDA_KERNEL_LOOP(i, count) {
+    float gt = WGrad[i] + weight_decay * W[i];
+    if (momentum > 0.0f) {
+      V[i] = V[i] * momentum + gt;
+      if (nesterov) {
+        gt = gt + momentum * V[i];
+      } else {
+        gt = V[i];
+      }
+    }
+    W[i] -= lr * gt;
+  }
+}
+
+__host__ void sgd_ps_update_task_gpu(ffStream_t stream,
+                                     float lr,
+                                     float momentum,
+                                     bool nesterov,
+                                     float weight_decay,
+                                     float const *weight_grad_ptr,
+                                     size_t size,
+                                     int num_replicas,
+                                     float *weight_ptr,
+                                     float *sgd_v_ptr) {
+  // Step 1: Gather gradients in the first replica
+  for (int i = 1; i < num_replicas; i++) {
+    float const *src = weight_grad_ptr + i * size;
+    apply_add_with_scale
+        <<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
+            (float *)weight_grad_ptr, src, size, 1.0f);
+  }
+
+  // Step 2: SGD update
+  sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(size,
+                                                                lr,
+                                                                weight_decay,
+                                                                momentum,
+                                                                nesterov,
+                                                                weight_grad_ptr,
+                                                                sgd_v_ptr,
+                                                                weight_ptr);
+}
+
+#ifdef FF_USE_NCCL
+__host__ void sgd_nccl_update_task_gpu(ffStream_t stream,
+                                       float lr,
+                                       float momentum,
+                                       bool nesterov,
+                                       float weight_decay,
+                                       PerDeviceFFHandle const &handle,
+                                       float const *w_grad_ptr,
+                                       size_t size,
+                                       float *w_ptr,
+                                       float *v_ptr) {
+  // Step 1: Use NCCL to sync gradients
+  ncclComm_t comm = handle.ncclComm;
+  checkNCCL(ncclAllReduce(
+      w_grad_ptr, (float *)w_grad_ptr, size, ncclFloat, ncclSum, comm, stream));
+
+  // Step 2: SGD update
+  sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
+      size, lr, weight_decay, momentum, nesterov, w_grad_ptr, v_ptr, w_ptr);
+}
+#endif
+
+// ==================================================================
+// Adam Optimizer
+// ==================================================================
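+// Note: the Adam entry points below take alpha_t rather than a raw learning
+// rate; the caller is assumed to fold in the standard per-step bias
+// correction, i.e. alpha_t = alpha * sqrt(1 - beta2^t) / (1 - beta1^t) at
+// step t (the weight-decay handling follows the fast.ai reference cited in
+// adam_update).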
+__global__ void
+    add_kernel(int count, float scale, float const *src, float *dst) {
+  CUDA_KERNEL_LOOP(i, count) {
+    dst[i] += src[i] * scale;
+  }
+}
+
+__global__ void scale_kernel(int count, float a, float b, float *ptr) {
+  CUDA_KERNEL_LOOP(i, count) {
+    ptr[i] = (b - a) * ptr[i] + a;
+  }
+}
+
+__global__ void adam_update(int count,
+                            float alpha_t,
+                            float beta1,
+                            float beta2,
+                            float weight_decay,
+                            float epsilon,
+                            float const *WGrad,
+                            float *M,
+                            float *V,
+                            float *W) {
+  // Reference for weight decay
+  // https://www.fast.ai/2018/07/02/adam-weight-decay/
+  CUDA_KERNEL_LOOP(i, count) {
+    // W[i] -= weight_decay * alpha_t * W[i];
+    // float gt = WGrad[i];
+    float gt = WGrad[i] + weight_decay * W[i];
+    float mt = beta1 * M[i] + (1 - beta1) * gt;
+    float vt = beta2 * V[i] + (1 - beta2) * gt * gt;
+    M[i] = mt;
+    V[i] = vt;
+    W[i] -= alpha_t * mt / (sqrt(vt) + epsilon);
+  }
+}
+
+__host__ void adam_ps_update_task_gpu(ffStream_t stream,
+                                      float alpha_t,
+                                      float beta1,
+                                      float beta2,
+                                      float weight_decay,
+                                      float epsilon,
+                                      float const *w_grad_ptr,
+                                      size_t size,
+                                      int num_replicas,
+                                      float *w_ptr,
+                                      float *v_ptr,
+                                      float *m_ptr) {
+  // Step 1: Gather gradients in the first replica
+  for (int i = 1; i < num_replicas; i++) {
+    float const *src = w_grad_ptr + i * size;
+    add_kernel<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
+        size, 1.0f, src, (float *)w_grad_ptr);
+  }
+
+  // Step 2: Adam update
+  adam_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(size,
+                                                                 alpha_t,
+                                                                 beta1,
+                                                                 beta2,
+                                                                 weight_decay,
+                                                                 epsilon,
+                                                                 w_grad_ptr,
+                                                                 m_ptr,
+                                                                 v_ptr,
+                                                                 w_ptr);
+}
+
+#ifdef FF_USE_NCCL
+__host__ void adam_nccl_update_task_gpu(ffStream_t stream,
+                                        float alpha_t,
+                                        float beta1,
+                                        float beta2,
+                                        float weight_decay,
+                                        float epsilon,
+                                        PerDeviceFFHandle const &handle,
+                                        float const *w_grad_ptr,
+                                        size_t size,
+                                        float *w_ptr,
+                                        float *v_ptr,
+                                        float *m_ptr) {
+  // Step 1: Use NCCL to sync gradients
+  checkNCCL(ncclAllReduce(w_grad_ptr,
+                          (float *)w_grad_ptr,
+                          size,
+                          ncclFloat,
+                          ncclSum,
+                          handle.ncclComm,
+                          stream));
+
+  // Step 2: Adam update
+  adam_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(size,
+                                                                 alpha_t,
+                                                                 beta1,
+                                                                 beta2,
+                                                                 weight_decay,
+                                                                 epsilon,
+                                                                 w_grad_ptr,
+                                                                 m_ptr,
+                                                                 v_ptr,
+                                                                 w_ptr);
+}
+#endif
+
+} // namespace FlexFlow
diff --git a/lib/kernels/src/hip/embedding_kernels.cpp b/lib/kernels/src/hip/embedding_kernels.cpp
index 7ca3149f2f..aefe53cc46 100644
--- a/lib/kernels/src/hip/embedding_kernels.cpp
+++ b/lib/kernels/src/hip/embedding_kernels.cpp
@@ -14,7 +14,7 @@
  */
 #include "kernels/embedding_kernels.h"
-#include "device.h"
+#include "internal/device.h"
 #include "kernels/datatype_dispatch.h"
 #include <hip/hip_runtime.h>
@@ -364,8 +364,8 @@ struct ForwardKernel {
            weight.data_type == DataType::FLOAT ||
            weight.data_type == DataType::DOUBLE);
-    if (aggr == AggregateOp::NONE) {
-      hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_no_aggr<TI, TD>),
+    if (aggr == AggregateOp::AVG || aggr == AggregateOp::SUM) {
+      hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_with_aggr<TI, TD>),
                          GET_BLOCKS(output.shape.get_volume()),
                          CUDA_NUM_THREADS,
                          0,
@@ -374,10 +374,11 @@
+                         output.get<TD>(),
+                         weight.get<TD>(),
                          out_dim,
-                         batch_size);
+                         in_dim,
+                         batch_size,
+                         aggr);
     } else {
-      assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM);
-      hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_with_aggr<TI, TD>),
+      hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_no_aggr<TI, TD>),
                          GET_BLOCKS(output.shape.get_volume()),
                          CUDA_NUM_THREADS,
                          0,
@@ -386,9 +387,7 @@
+                         output.get<TD>(),
+                         weight.get<TD>(),
                          out_dim,
-                         in_dim,
-                         batch_size,
-                         aggr);
+                         batch_size);
     }
   }
 }
@@ -408,8 +407,9 @@ struct BackwardKernel {
     assert(output.data_type == DataType::HALF ||
            output.data_type == DataType::FLOAT ||
            output.data_type == DataType::DOUBLE);
-    if (aggr == AggregateOp::NONE) {
-      hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_no_aggr<TI, TD>),
+
+    if (aggr == AggregateOp::AVG || aggr == AggregateOp::SUM) {
+      hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_with_aggr<TI, TD>),
                          GET_BLOCKS(output.shape.get_volume()),
                          CUDA_NUM_THREADS,
                          0,
@@ -418,9 +418,11 @@
+                         output.get<TD>(),
+                         weight_grad.get<TD>(),
                          out_dim,
-                         batch_size);
+                         in_dim,
+                         batch_size,
+                         aggr);
     } else {
-      hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_with_aggr<TI, TD>),
+      hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_no_aggr<TI, TD>),
                          GET_BLOCKS(output.shape.get_volume()),
                          CUDA_NUM_THREADS,
                          0,
@@ -429,9 +431,7 @@
+                         output.get<TD>(),
+                         weight_grad.get<TD>(),
                          out_dim,
-                         in_dim,
-                         batch_size,
-                         aggr);
+                         batch_size);
     }
   }
 }
diff --git a/lib/kernels/src/hip/loss_function_kernels.cpp b/lib/kernels/src/hip/loss_function_kernels.cpp
index e82b5c96d5..05068f1bd0 100644
--- a/lib/kernels/src/hip/loss_function_kernels.cpp
+++ b/lib/kernels/src/hip/loss_function_kernels.cpp
@@ -14,7 +14,7 @@
  */
 #include "kernels/loss_function_kernels.h"
-#include "device.h"
+#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/attention_kernels.cpp b/lib/kernels/src/hip/ops/attention_kernels.cpp index 005cef30d1..b374ead305 100644 --- a/lib/kernels/src/hip/ops/attention_kernels.cpp +++ b/lib/kernels/src/hip/ops/attention_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/attention_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/batch_matmul_kernels.cpp b/lib/kernels/src/hip/ops/batch_matmul_kernels.cpp index c4b3be823f..6d9ae8a268 100644 --- a/lib/kernels/src/hip/ops/batch_matmul_kernels.cpp +++ b/lib/kernels/src/hip/ops/batch_matmul_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/batch_matmul_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/batch_norm_kernels.cpp b/lib/kernels/src/hip/ops/batch_norm_kernels.cpp index 8e94b462cd..764a3e0b58 100644 --- a/lib/kernels/src/hip/ops/batch_norm_kernels.cpp +++ b/lib/kernels/src/hip/ops/batch_norm_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/batch_norm_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/allocation.h" #include "kernels/ff_handle.h" #include diff --git a/lib/kernels/src/hip/ops/cast_kernels.cpp b/lib/kernels/src/hip/ops/cast_kernels.cpp index fa0c37ffa1..1035657c04 100644 --- a/lib/kernels/src/hip/ops/cast_kernels.cpp +++ b/lib/kernels/src/hip/ops/cast_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/cast_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/combine_kernels.cpp b/lib/kernels/src/hip/ops/combine_kernels.cpp index aa01f02276..f1e0422747 100644 --- a/lib/kernels/src/hip/ops/combine_kernels.cpp +++ b/lib/kernels/src/hip/ops/combine_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/combine_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/concat_kernels.cpp b/lib/kernels/src/hip/ops/concat_kernels.cpp index aa38be739b..a215d67942 100644 --- a/lib/kernels/src/hip/ops/concat_kernels.cpp +++ b/lib/kernels/src/hip/ops/concat_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/concat_kernels.h" -#include "device.h" +#include "internal/device.h" #include #include diff --git a/lib/kernels/src/hip/ops/conv_2d_kernels.h b/lib/kernels/src/hip/ops/conv_2d_kernels.h index bcf015d561..76a73ab08c 100644 --- a/lib/kernels/src/hip/ops/conv_2d_kernels.h +++ b/lib/kernels/src/hip/ops/conv_2d_kernels.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_HIP_CONV_2D_KERNELS_H #define _FLEXFLOW_KERNELS_HIP_CONV_2D_KERNELS_H -#include "device.h" +#include "kernels/device.h" namespace FlexFlow { namespace Kernels { diff --git a/lib/kernels/src/hip/ops/dropout_kernels.cpp b/lib/kernels/src/hip/ops/dropout_kernels.cpp index baaf8e6902..d85c0ae054 100644 --- a/lib/kernels/src/hip/ops/dropout_kernels.cpp +++ b/lib/kernels/src/hip/ops/dropout_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/dropout_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/ff_handle.h" #include diff --git a/lib/kernels/src/hip/ops/element_binary_kernels.cpp b/lib/kernels/src/hip/ops/element_binary_kernels.cpp index bc66bbff2f..9e0452b09b 100644 --- a/lib/kernels/src/hip/ops/element_binary_kernels.cpp +++ b/lib/kernels/src/hip/ops/element_binary_kernels.cpp @@ -14,7 
+14,7 @@ */ #include "kernels/element_binary_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" #include "op-attrs/operator_type.dtg.h" diff --git a/lib/kernels/src/hip/ops/element_unary_kernels.cpp b/lib/kernels/src/hip/ops/element_unary_kernels.cpp index f4b0ccb82d..163f13a6da 100644 --- a/lib/kernels/src/hip/ops/element_unary_kernels.cpp +++ b/lib/kernels/src/hip/ops/element_unary_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/element_unary_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "op-attrs/get_op_type.h" #include diff --git a/lib/kernels/src/hip/ops/flat_kernels.cpp b/lib/kernels/src/hip/ops/flat_kernels.cpp index 763fb9e322..dedfb4b9a9 100644 --- a/lib/kernels/src/hip/ops/flat_kernels.cpp +++ b/lib/kernels/src/hip/ops/flat_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/flat_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include diff --git a/lib/kernels/src/hip/ops/gather_kernels.cpp b/lib/kernels/src/hip/ops/gather_kernels.cpp index 17c0014e98..6e9e4c6a2c 100644 --- a/lib/kernels/src/hip/ops/gather_kernels.cpp +++ b/lib/kernels/src/hip/ops/gather_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/gather_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/partition_kernels.cpp b/lib/kernels/src/hip/ops/partition_kernels.cpp index 4591247faa..26748a7e45 100644 --- a/lib/kernels/src/hip/ops/partition_kernels.cpp +++ b/lib/kernels/src/hip/ops/partition_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/partition_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/pool_2d_kernels.cpp b/lib/kernels/src/hip/ops/pool_2d_kernels.cpp index ed942c105c..7e5ae2ab80 100644 --- a/lib/kernels/src/hip/ops/pool_2d_kernels.cpp +++ b/lib/kernels/src/hip/ops/pool_2d_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/pool_2d_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/reduce_kernels.cpp b/lib/kernels/src/hip/ops/reduce_kernels.cpp index 468543dd5b..c0bcc84d48 100644 --- a/lib/kernels/src/hip/ops/reduce_kernels.cpp +++ b/lib/kernels/src/hip/ops/reduce_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/reduce_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/replicate_kernels.cpp b/lib/kernels/src/hip/ops/replicate_kernels.cpp index 8d27bb1908..ee7bf701c0 100644 --- a/lib/kernels/src/hip/ops/replicate_kernels.cpp +++ b/lib/kernels/src/hip/ops/replicate_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/replicate_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/reshape_kernels.cpp b/lib/kernels/src/hip/ops/reshape_kernels.cpp index 47978a5f4a..810b929e24 100644 --- a/lib/kernels/src/hip/ops/reshape_kernels.cpp +++ b/lib/kernels/src/hip/ops/reshape_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/reshape_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/reverse_kernels.cpp b/lib/kernels/src/hip/ops/reverse_kernels.cpp index 03e97245bf..a56ff3540a 100644 --- 
a/lib/kernels/src/hip/ops/reverse_kernels.cpp +++ b/lib/kernels/src/hip/ops/reverse_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/reverse_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/softmax_kernels.cpp b/lib/kernels/src/hip/ops/softmax_kernels.cpp index 3a8f2813b7..610675850b 100644 --- a/lib/kernels/src/hip/ops/softmax_kernels.cpp +++ b/lib/kernels/src/hip/ops/softmax_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/softmax_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/split_kernels.cpp b/lib/kernels/src/hip/ops/split_kernels.cpp index 5599ae6d6f..3034b633a6 100644 --- a/lib/kernels/src/hip/ops/split_kernels.cpp +++ b/lib/kernels/src/hip/ops/split_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/split_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/topk_kernels.cpp b/lib/kernels/src/hip/ops/topk_kernels.cpp index f085c5831f..777d9edffa 100644 --- a/lib/kernels/src/hip/ops/topk_kernels.cpp +++ b/lib/kernels/src/hip/ops/topk_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/topk_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/transpose_kernels.cpp b/lib/kernels/src/hip/ops/transpose_kernels.cpp index ef9dd58c63..c5122f34bf 100644 --- a/lib/kernels/src/hip/ops/transpose_kernels.cpp +++ b/lib/kernels/src/hip/ops/transpose_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/transpose_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include "utils/exception.h" #include diff --git a/lib/kernels/src/device.cc b/lib/kernels/src/internal/device.cc similarity index 97% rename from lib/kernels/src/device.cc rename to lib/kernels/src/internal/device.cc index f46099c79a..eb3d229c2a 100644 --- a/lib/kernels/src/device.cc +++ b/lib/kernels/src/internal/device.cc @@ -1,4 +1,4 @@ -#include "device.h" +#include "internal/device.h" namespace FlexFlow { diff --git a/lib/kernels/src/device.h b/lib/kernels/src/internal/device.h similarity index 98% rename from lib/kernels/src/device.h rename to lib/kernels/src/internal/device.h index ceff2f92ff..226c7ad174 100644 --- a/lib/kernels/src/device.h +++ b/lib/kernels/src/internal/device.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_KERNELS_SRC_DEVICE_H -#define _FLEXFLOW_KERNELS_SRC_DEVICE_H +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_INTERNAL_DEVICE_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_INTERNAL_DEVICE_H #include "kernels/array_shape.h" #include "kernels/device.h" diff --git a/lib/kernels/src/kernels/accessor.cc b/lib/kernels/src/kernels/accessor.cc new file mode 100644 index 0000000000..b5042f77a0 --- /dev/null +++ b/lib/kernels/src/kernels/accessor.cc @@ -0,0 +1,249 @@ +#include "kernels/accessor.h" +#include "kernels/allocation.h" +#include "kernels/datatype_dispatch.h" +#include "utils/containers/reversed.h" +#include "utils/containers/vector_of.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include + +namespace FlexFlow { + +nonnegative_int + calculate_accessor_offset(LegionOrdered const &indices, + ArrayShape const &shape) { + ASSERT(indices.size() == shape.num_dims(), + "Number of indices does not match the number of dimensions"); + + nonnegative_int offset = 0_n; + nonnegative_int multiplier = 1_n; + + for (legion_dim_t dim : reversed(vector_of(key_range(shape.dims)))) { + 
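+    // multiplier accumulates a mixed-radix stride: the first dim visited
+    // gets stride 1, and each subsequent dim's stride is the product of the
+    // sizes of all dims visited before it. E.g. with sizes (a, b, c) visited
+    // in that order, offset = i + a * j + (a * b) * k for indices (i, j, k).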
ASSERT(indices.at(dim) < shape.at(legion_dim_t{dim}),
+           "Out of bounds access",
+           dim);
+
+    offset += indices.at(dim) * multiplier;
+    multiplier *= shape.at(legion_dim_t{dim});
+  }
+
+  return offset;
+}
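+
+// copy_accessor_data_to_l_from_r (below) picks the cudaMemcpyKind from the
+// two accessors' device types; a typical (illustrative) use is staging a GPU
+// tensor on the host for comparison:
+//   GenericTensorAccessorW host_copy = cpu_allocator.allocate_tensor(shape);
+//   copy_accessor_data_to_l_from_r(host_copy, gpu_accessor);  // GPU -> CPU
+// The sketch assumes dst and src already agree in shape and data type, which
+// the byte count computed from dst relies on.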
+
+void copy_accessor_data_to_l_from_r(
+    GenericTensorAccessorW &dst_accessor,
+    GenericTensorAccessorR const &src_accessor) {
+  size_t num_bytes =
+      dst_accessor.shape.get_volume().unwrap_nonnegative() *
+      size_of_datatype(dst_accessor.data_type).unwrap_nonnegative();
+
+  DeviceType dst_device_type = dst_accessor.device_type;
+  DeviceType src_device_type = src_accessor.device_type;
+
+  if (src_device_type == DeviceType::CPU &&
+      dst_device_type == DeviceType::CPU) {
+    memcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes);
+  } else if (src_device_type == DeviceType::CPU &&
+             dst_device_type == DeviceType::GPU) {
+    checkCUDA(cudaMemcpy(
+        dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyHostToDevice));
+  } else if (src_device_type == DeviceType::GPU &&
+             dst_device_type == DeviceType::CPU) {
+    checkCUDA(cudaMemcpy(
+        dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost));
+  } else {
+    assert(src_device_type == DeviceType::GPU);
+    assert(dst_device_type == DeviceType::GPU);
+    checkCUDA(cudaMemcpy(dst_accessor.ptr,
+                         src_accessor.ptr,
+                         num_bytes,
+                         cudaMemcpyDeviceToDevice));
+  }
+}
+
+GenericTensorAccessorW::operator GenericTensorAccessorR() const {
+  return read_only_accessor_from_write_accessor(*this);
+}
+
+GenericTensorAccessorW::GenericTensorAccessorW(
+    DataType data_type,
+    ArrayShape const &shape,
+    void *ptr,
+    DeviceType device_type = DeviceType::GPU)
+    : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {}
+
+std::tuple<DataType const &, ArrayShape const &, void *const &, DeviceType const &>
+    GenericTensorAccessorW::tie() const {
+  return std::tie(this->data_type, this->shape, this->ptr, this->device_type);
+}
+
+bool GenericTensorAccessorW::operator==(
+    GenericTensorAccessorW const &other) const {
+  return this->tie() == other.tie();
+}
+
+bool GenericTensorAccessorW::operator!=(
+    GenericTensorAccessorW const &other) const {
+  return this->tie() != other.tie();
+}
+
+int32_t *GenericTensorAccessorW::get_int32_ptr() const {
+  return this->get<DataType::INT32>();
+}
+
+int64_t *GenericTensorAccessorW::get_int64_ptr() const {
+  return this->get<DataType::INT64>();
+}
+
+float *GenericTensorAccessorW::get_float_ptr() const {
+  return this->get<DataType::FLOAT>();
+}
+
+double *GenericTensorAccessorW::get_double_ptr() const {
+  return this->get<DataType::DOUBLE>();
+}
+
+half *GenericTensorAccessorW::get_half_ptr() const {
+  return this->get<DataType::HALF>();
+}
+
+std::string format_as(GenericTensorAccessorW const &a) {
+  return fmt::format("<GenericTensorAccessorW data_type={} shape={} ptr={}>",
+                     a.data_type,
+                     a.shape,
+                     a.ptr);
+}
+
+std::ostream &operator<<(std::ostream &s, GenericTensorAccessorW const &a) {
+  return (s << fmt::to_string(a));
+}
+
+GenericTensorAccessorR::GenericTensorAccessorR(
+    DataType data_type,
+    ArrayShape const &shape,
+    void const *ptr,
+    DeviceType device_type = DeviceType::GPU)
+    : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {}
+
+std::tuple<DataType const &, ArrayShape const &, void const *const &, DeviceType const &>
+    GenericTensorAccessorR::tie() const {
+  return std::tie(this->data_type, this->shape, this->ptr, this->device_type);
+}
+
+bool GenericTensorAccessorR::operator==(
+    GenericTensorAccessorR const &other) const {
+  return this->tie() == other.tie();
+}
+
+bool GenericTensorAccessorR::operator!=(
+    GenericTensorAccessorR const &other) const {
+  return this->tie() != other.tie();
+}
+
+int32_t const *GenericTensorAccessorR::get_int32_ptr() const {
+  return this->get<DataType::INT32>();
+}
+
+int64_t const *GenericTensorAccessorR::get_int64_ptr() const {
+  return this->get<DataType::INT64>();
+}
+
+float const *GenericTensorAccessorR::get_float_ptr() const {
+  return this->get<DataType::FLOAT>();
+}
+
+double const *GenericTensorAccessorR::get_double_ptr() const {
+  return this->get<DataType::DOUBLE>();
+}
+
+half const *GenericTensorAccessorR::get_half_ptr() const {
+  return this->get<DataType::HALF>();
+}
+
+std::string format_as(GenericTensorAccessorR const &a) {
+  return fmt::format("<GenericTensorAccessorR data_type={} shape={} ptr={}>",
+                     a.data_type,
+                     a.shape,
+                     a.ptr);
+}
+
+std::ostream &operator<<(std::ostream &s, GenericTensorAccessorR const &a) {
+  return (s << fmt::to_string(a));
+}
+
+int32_t const *get_int32_ptr(GenericTensorAccessorR const &a) {
+  return get<DataType::INT32>(a);
+}
+
+int64_t const *get_int64_ptr(GenericTensorAccessorR const &a) {
+  return get<DataType::INT64>(a);
+}
+
+float const *get_float_ptr(GenericTensorAccessorR const &a) {
+  return get<DataType::FLOAT>(a);
+}
+
+double const *get_double_ptr(GenericTensorAccessorR const &a) {
+  return get<DataType::DOUBLE>(a);
+}
+
+half const *get_half_ptr(GenericTensorAccessorR const &a) {
+  return get<DataType::HALF>(a);
+}
+
+std::vector<int32_t const *>
+    get_int32_ptrs(std::vector<GenericTensorAccessorR> const &a) {
+  return get<DataType::INT32>(a);
+}
+
+std::vector<int64_t const *>
+    get_int64_ptrs(std::vector<GenericTensorAccessorR> const &a) {
+  return get<DataType::INT64>(a);
+}
+
+std::vector<float const *>
+    get_float_ptrs(std::vector<GenericTensorAccessorR> const &a) {
+  return get<DataType::FLOAT>(a);
+}
+
+std::vector<double const *>
+    get_double_ptrs(std::vector<GenericTensorAccessorR> const &a) {
+  return get<DataType::DOUBLE>(a);
+}
+
+std::vector<half const *>
+    get_half_ptrs(std::vector<GenericTensorAccessorR> const &a) {
+  return get<DataType::HALF>(a);
+}
+
+GenericTensorAccessorR read_only_accessor_from_write_accessor(
+    GenericTensorAccessorW const &writable) {
+  return GenericTensorAccessorR{writable.data_type,
+                                writable.shape,
+                                req(writable.ptr),
+                                writable.device_type};
+}
+
+bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1,
+                              GenericTensorAccessorR const &acc2) {
+  return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type;
+}
+
+bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor,
+                             ArrayShape const &expected_shape,
+                             DataType const &expected_dtype) {
+  return accessor.shape == expected_shape &&
+         accessor.data_type == expected_dtype;
+}
+
+std::pair<ArrayShape, DataType>
+    get_shape_and_datatype(GenericTensorAccessorR const &accessor) {
+  return std::make_pair(accessor.shape, accessor.data_type);
+}
+
+} // namespace FlexFlow
diff --git a/lib/kernels/src/kernels/allocation.cc b/lib/kernels/src/kernels/allocation.cc
new file mode 100644
index 0000000000..b9f253bcff
--- /dev/null
+++ b/lib/kernels/src/kernels/allocation.cc
@@ -0,0 +1,38 @@
+#include "kernels/allocation.h"
+#include "op-attrs/tensor_shape.h"
+
+namespace FlexFlow {
+
+void *Allocator::allocate(size_t mem_size) {
+  return this->i_allocator->allocate(mem_size);
+}
+
+void Allocator::deallocate(void *ptr) {
+  this->i_allocator->deallocate(ptr);
+}
+
+DeviceType Allocator::get_allocation_device_type() const {
+  return this->i_allocator->get_allocation_device_type();
+}
+
+GenericTensorAccessorW
+    Allocator::allocate_tensor(TensorShape const &tensor_shape) {
+  void *ptr =
+      this->allocate(get_size_in_bytes(tensor_shape).unwrap_nonnegative());
+  return GenericTensorAccessorW{
+      tensor_shape.data_type,
+      array_shape_from_tensor_shape(tensor_shape),
+      ptr,
+      this->get_allocation_device_type(),
+  };
+}
+
+void Allocator::deallocate_tensor(GenericTensorAccessorW const &t) {
+  this->deallocate(t.ptr);
+}
+
+void Allocator::deallocate_tensor(GenericTensorAccessorR const &t) {
+  this->deallocate(const_cast<void *>(t.ptr));
+}
+
+} // namespace FlexFlow
diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/kernels/array_shape.cc
similarity index 51%
rename from lib/kernels/src/array_shape.cc
rename to lib/kernels/src/kernels/array_shape.cc
index
243185ada4..34a53c1bb3 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/kernels/array_shape.cc @@ -1,23 +1,20 @@ #include "kernels/array_shape.h" +#include "kernels/legion_ordered/slice.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" +#include "op-attrs/ff_ordered/slice.h" +#include "utils/containers/cartesian_product.h" #include "utils/containers/product.h" #include "utils/containers/reversed.h" +#include "utils/containers/transform.h" +#include "utils/containers/unordered_set_of.h" #include "utils/containers/vector_of.h" +#include "utils/hash/tuple.h" +#include "utils/hash/vector.h" #include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { -static LegionOrdered - legion_dims_from_ff_dims(FFOrdered const &ff_ordered) { - return LegionOrdered{reversed(vector_of(ff_ordered))}; -} - -ArrayShape::ArrayShape(nonnegative_int *_dims, nonnegative_int num_dims) - : dims(_dims, _dims + num_dims.unwrap_nonnegative()) {} - -ArrayShape::ArrayShape(TensorShape const &shape) - : dims(legion_dims_from_ff_dims(shape.dims.ff_ordered)) {} - -ArrayShape::ArrayShape(std::vector const &input_dims) +ArrayShape::ArrayShape(LegionOrdered const &input_dims) : dims(input_dims) {} nonnegative_int ArrayShape::get_volume() const { @@ -59,10 +56,19 @@ bool ArrayShape::operator!=(ArrayShape const &other) const { return this->tie() != other.tie(); } -ArrayShape ArrayShape::sub_shape( - std::optional> start, - std::optional> end) const { - NOT_IMPLEMENTED(); +ArrayShape + ArrayShape::sub_shape(ff_dim_t const &start, + std::optional const &maybe_end) const { + FFOrdered ff_ordered_dims = + ff_ordered_from_legion_ordered(this->dims); + FFOrdered sliced = slice(ff_ordered_dims, start, maybe_end); + return ArrayShape{legion_ordered_from_ff_ordered(sliced)}; +} + +ArrayShape + ArrayShape::sub_shape(legion_dim_t const &start, + std::optional const &maybe_end) const { + return ArrayShape{slice(this->dims, start, maybe_end)}; } std::optional ArrayShape::at_maybe(legion_dim_t index) const { @@ -81,15 +87,6 @@ std::tuple const &> ArrayShape::tie() const { return std::tie(this->dims); } -nonnegative_int get_volume(ArrayShape const &shape) { - return shape.get_volume(); -} - -TensorShape get_tensor_shape(ArrayShape const &shape, DataType dtype) { - return TensorShape{TensorDims{ff_ordered_from_legion_ordered(shape.dims)}, - dtype}; -} - std::string format_as(ArrayShape const &x) { std::ostringstream oss; oss << " get_array_coord_set(ArrayShape const &shape) { + std::vector> per_dim_ranges = + transform(vector_of(ff_ordered_from_legion_ordered(shape.dims)), + [](nonnegative_int dim_size) -> std::vector { + return nonnegative_range(dim_size); + }); + + std::unordered_set> raw_points = + unordered_set_of(cartesian_product(per_dim_ranges)); + + return transform(raw_points, + [](std::vector const &raw_point) { + return ArrayCoord{ff_ordered_of(raw_point)}; + }); +} + } // namespace FlexFlow + +namespace std { + +using namespace FlexFlow; + +size_t hash::operator()(ArrayShape const &s) const { + return get_std_hash(s.tie()); +} + +} // namespace std diff --git a/lib/kernels/src/kernels/copy_tensor_accessor.cc b/lib/kernels/src/kernels/copy_tensor_accessor.cc new file mode 100644 index 0000000000..d8619d8ce6 --- /dev/null +++ b/lib/kernels/src/kernels/copy_tensor_accessor.cc @@ -0,0 +1,66 @@ +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow { + +template +struct CopyTensorAccessorW { + GenericTensorAccessorW operator()(GenericTensorAccessorW const 
&src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return dst_accessor; + } +}; + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, allocator); +} + +template +struct CopyTensorAccessorR { + GenericTensorAccessorR operator()(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return read_only_accessor_from_write_accessor(dst_accessor); + } +}; + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, allocator); +} + +GenericTensorAccessorR copy_tensor_accessor_r_to_cpu_if_necessary( + GenericTensorAccessorR const &accessor, Allocator &cpu_allocator) { + if (accessor.device_type == DeviceType::GPU) { + return copy_tensor_accessor_r(accessor, cpu_allocator); + } else { + return accessor; + } +} + +GenericTensorAccessorW copy_tensor_accessor_w_to_cpu_if_necessary( + GenericTensorAccessorW const &accessor, Allocator &cpu_allocator) { + if (accessor.device_type == DeviceType::GPU) { + return copy_tensor_accessor_w(accessor, cpu_allocator); + } else { + return accessor; + } +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/format_accessor_contents.cc b/lib/kernels/src/kernels/format_accessor_contents.cc new file mode 100644 index 0000000000..1b8ab35d89 --- /dev/null +++ b/lib/kernels/src/kernels/format_accessor_contents.cc @@ -0,0 +1,184 @@ +#include "kernels/format_accessor_contents.h" +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" +#include "kernels/local_cpu_allocator.h" +#include "utils/indent.h" +#include + +namespace FlexFlow { + +template +struct Print1DCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor, + std::ostream &stream) { + ASSERT(accessor.device_type == DeviceType::CPU); + nonnegative_int dims = accessor.shape.num_dims(); + ASSERT(dims == 1_n); + + nonnegative_int ncols = accessor.shape.at(ff_dim_t{0_n}); + + stream << "[" + << join_strings(nonnegative_range(ncols), + " ", + [&](nonnegative_int col_idx) -> std::string { + return fmt::to_string( + accessor.at
<DT>
(FFOrdered{col_idx})); + }) + << "]"; + } +}; + +static std::string + format_1d_accessor_r_contents(GenericTensorAccessorR const &accessor) { + ASSERT(accessor.device_type == DeviceType::CPU); + ASSERT(accessor.shape.num_dims() == 1_n); + + std::ostringstream oss; + DataTypeDispatch1{}(accessor.data_type, accessor, oss); + return oss.str(); +} + +template +struct Print2DCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor, + std::ostream &stream) { + ASSERT(accessor.device_type == DeviceType::CPU); + nonnegative_int dims = accessor.shape.num_dims(); + ASSERT(dims == 2_n); + nonnegative_int dim0_size = accessor.shape.at(ff_dim_t{0_n}); + nonnegative_int dim1_size = accessor.shape.at(ff_dim_t{1_n}); + + auto render_1d = [&](nonnegative_int dim0_idx) -> std::string { + return "[" + + join_strings(nonnegative_range(dim1_size), + " ", + [&](nonnegative_int dim1_idx) -> std::string { + return fmt::to_string( + accessor.at
<DT>
(FFOrdered{dim0_idx, dim1_idx})); + }) + + "]"; + }; + + stream << "[\n" + << indent( + join_strings(nonnegative_range(dim0_size), "\n", render_1d)) + << "\n]"; + } +}; + +static std::string + format_2d_accessor_r_contents(GenericTensorAccessorR const &accessor) { + ASSERT(accessor.device_type == DeviceType::CPU); + ASSERT(accessor.shape.num_dims() == 2_n); + + std::ostringstream oss; + DataTypeDispatch1{}(accessor.data_type, accessor, oss); + return oss.str(); +} + +template +struct Print3DCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor, + std::ostream &stream) { + ASSERT(accessor.device_type == DeviceType::CPU); + nonnegative_int dims = accessor.shape.num_dims(); + ASSERT(dims == 3_n); + + nonnegative_int dim0_size = accessor.shape.at(ff_dim_t{0_n}); + nonnegative_int dim1_size = accessor.shape.at(ff_dim_t{1_n}); + nonnegative_int dim2_size = accessor.shape.at(ff_dim_t{2_n}); + + auto render_1d = [&](nonnegative_int dim0_idx, + nonnegative_int dim1_idx) -> std::string { + return "[" + + join_strings(nonnegative_range(dim2_size), + " ", + [&](nonnegative_int dim2_idx) -> std::string { + return fmt::to_string(accessor.at
<DT>(
+                       FFOrdered{dim0_idx, dim1_idx, dim2_idx}));
+                 })
+          + "]";
+    };
+
+    auto render_2d = [&](nonnegative_int dim0_idx) -> std::string {
+      return "[\n"
+             + indent(join_strings(nonnegative_range(dim1_size),
+                                   "\n",
+                                   [&](nonnegative_int dim1_idx) -> std::string {
+                                     return render_1d(dim0_idx, dim1_idx);
+                                   }))
+             + "\n]";
+    };
+
+    stream << "[\n"
+           << indent(
+                  join_strings(nonnegative_range(dim0_size), "\n", render_2d))
+           << "\n]";
+  }
+};
+
+static std::string
+    format_3d_accessor_r_contents(GenericTensorAccessorR const &accessor) {
+  ASSERT(accessor.device_type == DeviceType::CPU);
+  ASSERT(accessor.shape.num_dims() == 3_n);
+
+  std::ostringstream oss;
+  DataTypeDispatch1<Print3DCPUAccessorR>{}(accessor.data_type, accessor, oss);
+  return oss.str();
+}
+
+static std::string
+    format_1d_accessor_w_contents(GenericTensorAccessorW const &accessor) {
+  return format_1d_accessor_r_contents(
+      read_only_accessor_from_write_accessor(accessor));
+}
+
+static std::string
+    format_2d_accessor_w_contents(GenericTensorAccessorW const &accessor) {
+  return format_2d_accessor_r_contents(
+      read_only_accessor_from_write_accessor(accessor));
+}
+
+static std::string
+    format_3d_accessor_w_contents(GenericTensorAccessorW const &accessor) {
+  return format_3d_accessor_r_contents(
+      read_only_accessor_from_write_accessor(accessor));
+}
+
+std::string format_accessor_r_contents(GenericTensorAccessorR const &accessor) {
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+  GenericTensorAccessorR cpu_accessor =
+      copy_tensor_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator);
+
+  int num_dims = cpu_accessor.shape.num_dims().unwrap_nonnegative();
+  switch (num_dims) {
+    case 1:
+      return format_1d_accessor_r_contents(cpu_accessor);
+    case 2:
+      return format_2d_accessor_r_contents(cpu_accessor);
+    case 3:
+      return format_3d_accessor_r_contents(cpu_accessor);
+    default:
+      PANIC("Unhandled accessor dimensionality", num_dims);
+  }
+}
+
+std::string format_accessor_w_contents(GenericTensorAccessorW const &accessor) {
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+  GenericTensorAccessorW cpu_accessor =
+      copy_tensor_accessor_w_to_cpu_if_necessary(accessor, cpu_allocator);
+
+  int num_dims = cpu_accessor.shape.num_dims().unwrap_nonnegative();
+  switch (num_dims) {
+    case 1:
+      return format_1d_accessor_w_contents(cpu_accessor);
+    case 2:
+      return format_2d_accessor_w_contents(cpu_accessor);
+    case 3:
+      return format_3d_accessor_w_contents(cpu_accessor);
+    default:
+      PANIC("Unhandled accessor dimensionality", num_dims);
+  }
+}
+
+} // namespace FlexFlow
diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/kernels/legion_dim.cc
similarity index 78%
rename from lib/kernels/src/legion_dim.cc
rename to lib/kernels/src/kernels/legion_dim.cc
index bbb15c5636..f3482b1d9b 100644
--- a/lib/kernels/src/legion_dim.cc
+++ b/lib/kernels/src/kernels/legion_dim.cc
@@ -1,7 +1,11 @@
 #include "kernels/legion_dim.h"
+#include "utils/archetypes/value_type.h"
 namespace FlexFlow {
+using T = value_type<0>;
+template std::set<legion_dim_t> key_range(LegionOrdered<T> const &);
+
 legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value) {
   return legion_dim_t{
       nonnegative_int{legion_dim.value.unwrap_nonnegative() + value}};
@@ -11,6 +15,6 @@
 legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim,
                                     nonnegative_int num_dimensions) {
   return legion_dim_t{nonnegative_int{num_dimensions.unwrap_nonnegative() -
                                       ff_dim.value.unwrap_nonnegative() - 1}};
 }
 } // namespace FlexFlow
diff --git a/lib/kernels/src/kernels/legion_ordered/legion_ordered.cc
b/lib/kernels/src/kernels/legion_ordered/legion_ordered.cc new file mode 100644 index 0000000000..8af44173b0 --- /dev/null +++ b/lib/kernels/src/kernels/legion_ordered/legion_ordered.cc @@ -0,0 +1,10 @@ +#include "kernels/legion_ordered/legion_ordered.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template struct LegionOrdered; + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/legion_ordered/slice.cc b/lib/kernels/src/kernels/legion_ordered/slice.cc new file mode 100644 index 0000000000..69fcf570aa --- /dev/null +++ b/lib/kernels/src/kernels/legion_ordered/slice.cc @@ -0,0 +1,12 @@ +#include "kernels/legion_ordered/slice.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template LegionOrdered slice(LegionOrdered const &, + legion_dim_t const &, + std::optional const &); + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/legion_ordered/transform.cc b/lib/kernels/src/kernels/legion_ordered/transform.cc new file mode 100644 index 0000000000..d9fb38198e --- /dev/null +++ b/lib/kernels/src/kernels/legion_ordered/transform.cc @@ -0,0 +1,12 @@ +#include "kernels/legion_ordered/transform.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; +using Out = value_type<1>; +using F = std::function; + +template LegionOrdered transform(LegionOrdered const &, F &&); + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local_cpu_allocator.cc b/lib/kernels/src/kernels/local_cpu_allocator.cc similarity index 52% rename from lib/local-execution/src/local_cpu_allocator.cc rename to lib/kernels/src/kernels/local_cpu_allocator.cc index 4ca5f987a8..738d1abf27 100644 --- a/lib/local-execution/src/local_cpu_allocator.cc +++ b/lib/kernels/src/kernels/local_cpu_allocator.cc @@ -1,20 +1,27 @@ -#include "local-execution/local_cpu_allocator.h" +#include "kernels/local_cpu_allocator.h" +#include "kernels/device.h" #include "utils/containers/contains_key.h" +#include +#include namespace FlexFlow { void *LocalCPUAllocator::allocate(size_t requested_memory_size) { void *ptr = malloc(requested_memory_size); + ASSERT(ptr != nullptr); this->ptrs.insert({ptr, std::unique_ptr(ptr, free)}); return ptr; } void LocalCPUAllocator::deallocate(void *ptr) { - if (contains_key(this->ptrs, ptr)) { - this->ptrs.erase(ptr); - } else { - throw std::runtime_error( - "Deallocating a pointer that was not allocated by this Allocator"); - } + ASSERT(contains_key(this->ptrs, ptr), + "Deallocating a pointer that was not allocated by this Allocator"); + + free(ptr); + this->ptrs.erase(ptr); +} + +DeviceType LocalCPUAllocator::get_allocation_device_type() const { + return DeviceType::CPU; } Allocator create_local_cpu_memory_allocator() { diff --git a/lib/kernels/src/local_cuda_allocator.cc b/lib/kernels/src/kernels/local_cuda_allocator.cc similarity index 59% rename from lib/kernels/src/local_cuda_allocator.cc rename to lib/kernels/src/kernels/local_cuda_allocator.cc index cdcfb017a0..1b081517bf 100644 --- a/lib/kernels/src/local_cuda_allocator.cc +++ b/lib/kernels/src/kernels/local_cuda_allocator.cc @@ -1,6 +1,7 @@ #include "kernels/local_cuda_allocator.h" #include "kernels/device.h" #include "utils/containers/contains.h" +#include namespace FlexFlow { void *LocalCudaAllocator::allocate(size_t requested_memory_size) { @@ -11,13 +12,15 @@ void *LocalCudaAllocator::allocate(size_t requested_memory_size) { } void LocalCudaAllocator::deallocate(void *ptr) { - if 
(contains(this->ptrs, ptr)) { - checkCUDA(cudaFree(ptr)); - this->ptrs.erase(ptr); - } else { - throw std::runtime_error( - "Deallocating a pointer that was not allocated by this Allocator"); - } + ASSERT(contains(this->ptrs, ptr), + "Deallocating a pointer that was not allocated by this Allocator"); + + checkCUDA(cudaFree(ptr)); + this->ptrs.erase(ptr); +} + +DeviceType LocalCudaAllocator::get_allocation_device_type() const { + return DeviceType::GPU; } LocalCudaAllocator::~LocalCudaAllocator() { @@ -27,7 +30,8 @@ LocalCudaAllocator::~LocalCudaAllocator() { } Allocator create_local_cuda_memory_allocator() { - return Allocator::create(); + Allocator allocator = Allocator::create(); + return allocator; } } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/reverse_kernels_params.cc b/lib/kernels/src/kernels/reverse_kernels_params.cc new file mode 100644 index 0000000000..c647181872 --- /dev/null +++ b/lib/kernels/src/kernels/reverse_kernels_params.cc @@ -0,0 +1,30 @@ +#include "kernels/reverse_kernels_params.h" + +namespace FlexFlow { + +ReverseKernelsParams + compute_reverse_kernels_params(ArrayShape const &output_shape, + ReverseAttrs const &attrs) { + auto axis = attrs.axis; + nonnegative_int in_blk_size = 1_n; + nonnegative_int reverse_dim_size = 1_n; + nonnegative_int num_out_blks = 1_n; + for (nonnegative_int i : nonnegative_range(output_shape.get_dim())) { + if (i < axis.value) { + in_blk_size *= output_shape.at(ff_dim_t{i}); + } else if (i == axis.value) { + reverse_dim_size = output_shape.at(ff_dim_t{i}); + } else { + num_out_blks *= output_shape.at(ff_dim_t{i}); + } + } + + return ReverseKernelsParams{ + num_out_blks, + reverse_dim_size, + in_blk_size, + output_shape.get_volume(), + }; +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/managed_ff_stream.cc b/lib/kernels/src/managed_ff_stream.cc index 7385b6cc3e..f0348aa91c 100644 --- a/lib/kernels/src/managed_ff_stream.cc +++ b/lib/kernels/src/managed_ff_stream.cc @@ -1,28 +1,36 @@ #include "kernels/managed_ff_stream.h" +#include "utils/exception.h" namespace FlexFlow { ManagedFFStream::ManagedFFStream() : stream(new ffStream_t) { - checkCUDA(cudaStreamCreate(stream)); + checkCUDA(cudaStreamCreate(this->stream)); } ManagedFFStream::ManagedFFStream(ManagedFFStream &&other) noexcept : stream(std::exchange(other.stream, nullptr)) {} ManagedFFStream &ManagedFFStream::operator=(ManagedFFStream &&other) noexcept { - std::swap(this->stream, other.stream); + if (this != &other) { + this->cleanup(); + this->stream = std::exchange(other.stream, nullptr); + } return *this; } ManagedFFStream::~ManagedFFStream() { - if (stream != nullptr) { - checkCUDA(cudaStreamDestroy(*stream)); - delete stream; + this->cleanup(); +} + +void ManagedFFStream::cleanup() { + if (this->stream != nullptr) { + checkCUDA(cudaStreamDestroy(*this->stream)); + delete this->stream; } } ffStream_t const &ManagedFFStream::raw_stream() const { - return *stream; + return *this->stream; } } // namespace FlexFlow diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index c050e887b6..ea26d2350c 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -1,16 +1,17 @@ #include "kernels/managed_per_device_ff_handle.h" -#include "device.h" +#include "internal/device.h" namespace FlexFlow { -ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle() { - handle = new PerDeviceFFHandle; - handle->workSpaceSize = 1024 * 1024; - 
handle->allowTensorOpMathConversion = true; - - checkCUDNN(cudnnCreate(&handle->dnn)); - checkCUBLAS(cublasCreate(&handle->blas)); - checkCUDA(cudaMalloc(&handle->workSpace, handle->workSpaceSize)); +ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( + size_t workSpaceSize, bool allowTensorOpMathConversion) { + this->handle = new PerDeviceFFHandle{}; + this->handle->workSpaceSize = workSpaceSize; + this->handle->allowTensorOpMathConversion = allowTensorOpMathConversion; + + checkCUDNN(cudnnCreate(&this->handle->dnn)); + checkCUBLAS(cublasCreate(&this->handle->blas)); + checkCUDA(cudaMalloc(&this->handle->workSpace, this->handle->workSpaceSize)); } ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( @@ -19,16 +20,23 @@ ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( ManagedPerDeviceFFHandle &ManagedPerDeviceFFHandle::operator=( ManagedPerDeviceFFHandle &&other) noexcept { - std::swap(this->handle, other.handle); + if (this != &other) { + this->cleanup(); + this->handle = std::exchange(other.handle, nullptr); + } return *this; } ManagedPerDeviceFFHandle::~ManagedPerDeviceFFHandle() { - if (handle != nullptr) { - checkCUDNN(cudnnDestroy(handle->dnn)); - checkCUBLAS(cublasDestroy(handle->blas)); - checkCUDA(cudaFree(handle->workSpace)); - delete handle; + this->cleanup(); +} + +void ManagedPerDeviceFFHandle::cleanup() { + if (this->handle != nullptr) { + checkCUDNN(cudnnDestroy(this->handle->dnn)); + checkCUBLAS(cublasDestroy(this->handle->blas)); + checkCUDA(cudaFree(this->handle->workSpace)); + delete this->handle; } } diff --git a/lib/kernels/test/CMakeLists.txt b/lib/kernels/test/CMakeLists.txt index 00da2d0d70..066cb96753 100644 --- a/lib/kernels/test/CMakeLists.txt +++ b/lib/kernels/test/CMakeLists.txt @@ -14,6 +14,7 @@ ff_add_test_executable( cudnn cudart cublas + pcg ) set(FF_TEST_EXEC_NAME "kernels-tests") diff --git a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc new file mode 100644 index 0000000000..8630dcd8cd --- /dev/null +++ b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc @@ -0,0 +1,57 @@ +#include "internal/test_utils.h" +#include "kernels/format_accessor_contents.h" +#include "kernels/replicate_kernels_cpu.h" +#include "test/utils/doctest/check_kv.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Replicate::cpu_forward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = + create_1d_accessor_r_with_contents({1, 3, 2}, cpu_allocator); + + TensorShape result_shape = TensorShape{ + TensorDims{FFOrdered{3_n}}, + DataType::FLOAT, + }; + GenericTensorAccessorW result = + create_zero_filled_accessor_w(result_shape, cpu_allocator); + + GenericTensorAccessorR correct = input; + + Kernels::Replicate::cpu_forward_kernel(input, result); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + TEST_CASE("Replicate::cpu_backward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR output = create_2d_accessor_r_with_contents( + { + {1, 2, 3}, + {4, 3, 3}, + {1, 3, 5}, + }, + cpu_allocator); + + GenericTensorAccessorR correct = create_1d_accessor_r_with_contents( + {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator); + + TensorShape result_shape = TensorShape{ + TensorDims{FFOrdered{3_n}}, + DataType::FLOAT, + }; + GenericTensorAccessorW result = + create_zero_filled_accessor_w(result_shape, cpu_allocator); + 
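+    // Replicate's backward pass reduces over the replica dimension: with the
+    // trailing argument (the replica count) set to 3, each of the 3 input
+    // gradients is the sum of one row of output above, matching correct.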
Kernels::Replicate::cpu_backward_kernel(output, result, 3); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } +} diff --git a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc new file mode 100644 index 0000000000..db0016cb0b --- /dev/null +++ b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc @@ -0,0 +1,206 @@ +#include "internal/test_utils.h" +#include "kernels/format_accessor_contents.h" +#include "kernels/reverse_kernels_cpu.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Reverse::cpu_forward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = create_3d_accessor_r_with_contents( + { + { + {1, 3, 2}, + {4, 2, 1}, + }, + { + {3, 3, 6}, + {2, 1, 5}, + }, + }, + cpu_allocator); + + GenericTensorAccessorW result = create_zero_filled_accessor_w( + TensorShape{ + TensorDims{FFOrdered{2_n, 2_n, 3_n}}, + DataType::FLOAT, + }, + cpu_allocator); + + SUBCASE("axis = ff_dim_t{0}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{0_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {3, 3, 6}, + {2, 1, 5}, + }, + { + {1, 3, 2}, + {4, 2, 1}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + SUBCASE("axis = ff_dim_t{1}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{1_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {4, 2, 1}, + {1, 3, 2}, + }, + { + {2, 1, 5}, + {3, 3, 6}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + SUBCASE("axis = ff_dim_t{2}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{2_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {2, 3, 1}, + {1, 2, 4}, + }, + { + {6, 3, 3}, + {5, 1, 2}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + } + + TEST_CASE("Reverse::cpu_backward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = create_3d_accessor_r_with_contents( + { + { + {1, 3, 2}, + {4, 2, 1}, + }, + { + {3, 3, 6}, + {2, 1, 5}, + }, + }, + cpu_allocator); + + GenericTensorAccessorW result = create_zero_filled_accessor_w( + TensorShape{ + TensorDims{FFOrdered{2_n, 2_n, 3_n}}, + DataType::FLOAT, + }, + cpu_allocator); + + SUBCASE("axis = ff_dim_t{0}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{0_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {3, 3, 6}, + {2, 1, 5}, + }, + { + {1, 3, 2}, + {4, 2, 1}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + SUBCASE("axis = ff_dim_t{1}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{1_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {4, 2, 1}, + {1, 3, 2}, + }, + { + {2, 1, 5}, + {3, 
3, 6}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + SUBCASE("axis = ff_dim_t{2}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{2_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {2, 3, 1}, + {1, 2, 4}, + }, + { + {6, 3, 3}, + {5, 1, 2}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + } +} diff --git a/lib/kernels/test/src/internal/test_utils.cc b/lib/kernels/test/src/internal/test_utils.cc new file mode 100644 index 0000000000..0f34a6aa06 --- /dev/null +++ b/lib/kernels/test/src/internal/test_utils.cc @@ -0,0 +1,392 @@ +#include "internal/test_utils.h" +#include "op-attrs/tensor_shape.h" +#include "utils/containers/require_all_same1.h" +#include "utils/join_strings.h" +#include + +namespace FlexFlow { + +GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW result_accessor = allocator.allocate_tensor(shape); + fill_with_zeros(result_accessor); + return result_accessor; +} + +GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW accessor = + create_zero_filled_accessor_w(shape, allocator); + return read_only_accessor_from_write_accessor(accessor); +} + +GenericTensorAccessorW + create_1d_accessor_w_with_contents(std::vector const &contents, + Allocator &allocator) { + nonnegative_int ncols = num_elements(contents); + ASSERT(ncols > 0); + + TensorShape shape = TensorShape{ + TensorDims{FFOrdered{ncols}}, + DataType::FLOAT, + }; + + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); + + for (nonnegative_int col_idx : nonnegative_range(ncols)) { + cpu_accessor.at(FFOrdered{col_idx}) = + contents.at(col_idx.unwrap_nonnegative()); + } + + GenericTensorAccessorW result = allocator.allocate_tensor(shape); + copy_accessor_data_to_l_from_r( + result, read_only_accessor_from_write_accessor(cpu_accessor)); + + return result; +} + +GenericTensorAccessorW create_2d_accessor_w_with_contents( + std::vector> const &contents, Allocator &allocator) { + nonnegative_int nrows = num_elements(contents); + ASSERT(nrows > 0); + + nonnegative_int ncols = throw_if_unexpected( + require_all_same1(transform(contents, [](std::vector const &row) { + return num_elements(row); + }))); + ASSERT(ncols > 0); + + TensorShape shape = TensorShape{ + TensorDims{FFOrdered{nrows, ncols}}, + DataType::FLOAT, + }; + + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); + + for (nonnegative_int row_idx : nonnegative_range(nrows)) { + for (nonnegative_int col_idx : nonnegative_range(ncols)) { + cpu_accessor.at(FFOrdered{row_idx, col_idx}) = + contents.at(row_idx.unwrap_nonnegative()) + .at(col_idx.unwrap_nonnegative()); + } + } + + GenericTensorAccessorW result = allocator.allocate_tensor(shape); + copy_accessor_data_to_l_from_r( + result, read_only_accessor_from_write_accessor(cpu_accessor)); + + return result; +} + +GenericTensorAccessorW create_3d_accessor_w_with_contents( + std::vector>> const &contents, + Allocator &allocator) { + 
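+  // The nested contents must be rectangular: require_all_same1 below yields
+  // the one common size at each nesting level (dim0 x dim1 x dim2) and
+  // errors out on ragged input before the tensor is allocated.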
+  nonnegative_int dim0_size = num_elements(contents);
+  ASSERT(dim0_size > 0);
+
+  nonnegative_int dim1_size = throw_if_unexpected(require_all_same1(
+      transform(contents, [](std::vector<std::vector<float>> const &m) {
+        return num_elements(m);
+      })));
+  ASSERT(dim1_size > 0);
+
+  nonnegative_int dim2_size = throw_if_unexpected(require_all_same1(
+      transform(contents, [](std::vector<std::vector<float>> const &m) {
+        return throw_if_unexpected(
+            require_all_same1(transform(m, [](std::vector<float> const &vec) {
+              return num_elements(vec);
+            })));
+      })));
+  ASSERT(dim2_size > 0);
+
+  TensorShape shape = TensorShape{
+      TensorDims{FFOrdered{dim0_size, dim1_size, dim2_size}},
+      DataType::FLOAT,
+  };
+
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+  GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape);
+
+  for (nonnegative_int dim0_idx : nonnegative_range(dim0_size)) {
+    for (nonnegative_int dim1_idx : nonnegative_range(dim1_size)) {
+      for (nonnegative_int dim2_idx : nonnegative_range(dim2_size)) {
+        cpu_accessor.at(
+            FFOrdered{dim0_idx, dim1_idx, dim2_idx}) =
+            contents.at(dim0_idx.unwrap_nonnegative())
+                .at(dim1_idx.unwrap_nonnegative())
+                .at(dim2_idx.unwrap_nonnegative());
+      }
+    }
+  }
+
+  GenericTensorAccessorW result = allocator.allocate_tensor(shape);
+  copy_accessor_data_to_l_from_r(
+      result, read_only_accessor_from_write_accessor(cpu_accessor));
+
+  return result;
+}
+
+GenericTensorAccessorW create_4d_accessor_w_with_contents(
+    std::vector<std::vector<std::vector<std::vector<float>>>> const &contents,
+    Allocator &allocator) {
+  nonnegative_int dim0_size = num_elements(contents);
+  ASSERT(dim0_size > 0);
+
+  nonnegative_int dim1_size = throw_if_unexpected(require_all_same1(transform(
+      contents, [](std::vector<std::vector<std::vector<float>>> const &t) {
+        return num_elements(t);
+      })));
+  ASSERT(dim1_size > 0);
+
+  nonnegative_int dim2_size = throw_if_unexpected(require_all_same1(transform(
+      contents, [](std::vector<std::vector<std::vector<float>>> const &m) {
+        return throw_if_unexpected(require_all_same1(
+            transform(m, [](std::vector<std::vector<float>> const &vec) {
+              return num_elements(vec);
+            })));
+      })));
+  ASSERT(dim2_size > 0);
+
+  nonnegative_int dim3_size = throw_if_unexpected(require_all_same1(transform(
+      contents, [](std::vector<std::vector<std::vector<float>>> const &t) {
+        return throw_if_unexpected(require_all_same1(
+            transform(t, [](std::vector<std::vector<float>> const &mat) {
+              return throw_if_unexpected(require_all_same1(
+                  transform(mat, [](std::vector<float> const &vec) {
+                    return num_elements(vec);
+                  })));
+            })));
+      })));
+  ASSERT(dim3_size > 0);
+
+  TensorShape shape = TensorShape{
+      TensorDims{FFOrdered{dim0_size, dim1_size, dim2_size, dim3_size}},
+      DataType::FLOAT,
+  };
+
+  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
+
+  for (nonnegative_int dim0_idx : nonnegative_range(dim0_size)) {
+    for (nonnegative_int dim1_idx : nonnegative_range(dim1_size)) {
+      for (nonnegative_int dim2_idx : nonnegative_range(dim2_size)) {
+        for (nonnegative_int dim3_idx : nonnegative_range(dim3_size)) {
+          accessor.at(
+              FFOrdered{dim0_idx, dim1_idx, dim2_idx, dim3_idx}) =
+              contents.at(dim0_idx.unwrap_nonnegative())
+                  .at(dim1_idx.unwrap_nonnegative())
+                  .at(dim2_idx.unwrap_nonnegative())
+                  .at(dim3_idx.unwrap_nonnegative());
+        }
+      }
+    }
+  }
+
+  return accessor;
+}
+
+GenericTensorAccessorR
+    create_1d_accessor_r_with_contents(std::vector<float> const &contents,
+                                       Allocator &allocator) {
+  return read_only_accessor_from_write_accessor(
+      create_1d_accessor_w_with_contents(contents, allocator));
+}
+
+GenericTensorAccessorR create_2d_accessor_r_with_contents(
+    std::vector<std::vector<float>> const &contents, Allocator &allocator) {
+  return read_only_accessor_from_write_accessor(
+      create_2d_accessor_w_with_contents(contents, allocator));
+}
+
+GenericTensorAccessorR create_3d_accessor_r_with_contents(
+    std::vector<std::vector<std::vector<float>>> const &contents,
+    Allocator &allocator) {
+  return read_only_accessor_from_write_accessor(
+      create_3d_accessor_w_with_contents(contents, allocator));
+}
+
+GenericTensorAccessorR create_4d_accessor_r_with_contents(
+    std::vector<std::vector<std::vector<std::vector<float>>>> const &contents,
+    Allocator &allocator) {
+  return read_only_accessor_from_write_accessor(
+      create_4d_accessor_w_with_contents(contents, allocator));
+}
+
+template <DataType DT>
+struct CreateRandomFilledAccessorW {
+  GenericTensorAccessorW operator()(TensorShape const &shape,
+                                    Allocator &allocator) {
+    Allocator cpu_allocator = create_local_cpu_memory_allocator();
+    GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape);
+
+    using T = real_type_t
<DT>;
+    T *data_ptr = src_accessor.get<DT>
();
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    size_t num_elements = get_num_elements(shape).unwrap_nonnegative();
+    if constexpr (std::is_same<T, bool>::value) {
+      std::bernoulli_distribution dist(0.5);
+      for (size_t i = 0; i < num_elements; i++) {
+        data_ptr[i] = dist(gen);
+      }
+    } else if constexpr (std::is_floating_point<T>::value) {
+      std::uniform_real_distribution<T> dist(-1.0, 1.0);
+      for (size_t i = 0; i < num_elements; i++) {
+        data_ptr[i] = dist(gen);
+      }
+    } else if constexpr (std::is_integral<T>::value) {
+      std::uniform_int_distribution<T> dist(0, 99);
+      for (size_t i = 0; i < num_elements; i++) {
+        data_ptr[i] = dist(gen);
+      }
+    }
+
+    GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape);
+    copy_accessor_data_to_l_from_r(dst_accessor, src_accessor);
+
+    return dst_accessor;
+  }
+};
+
+GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape,
+                                                       Allocator &allocator) {
+  return DataTypeDispatch1<CreateRandomFilledAccessorW>{}(
+      shape.data_type, shape, allocator);
+}
+
+GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape,
+                                                       Allocator &allocator) {
+  GenericTensorAccessorW accessor =
+      create_random_filled_accessor_w(shape, allocator);
+
+  return read_only_accessor_from_write_accessor(accessor);
+}
+
+template <DataType DT>
+struct FillWithZeros {
+  void operator()(GenericTensorAccessorW const &accessor) {
+    using T = real_type_t
<DT>;
+
+    if (accessor.device_type == DeviceType::CPU) {
+      memset(accessor.ptr,
+             0,
+             accessor.shape.get_volume().unwrap_nonnegative() * sizeof(T));
+    } else {
+      checkCUDA(cudaMemset(accessor.ptr,
+                           0,
+                           accessor.shape.get_volume().unwrap_nonnegative() *
+                               sizeof(T)));
+    }
+  }
+};
+
+void fill_with_zeros(GenericTensorAccessorW const &accessor) {
+  DataTypeDispatch1<FillWithZeros>{}(accessor.data_type, accessor);
+}
+
+template <DataType DT>
+struct CPUAccessorRContainsNonZero {
+  bool operator()(GenericTensorAccessorR const &accessor) {
+    using T = real_type_t
<DT>;
+
+    T const *data_ptr = accessor.get<DT>
();
+
+    int volume = accessor.shape.num_elements().unwrap_nonnegative();
+    for (int i = 0; i < volume; i++) {
+      if (data_ptr[i] != 0) {
+        return true;
+      }
+    }
+
+    return false;
+  }
+};
+
+bool contains_non_zero(GenericTensorAccessorR const &accessor) {
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+  GenericTensorAccessorR cpu_accessor =
+      copy_tensor_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator);
+  return DataTypeDispatch1<CPUAccessorRContainsNonZero>{}(
+      cpu_accessor.data_type, cpu_accessor);
+}
+
+template <DataType DT>
+struct AccessorsAreEqual {
+  bool operator()(GenericTensorAccessorR const &accessor_a,
+                  GenericTensorAccessorR const &accessor_b) {
+    Allocator cpu_allocator = create_local_cpu_memory_allocator();
+    GenericTensorAccessorR cpu_accessor_a =
+        copy_tensor_accessor_r_to_cpu_if_necessary(accessor_a, cpu_allocator);
+    GenericTensorAccessorR cpu_accessor_b =
+        copy_tensor_accessor_r_to_cpu_if_necessary(accessor_b, cpu_allocator);
+
+    using T = real_type_t
<DT>;
+    T const *a_data_ptr = cpu_accessor_a.get<DT>
();
+    T const *b_data_ptr = cpu_accessor_b.get<DT>
();
+
+    int volume = accessor_a.shape.num_elements().unwrap_nonnegative();
+    for (int i = 0; i < volume; i++) {
+      if (a_data_ptr[i] != b_data_ptr[i]) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+};
+
+bool accessors_are_equal(GenericTensorAccessorR const &accessor_a,
+                         GenericTensorAccessorR const &accessor_b) {
+  ASSERT(accessor_a.shape == accessor_b.shape,
+         "accessors_are_equal expects accessors to have the same shape");
+
+  return DataTypeDispatch1<AccessorsAreEqual>{}(
+      accessor_a.data_type, accessor_a, accessor_b);
+}
+
+template <DataType DT>
+struct CreateFilledAccessorW {
+  GenericTensorAccessorW operator()(TensorShape const &shape,
+                                    Allocator &allocator,
+                                    DataTypeValue val) {
+    using T = real_type_t
<DT>;
+    if (!val.template has<T>()) {
+      throw mk_runtime_error("create_filled_accessor expected data type of "
+                             "shape and passed-in value to match");
+    }
+
+    auto unwrapped_value = val.get<T>();
+    GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape);
+    Allocator cpu_allocator = create_local_cpu_memory_allocator();
+    GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape);
+
+    T *data_ptr = src_accessor.get<DT>
();
+
+    int volume = dst_accessor.shape.num_elements().unwrap_nonnegative();
+    for (int i = 0; i < volume; i++) {
+      data_ptr[i] = unwrapped_value;
+    }
+
+    copy_accessor_data_to_l_from_r(dst_accessor, src_accessor);
+    return dst_accessor;
+  }
+};
+
+GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape,
+                                                Allocator &allocator,
+                                                DataTypeValue val) {
+  return DataTypeDispatch1<CreateFilledAccessorW>{}(
+      shape.data_type, shape, allocator, val);
+}
+
+GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape,
+                                                Allocator &allocator,
+                                                DataTypeValue val) {
+  GenericTensorAccessorW w_accessor =
+      create_filled_accessor_w(shape, allocator, val);
+  return read_only_accessor_from_write_accessor(w_accessor);
+}
+} // namespace FlexFlow
diff --git a/lib/kernels/test/src/internal/test_utils.h b/lib/kernels/test/src/internal/test_utils.h
new file mode 100644
index 0000000000..a4fc9b88c8
--- /dev/null
+++ b/lib/kernels/test/src/internal/test_utils.h
@@ -0,0 +1,78 @@
+#ifndef _FLEXFLOW_KERNELS_TEST_SRC_INTERNAL_TEST_UTILS_H
+#define _FLEXFLOW_KERNELS_TEST_SRC_INTERNAL_TEST_UTILS_H
+
+#include "kernels/copy_tensor_accessor.h"
+#include "kernels/datatype_dispatch.h"
+#include "kernels/device.h"
+#include "kernels/local_cpu_allocator.h"
+#include "kernels/local_cuda_allocator.h"
+#include "kernels/managed_ff_stream.h"
+#include "kernels/managed_per_device_ff_handle.h"
+#include "op-attrs/datatype.h"
+#include "op-attrs/datatype_value.dtg.h"
+#include
+#include
+#include
+#include
+
+namespace FlexFlow {
+
+GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape,
+                                                       Allocator &allocator);
+
+GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape,
+                                                       Allocator &allocator);
+
+GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape,
+                                                     Allocator &allocator);
+
+GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape,
+                                                     Allocator &allocator);
+
+GenericTensorAccessorW
+    create_1d_accessor_w_with_contents(std::vector<float> const &contents,
+                                       Allocator &allocator);
+GenericTensorAccessorR
+    create_1d_accessor_r_with_contents(std::vector<float> const &contents,
+                                       Allocator &allocator);
+
+GenericTensorAccessorW create_2d_accessor_w_with_contents(
+    std::vector<std::vector<float>> const &contents, Allocator &allocator);
+GenericTensorAccessorR create_2d_accessor_r_with_contents(
+    std::vector<std::vector<float>> const &contents, Allocator &allocator);
+
+GenericTensorAccessorW create_3d_accessor_w_with_contents(
+    std::vector<std::vector<std::vector<float>>> const &contents,
+    Allocator &allocator);
+GenericTensorAccessorR create_3d_accessor_r_with_contents(
+    std::vector<std::vector<std::vector<float>>> const &contents,
+    Allocator &allocator);
+
+GenericTensorAccessorW create_4d_accessor_w_with_contents(
+    std::vector<std::vector<std::vector<std::vector<float>>>> const &contents,
+    Allocator &allocator);
+GenericTensorAccessorR create_4d_accessor_r_with_contents(
+    std::vector<std::vector<std::vector<std::vector<float>>>> const &contents,
+    Allocator &allocator);
+
+bool contains_non_zero(GenericTensorAccessorR const &accessor);
+
+void fill_with_zeros(GenericTensorAccessorW const &accessor);
+
+void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor,
+                                       std::ostream &stream);
+
+bool accessors_are_equal(GenericTensorAccessorR const &accessor_a,
+                         GenericTensorAccessorR const &accessor_b);
+
+GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape,
+                                                Allocator &allocator,
+                                                DataTypeValue val);
+
+GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape,
+                                                Allocator &allocator,
+                                                DataTypeValue val);
+
+} // namespace FlexFlow
+
+#endif
diff --git
a/lib/kernels/test/src/kernels/accessor.cc b/lib/kernels/test/src/kernels/accessor.cc new file mode 100644 index 0000000000..98f8471212 --- /dev/null +++ b/lib/kernels/test/src/kernels/accessor.cc @@ -0,0 +1,73 @@ +#include "kernels/accessor.h" +#include "internal/test_utils.h" +#include "kernels/local_cpu_allocator.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("calculate_accessor_offset") { + SUBCASE("one dimension") { + std::vector indices = {4_n}; + ArrayShape shape = ArrayShape{ + std::vector{ + 13_n, + }, + }; + + nonnegative_int result = calculate_accessor_offset(indices, shape); + nonnegative_int correct = 4_n; + + CHECK(result == correct); + } + + SUBCASE("multiple dimensions") { + std::vector indices = {2_n, 4_n}; + ArrayShape shape = ArrayShape{ + std::vector{ + 6_n, + 5_n, + }, + }; + + nonnegative_int result = calculate_accessor_offset(indices, shape); + nonnegative_int correct = 2_n * 5_n + 4_n; + + CHECK(result == correct); + } + + SUBCASE("zero dimensions") { + std::vector indices = {}; + ArrayShape shape = ArrayShape{std::vector{}}; + + nonnegative_int result = calculate_accessor_offset(indices, shape); + nonnegative_int correct = 0_n; + + CHECK(result == correct); + } + + SUBCASE("index and shape dimensions do not match") { + std::vector indices = {1_n, 2_n, 4_n}; + ArrayShape shape = ArrayShape{ + std::vector{ + 6_n, + 5_n, + }, + }; + + CHECK_THROWS(calculate_accessor_offset(indices, shape)); + } + + SUBCASE("out of bounds index") { + std::vector indices = {2_n, 5_n}; + ArrayShape shape = ArrayShape{ + std::vector{ + 6_n, + 5_n, + }, + }; + + CHECK_THROWS(calculate_accessor_offset(indices, shape)); + } + } +} diff --git a/lib/kernels/test/src/kernels/array_shape.cc b/lib/kernels/test/src/kernels/array_shape.cc new file mode 100644 index 0000000000..1fb4c0b541 --- /dev/null +++ b/lib/kernels/test/src/kernels/array_shape.cc @@ -0,0 +1,49 @@ +#include "kernels/array_shape.h" +#include "test/utils/doctest/fmt/unordered_set.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("get_array_coord_set") { + SUBCASE("ArrayShape is not empty") { + ArrayShape input = ArrayShape{ + LegionOrdered{2_n, 1_n, 3_n}, + }; + + std::unordered_set result = get_array_coord_set(input); + std::unordered_set correct = { + ArrayCoord{FFOrdered{0_n, 0_n, 0_n}}, + ArrayCoord{FFOrdered{0_n, 0_n, 1_n}}, + ArrayCoord{FFOrdered{1_n, 0_n, 0_n}}, + ArrayCoord{FFOrdered{1_n, 0_n, 1_n}}, + ArrayCoord{FFOrdered{2_n, 0_n, 0_n}}, + ArrayCoord{FFOrdered{2_n, 0_n, 1_n}}, + }; + + CHECK(result == correct); + } + + SUBCASE("ArrayShape has a dimension of size zero") { + ArrayShape input = ArrayShape{ + LegionOrdered{2_n, 0_n, 3_n}, + }; + + std::unordered_set result = get_array_coord_set(input); + std::unordered_set correct = {}; + + CHECK(result == correct); + } + + SUBCASE("ArrayShape is zero-dimensional") { + ArrayShape input = ArrayShape{LegionOrdered{}}; + + std::unordered_set result = get_array_coord_set(input); + std::unordered_set correct = { + ArrayCoord{FFOrdered{}}, + }; + + CHECK(result == correct); + } + } +} diff --git a/lib/kernels/test/src/kernels/format_accessor_contents.cc b/lib/kernels/test/src/kernels/format_accessor_contents.cc new file mode 100644 index 0000000000..915a84c335 --- /dev/null +++ b/lib/kernels/test/src/kernels/format_accessor_contents.cc @@ -0,0 +1,94 @@ +#include "kernels/format_accessor_contents.h" +#include "internal/test_utils.h" +#include "kernels/local_cpu_allocator.h" +#include + +using namespace 
::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("format_accessor_r_contents(GenericTensorAccessorR)") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("accessor is 1d") { + GenericTensorAccessorR accessor = + create_1d_accessor_r_with_contents({1, 2, 3, 2}, cpu_allocator); + + std::string correct = "[1 2 3 2]"; + + std::string result = format_accessor_r_contents(accessor); + + CHECK(result == correct); + } + + SUBCASE("accessor is 2d") { + GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents( + { + {1, 2, 3, 5}, + {4, 3, 3, 2}, + {1, 1, 5, 8}, + }, + cpu_allocator); + + std::string correct = "[\n" + " [1 2 3 5]\n" + " [4 3 3 2]\n" + " [1 1 5 8]\n" + "]"; + + std::string result = format_accessor_r_contents(accessor); + + CHECK(result == correct); + } + + SUBCASE("accessor is 3d") { + GenericTensorAccessorR accessor = create_3d_accessor_r_with_contents( + { + { + {1, 2, 3, 6}, + {4, 3, 3, 9}, + {1, 1, 5, 1}, + }, + { + {4, 1, 8, 7}, + {9, 4, 2, 4}, + {1, 0, 0, 6}, + }, + { + {2, 1, 1, 9}, + {1, 3, 6, 2}, + {1, 9, 8, 9}, + }, + }, + cpu_allocator); + + std::string correct = "[\n" + " [\n" + " [1 2 3 6]\n" + " [4 3 3 9]\n" + " [1 1 5 1]\n" + " ]\n" + " [\n" + " [4 1 8 7]\n" + " [9 4 2 4]\n" + " [1 0 0 6]\n" + " ]\n" + " [\n" + " [2 1 1 9]\n" + " [1 3 6 2]\n" + " [1 9 8 9]\n" + " ]\n" + "]"; + + std::string result = format_accessor_r_contents(accessor); + + CHECK(result == correct); + } + + SUBCASE("accessor is some other dimension") { + GenericTensorAccessorR accessor = + create_4d_accessor_r_with_contents({{{{5}}}}, cpu_allocator); + + CHECK_THROWS(format_accessor_r_contents(accessor)); + } + } +} diff --git a/lib/kernels/test/src/kernels/legion_dim.cc b/lib/kernels/test/src/kernels/legion_dim.cc new file mode 100644 index 0000000000..34822ed1c3 --- /dev/null +++ b/lib/kernels/test/src/kernels/legion_dim.cc @@ -0,0 +1,32 @@ +#include "kernels/legion_dim.h" +#include "test/utils/doctest/fmt/set.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("key_range(LegionOrdered)") { + SUBCASE("input is non-empty") { + LegionOrdered input = {5, 3, 2, 3}; + + std::set result = key_range(input); + std::set correct = { + legion_dim_t{0_n}, + legion_dim_t{1_n}, + legion_dim_t{2_n}, + legion_dim_t{3_n}, + }; + + CHECK(result == correct); + } + + SUBCASE("input is empty") { + LegionOrdered input = {}; + + std::set result = key_range(input); + std::set correct = {}; + + CHECK(result == correct); + } + } +} diff --git a/lib/kernels/test/src/kernels/legion_ordered/legion_ordered.cc b/lib/kernels/test/src/kernels/legion_ordered/legion_ordered.cc new file mode 100644 index 0000000000..4b50cad735 --- /dev/null +++ b/lib/kernels/test/src/kernels/legion_ordered/legion_ordered.cc @@ -0,0 +1,12 @@ +#include "kernels/legion_ordered/legion_ordered.h" +#include "test/utils/rapidcheck.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE_TEMPLATE( + "Arbitrary> with T=", T, int, double, char) { + RC_SUBCASE([](LegionOrdered) {}); + } +} diff --git a/lib/kernels/test/src/kernels/legion_ordered/slice.cc b/lib/kernels/test/src/kernels/legion_ordered/slice.cc new file mode 100644 index 0000000000..d0211d270e --- /dev/null +++ b/lib/kernels/test/src/kernels/legion_ordered/slice.cc @@ -0,0 +1,30 @@ +#include "kernels/legion_ordered/slice.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("slice(LegionOrdered, ..., ...") { + LegionOrdered d = LegionOrdered{ + 
1, + 2, + 3, + 4, + }; + SUBCASE("legion_dim_t, legion_dim_t") { + LegionOrdered result = slice(d, + legion_dim_t{nonnegative_int{1}}, + legion_dim_t{nonnegative_int{3}}); + LegionOrdered correct = LegionOrdered{2, 3}; + + CHECK(result == correct); + } + SUBCASE("legion_dim_t, std::nullopt_t") { + LegionOrdered result = + slice(d, legion_dim_t{nonnegative_int{1}}, std::nullopt); + LegionOrdered correct = LegionOrdered{2, 3, 4}; + + CHECK(result == correct); + } + } +} diff --git a/lib/kernels/test/src/kernels/legion_ordered/transform.cc b/lib/kernels/test/src/kernels/legion_ordered/transform.cc new file mode 100644 index 0000000000..759507264f --- /dev/null +++ b/lib/kernels/test/src/kernels/legion_ordered/transform.cc @@ -0,0 +1,36 @@ +#include "kernels/legion_ordered/transform.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("transform(LegionOrdered, F)") { + SUBCASE("input is empty") { + LegionOrdered input = {}; + + LegionOrdered result = + transform(input, [](std::string const &) -> int { + CHECK(false); + return 0; + }); + LegionOrdered correct = {}; + + CHECK(result == correct); + } + + SUBCASE("input is not empty") { + LegionOrdered input = {2, 1, 2, 5}; + + LegionOrdered result = + transform(input, [](int x) { return fmt::to_string(x); }); + LegionOrdered correct = LegionOrdered{ + "2", + "1", + "2", + "5", + }; + + CHECK(result == correct); + } + } +} diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index 64264f6c39..9064ae4824 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -1,10 +1,10 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/attention_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test multi-head attention kernel") { nonnegative_int num_samples = 10_n; nonnegative_int num_heads = 4_n; @@ -19,7 +19,9 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int kvSeqLength = 20_n; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -39,16 +41,26 @@ TEST_SUITE(FF_TEST_SUITE) { /*kvSeqLength=*/kvSeqLength.unwrap_nonnegative(), /*add_bias_kv=*/false); - TensorShape query_shape = make_float_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, qSize}); - TensorShape key_shape = make_float_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, kSize}); - TensorShape value_shape = make_float_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, vSize}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, oProjSize}); - TensorShape weight_shape = make_float_tensor_shape_from_legion_dims( - {nonnegative_int{state.weightSize}}); + TensorShape query_shape = TensorShape{ + TensorDims{FFOrdered{qoSeqLength, num_samples, qSize}}, + DataType::FLOAT, + }; + TensorShape key_shape = TensorShape{ + TensorDims{FFOrdered{kvSeqLength, num_samples, kSize}}, + DataType::FLOAT, + }; + TensorShape value_shape = TensorShape{ + TensorDims{FFOrdered{kvSeqLength, num_samples, vSize}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{qoSeqLength, num_samples, oProjSize}}, + DataType::FLOAT, + }; + TensorShape 
weight_shape = TensorShape{ + TensorDims{FFOrdered{nonnegative_int{state.weightSize}}}, + DataType::FLOAT, + }; GenericTensorAccessorW query_accessor = create_random_filled_accessor_w(query_shape, allocator); @@ -72,9 +84,7 @@ TEST_SUITE(FF_TEST_SUITE) { weight_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index cacd5b60fb..5f63b48198 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -1,10 +1,10 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/batch_matmul_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test BatchMatmul Kernel") { nonnegative_int m = 10_n; nonnegative_int n = 10_n; @@ -15,16 +15,24 @@ TEST_SUITE(FF_TEST_SUITE) { int seq_length = -1; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape_a = - make_float_tensor_shape_from_legion_dims({m, k, batch}); - TensorShape input_shape_b = - make_float_tensor_shape_from_legion_dims({k, n, batch}); - TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({m, n, batch}); + TensorShape input_shape_a = TensorShape{ + TensorDims{FFOrdered{batch, k, m}}, + DataType::FLOAT, + }; + TensorShape input_shape_b = TensorShape{ + TensorDims{FFOrdered{batch, n, k}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{batch, n, m}}, + DataType::FLOAT, + }; GenericTensorAccessorW a_accessor = create_random_filled_accessor_w(input_shape_a, allocator); diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index b4c43cf1d8..903ad8cc43 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -1,10 +1,11 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/batch_norm_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test BatchNorm Kernel") { nonnegative_int output_n = 1_n; nonnegative_int output_c = 10_n; @@ -12,7 +13,9 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int output_w = 10_n; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -26,25 +29,33 @@ TEST_SUITE(FF_TEST_SUITE) { /*output_w=*/output_w.unwrap_nonnegative(), /*relu=*/true); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape scale_shape = make_float_tensor_shape_from_legion_dims( - {output_n, 
output_c, output_h, output_w}); - TensorShape bias_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{output_n, output_c, output_h, output_w}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{output_n, output_c, output_h, output_w}}, + DataType::FLOAT, + }; + TensorShape scale_shape = TensorShape{ + TensorDims{FFOrdered{output_n, output_c, output_h, output_w}}, + DataType::FLOAT, + }; + TensorShape bias_shape = TensorShape{ + TensorDims{FFOrdered{output_n, output_c, output_h, output_w}}, + DataType::FLOAT, + }; GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); - GenericTensorAccessorW scale_accessor = - create_filled_accessor_w(scale_shape, allocator, 1.0f); + GenericTensorAccessorW scale_accessor = create_filled_accessor_w( + scale_shape, allocator, make_float_data_type_value(1)); SUBCASE("forward_kernel") { - GenericTensorAccessorW bias_accessor = - create_filled_accessor_w(bias_shape, allocator, 0.0f); + GenericTensorAccessorW bias_accessor = create_filled_accessor_w( + bias_shape, allocator, make_float_data_type_value(0)); Kernels::BatchNorm::forward_kernel( /*stream=*/managed_stream.raw_stream(), @@ -54,10 +65,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*scale_ptr=*/scale_accessor.get_float_ptr(), /*bias_ptr=*/bias_accessor.get_float_ptr()); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { @@ -73,9 +81,9 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::BatchNorm::backward_kernel( /*stream=*/managed_stream.raw_stream(), /*per_device_state=*/state, - /*input_ptr=*/input_accessor.get_float_ptr(), - /*output_grad_ptr=*/output_grad_accessor.get_float_ptr(), /*output_ptr=*/output_accessor.get_float_ptr(), + /*output_grad_ptr=*/output_grad_accessor.get_float_ptr(), + /*input_ptr=*/input_accessor.get_float_ptr(), /*input_grad_ptr=*/input_grad_accessor.get_float_ptr(), /*scale_ptr=*/scale_accessor.get_float_ptr(), /*scale_grad_ptr=*/scale_grad_accessor.get_float_ptr(), @@ -83,19 +91,9 @@ TEST_SUITE(FF_TEST_SUITE) { /*numElements=*/ input_accessor.shape.num_elements().unwrap_nonnegative()); - std::vector host_input_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - std::vector host_scale_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(scale_grad_accessor)); - std::vector host_bias_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(bias_grad_accessor)); - - CHECK(contains_non_zero(host_input_grad_data)); - CHECK(contains_non_zero(host_scale_grad_data)); - CHECK(contains_non_zero(host_bias_grad_data)); + CHECK(contains_non_zero(input_grad_accessor)); + CHECK(contains_non_zero(scale_grad_accessor)); + CHECK(contains_non_zero(bias_grad_accessor)); } Kernels::BatchNorm::cleanup_kernel(allocator, diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 0e0769014d..0c41fe12ac 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -1,56 +1,86 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" 
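+// Exercises the CUDA cast kernels; the "Check Cast Forward Kernel against
+// CPU Kernel" test case below compares GPU results with the CPU reference
+// kernels from cast_kernels_cpu.h.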
#include "kernels/cast_kernels.h" -#include "test_utils.h" -#include +#include "kernels/cast_kernels_cpu.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Call Cast Forward and Backward Kernels") { ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100_n, 100_n}); - TensorShape output_shape = - make_double_tensor_shape_from_legion_dims({100_n, 100_n}); - - GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{100_n, 100_n}}, + DataType::DOUBLE, + }; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - - Kernels::Cast::forward_kernel(managed_stream.raw_stream(), - input_accessor, - output_accessor, - DataType::FLOAT, - DataType::DOUBLE); + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); - std::vector host_double_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), input_accessor, output_accessor); - CHECK(contains_non_zero(host_double_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { + GenericTensorAccessorR grad_output_accessor = + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW grad_input_accessor = - allocator.allocate_tensor(input_shape); - - Kernels::Cast::backward_kernel( - managed_stream.raw_stream(), - read_only_accessor_from_write_accessor(output_accessor), - grad_input_accessor, - DataType::DOUBLE, - DataType::FLOAT); - - std::vector host_grad_float_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(grad_input_accessor)); - CHECK(contains_non_zero(host_grad_float_data)); + create_zero_filled_accessor_w(input_shape, allocator); + + Kernels::Cast::backward_kernel(managed_stream.raw_stream(), + grad_output_accessor, + grad_input_accessor); + + CHECK(contains_non_zero(grad_input_accessor)); + } + } + + TEST_CASE("Check Cast Forward Kernel against CPU Kernel") { + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 2_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 2_n}}, + DataType::DOUBLE, + }; + + // Only calling forward kernel as backward kernel is exactly the same + SUBCASE("forward_kernel") { + // Run GPU Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // Run CPU Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + 
create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Cast::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 2b6b9bf589..2040dcbd5d 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -1,39 +1,39 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/combine_kernels.h" -#include "test_utils.h" +#include "kernels/combine_kernels_cpu.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test combine kernel") { - ManagedPerDeviceFFHandle managed_handle{}; +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Call Combine Forward and Backward Kernels") { + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100_n, 100_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n, 100_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Combine::forward_kernel( managed_stream.raw_stream(), input_accessor, output_accessor); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -41,9 +41,66 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor, input_grad_accessor); - std::vector host_input_grad = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad)); + CHECK(contains_non_zero(input_grad_accessor)); + } + } + + TEST_CASE("Check Combine Forward Kernel against CPU Kernel") { + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{5_n, 5_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = input_shape; + + SUBCASE("forward_kernel") { + // Run GPU Combine Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(output_shape); + + Kernels::Combine::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // Run CPU Combine Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu 
= + cpu_allocator.allocate_tensor(output_shape); + + Kernels::Combine::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); + } + + SUBCASE("backward_kernel") { + // Run GPU Combine Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Combine::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor_gpu, + input_grad_accessor_gpu); + + // Run CPU Combine Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); + + Kernels::Combine::cpu_backward_kernel(output_grad_accessor_cpu, + input_grad_accessor_cpu); + + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 215e599716..c2df907917 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -1,56 +1,113 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/concat_kernels.h" -#include "test_utils.h" #include "utils/containers/repeat.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test concat kernel forward and backward") { - nonnegative_int num_inputs = 3_n; - nonnegative_int size_per_input = 100_n; - ff_dim_t concat_axis = ff_dim_t{0_n}; - - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({size_per_input}); - TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({size_per_input, num_inputs}); - Allocator allocator = create_local_cuda_memory_allocator(); + const nonnegative_int num_inputs = 4_n; + SUBCASE("forward_kernel") { - std::vector input_accessors = - repeat(num_inputs, [&]() { - return read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - }); - GenericTensorAccessorW output_accessor = - allocator.allocate_tensor(output_shape); - - Kernels::Concat::forward_kernel(managed_stream.raw_stream(), - output_accessor, - input_accessors, - concat_axis); - - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - CHECK(contains_non_zero(host_output_data)); + auto run_forward_test = [&](nonnegative_int input_rows, + nonnegative_int input_cols, + TensorShape output_shape, + ff_dim_t concat_axis) { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{input_rows, input_cols}}, + DataType::FLOAT, + }; + + std::vector input_accessors = + repeat(num_inputs, [&]() { + return create_random_filled_accessor_r(input_shape, allocator); + }); + + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); + + Kernels::Concat::forward_kernel(managed_stream.raw_stream(), + output_accessor, + input_accessors, + concat_axis); + + CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test forward concat, axis = 
0") { + nonnegative_int input_rows = 2_n; + nonnegative_int input_cols = 4_n; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{num_inputs * input_rows, input_cols}}, + DataType::FLOAT, + }; + run_forward_test(input_rows, input_cols, output_shape, ff_dim_t{0_n}); + } + + SUBCASE("test forward concat, axis = 1") { + nonnegative_int input_rows = 4_n; + nonnegative_int input_cols = 2_n; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{input_rows, num_inputs * input_cols}}, + DataType::FLOAT, + }; + run_forward_test(input_rows, input_cols, output_shape, ff_dim_t{1_n}); + } } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); - std::vector input_grad_accessors = repeat( - num_inputs, [&]() { return allocator.allocate_tensor(input_shape); }); - Kernels::Concat::backward_kernel(managed_stream.raw_stream(), - output_grad_accessor, - input_grad_accessors, - concat_axis); + auto run_backward_test = [&](nonnegative_int input_rows, + nonnegative_int input_cols, + TensorShape output_shape, + ff_dim_t concat_axis) { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{input_rows, input_cols}}, + DataType::FLOAT, + }; + + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); + + std::vector input_grad_accessors = + repeat(num_inputs, [&]() { + return create_zero_filled_accessor_w(input_shape, allocator); + }); + + Kernels::Concat::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor, + input_grad_accessors, + concat_axis); + + for (auto &accessor : input_grad_accessors) { + CHECK(contains_non_zero(accessor)); + } + }; + + SUBCASE("test backward concat, axis = 0") { + nonnegative_int input_rows = 2_n; + nonnegative_int input_cols = 4_n; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{num_inputs * input_rows, input_cols}}, + DataType::FLOAT, + }; + run_backward_test(input_rows, input_cols, output_shape, ff_dim_t{0_n}); + } + + SUBCASE("test backward concat, axis = 1") { + nonnegative_int input_rows = 4_n; + nonnegative_int input_cols = 2_n; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{input_rows, num_inputs * input_cols}}, + DataType::FLOAT, + }; + run_backward_test(input_rows, input_cols, output_shape, ff_dim_t{1_n}); + } } } } diff --git a/lib/kernels/test/src/test_cuda.cc b/lib/kernels/test/src/test_cuda.cc index ed5852bc31..de3215cf2d 100644 --- a/lib/kernels/test/src/test_cuda.cc +++ b/lib/kernels/test/src/test_cuda.cc @@ -1,10 +1,10 @@ -#include "doctest/doctest.h" -#include "test_utils.h" +#include "internal/test_utils.h" +#include #include namespace FlexFlow { -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test CUDA") { int deviceCount = 0; diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 86f8f2102b..409b06d9a9 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -1,38 +1,37 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/dropout_kernels.h" -#include "test_utils.h" #include "utils/containers/count.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Dropout Kernels") { unsigned long long seed = 12345; float dropout_rate = 0.1; ArrayShape shape = ArrayShape{ - std::vector{10_n, 10_n}, + std::vector{10_n, 10_n}, }; 
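+    // With dropout_rate = 0.1, roughly 10% of the 10x10 activations are
+    // expected to be zeroed; the forward subcase below only checks that the
+    // kernel produces some non-zero output.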
- TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10_n, 10_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 10_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); DropoutPerDeviceState state = Kernels::Dropout::init_kernel( managed_handle.raw_handle(), dropout_rate, seed, shape, allocator); - auto get_zero_count = [](std::vector const &data) { - return count(data, [](float x) { return x == 0.0f; }); - }; - SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -41,11 +40,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output_accessor = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - CHECK(contains_non_zero(host_output_accessor)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 83f7f0445e..f8a3abdb98 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -1,21 +1,27 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/flat_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Flat Kernel") { Allocator allocator = create_local_cuda_memory_allocator(); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 2.0f)); + read_only_accessor_from_write_accessor(create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(2))); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -25,33 +31,21 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor, output_accessor.get_float_ptr()); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 2.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 0.0f); - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, 
make_float_data_type_value(0)); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(1)); Kernels::Flat::backward_kernel(managed_stream.raw_stream(), input_accessor, - input_grad_accessor.get_float_ptr(), - output_grad_accessor.get_float_ptr()); - - std::vector backward_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor.get_float_ptr(), + input_grad_accessor.get_float_ptr()); - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(backward_output_data == expected_output_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 1a8cf5f82a..f0be809475 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -1,61 +1,107 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/gather_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { + +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Gather Forward and Backward Kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - Allocator allocator = create_local_cuda_memory_allocator(); GatherPerDeviceState state = {managed_handle.raw_handle(), - legion_dim_t{2_n}}; + legion_dim_t{0_n}}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50_n}); + SUBCASE("forward_kernel") { + auto run_forward_test = [&](TensorShape input_shape, + TensorShape index_shape, + TensorShape output_shape) { + GenericTensorAccessorR input_accessor = + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); - GenericTensorAccessorR index_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + Kernels::Gather::forward_kernel(managed_stream.raw_stream(), + state, + input_accessor, + index_accessor, + output_accessor); - SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - GenericTensorAccessorW output_accessor = - allocator.allocate_tensor(output_shape); - - Kernels::Gather::forward_kernel(managed_stream.raw_stream(), - state, - input_accessor, - index_accessor, - output_accessor); - - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test gather forward, 2D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 20_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 20_n}}, + DataType::FLOAT, + }; + run_forward_test(input_shape, index_shape, output_shape); + } + + SUBCASE("test 
gather forward, 1D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::FLOAT, + }; + run_forward_test(input_shape, index_shape, output_shape); + } } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); - GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); - - Kernels::Gather::backward_kernel(managed_stream.raw_stream(), - state, - output_grad_accessor, - index_accessor, - input_grad_accessor); - - std::vector host_input_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad_data)); + auto run_backward_test = [&](TensorShape input_shape, + TensorShape index_shape, + TensorShape output_shape) { + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW input_grad_accessor = + allocator.allocate_tensor(input_shape); + + Kernels::Gather::backward_kernel(managed_stream.raw_stream(), + state, + output_grad_accessor, + index_accessor, + input_grad_accessor); + CHECK(contains_non_zero(input_grad_accessor)); + }; + + SUBCASE("test gather backward, 2D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 25_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 25_n}}, + DataType::FLOAT, + }; + run_backward_test(input_shape, index_shape, output_shape); + } } } } diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 5386c1d943..02a95ba58a 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -1,23 +1,30 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/layer_norm_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test LayerNorm Forward and Backward Kernel") { nonnegative_int batch_size = 10_n; nonnegative_int feature_size = 10_n; float epsilon = 1e-5f; bool elementwise_affine = true; - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({batch_size, feature_size}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, feature_size}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; - TensorShape feature_shape = - make_float_tensor_shape_from_legion_dims({feature_size}); + TensorShape feature_shape = TensorShape{ + TensorDims{FFOrdered{feature_size}}, + DataType::FLOAT, + }; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -31,16 +38,15 @@ TEST_SUITE(FF_TEST_SUITE) { epsilon); GenericTensorAccessorR input_accessor = - 
read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - GenericTensorAccessorW gamma_accessor = - create_filled_accessor_w(feature_shape, allocator, 1.0f); + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorW gamma_accessor = create_filled_accessor_w( + feature_shape, allocator, make_float_data_type_value(1)); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - GenericTensorAccessorW beta_accessor = - create_filled_accessor_w(feature_shape, allocator, 0.0f); + GenericTensorAccessorW beta_accessor = create_filled_accessor_w( + feature_shape, allocator, make_float_data_type_value(0)); Kernels::LayerNorm::forward_kernel(managed_stream.raw_stream(), state, @@ -52,8 +58,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW gamma_grad_accessor = diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc new file mode 100644 index 0000000000..fb5920adcc --- /dev/null +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -0,0 +1,107 @@ +#include "internal/test_utils.h" +#include "kernels/gather_kernels.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Test ManagedFFStream") { + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + ManagedFFStream managed_stream{}; + Allocator allocator = create_local_cuda_memory_allocator(); + + GatherPerDeviceState state = {managed_handle.raw_handle(), + legion_dim_t{0_n}}; + + SUBCASE("forward_kernel") { + auto run_forward_test = [&](TensorShape const &input_shape, + TensorShape const &index_shape, + TensorShape const &output_shape) { + GenericTensorAccessorR input_accessor = + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); + + Kernels::Gather::forward_kernel(/*stream=*/managed_stream.raw_stream(), + /*per_device_state=*/state, + /*input=*/input_accessor, + /*index=*/index_accessor, + /*output=*/output_accessor); + + CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test gather forward, 2D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 20_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 20_n}}, + DataType::FLOAT, + }; + run_forward_test(input_shape, index_shape, output_shape); + } + + SUBCASE("test gather forward, 1D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::FLOAT, + }; + run_forward_test(input_shape, index_shape, output_shape); + } + } + + SUBCASE("backward_kernel") { + auto run_backward_test = [&](TensorShape const 
&input_shape, + TensorShape const &index_shape, + TensorShape const &output_shape) { + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW input_grad_accessor = + allocator.allocate_tensor(input_shape); + + Kernels::Gather::backward_kernel(/*stream=*/managed_stream.raw_stream(), + /*per_device_state=*/state, + /*output_grad=*/output_grad_accessor, + /*index=*/index_accessor, + /*input_grad=*/input_grad_accessor); + CHECK(contains_non_zero(input_grad_accessor)); + }; + + SUBCASE("test gather backward, 2D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 25_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 25_n}}, + DataType::FLOAT, + }; + run_backward_test(input_shape, index_shape, output_shape); + } + } + } +} diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc new file mode 100644 index 0000000000..fc67764cdb --- /dev/null +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -0,0 +1,37 @@ +#include "kernels/managed_per_device_ff_handle.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Test ManagedPerDeviceFFHandle") { + ManagedPerDeviceFFHandle base_handle{/*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); + + SUBCASE("constructor") { + CHECK(base_handle.raw_handle().workSpaceSize == 1024 * 1024); + CHECK(base_handle.raw_handle().allowTensorOpMathConversion == true); + } + + SUBCASE("move constructor") { + ManagedPerDeviceFFHandle new_handle(std::move(base_handle)); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } + + SUBCASE("move assignment operator") { + SUBCASE("move assign to other") { + ManagedPerDeviceFFHandle new_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + new_handle = std::move(base_handle); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } + + SUBCASE("move assign to self") { + base_handle = std::move(base_handle); + CHECK(&base_handle.raw_handle() == base_handle_ptr); + } + } + } +} diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 4fd1b53210..5452266dad 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -1,12 +1,15 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/partition_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Partition Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -14,48 +17,36 @@ TEST_SUITE(FF_TEST_SUITE) { RepartitionPerDeviceState state = Kernels::Repartition::init_kernel( managed_handle.raw_handle(), DataType::FLOAT); - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10_n, 10_n}); 
+ TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 10_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + GenericTensorAccessorR input_accessor = create_filled_accessor_r( + input_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Repartition::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 2.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(1)); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(2)); Kernels::Repartition::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); - - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor, + input_grad_accessor); - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 3.0f); - CHECK(host_grad_input_data == expected_grad_input_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 62b61707c6..f2ada8387e 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -1,9 +1,10 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/pool_2d_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Pool2D Forward and Backward Kernel") { nonnegative_int input_w = 10_n; nonnegative_int input_h = 10_n; @@ -22,7 +23,9 @@ TEST_SUITE(FF_TEST_SUITE) { PoolOp pool_type = PoolOp::MAX; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -46,10 +49,14 @@ TEST_SUITE(FF_TEST_SUITE) { /*stride_w=*/stride_w.unwrap_nonnegative(), /*pool_type=*/pool_type); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {input_w, input_h, input_c, input_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {output_w, output_h, output_c, output_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{input_n, input_c, input_h, input_w}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + 
TensorDims{FFOrdered{output_n, output_c, output_h, output_w}},
+        DataType::FLOAT,
+    };
 
     GenericTensorAccessorW input_accessor =
         create_random_filled_accessor_w(input_shape, allocator);
@@ -62,28 +69,23 @@ TEST_SUITE(FF_TEST_SUITE) {
                                      input_accessor.ptr,
                                      output_accessor.ptr);
 
-      std::vector<float> host_output_data =
-          load_data_to_host_from_device<float>(
-              read_only_accessor_from_write_accessor(output_accessor));
-      CHECK(contains_non_zero(host_output_data));
+      CHECK(contains_non_zero(output_accessor));
     }
 
     SUBCASE("backward_kernel") {
-      GenericTensorAccessorW output_grad_accessor =
-          create_filled_accessor_w(output_shape, allocator, 1.0f);
+      GenericTensorAccessorW output_grad_accessor = create_filled_accessor_w(
+          output_shape, allocator, make_float_data_type_value(1));
       GenericTensorAccessorW input_grad_accessor =
           allocator.allocate_tensor(input_shape);
 
       Kernels::Pool2D::backward_kernel(managed_stream.raw_stream(),
                                        state,
-                                       input_accessor.ptr,
-                                       input_grad_accessor.ptr,
                                        output_accessor.ptr,
-                                       output_grad_accessor.ptr);
+                                       output_grad_accessor.ptr,
+                                       input_accessor.ptr,
+                                       input_grad_accessor.ptr);
 
-      std::vector<float> host_input_grad = load_data_to_host_from_device<float>(
-          read_only_accessor_from_write_accessor(input_grad_accessor));
-      CHECK(contains_non_zero(host_input_grad));
+      CHECK(contains_non_zero(input_grad_accessor));
     }
   }
 }
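All of the kernel tests rewritten in this patch converge on the same harness: a TensorShape built inline, device inputs filled with create_random_filled_accessor_r, the kernel launched on a ManagedFFStream, and the result smoke-tested with contains_non_zero. For reference, a condensed sketch of that shared skeleton follows; it is illustrative only — the test-case name and the commented-out run_kernel_under_test call are placeholders, not code from this patch.

    #include "internal/test_utils.h"
    #include <doctest/doctest.h>

    using namespace ::FlexFlow;

    TEST_SUITE(FF_CUDA_TEST_SUITE) {
      TEST_CASE("Sketch: shared CUDA kernel test harness") {
        // One handle, stream, and device allocator per test case, as above.
        ManagedPerDeviceFFHandle managed_handle{
            /*workSpaceSize=*/1024 * 1024,
            /*allowTensorOpMathConversion=*/true};
        ManagedFFStream managed_stream{};
        Allocator allocator = create_local_cuda_memory_allocator();

        TensorShape shape = TensorShape{
            TensorDims{FFOrdered{10_n, 10_n}},
            DataType::FLOAT,
        };

        // Random device-side input, freshly allocated output.
        GenericTensorAccessorR input =
            create_random_filled_accessor_r(shape, allocator);
        GenericTensorAccessorW output = allocator.allocate_tensor(shape);

        // run_kernel_under_test(managed_stream.raw_stream(), input, output);

        // Smoke test: the kernel wrote something non-zero to the output.
        CHECK(contains_non_zero(output));
      }
    }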
diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc
index 04a3817b84..e13b149769 100644
--- a/lib/kernels/test/src/test_reduction_kernel.cc
+++ b/lib/kernels/test/src/test_reduction_kernel.cc
@@ -1,27 +1,33 @@
-#include "doctest/doctest.h"
+#include "internal/test_utils.h"
 #include "kernels/reduction_kernels.h"
-#include "test_utils.h"
+#include "op-attrs/datatype_value.h"
+#include <doctest/doctest.h>
 
 using namespace ::FlexFlow;
 
-TEST_SUITE(FF_TEST_SUITE) {
+TEST_SUITE(FF_CUDA_TEST_SUITE) {
   TEST_CASE("Test Reduction Forward and Backward Kernel") {
     std::size_t num_replicas = 5;
 
-    TensorShape input_shape = make_float_tensor_shape_from_legion_dims(
-        {10_n, 10_n, 10_n, 10_n, 10_n});
+    TensorShape input_shape = TensorShape{
+        TensorDims{FFOrdered{10_n, 10_n, 10_n, 10_n, 10_n}},
+        DataType::FLOAT,
+    };
 
-    ManagedPerDeviceFFHandle managed_handle{};
+    ManagedPerDeviceFFHandle managed_handle{
+        /*workSpaceSize=*/1024 * 1024,
+        /*allowTensorOpMathConversion=*/true};
     ManagedFFStream managed_stream{};
 
     Allocator allocator = create_local_cuda_memory_allocator();
 
     SUBCASE("forward_kernel") {
-      TensorShape output_shape =
-          make_float_tensor_shape_from_legion_dims({10_n});
+      TensorShape output_shape = TensorShape{
+          TensorDims{FFOrdered{10_n}},
+          DataType::FLOAT,
+      };
 
       GenericTensorAccessorR input_accessor =
-          read_only_accessor_from_write_accessor(
-              create_random_filled_accessor_w(input_shape, allocator));
+          create_random_filled_accessor_r(input_shape, allocator);
       GenericTensorAccessorW output_accessor =
           allocator.allocate_tensor(output_shape);
 
@@ -30,30 +36,22 @@ TEST_SUITE(FF_TEST_SUITE) {
                                          output_accessor,
                                          num_replicas);
 
-      std::vector<float> host_output_data =
-          load_data_to_host_from_device<float>(
-              read_only_accessor_from_write_accessor(output_accessor));
-      CHECK(contains_non_zero(host_output_data));
+      CHECK(contains_non_zero(output_accessor));
     }
 
     SUBCASE("backward_kernel") {
       TensorShape output_shape = input_shape;
 
-      GenericTensorAccessorR output_grad_accessor =
-          read_only_accessor_from_write_accessor(
-              create_filled_accessor_w(output_shape, allocator, 1.0f));
+      GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r(
+          output_shape, allocator, make_float_data_type_value(1));
       GenericTensorAccessorW input_grad_accessor =
           allocator.allocate_tensor(input_shape);
 
       Kernels::Reduction::backward_kernel(managed_stream.raw_stream(),
-                                          input_grad_accessor,
-                                          output_grad_accessor);
-
-      std::vector<float> expected_grad_input_data(
-          input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f);
-      std::vector<float> host_grad_data = load_data_to_host_from_device<float>(
-          read_only_accessor_from_write_accessor(input_grad_accessor));
-      CHECK(host_grad_data == expected_grad_input_data);
+                                          output_grad_accessor,
+                                          input_grad_accessor);
+
+      CHECK(contains_non_zero(input_grad_accessor));
     }
   }
 }
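Context for the replicate test that follows: Replicate's backward pass reduces the replicated gradient back down to the input shape by summing over replicas, which is what the test's hand-computed expectation ({1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}) encodes. The sketch below spells that reduction out over plain buffers; replicate_backward_reference is an illustrative name, not a function in this patch, and it assumes replicas are laid out contiguously per element, which is the layout the test's row-wise sums imply.

    #include <cstddef>

    // CPU reference for Replicate's backward pass: each input gradient is the
    // sum of that element's gradient across all replicas.
    void replicate_backward_reference(float const *output_grad,
                                      float *input_grad,
                                      std::size_t num_elements,
                                      std::size_t num_replicas) {
      for (std::size_t i = 0; i < num_elements; i++) {
        float total = 0.0f;
        for (std::size_t r = 0; r < num_replicas; r++) {
          // Replica r of element i, e.g. {1, 2, 3} -> 6, {4, 3, 3} -> 10,
          // {1, 3, 5} -> 9 in the test below.
          total += output_grad[i * num_replicas + r];
        }
        input_grad[i] = total;
      }
    }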
diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc
index fa726898f2..83a9a992f7 100644
--- a/lib/kernels/test/src/test_replicate_kernel.cc
+++ b/lib/kernels/test/src/test_replicate_kernel.cc
@@ -1,55 +1,150 @@
-#include "doctest/doctest.h"
+#include "internal/test_utils.h"
+#include "kernels/format_accessor_contents.h"
 #include "kernels/replicate_kernels.h"
-#include "test_utils.h"
+#include "kernels/replicate_kernels_cpu.h"
+#include "test/utils/doctest/check_kv.h"
+#include <doctest/doctest.h>
 
 using namespace ::FlexFlow;
 
-TEST_SUITE(FF_TEST_SUITE) {
-  TEST_CASE("Test Replicate Kernel") {
-    nonnegative_int num_replicas = 10_n;
+
+TEST_SUITE(FF_CUDA_TEST_SUITE) {
+  TEST_CASE("Call Replicate Forward and Backward Kernels") {
+    nonnegative_int num_replicas = 3_n;
 
-    TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n});
-    TensorShape output_shape = input_shape;
+    TensorShape input_shape = TensorShape{
+        TensorDims{FFOrdered{3_n}},
+        DataType::FLOAT,
+    };
+    TensorShape output_shape = TensorShape{
+        TensorDims{FFOrdered{3_n}},
+        DataType::FLOAT,
+    };
 
-    ManagedPerDeviceFFHandle managed_handle{};
+    ManagedPerDeviceFFHandle managed_handle{
+        /*workSpaceSize=*/1024 * 1024,
+        /*allowTensorOpMathConversion=*/true};
     ManagedFFStream managed_stream{};
 
-    Allocator allocator = create_local_cuda_memory_allocator();
+    Allocator gpu_allocator = create_local_cuda_memory_allocator();
+    Allocator cpu_allocator = create_local_cpu_memory_allocator();
 
     SUBCASE("forward_kernel") {
-      GenericTensorAccessorR input_accessor =
-          read_only_accessor_from_write_accessor(
-              create_filled_accessor_w(input_shape, allocator, 1.0f));
-      GenericTensorAccessorW output_accessor =
-          allocator.allocate_tensor(output_shape);
+      GenericTensorAccessorR input =
+          create_1d_accessor_r_with_contents({1, 3, 2}, gpu_allocator);
+
+      GenericTensorAccessorW output =
+          gpu_allocator.allocate_tensor(output_shape);
 
       Kernels::Replicate::forward_kernel(
-          managed_stream.raw_stream(), input_accessor, output_accessor);
+          managed_stream.raw_stream(), input, output);
 
-      std::vector<float> check_output_data =
-          load_data_to_host_from_device<float>(
-              read_only_accessor_from_write_accessor(output_accessor));
+      GenericTensorAccessorR correct = input;
 
-      std::vector<float> expected_output_data(
-          input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f);
-      CHECK(check_output_data == expected_output_data);
+      CHECK_MESSAGE(accessors_are_equal(output, correct),
+                    check_kv("output", format_accessor_w_contents(output)));
     }
 
     SUBCASE("backward_kernel") {
-      GenericTensorAccessorW input_grad_accessor =
-          create_filled_accessor_w(input_shape, allocator, 1.0f);
-      GenericTensorAccessorR output_grad_accessor =
-          read_only_accessor_from_write_accessor(
-              create_filled_accessor_w(output_shape, allocator, 1.0f));
+      GenericTensorAccessorR output_grad = create_2d_accessor_r_with_contents(
+          {
+              {1, 2, 3},
+              {4, 3, 3},
+              {1, 3, 5},
+          },
+          gpu_allocator);
+
+      GenericTensorAccessorR correct = create_1d_accessor_r_with_contents(
+          {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator);
+
+      GenericTensorAccessorW input_grad =
+          gpu_allocator.allocate_tensor(input_shape);
 
       Kernels::Replicate::backward_kernel(managed_stream.raw_stream(),
-                                          input_grad_accessor,
-                                          output_grad_accessor,
+                                          output_grad,
+                                          input_grad,
                                           num_replicas.unwrap_nonnegative());
 
-      std::vector<float> check_aggregated_data =
-          load_data_to_host_from_device<float>(
-              read_only_accessor_from_write_accessor(input_grad_accessor));
-      CHECK(contains_non_zero(check_aggregated_data));
+      CHECK_MESSAGE(
+          accessors_are_equal(input_grad, correct),
+          check_kv("input_grad", format_accessor_w_contents(input_grad)));
+    }
+  }
+
+  TEST_CASE("Check Replicate Forward and Backward Kernel against CPU Kernel") {
+    nonnegative_int num_replicas = 2_n;
+
+    TensorShape input_shape = TensorShape{
+        TensorDims{FFOrdered{5_n}},
+        DataType::FLOAT,
+    };
+    TensorShape output_shape = TensorShape{
+        TensorDims{FFOrdered{5_n, num_replicas}},
+        DataType::FLOAT,
+    };
+
+    ManagedPerDeviceFFHandle managed_handle{
+        /*workSpaceSize=*/1024 * 1024,
+        /*allowTensorOpMathConversion=*/true};
+    ManagedFFStream managed_stream{};
+
+    Allocator gpu_allocator = create_local_cuda_memory_allocator();
+    Allocator cpu_allocator = create_local_cpu_memory_allocator();
+
+    SUBCASE("forward_kernel") {
+      // Run GPU Replicate Forward Kernel
+      GenericTensorAccessorR input_accessor_gpu =
+          create_random_filled_accessor_r(input_shape, gpu_allocator);
+      GenericTensorAccessorW output_accessor_gpu =
+          create_zero_filled_accessor_w(output_shape, gpu_allocator);
+
+      Kernels::Replicate::forward_kernel(
+          managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu);
+
+      // Run CPU Replicate Forward Kernel
+      GenericTensorAccessorR input_accessor_cpu =
+          copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator);
+      GenericTensorAccessorW output_accessor_cpu =
+          create_zero_filled_accessor_w(output_shape, cpu_allocator);
+
+      Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu,
+                                             output_accessor_cpu);
+
+      CHECK_MESSAGE(
+          accessors_are_equal(output_accessor_gpu, output_accessor_cpu),
+          check_kv("input", format_accessor_r_contents(input_accessor_cpu)),
+          check_kv("gpu", format_accessor_w_contents(output_accessor_gpu)),
+          check_kv("cpu", format_accessor_w_contents(output_accessor_cpu)));
+    }
+
+    SUBCASE("backward_kernel") {
+      // Run GPU Replicate Backward Kernel
+      GenericTensorAccessorR output_grad_accessor_gpu =
+          create_random_filled_accessor_r(output_shape, gpu_allocator);
+      GenericTensorAccessorW input_grad_accessor_gpu =
+          create_zero_filled_accessor_w(input_shape, gpu_allocator);
+
+      Kernels::Replicate::backward_kernel(managed_stream.raw_stream(),
+                                          output_grad_accessor_gpu,
+                                          input_grad_accessor_gpu,
+                                          num_replicas.unwrap_nonnegative());
+
+      // Run CPU Replicate Backward Kernel
+      GenericTensorAccessorR output_grad_accessor_cpu =
+          copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator);
+      GenericTensorAccessorW input_grad_accessor_cpu =
+          create_zero_filled_accessor_w(input_shape, cpu_allocator);
+
+      Kernels::Replicate::cpu_backward_kernel(
+          output_grad_accessor_cpu,
+          input_grad_accessor_cpu,
+          num_replicas.unwrap_nonnegative());
+
+      CHECK_MESSAGE(
+          accessors_are_equal(input_grad_accessor_gpu, input_grad_accessor_cpu),
+          check_kv("output_grad",
+                   format_accessor_r_contents(output_grad_accessor_cpu)),
+          check_kv("gpu", format_accessor_w_contents(input_grad_accessor_gpu)),
+          check_kv("cpu", format_accessor_w_contents(input_grad_accessor_cpu)));
+    }
} } diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index d329a347b3..66c6bf849b 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -1,16 +1,21 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/reshape_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Reshape Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; ReshapePerDeviceState state = @@ -18,42 +23,28 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Reshape::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 2.0f); + allocator.allocate_tensor(input_shape); Kernels::Reshape::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); - - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor, + input_grad_accessor); - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 3.0f); - CHECK(host_grad_input_data == expected_grad_input_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 9c8475f6d6..6e12c48ac3 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -1,63 +1,124 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/reverse_kernels.h" -#include "test_utils.h" +#include "kernels/reverse_kernels_cpu.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Call Reverse Forward and Backward Kernels") { - nonnegative_int reverse_dim_size = 10_n; - nonnegative_int in_blk_size = 10_n; - nonnegative_int num_out_blks = 1_n; - - TensorShape input_shape = 
make_float_tensor_shape_from_legion_dims({100_n});
+    TensorShape input_shape = TensorShape{
+        TensorDims{FFOrdered{1_n, 10_n, 10_n}},
+        DataType::FLOAT,
+    };
     TensorShape output_shape = input_shape;
 
-    ManagedPerDeviceFFHandle managed_handle{};
+    ManagedPerDeviceFFHandle managed_handle{
+        /*workSpaceSize=*/1024 * 1024,
+        /*allowTensorOpMathConversion=*/true};
     ManagedFFStream managed_stream{};
 
     Allocator allocator = create_local_cuda_memory_allocator();
 
+    ReverseAttrs attrs = ReverseAttrs{
+        /*axis=*/ff_dim_t{0_n},
+    };
+
     SUBCASE("forward_kernel") {
       GenericTensorAccessorR input_accessor =
-          read_only_accessor_from_write_accessor(
-              create_filled_accessor_w(input_shape, allocator, 1.0f));
+          read_only_accessor_from_write_accessor(create_filled_accessor_w(
+              input_shape, allocator, make_float_data_type_value(1)));
       GenericTensorAccessorW output_accessor =
           allocator.allocate_tensor(output_shape);
 
       Kernels::Reverse::forward_kernel(
-          managed_stream.raw_stream(),
-          input_accessor.get_float_ptr(),
-          output_accessor.get_float_ptr(),
-          num_out_blks.unwrap_nonnegative(),
-          reverse_dim_size.unwrap_nonnegative(),
-          in_blk_size.unwrap_nonnegative(),
-          input_accessor.shape.num_elements().unwrap_nonnegative());
-
-      std::vector<float> check_output_data =
-          load_data_to_host_from_device<float>(
-              read_only_accessor_from_write_accessor(output_accessor));
-      CHECK(contains_non_zero(check_output_data));
+          managed_stream.raw_stream(), input_accessor, output_accessor, attrs);
+
+      CHECK(contains_non_zero(output_accessor));
     }
 
     SUBCASE("backward_kernel") {
       GenericTensorAccessorW output_grad_accessor =
           create_random_filled_accessor_w(output_shape, allocator);
       GenericTensorAccessorW input_grad_accessor =
-          create_random_filled_accessor_w(input_shape, allocator);
-
-      Kernels::Reverse::backward_kernel(
-          managed_stream.raw_stream(),
-          output_grad_accessor.get_float_ptr(),
-          input_grad_accessor.get_float_ptr(),
-          num_out_blks.unwrap_nonnegative(),
-          reverse_dim_size.unwrap_nonnegative(),
-          in_blk_size.unwrap_nonnegative(),
-          input_grad_accessor.shape.num_elements().unwrap_nonnegative());
-
-      std::vector<float> host_grad_input_data =
-          load_data_to_host_from_device<float>(
-              read_only_accessor_from_write_accessor(input_grad_accessor));
-      CHECK(contains_non_zero(host_grad_input_data));
+          allocator.allocate_tensor(input_shape);
+
+      Kernels::Reverse::backward_kernel(managed_stream.raw_stream(),
+                                        output_grad_accessor,
+                                        input_grad_accessor,
+                                        attrs);
+
+      CHECK(contains_non_zero(input_grad_accessor));
+    }
+  }
+
+  TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") {
+    TensorShape input_shape = TensorShape{
+        TensorDims{FFOrdered{1_n, 4_n, 3_n}},
+        DataType::FLOAT,
+    };
+    TensorShape output_shape = input_shape;
+
+    ManagedPerDeviceFFHandle managed_handle{
+        /*workSpaceSize=*/1024 * 1024,
+        /*allowTensorOpMathConversion=*/true};
+    ManagedFFStream managed_stream{};
+
+    Allocator gpu_allocator = create_local_cuda_memory_allocator();
+    Allocator cpu_allocator = create_local_cpu_memory_allocator();
+
+    ReverseAttrs attrs = ReverseAttrs{
+        /*axis=*/ff_dim_t{0_n},
+    };
+
+    SUBCASE("forward_kernel") {
+      // Run GPU Reverse Forward Kernel
+      GenericTensorAccessorR input_accessor_gpu =
+          create_random_filled_accessor_r(input_shape, gpu_allocator);
+      GenericTensorAccessorW output_accessor_gpu =
+          create_zero_filled_accessor_w(output_shape, gpu_allocator);
+
+      Kernels::Reverse::forward_kernel(managed_stream.raw_stream(),
+                                       input_accessor_gpu,
+                                       output_accessor_gpu,
+                                       attrs);
+
+      // Run CPU Reverse Forward Kernel
+      GenericTensorAccessorR input_accessor_cpu =
+          copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator);
+      GenericTensorAccessorW output_accessor_cpu =
+          create_zero_filled_accessor_w(output_shape, cpu_allocator);
+
+      Kernels::Reverse::cpu_forward_kernel(
+          input_accessor_cpu, output_accessor_cpu, attrs);
+
+      CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu));
+    }
+
+    SUBCASE("backward_kernel") {
+      // Run GPU Reverse Backward Kernel
+      GenericTensorAccessorR output_grad_accessor_gpu =
+          create_random_filled_accessor_r(output_shape, gpu_allocator);
+
+      GenericTensorAccessorW input_grad_accessor_gpu =
+          create_zero_filled_accessor_w(input_shape, gpu_allocator);
+
+      Kernels::Reverse::backward_kernel(managed_stream.raw_stream(),
+                                        output_grad_accessor_gpu,
+                                        input_grad_accessor_gpu,
+                                        attrs);
+
+      // Run CPU Reverse Backward Kernel
+      GenericTensorAccessorR output_grad_accessor_cpu =
+          copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator);
+      GenericTensorAccessorW input_grad_accessor_cpu =
+          create_zero_filled_accessor_w(input_shape, cpu_allocator);
+
+      Kernels::Reverse::cpu_backward_kernel(
+          output_grad_accessor_cpu, input_grad_accessor_cpu, attrs);
+
+      CHECK(accessors_are_equal(input_grad_accessor_gpu,
+                                input_grad_accessor_cpu));
+    }
+  }
 }
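The replicate and reverse tests above also introduce a cross-checking pattern: every CUDA kernel that has a cpu_* counterpart is replayed on identical inputs through the CPU kernel, and the two results are compared with accessors_are_equal. A condensed sketch of that pattern, assuming the surrounding fixture from the tests above (input_shape, output_shape, gpu_allocator, cpu_allocator, managed_stream), with Kernels::SomeOp standing in for whichever kernel is under test:

    // GPU side: random input, zero-filled output, kernel under test.
    GenericTensorAccessorR input_gpu =
        create_random_filled_accessor_r(input_shape, gpu_allocator);
    GenericTensorAccessorW output_gpu =
        create_zero_filled_accessor_w(output_shape, gpu_allocator);
    // Kernels::SomeOp::forward_kernel(managed_stream.raw_stream(),
    //                                 input_gpu, output_gpu, attrs);

    // CPU side: copy the *same* input into host memory so both paths see
    // identical data, then run the CPU reference kernel.
    GenericTensorAccessorR input_cpu =
        copy_tensor_accessor_r(input_gpu, cpu_allocator);
    GenericTensorAccessorW output_cpu =
        create_zero_filled_accessor_w(output_shape, cpu_allocator);
    // Kernels::SomeOp::cpu_forward_kernel(input_cpu, output_cpu, attrs);

    // The GPU result must match the CPU reference elementwise.
    CHECK(accessors_are_equal(output_gpu, output_cpu));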
diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc
index c9eaa76b86..904cca2d3e 100644
--- a/lib/kernels/test/src/test_softmax_kernel.cc
+++ b/lib/kernels/test/src/test_softmax_kernel.cc
@@ -1,10 +1,10 @@
-#include "doctest/doctest.h"
+#include "internal/test_utils.h"
 #include "kernels/softmax_kernels.h"
-#include "test_utils.h"
+#include <doctest/doctest.h>
 
 using namespace ::FlexFlow;
 
-TEST_SUITE(FF_TEST_SUITE) {
+TEST_SUITE(FF_CUDA_TEST_SUITE) {
   TEST_CASE("Test Softmax Kernel Operations") {
     nonnegative_int input_n = 1_n;
     nonnegative_int input_c = 1_n;
@@ -12,12 +12,17 @@ TEST_SUITE(FF_TEST_SUITE) {
     nonnegative_int input_w = 100_n;
     nonnegative_int channels = 100_n;
 
-    ManagedPerDeviceFFHandle managed_handle{};
+    ManagedPerDeviceFFHandle managed_handle{
+        /*workSpaceSize=*/1024 * 1024,
+        /*allowTensorOpMathConversion=*/true};
     ManagedFFStream managed_stream{};
 
     Allocator allocator = create_local_cuda_memory_allocator();
 
-    TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n});
+    TensorShape input_shape = TensorShape{
+        TensorDims{FFOrdered{100_n}},
+        DataType::FLOAT,
+    };
     TensorShape output_shape = input_shape;
 
     SoftmaxPerDeviceState state =
@@ -40,30 +45,22 @@ TEST_SUITE(FF_TEST_SUITE) {
                                        input_accessor.get_float_ptr(),
                                        output_accessor.get_float_ptr());
 
-      std::vector<float> host_output_data =
-          load_data_to_host_from_device<float>(
-              read_only_accessor_from_write_accessor(output_accessor));
-      CHECK(contains_non_zero(host_output_data));
+      CHECK(contains_non_zero(output_accessor));
     }
 
     SUBCASE("backward_kernel") {
-      GenericTensorAccessorW output_grad_accessor =
-          create_filled_accessor_w(output_shape, allocator, 1.0f);
+      GenericTensorAccessorR output_grad_accessor =
+          create_random_filled_accessor_r(output_shape, allocator);
       GenericTensorAccessorW input_grad_accessor =
          allocator.allocate_tensor(input_shape);
 
       Kernels::Softmax::backward_kernel(
           managed_stream.raw_stream(),
-          input_grad_accessor.get_float_ptr(),
           output_grad_accessor.get_float_ptr(),
+          input_grad_accessor.get_float_ptr(),
           output_grad_accessor.shape.num_elements().unwrap_nonnegative());
 
-      std::vector<float> expected_input_grad_data = std::vector<float>(
-          input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f);
-      std::vector<float> host_input_grad_data =
-          load_data_to_host_from_device<float>(
-
read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(host_input_grad_data == expected_input_grad_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index ea0d280f68..44e8f42f76 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -1,24 +1,33 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/split_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" #include "utils/containers/repeat.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Split Forward and Backward Kernel") { nonnegative_int num_outputs = 2_n; coord_t out_blk_sizes[] = {50, 50}; coord_t in_blk_size = 100; coord_t num_blks = 1; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{50_n}}, + DataType::FLOAT, + }; SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = @@ -47,8 +56,8 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_ptrs[i] = output_grad_accessor.get_float_ptr(); } - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 0.0f); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(0)); Kernels::Split::backward_kernel(managed_stream.raw_stream(), input_grad_accessor.get_float_ptr(), diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index 02d99c86a1..3c15661396 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -1,58 +1,54 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/transpose_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Transpose Kernel Operations") { TransposeAttrs attrs = TransposeAttrs{ - FFOrdered{ - ff_dim_t{0_n}, + FFOrdered{ ff_dim_t{1_n}, + ff_dim_t{0_n}, }, }; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10_n, 10_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 10_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Transpose::forward_kernel( managed_stream.raw_stream(), attrs, 
input_accessor, output_accessor); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), attrs, - input_grad_accessor, - output_grad_accessor); + output_grad_accessor, + input_grad_accessor); - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_grad_input_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc deleted file mode 100644 index 903b666fa9..0000000000 --- a/lib/kernels/test/src/test_utils.cc +++ /dev/null @@ -1,106 +0,0 @@ -#include "test_utils.h" - -GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume); - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution dist(-1.0f, 1.0f); - - for (auto &val : host_data) { - val = dist(gen); - } - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); - } - - return accessor; -} - -GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - float val, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume, val); - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); - } - - return accessor; -} - -GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume); - - for (size_t i = 0; i < volume; i++) { - host_data[i] = i; - } - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); - } - - return accessor; -} - -void fill_tensor_accessor_w(GenericTensorAccessorW accessor, - float val, - bool cpu_fill) { - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume, val); - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - 
cudaMemcpyHostToDevice)); - } -} - -TensorShape - make_float_tensor_shape_from_legion_dims(FFOrdered dims) { - return TensorShape{ - TensorDims{ - dims, - }, - DataType::FLOAT, - }; -} - -TensorShape - make_double_tensor_shape_from_legion_dims(FFOrdered dims) { - return TensorShape{ - TensorDims{ - dims, - }, - DataType::DOUBLE, - }; -} diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h deleted file mode 100644 index 08f0f382fb..0000000000 --- a/lib/kernels/test/src/test_utils.h +++ /dev/null @@ -1,72 +0,0 @@ -#ifndef _FLEXFLOW_KERNELS_TEST_UTILS -#define _FLEXFLOW_KERNELS_TEST_UTILS - -#include "kernels/device.h" -#include "kernels/local_cuda_allocator.h" -#include "kernels/managed_ff_stream.h" -#include "kernels/managed_per_device_ff_handle.h" -#include -#include -#include -#include -#include - -using namespace FlexFlow; - -GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill = false); - -GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - float val, - bool cpu_fill = false); - -GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill = false); - -void fill_tensor_accessor_w(GenericTensorAccessorW accessor, - float val, - bool cpu_fill = false); - -TensorShape - make_float_tensor_shape_from_legion_dims(FFOrdered dims); - -TensorShape - make_double_tensor_shape_from_legion_dims(FFOrdered dims); - -template -std::vector load_data_to_host_from_device(GenericTensorAccessorR accessor) { - int volume = accessor.shape.get_volume(); - - std::vector local_data(volume); - checkCUDA(cudaMemcpy(local_data.data(), - accessor.ptr, - local_data.size() * sizeof(T), - cudaMemcpyDeviceToHost)); - return local_data; -} - -template -bool contains_non_zero(std::vector &data) { - return !all_of( - data.begin(), data.end(), [](T const &val) { return val == 0; }); -} - -// Specialize doctest's StringMaker for std::vector -template <> -struct doctest::StringMaker> { - static doctest::String convert(std::vector const &vec) { - std::ostringstream oss; - for (size_t i = 0; i < vec.size(); ++i) { - oss << vec[i]; - if (i != vec.size() - 1) { - oss << ", "; - } - } - return doctest::String(("[" + oss.str() + "]").c_str()); - } -}; - -#endif diff --git a/lib/local-execution/include/local-execution/per_device_op_state.h b/lib/local-execution/include/local-execution/per_device_op_state.h index 1edd5b6360..f1f357a86e 100644 --- a/lib/local-execution/include/local-execution/per_device_op_state.h +++ b/lib/local-execution/include/local-execution/per_device_op_state.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H #define _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H +#include "kernels/per_device_op_state.dtg.h" #include "local-execution/device_specific_device_states.dtg.h" -#include "local-execution/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 54c8dfc5f1..48584588e3 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H +#include "kernels/per_device_op_state.dtg.h" #include 
"local-execution/device_specific.h" #include "local-execution/itask_argument_accessor.h" -#include "local-execution/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h index 731e04fdc8..f697337c52 100644 --- a/lib/local-execution/include/local-execution/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/tracked_allocator.h @@ -13,6 +13,9 @@ struct TrackedAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + + DeviceType get_allocation_device_type() const override; + size_t get_current_mem_usage(); private: diff --git a/lib/local-execution/src/local_task_argument_accessor.cc b/lib/local-execution/src/local_task_argument_accessor.cc index 54eca7e514..5d099c6b46 100644 --- a/lib/local-execution/src/local_task_argument_accessor.cc +++ b/lib/local-execution/src/local_task_argument_accessor.cc @@ -24,8 +24,8 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( auto tensor_backing = std::get( this->tensor_slots_backing.at(slot_grad_pair)); if (priv == Permissions::RO) { - GenericTensorAccessorR readonly_tensor_backing = { - tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}; + GenericTensorAccessorR readonly_tensor_backing = + read_only_accessor_from_write_accessor(tensor_backing); return readonly_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { return tensor_backing; @@ -33,6 +33,7 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( throw mk_runtime_error(fmt::format("Unhandled privilege mode {}", priv)); } } + VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( slot_id_t slot, Permissions priv, IsGrad is_grad) const { SlotGradId slot_grad_pair = SlotGradId{slot, is_grad}; @@ -43,7 +44,7 @@ VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( for (GenericTensorAccessorW const &tensor_backing : variadic_tensor_backing) { readonly_variadic_tensor_backing.push_back( - {tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}); + read_only_accessor_from_write_accessor(tensor_backing)); } return readonly_variadic_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc index 1df6da8d8e..5cf8742918 100644 --- a/lib/local-execution/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -134,9 +134,9 @@ static std::optional profiling, "[BatchNorm] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - output_grad.get_float_ptr(), output.get_float_ptr(), + output_grad.get_float_ptr(), + input.get_float_ptr(), input_grad.get_float_ptr(), scale.get_float_ptr(), scale_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc index 3e7baf49a9..e9adf88422 100644 --- a/lib/local-execution/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -54,9 +54,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { profiling, "[Cast] forward_time = {:.2lf}ms\n", input, - output, - input.data_type, - attrs.dtype); + output); } static std::optional @@ -73,9 +71,7 @@ static std::optional profiling, "[Cast] forward_time = {:.2lf}ms\n", input_grad, - output_grad, - input.data_type, - attrs.dtype); + output_grad); } TaskImplFunction 
get_cast_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc index bb1504a3f5..55ff354483 100644 --- a/lib/local-execution/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -107,8 +107,8 @@ static std::optional acc.get_argument(PER_DEVICE_STATE); auto attrs = acc.get_argument(ATTRS); - auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); auto filter = acc.get_tensor(FILTER); auto input_grad = acc.get_tensor_grad(INPUT); @@ -120,10 +120,10 @@ static std::optional profiling, "[Conv2d] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), filter.get_float_ptr(), filter_grad.get_float_ptr(), bias_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc index c5ff9199f3..311b8e7924 100644 --- a/lib/local-execution/src/ops/element_unary.cc +++ b/lib/local-execution/src/ops/element_unary.cc @@ -58,8 +58,10 @@ static DeviceSpecificDeviceStates ParallelTensorShape output_shape = throw_if_unexpected(get_output_shape(attrs, input_shape)); - ElementUnaryPerDeviceState per_device_state = init_kernel( - get_piece_shape(input_shape), get_piece_shape(output_shape), attrs); + ElementUnaryPerDeviceState per_device_state = + init_kernel(array_shape_from_tensor_shape(get_piece_shape(input_shape)), + array_shape_from_tensor_shape(get_piece_shape(output_shape)), + attrs); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; @@ -88,10 +90,10 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor_grad(INPUT); auto const &attrs = acc.get_argument(ATTRS); auto handle = acc.get_argument(HANDLE); @@ -106,10 +108,10 @@ static std::optional per_device_state, attrs, handle, - input, - input_grad, output, - output_grad); + output_grad, + input, + input_grad); } TaskImplFunction get_element_unary_init_task_impl() { diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc index 0f872b5d50..af6fc16272 100644 --- a/lib/local-execution/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -40,15 +40,15 @@ static std::optional ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); return profile(backward_kernel, profiling, "[Flat] backward_time = {:.2lf}ms\n", input, - input_grad.get_float_ptr(), - output_grad.get_float_ptr()); + output_grad.get_float_ptr(), + input_grad.get_float_ptr()); } TaskImplFunction get_flat_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 6f0901e66a..9641cdbd4a 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -26,9 +26,9 @@ OpTaskInvocation init(LinearAttrs const &attrs) { binding.bind_arg(HANDLE, ff_handle()); binding.bind_arg(ATTRS, attrs); - 
binding.bind(INPUT, input_tensor(0)); // input - binding.bind(WEIGHT, weight_tensor(0)); // weight - binding.bind(OUTPUT, output_tensor(0)); // output + binding.bind(INPUT, input_tensor(0)); + binding.bind(WEIGHT, weight_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); return {task_id_t::LINEAR_INIT_TASK_ID, binding}; } @@ -36,11 +36,11 @@ OpTaskInvocation init(LinearAttrs const &attrs) { OpTaskInvocation forward(LinearAttrs const &attrs) { OpTaskBinding binding; - binding.bind(INPUT, input_tensor(0)); // input - binding.bind(WEIGHT, weight_tensor(0)); // weight - binding.bind(OUTPUT, output_tensor(0)); // output + binding.bind(INPUT, input_tensor(0)); + binding.bind(WEIGHT, weight_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); if (attrs.use_bias) { - binding.bind(BIAS, weight_tensor(1)); // bias + binding.bind(BIAS, weight_tensor(1)); } binding.bind_arg(PROFILING, profiling_settings()); @@ -124,20 +124,21 @@ static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); - auto output = acc.get_tensor(OUTPUT); - auto bias = acc.get_tensor(BIAS); + auto output = acc.get_tensor(OUTPUT); auto input_grad = acc.get_tensor_grad(INPUT); auto weight_grad = acc.get_tensor_grad(WEIGHT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); + auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto attrs = acc.get_argument(ATTRS); - float const *bias_ptr = NULL; + float *bias_grad_ptr = NULL; if (attrs.use_bias) { - bias_ptr = bias.get_float_ptr(); + auto bias_grad = acc.get_tensor_grad(BIAS); + bias_grad_ptr = bias_grad.get_float_ptr(); } nonnegative_int in_dim = input.shape.at(ff_dim_t{0_n}); @@ -148,13 +149,13 @@ static std::optional profiling, "[Linear] backward_time = {:.2lf}ms\n", per_device_state, - (void *)input.get_float_ptr(), - (void *)input_grad.get_float_ptr(), - (void *)output.get_float_ptr(), - (void *)output_grad.get_float_ptr(), - (void *)weight.get_float_ptr(), - (void *)weight_grad.get_float_ptr(), - (void *)bias_ptr, + output.get_float_ptr(), + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + bias_grad_ptr, in_dim.unwrap_nonnegative(), out_dim.unwrap_nonnegative(), batch_size.unwrap_nonnegative()); diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc index fb0635efba..f85874dc0a 100644 --- a/lib/local-execution/src/ops/pool_2d.cc +++ b/lib/local-execution/src/ops/pool_2d.cc @@ -115,19 +115,19 @@ static std::optional Pool2DPerDeviceState state = acc.get_argument(PER_DEVICE_STATE); - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor(INPUT); return profile(backward_kernel, profiling, "[Pool2D] backward_time = {:.2lf}ms\n", state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), - output_grad.get_float_ptr()); + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr()); } TaskImplFunction get_pool_2d_init_task_impl() { diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc index ee1a7c6c4e..b07d9fe965 100644 --- a/lib/local-execution/src/ops/reduction.cc +++ 
b/lib/local-execution/src/ops/reduction.cc
@@ -63,13 +63,13 @@ static std::optional<float>
     backward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument(PROFILING);
 
-  auto input_grad = acc.get_tensor_grad(INPUT);
   auto output_grad = acc.get_tensor_grad(OUTPUT);
+  auto input_grad = acc.get_tensor_grad(INPUT);
 
   return profile(backward_kernel,
                  profiling,
                  "[Reduction] backward_time = {:.2lf}ms\n",
-                 input_grad,
-                 output_grad);
+                 output_grad,
+                 input_grad);
 }
 
 TaskImplFunction get_reduction_fwd_task_impl() {
diff --git a/lib/local-execution/src/ops/repartition.cc b/lib/local-execution/src/ops/repartition.cc
index 6c0c813c8d..7b6e9fe2f6 100644
--- a/lib/local-execution/src/ops/repartition.cc
+++ b/lib/local-execution/src/ops/repartition.cc
@@ -85,8 +85,8 @@ static std::optional<float>
   ProfilingSettings profiling = acc.get_argument(PROFILING);
   auto per_device_state = acc.get_argument(PER_DEVICE_STATE);
 
-  auto input_grad = acc.get_tensor_grad(INPUT);
-  auto output_grad = acc.get_tensor_grad(OUTPUT);
+  auto output_grad = acc.get_tensor_grad(OUTPUT);
+  auto input_grad = acc.get_tensor_grad(INPUT);
 
   return profile(backward_kernel,
                  profiling,
diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc
index d3ada35d93..99aeb913ba 100644
--- a/lib/local-execution/src/ops/replicate.cc
+++ b/lib/local-execution/src/ops/replicate.cc
@@ -66,8 +66,8 @@ static std::optional<float>
   return profile(backward_kernel,
                  profiling,
                  "[replicate] backward_time = {:.2lf}ms\n",
-                 input_grad,
                  output_grad,
+                 input_grad,
                  attrs.replicate_degree.unwrap_nonnegative());
 }
 
diff --git a/lib/local-execution/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc
index fc3a75607d..e382b2668e 100644
--- a/lib/local-execution/src/ops/reshape.cc
+++ b/lib/local-execution/src/ops/reshape.cc
@@ -86,8 +86,8 @@ static std::optional<float>
                  profiling,
                  "[Reshape] backward time = {:.2lf}ms\n",
                  per_device_state,
-                 input_grad,
-                 output_grad);
+                 output_grad,
+                 input_grad);
 }
 
 TaskImplFunction get_reshape_init_task_impl() {
diff --git a/lib/local-execution/src/ops/reverse.cc b/lib/local-execution/src/ops/reverse.cc
index ddd47d355d..00f56c6892 100644
--- a/lib/local-execution/src/ops/reverse.cc
+++ b/lib/local-execution/src/ops/reverse.cc
@@ -48,30 +48,12 @@ static std::optional<float>
     forward_task_impl(TaskArgumentAccessor const &acc) {
   auto output = acc.get_tensor(OUTPUT);
   auto attrs = acc.get_argument(ATTRS);
 
-  nonnegative_int output_size = output.shape.get_volume();
-  auto axis = attrs.axis;
-  nonnegative_int in_blk_size = 1_n;
-  nonnegative_int reverse_dim_size = 1_n;
-  nonnegative_int num_out_blks = 1_n;
-  for (nonnegative_int i : nonnegative_range(output.shape.get_dim())) {
-    if (i < axis.value) {
-      in_blk_size *= output.shape.at(ff_dim_t{i});
-    } else if (i == axis.value) {
-      reverse_dim_size = output.shape.at(ff_dim_t{i});
-    } else {
-      num_out_blks *= output.shape.at(ff_dim_t{i});
-    }
-  }
-
   return profile(forward_kernel,
                  profiling,
                  "[reverse] forward_time = {:.2lf}ms\n",
-                 input.get_float_ptr(),
-                 output.get_float_ptr(),
-                 num_out_blks.unwrap_nonnegative(),
-                 reverse_dim_size.unwrap_nonnegative(),
-                 in_blk_size.unwrap_nonnegative(),
-                 output_size.unwrap_nonnegative());
+                 input,
+                 output,
+                 attrs);
 }
 
 static std::optional<float>
@@ -81,30 +63,12 @@ static std::optional<float>
   auto output_grad = acc.get_tensor_grad(OUTPUT);
   auto attrs = acc.get_argument(ATTRS);
 
-  int axis = input_grad.shape.num_dims().unwrap_nonnegative() -
-             attrs.axis.value.unwrap_nonnegative() - 1;
-  nonnegative_int in_blk_size = 1_n;
-
nonnegative_int reverse_dim_size = 1_n; - nonnegative_int num_out_blks = 1_n; - for (nonnegative_int i : nonnegative_range(input_grad.shape.get_dim())) { - if (i < axis) { - in_blk_size *= input_grad.shape.at(ff_dim_t{i}); - } else if (i == axis) { - reverse_dim_size = input_grad.shape.at(ff_dim_t{i}); - } else { - num_out_blks *= input_grad.shape.at(ff_dim_t{i}); - } - } - return profile(backward_kernel, profiling, "[reverse] backward_time = {:.2lf}ms\n", - output_grad.get_float_ptr(), - input_grad.get_float_ptr(), - num_out_blks.unwrap_nonnegative(), - reverse_dim_size.unwrap_nonnegative(), - in_blk_size.unwrap_nonnegative(), - input_grad.shape.get_volume().unwrap_nonnegative()); + output_grad, + input_grad, + attrs); } TaskImplFunction get_reverse_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc index 0e94422c5f..e008098e05 100644 --- a/lib/local-execution/src/ops/softmax.cc +++ b/lib/local-execution/src/ops/softmax.cc @@ -106,8 +106,8 @@ static std::optional return profile(backward_kernel, profiling, "[SoftMax] backward_time = {:.2lf}ms\n", - input_grad.get_float_ptr(), output_grad.get_float_ptr(), + input_grad.get_float_ptr(), output_grad.shape.get_volume().unwrap_nonnegative()); } diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc index 4146836b9a..1859bb0ccc 100644 --- a/lib/local-execution/src/ops/transpose.cc +++ b/lib/local-execution/src/ops/transpose.cc @@ -67,8 +67,8 @@ static std::optional profiling, "[Transpose] Backward_time = {:.2lf} [ms]", attrs, - input_grad, - output_grad); + output_grad, + input_grad); } OpTaskInvocation backward(TransposeAttrs const &attrs) { diff --git a/lib/local-execution/src/per_device_state.cc b/lib/local-execution/src/per_device_op_state.cc similarity index 100% rename from lib/local-execution/src/per_device_state.cc rename to lib/local-execution/src/per_device_op_state.cc diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index e6c3a11711..ed181aea32 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -23,8 +23,13 @@ size_t TrackedAllocator::get_current_mem_usage() { return this->current_mem_usage; } +DeviceType TrackedAllocator::get_allocation_device_type() const { + return this->allocator.get_allocation_device_type(); +} + Allocator get_tracked_memory_allocator(Allocator const &base_allocator) { - return Allocator::create(base_allocator); + Allocator allocator = Allocator::create(base_allocator); + return allocator; } } // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index da3af6e3ad..9f8b4092c1 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -12,68 +12,71 @@ // TEST_SUITE(FF_CUDA_TEST_SUITE) { // TEST_CASE("Local Cost Estimator") { // // local backing initialization -// ManagedPerDeviceFFHandle managed_handle{}; +// ManagedPerDeviceFFHandle managed_handle{ +// /*workSpaceSize=*/1024 * 1024, +// /*allowTensorOpMathConversion=*/true}; -// RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ -// DeviceSpecific::create(managed_handle.raw_handle()), -// EnableProfiling::YES, -// ProfilingSettings{/*warmup_iters=*/0, -// /*measure_iters=*/1}}; +// RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ +// 
DeviceSpecific::create(managed_handle.raw_handle()), +// EnableProfiling::YES, +// ProfilingSettings{/*warmup_iters=*/0, +// /*measure_iters=*/1}}; -// LocalCostEstimator cost_estimator = -// LocalCostEstimator{runtime_arg_config}; +// LocalCostEstimator cost_estimator = +// LocalCostEstimator{runtime_arg_config}; -// SUBCASE("Estimate cost -- Attention Op") { -// int embed_dim = 32; -// int num_heads = 10; -// MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ -// /*embed_dim=*/embed_dim, -// /*num_heads=*/num_heads, -// /*kdim=*/embed_dim, -// /*vdim=*/embed_dim, -// /*dropout=*/0.0, -// /*bias=*/true, -// /*add_bias_kv=*/false, -// /*add_zero_attn=*/false, -// }; +// SUBCASE("Estimate cost -- Attention Op") { +// int embed_dim = 32; +// int num_heads = 10; +// MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ +// /*embed_dim=*/embed_dim, +// /*num_heads=*/num_heads, +// /*kdim=*/embed_dim, +// /*vdim=*/embed_dim, +// /*dropout=*/0.0, +// /*bias=*/true, +// /*add_bias_kv=*/false, +// /*add_zero_attn=*/false, +// }; -// size_t batch_size = 40; -// size_t seq_len = 48; -// size_t feature_size = 36; +// size_t batch_size = 40; +// size_t seq_len = 48; +// size_t feature_size = 36; -// DataType dtype = DataType::FLOAT; -// ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ -// TensorDims{FFOrdered{batch_size, seq_len, feature_size}}, -// DataType::FLOAT, -// }); +// DataType dtype = DataType::FLOAT; +// ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ +// TensorDims{FFOrdered{batch_size, seq_len, +// feature_size}}, DataType::FLOAT, +// }); -// ParallelTensorShape weights_shape = throw_if_unexpected( -// get_weights_shape(attrs, inputs_shape, inputs_shape, -// inputs_shape)); -// ParallelTensorAttrs weight_attrs = -// ParallelTensorAttrs{weights_shape, -// /*sync_type=*/std::nullopt, -// /*initializer=*/std::nullopt, -// CreateGrad::YES}; +// ParallelTensorShape weights_shape = throw_if_unexpected( +// get_weights_shape(attrs, inputs_shape, inputs_shape, +// inputs_shape)); +// ParallelTensorAttrs weight_attrs = +// ParallelTensorAttrs{weights_shape, +// /*sync_type=*/std::nullopt, +// /*initializer=*/std::nullopt, +// CreateGrad::YES}; -// ParallelTensorShape output_shape = throw_if_unexpected( -// get_output_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); -// ParallelTensorAttrs output_attrs = -// ParallelTensorAttrs{output_shape, -// /*sync_type=*/std::nullopt, -// /*initializer=*/std::nullopt, -// CreateGrad::YES}; +// ParallelTensorShape output_shape = throw_if_unexpected( +// get_output_shape(attrs, inputs_shape, inputs_shape, +// inputs_shape)); +// ParallelTensorAttrs output_attrs = +// ParallelTensorAttrs{output_shape, +// /*sync_type=*/std::nullopt, +// /*initializer=*/std::nullopt, +// CreateGrad::YES}; -// CostDetails result = cost_estimator.estimate_cost( -// PCGOperatorAttrs{attrs}, -// std::vector{ -// inputs_shape, inputs_shape, inputs_shape}, -// std::vector{weight_attrs}, -// std::vector{output_attrs}, -// make_1d_machine_view(gpu_id_t{0}, gpu_id_t{1})); +// CostDetails result = cost_estimator.estimate_cost( +// PCGOperatorAttrs{attrs}, +// std::vector{ +// inputs_shape, inputs_shape, inputs_shape}, +// std::vector{weight_attrs}, +// std::vector{output_attrs}, +// make_1d_machine_view(gpu_id_t{0}, gpu_id_t{1})); -// CHECK(result.total_elapsed_time > 0); -// CHECK(result.total_mem_usage > 0); +// CHECK(result.total_elapsed_time > 0); +// CHECK(result.total_mem_usage > 0); +// } +// } // } -// } -// } diff --git 
a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc index dffb19398c..e55d1eddf5 100644 --- a/lib/local-execution/test/src/test_local_slots_backing.cc +++ b/lib/local-execution/test/src/test_local_slots_backing.cc @@ -1,6 +1,6 @@ #include "kernels/attention_kernels.h" +#include "kernels/local_cpu_allocator.h" #include "local-execution/local_cost_estimator.h" -#include "local-execution/local_cpu_allocator.h" #include "local-execution/local_slots_backing.h" #include "op-attrs/ops/attention.h" #include "op-attrs/parallel_tensor_shape.h" @@ -106,24 +106,24 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( query_guid, local_slots_backing.gradient_tensor_mapping); - std::pair correct = {ArrayShape{query_shape}, - dtype}; + std::pair correct = { + array_shape_from_tensor_shape(query_shape), dtype}; CHECK(result == correct); } SUBCASE("Key grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( key_guid, local_slots_backing.gradient_tensor_mapping); - std::pair correct = {ArrayShape{key_shape}, - dtype}; + std::pair correct = { + array_shape_from_tensor_shape(key_shape), dtype}; CHECK(result == correct); } SUBCASE("Value grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( value_guid, local_slots_backing.gradient_tensor_mapping); - std::pair correct = {ArrayShape{value_shape}, - dtype}; + std::pair correct = { + array_shape_from_tensor_shape(value_shape), dtype}; CHECK(result == correct); } } @@ -135,9 +135,9 @@ TEST_SUITE(FF_TEST_SUITE) { get_result_shape_and_dtype_for_tensor_guid_and_map( output_guid, local_slots_backing.tensor_mapping); std::pair correct = { - ArrayShape{ + array_shape_from_tensor_shape( get_tensor_attrs(cg_builder.computation_graph, output_guid) - .shape}, + .shape), dtype}; CHECK(result == correct); } @@ -146,9 +146,9 @@ TEST_SUITE(FF_TEST_SUITE) { get_result_shape_and_dtype_for_tensor_guid_and_map( output_guid, local_slots_backing.gradient_tensor_mapping); std::pair correct = { - ArrayShape{ + array_shape_from_tensor_shape( get_tensor_attrs(cg_builder.computation_graph, output_guid) - .shape}, + .shape), dtype}; CHECK(result == correct); } diff --git a/lib/local-execution/test/src/test_local_task_arg_accessor.cc b/lib/local-execution/test/src/test_local_task_arg_accessor.cc index 0fab0f6a60..a39bb229e2 100644 --- a/lib/local-execution/test/src/test_local_task_arg_accessor.cc +++ b/lib/local-execution/test/src/test_local_task_arg_accessor.cc @@ -1,5 +1,5 @@ #include "doctest/doctest.h" -#include "local-execution/local_cpu_allocator.h" +#include "kernels/local_cpu_allocator.h" #include "local-execution/local_task_argument_accessor.h" #include "local-execution/task_signature_impl.h" #include "utils/fmt/variant.h" diff --git a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml index 27aa50f38f..09ee99915d 100644 --- a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml +++ b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml @@ -10,5 +10,6 @@ features = [ [[values]] name = "SUM" -[[value]] +[[values]] name = "AVG" + diff --git a/lib/op-attrs/include/op-attrs/datatype_value.h b/lib/op-attrs/include/op-attrs/datatype_value.h new file mode 100644 index 0000000000..723e69bddd --- /dev/null +++ b/lib/op-attrs/include/op-attrs/datatype_value.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H +#define 
_FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H
+
+#include "op-attrs/datatype_value.dtg.h"
+
+namespace FlexFlow {
+
+DataTypeValue make_float_data_type_value(float value);
+DataTypeValue make_double_data_type_value(double value);
+DataTypeValue make_int32_data_type_value(int32_t value);
+DataTypeValue make_int64_data_type_value(int64_t value);
+DataTypeValue make_bool_data_type_value(bool value);
+
+} // namespace FlexFlow
+
+#endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H
diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h
index f2355289dc..5c47745209 100644
--- a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h
+++ b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h
@@ -17,13 +17,9 @@ struct DimOrdered {
   DimOrdered(std::initializer_list<T> const &l)
       : contents(l.begin(), l.end()) {}
 
-  /* template ::value>::type> */
   DimOrdered(std::vector<T> const &contents)
       : contents(contents.begin(), contents.end()) {}
 
-  /* template ::value>::type> */
   template <typename It>
   DimOrdered(It begin, It end) : contents(begin, end) {}
 
@@ -62,10 +58,6 @@ struct DimOrdered {
     return this->contents != other.contents;
   }
 
-  bool operator<(DimOrdered const &other) const {
-    return this->contents < other.contents;
-  }
-
   using iterator = typename stack_vector<T, MAX_TENSOR_DIM>::iterator;
   using const_iterator =
       typename stack_vector<T, MAX_TENSOR_DIM>::const_iterator;
@@ -116,7 +108,7 @@ struct DimOrdered {
   }
 
   reverse_iterator rend() {
-    return this->contents.crend();
+    return this->contents.rend();
   }
 
   const_reverse_iterator rend() const {
@@ -145,195 +137,26 @@ struct DimOrdered {
   stack_vector<T, MAX_TENSOR_DIM> contents;
 };
 
-template <typename T>
-struct DimOrdered<ff_dim_t, T> {
-  DimOrdered() {}
-
-  DimOrdered(std::initializer_list<T> const &l)
-      : contents(l.begin(), l.end()) {}
-
-  DimOrdered(std::vector<T> const &contents)
-      : contents(contents.begin(), contents.end()) {}
-
-  template <typename It>
-  DimOrdered(It begin, It end) : contents(begin, end) {}
-
-  template <size_t MAXSIZE>
-  DimOrdered(stack_vector<T, MAXSIZE> const &contents)
-      : contents(contents.begin(), contents.end()) {}
-
-  T const &at(ff_dim_t idx) const {
-    int raw = idx.value.unwrap_nonnegative();
-    return this->contents.at(raw);
-  }
-
-  T const &at(relative_ff_dim_t idx) const {
-    int raw = idx.value;
-    if (raw < 0) {
-      raw = this->contents.size() + raw;
-    }
-    return this->contents.at(raw);
-  }
-
-  T &at(ff_dim_t idx) {
-    int raw = idx.value.unwrap_nonnegative();
-    return this->contents.at(raw);
-  }
-
-  T &at(relative_ff_dim_t idx) {
-    int raw = idx.value;
-    if (raw < 0) {
-      raw = this->contents.size() + raw;
-    }
-    return this->contents.at(raw);
-  }
-
-  T const &operator[](ff_dim_t idx) const {
-    return this->at(idx);
-  }
-
-  T const &operator[](relative_ff_dim_t idx) const {
-    return this->at(idx);
-  }
-
-  T &operator[](ff_dim_t idx) {
-    return this->at(idx);
-  }
-
-  T &operator[](relative_ff_dim_t idx) {
-    return this->at(idx);
-  }
-
-  bool idx_is_valid(ff_dim_t const &idx) const {
-    int raw = idx.value.unwrap_nonnegative();
-    return raw < this->contents.size();
-  }
-
-  bool idx_is_valid(relative_ff_dim_t const &idx) const {
-    int raw = idx.value;
-    if (raw < 0) {
-      raw = this->contents.size() + raw;
-    }
-    return (raw >= 0 && raw < this->contents.size());
-  }
-
-  bool operator==(DimOrdered const &other) const {
-    return this->contents == other.contents;
-  }
-
-  bool operator!=(DimOrdered const &other) const {
-    return this->contents != other.contents;
-  }
-
-  bool operator<(DimOrdered const &other) const {
-    return this->contents < other.contents;
-  }
-
-  using iterator = typename stack_vector<T, MAX_TENSOR_DIM>::iterator;
-  using const_iterator =
-      typename stack_vector<T, MAX_TENSOR_DIM>::const_iterator;
-  using reverse_iterator =
-      typename stack_vector<T, MAX_TENSOR_DIM>::reverse_iterator;
-  using const_reverse_iterator =
-      typename stack_vector<T, MAX_TENSOR_DIM>::const_reverse_iterator;
-  using value_type = T;
-  using pointer = value_type *;
-  using const_pointer = value_type const *;
-  using reference = value_type &;
-  using const_reference = value_type const &;
-
-  iterator begin() {
-    return this->contents.begin();
-  }
-
-  const_iterator begin() const {
-    return this->cbegin();
-  }
-
-  const_iterator cbegin() const {
-    return this->contents.cbegin();
-  }
-
-  iterator end() {
-    return this->contents.end();
-  }
-
-  const_iterator end() const {
-    return this->cend();
-  }
-
-  const_iterator cend() const {
-    return this->contents.cend();
-  }
-
-  reverse_iterator rbegin() {
-    return this->contents.rbegin();
-  }
-
-  const_reverse_iterator rbegin() const {
-    return this->crbegin();
-  }
-
-  const_reverse_iterator crbegin() const {
-    return this->contents.crbegin();
-  }
-
-  reverse_iterator rend() {
-    return this->contents.crend();
-  }
-
-  const_reverse_iterator rend() const {
-    return this->crend();
-  }
-
-  const_reverse_iterator crend() const {
-    return this->contents.crend();
-  }
-
-  size_t size() const {
-    return this->contents.size();
-  }
-
-  size_t empty() const {
-    return this->contents.empty();
-  }
-
-  size_t num_dims() const {
-    return this->size();
-  }
-
-  friend struct ::std::hash<DimOrdered>;
-
-private:
-  stack_vector<T, MAX_TENSOR_DIM> contents;
-};
-
-template <typename T>
-using FFOrdered = DimOrdered<ff_dim_t, T>;
+template <typename Idx, typename T>
+auto operator<(DimOrdered<Idx, T> const &lhs, DimOrdered<Idx, T> const &rhs)
+    -> std::enable_if_t<is_lt_comparable<T>::value, bool> {
+  return std::lexicographical_compare(
+      lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend());
+}
 
-template <typename T>
-std::string format_as(FFOrdered<T> const &v) {
+template <typename Idx, typename T>
+std::string format_as(DimOrdered<Idx, T> const &v) {
   std::vector<T> as_vec(v.cbegin(), v.cend());
   return fmt::format("<{}>", as_vec);
 }
 
-template <typename T>
-std::ostream &operator<<(std::ostream &s, FFOrdered<T> const &v) {
+template <typename Idx, typename T>
+std::ostream &operator<<(std::ostream &s, DimOrdered<Idx, T> const &v) {
   return (s << fmt::to_string(v));
 }
 
 } // namespace FlexFlow
 
-/* template <typename Idx, typename T> */
-/* void to_json(json &j, DimOrdered<Idx, T> const &x) { */
-/*   /1* j = std::vector<T>{x.cbegin(), x.cend()}; *1/ */
-/* } */
-
-/* template <typename Idx, typename T> */
-/* void from_json(json const &j, DimOrdered<Idx, T> &x) { */
-/*   /1* x = DimOrdered<Idx, T>{j.template get<std::vector<T>>()}; *1/ */
-/* } */
-
 namespace nlohmann {
 template <typename Idx, typename T>
 struct adl_serializer<::FlexFlow::DimOrdered<Idx, T>> {
diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/slice.h b/lib/op-attrs/include/op-attrs/dim_ordered/slice.h
index 166916dd44..76526447be 100644
--- a/lib/op-attrs/include/op-attrs/dim_ordered/slice.h
+++ b/lib/op-attrs/include/op-attrs/dim_ordered/slice.h
@@ -2,7 +2,7 @@
 #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_SLICE_H
 
 #include "op-attrs/dim_ordered/dim_ordered.h"
-#include "utils/containers/subvec.h"
+#include "utils/containers/slice.h"
 #include "utils/containers/transform.h"
 #include "utils/containers/vector_of.h"
 #include "utils/optional.h"
@@ -18,35 +18,8 @@ DimOrdered<Idx, T> nonoverloaded_slice(DimOrdered<Idx, T> const &d,
   };
 
   return DimOrdered<Idx, T>{
-      subvec(vector_of(d), to_raw_idx(start), to_raw_idx(end))};
+      slice(vector_of(d), to_raw_idx(start), to_raw_idx(end))};
 }
-
-template <typename T>
-FFOrdered<T> ff_dim_t_nonoverloaded_slice(FFOrdered<T> const &d,
-                                          std::optional<ff_dim_t> const &start,
-                                          std::optional<ff_dim_t> const &end) {
-  auto to_raw_idx =
-      [](std::optional<ff_dim_t> const &idx) -> std::optional<int> {
-    return transform(
-        idx, [](ff_dim_t const &i) { return i.value.unwrap_nonnegative(); });
-  };
-
-  return FFOrdered<T>{subvec(vector_of(d), to_raw_idx(start), to_raw_idx(end))};
-}
-
-template <typename T>
-FFOrdered<T> relative_ff_dim_t_nonoverloaded_slice(
-    FFOrdered<T> const &d,
-    std::optional<relative_ff_dim_t> const &start,
-    std::optional<relative_ff_dim_t> const &end) {
-  auto to_raw_idx =
-      [](std::optional<relative_ff_dim_t> const &idx) -> std::optional<int> {
-    return transform(idx, [](relative_ff_dim_t const &i) { return i.value; });
-  };
-
-  return FFOrdered<T>{subvec(vector_of(d), to_raw_idx(start), to_raw_idx(end))};
-}
-
 template <typename Idx, typename T>
 DimOrdered<Idx, T> slice(DimOrdered<Idx, T> const &d,
                          std::optional<Idx> const &start = std::nullopt,
@@ -54,20 +27,6 @@ DimOrdered<Idx, T> slice(DimOrdered<Idx, T> const &d,
   return ff_dim_t_nonoverloaded_slice(d, start, end);
 }
 
-template <typename T>
-FFOrdered<T> slice(FFOrdered<T> const &d,
-                   std::optional<ff_dim_t> const &start = std::nullopt,
-                   std::optional<ff_dim_t> const &end = std::nullopt) {
-  return ff_dim_t_nonoverloaded_slice(d, start, end);
-}
-
-template <typename T>
-FFOrdered<T> slice(FFOrdered<T> const &d,
-                   std::optional<relative_ff_dim_t> const &start = std::nullopt,
-                   std::optional<relative_ff_dim_t> const &end = std::nullopt) {
-  return relative_ff_dim_t_nonoverloaded_slice(d, start, end);
-}
-
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/concat.h b/lib/op-attrs/include/op-attrs/ff_ordered/concat.h
similarity index 95%
rename from lib/op-attrs/include/op-attrs/dim_ordered/concat.h
rename to lib/op-attrs/include/op-attrs/ff_ordered/concat.h
index 9b9eaf9b93..a5faed2b36 100644
--- a/lib/op-attrs/include/op-attrs/dim_ordered/concat.h
+++ b/lib/op-attrs/include/op-attrs/ff_ordered/concat.h
@@ -1,7 +1,7 @@
 #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_CONCAT_H
 #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_CONCAT_H
 
-#include "op-attrs/dim_ordered/dim_ordered.h"
+#include "op-attrs/ff_ordered/ff_ordered.h"
 #include "utils/containers/concat_vectors.h"
 #include "utils/containers/transform.h"
 
diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/enumerate.h b/lib/op-attrs/include/op-attrs/ff_ordered/enumerate.h
similarity index 95%
rename from lib/op-attrs/include/op-attrs/dim_ordered/enumerate.h
rename to lib/op-attrs/include/op-attrs/ff_ordered/enumerate.h
index 9e4271a1ff..bc8636615c 100644
--- a/lib/op-attrs/include/op-attrs/dim_ordered/enumerate.h
+++ b/lib/op-attrs/include/op-attrs/ff_ordered/enumerate.h
@@ -1,7 +1,7 @@
 #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_ENUMERATE_H
 #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_ENUMERATE_H
 
-#include "op-attrs/dim_ordered/dim_ordered.h"
+#include "op-attrs/ff_ordered/ff_ordered.h"
 #include "utils/bidict/bidict.h"
 #include "utils/containers/count.h"
 
diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h
new file mode 100644
index 0000000000..92ed211c31
--- /dev/null
+++ b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h
@@ -0,0 +1,228 @@
+#ifndef _FLEXFLOW_OPATTRS_INCLUDE_OPATTRS_DIM_ORDERED_FF_ORDERED_H
+#define _FLEXFLOW_OPATTRS_INCLUDE_OPATTRS_DIM_ORDERED_FF_ORDERED_H
+
+#include "op-attrs/ff_dim_t.dtg.h"
+#include "op-attrs/relative_ff_dim_t.dtg.h"
+#include "utils/fmt/vector.h"
+#include "utils/stack_vector/stack_vector.h"
+
+namespace FlexFlow {
+
+template <typename T>
+struct FFOrdered {
+  FFOrdered() {}
+
+  FFOrdered(std::initializer_list<T> const &l) : contents(l.begin(), l.end()) {}
+
+  FFOrdered(std::vector<T> const &contents)
+      : contents(contents.begin(), contents.end()) {}
+
+  template <typename It>
+  FFOrdered(It begin, It end) : contents(begin, end) {}
+
+  template <size_t MAXSIZE>
+  FFOrdered(stack_vector<T, MAXSIZE> const &contents)
+      : contents(contents.begin(), contents.end()) {}
+
+  T const &at(ff_dim_t idx) const {
+    int raw = idx.value.unwrap_nonnegative();
+    return this->contents.at(raw);
+  }
+
+  T const &at(relative_ff_dim_t idx) const {
+    int raw = idx.value;
+    if (raw < 0) {
+      raw = this->contents.size() + raw;
+    }
+    return this->contents.at(raw);
+  }
+
+  T &at(ff_dim_t idx) {
+    int raw = idx.value.unwrap_nonnegative();
+    return this->contents.at(raw);
+  }
+
+  T &at(relative_ff_dim_t idx) {
+    int raw = idx.value;
+    if (raw < 0) {
+      raw = this->contents.size() + raw;
+    }
+    return this->contents.at(raw);
+  }
+
+  T const &operator[](ff_dim_t idx) const {
+    return this->at(idx);
+  }
+
+  T const &operator[](relative_ff_dim_t idx) const {
+    return this->at(idx);
+  }
+
+  T &operator[](ff_dim_t idx) {
+    return this->at(idx);
+  }
+
+  T &operator[](relative_ff_dim_t idx) {
+    return this->at(idx);
+  }
+
+  bool idx_is_valid(ff_dim_t const &idx) const {
+    int raw = idx.value.unwrap_nonnegative();
+    return raw < this->contents.size();
+  }
+
+  bool idx_is_valid(relative_ff_dim_t const &idx) const {
+    int raw = idx.value;
+    if (raw < 0) {
+      raw = this->contents.size() + raw;
+    }
+    return (raw >= 0 && raw < this->contents.size());
+  }
+
+  bool operator==(FFOrdered const &other) const {
+    return this->contents == other.contents;
+  }
+
+  bool operator!=(FFOrdered const &other) const {
+    return this->contents != other.contents;
+  }
+
+  using iterator = typename stack_vector<T, MAX_TENSOR_DIM>::iterator;
+  using const_iterator =
+      typename stack_vector<T, MAX_TENSOR_DIM>::const_iterator;
+  using reverse_iterator =
+      typename stack_vector<T, MAX_TENSOR_DIM>::reverse_iterator;
+  using const_reverse_iterator =
+      typename stack_vector<T, MAX_TENSOR_DIM>::const_reverse_iterator;
+  using value_type = T;
+  using pointer = value_type *;
+  using const_pointer = value_type const *;
+  using reference = value_type &;
+  using const_reference = value_type const &;
+
+  iterator begin() {
+    return this->contents.begin();
+  }
+
+  const_iterator begin() const {
+    return this->cbegin();
+  }
+
+  const_iterator cbegin() const {
+    return this->contents.cbegin();
+  }
+
+  iterator end() {
+    return this->contents.end();
+  }
+
+  const_iterator end() const {
+    return this->cend();
+  }
+
+  const_iterator cend() const {
+    return this->contents.cend();
+  }
+
+  reverse_iterator rbegin() {
+    return this->contents.rbegin();
+  }
+
+  const_reverse_iterator rbegin() const {
+    return this->crbegin();
+  }
+
+  const_reverse_iterator crbegin() const {
+    return this->contents.crbegin();
+  }
+
+  reverse_iterator rend() {
+    return this->contents.rend();
+  }
+
+  const_reverse_iterator rend() const {
+    return this->crend();
+  }
+
+  const_reverse_iterator crend() const {
+    return this->contents.crend();
+  }
+
+  size_t size() const {
+    return this->contents.size();
+  }
+
+  size_t empty() const {
+    return this->contents.empty();
+  }
+
+  size_t num_dims() const {
+    return this->size();
+  }
+
+  friend struct ::std::hash<FFOrdered<T>>;
+
+private:
+  stack_vector<T, MAX_TENSOR_DIM> contents;
+};
+
+template <typename T>
+auto operator<(FFOrdered<T> const &lhs, FFOrdered<T> const &rhs)
+    -> std::enable_if_t<is_lt_comparable<T>::value, bool> {
+  return std::lexicographical_compare(
+      lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend());
+}
+
+template <typename T>
+std::string format_as(FFOrdered<T> const &v) {
+  std::vector<T> as_vec(v.cbegin(), v.cend());
+  return fmt::format("<{}>", as_vec);
+}
+
+template <typename T>
+std::ostream &operator<<(std::ostream &s, FFOrdered<T> const &v) {
+  return (s << fmt::to_string(v));
+}
+
+} // namespace FlexFlow
+
+namespace nlohmann {
+template <typename T>
+struct adl_serializer<::FlexFlow::FFOrdered<T>> {
+  static ::FlexFlow::FFOrdered<T> from_json(nlohmann::json const &j) {
+    return {j.template get<std::vector<T>>()};
+  }
+
+  static void to_json(nlohmann::json &j, ::FlexFlow::FFOrdered<T> const &x) {
+    j = std::vector<T>{x.cbegin(), x.cend()};
+  }
+};
+} // namespace nlohmann
+
+namespace std {
+
+template <typename T>
+struct hash<::FlexFlow::FFOrdered<T>> {
+  size_t operator()(::FlexFlow::FFOrdered<T> const &t) const {
+    static_assert(::FlexFlow::is_hashable<T>::value,
+                  "Elements must be hashable");
+
+    return get_std_hash(t.contents);
+  }
+};
+
+} // namespace std
+
+namespace rc {
+
+template <typename T>
+struct Arbitrary<::FlexFlow::FFOrdered<T>> {
+  static Gen<::FlexFlow::FFOrdered<T>> arbitrary() {
+    return gen::construct<::FlexFlow::FFOrdered<T>>(
+        gen::arbitrary<::FlexFlow::stack_vector<T, MAX_TENSOR_DIM>>());
+  }
+};
+
+} // namespace rc
+
+#endif
diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_from_map.h b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_from_map.h
similarity index 88%
rename from lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_from_map.h
rename to lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_from_map.h
index f8f49233ec..9232afddfb 100644
--- a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_from_map.h
+++ b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_from_map.h
@@ -1,9 +1,9 @@
 #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_FROM_MAP_H
 #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_FROM_MAP_H
 
-#include "op-attrs/dim_ordered/dim_ordered.h"
-#include "op-attrs/dim_ordered/ff_ordered_of.h"
 #include "op-attrs/ff_dim_t.h"
+#include "op-attrs/ff_ordered/ff_ordered.h"
+#include "op-attrs/ff_ordered/ff_ordered_of.h"
 
 namespace FlexFlow {
 
diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_of.h b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_of.h
similarity index 88%
rename from lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_of.h
rename to lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_of.h
index 8cc1bf3a51..ace60b7e3d 100644
--- a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_of.h
+++ b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_of.h
@@ -1,7 +1,7 @@
 #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_OF_H
 #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_OF_H
 
-#include "op-attrs/dim_ordered/dim_ordered.h"
+#include "op-attrs/ff_ordered/ff_ordered.h"
 
 namespace FlexFlow {
 
diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/get_idxs.h b/lib/op-attrs/include/op-attrs/ff_ordered/get_idxs.h
similarity index 91%
rename from lib/op-attrs/include/op-attrs/dim_ordered/get_idxs.h
rename to lib/op-attrs/include/op-attrs/ff_ordered/get_idxs.h
index 4e7f8530a4..5ff390d3fe 100644
--- a/lib/op-attrs/include/op-attrs/dim_ordered/get_idxs.h
+++ b/lib/op-attrs/include/op-attrs/ff_ordered/get_idxs.h
@@ -1,8 +1,8 @@
 #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_GET_IDXS_H
 #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_GET_IDXS_H
 
-#include "op-attrs/dim_ordered/dim_ordered.h"
 #include "op-attrs/ff_dim_t.h"
+#include "op-attrs/ff_ordered/ff_ordered.h"
 #include "utils/containers/count.h"
 #include "utils/containers/transform.h"
 
diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/slice.h b/lib/op-attrs/include/op-attrs/ff_ordered/slice.h
new file mode 100644
index 0000000000..79217c4cc3
--- /dev/null
+++ b/lib/op-attrs/include/op-attrs/ff_ordered/slice.h
@@ -0,0 +1,49 @@
+#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_SLICE_H
+#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_SLICE_H
+
+#include "op-attrs/ff_ordered/ff_ordered.h"
+#include "utils/containers/slice.h"
+#include "utils/containers/transform.h"
+#include "utils/containers/vector_of.h"
+
+namespace FlexFlow {
+
+template <typename T>
+FFOrdered<T> ff_dim_t_nonoverloaded_slice(FFOrdered<T> const &d,
+                                          ff_dim_t const &start,
+                                          std::optional<ff_dim_t> const &end) {
+  int raw_start = start.value.unwrap_nonnegative();
+  std::optional<int> raw_end = transform(
+      end, [](ff_dim_t const &i) { return i.value.unwrap_nonnegative(); });
+  return FFOrdered<T>{slice(vector_of(d), raw_start, raw_end)};
+}
+
+template <typename T>
+FFOrdered<T> relative_ff_dim_t_nonoverloaded_slice(
+    FFOrdered<T> const &d,
+    relative_ff_dim_t const &start,
+    std::optional<relative_ff_dim_t> const &end) {
+  int raw_start = start.value;
+  std::optional<int> raw_end =
+      transform(end, [](relative_ff_dim_t const &i) { return i.value; });
+
+  return FFOrdered<T>{slice(vector_of(d), raw_start, raw_end)};
+}
+
+template <typename T>
+FFOrdered<T> slice(FFOrdered<T> const &d,
+                   ff_dim_t const &start = ff_dim_t{0_n},
+                   std::optional<ff_dim_t> const &end = std::nullopt) {
+  return ff_dim_t_nonoverloaded_slice(d, start, end);
+}
+
+template <typename T>
+FFOrdered<T> slice(FFOrdered<T> const &d,
+                   relative_ff_dim_t const &start = relative_ff_dim_t{0},
+                   std::optional<relative_ff_dim_t> const &end = std::nullopt) {
+  return relative_ff_dim_t_nonoverloaded_slice(d, start, end);
+}
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/transform.h b/lib/op-attrs/include/op-attrs/ff_ordered/transform.h
new file mode 100644
index 0000000000..3a8eeb9ecf
--- /dev/null
+++ b/lib/op-attrs/include/op-attrs/ff_ordered/transform.h
@@ -0,0 +1,17 @@
+#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_TRANSFORM_H
+#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_TRANSFORM_H
+
+#include "op-attrs/ff_ordered/ff_ordered.h"
+#include "utils/containers/vector_of.h"
+#include "utils/containers/vector_transform.h"
+
+namespace FlexFlow {
+
+template <typename T, typename F, typename Out = std::invoke_result_t<F, T>>
+FFOrdered<Out> transform(FFOrdered<T> const &d, F &&f) {
+  return FFOrdered<Out>{vector_transform(vector_of(d), f)};
+}
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/zip.h b/lib/op-attrs/include/op-attrs/ff_ordered/zip.h
new file mode 100644
index 0000000000..fe207740f7
--- /dev/null
+++ b/lib/op-attrs/include/op-attrs/ff_ordered/zip.h
@@ -0,0 +1,18 @@
+#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_ZIP_H
+#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_ZIP_H
+
+#include "op-attrs/ff_ordered/ff_ordered.h"
+#include "utils/containers/vector_of.h"
+#include "utils/containers/zip.h"
+
+namespace FlexFlow {
+
+template <typename T1, typename T2>
+FFOrdered<std::pair<T1, T2>> zip(FFOrdered<T1> const &lhs,
+                                 FFOrdered<T2> const &rhs) {
+  return FFOrdered<std::pair<T1, T2>>{zip(vector_of(lhs), vector_of(rhs))};
+}
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml
index b1c5f60382..50756f095b 100644
--- a/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml
+++ b/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml
@@ -12,7 +12,7 @@ features = [
 includes = [
   "op-attrs/ff_dim_t.h",
   "op-attrs/ff_dim_t.dtg.h",
-  "op-attrs/dim_ordered/dim_ordered.h",
+  "op-attrs/ff_ordered/ff_ordered.h",
 ]
 
 [[fields]]
diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml b/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml
index be3a95eec8..d68ef02ec1 100644
--- a/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml
+++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml
@@ -12,7 +12,7 @@ features = [
 includes = [
   "op-attrs/parallel_tensor_shape/sum_degree.dtg.h",
   "op-attrs/parallel_tensor_shape/discard_copy_degree.dtg.h",
-  "op-attrs/dim_ordered/dim_ordered.h",
+  "op-attrs/ff_ordered/ff_ordered.h",
   "utils/nonnegative_int/nonnegative_int.h",
 ]
 
diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml
index f24fa12309..d2f8758377 100644
--- a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml
+++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml
@@ -10,7 +10,7 @@ features = [
 ]
 
 includes = [
-  "op-attrs/dim_ordered/dim_ordered.h",
+  "op-attrs/ff_ordered/ff_ordered.h",
   "op-attrs/shard_parallel_dim.dtg.h",
   "op-attrs/replica_parallel_dim_set.dtg.h",
   "",
 ]
 
diff --git a/lib/op-attrs/include/op-attrs/tensor_dims.h b/lib/op-attrs/include/op-attrs/tensor_dims.h
index 97f3432c2f..ba35295e09 100644
--- a/lib/op-attrs/include/op-attrs/tensor_dims.h
+++ b/lib/op-attrs/include/op-attrs/tensor_dims.h
@@ -19,7 +19,7 @@ std::optional<TensorDims>
     get_broadcast_target_dims(std::unordered_set<TensorDims> const &);
 
 TensorDims slice_tensor_dims(TensorDims const &,
-                             std::optional<relative_ff_dim_t> const &start,
+                             relative_ff_dim_t const &start,
                              std::optional<relative_ff_dim_t> const &stop);
 
 } // namespace FlexFlow
 
diff --git a/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml b/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml
index e86b866fd6..8c6d1098cc 100644
--- a/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml
+++ b/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml
@@ -10,7 +10,7 @@ features = [
 ]
 
 includes = [
-  "op-attrs/dim_ordered/dim_ordered.h",
+  "op-attrs/ff_ordered/ff_ordered.h",
   "utils/nonnegative_int/nonnegative_int.h",
 ]
 
diff --git a/lib/op-attrs/include/op-attrs/tensor_shape.h b/lib/op-attrs/include/op-attrs/tensor_shape.h
index a3cd8bfd9a..298ea04638 100644
--- a/lib/op-attrs/include/op-attrs/tensor_shape.h
+++ b/lib/op-attrs/include/op-attrs/tensor_shape.h
@@ -12,7 +12,7 @@ nonnegative_int get_num_elements(TensorShape const &);
 nonnegative_int get_size_in_bytes(TensorShape const &);
 
 TensorShape slice_tensor_shape(TensorShape const &,
-                               std::optional<relative_ff_dim_t> const &start,
+                               relative_ff_dim_t const &start,
                                std::optional<relative_ff_dim_t> const &stop);
 
 } // namespace FlexFlow
 
diff --git a/lib/op-attrs/src/op-attrs/datatype_value.cc b/lib/op-attrs/src/op-attrs/datatype_value.cc
new file mode 100644
index 0000000000..4604ef0b4e
--- /dev/null
+++ b/lib/op-attrs/src/op-attrs/datatype_value.cc
@@ -0,0 +1,25 @@
+#include "op-attrs/datatype_value.h"
+
+namespace FlexFlow {
+
+DataTypeValue make_float_data_type_value(float value) {
+  return DataTypeValue{value};
+}
+
+DataTypeValue make_double_data_type_value(double value) {
+  return DataTypeValue{value};
+}
+
+DataTypeValue make_int32_data_type_value(int32_t value) {
+  return DataTypeValue{value};
+}
+
+DataTypeValue make_int64_data_type_value(int64_t value) {
+  return DataTypeValue{value};
+}
+
+DataTypeValue make_bool_data_type_value(bool value) {
+  return DataTypeValue{value};
+}
+
+} // namespace FlexFlow
diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/concat.cc b/lib/op-attrs/src/op-attrs/dim_ordered/concat.cc
deleted file mode 100644
index cb29f708a3..0000000000
--- a/lib/op-attrs/src/op-attrs/dim_ordered/concat.cc
+++ /dev/null
@@ -1 +0,0 @@
-#include "op-attrs/dim_ordered/concat.h"
diff --git
a/lib/op-attrs/src/op-attrs/dim_ordered/enumerate.cc b/lib/op-attrs/src/op-attrs/dim_ordered/enumerate.cc deleted file mode 100644 index 6edd5485af..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/enumerate.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/enumerate.h" diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_from_map.cc b/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_from_map.cc deleted file mode 100644 index 2de88f38c8..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_from_map.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/ff_ordered_from_map.h" diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_of.cc b/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_of.cc deleted file mode 100644 index 8e5c2fd38a..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_of.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/ff_ordered_of.h" diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/get_idxs.cc b/lib/op-attrs/src/op-attrs/dim_ordered/get_idxs.cc deleted file mode 100644 index 175ae8d4bd..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/get_idxs.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/get_idxs.h" diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc b/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc index 75ab1a32aa..8c3dbd7bbc 100644 --- a/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc +++ b/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc @@ -1,26 +1 @@ #include "op-attrs/dim_ordered/slice.h" -#include "utils/archetypes/value_type.h" - -namespace FlexFlow { - -using T = value_type<0>; - -template FFOrdered - ff_dim_t_nonoverloaded_slice(FFOrdered const &d, - std::optional const &start, - std::optional const &end); - -template FFOrdered relative_ff_dim_t_nonoverloaded_slice( - FFOrdered const &d, - std::optional const &start, - std::optional const &end); - -template FFOrdered slice(FFOrdered const &d, - std::optional const &start, - std::optional const &end); - -template FFOrdered slice(FFOrdered const &d, - std::optional const &start, - std::optional const &end); - -} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/transform.cc b/lib/op-attrs/src/op-attrs/dim_ordered/transform.cc new file mode 100644 index 0000000000..73683eba94 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/dim_ordered/transform.cc @@ -0,0 +1 @@ +#include "op-attrs/dim_ordered/transform.h" diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/enumerate.cc b/lib/op-attrs/src/op-attrs/ff_ordered/enumerate.cc new file mode 100644 index 0000000000..e06c144149 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/enumerate.cc @@ -0,0 +1,10 @@ +#include "op-attrs/ff_ordered/enumerate.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template std::map enumerate(FFOrdered const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered.cc b/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered.cc new file mode 100644 index 0000000000..1420586809 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered.cc @@ -0,0 +1,14 @@ +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template struct FFOrdered; + +template std::string format_as(FFOrdered const &); + +template std::ostream &operator<<(std::ostream &, FFOrdered const &); + +} // namespace FlexFlow diff --git 
a/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered_from_map.cc b/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered_from_map.cc new file mode 100644 index 0000000000..e39fedb858 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered_from_map.cc @@ -0,0 +1,13 @@ +#include "op-attrs/ff_ordered/ff_ordered_from_map.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template FFOrdered ff_ordered_from_map(std::map const &); + +template FFOrdered + ff_ordered_from_map(std::unordered_map const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/get_idxs.cc b/lib/op-attrs/src/op-attrs/ff_ordered/get_idxs.cc new file mode 100644 index 0000000000..3da15bebba --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/get_idxs.cc @@ -0,0 +1,10 @@ +#include "op-attrs/ff_ordered/get_idxs.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template std::vector get_idxs(FFOrdered const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/slice.cc b/lib/op-attrs/src/op-attrs/ff_ordered/slice.cc new file mode 100644 index 0000000000..059fd811cd --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/slice.cc @@ -0,0 +1,24 @@ +#include "op-attrs/ff_ordered/slice.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template FFOrdered ff_dim_t_nonoverloaded_slice( + FFOrdered const &, ff_dim_t const &, std::optional const &); + +template FFOrdered relative_ff_dim_t_nonoverloaded_slice( + FFOrdered const &, + relative_ff_dim_t const &, + std::optional const &); + +template FFOrdered slice(FFOrdered const &, + ff_dim_t const &, + std::optional const &); + +template FFOrdered slice(FFOrdered const &, + relative_ff_dim_t const &, + std::optional const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/transform.cc b/lib/op-attrs/src/op-attrs/ff_ordered/transform.cc new file mode 100644 index 0000000000..74bf4895a3 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/transform.cc @@ -0,0 +1,12 @@ +#include "op-attrs/ff_ordered/transform.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; +using Out = value_type<1>; +using F = std::function; + +template FFOrdered transform(FFOrdered const &, F &&); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/zip.cc b/lib/op-attrs/src/op-attrs/ff_ordered/zip.cc new file mode 100644 index 0000000000..dc715ea97c --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/zip.cc @@ -0,0 +1,12 @@ +#include "op-attrs/ff_ordered/zip.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T1 = value_type<0>; +using T2 = value_type<1>; + +template FFOrdered> zip(FFOrdered const &, + FFOrdered const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc index d4763ef004..ddd92bd417 100644 --- a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc @@ -1,6 +1,6 @@ #include "op-attrs/ops/batch_norm.h" -#include "op-attrs/dim_ordered/concat.h" -#include "op-attrs/dim_ordered/slice.h" +#include "op-attrs/ff_ordered/concat.h" +#include "op-attrs/ff_ordered/slice.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/any_of.h" diff --git a/lib/op-attrs/src/op-attrs/ops/concat.cc 
b/lib/op-attrs/src/op-attrs/ops/concat.cc index fc42241ef2..bf0ba553e4 100644 --- a/lib/op-attrs/src/op-attrs/ops/concat.cc +++ b/lib/op-attrs/src/op-attrs/ops/concat.cc @@ -1,6 +1,6 @@ #include "op-attrs/ops/concat.h" -#include "op-attrs/dim_ordered/enumerate.h" -#include "op-attrs/dim_ordered/ff_ordered_from_map.h" +#include "op-attrs/ff_ordered/enumerate.h" +#include "op-attrs/ff_ordered/ff_ordered_from_map.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_dims.h" #include "op-attrs/tensor_shape.h" diff --git a/lib/op-attrs/src/op-attrs/ops/embedding.cc b/lib/op-attrs/src/op-attrs/ops/embedding.cc index 4dc602646b..5b5b91a8e7 100644 --- a/lib/op-attrs/src/op-attrs/ops/embedding.cc +++ b/lib/op-attrs/src/op-attrs/ops/embedding.cc @@ -1,8 +1,10 @@ #include "op-attrs/ops/embedding.h" -#include "op-attrs/dim_ordered/slice.h" -#include "op-attrs/dim_ordered/transform.h" +#include "op-attrs/ff_ordered/slice.h" +#include "op-attrs/ff_ordered/transform.h" +#include "op-attrs/ops/embedding_attrs.dtg.h" #include "op-attrs/parallel_tensor_dims.h" #include "utils/containers/product.h" +#include "utils/fmt/optional.h" #include "utils/integer_conversions.h" namespace FlexFlow { diff --git a/lib/op-attrs/src/op-attrs/ops/flat.cc b/lib/op-attrs/src/op-attrs/ops/flat.cc index 8ed12167b3..b4eeda76ab 100644 --- a/lib/op-attrs/src/op-attrs/ops/flat.cc +++ b/lib/op-attrs/src/op-attrs/ops/flat.cc @@ -1,6 +1,6 @@ #include "op-attrs/ops/flat.h" -#include "op-attrs/dim_ordered/concat.h" -#include "op-attrs/dim_ordered/slice.h" +#include "op-attrs/ff_ordered/concat.h" +#include "op-attrs/ff_ordered/slice.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_dims.h" #include "utils/containers/any_of.h" diff --git a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc index 00c6bb5e9b..c9798368e2 100644 --- a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc @@ -1,6 +1,6 @@ #include "op-attrs/ops/layer_norm.h" -#include "op-attrs/dim_ordered/ff_ordered_of.h" -#include "op-attrs/dim_ordered/get_idxs.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" +#include "op-attrs/ff_ordered/get_idxs.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/all_of.h" diff --git a/lib/op-attrs/src/op-attrs/ops/linear.cc b/lib/op-attrs/src/op-attrs/ops/linear.cc index fb26113613..bee9d0cf4f 100644 --- a/lib/op-attrs/src/op-attrs/ops/linear.cc +++ b/lib/op-attrs/src/op-attrs/ops/linear.cc @@ -1,11 +1,12 @@ #include "op-attrs/ops/linear.h" -#include "op-attrs/dim_ordered/slice.h" -#include "op-attrs/dim_ordered/transform.h" +#include "op-attrs/ff_ordered/slice.h" +#include "op-attrs/ff_ordered/transform.h" #include "op-attrs/initializers/kaiming_initializer_mode.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/product.h" #include "utils/expected.h" +#include "utils/fmt/optional.h" #include "utils/integer_conversions.h" namespace FlexFlow { @@ -101,7 +102,7 @@ tl::expected SumDegree sum_degree = SumDegree{1_n}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{ get_sum_degree(input) * product(slice(ff_ordered_shard_degrees(input), - std::nullopt, + relative_ff_dim_t{0}, relative_ff_dim_t{-1}))}; FFOrdered shard_degrees = FFOrdered{ shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree, @@ -126,8 +127,10 @@ tl::expected SumDegree sum_degree = SumDegree{get_sum_degree(input) * 
          shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree};
-  DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{product(slice(
-      ff_ordered_shard_degrees(input), std::nullopt, relative_ff_dim_t{-1}))};
+  DiscardCopyDegree discard_copy_degree =
+      DiscardCopyDegree{product(slice(ff_ordered_shard_degrees(input),
+                                      relative_ff_dim_t{0},
+                                      relative_ff_dim_t{-1}))};
 
   FFOrdered<nonnegative_int> shard_degrees =
       FFOrdered<nonnegative_int>{get_discard_copy_degree(input)};
 
diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc
index 7a8f91e498..3f2245b2dc 100644
--- a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc
+++ b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc
@@ -1,6 +1,6 @@
 #include "op-attrs/parallel_tensor_dims.h"
-#include "op-attrs/dim_ordered/transform.h"
-#include "op-attrs/dim_ordered/zip.h"
+#include "op-attrs/ff_ordered/transform.h"
+#include "op-attrs/ff_ordered/zip.h"
 #include "op-attrs/replica_parallel_dim.h"
 #include "op-attrs/replica_parallel_dim_set.h"
 #include "op-attrs/shard_parallel_dim.h"
diff --git a/lib/op-attrs/src/op-attrs/tensor_dims.cc b/lib/op-attrs/src/op-attrs/tensor_dims.cc
index 8d0592eab7..760278297c 100644
--- a/lib/op-attrs/src/op-attrs/tensor_dims.cc
+++ b/lib/op-attrs/src/op-attrs/tensor_dims.cc
@@ -1,6 +1,6 @@
 #include "op-attrs/tensor_dims.h"
-#include "op-attrs/dim_ordered/slice.h"
-#include "op-attrs/dim_ordered/zip.h"
+#include "op-attrs/ff_ordered/slice.h"
+#include "op-attrs/ff_ordered/zip.h"
 #include "op-attrs/replica_parallel_dim_set.h"
 #include "op-attrs/shard_parallel_dim.dtg.h"
 #include "utils/containers/all_of.h"
@@ -67,7 +67,7 @@ std::optional<TensorDims>
 }
 
 TensorDims slice_tensor_dims(TensorDims const &dims,
-                             std::optional<relative_ff_dim_t> const &start,
+                             relative_ff_dim_t const &start,
                              std::optional<relative_ff_dim_t> const &stop) {
   return TensorDims{
       slice(dims.ff_ordered, start, stop),
diff --git a/lib/op-attrs/src/op-attrs/tensor_shape.cc b/lib/op-attrs/src/op-attrs/tensor_shape.cc
index 04b18794f1..afc14af54c 100644
--- a/lib/op-attrs/src/op-attrs/tensor_shape.cc
+++ b/lib/op-attrs/src/op-attrs/tensor_shape.cc
@@ -29,7 +29,7 @@ nonnegative_int get_size_in_bytes(TensorShape const &s) {
 }
 
 TensorShape slice_tensor_shape(TensorShape const &shape,
-                               std::optional<relative_ff_dim_t> const &start,
+                               relative_ff_dim_t const &start,
                                std::optional<relative_ff_dim_t> const &stop) {
   return TensorShape{
       slice_tensor_dims(shape.dims, start, stop),
diff --git a/lib/op-attrs/test/src/op-attrs/datatype_value.cc b/lib/op-attrs/test/src/op-attrs/datatype_value.cc
new file mode 100644
index 0000000000..9b0e90b601
--- /dev/null
+++ b/lib/op-attrs/test/src/op-attrs/datatype_value.cc
@@ -0,0 +1,68 @@
+#include "op-attrs/datatype_value.h"
+#include <doctest/doctest.h>
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("test make_data_type_value") {
+    SUBCASE("make_float_data_type_value") {
+      float value = 1.0f;
+      DataTypeValue data_type_value = make_float_data_type_value(value);
+
+      CHECK(data_type_value.has<float>());
+      CHECK_FALSE(data_type_value.has<double>());
+      CHECK_FALSE(data_type_value.has<int32_t>());
+      CHECK_FALSE(data_type_value.has<int64_t>());
+      CHECK_FALSE(data_type_value.has<bool>());
+      CHECK(data_type_value.get<float>() == value);
+    }
+
+    SUBCASE("make_double_data_type_value") {
+      double value = 2.71828;
+      DataTypeValue data_type_value = make_double_data_type_value(value);
+
+      CHECK(data_type_value.has<double>());
+      CHECK_FALSE(data_type_value.has<float>());
+      CHECK_FALSE(data_type_value.has<int32_t>());
+      CHECK_FALSE(data_type_value.has<int64_t>());
+      CHECK_FALSE(data_type_value.has<bool>());
+      CHECK(data_type_value.get<double>() == value);
+    }
+
+    SUBCASE("make_int32_data_type_value") {
+      int32_t value = -42;
+      DataTypeValue data_type_value = make_int32_data_type_value(value);
+
+      CHECK(data_type_value.has<int32_t>());
+      CHECK_FALSE(data_type_value.has<float>());
+      CHECK_FALSE(data_type_value.has<double>());
+      CHECK_FALSE(data_type_value.has<int64_t>());
+      CHECK_FALSE(data_type_value.has<bool>());
+      CHECK(data_type_value.get<int32_t>() == value);
+    }
+
+    SUBCASE("make_int64_data_type_value") {
+      int64_t value = 1LL << 40;
+      DataTypeValue data_type_value = make_int64_data_type_value(value);
+
+      CHECK(data_type_value.has<int64_t>());
+      CHECK_FALSE(data_type_value.has<float>());
+      CHECK_FALSE(data_type_value.has<double>());
+      CHECK_FALSE(data_type_value.has<int32_t>());
+      CHECK_FALSE(data_type_value.has<bool>());
+      CHECK(data_type_value.get<int64_t>() == value);
+    }
+
+    SUBCASE("make_bool_data_type_value") {
+      bool value = true;
+      DataTypeValue data_type_value = make_bool_data_type_value(value);
+
+      CHECK(data_type_value.has<bool>());
+      CHECK_FALSE(data_type_value.has<float>());
+      CHECK_FALSE(data_type_value.has<double>());
+      CHECK_FALSE(data_type_value.has<int32_t>());
+      CHECK_FALSE(data_type_value.has<int64_t>());
+      CHECK(data_type_value.get<bool>() == value);
+    }
+  }
+}
diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/dim_ordered.cc b/lib/op-attrs/test/src/op-attrs/dim_ordered/dim_ordered.cc
index d7901a0c53..a5a261da25 100644
--- a/lib/op-attrs/test/src/op-attrs/dim_ordered/dim_ordered.cc
+++ b/lib/op-attrs/test/src/op-attrs/dim_ordered/dim_ordered.cc
@@ -10,8 +10,4 @@ TEST_SUITE(FF_TEST_SUITE) {
       "Arbitrary<DimOrdered<ff_dim_t, T>> with T=", T, int, double, char) {
     RC_SUBCASE([](DimOrdered<ff_dim_t, T>) {});
   }
-
-  TEST_CASE_TEMPLATE("Arbitrary<FFOrdered<T>> with T=", T, int, double, char) {
-    RC_SUBCASE([](FFOrdered<T>) {});
-  }
 }
diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/concat.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc
similarity index 97%
rename from lib/op-attrs/test/src/op-attrs/dim_ordered/concat.cc
rename to lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc
index 2ac641cfc2..d8e04124bc 100644
--- a/lib/op-attrs/test/src/op-attrs/dim_ordered/concat.cc
+++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc
@@ -1,4 +1,4 @@
-#include "op-attrs/dim_ordered/concat.h"
+#include "op-attrs/ff_ordered/concat.h"
 #include <doctest/doctest.h>
 
 using namespace ::FlexFlow;
diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/enumerate.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc
similarity index 92%
rename from lib/op-attrs/test/src/op-attrs/dim_ordered/enumerate.cc
rename to lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc
index bf4c33d65a..e1a94e72c3 100644
--- a/lib/op-attrs/test/src/op-attrs/dim_ordered/enumerate.cc
+++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc
@@ -1,4 +1,4 @@
-#include "op-attrs/dim_ordered/enumerate.h"
+#include "op-attrs/ff_ordered/enumerate.h"
 #include "test/utils/doctest/fmt/map.h"
 #include <doctest/doctest.h>
 
diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered.cc
new file mode 100644
index 0000000000..b0812ba9d6
--- /dev/null
+++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered.cc
@@ -0,0 +1,11 @@
+#include "op-attrs/ff_ordered/ff_ordered.h"
+#include "test/utils/rapidcheck.h"
+#include <doctest/doctest.h>
+
+using namespace FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE_TEMPLATE("Arbitrary<FFOrdered<T>> with T=", T, int, double, char) {
+    RC_SUBCASE([](FFOrdered<T>) {});
+  }
+}
diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/ff_ordered_from_map.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc
similarity index 96%
rename from lib/op-attrs/test/src/op-attrs/dim_ordered/ff_ordered_from_map.cc
rename to lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc
index bba989920e..73036d5662 100644
--- a/lib/op-attrs/test/src/op-attrs/dim_ordered/ff_ordered_from_map.cc
+++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc
@@ -1,4 +1,4 @@
-#include "op-attrs/dim_ordered/ff_ordered_from_map.h"
+#include "op-attrs/ff_ordered/ff_ordered_from_map.h"
 #include <doctest/doctest.h>
 
 using namespace ::FlexFlow;
diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/slice.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/slice.cc
similarity index 79%
rename from lib/op-attrs/test/src/op-attrs/dim_ordered/slice.cc
rename to lib/op-attrs/test/src/op-attrs/ff_ordered/slice.cc
index b2fddd058e..2f1dfecd65 100644
--- a/lib/op-attrs/test/src/op-attrs/dim_ordered/slice.cc
+++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/slice.cc
@@ -1,4 +1,4 @@
-#include "op-attrs/dim_ordered/slice.h"
+#include "op-attrs/ff_ordered/slice.h"
 #include <doctest/doctest.h>
 
 using namespace ::FlexFlow;
@@ -25,13 +25,6 @@ TEST_SUITE(FF_TEST_SUITE) {
       CHECK(result == correct);
     }
 
-    SUBCASE("std::nullopt_t, ff_dim_t") {
-      FFOrdered<int> result =
-          slice(d, std::nullopt, ff_dim_t{nonnegative_int{3}});
-      FFOrdered<int> correct = FFOrdered<int>{1, 2, 3};
-
-      CHECK(result == correct);
-    }
     SUBCASE("relative_ff_dim_t, relative_ff_dim_t") {
       FFOrdered<int> result =
           slice(d, relative_ff_dim_t{1}, relative_ff_dim_t{-1});
@@ -45,12 +38,6 @@ TEST_SUITE(FF_TEST_SUITE) {
       CHECK(result == correct);
     }
 
-    SUBCASE("std::nullopt_t, relative_ff_dim_t") {
-      FFOrdered<int> result = slice(d, std::nullopt, relative_ff_dim_t{-1});
-      FFOrdered<int> correct = FFOrdered<int>{1, 2, 3};
-
-      CHECK(result == correct);
-    }
     SUBCASE("start index = stop index") {
       FFOrdered<int> result =
          slice(d, relative_ff_dim_t{1}, relative_ff_dim_t{1});
@@ -86,10 +73,10 @@ TEST_SUITE(FF_TEST_SUITE) {
       CHECK_THROWS(slice(d, relative_ff_dim_t{10}, std::nullopt));
     }
     SUBCASE("stop index out of bounds (too low)") {
-      CHECK_THROWS(slice(d, std::nullopt, relative_ff_dim_t{-10}));
+      CHECK_THROWS(slice(d, relative_ff_dim_t{0}, relative_ff_dim_t{-10}));
     }
     SUBCASE("stop index out of bounds (too high)") {
-      CHECK_THROWS(slice(d, std::nullopt, relative_ff_dim_t{10}));
+      CHECK_THROWS(slice(d, relative_ff_dim_t{0}, relative_ff_dim_t{10}));
    }
   }
 }
diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc
new file mode 100644
index 0000000000..4bf189ec77
--- /dev/null
+++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc
@@ -0,0 +1,35 @@
+#include "op-attrs/ff_ordered/transform.h"
+#include <doctest/doctest.h>
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("transform(FFOrdered, F)") {
+    SUBCASE("input is empty") {
+      FFOrdered<std::string> input = {};
+
+      FFOrdered<int> result = transform(input, [](std::string const &) -> int {
+        CHECK(false);
+        return 0;
+      });
+      FFOrdered<int> correct = {};
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("input is not empty") {
+      FFOrdered<int> input = {2, 1, 2, 5};
+
+      FFOrdered<std::string> result =
+          transform(input, [](int x) { return fmt::to_string(x); });
+      FFOrdered<std::string> correct = FFOrdered<std::string>{
+          "2",
+          "1",
+          "2",
+          "5",
+      };
+
+      CHECK(result == correct);
+    }
+  }
+}
diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc
new file mode 100644
index 0000000000..19167cd0ff
--- /dev/null
+++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc
@@ -0,0 +1,38 @@
+#include "op-attrs/ff_ordered/zip.h"
+#include "test/utils/doctest/fmt/pair.h"
+#include <doctest/doctest.h>
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("zip(FFOrdered, FFOrdered)") {
+    FFOrdered<int> lhs_input = {9, 9, 8, 9};
+    FFOrdered<std::string> rhs_input = {"m", "m", "k", "l", "m"};
+
+    SUBCASE("rhs is longer") {
+      FFOrdered<std::pair<int, std::string>> result = zip(lhs_input, rhs_input);
+
+      FFOrdered<std::pair<int, std::string>> correct = {
+          {9, "m"},
+          {9, "m"},
+          {8, "k"},
+          {9, "l"},
+      };
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("lhs is longer") {
+      FFOrdered<std::pair<std::string, int>> result = zip(rhs_input, lhs_input);
+
+      FFOrdered<std::pair<std::string, int>> correct = {
+          {"m", 9},
+          {"m", 9},
+          {"k", 8},
+          {"l", 9},
+      };
+
+      CHECK(result == correct);
+    }
+  }
+}
diff --git a/lib/pcg/include/pcg/metric.enum.toml b/lib/pcg/include/pcg/metric.enum.toml
new file mode 100644
index 0000000000..ebb2323203
--- /dev/null
+++ b/lib/pcg/include/pcg/metric.enum.toml
@@ -0,0 +1,26 @@
+namespace = "FlexFlow"
+name = "Metric"
+features = [
+  "hash",
+  "json",
+  "rapidcheck",
+  "fmt",
+]
+
+[[values]]
+name = "ACCURACY"
+
+[[values]]
+name = "CATEGORICAL_CROSSENTROPY"
+
+[[values]]
+name = "SPARSE_CATEGORICAL_CROSSENTROPY"
+
+[[values]]
+name = "MEAN_SQUARED_ERROR"
+
+[[values]]
+name = "ROOT_MEAN_SQUARED_ERROR"
+
+[[values]]
+name = "MEAN_ABSOLUTE_ERROR"
diff --git a/lib/pcg/include/pcg/metric_attrs.h b/lib/pcg/include/pcg/metric_attrs.h
new file mode 100644
index 0000000000..343c2154dd
--- /dev/null
+++ b/lib/pcg/include/pcg/metric_attrs.h
@@ -0,0 +1,28 @@
+#ifndef _FF_METRICS_H_
+#define _FF_METRICS_H_
+
+#include "op-attrs/ops/loss_functions/loss_functions.h"
+#include "pcg/metric.dtg.h"
+#include "utils/fmt.h"
+#include <unordered_set>
+
+namespace FlexFlow {
+
+class MetricsAttrs {
+public:
+  MetricsAttrs() = delete;
+  MetricsAttrs(LossFunction, std::unordered_set<Metric> const &);
+
+public:
+  LossFunction loss_type;
+  bool measure_accuracy;
+  bool measure_categorical_crossentropy;
+  bool measure_sparse_categorical_crossentropy;
+  bool measure_mean_squared_error;
+  bool measure_root_mean_squared_error;
+  bool measure_mean_absolute_error;
+};
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
index 3542e73dea..f820c56d61 100644
--- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
+++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
@@ -1,6 +1,7 @@
 #ifndef _FLEXFLOW_PCG_INCLUDE_PCG_PARALLEL_COMPUTATION_GRAPH_H
 #define _FLEXFLOW_PCG_INCLUDE_PCG_PARALLEL_COMPUTATION_GRAPH_H
 
+#include "pcg/computation_graph.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph_edge.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_layer_added_result.dtg.h"
diff --git a/lib/pcg/src/pcg/metric_attrs.cc b/lib/pcg/src/pcg/metric_attrs.cc
new file mode 100644
index 0000000000..9a93e75350
--- /dev/null
+++ b/lib/pcg/src/pcg/metric_attrs.cc
@@ -0,0 +1,38 @@
+#include "pcg/metric_attrs.h"
+
+namespace FlexFlow {
+MetricsAttrs::MetricsAttrs(LossFunction _loss_type,
+                           std::unordered_set<Metric> const &metrics)
+    : loss_type(_loss_type), measure_accuracy(false),
+      measure_categorical_crossentropy(false),
+      measure_sparse_categorical_crossentropy(false),
+      measure_mean_squared_error(false), measure_root_mean_squared_error(false),
+      measure_mean_absolute_error(false) {
+  for (Metric const &m : metrics) {
+    switch (m) {
+      case Metric::ACCURACY:
+        measure_accuracy = true;
+        continue;
+      case Metric::CATEGORICAL_CROSSENTROPY:
+        measure_categorical_crossentropy = true;
+        continue;
+      case Metric::SPARSE_CATEGORICAL_CROSSENTROPY:
+        measure_sparse_categorical_crossentropy = true;
+        continue;
+      case Metric::MEAN_SQUARED_ERROR:
+        measure_mean_squared_error = true;
+        continue;
+      case Metric::ROOT_MEAN_SQUARED_ERROR:
+        measure_root_mean_squared_error = true;
+        continue;
+      case Metric::MEAN_ABSOLUTE_ERROR:
+        measure_mean_absolute_error = true;
+        continue;
+      default:
+        throw mk_runtime_error(fmt::format(
+            "Initializing MetricsAttrs with unrecognized metrics type {}", m));
+    }
+  }
+}
+
+} // namespace FlexFlow
diff --git a/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc b/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc
index 2cf149f78a..940024c9b6 100644
--- a/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc
+++ b/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc
@@ -1,5 +1,5 @@
 #include "pcg/parallel_computation_graph/generate_weight_transform.h"
-#include "op-attrs/dim_ordered/enumerate.h"
+#include "op-attrs/ff_ordered/enumerate.h"
 #include "op-attrs/parallel_tensor_shape.h"
 
 namespace FlexFlow {
diff --git a/lib/runtime/src/metrics_functions.cc b/lib/runtime/src/metrics_functions.cc
index feb6e704b2..33e15baed2 100644
--- a/lib/runtime/src/metrics_functions.cc
+++ b/lib/runtime/src/metrics_functions.cc
@@ -25,39 +25,6 @@ namespace FlexFlow {
 
 LegionRuntime::Logger::Category log_metrics("metrics");
 
-MetricsAttrs::MetricsAttrs(LossFunction _loss_type,
-                           std::vector<Metric> const &metrics)
-    : loss_type(_loss_type), measure_accuracy(false),
-      measure_categorical_crossentropy(false),
-      measure_sparse_categorical_crossentropy(false),
-      measure_mean_squared_error(false), measure_root_mean_squared_error(false),
-      measure_mean_absolute_error(false) {
-  for (Metric const &m : metrics) {
-    switch (m) {
-      case Metric::ACCURACY:
-        measure_accuracy = true;
-        continue;
-      case Metric::CATEGORICAL_CROSSENTROPY:
-        measure_categorical_crossentropy = true;
-        continue;
-      case Metric::SPARSE_CATEGORICAL_CROSSENTROPY:
-        measure_sparse_categorical_crossentropy = true;
-        continue;
-      case Metric::MEAN_SQUARED_ERROR:
-        measure_mean_squared_error = true;
-        continue;
-      case Metric::ROOT_MEAN_SQUARED_ERROR:
-        measure_root_mean_squared_error = true;
-        continue;
-      case Metric::MEAN_ABSOLUTE_ERROR:
-        measure_mean_absolute_error = true;
-        continue;
-      default:
-        throw mk_runtime_error("Unrecogonized metrics type {}", m);
-    }
-  }
-}
-
 enum Slots {
   LOGIT,
   LABEL,
diff --git a/lib/runtime/src/metrics_functions.h b/lib/runtime/src/metrics_functions.h
index fbb0b633bf..73dc3bbc51 100644
--- a/lib/runtime/src/metrics_functions.h
+++ b/lib/runtime/src/metrics_functions.h
@@ -16,38 +16,13 @@
 #ifndef _FF_METRICS_FUNCTIONS_H_
 #define _FF_METRICS_FUNCTIONS_H_
 
+#include "kernels/metric.h"
 #include "kernels/perf_metrics.h"
 #include "legion.h"
-#include "op-attrs/ops/loss_functions.h"
 #include "task_spec/task_invocation.h"
-#include "utils/fmt.h"
 
 namespace FlexFlow {
 
-enum class Metric {
-  ACCURACY,
-  CATEGORICAL_CROSSENTROPY,
-  SPARSE_CATEGORICAL_CROSSENTROPY,
-  MEAN_SQUARED_ERROR,
-  ROOT_MEAN_SQUARED_ERROR,
-  MEAN_ABSOLUTE_ERROR,
-};
-
-class MetricsAttrs {
-public:
-  MetricsAttrs() = delete;
-  MetricsAttrs(LossFunction, std::vector<Metric> const &);
-
-public:
-  LossFunction loss_type;
-  bool measure_accuracy;
-  bool measure_categorical_crossentropy;
-  bool measure_sparse_categorical_crossentropy;
-  bool measure_mean_squared_error;
-  bool measure_root_mean_squared_error;
-  bool measure_mean_absolute_error;
-};
-
 TypedIndexTaskInvocation<PerfMetrics>
compute_metrics(MetricsAttrs const &, parallel_tensor_guid_t const &logit, @@ -79,40 +54,4 @@ VISITABLE_STRUCT(::FlexFlow::MetricsAttrs, measure_root_mean_squared_error, measure_mean_absolute_error); -namespace fmt { - -template <> -struct formatter<::FlexFlow::Metric> : formatter { - template - auto format(::FlexFlow::Metric m, FormatContext &ctx) const - -> decltype(ctx.out()) { - using namespace FlexFlow; - - string_view name = "unknown"; - switch (m) { - case Metric::ACCURACY: - name = "Accuracy"; - break; - case Metric::CATEGORICAL_CROSSENTROPY: - name = "CategoricalCrossEntropy"; - break; - case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: - name = "SparseCategoricalCrossEntropy"; - break; - case Metric::MEAN_SQUARED_ERROR: - name = "MeanSquaredError"; - break; - case Metric::ROOT_MEAN_SQUARED_ERROR: - name = "RootMeanSquaredError"; - break; - case Metric::MEAN_ABSOLUTE_ERROR: - name = "MeanAbsoluteError"; - break; - } - return formatter::format(name, ctx); - } -}; - -} // namespace fmt - #endif diff --git a/lib/runtime/src/ops/embedding.cc b/lib/runtime/src/ops/embedding.cc index 253fd3cb4f..83e7c15460 100644 --- a/lib/runtime/src/ops/embedding.cc +++ b/lib/runtime/src/ops/embedding.cc @@ -77,11 +77,11 @@ static std::optional return profile(backward_kernel, profiling, "[Embedding] backward_time = {:.2lf}ms\n", - input, output, + input, weight_grad, - input.data_type, output.data_type, + input.data_type, attrs.aggr, input.shape.get_dim(), output.shape.get_dim(), diff --git a/lib/substitutions/include/substitutions/pcg_pattern.h b/lib/substitutions/include/substitutions/pcg_pattern.h index f0962b15c2..5005a0b51c 100644 --- a/lib/substitutions/include/substitutions/pcg_pattern.h +++ b/lib/substitutions/include/substitutions/pcg_pattern.h @@ -12,6 +12,10 @@ namespace FlexFlow { std::unordered_set get_nodes(PCGPattern const &); +std::optional + get_random_pattern_match(PCGPattern const &pattern, + SubParallelComputationGraph const &pcg); + /** * @brief Find all locations in \p pcg that match \p pattern */ diff --git a/lib/substitutions/include/substitutions/unity_substitution_set.h b/lib/substitutions/include/substitutions/unity_substitution_set.h index 183f76ac8a..959ba3da2c 100644 --- a/lib/substitutions/include/substitutions/unity_substitution_set.h +++ b/lib/substitutions/include/substitutions/unity_substitution_set.h @@ -6,6 +6,8 @@ #include "utils/fmt/vector.h" namespace FlexFlow { +std::optional + get_random_substitution(MachineSpecification const &resources); std::vector get_substitution_set(MachineSpecification const &resources); diff --git a/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc b/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc index 194ae49255..f39b771364 100644 --- a/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc +++ b/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc @@ -16,6 +16,16 @@ bool operator_satisfies_constraint( switch (constraint.constraint_type) { case ConstraintType::EQUAL: return expr_val.value() == constraint.attribute_value; + case ConstraintType::DIVISIBLE_BY: { + if (expr_val.value().has() && + constraint.attribute_value.has()) { + return expr_val.value().get() % + constraint.attribute_value.get() == + 0; + } + throw mk_runtime_error( + "DIVISIBLE_BY constraint requires nonnegative_int values"); + } default: throw mk_runtime_error( fmt::format("Unknown constraint type {}", diff --git a/lib/substitutions/src/substitutions/pcg_pattern.cc 
diff --git a/lib/substitutions/src/substitutions/pcg_pattern.cc b/lib/substitutions/src/substitutions/pcg_pattern.cc index a0af875848..fbc181a0f9 100644 --- a/lib/substitutions/src/substitutions/pcg_pattern.cc +++ b/lib/substitutions/src/substitutions/pcg_pattern.cc @@ -11,6 +11,7 @@ #include "utils/graph/node/algorithms.h" #include "utils/graph/open_dataflow_graph/algorithms/get_inputs.h" #include "utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.h" +#include "utils/random_utils.h" namespace FlexFlow { @@ -37,6 +38,17 @@ static MatchAdditionalCriterion }}; } +std::optional<PCGPatternMatch> + get_random_pattern_match(PCGPattern const &pattern, + SubParallelComputationGraph const &pcg) { + std::vector<PCGPatternMatch> pattern_matches = + find_pattern_matches(pattern, pcg); + if (pattern_matches.empty()) { + return std::nullopt; + } + return select_random(pattern_matches); +} + std::vector<PCGPatternMatch> find_pattern_matches(PCGPattern const &pattern, SubParallelComputationGraph const &pcg) { diff --git a/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc b/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc index 83df74f21b..0c673f0a8a 100644 --- a/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc +++ b/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc @@ -188,34 +188,33 @@ bool sub_pcgs_are_isomorphic(SubParallelComputationGraph const &lhs, } std::string as_dot(SubParallelComputationGraph const &spcg) { - NOT_IMPLEMENTED(); - // std::function<std::string(ParallelLayerAttrs const &)> get_node_label = - // [](ParallelLayerAttrs const &a) -> std::string { - // RecordFormatter r = as_dot(a.op_attrs); - // - // if (a.name.has_value()) { - // RecordFormatter rr; - // rr << "Name" << a.name.value(); - // r << rr; - // } - // - // std::ostringstream oss; - // oss << r; - // return oss.str(); - // }; - // - // std::function<std::string(ParallelTensorAttrs const &)> get_input_label = - // [](ParallelTensorAttrs const &a) -> std::string { - // RecordFormatter r; - // - // r << fmt::to_string(a.shape); - // - // std::ostringstream oss; - // oss << r; - // return oss.str(); - // }; - // - // return as_dot(spcg.raw_graph, get_node_label, get_input_label); + std::function<std::string(ParallelLayerAttrs const &)> get_node_label = + [](ParallelLayerAttrs const &a) -> std::string { + RecordFormatter r = as_dot(a.op_attrs); + + if (a.name.has_value()) { + RecordFormatter rr; + rr << "Name" << a.name.value(); + r << rr; + } + + std::ostringstream oss; + oss << r; + return oss.str(); + }; + + std::function<std::string(ParallelTensorAttrs const &)> get_input_label = + [](ParallelTensorAttrs const &a) -> std::string { + RecordFormatter r; + + r << fmt::to_string(a.shape); + + std::ostringstream oss; + oss << r; + return oss.str(); + }; + + return as_dot(spcg.raw_graph, get_node_label, get_input_label); } void debug_print_dot(SubParallelComputationGraph const &spcg) { diff --git a/lib/substitutions/src/substitutions/tensor_pattern/satisfies_constraint.cc b/lib/substitutions/src/substitutions/tensor_pattern/satisfies_constraint.cc index 974bfcabc0..cc0af12c91 100644 --- a/lib/substitutions/src/substitutions/tensor_pattern/satisfies_constraint.cc +++ b/lib/substitutions/src/substitutions/tensor_pattern/satisfies_constraint.cc @@ -12,6 +12,16 @@ bool parallel_tensor_satisfies_constraint( switch (constraint.constraint_type) { case ConstraintType::EQUAL: return expr_val == constraint.attribute_value; + case ConstraintType::DIVISIBLE_BY: { + if (expr_val.has<nonnegative_int>() && + constraint.attribute_value.has<nonnegative_int>()) { + return expr_val.get<nonnegative_int>() % + constraint.attribute_value.get<nonnegative_int>() == + 0; + } + throw mk_runtime_error( + "DIVISIBLE_BY constraint requires nonnegative_int values"); + } default: throw mk_runtime_error( fmt::format("Unknown constraint type {}",
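
get_random_pattern_match above composes the existing exhaustive find_pattern_matches with select_random from utils/random_utils.h. The contract assumed of select_random is a uniform pick from a nonempty vector, roughly as follows (hypothetical sketch; the real helper is not shown in this diff):

    #include <cstdlib>
    #include <vector>

    template <typename T>
    T select_random_sketch(std::vector<T> const &xs) {
      // callers, as above, are responsible for ruling out the empty case first
      return xs[std::rand() % xs.size()];
    }
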
diff --git a/lib/substitutions/src/substitutions/unity_substitution_set.cc b/lib/substitutions/src/substitutions/unity_substitution_set.cc index 4b00cdd95f..c8d9266978 100644 --- a/lib/substitutions/src/substitutions/unity_substitution_set.cc +++ b/lib/substitutions/src/substitutions/unity_substitution_set.cc @@ -7,9 +7,19 @@ #include "utils/containers/get_only.h" #include "utils/nonnegative_int/nonnegative_int.h" #include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/random_utils.h" namespace FlexFlow { +std::optional<Substitution> + get_random_substitution(MachineSpecification const &resources) { + std::vector<Substitution> substitutions = get_substitution_set(resources); + if (substitutions.empty()) { + return std::nullopt; + } + return select_random(substitutions); +} + std::vector<Substitution> get_substitution_set(MachineSpecification const &resources) { std::vector<Substitution> substitutions; diff --git a/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc b/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc index 9d8e4bc259..fa0ff7794a 100644 --- a/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc +++ b/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc @@ -140,7 +140,6 @@ std::vector<UnlabelledDataflowGraphPatternMatch> } } } - return matches; } diff --git a/lib/utils/include/utils/containers/subvec.h b/lib/utils/include/utils/containers/slice.h similarity index 69% rename from lib/utils/include/utils/containers/subvec.h rename to lib/utils/include/utils/containers/slice.h index c89e9227de..a82fb383b5 100644 --- a/lib/utils/include/utils/containers/subvec.h +++ b/lib/utils/include/utils/containers/slice.h @@ -9,9 +9,9 @@ namespace FlexFlow { template <typename T> -std::vector<T> subvec(std::vector<T> const &v, - std::optional<int> const &maybe_start, - std::optional<int> const &maybe_end) { +std::vector<T> slice(std::vector<T> const &v, + int const &maybe_start, + std::optional<int> const &maybe_end) { auto begin_iter = v.cbegin(); auto end_iter = v.cend(); @@ -22,15 +22,13 @@ std::vector<T> subvec(std::vector<T> const &v, if (idx < 0) { new_idx = size + idx; } - if (new_idx < 0 || new_idx > size) { - throw mk_runtime_error("Index {} is out of bounds for array {}"); - } + + ASSERT(new_idx >= 0, "Index out of bounds"); + ASSERT(new_idx <= size, "Index out of bounds"); return new_idx; }; - if (maybe_start.has_value()) { - begin_iter += resolve_loc(maybe_start.value()); - } + begin_iter += resolve_loc(maybe_start); if (maybe_end.has_value()) { end_iter = v.cbegin() + resolve_loc(maybe_end.value()); diff --git a/lib/utils/include/utils/containers/zip_strict.h b/lib/utils/include/utils/containers/zip_strict.h index 64049042d4..5606fccff1 100644 --- a/lib/utils/include/utils/containers/zip_strict.h +++ b/lib/utils/include/utils/containers/zip_strict.h @@ -4,21 +4,17 @@ #include "utils/containers/zip.h" #include "utils/exception.h" #include "utils/fmt/vector.h" +#include <libassert/assert.hpp> namespace FlexFlow { template <typename L, typename R> std::vector<std::pair<L, R>> zip_strict(std::vector<L> const &lhs, std::vector<R> const &rhs) { - if (lhs.size() != rhs.size()) { - throw mk_runtime_error( - fmt::format("zip_strict requires lhs and rhs to have the same length, " - "but received lhs={} (length {}), rhs={} (length {})", - lhs, - lhs.size(), - rhs, - rhs.size())); - } + ASSERT(lhs.size() == rhs.size(), + "zip_strict requires lhs and rhs to have the same length", + lhs, + rhs); return zip(lhs, rhs); }
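
The subvec -> slice rename above also tightens the interface: the start index is now mandatory (negative values count from the end, Python-style) while only the end index stays optional, and out-of-bounds indices now trip ASSERT instead of a formatted throw. Expected behavior, mirroring the updated tests later in this patch:

    #include "utils/containers/slice.h"
    #include <cassert>
    #include <optional>
    #include <vector>

    int main() {
      std::vector<int> v = {1, 2, 3, 4, 5};
      assert((FlexFlow::slice(v, 1, 4) == std::vector<int>{2, 3, 4}));
      assert((FlexFlow::slice(v, 2, std::nullopt) == std::vector<int>{3, 4, 5}));
      assert((FlexFlow::slice(v, -3, -1) == std::vector<int>{3, 4})); // negative = from the end
      return 0;
    }
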
diff --git a/lib/utils/include/utils/exception.h b/lib/utils/include/utils/exception.h index 080cbb3611..f95eb8a38d 100644 --- a/lib/utils/include/utils/exception.h +++ b/lib/utils/include/utils/exception.h @@ -3,6 +3,7 @@ #include "utils/fmt.h" #include +#include <libassert/assert.hpp> #include #include diff --git a/lib/utils/include/utils/full_binary_tree/as_dot.h b/lib/utils/include/utils/full_binary_tree/as_dot.h new file mode 100644 index 0000000000..e104d05e06 --- /dev/null +++ b/lib/utils/include/utils/full_binary_tree/as_dot.h @@ -0,0 +1,81 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_FULL_BINARY_TREE_AS_DOT_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_FULL_BINARY_TREE_AS_DOT_H + +#include "utils/containers/get_only.h" +#include "utils/dot_file.h" +#include "utils/full_binary_tree/full_binary_tree_implementation.dtg.h" +#include "utils/full_binary_tree/full_binary_tree_visitor.dtg.h" +#include "utils/full_binary_tree/visit.h" +#include "utils/graph/dataflow_graph/dataflow_graph.h" +#include "utils/graph/dataflow_graph/dataflow_graph_view.h" +#include "utils/graph/digraph/digraph_view.h" +#include "utils/graph/instances/adjacency_digraph.h" +#include "utils/graph/instances/unordered_set_dataflow_graph.h" +#include "utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h" +#include "utils/graph/labelled_dataflow_graph/algorithms/view_as_labelled_open_dataflow_graph.h" +#include "utils/graph/labelled_dataflow_graph/labelled_dataflow_graph.h" +#include "utils/graph/labelled_open_dataflow_graph/algorithms/as_dot.h" +#include <functional> +#include <string> +#include <variant> + +namespace FlexFlow { + +template <typename Tree, typename Parent, typename Leaf, typename NodeLabel> +LabelledDataflowGraph<NodeLabel, std::monostate> as_labelled_dataflow_graph( + Tree const &tree, + FullBinaryTreeImplementation<Tree, Parent, Leaf> const &impl, + std::function<NodeLabel(Parent const &)> const &get_parent_label, + std::function<NodeLabel(Leaf const &)> const &get_leaf_label) { + auto g = LabelledDataflowGraph<NodeLabel, std::monostate>::template create< + UnorderedSetLabelledOpenDataflowGraph<NodeLabel, std::monostate>>(); + + FullBinaryTreeVisitor<DataflowOutput, Tree, Parent, Leaf> visitor = + FullBinaryTreeVisitor<DataflowOutput, Tree, Parent, Leaf>{ + [&](Parent const &parent) -> DataflowOutput { + DataflowOutput left_child_output = + visit(impl.get_left_child(parent), impl, visitor); + DataflowOutput right_child_output = + visit(impl.get_right_child(parent), impl, visitor); + NodeLabel parent_label = get_parent_label(parent); + NodeAddedResult parent_added = + g.add_node(parent_label, + {left_child_output, right_child_output}, + {std::monostate{}}); + return get_only(parent_added.outputs); + }, + [&](Leaf const &leaf) -> DataflowOutput { + NodeLabel leaf_label = get_leaf_label(leaf); + NodeAddedResult leaf_added = + g.add_node(leaf_label, {}, {std::monostate{}}); + return get_only(leaf_added.outputs); + }, + }; + + visit(tree, impl, visitor); + + return g; +} + +template <typename Tree, typename Parent, typename Leaf> +std::string + as_dot(Tree const &tree, + FullBinaryTreeImplementation<Tree, Parent, Leaf> const &impl, + std::function<std::string(Parent const &)> const &get_parent_label, + std::function<std::string(Leaf const &)> const &get_leaf_label) { + + LabelledDataflowGraphView<std::string, std::monostate> g = + as_labelled_dataflow_graph(tree, impl, get_parent_label, get_leaf_label); + + std::function<std::string(std::string const &)> get_node_label = + [](std::string const &s) { return s; }; + std::function<std::string(std::monostate const &)> get_input_label = + [](std::monostate const &) { return ""; }; + + return as_dot( + view_as_labelled_open_dataflow_graph(g), get_node_label, get_input_label); +} + +} // namespace FlexFlow + +#endif
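
The new full_binary_tree/as_dot.h renders an arbitrary full binary tree to graphviz dot by first materializing it as a labelled dataflow graph and reusing the existing dataflow-graph as_dot. The call shape, sketched with placeholder types (MyTree/MyParent/MyLeaf and the tree/impl values below are assumptions for illustration, not APIs from this patch):

    // Explicit template arguments let the lambdas convert to the
    // std::function parameters.
    std::string dot = FlexFlow::as_dot<MyTree, MyParent, MyLeaf>(
        tree,
        impl,
        /*get_parent_label=*/[](MyParent const &p) -> std::string { return "parent"; },
        /*get_leaf_label=*/[](MyLeaf const &l) -> std::string { return "leaf"; });
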
diff --git a/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h b/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h index de48cd17e9..9b4ea6cd20 100644 --- a/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h +++ b/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h @@ -1,11 +1,13 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_SERIES_PARALLEL_BINARY_SP_DECOMPOSITION_TREE_BINARY_SP_DECOMPOSITION_TREE_H #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_SERIES_PARALLEL_BINARY_SP_DECOMPOSITION_TREE_BINARY_SP_DECOMPOSITION_TREE_H +#include "utils/full_binary_tree/binary_tree_path.dtg.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_parallel_split.dtg.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_series_split.dtg.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.dtg.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree_implementation.dtg.h" #include "utils/graph/series_parallel/sp_decomposition_tree_node_type.dtg.h" +#include <optional> #include namespace FlexFlow { @@ -23,6 +25,10 @@ std::unordered_multiset<Node> get_leaves(BinarySPDecompositionTree const &); SPDecompositionTreeNodeType get_node_type(BinarySPDecompositionTree const &); +std::optional<BinarySPDecompositionTree> + binary_sp_decomposition_tree_get_subtree_at_path( + BinarySPDecompositionTree const &, BinaryTreePath const &); + } // namespace FlexFlow #endif diff --git a/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/as_dot.h b/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/as_dot.h new file mode 100644 index 0000000000..9c999d8f6e --- /dev/null +++ b/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/as_dot.h @@ -0,0 +1,43 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_SERIES_PARALLEL_BINARY_SP_DECOMPOSITION_TREE_GENERIC_BINARY_SP_DECOMPOSITION_TREE_AS_DOT_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_SERIES_PARALLEL_BINARY_SP_DECOMPOSITION_TREE_GENERIC_BINARY_SP_DECOMPOSITION_TREE_AS_DOT_H + +#include "utils/full_binary_tree/as_dot.h" +#include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree_implementation.dtg.h" +#include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree_implementation.h" +#include "utils/overload.h" + +namespace FlexFlow { + +template <typename Tree, typename Series, typename Parallel, typename Leaf> +std::string as_dot( + Tree const &tree, + GenericBinarySPDecompositionTreeImplementation<Tree, Series, Parallel, Leaf> const &impl, + std::function<std::string(Series const &)> const &get_series_label, + std::function<std::string(Parallel const &)> const &get_parallel_label, + std::function<std::string(Leaf const &)> const &get_leaf_label) { + FullBinaryTreeImplementation<Tree, std::variant<Series, Parallel>, Leaf> + full_binary_tree_impl = get_full_binary_impl_from_generic_sp_impl(impl); + + std::function<std::string(std::variant<Series, Parallel> const &)> + get_parent_label = + [&](std::variant<Series, Parallel> const &parent) -> std::string { + return std::visit(overload{ + [&](Series const &series) -> std::string { + return get_series_label(series); + }, + [&](Parallel const &parallel) -> std::string { + return get_parallel_label(parallel); + }, + }, + parent); + }; + + return as_dot(tree, full_binary_tree_impl, get_parent_label, get_leaf_label); +} + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/indent.h b/lib/utils/include/utils/indent.h new file mode 100644 index 0000000000..eccbd34cfc --- /dev/null +++ b/lib/utils/include/utils/indent.h @@ -0,0 +1,12 @@ +#ifndef
_FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_INDENT_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_INDENT_H + +#include <string> + +namespace FlexFlow { + +std::string indent(std::string const &, int indent_size = 2); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/optional.h b/lib/utils/include/utils/optional.h index 377561d70c..8673264d36 100644 --- a/lib/utils/include/utils/optional.h +++ b/lib/utils/include/utils/optional.h @@ -32,6 +32,11 @@ T const &assert_unwrap(std::optional<T> const &o) { return o.value(); } +template <typename T> +T expect(std::optional<T> const &x, std::string const &err) { + return unwrap(x, [&]() { throw mk_runtime_error(err); }); +} + } // namespace FlexFlow #endif diff --git a/lib/utils/include/utils/random_utils.h b/lib/utils/include/utils/random_utils.h index 99da9646a1..014c38fc51 100644 --- a/lib/utils/include/utils/random_utils.h +++ b/lib/utils/include/utils/random_utils.h @@ -5,7 +5,7 @@ #include #include -float randf() { +inline float randf() { return static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX); } diff --git a/lib/utils/include/utils/stack_vector/stack_vector.h b/lib/utils/include/utils/stack_vector/stack_vector.h index 5d4d6eaad3..64d005a10e 100644 --- a/lib/utils/include/utils/stack_vector/stack_vector.h +++ b/lib/utils/include/utils/stack_vector/stack_vector.h @@ -272,18 +272,6 @@ struct stack_vector { return !(*this == other); } - bool operator<(stack_vector const &other) const { - for (std::size_t i = 0; i < std::min(this->m_size, other.m_size); i++) { - if (this->at(i) < other.at(i)) { - return true; - } else if (this->at(i) > other.at(i)) { - return false; - } - } - - return (this->m_size < other.m_size); - } - std::size_t size() const { return this->m_size; } @@ -305,17 +293,16 @@ struct stack_vector { private: std::size_t m_size = 0; std::array<T, MAXSIZE> contents; - - static_assert( - implies<is_equal_comparable<T>, is_equal_comparable<stack_vector>>::value, - ""); - static_assert( - implies<is_neq_comparable<T>, is_neq_comparable<stack_vector>>::value, - ""); - static_assert( - implies<is_lt_comparable<T>, is_lt_comparable<stack_vector>>::value, ""); }; +template <typename T, std::size_t MAXSIZE> +auto operator<(stack_vector<T, MAXSIZE> const &lhs, + stack_vector<T, MAXSIZE> const &rhs) + -> std::enable_if_t<is_lt_comparable_v<T>, bool> { + return std::lexicographical_compare( + lhs.begin(), lhs.end(), rhs.begin(), rhs.end()); +} + template <typename T, std::size_t MAXSIZE> std::ostream &operator<<(std::ostream &s, stack_vector<T, MAXSIZE> const &v) { return s << fmt::to_string(v); diff --git a/lib/utils/src/utils/containers/slice.cc b/lib/utils/src/utils/containers/slice.cc new file mode 100644 index 0000000000..f960c21881 --- /dev/null +++ b/lib/utils/src/utils/containers/slice.cc @@ -0,0 +1,3 @@ +#include "utils/containers/slice.h" + +namespace FlexFlow {} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/subvec.cc b/lib/utils/src/utils/containers/subvec.cc deleted file mode 100644 index 93c7de31c5..0000000000 --- a/lib/utils/src/utils/containers/subvec.cc +++ /dev/null @@ -1 +0,0 @@ -#include "utils/containers/subvec.h" diff --git a/lib/utils/src/utils/full_binary_tree/as_dot.cc b/lib/utils/src/utils/full_binary_tree/as_dot.cc new file mode 100644 index 0000000000..12a1ab5533 --- /dev/null +++ b/lib/utils/src/utils/full_binary_tree/as_dot.cc @@ -0,0 +1,16 @@ +#include "utils/full_binary_tree/as_dot.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using Tree = value_type<0>; +using Parent = value_type<1>; +using Leaf = value_type<2>; + +template std::string + as_dot(Tree const &, + FullBinaryTreeImplementation<Tree, Parent, Leaf> const &, + std::function<std::string(Parent const &)> const &, + std::function<std::string(Leaf const &)> const &); + +} // namespace FlexFlow
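
The expect helper added to utils/optional.h above is a fail-loudly accessor for std::optional that, unlike assert_unwrap, lets the caller choose the error text. A small usage sketch (the config-lookup framing is invented for illustration):

    #include "utils/optional.h"
    #include <optional>

    int lookup_port(std::optional<int> const &maybe_port) {
      // throws mk_runtime_error("no port configured") when the optional is empty
      return FlexFlow::expect(maybe_port, "no port configured");
    }
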
diff --git a/lib/utils/src/utils/full_binary_tree/binary_tree_path.cc b/lib/utils/src/utils/full_binary_tree/binary_tree_path.cc index 8445a2721a..8aed06ae01 100644 --- a/lib/utils/src/utils/full_binary_tree/binary_tree_path.cc +++ b/lib/utils/src/utils/full_binary_tree/binary_tree_path.cc @@ -1,5 +1,5 @@ #include "utils/full_binary_tree/binary_tree_path.h" -#include "utils/containers/subvec.h" +#include "utils/containers/slice.h" namespace FlexFlow { @@ -27,7 +27,7 @@ BinaryTreePathEntry binary_tree_path_get_top_level(BinaryTreePath const &p) { BinaryTreePath binary_tree_path_get_non_top_level(BinaryTreePath const &p) { return BinaryTreePath{ - subvec(p.entries, 1, std::nullopt), + slice(p.entries, 1, std::nullopt), }; } diff --git a/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.cc b/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.cc index 62489ff75f..3e4bc13289 100644 --- a/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.cc +++ b/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.cc @@ -1,5 +1,6 @@ #include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_leaves.h" +#include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_subtree_at_path.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/is_binary_sp_tree_left_associative.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/is_binary_sp_tree_right_associative.h" @@ -82,4 +83,10 @@ SPDecompositionTreeNodeType }); } +std::optional<BinarySPDecompositionTree> + binary_sp_decomposition_tree_get_subtree_at_path( + BinarySPDecompositionTree const &tree, BinaryTreePath const &path) { + return get_subtree_at_path(tree, generic_impl_for_binary_sp_tree(), path); +} + } // namespace FlexFlow diff --git a/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/as_dot.cc b/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/as_dot.cc new file mode 100644 index 0000000000..f557515c83 --- /dev/null +++ b/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/as_dot.cc @@ -0,0 +1,21 @@ +#include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/as_dot.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using Tree = value_type<0>; +using Series = value_type<1>; +using Parallel = value_type<2>; +using Leaf = value_type<3>; + +template std::string + as_dot(Tree const &, + GenericBinarySPDecompositionTreeImplementation<Tree, Series, Parallel, Leaf> const &, + std::function<std::string(Series const &)> const &, + std::function<std::string(Parallel const &)> const &, + std::function<std::string(Leaf const &)> const &); + +} // namespace FlexFlow
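
binary_sp_decomposition_tree_get_subtree_at_path walks a BinaryTreePath (a sequence of left/right turns) down from the root and yields std::nullopt once the path leaves the tree. A usage sketch; the LEFT_CHILD/RIGHT_CHILD enumerator names below are assumptions about BinaryTreePathEntry, not taken from this diff:

    // Select the right child of the root's left child, if it exists.
    BinaryTreePath path = BinaryTreePath{{
        BinaryTreePathEntry::LEFT_CHILD,  // assumed enumerator name
        BinaryTreePathEntry::RIGHT_CHILD, // assumed enumerator name
    }};
    std::optional<BinarySPDecompositionTree> subtree =
        binary_sp_decomposition_tree_get_subtree_at_path(tree, path);
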
"utils/containers/require_same.h" -#include "utils/containers/subvec.h" +#include "utils/containers/slice.h" #include "utils/containers/unordered_set_of.h" #include "utils/containers/values.h" #include "utils/graph/digraph/algorithms/get_predecessors.h" @@ -103,7 +103,7 @@ MultiDiEdge Node last = g.get_multidiedge_dst(reduction.edges.back()); std::vector internal_nodes; - for (MultiDiEdge const &e : subvec(reduction.edges, std::nullopt, -1)) { + for (MultiDiEdge const &e : slice(reduction.edges, 0, -1)) { internal_nodes.push_back(g.get_multidiedge_dst(e)); } diff --git a/lib/utils/src/utils/indent.cc b/lib/utils/src/utils/indent.cc new file mode 100644 index 0000000000..2761ad1878 --- /dev/null +++ b/lib/utils/src/utils/indent.cc @@ -0,0 +1,17 @@ +#include "utils/indent.h" +#include "utils/containers/flatmap.h" + +namespace FlexFlow { + +std::string indent(std::string const &s, int indent_size) { + std::string indent_str(indent_size, ' '); + return indent_str + flatmap(s, [&](char c) -> std::string { + if (c == '\n') { + return "\n" + indent_str; + } else { + return std::string{c}; + }; + }); +} + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/stack_vector/stack_vector.cc b/lib/utils/src/utils/stack_vector/stack_vector.cc index d4fb849412..e2009d74d3 100644 --- a/lib/utils/src/utils/stack_vector/stack_vector.cc +++ b/lib/utils/src/utils/stack_vector/stack_vector.cc @@ -1,9 +1,9 @@ #include "utils/stack_vector/stack_vector.h" -#include "utils/archetypes/ordered_value_type.h" +#include "utils/archetypes/value_type.h" namespace FlexFlow { -using T = ordered_value_type<0>; +using T = value_type<0>; template struct stack_vector; template struct stack_vector; diff --git a/lib/utils/test/common/include/test/utils/doctest/check_kv.h b/lib/utils/test/common/include/test/utils/doctest/check_kv.h new file mode 100644 index 0000000000..6449b8ac87 --- /dev/null +++ b/lib/utils/test/common/include/test/utils/doctest/check_kv.h @@ -0,0 +1,12 @@ +#ifndef _FLEXFLOW_LIB_UTILS_TEST_COMMON_INCLUDE_TEST_UTILS_DOCTEST_CHECK_KV_H +#define _FLEXFLOW_LIB_UTILS_TEST_COMMON_INCLUDE_TEST_UTILS_DOCTEST_CHECK_KV_H + +#include + +namespace FlexFlow { + +std::string check_kv(std::string const &k, std::string const &v); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/test/common/src/main.cc b/lib/utils/test/common/src/main.cc index 9522fa7fdb..6df2d925b7 100644 --- a/lib/utils/test/common/src/main.cc +++ b/lib/utils/test/common/src/main.cc @@ -1,2 +1,15 @@ -#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN -#include "doctest/doctest.h" +#define DOCTEST_CONFIG_IMPLEMENT +#include + +#include +#include + +void libassert_throw_exception_handler(libassert::assertion_info const &info) { + throw std::runtime_error("Assertion failed:\n" + info.to_string()); +} + +int main(int argc, char **argv) { + libassert::set_failure_handler(libassert_throw_exception_handler); + + return doctest::Context(argc, argv).run(); +} diff --git a/lib/utils/test/common/src/test/utils/doctest/check_kv.cc b/lib/utils/test/common/src/test/utils/doctest/check_kv.cc new file mode 100644 index 0000000000..d3c1ee335e --- /dev/null +++ b/lib/utils/test/common/src/test/utils/doctest/check_kv.cc @@ -0,0 +1,17 @@ +#include "test/utils/doctest/check_kv.h" +#include "utils/indent.h" +#include + +namespace FlexFlow { + +std::string check_kv(std::string const &k, std::string const &v) { + std::ostringstream oss; + + oss << std::endl + << indent(k + "=", /*indent_size=*/4) << std::endl + << indent(v, /*indent_size=*/6); + + return oss.str(); +} + +} 
diff --git a/lib/utils/test/src/utils/containers/subvec.cc b/lib/utils/test/src/utils/containers/slice.cc similarity index 69% rename from lib/utils/test/src/utils/containers/subvec.cc rename to lib/utils/test/src/utils/containers/slice.cc index 610fc55b5a..4e4d840bfe 100644 --- a/lib/utils/test/src/utils/containers/subvec.cc +++ b/lib/utils/test/src/utils/containers/slice.cc @@ -1,4 +1,4 @@ -#include "utils/containers/subvec.h" +#include "utils/containers/slice.h" #include "test/utils/doctest/fmt/vector.h" #include #include @@ -6,57 +6,57 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("subvec") { + TEST_CASE("slice") { std::vector<int> v = {1, 2, 3, 4, 5}; - SUBCASE("Basic subvector") { - auto result = subvec(v, 1, 4); + SUBCASE("Basic slice") { + auto result = slice(v, 1, 4); std::vector<int> correct = {2, 3, 4}; CHECK(result == correct); } SUBCASE("From beginning to index") { - auto result = subvec(v, std::nullopt, 3); + auto result = slice(v, 0, 3); std::vector<int> correct = {1, 2, 3}; CHECK(result == correct); } SUBCASE("From index to end") { - auto result = subvec(v, 2, std::nullopt); + auto result = slice(v, 2, std::nullopt); std::vector<int> correct = {3, 4, 5}; CHECK(result == correct); } SUBCASE("All of the vector") { - auto result = subvec(v, std::nullopt, std::nullopt); + auto result = slice(v, 0, std::nullopt); std::vector<int> correct = {1, 2, 3, 4, 5}; CHECK(result == correct); } SUBCASE("Start greater than end") { - auto result = subvec(v, 3, 1); + auto result = slice(v, 3, 1); std::vector<int> correct = {}; CHECK(result == correct); } SUBCASE("Start equal to end") { - auto result = subvec(v, 3, 3); + auto result = slice(v, 3, 3); std::vector<int> correct = {}; CHECK(result == correct); } SUBCASE("Negative indices") { - auto result = subvec(v, -3, -1); + auto result = slice(v, -3, -1); std::vector<int> correct = {3, 4}; CHECK(result == correct); } SUBCASE("Upper index is out of bounds by 1") { - CHECK_THROWS(subvec(v, 2, 6)); + CHECK_THROWS(slice(v, 2, 6)); } SUBCASE("Lower index is out of bounds by 1") { - CHECK_THROWS(subvec(v, -6, 2)); + CHECK_THROWS(slice(v, -6, 2)); } } } diff --git a/lib/utils/test/src/utils/indent.cc b/lib/utils/test/src/utils/indent.cc new file mode 100644 index 0000000000..b137253fae --- /dev/null +++ b/lib/utils/test/src/utils/indent.cc @@ -0,0 +1,66 @@ +#include "utils/indent.h" +#include <doctest/doctest.h> + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("indent") { + SUBCASE("string is empty") { + std::string input = ""; + + std::string result = indent(input); + std::string correct = "  "; + + CHECK(result == correct); + } + + SUBCASE("string is one line") { + std::string input = "hello world"; + std::string result = indent(input); + std::string correct = "  hello world"; + + CHECK(result == correct); + } + + SUBCASE("string has multiple lines") { + std::string input = "\n" + "a b\n" + "c d\n" + "e f\n" + "g\n"; + + std::string result = indent(input); + std::string correct = "  \n" + "  a b\n" + "  c d\n" + "  e f\n" + "  g\n" + "  "; + + CHECK(result == correct); + } + + SUBCASE("leading and trailing whitespace is preserved") { + std::string input = " a b \n" + "c d e\n" + " "; + + std::string result = indent(input); + std::string correct = "   a b \n" + "  c d e\n" + "   "; + + CHECK(result == correct); + } + + SUBCASE("allows custom indent size") { + std::string input = "hello\nworld"; + + std::string result = indent(input, /*indent_size=*/4); + std::string correct = "    hello\n" + "    world"; + + CHECK(result == correct); + } + } +}
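
The member operator< deleted from stack_vector earlier in this patch compared elements left to right and then fell back to size, which is exactly lexicographic order; the free operator< that replaces it delegates to std::lexicographical_compare and is SFINAE-gated on is_lt_comparable_v<T> so that unordered element types simply lose the ordering rather than failing to compile. The semantics the tests below rely on, built with push_back (the only construction API this patch shows):

    FlexFlow::stack_vector<int, 5> lhs, rhs;
    lhs.push_back(2); lhs.push_back(1); lhs.push_back(2);
    rhs.push_back(2); rhs.push_back(1); rhs.push_back(2); rhs.push_back(3);
    bool smaller = (lhs < rhs); // true: lhs is a proper prefix of rhs
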
diff --git a/lib/utils/test/src/utils/stack_vector/stack_vector.cc b/lib/utils/test/src/utils/stack_vector/stack_vector.cc index c36de733b6..6eb2cc0d88 100644 --- a/lib/utils/test/src/utils/stack_vector/stack_vector.cc +++ b/lib/utils/test/src/utils/stack_vector/stack_vector.cc @@ -1,12 +1,97 @@ #include "utils/stack_vector/stack_vector.h" #include "test/utils/doctest/fmt/vector.h" #include "test/utils/rapidcheck.h" +#include "utils/archetypes/value_type.h" #include #include using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("operator<(stack_vector, stack_vector)") { + constexpr std::size_t MAXSIZE = 5; + + SUBCASE("T is ordered") { + SUBCASE("inputs are the same") { + std::vector<int> input = {2, 1, 2, 3}; + + bool result = (input < input); + bool correct = false; + + CHECK(result == correct); + } + + SUBCASE("lhs is strict prefix of rhs") { + std::vector<int> lhs = {2, 1, 2}; + std::vector<int> rhs = {2, 1, 2, 3}; + + bool result = (lhs < rhs); + bool correct = true; + + CHECK(result == correct); + } + + SUBCASE("lhs is empty") { + std::vector<int> lhs = {}; + std::vector<int> rhs = {2, 1, 2, 3}; + + bool result = (lhs < rhs); + bool correct = true; + + CHECK(result == correct); + } + + SUBCASE("lhs has a smaller element first") { + std::vector<int> lhs = {2, 1, 0, 3}; + std::vector<int> rhs = {2, 1, 2}; + + bool result = (lhs < rhs); + bool correct = true; + + CHECK(result == correct); + } + + // from the definition of a strict total order, i.e., + // https://en.wikipedia.org/w/index.php?title=Total_order&oldid=1278541072#Strict_and_non-strict_total_orders + RC_SUBCASE("operator< is irreflexive", + [](stack_vector<int, MAXSIZE> const &input) { + RC_ASSERT(!(input < input)); + }); + + RC_SUBCASE("operator< is asymmetric", + [](stack_vector<int, MAXSIZE> const &lhs, + stack_vector<int, MAXSIZE> const &rhs) { + RC_PRE(lhs != rhs); + + RC_ASSERT((lhs < rhs) == !(rhs < lhs)); + }); + + RC_SUBCASE("operator< is transitive", + [](stack_vector<int, MAXSIZE> const &a, + stack_vector<int, MAXSIZE> const &b, + stack_vector<int, MAXSIZE> const &c) { + RC_PRE(a < b); + RC_PRE(b < c); + + RC_ASSERT(a < c); + }); + + RC_SUBCASE("operator< is connected", + [](stack_vector<int, MAXSIZE> const &lhs, + stack_vector<int, MAXSIZE> const &rhs) { + RC_PRE(lhs != rhs); + + RC_ASSERT((lhs < rhs) || (rhs < lhs)); + }); + } + + SUBCASE("T is not ordered") { + bool result = is_lt_comparable_v<stack_vector<value_type<0>, MAXSIZE>>; + + CHECK_FALSE(result); + } + } + TEST_CASE_TEMPLATE( "stack_vector::push_back", T, int, double, char) { constexpr std::size_t MAXSIZE = 5;