From bfd8477c51f2a1d28e8d6d3426dbcd1da0877d7e Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Thu, 23 May 2024 16:23:24 +0200 Subject: [PATCH 1/4] Update proposal2a.md --- proposal2a.md | 48 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/proposal2a.md b/proposal2a.md index 7469672..ee7cbcb 100644 --- a/proposal2a.md +++ b/proposal2a.md @@ -14,16 +14,36 @@ This can probably be just about any integral type. ```C typedef enum { - XXX_TYPE_F32, - XXX_TYPE_F64, - XXX_TYPE_C32, - XXX_TYPE_C64, + /* maybe these could be a bitfield, but are there enough bits??? */ + /* Reserved for standard 0x0 - 0x100 (for example) */ + XXX_TYPE_F32, // required + XXX_TYPE_F64, // required + XXX_TYPE_C32, // required + XXX_TYPE_C64, // required + XXX_TYPE_F16, // required? + XXX_TYPE_BF16, // required? + XXX_TYPE_F8, + XXX_TYPE_BF8, + XXX_TYPE_I32, + XXX_TYPE_U32, + XXX_TYPE_I16, + XXX_TYPE_U16, + XXX_TYPE_I8, + XXX_TYPE_U8, + /* Available for implementers 0x100 - 0x1000 (for example) */ ... } XXX_datatype; typedef enum { + /* Implementations may use more precise computational type */ + /* Reserved for standard 0x2000 - 0x2100 (for example) */ XXX_TYPE_F32_F32_ACCUM_F32 = XXX_TYPE_F32, + XXX_TYPE_F64_F64_ACCUM_F64 = XXX_TYPE_F64, + ..., + XXX_TYPE_LOWER, /* narrowest of input precisions */ /* should this be part of attr's */ /* should there be a truly neutral default (maybe HW dependent)? */ + XXX_TYPE_HIGHER, /* widest of input precisions */ + /* Available for implementers 0x2100 - 0x3000 (for example) */ ... } XXX_comp_datatype; ``` @@ -32,6 +52,11 @@ Enumerations for the supported storage and computational datatypes. Not all comb ```C typedef /* unspecified */ XXX_error; // Should be a trivial type, e.g. "int" +/* + * Required errors: + * - Invalid values (negative lengths, same extent for shared dimension) + * - Null pointers (except 0-dimensional [or maybe 1+-dimensional is required?]) + int XXX_error_check(XXX_error err); // return non-zero on error const char* XXX_error_explain(XXX_error err); @@ -81,16 +106,13 @@ XXX_contract(const void* alpha, const void* C, XXX_datatype type_C, int nmode_C, - const XXX_extent* shape_C, - const XXX_stride* stride_C, + const XXX_extent* shape_C, + const XXX_stride* stride_C, const XXX_index* idx_C, - void* D, - XXX_datatype type_D, - int nmode_D, - const XXX_extent* shape_D, - const XXX_stride* stride_D, - const XXX_index* idx_D, - XXX_comp_datatype comp_type, + void* D, // users should specify C twice for in-place + XXX_datatype type_D, // instead, could C or D be NULL? + const XXX_stride* stride_D, // if C == D, do we also need nmode_D, shape_D, etc.? + XXX_comp_datatype comp_type, // maybe XXX_IN_PLACE tag for C == D? XXX_attr attr); ``` From 9dd3a7d76bfa75ec5fe1e49d50c02c0665ce1fdc Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Thu, 23 May 2024 17:03:45 +0200 Subject: [PATCH 2/4] Update proposal2a.md --- proposal2a.md | 144 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 141 insertions(+), 3 deletions(-) diff --git a/proposal2a.md b/proposal2a.md index ee7cbcb..72cda50 100644 --- a/proposal2a.md +++ b/proposal2a.md @@ -56,12 +56,15 @@ typedef /* unspecified */ XXX_error; // Should be a trivial type, e.g. "int" * Required errors: * - Invalid values (negative lengths, same extent for shared dimension) * - Null pointers (except 0-dimensional [or maybe 1+-dimensional is required?]) + * - If D == C (or XXX_IN_PLACE), stride_D_XXX are ignored (can be NULL) + * - Invocation failure (generic failure)? + * + * Should some other information be available, e.g. out-of-memory so user could try again later. + */ -int XXX_error_check(XXX_error err); // return non-zero on error +int XXX_check_success(XXX_error err); // return non-zero on success const char* XXX_error_explain(XXX_error err); - -void XXX_error_clear(XXX_error err); ``` Error handling --- implementation defined. @@ -69,6 +72,10 @@ Error handling --- implementation defined. typedef /* unspecified */ XXX_attr; // Requires initialization. E.g. "struct XXX_attr_internal*" typedef int32_t XXX_key; // Some values should be reserved for standardization +/* + * Potential keys: + * - Execution plan (pointer to object) + XXX_error XXX_attr_init(XXX_attr* attr); XXX_error XXX_attr_destroy(XXX_attr* attr); @@ -84,6 +91,9 @@ Implementation defined (and maybe some standard) attributes, loosely based on MP ```C // Unary and binary element-wise operations (transpose, scale, norm, reduction, etc.) should also be defined! +// Element-wise ops on A, B, and AB are very important for machine learning. +// Can this functionality be required in the interface without requiring JIT???? + // Compute D_{idx_D} = alpha * A_{idx_A} * B_{idx_B} + beta * C_{idx_C} XXX_error @@ -114,5 +124,133 @@ XXX_contract(const void* alpha, const XXX_stride* stride_D, // if C == D, do we also need nmode_D, shape_D, etc.? XXX_comp_datatype comp_type, // maybe XXX_IN_PLACE tag for C == D? XXX_attr attr); + +// Compute D_{MNL} = alpha * \sum_K A_{MKL} B_{KNL} + beta * C_{MNL} + +XXX_error +XXX_contract( int nmode_M, + const XXX_extent* shape_M, + int nmode_N, + const XXX_extent* shape_N, + int nmode_K, + const XXX_extent* shape_K, + int nmode_L, + const XXX_extent* shape_L, + const void* alpha, + XXX_datatype type_alpha, + const void* A, + XXX_datatype type_A, + const XXX_stride* stride_A_M, + const XXX_stride* stride_A_K, + const XXX_stride* stride_A_L, + const void* B, + XXX_datatype type_B, + const XXX_stride* stride_B_K, + const XXX_stride* stride_B_N, + const XXX_stride* stride_B_L, + const void* beta, + XXX_datatype type_beta, + const void* C, + XXX_datatype type_C, + const XXX_stride* stride_C_M, + const XXX_stride* stride_C_N, + const XXX_stride* stride_C_L, + void* D, // users should specify C twice for in-place + XXX_datatype type_D, // instead, could C or D be NULL? + const XXX_stride* stride_D_M, // if C == D, do we also need nmode_D, shape_D, etc.? + const XXX_stride* stride_D_N, // maybe XXX_IN_PLACE tag for C == D? + const XXX_stride* stride_D_L, + XXX_comp_datatype type_comp, + XXX_attr attr); + +// Compute D_{MNL} = alpha * \sum_K A_{MKL} B_{KNL} + beta * C_{MNL} +// Here, plan creation is a required part of the API + +typedef /* unspecified */ XXX_plan; // probably pointer to struct + +XXX_error +XXX_contract_plan( int nmode_M, + const XXX_extent* shape_M, + int nmode_N, + const XXX_extent* shape_N, + int nmode_K, + const XXX_extent* shape_K, + int nmode_L, + const XXX_extent* shape_L, + XXX_datatype type_alpha, + XXX_datatype type_A, + const XXX_stride* stride_A_M, + const XXX_stride* stride_A_K, + const XXX_stride* stride_A_L, + XXX_datatype type_B, + const XXX_stride* stride_B_K, + const XXX_stride* stride_B_N, + const XXX_stride* stride_B_L, + XXX_datatype type_beta, + XXX_datatype type_C, + const XXX_stride* stride_C_M, + const XXX_stride* stride_C_N, + const XXX_stride* stride_C_L, // users should specify C twice for in-place + XXX_datatype type_D, // instead, could C or D be NULL? + const XXX_stride* stride_D_M, // if C == D, do we also need nmode_D, shape_D, etc.? + const XXX_stride* stride_D_N, // maybe XXX_IN_PLACE tag for C == D? + const XXX_stride* stride_D_L, + XXX_comp_datatype type_comp, + XXX_plan* plan, + XXX_attr attr); + +XXX_error +XXX_contract_execute( + const void* alpha, + const void* A, + const void* B, + const void* beta, + const void* C, + void* D, + XXX_plan plan); + +// Batched tensor contraction (TBD) + +XXX_error +XXX_contract_batched( + int batch_size, + int nmode_M, + const XXX_extent* shape_M, + int nmode_N, + const XXX_extent* shape_N, + int nmode_K, + const XXX_extent* shape_K, + int nmode_L, + const XXX_extent* shape_L, + const void* alpha, + XXX_datatype type_alpha, + const void** A, + XXX_datatype type_A, + const XXX_stride* stride_A_M, + const XXX_stride* stride_A_K, + const XXX_stride* stride_A_L, + const void** B, + XXX_datatype type_B, + const XXX_stride* stride_B_K, + const XXX_stride* stride_B_N, + const XXX_stride* stride_B_L, + const void* beta, + XXX_datatype type_beta, + const void** C, + XXX_datatype type_C, + const XXX_stride* stride_C_M, + const XXX_stride* stride_C_N, + const XXX_stride* stride_C_L, + void** D, // users should specify C twice for in-place + XXX_datatype type_D, // instead, could C or D be NULL? + const XXX_stride* stride_D_M, // if C == D, do we also need nmode_D, shape_D, etc.? + const XXX_stride* stride_D_N, // maybe XXX_IN_PLACE tag for C == D? + const XXX_stride* stride_D_L, + XXX_comp_datatype type_comp, + XXX_attr attr); + +/* See also cublasDgemmGroupedBatched for more complex batched interface */ + + ``` From 4a1c76b7b1132aa0866ab50f3872da14e03f2b11 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 24 May 2024 12:43:06 +0200 Subject: [PATCH 3/4] Update proposal2a.md --- proposal2a.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/proposal2a.md b/proposal2a.md index 72cda50..7e7b234 100644 --- a/proposal2a.md +++ b/proposal2a.md @@ -62,9 +62,13 @@ typedef /* unspecified */ XXX_error; // Should be a trivial type, e.g. "int" * Should some other information be available, e.g. out-of-memory so user could try again later. */ -int XXX_check_success(XXX_error err); // return non-zero on success +// The error explain function should not allocate the error string itself +// for security concerns. +// Adapted from the function MPI_Error_string +XXX_ERROR XXX_error_explain(XXX_ERROR err, char *error_string, int *error_size); -const char* XXX_error_explain(XXX_error err); +// Additionally one has to define as in MPI a MAX_ERROR_STRING +#define XXX_MAX_ERROR_STRING 512 /* implementation dependent */ ``` Error handling --- implementation defined. From 6461c3b6e378aebfd1befbde62e73364988fef2c Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 24 May 2024 12:51:05 +0200 Subject: [PATCH 4/4] Update proposal2a.md --- proposal2a.md | 87 ++++----------------------------------------------- 1 file changed, 6 insertions(+), 81 deletions(-) diff --git a/proposal2a.md b/proposal2a.md index 7e7b234..221a137 100644 --- a/proposal2a.md +++ b/proposal2a.md @@ -99,108 +99,33 @@ Implementation defined (and maybe some standard) attributes, loosely based on MP // Can this functionality be required in the interface without requiring JIT???? // Compute D_{idx_D} = alpha * A_{idx_A} * B_{idx_B} + beta * C_{idx_C} +// Here, plan creation is a required part of the API + +typedef /* unspecified */ XXX_plan; // probably pointer to struct XXX_error -XXX_contract(const void* alpha, +XXX_contract_plan( XXX_datatype type_alpha, - const void* A, XXX_datatype type_A, int nmode_A, const XXX_extent* shape_A, const XXX_stride* stride_A, const XXX_index* idx_A, - const void* B, XXX_datatype type_B, int nmode_B, const XXX_extent* shape_B, const XXX_stride* stride_B, const XXX_index* idx_B, - const void* beta, XXX_datatype type_beta, - const void* C, XXX_datatype type_C, int nmode_C, const XXX_extent* shape_C, const XXX_stride* stride_C, - const XXX_index* idx_C, - void* D, // users should specify C twice for in-place + const XXX_index* idx_C, // users should specify C twice for in-place XXX_datatype type_D, // instead, could C or D be NULL? const XXX_stride* stride_D, // if C == D, do we also need nmode_D, shape_D, etc.? - XXX_comp_datatype comp_type, // maybe XXX_IN_PLACE tag for C == D? - XXX_attr attr); - -// Compute D_{MNL} = alpha * \sum_K A_{MKL} B_{KNL} + beta * C_{MNL} - -XXX_error -XXX_contract( int nmode_M, - const XXX_extent* shape_M, - int nmode_N, - const XXX_extent* shape_N, - int nmode_K, - const XXX_extent* shape_K, - int nmode_L, - const XXX_extent* shape_L, - const void* alpha, - XXX_datatype type_alpha, - const void* A, - XXX_datatype type_A, - const XXX_stride* stride_A_M, - const XXX_stride* stride_A_K, - const XXX_stride* stride_A_L, - const void* B, - XXX_datatype type_B, - const XXX_stride* stride_B_K, - const XXX_stride* stride_B_N, - const XXX_stride* stride_B_L, - const void* beta, - XXX_datatype type_beta, - const void* C, - XXX_datatype type_C, - const XXX_stride* stride_C_M, - const XXX_stride* stride_C_N, - const XXX_stride* stride_C_L, - void* D, // users should specify C twice for in-place - XXX_datatype type_D, // instead, could C or D be NULL? - const XXX_stride* stride_D_M, // if C == D, do we also need nmode_D, shape_D, etc.? - const XXX_stride* stride_D_N, // maybe XXX_IN_PLACE tag for C == D? - const XXX_stride* stride_D_L, - XXX_comp_datatype type_comp, - XXX_attr attr); - -// Compute D_{MNL} = alpha * \sum_K A_{MKL} B_{KNL} + beta * C_{MNL} -// Here, plan creation is a required part of the API - -typedef /* unspecified */ XXX_plan; // probably pointer to struct - -XXX_error -XXX_contract_plan( int nmode_M, - const XXX_extent* shape_M, - int nmode_N, - const XXX_extent* shape_N, - int nmode_K, - const XXX_extent* shape_K, - int nmode_L, - const XXX_extent* shape_L, - XXX_datatype type_alpha, - XXX_datatype type_A, - const XXX_stride* stride_A_M, - const XXX_stride* stride_A_K, - const XXX_stride* stride_A_L, - XXX_datatype type_B, - const XXX_stride* stride_B_K, - const XXX_stride* stride_B_N, - const XXX_stride* stride_B_L, - XXX_datatype type_beta, - XXX_datatype type_C, - const XXX_stride* stride_C_M, - const XXX_stride* stride_C_N, - const XXX_stride* stride_C_L, // users should specify C twice for in-place - XXX_datatype type_D, // instead, could C or D be NULL? - const XXX_stride* stride_D_M, // if C == D, do we also need nmode_D, shape_D, etc.? - const XXX_stride* stride_D_N, // maybe XXX_IN_PLACE tag for C == D? - const XXX_stride* stride_D_L, XXX_comp_datatype type_comp, - XXX_plan* plan, + XXX_plan plan, XXX_attr attr); XXX_error