25 changes: 5 additions & 20 deletions include/oneapi/dnnl/dnnl_types.h
@@ -64,24 +64,6 @@ typedef enum {
dnnl_format_kind_max = 0x7fff,
} dnnl_format_kind_t;

#ifdef DNNL_EXPERIMENTAL_SPARSE
/// Sparse encodings.
typedef enum {
/// Undefined sparse encoding kind, used for empty memory descriptors.
dnnl_sparse_encoding_undef = 0,
/// Compressed Sparse Row (CSR) encoding.
dnnl_csr,
/// An encoding that is used for an opaque storage schema for
/// tensors with unstructured sparsity. A memory descriptor with the
/// packed encoding cannot be used to create a memory object. It can
/// only be used to create a primitive descriptor to query the
/// actual memory descriptor (similar to the format tag `any`).
dnnl_packed,
/// Coordinate Sparse Encoding (COO).
dnnl_coo,
} dnnl_sparse_encoding_t;
#endif

#ifdef DNNL_EXPERIMENTAL_PROFILING
/// Profiling data kind.
typedef enum {
@@ -2371,6 +2353,10 @@ typedef enum {
dnnl_sparse_encoding_packed,
dnnl_sparse_encoding_csr,
dnnl_sparse_encoding_coo,
// Legacy aliases for backward compatibility.
dnnl_packed = dnnl_sparse_encoding_packed,
dnnl_csr = dnnl_sparse_encoding_csr,
dnnl_coo = dnnl_sparse_encoding_coo,
} dnnl_sparse_encoding_t;

/* typedef struct dnnl_sparse_desc *dnnl_sparse_desc_t; */
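Note (not part of the diff): a minimal sketch verifying that the legacy alias enumerators introduced above compile and compare equal to the new names. It assumes a oneDNN build in which `dnnl_sparse_encoding_t` is visible in the public header (depending on the build, `DNNL_EXPERIMENTAL_SPARSE` may still be required).

```cpp
// Sketch only; checks the legacy aliases against the new enumerators.
#include "oneapi/dnnl/dnnl_types.h"

static_assert(dnnl_packed == dnnl_sparse_encoding_packed,
        "legacy alias must equal the new enumerator");
static_assert(dnnl_csr == dnnl_sparse_encoding_csr,
        "legacy alias must equal the new enumerator");
static_assert(dnnl_coo == dnnl_sparse_encoding_coo,
        "legacy alias must equal the new enumerator");
```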
@@ -2925,13 +2911,12 @@ typedef enum {
dnnl_query_inner_nblks_s32, ///< number of innermost blocks
dnnl_query_inner_blks, ///< vector of sizes of the innermost blocks
dnnl_query_inner_idxs, ///< vector of logical indices of the blocks
#ifdef DNNL_EXPERIMENTAL_SPARSE
dnnl_query_sparse_encoding, ///< Sparse encoding
#ifdef DNNL_EXPERIMENTAL_SPARSE
dnnl_query_nnz_s64, ///< Number of non-zero entries
dnnl_query_num_handles_s32, ///< Number of buffers required for a memory
/// descriptor
#endif
dnnl_query_sparse_encoding, ///< Sparse encoding

// Max value to prevent UB for internal-use-only dnnl_query_t values
dnnl_query_max = 0x7fff,
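Note (not part of the diff): with `dnnl_query_sparse_encoding` moved outside the experimental guard, a hedged sketch of reading it back through the C API. `dnnl_memory_desc_query()` and `const_dnnl_memory_desc_t` are assumed to behave as in oneDNN v3.x, and `dnnl_sparse_encoding_undef` is assumed to remain the zero enumerator.

```cpp
// Sketch: query the sparse encoding of a memory descriptor.
#include "oneapi/dnnl/dnnl.h"

dnnl_sparse_encoding_t query_encoding(const_dnnl_memory_desc_t md) {
    dnnl_sparse_encoding_t enc = dnnl_sparse_encoding_undef;
    // Dense descriptors are assumed to report 'undef'; on error, fall
    // back to 'undef' as well.
    if (dnnl_memory_desc_query(md, dnnl_query_sparse_encoding, &enc)
            != dnnl_success)
        return dnnl_sparse_encoding_undef;
    return enc;
}
```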
52 changes: 0 additions & 52 deletions src/cpu/aarch64/acl_reorder.cpp

This file was deleted.

235 changes: 9 additions & 226 deletions src/cpu/aarch64/acl_reorder.hpp
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2023-2025 Arm Ltd. and affiliates
* Copyright 2025 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,239 +13,22 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef CPU_ACL_REORDER_HPP
#define CPU_ACL_REORDER_HPP

#include "arm_compute/core/Types.h"
#include "common/utils.hpp"
#include "cpu/acl/acl_utils.hpp"
#include "cpu/aarch64/cpu_isa_traits.hpp"
#include "cpu/reorder/cpu_reorder_pd.hpp"
#ifndef CPU_AARCH64_ACL_REORDER_HPP
#define CPU_AARCH64_ACL_REORDER_HPP

// Keep include path compatibility with code that expects this header.
#include "cpu/aarch64/reorder/acl_reorder.hpp"

// Re-export the implementation in the cpu::acl namespace expected by common headers.
namespace dnnl {
namespace impl {
namespace cpu {
namespace acl {

struct acl_reorder_obj_t {
arm_compute::NEReorderLayer reorder;
arm_compute::Tensor src_tensor;
arm_compute::Tensor dst_tensor;
arm_compute::WeightFormat src_wf;
arm_compute::WeightFormat dst_wf;
};

struct acl_reorder_conf_t {
arm_compute::TensorInfo src_info;
arm_compute::TensorInfo dst_info;
arm_compute::WeightFormat src_wf;
arm_compute::WeightFormat dst_wf;
};

struct acl_reorder_resource_t : public resource_t {
acl_reorder_resource_t()
: acl_obj_(utils::make_unique<acl_reorder_obj_t>()) {}

status_t configure(const acl_reorder_conf_t &app) {
if (!acl_obj_) return status::out_of_memory;

// Init Compute Library tensors based on info from descriptor
acl_obj_->src_tensor.allocator()->init(app.src_info);
acl_obj_->dst_tensor.allocator()->init(app.dst_info);

// clang-format off
acl_obj_->reorder.configure(
&acl_obj_->src_tensor,
&acl_obj_->dst_tensor,
app.src_wf,
app.dst_wf
);
// clang-format on

return status::success;
}

acl_reorder_obj_t &get_acl_obj() const { return *acl_obj_; }
DNNL_DISALLOW_COPY_AND_ASSIGN(acl_reorder_resource_t);

private:
std::unique_ptr<acl_reorder_obj_t> acl_obj_;
}; // acl_reorder_resource_t

struct acl_reorder_fwd_t : public primitive_t {
using primitive_t::primitive_t;
struct pd_t : public cpu_reorder_pd_t {

using cpu_reorder_pd_t::cpu_reorder_pd_t;

DECLARE_COMMON_PD_T("acl", acl_reorder_fwd_t);

static status_t create(reorder_pd_t **reorder_pd, engine_t *engine,
const primitive_attr_t *attr, engine_t *src_engine,
const memory_desc_t *src_md, engine_t *dst_engine,
const memory_desc_t *dst_md) {

using namespace acl_utils;

// ACL reorder supports f32->f32 and f32->bf16
bool ok = src_md->data_type == data_type::f32
&& utils::one_of(
dst_md->data_type, data_type::f32, data_type::bf16)
&& attr->has_default_values();

if (!ok) return status::unimplemented;

if (!attr->scales_.has_default_values(DNNL_ARG_DST)) {
int mask = attr->scales_.get_mask(DNNL_ARG_DST);
const memory_desc_wrapper input_d(src_md);
if (input_d.has_runtime_dims_or_strides() && mask > 0)
return status::unimplemented;
}

// Create and check primitive descriptor
auto _pd = make_unique_pd<pd_t>(attr, src_engine->kind(), src_md,
dst_engine->kind(), dst_md);
if (_pd == nullptr) return status::out_of_memory;
if (_pd->init(engine, src_engine, dst_engine) != status::success) {
return status::unimplemented;
}

// For 2D and 4D tensors, neither of the first two dimensions may be 1.
// This holds for both f32->f32 and f32->bf16.
if (dst_md->dims[0] == 1 || dst_md->dims[1] == 1) {
return status::unimplemented;
}

auto src_tag = memory_desc_matches_one_of_tag(
*src_md, format_tag::ab, format_tag::ba, format_tag::cdba);
ACL_CHECK_SUPPORT(format_tag::undef == src_tag,
"Only ab, ba or cdba source formats supported");

auto dst_tag = memory_desc_matches_one_of_tag(*dst_md,
format_tag::BA8b4a, format_tag::BA4b4a, format_tag::Ab4a,
format_tag::Ab8a, format_tag::Acdb8a, format_tag::Acdb4a);
ACL_CHECK_SUPPORT(format_tag::undef == dst_tag,
"Only Ab4a/Ab8a, BA8b4a/BA4b4a and Acdb8a/Acdb4a "
"destination formats supported");

if (dst_tag == format_tag::BA4b4a || dst_tag == format_tag::Acdb4a
|| dst_tag == format_tag::Ab4a) {
_pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo4;
} else if (aarch64::mayiuse(aarch64::sve_256)
&& (dst_tag == format_tag::BA8b4a
|| dst_tag == format_tag::Acdb8a
|| dst_tag == format_tag::Ab8a)) {
_pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo8;
} else {
return status::unimplemented;
}

arm_compute::TensorShape acl_tensor_shape_in;
arm_compute::TensorShape acl_tensor_shape_out;

// Switch for 2 or 4 dim tensors
switch (src_md->ndims) {
case 2: {
if (src_tag == format_tag::ab
&& dst_md->data_type == data_type::bf16
&& utils::one_of(dst_tag, format_tag::BA8b4a,
format_tag::BA4b4a)) { // bf16
acl_tensor_shape_in = arm_compute::TensorShape(
src_md->dims[0], src_md->dims[1]);
acl_tensor_shape_out = arm_compute::TensorShape(
dst_md->padded_dims[0], dst_md->padded_dims[1]);
} else if (src_tag == format_tag::ba
&& dst_md->data_type == data_type::f32
&& !utils::one_of(dst_tag, format_tag::BA8b4a,
format_tag::BA4b4a)) { // f32
acl_tensor_shape_in = arm_compute::TensorShape(
src_md->dims[1], src_md->dims[0]);
acl_tensor_shape_out = arm_compute::TensorShape(
dst_md->padded_dims[1], dst_md->padded_dims[0]);
} else {
return status::unimplemented;
}
} break;
case 4: {
// Currently only supporting AxBx1x1 cases
if (dst_md->dims[2] != 1 || dst_md->dims[3] != 1) {
return status::unimplemented;
}

acl_tensor_shape_in = arm_compute::TensorShape(
src_md->dims[3], src_md->dims[2], src_md->dims[1],
src_md->dims[0]);
acl_tensor_shape_out = arm_compute::TensorShape(
dst_md->padded_dims[3], dst_md->padded_dims[2],
dst_md->padded_dims[1], dst_md->padded_dims[0]);
break;
}
default: return status::unimplemented;
}

// Choose the data layout
const auto acl_layout = arm_compute::DataLayout::NCHW;

// Set Source WeightFormat
_pd->app_.src_wf = arm_compute::WeightFormat::OHWI;

// Create ACL tensor infos
const arm_compute::DataType src_acl_data_t
= acl_utils::get_acl_data_t(src_md->data_type);
_pd->app_.src_info = arm_compute::TensorInfo(
acl_tensor_shape_in, 1, src_acl_data_t, acl_layout);

const arm_compute::DataType dst_acl_data_t
= acl_utils::get_acl_data_t(dst_md->data_type);
_pd->app_.dst_info = arm_compute::TensorInfo(
acl_tensor_shape_out, 1, dst_acl_data_t, acl_layout);

ACL_CHECK_VALID(arm_compute::NEReorderLayer::validate(
&_pd->app_.src_info, &_pd->app_.dst_info, _pd->app_.src_wf,
_pd->app_.dst_wf));

// Init scratchpad memory; unused (size 0) in this implementation
_pd->init_scratchpad_md();

return safe_ptr_assign(*reorder_pd, _pd.release());
} // create

friend dnnl::impl::impl_list_item_t;
acl_reorder_conf_t app_;

}; // pd_t

acl_reorder_fwd_t(const pd_t *apd) : primitive_t(apd) {}

status_t create_resource(
engine_t *engine, resource_mapper_t &mapper) const override {
if (mapper.has_resource(this)) return status::success;

auto r = utils::make_unique<acl_reorder_resource_t>();
if (!r) return status::out_of_memory;

// Configure the resource based on information from primitive descriptor
CHECK(r->configure(pd()->app_));

mapper.add(this, std::move(r));
return status::success;
}

status_t execute(const exec_ctx_t &ctx) const override {
return execute_forward(ctx);
}

private:
// To guard the const execute_forward, the mutex must be 'mutable'
mutable std::mutex mtx;
status_t execute_forward(const exec_ctx_t &ctx) const;
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }

}; // acl_reorder_fwd_t

using aarch64::acl_reorder_fwd_t;
} // namespace acl
} // namespace cpu
} // namespace impl
} // namespace dnnl

#endif // CPU_ACL_REORDER_HPP
#endif // CPU_AARCH64_ACL_REORDER_HPP
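Note (not part of the diff): a hedged end-to-end sketch of a reorder that the implementation above targeted (2D, f32 `ab` source to bf16 `BA8b4a` destination, no dimension of 1 among the first two), driven through the public C++ API. Whether the ACL path is actually selected depends on an aarch64 build with Compute Library enabled; the public `format_tag::BA8b4a` spelling is assumed to mirror the internal tag referenced in `pd_t::create()`.

```cpp
// Sketch only: a reorder matching the constraints checked above.
#include "oneapi/dnnl/dnnl.hpp"

void reorder_sketch() {
    using namespace dnnl;
    engine eng(engine::kind::cpu, 0);
    stream strm(eng);

    memory::desc src_md({64, 128}, memory::data_type::f32,
            memory::format_tag::ab);
    memory::desc dst_md({64, 128}, memory::data_type::bf16,
            memory::format_tag::BA8b4a); // assumed public mirror of the tag

    memory src(src_md, eng), dst(dst_md, eng);
    // The convenience constructor builds the reorder primitive directly
    // from the two memory objects.
    reorder(src, dst).execute(strm, src, dst);
    strm.wait();
}
```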