diff --git a/detection_2d/detection_2d_rt_detr/CMakeLists.txt b/detection_2d/detection_2d_rt_detr/CMakeLists.txt
index 10cc562..142c8e9 100644
--- a/detection_2d/detection_2d_rt_detr/CMakeLists.txt
+++ b/detection_2d/detection_2d_rt_detr/CMakeLists.txt
@@ -5,12 +5,9 @@ add_compile_options(-std=c++17)
 add_compile_options(-O3 -Wextra -Wdeprecated -fPIC)
 set(CMAKE_CXX_STANDARD 17)
 
-
 find_package(OpenCV REQUIRED)
 find_package(glog REQUIRED)
 
-
-
 include_directories(
   include
   ${OpenCV_INCLUDE_DIRS}
@@ -35,3 +32,7 @@ target_include_directories(${PROJECT_NAME} PUBLIC ${PROJECT_SOURCE_DIR}/include)
 if (BUILD_TESTING)
   add_subdirectory(test)
 endif()
+
+if (BUILD_BENCHMARK)  # the benchmark subdirectory is configured only when this option is set
+  add_subdirectory(benchmark)
+endif()
diff --git a/detection_2d/detection_2d_rt_detr/benchmark/CMakeLists.txt b/detection_2d/detection_2d_rt_detr/benchmark/CMakeLists.txt
new file mode 100644
index 0000000..6d47fb3
--- /dev/null
+++ b/detection_2d/detection_2d_rt_detr/benchmark/CMakeLists.txt
@@ -0,0 +1,53 @@
+add_compile_options(-std=c++17)
+add_compile_options(-O3 -Wextra -Wdeprecated -fPIC)
+set(CMAKE_CXX_STANDARD 17)
+# Collect the inference-backend libraries that were enabled at configure time.
+if(ENABLE_TENSORRT)
+  list(APPEND platform_core_packages trt_core)
+endif()
+
+if(ENABLE_RKNN)
+  list(APPEND platform_core_packages rknn_core)
+endif()
+
+if(ENABLE_ORT)
+  list(APPEND platform_core_packages ort_core)
+endif()
+
+find_package(glog REQUIRED)
+find_package(OpenCV REQUIRED)
+find_package(benchmark REQUIRED)
+
+set(source_file
+  benchmark_detection_2d_rt_detr.cpp
+)
+# NOTE(review): `include` resolves relative to this directory; confirm benchmark/include is intended.
+include_directories(
+  include
+  ${OpenCV_INCLUDE_DIRS}
+)
+
+add_executable(benchmark_detection_2d_rt_detr ${source_file})
+# PRIVATE: nothing links against an executable, so its dependencies need not be propagated.
+target_link_libraries(benchmark_detection_2d_rt_detr PRIVATE
+  benchmark::benchmark
+  glog::glog
+  ${OpenCV_LIBS}
+  deploy_core
+  image_processing_utils
+  detection_2d_rt_detr
+  benchmark_utils
+  ${platform_core_packages}
+)
+# Mirror each enabled backend as a preprocessor definition for the #ifdef blocks in the source.
+if(ENABLE_TENSORRT)
+  target_compile_definitions(benchmark_detection_2d_rt_detr PRIVATE ENABLE_TENSORRT)
+endif()
+
+if(ENABLE_RKNN)
+  target_compile_definitions(benchmark_detection_2d_rt_detr PRIVATE ENABLE_RKNN)
+endif()
+
+if(ENABLE_ORT)
+  target_compile_definitions(benchmark_detection_2d_rt_detr PRIVATE ENABLE_ORT)
+endif()
diff --git a/detection_2d/detection_2d_rt_detr/benchmark/benchmark_detection_2d_rt_detr.cpp b/detection_2d/detection_2d_rt_detr/benchmark/benchmark_detection_2d_rt_detr.cpp
new file mode 100644
index 0000000..6a545d1
--- /dev/null
+++ b/detection_2d/detection_2d_rt_detr/benchmark/benchmark_detection_2d_rt_detr.cpp
@@ -0,0 +1,83 @@
+#include <benchmark/benchmark.h>  // benchmark::State, BENCHMARK, BENCHMARK_MAIN
+
+#include "detection_2d_util/detection_2d_util.h"
+#include "detection_2d_rt_detr/rt_detr.h"
+#include "benchmark_utils/detection_2d_benchmark_utils.hpp"
+
+using namespace inference_core;
+using namespace detection_2d;
+using namespace benchmark_utils;
+
+#ifdef ENABLE_TENSORRT
+
+#include "trt_core/trt_core.h"
+
+// Builds the RT-DETR detector on the TensorRT backend, using fixed model settings.
+std::shared_ptr<BaseDetectionModel> CreateRTDetrTensorRTModel()
+{
+  const std::string              engine_file  = "/workspace/models/rt_detr_v2_single_input.engine";
+  const std::vector<std::string> input_names  = {"images"};
+  const std::vector<std::string> output_names = {"labels", "boxes", "scores"};
+  const int                      height       = 640;
+  const int                      width        = 640;
+  const int                      channels     = 3;
+  const int                      num_classes  = 80;
+
+  // Inference core and pre-processing stage that the detector is assembled from.
+  auto trt_core      = CreateTrtInferCore(engine_file);
+  auto cuda_pre_proc = CreateCudaDetPreProcess();
+
+  return CreateRTDetrDetectionModel(trt_core, cuda_pre_proc, height, width, channels, num_classes,
+                                    input_names, output_names);
+}
+// Benchmark entry points: each call builds a fresh TensorRT RT-DETR model for the sync/async driver.
+static void benchmark_detection_2d_rt_detr_tensorrt_sync(benchmark::State &state)
+{
+  benchmark_detection_2d_sync(state, CreateRTDetrTensorRTModel());
+}
+static void benchmark_detection_2d_rt_detr_tensorrt_async(benchmark::State &state)
+{
+  benchmark_detection_2d_async(state, CreateRTDetrTensorRTModel());
+}
+BENCHMARK(benchmark_detection_2d_rt_detr_tensorrt_sync)->Arg(500)->UseRealTime();
+BENCHMARK(benchmark_detection_2d_rt_detr_tensorrt_async)->Arg(500)->UseRealTime();
+
+#endif
+
+#ifdef ENABLE_ORT
+
+#include "ort_core/ort_core.h"
+
+// Builds the RT-DETR detector on the ONNX Runtime backend, using fixed model settings.
+std::shared_ptr<BaseDetectionModel> CreateRTDetrOnnxRuntimeModel()
+{
+  const std::string              onnx_file    = "/workspace/models/rt_detr_v2_single_input.onnx";
+  const std::vector<std::string> input_names  = {"images"};
+  const std::vector<std::string> output_names = {"labels", "boxes", "scores"};
+  const int                      height       = 640;
+  const int                      width        = 640;
+  const int                      channels     = 3;
+  const int                      num_classes  = 80;
+
+  // Inference core and pre-processing stage that the detector is assembled from.
+  auto ort_core     = CreateOrtInferCore(onnx_file);
+  auto cpu_pre_proc = CreateCpuDetPreProcess({0, 0, 0}, {255, 255, 255}, true, true);
+
+  return CreateRTDetrDetectionModel(ort_core, cpu_pre_proc, height, width, channels, num_classes,
+                                    input_names, output_names);
+}
+// Benchmark entry points: each call builds a fresh ONNX Runtime RT-DETR model for the driver.
+static void benchmark_detection_2d_rt_detr_onnxruntime_sync(benchmark::State &state)
+{
+  benchmark_detection_2d_sync(state, CreateRTDetrOnnxRuntimeModel());
+}
+static void benchmark_detection_2d_rt_detr_onnxruntime_async(benchmark::State &state)
+{
+  benchmark_detection_2d_async(state, CreateRTDetrOnnxRuntimeModel());
+}
+BENCHMARK(benchmark_detection_2d_rt_detr_onnxruntime_sync)->Arg(100)->UseRealTime();
+BENCHMARK(benchmark_detection_2d_rt_detr_onnxruntime_async)->Arg(100)->UseRealTime();
+
+#endif
+// Expands to a main() that runs the benchmarks registered in this file.
+BENCHMARK_MAIN();
diff --git a/detection_2d/detection_2d_yolov8/CMakeLists.txt b/detection_2d/detection_2d_yolov8/CMakeLists.txt
index 512ec51..8d58d24 100644
--- a/detection_2d/detection_2d_yolov8/CMakeLists.txt
+++ b/detection_2d/detection_2d_yolov8/CMakeLists.txt
@@ -32,3 +32,7 @@ target_include_directories(${PROJECT_NAME} PUBLIC ${PROJECT_SOURCE_DIR}/include)
 if (BUILD_TESTING)
   add_subdirectory(test)
 endif()
+
+if (BUILD_BENCHMARK)  # the benchmark subdirectory is configured only when this option is set
+  add_subdirectory(benchmark)
+endif()
diff --git a/detection_2d/detection_2d_yolov8/benchmark/CMakeLists.txt b/detection_2d/detection_2d_yolov8/benchmark/CMakeLists.txt
new file mode 100644
index 0000000..d1df83d
--- /dev/null
+++ b/detection_2d/detection_2d_yolov8/benchmark/CMakeLists.txt
@@ -0,0 +1,53 @@
+add_compile_options(-std=c++17)
+add_compile_options(-O3 -Wextra -Wdeprecated -fPIC)
+set(CMAKE_CXX_STANDARD 17)
+# Collect the inference-backend libraries that were enabled at configure time.
+if(ENABLE_TENSORRT)
+  list(APPEND platform_core_packages trt_core)
+endif()
+
+if(ENABLE_RKNN)
+  list(APPEND platform_core_packages rknn_core)
+endif()
+
+if(ENABLE_ORT)
+  list(APPEND platform_core_packages ort_core)
+endif()
+
+find_package(glog REQUIRED)
+find_package(OpenCV REQUIRED)
+find_package(benchmark REQUIRED)
+
+set(source_file
+  benchmark_detection_2d_yolov8.cpp
+)
+# NOTE(review): `include` resolves relative to this directory; confirm benchmark/include is intended.
+include_directories(
+  include
+  ${OpenCV_INCLUDE_DIRS}
+)
+
+add_executable(benchmark_detection_2d_yolov8 ${source_file})
+# PRIVATE: nothing links against an executable, so its dependencies need not be propagated.
+target_link_libraries(benchmark_detection_2d_yolov8 PRIVATE
+  benchmark::benchmark
+  glog::glog
+  ${OpenCV_LIBS}
+  deploy_core
+  image_processing_utils
+  detection_2d_yolov8
+  benchmark_utils
+  ${platform_core_packages}
+)
+# Mirror each enabled backend as a preprocessor definition for the #ifdef blocks in the source.
+if(ENABLE_TENSORRT)
+  target_compile_definitions(benchmark_detection_2d_yolov8 PRIVATE ENABLE_TENSORRT)
+endif()
+
+if(ENABLE_RKNN)
+  target_compile_definitions(benchmark_detection_2d_yolov8 PRIVATE ENABLE_RKNN)
+endif()
+
+if(ENABLE_ORT)
+  target_compile_definitions(benchmark_detection_2d_yolov8 PRIVATE ENABLE_ORT)
+endif()
diff --git a/detection_2d/detection_2d_yolov8/benchmark/benchmark_detection_2d_yolov8.cpp b/detection_2d/detection_2d_yolov8/benchmark/benchmark_detection_2d_yolov8.cpp
new file mode 100644
index 0000000..edca386
--- /dev/null
+++ b/detection_2d/detection_2d_yolov8/benchmark/benchmark_detection_2d_yolov8.cpp
@@ -0,0 +1,124 @@
+#include <benchmark/benchmark.h>  // benchmark::State, BENCHMARK, BENCHMARK_MAIN
+
+#include "detection_2d_util/detection_2d_util.h"
+#include "detection_2d_yolov8/yolov8.h"
+#include "benchmark_utils/detection_2d_benchmark_utils.hpp"
+
+using namespace inference_core;
+using namespace detection_2d;
+using namespace benchmark_utils;
+
+#ifdef ENABLE_TENSORRT
+
+#include "trt_core/trt_core.h"
+
+// Builds the YOLOv8 detector on the TensorRT backend, using fixed model settings.
+std::shared_ptr<BaseDetectionModel> CreateYolov8TensorRTModel()
+{
+  const std::string              engine_file  = "/workspace/models/yolov8n.engine";
+  const std::vector<std::string> input_names  = {"images"};
+  const std::vector<std::string> output_names = {"output0"};
+  const int                      height       = 640;
+  const int                      width        = 640;
+  const int                      channels     = 3;
+  const int                      num_classes  = 80;
+
+  // Inference core plus the pre- and post-processing stages the detector is assembled from.
+  auto trt_core       = CreateTrtInferCore(engine_file);
+  auto cuda_pre_proc  = CreateCudaDetPreProcess();
+  auto cpu_post_proc  = CreateYolov8PostProcessCpuOrigin(height, width, num_classes);
+
+  return CreateYolov8DetectionModel(trt_core, cuda_pre_proc, cpu_post_proc, height, width,
+                                    channels, num_classes, input_names, output_names);
+}
+// Benchmark entry points: each call builds a fresh TensorRT YOLOv8 model for the sync/async driver.
+static void benchmark_detection_2d_yolov8_tensorrt_sync(benchmark::State &state)
+{
+  benchmark_detection_2d_sync(state, CreateYolov8TensorRTModel());
+}
+static void benchmark_detection_2d_yolov8_tensorrt_async(benchmark::State &state)
+{
+  benchmark_detection_2d_async(state, CreateYolov8TensorRTModel());
+}
+BENCHMARK(benchmark_detection_2d_yolov8_tensorrt_sync)->Arg(1000)->UseRealTime();
+BENCHMARK(benchmark_detection_2d_yolov8_tensorrt_async)->Arg(1000)->UseRealTime();
+
+#endif
+
+#ifdef ENABLE_ORT
+
+#include "ort_core/ort_core.h"
+
+// Builds the YOLOv8 detector on the ONNX Runtime backend, using fixed model settings.
+std::shared_ptr<BaseDetectionModel> CreateYolov8OnnxRuntimeModel()
+{
+  const std::string              onnx_file    = "/workspace/models/yolov8n.onnx";
+  const std::vector<std::string> input_names  = {"images"};
+  const std::vector<std::string> output_names = {"output0"};
+  const int                      height       = 640;
+  const int                      width        = 640;
+  const int                      channels     = 3;
+  const int                      num_classes  = 80;
+
+  // Inference core plus the pre- and post-processing stages the detector is assembled from.
+  auto ort_core      = CreateOrtInferCore(onnx_file);
+  auto cpu_pre_proc  = CreateCpuDetPreProcess({0, 0, 0}, {255, 255, 255}, true, true);
+  auto cpu_post_proc = CreateYolov8PostProcessCpuOrigin(height, width, num_classes);
+
+  return CreateYolov8DetectionModel(ort_core, cpu_pre_proc, cpu_post_proc, height, width,
+                                    channels, num_classes, input_names, output_names);
+}
+// Benchmark entry points: each call builds a fresh ONNX Runtime YOLOv8 model for the driver.
+static void benchmark_detection_2d_yolov8_onnxruntime_sync(benchmark::State &state)
+{
+  benchmark_detection_2d_sync(state, CreateYolov8OnnxRuntimeModel());
+}
+static void benchmark_detection_2d_yolov8_onnxruntime_async(benchmark::State &state)
+{
+  benchmark_detection_2d_async(state, CreateYolov8OnnxRuntimeModel());
+}
+BENCHMARK(benchmark_detection_2d_yolov8_onnxruntime_sync)->Arg(200)->UseRealTime();
+BENCHMARK(benchmark_detection_2d_yolov8_onnxruntime_async)->Arg(200)->UseRealTime();
+
+#endif
+
+#ifdef ENABLE_RKNN
+
+#include "rknn_core/rknn_core.h"
+
+// Builds the YOLOv8 detector on the RKNN backend, using fixed model settings.
+std::shared_ptr<BaseDetectionModel> CreateYolov8RknnModel()
+{
+  const std::string              rknn_file    = "/workspace/models/yolov8n_divide_opset11.rknn";
+  const std::vector<std::string> input_names  = {"images"};
+  const std::vector<std::string> output_names = {"318", "onnx::ReduceSum_326", "331",
+                                                 "338", "onnx::ReduceSum_346", "350",
+                                                 "357", "onnx::ReduceSum_365", "369"};
+  const int                      height       = 640;
+  const int                      width        = 640;
+  const int                      channels     = 3;
+  const int                      num_classes  = 80;
+
+  // Inference core plus the pre- and post-processing stages the detector is assembled from.
+  auto rknn_core     = CreateRknnInferCore(rknn_file, {{"images", RknnInputTensorType::RK_UINT8}});
+  auto cpu_pre_proc  = CreateCpuDetPreProcess({0, 0, 0}, {1, 1, 1}, false, false);
+  auto cpu_post_proc = CreateYolov8PostProcessCpuDivide(height, width, num_classes);
+
+  return CreateYolov8DetectionModel(rknn_core, cpu_pre_proc, cpu_post_proc, height, width,
+                                    channels, num_classes, input_names, output_names);
+}
+// Benchmark entry points: each call builds a fresh RKNN YOLOv8 model for the sync/async driver.
+static void benchmark_detection_2d_yolov8_rknn_sync(benchmark::State &state)
+{
+  benchmark_detection_2d_sync(state, CreateYolov8RknnModel());
+}
+static void benchmark_detection_2d_yolov8_rknn_async(benchmark::State &state)
+{
+  benchmark_detection_2d_async(state, CreateYolov8RknnModel());
+}
+BENCHMARK(benchmark_detection_2d_yolov8_rknn_sync)->Arg(500)->UseRealTime();
+BENCHMARK(benchmark_detection_2d_yolov8_rknn_async)->Arg(500)->UseRealTime();
+
+#endif
+// Expands to a main() that runs the benchmarks registered in this file.
+BENCHMARK_MAIN();
diff --git a/detection_2d/detection_2d_yolov8/test/test_detection_2d_yolov8.cpp b/detection_2d/detection_2d_yolov8/test/test_detection_2d_yolov8.cpp
index 66e9369..af37d6e 100644
--- a/detection_2d/detection_2d_yolov8/test/test_detection_2d_yolov8.cpp
+++ b/detection_2d/detection_2d_yolov8/test/test_detection_2d_yolov8.cpp
@@ -116,8 +116,8 @@ class Yolov8_Rknn_Fixture : public BaseYolov8Fixture {
     const int                      cls_number     = 80;
     const std::vector<std::string> input_blobs_name  = {"images"};
     const std::vector<std::string> output_blobs_name = {"318", "onnx::ReduceSum_326", "331",
-                                                      "338", "onnx::ReduceSum_346", "350",
-                                                      "357", "onnx::ReduceSum_365", "369"};
+                                                        "338", "onnx::ReduceSum_346", "350",
+                                                        "357", "onnx::ReduceSum_365", "369"};
 
     auto infer_core  = CreateRknnInferCore(model_path, {{"images", RknnInputTensorType::RK_UINT8}});
     auto preprocess  = CreateCpuDetPreProcess({0, 0, 0}, {1, 1, 1}, false, false);
diff --git a/easy_deploy_tool b/easy_deploy_tool
index 4a01290..6c254c6 160000
--- a/easy_deploy_tool
+++ b/easy_deploy_tool
@@ -1 +1 @@
-Subproject commit 4a012904f39be0c35f0da9921e5c761f3ef1e2bb
+Subproject commit 6c254c6d53e429513d46924f96fb1e543364497f
diff --git a/sam/sam_mobilesam/CMakeLists.txt b/sam/sam_mobilesam/CMakeLists.txt
index 66fdf30..ac8dfe4 100644
--- a/sam/sam_mobilesam/CMakeLists.txt
+++ b/sam/sam_mobilesam/CMakeLists.txt
@@ -33,3 +33,7 @@ target_include_directories(${PROJECT_NAME} PUBLIC ${PROJECT_SOURCE_DIR}/include)
 if (BUILD_TESTING)
   add_subdirectory(test)
 endif()
+
+if (BUILD_BENCHMARK)  # the benchmark subdirectory is configured only when this option is set
+  add_subdirectory(benchmark)
+endif()
diff --git a/sam/sam_mobilesam/benchmark/CMakeLists.txt b/sam/sam_mobilesam/benchmark/CMakeLists.txt
new file mode 100644
index 0000000..7baea5a
--- /dev/null
+++ b/sam/sam_mobilesam/benchmark/CMakeLists.txt
@@ -0,0 +1,53 @@
+add_compile_options(-std=c++17)
+add_compile_options(-O3 -Wextra -Wdeprecated -fPIC)
+set(CMAKE_CXX_STANDARD 17)
+# Collect the inference-backend libraries that were enabled at configure time.
+if(ENABLE_TENSORRT)
+  list(APPEND platform_core_packages trt_core)
+endif()
+
+if(ENABLE_RKNN)
+  list(APPEND platform_core_packages rknn_core)
+endif()
+
+if(ENABLE_ORT)
+  list(APPEND platform_core_packages ort_core)
+endif()
+
+find_package(glog REQUIRED)
+find_package(OpenCV REQUIRED)
+find_package(benchmark REQUIRED)
+
+set(source_file
+  benchmark_sam_mobilesam.cpp
+)
+# NOTE(review): `include` resolves relative to this directory; confirm benchmark/include is intended.
+include_directories(
+  include
+  ${OpenCV_INCLUDE_DIRS}
+)
+
+add_executable(benchmark_sam_mobilesam ${source_file})
+# PRIVATE: nothing links against an executable, so its dependencies need not be propagated.
+target_link_libraries(benchmark_sam_mobilesam PRIVATE
+  benchmark::benchmark
+  glog::glog
+  ${OpenCV_LIBS}
+  deploy_core
+  image_processing_utils
+  sam_mobilesam
+  benchmark_utils
+  ${platform_core_packages}
+)
+# Mirror each enabled backend as a preprocessor definition for the #ifdef blocks in the source.
+if(ENABLE_TENSORRT)
+  target_compile_definitions(benchmark_sam_mobilesam PRIVATE ENABLE_TENSORRT)
+endif()
+
+if(ENABLE_RKNN)
+  target_compile_definitions(benchmark_sam_mobilesam PRIVATE ENABLE_RKNN)
+endif()
+
+if(ENABLE_ORT)
+  target_compile_definitions(benchmark_sam_mobilesam PRIVATE ENABLE_ORT)
+endif()
diff --git a/sam/sam_mobilesam/benchmark/benchmark_sam_mobilesam.cpp b/sam/sam_mobilesam/benchmark/benchmark_sam_mobilesam.cpp
new file mode 100644
index 0000000..fbfa1a5
--- /dev/null
+++ b/sam/sam_mobilesam/benchmark/benchmark_sam_mobilesam.cpp
@@ -0,0 +1,194 @@
+#include <benchmark/benchmark.h>  // benchmark::State, BENCHMARK, BENCHMARK_MAIN
+
+#include "detection_2d_util/detection_2d_util.h"
+#include "sam_mobilesam/mobilesam.h"
+#include "benchmark_utils/sam_benchmark_utils.hpp"
+
+using namespace inference_core;
+using namespace detection_2d;
+using namespace sam;
+using namespace benchmark_utils;
+
+#ifdef ENABLE_TENSORRT
+
+#include "trt_core/trt_core.h"
+
+// Builds a SAM model on TensorRT from the given image-encoder engine and fixed decoder engines.
+std::shared_ptr<BaseSamModel> CreateSAMTensorRTModel(const std::string &image_encoder_model_path)
+{
+  const int   kMaxBoxes    = 1;
+  const int   kMaxPoints   = 8;
+  const char *box_engine   = "/workspace/models/modified_mobile_sam_box.engine";
+  const char *point_engine = "/workspace/models/modified_mobile_sam_point.engine";
+
+  auto encoder_core = CreateTrtInferCore(image_encoder_model_path);
+  // Box decoder core factory; the braced maps presumably pair tensor names with their shapes.
+  auto box_factory = CreateTrtInferCoreFactory(
+      box_engine,
+      {
+          {"image_embeddings", {1, 256, 64, 64}},
+          {"boxes", {1, kMaxBoxes, 4}},
+          {"mask_input", {1, 1, 256, 256}},
+          {"has_mask_input", {1}},
+      },
+      {{"masks", {1, 1, 256, 256}}, {"scores", {1, 1}}});
+
+  // Point decoder core factory, configured the same way.
+  auto point_factory = CreateTrtInferCoreFactory(
+      point_engine,
+      {
+          {"image_embeddings", {1, 256, 64, 64}},
+          {"point_coords", {1, kMaxPoints, 2}},
+          {"point_labels", {1, kMaxPoints}},
+          {"mask_input", {1, 1, 256, 256}},
+          {"has_mask_input", {1}},
+      },
+      {{"masks", {1, 1, 256, 256}}, {"scores", {1, 1}}});
+
+  auto pre_factory = CreateCudaDetPreProcessFactory();
+  return CreateMobileSamModel(encoder_core, point_factory->Create(), box_factory->Create(),
+                              pre_factory->Create());
+}
+
+// MobileSAM on TensorRT: each entry point builds a fresh model around the MobileSAM encoder engine.
+static void benchmark_sam_mobilesam_tensorrt_sync(benchmark::State &state)
+{
+  auto mobilesam_image_encoder_model_path = "/workspace/models/mobile_sam_encoder.engine";
+  benchmark_sam_sync(state, CreateSAMTensorRTModel(mobilesam_image_encoder_model_path));
+}
+static void benchmark_sam_mobilesam_tensorrt_async(benchmark::State &state)
+{
+  auto mobilesam_image_encoder_model_path = "/workspace/models/mobile_sam_encoder.engine";
+  benchmark_sam_async(state, CreateSAMTensorRTModel(mobilesam_image_encoder_model_path));
+}
+BENCHMARK(benchmark_sam_mobilesam_tensorrt_sync)->Arg(100)->UseRealTime();
+BENCHMARK(benchmark_sam_mobilesam_tensorrt_async)->Arg(100)->UseRealTime();
+
+// NanoSAM on TensorRT: same factory as above, but with the NanoSAM image-encoder engine.
+static void benchmark_sam_nanosam_tensorrt_sync(benchmark::State &state)
+{
+  auto nanosam_image_encoder_model_path = "/workspace/models/nanosam_image_encoder_opset11.engine";
+  benchmark_sam_sync(state, CreateSAMTensorRTModel(nanosam_image_encoder_model_path));
+}
+static void benchmark_sam_nanosam_tensorrt_async(benchmark::State &state)
+{
+  auto nanosam_image_encoder_model_path = "/workspace/models/nanosam_image_encoder_opset11.engine";
+  benchmark_sam_async(state, CreateSAMTensorRTModel(nanosam_image_encoder_model_path));
+}
+BENCHMARK(benchmark_sam_nanosam_tensorrt_sync)->Arg(200)->UseRealTime();
+BENCHMARK(benchmark_sam_nanosam_tensorrt_async)->Arg(200)->UseRealTime();
+
+#endif
+
+#ifdef ENABLE_ORT
+
+#include "ort_core/ort_core.h"
+
+// Builds a SAM model on ONNX Runtime from the given image-encoder model and fixed decoder models.
+std::shared_ptr<BaseSamModel> CreateSAMOnnxRuntimeModel(const std::string &image_encoder_model_path)
+{
+  const int   kMaxBoxes   = 1;
+  const int   kMaxPoints  = 8;
+  const char *box_model   = "/workspace/models/modified_mobile_sam_box.onnx";
+  const char *point_model = "/workspace/models/modified_mobile_sam_point.onnx";
+
+  auto encoder_core = CreateOrtInferCore(image_encoder_model_path);
+
+  // Box decoder core factory; the braced maps presumably pair tensor names with their shapes.
+  auto box_factory = CreateOrtInferCoreFactory(
+      box_model,
+      {
+          {"image_embeddings", {1, 256, 64, 64}},
+          {"boxes", {1, kMaxBoxes, 4}},
+          {"mask_input", {1, 1, 256, 256}},
+          {"has_mask_input", {1}},
+      },
+      {{"masks", {1, 1, 256, 256}}, {"scores", {1, 1}}});
+
+  // Point decoder core factory, configured the same way.
+  auto point_factory = CreateOrtInferCoreFactory(
+      point_model,
+      {
+          {"image_embeddings", {1, 256, 64, 64}},
+          {"point_coords", {1, kMaxPoints, 2}},
+          {"point_labels", {1, kMaxPoints}},
+          {"mask_input", {1, 1, 256, 256}},
+          {"has_mask_input", {1}},
+      },
+      {{"masks", {1, 1, 256, 256}}, {"scores", {1, 1}}});
+
+  auto pre_factory = CreateCpuDetPreProcessFactory({0, 0, 0}, {255, 255, 255}, true, true);
+  return CreateMobileSamModel(encoder_core, point_factory->Create(), box_factory->Create(),
+                              pre_factory->Create());
+}
+
+// MobileSAM on ONNX Runtime: each entry point builds a fresh model around the MobileSAM encoder.
+static void benchmark_sam_mobilesam_onnxruntime_sync(benchmark::State &state)
+{
+  auto mobilesam_image_encoder_model_path = "/workspace/models/mobile_sam_encoder.onnx";
+  benchmark_sam_sync(state, CreateSAMOnnxRuntimeModel(mobilesam_image_encoder_model_path));
+}
+static void benchmark_sam_mobilesam_onnxruntime_async(benchmark::State &state)
+{
+  auto mobilesam_image_encoder_model_path = "/workspace/models/mobile_sam_encoder.onnx";
+  benchmark_sam_async(state, CreateSAMOnnxRuntimeModel(mobilesam_image_encoder_model_path));
+}
+BENCHMARK(benchmark_sam_mobilesam_onnxruntime_sync)->Arg(20)->UseRealTime();
+BENCHMARK(benchmark_sam_mobilesam_onnxruntime_async)->Arg(20)->UseRealTime();
+
+// NanoSAM on ONNX Runtime: same factory as above, but with the NanoSAM image-encoder model.
+static void benchmark_sam_nanosam_onnxruntime_sync(benchmark::State &state)
+{
+  auto nanosam_image_encoder_model_path = "/workspace/models/nanosam_image_encoder_opset11.onnx";
+  benchmark_sam_sync(state, CreateSAMOnnxRuntimeModel(nanosam_image_encoder_model_path));
+}
+static void benchmark_sam_nanosam_onnxruntime_async(benchmark::State &state)
+{
+  auto nanosam_image_encoder_model_path = "/workspace/models/nanosam_image_encoder_opset11.onnx";
+  benchmark_sam_async(state, CreateSAMOnnxRuntimeModel(nanosam_image_encoder_model_path));
+}
+BENCHMARK(benchmark_sam_nanosam_onnxruntime_sync)->Arg(50)->UseRealTime();
+BENCHMARK(benchmark_sam_nanosam_onnxruntime_async)->Arg(50)->UseRealTime();
+
+#endif
+
+#ifdef ENABLE_RKNN
+
+#include "rknn_core/rknn_core.h"
+
+// Builds a SAM model on RKNN from the given image-encoder model and fixed decoder models.
+std::shared_ptr<BaseSamModel> CreateSAMRknnModel(const std::string &image_encoder_model_path)
+{
+  const char *box_model   = "/workspace/models/modified_mobile_sam_box.rknn";
+  const char *point_model = "/workspace/models/modified_mobile_sam_point.rknn";
+
+  // NOTE(review): the trailing `5, 2` arguments are not explained in this file — confirm meaning.
+  auto encoder_core = CreateRknnInferCore(
+      image_encoder_model_path, {{"images", RknnInputTensorType::RK_UINT8}}, 5, 2);
+
+  // Decoder core factories and the CPU image pre-processing factory.
+  auto box_factory   = CreateRknnInferCoreFactory(box_model, {}, 5, 2);
+  auto point_factory = CreateRknnInferCoreFactory(point_model, {}, 5, 2);
+  auto pre_factory   = CreateCpuDetPreProcessFactory({0, 0, 0}, {255, 255, 255}, false, false);
+
+  return CreateMobileSamModel(encoder_core, point_factory->Create(), box_factory->Create(),
+                              pre_factory->Create());
+}
+
+// NanoSAM on RKNN: each entry point builds a fresh model around the NanoSAM image-encoder model.
+static void benchmark_sam_nanosam_rknn_sync(benchmark::State &state)
+{
+  auto nanosam_image_encoder_model_path = "/workspace/models/nanosam_image_encoder_opset11.rknn";
+  benchmark_sam_sync(state, CreateSAMRknnModel(nanosam_image_encoder_model_path));
+}
+static void benchmark_sam_nanosam_rknn_async(benchmark::State &state)
+{
+  auto nanosam_image_encoder_model_path = "/workspace/models/nanosam_image_encoder_opset11.rknn";
+  benchmark_sam_async(state, CreateSAMRknnModel(nanosam_image_encoder_model_path));
+}
+BENCHMARK(benchmark_sam_nanosam_rknn_sync)->Arg(50)->UseRealTime();
+BENCHMARK(benchmark_sam_nanosam_rknn_async)->Arg(100)->UseRealTime();  // NOTE(review): sync uses Arg(50); every other sync/async pair matches — confirm
+
+#endif
+// Expands to a main() that runs the benchmarks registered in this file.
+BENCHMARK_MAIN();