From e7b36ae1083ee5a1a5b67344c1403284b4106489 Mon Sep 17 00:00:00 2001 From: Alexandros Theodoridis Date: Thu, 19 Feb 2026 08:25:42 +0000 Subject: [PATCH 1/3] Introduce tsan pipeline --- .github/workflows/ci-ut.yml | 1 + jax_rocm_plugin/.bazelrc | 32 ++++++++++++------- .../build/rocm/sanitizer_wrapper.sh | 17 +++++++--- 3 files changed, 34 insertions(+), 16 deletions(-) diff --git a/.github/workflows/ci-ut.yml b/.github/workflows/ci-ut.yml index d22b45740..cb1dfc5e6 100644 --- a/.github/workflows/ci-ut.yml +++ b/.github/workflows/ci-ut.yml @@ -31,6 +31,7 @@ jobs: - {name: "py3.13", python_version: "3.13", config: ""} - {name: "py3.14", python_version: "3.14", config: ""} - {name: "asan", python_version: "3.11", config: "--config=asan"} + - {name: "tsan", python_version: "3.11", config: "--config=tsan"} container: # note this image shall match the one defined in platform/linux:tf_linux_gpu image: rocm/tensorflow-build@sha256:7fcfbd36b7ac8f6b0805b37c4248e929e31cf5ee3af766c8409dd70d5ab65faa diff --git a/jax_rocm_plugin/.bazelrc b/jax_rocm_plugin/.bazelrc index d3bbea334..e2a7777d1 100644 --- a/jax_rocm_plugin/.bazelrc +++ b/jax_rocm_plugin/.bazelrc @@ -101,24 +101,34 @@ build:rocm --copt=-Wno-gnu-offsetof-extensions build:rocm --copt=-Qunused-arguments build:rocm --action_env=TF_HIPCC_CLANG="1" -build:asan --strip=never +############################################################################# +# Sanitizer configs +############################################################################# +build:sanitizer --linkopt="-L/usr/lib/llvm-18/lib/clang/18/lib/linux" +build:sanitizer --linkopt="-Wl,-rpath,/usr/lib/llvm-18/lib/clang/18/lib/linux" +build:sanitizer --run_under=//build/rocm:sanitizer_wrapper +build:sanitizer --action_env TF_ROCM_AMDGPU_TARGETS="gfx908,gfx90a,gfx942" +build:sanitizer --strip=never +# -O1 because we need readable sanitizer (asan/tsan) call stacks +build:sanitizer --copt -O1 +build:sanitizer --copt -g +build:sanitizer --copt -fno-omit-frame-pointer 
+build:sanitizer --linkopt -g + +build:asan --config=sanitizer build:asan --copt -fsanitize=address build:asan --copt -DADDRESS_SANITIZER -#O1 because we need to see the asans callstack -build:asan --copt -O1 -build:asan --copt -g build:asan --copt -gsplit-dwarf -build:asan --copt -fno-omit-frame-pointer build:asan --linkopt -fsanitize=address -build:asan --linkopt -g1 - -build:asan --linkopt="-L/usr/lib/llvm-18/lib/clang/18/lib/linux" -build:asan --linkopt="-Wl,-rpath,/usr/lib/llvm-18/lib/clang/18/lib/linux" build:asan --linkopt="-lclang_rt.asan-x86_64" build:asan --linkopt="-lclang_rt.asan_cxx-x86_64" build:asan --//build/rocm:sanitizer=asan -build:asan --run_under=//build/rocm:sanitizer_wrapper -build:asan --action_env TF_ROCM_AMDGPU_TARGETS="gfx908,gfx90a,gfx942" + +build:tsan --config=sanitizer +build:tsan --copt -fsanitize=thread +build:tsan --linkopt -fsanitize=thread +build:tsan --linkopt="-l:libclang_rt.tsan-x86_64.so" +build:tsan --//build/rocm:sanitizer=tsan ############################################################################# # Configuration for running RBE builds and tests diff --git a/jax_rocm_plugin/build/rocm/sanitizer_wrapper.sh b/jax_rocm_plugin/build/rocm/sanitizer_wrapper.sh index d001e36aa..646f5657a 100755 --- a/jax_rocm_plugin/build/rocm/sanitizer_wrapper.sh +++ b/jax_rocm_plugin/build/rocm/sanitizer_wrapper.sh @@ -1,25 +1,32 @@ #!/usr/bin/env bash ASAN_RT="/usr/lib/llvm-18/lib/clang/18/lib/linux/libclang_rt.asan-x86_64.so" - -# Only set LD_PRELOAD for the test binary, not for Bazel's wrappers -export LD_PRELOAD="${ASAN_RT}" +TSAN_RT="/usr/lib/llvm-18/lib/clang/18/lib/linux/libclang_rt.tsan-x86_64.so" # Resolve suppression files from runfiles ASAN_SUPP="$TEST_SRCDIR/jax_rocm_plugin/build/rocm/asan_ignore_list.txt" LSAN_SUPP="$TEST_SRCDIR/jax_rocm_plugin/build/rocm/lsan_ignore_list.txt" +TSAN_SUPP="$TEST_SRCDIR/jax_rocm_plugin/build/rocm/tsan_ignore_list.txt" ASAN_OPTS="use_sigaltstack=0:detect_leaks=0" LSAN_OPTS="use_sigaltstack=0" 
+TSAN_OPTS="history_size=7:ignore_noninstrumented_modules=1" if [[ -f "$ASAN_SUPP" ]]; then - ASAN_OPTS="suppressions=${ASAN_SUPP}:${ASAN_OPTS}" + export LD_PRELOAD="${ASAN_RT}" + ASAN_OPTS="suppressions=${ASAN_SUPP}:${ASAN_OPTS}" fi if [[ -f "$LSAN_SUPP" ]]; then - LSAN_OPTS="suppressions=${LSAN_SUPP}:${LSAN_OPTS}" + export LD_PRELOAD="${ASAN_RT}" + LSAN_OPTS="suppressions=${LSAN_SUPP}:${LSAN_OPTS}" +fi +if [[ -f "$TSAN_SUPP" ]]; then + export LD_PRELOAD="${TSAN_RT}" + TSAN_OPTS="suppressions=${TSAN_SUPP}:${TSAN_OPTS}" fi export ASAN_OPTIONS="${ASAN_OPTS}" export LSAN_OPTIONS="${LSAN_OPTS}" +export TSAN_OPTIONS="${TSAN_OPTS}" exec "$@" From 78e81eb03c0e7a4156cf56ef22d58e1b87af2c86 Mon Sep 17 00:00:00 2001 From: Alexandros Theodoridis Date: Thu, 19 Feb 2026 11:01:00 +0000 Subject: [PATCH 2/3] Ignore command buffer race --- jax_rocm_plugin/build/rocm/tsan_ignore_list.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/jax_rocm_plugin/build/rocm/tsan_ignore_list.txt b/jax_rocm_plugin/build/rocm/tsan_ignore_list.txt index 9f88753e2..8f727173d 100644 --- a/jax_rocm_plugin/build/rocm/tsan_ignore_list.txt +++ b/jax_rocm_plugin/build/rocm/tsan_ignore_list.txt @@ -28,7 +28,9 @@ race:xla::gpu::GpuCompiler::CompileSingleModule race:xla::LiteralBase::Piece::Storage::Storage race:xla::LocalClient::TransferFromOutfeedLocal race:llvm::cl::opt_storage::setValue +race:stream_executor::gpu::RocmCommandBuffer::LaunchGraph +# ignore race:xla::gpu::(anonymous namespace)::RecoverExp2Pattern::initStaticsIfNeeded* race:lld::lldMain race:llvm::* From 5ce1187e4c9b75d24967e6697a6c4d766af70962 Mon Sep 17 00:00:00 2001 From: Alexandros Theodoridis Date: Thu, 19 Feb 2026 15:27:03 +0000 Subject: [PATCH 3/3] Make tsan tests run locally --- .github/workflows/ci-ut.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci-ut.yml b/.github/workflows/ci-ut.yml index cb1dfc5e6..18a202793 100644 --- a/.github/workflows/ci-ut.yml +++ 
b/.github/workflows/ci-ut.yml @@ -31,7 +31,8 @@ jobs: - {name: "py3.13", python_version: "3.13", config: ""} - {name: "py3.14", python_version: "3.14", config: ""} - {name: "asan", python_version: "3.11", config: "--config=asan"} - - {name: "tsan", python_version: "3.11", config: "--config=tsan"} + # yamllint disable-line rule:line-length + - {name: "tsan", python_version: "3.11", config: "--config=tsan --strategy=TestRunner=local"} container: # note this image shall match the one defined in platform/linux:tf_linux_gpu image: rocm/tensorflow-build@sha256:7fcfbd36b7ac8f6b0805b37c4248e929e31cf5ee3af766c8409dd70d5ab65faa