From f8d3efce766d24dc5c3646742ba0f1149486544d Mon Sep 17 00:00:00 2001 From: Christian Bruckmayer Date: Tue, 1 Jul 2025 12:58:24 +0100 Subject: [PATCH] Fix race in requeue --- redis/requeue.lua | 3 +++ ruby/lib/ci/queue/redis/build_record.rb | 10 ++++++++-- ruby/lib/ci/queue/redis/worker.rb | 1 + ruby/lib/minitest/queue/build_status_recorder.rb | 4 +++- ruby/test/integration/minitest_redis_test.rb | 3 ++- 5 files changed, 17 insertions(+), 4 deletions(-) diff --git a/redis/requeue.lua b/redis/requeue.lua index a1fcc25c..b1dd35ec 100644 --- a/redis/requeue.lua +++ b/redis/requeue.lua @@ -4,6 +4,7 @@ local queue_key = KEYS[3] local zset_key = KEYS[4] local worker_queue_key = KEYS[5] local owners_key = KEYS[6] +local error_reports_key = KEYS[7] local max_requeues = tonumber(ARGV[1]) local global_max_requeues = tonumber(ARGV[2]) @@ -31,6 +32,8 @@ end redis.call('hincrby', requeues_count_key, '___total___', 1) redis.call('hincrby', requeues_count_key, test, 1) +redis.call('hdel', error_reports_key, test) + local pivot = redis.call('lrange', queue_key, -1 - offset, 0 - offset)[1] if pivot then redis.call('linsert', queue_key, 'BEFORE', pivot, test) diff --git a/ruby/lib/ci/queue/redis/build_record.rb b/ruby/lib/ci/queue/redis/build_record.rb index 4c68c100..9463cfbc 100644 --- a/ruby/lib/ci/queue/redis/build_record.rb +++ b/ruby/lib/ci/queue/redis/build_record.rb @@ -65,9 +65,9 @@ def record_error(id, payload, stats: nil) nil end - def record_success(id, stats: nil, skip_flaky_record: false, acknowledge: true) + def record_success(id, stats: nil, skip_flaky_record: false) _, error_reports_deleted_count, requeued_count, _ = redis.multi do |transaction| - @queue.acknowledge(id, pipeline: transaction) if acknowledge + @queue.acknowledge(id, pipeline: transaction) transaction.hdel(key('error-reports'), id) transaction.hget(key('requeues-count'), id) record_stats(stats, pipeline: transaction) @@ -76,6 +76,12 @@ def record_success(id, stats: nil, skip_flaky_record: false, acknowledge: true) nil end + def record_requeue(id, stats: nil) + redis.pipelined do |pipeline| + record_stats(stats, pipeline: pipeline) + end + end + def record_flaky(id, stats: nil) redis.pipelined do |pipeline| pipeline.sadd?( diff --git a/ruby/lib/ci/queue/redis/worker.rb b/ruby/lib/ci/queue/redis/worker.rb index db87e566..09f55c3b 100644 --- a/ruby/lib/ci/queue/redis/worker.rb +++ b/ruby/lib/ci/queue/redis/worker.rb @@ -131,6 +131,7 @@ def requeue(test, offset: Redis.requeue_offset) key('running'), key('worker', worker_id, 'queue'), key('owners'), + key('error-reports'), ], argv: [config.max_requeues, global_max_requeues, test_key, offset], ) == 1 diff --git a/ruby/lib/minitest/queue/build_status_recorder.rb b/ruby/lib/minitest/queue/build_status_recorder.rb index 7a05bdc5..bf50b1f5 100644 --- a/ruby/lib/minitest/queue/build_status_recorder.rb +++ b/ruby/lib/minitest/queue/build_status_recorder.rb @@ -51,8 +51,10 @@ def record(test) stats = COUNTERS.zip(COUNTERS.map { |c| send(c) }).to_h if (test.failure || test.error?) && !test.skipped? build.record_error("#{test.klass}##{test.name}", dump(test), stats: stats) + elsif test.requeued? + build.record_requeue("#{test.klass}##{test.name}", stats: stats) else - build.record_success("#{test.klass}##{test.name}", stats: stats, skip_flaky_record: test.skipped?, acknowledge: !test.requeued?) + build.record_success("#{test.klass}##{test.name}", stats: stats, skip_flaky_record: test.skipped?) end end diff --git a/ruby/test/integration/minitest_redis_test.rb b/ruby/test/integration/minitest_redis_test.rb index 370c516c..a0ab6c2c 100644 --- a/ruby/test/integration/minitest_redis_test.rb +++ b/ruby/test/integration/minitest_redis_test.rb @@ -509,6 +509,7 @@ def test_retry_report assert_equal 100, error_reports.size error_reports.keys.each_with_index do |test_id, index| + queue.instance_variable_set(:@reserved_tests, Set.new([test_id])) queue.build.record_success(test_id.dup, stats: { 'assertions' => index + 1, 'errors' => 0, @@ -516,7 +517,7 @@ def test_retry_report 'skips' => 0, 'requeues' => 0, 'total_time' => index + 1, - }, acknowledge: false) + }) end # Retry first worker, bailing out