From 5ce9f918ed3b5c28bc764f6d5127a5b68c0b9084 Mon Sep 17 00:00:00 2001 From: Ashay Rane <253344819+raneashay@users.noreply.github.com> Date: Wed, 25 Mar 2026 22:45:49 +0000 Subject: [PATCH] C2: spill scalar live ranges before high-pressure vector loops This patch adds a new pass that spills scalar values residing in vector registers to memory so that hot loops have complete access to the vector register file. Specifically, on both X64 and AArch64, scalar floating-point values often reside in vector registers (e.g. XMM0-XMM15 on AVX2 or D0-D31 on AArch64), and if these scalar live ranges overlap with vector live ranges inside a hot loop, then the loop will have fewer vector registers to work with, resulting in frequent spilling and restoring of the vector registers within the hot loop. The newly introduced pass in this patch first analyzes loops that are both high frequency and have a high register pressure, before finding scalar live ranges that have no definitions inside the loop and which overlap with the vector register file. Such live ranges are then split so that the scalar values are spilled at the beginning of the loop and restored at the end of the loop. I have validated this on an AVX2 machine using a vectorized 3x4 DGEMM outer product kernel, which shows vector spills in the inner loop dropping from 17 to 2. However, it doesn't look like there is a good way to add a JTreg test for this change, since we don't seem to have a reliable way to identify spill and restore assembly instructions using `IRNode`. One alternative is to parse the output of `PrintOptoAssembly` but that approach seems very fragile, especially since a source-level loop is often broken down into multiple pre/main/post loops.
---
 src/hotspot/share/opto/chaitin.cpp | 96 ++++++++++++++++++++++++++++++
 src/hotspot/share/opto/chaitin.hpp |  5 ++
 2 files changed, 101 insertions(+)

diff --git a/src/hotspot/share/opto/chaitin.cpp b/src/hotspot/share/opto/chaitin.cpp
index 3ba3ffc1045ca..c9d8947deb22a 100644
--- a/src/hotspot/share/opto/chaitin.cpp
+++ b/src/hotspot/share/opto/chaitin.cpp
@@ -480,6 +480,16 @@ void PhaseChaitin::Register_Allocate() {
   if (C->failing()) {
     return;
   }
+
+  // If we notice live ranges of scalars that begin outside a vector loop that
+  // is executed frequently and has a high register pressure and if these scalar
+  // values occupy part of the vector register (e.g. for float/double scalars),
+  // then split these live ranges so that the vector loop has access to the
+  // entire vector register file. Although this means that the scalars will
+  // have to be restored later, it is cheaper to spill and restore scalars once
+  // rather than spilling and restoring vectors in each iteration of the loop.
+  must_spill += spill_scalars_before_vector_loops();
+
   // If we have a guaranteed spill, might as well spill now
   if (must_spill) {
     if(!_lrg_map.max_lrg_id()) {
@@ -1221,6 +1231,92 @@ void PhaseChaitin::set_was_low() {
 #endif
 }

+// Force-spill scalar live ranges that overlap the vector register file and that
+// appear in the live set of a high-frequency, high-float-pressure loop header
+// without being defined inside that loop. Returns the number of live ranges
+// newly marked as must-spill so that the caller can add it to its own count.
+uint PhaseChaitin::spill_scalars_before_vector_loops() {
+  // We want to split live ranges only when the scalar values reside in vector
+  // registers, so we build a combined mask of all vector register classes.
+  // Later, we check if this mask overlaps with that of the scalar live range.
+  static const uint vector_ideal_regs[] = {
+    Op_VecS, Op_VecD, Op_VecX, Op_VecY, Op_VecZ, Op_VecA
+  };
+
+  ResourceMark rm;
+  ResourceMark rm_masks(C->regmask_arena());
+  RegMask vector_regs(C->regmask_arena());
+
+  for (uint idx = 0; idx < ARRAY_SIZE(vector_ideal_regs); idx++) {
+    const RegMask* vmask = Matcher::idealreg2regmask[vector_ideal_regs[idx]];
+    if (vmask != nullptr) {
+      vector_regs.or_with(*vmask);
+    }
+  }
+
+  // Early return if couldn't find a combined vector mask.
+  if (vector_regs.is_empty()) {
+    return 0;
+  }
+
+  uint spill_count = 0;
+  GrowableArray<Block*> loop_headers;
+
+  // Collect all qualifying loop headers so that we can limit our analysis.
+  for (uint idx = 0; idx < _cfg.number_of_blocks(); idx++) {
+    Block* block = _cfg.get_block(idx);
+    if (block->_freg_pressure > Matcher::float_pressure_limit() &&
+        block->_freq >= _high_frequency_lrg &&
+        block->head()->is_Loop() &&
+        block->_loop != nullptr) {
+      loop_headers.push(block);
+    }
+  }
+
+  // Iterate over every qualifying loop header to determine if live ranges are
+  // defined inside or outside the loop before deciding whether to spill them.
+  for (int header_idx = 0; header_idx < loop_headers.length(); header_idx++) {
+    Block* loop_header = loop_headers.at(header_idx);
+    CFGLoop* loop = loop_header->_loop;
+    ResourceBitMap live_ranges_defined_in_loop(_lrg_map.max_lrg_id());
+
+    for (uint block_idx = 0; block_idx < _cfg.number_of_blocks(); block_idx++) {
+      Block* block = _cfg.get_block(block_idx);
+      if (loop->in_loop_nest(block)) {
+        for (uint node_idx = 0; node_idx < block->number_of_nodes(); node_idx++) {
+          Node* node = block->get_node(node_idx);
+          uint live_range_id = _lrg_map.live_range_id(node);
+          if (live_range_id != 0) {
+            live_ranges_defined_in_loop.set_bit(live_range_id);
+          }
+        }
+      }
+    }
+
+    // Check the live ranges that are live at the loop header and check if
+    // they're in `live_ranges_defined_in_loop`.
+    IndexSetIterator elements(_live->live(loop_header));
+    for (uint range_id = elements.next(); range_id != 0; range_id = elements.next()) {
+      LRG& live_range = lrgs(range_id);
+
+      // Ignore this live range if it's for a vector value, or if it does not
+      // occupy a vector register, or if it is expected to spill at the header,
+      // or if it is defined inside the loop.
+      if (live_range._is_vector || !live_range.mask().overlap(vector_regs) ||
+          live_range._must_spill == 1 || !live_range.alive() ||
+          live_ranges_defined_in_loop.at(range_id)) {
+        continue;
+      }
+
+      live_range._must_spill = 1;
+      live_range.set_reg(OptoReg::Name(LRG::SPILL_REG));
+      spill_count += 1;
+    }
+  }
+
+  return spill_count;
+}
+
 // Compute cost/area ratio, in case we spill. Build the lo-degree list.
 void PhaseChaitin::cache_lrg_info( ) {
   Compile::TracePhase tp(_t_chaitinCacheLRG);
diff --git a/src/hotspot/share/opto/chaitin.hpp b/src/hotspot/share/opto/chaitin.hpp
index 2d4f7eeb3f2a8..14f3d712f8d62 100644
--- a/src/hotspot/share/opto/chaitin.hpp
+++ b/src/hotspot/share/opto/chaitin.hpp
@@ -691,6 +691,11 @@ class PhaseChaitin : public PhaseRegAlloc {
   // coalescing, it should Simplify. This call sets the was-lo-degree bit.
   void set_was_low();

+  // Force-spill scalar live ranges that are live across a high frequency, high
+  // register pressure vector loop and when the live ranges are defined outside
+  // the loop.
+  uint spill_scalars_before_vector_loops();
+
   // Init LRG caching of degree, numregs. Init lo_degree list.
   void cache_lrg_info( );
 