From dfa98f4a50b7caeb8014f9615314eaa480138a19 Mon Sep 17 00:00:00 2001 From: Yakkhini <59007159+Yakkhini@users.noreply.github.com> Date: Mon, 26 Jan 2026 17:20:31 +0800 Subject: [PATCH 1/4] cpu-o3: ideal 2-taken implementation Change-Id: I39d54a0621d139cc00a156b02a6d7d888d9b15f0 Co-authored-by: Xu Boran --- src/cpu/pred/btb/decoupled_bpred.cc | 69 ++++++++++++++++------------- src/cpu/pred/btb/decoupled_bpred.hh | 2 + 2 files changed, 40 insertions(+), 31 deletions(-) diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc index 8869dec427..97d8092128 100644 --- a/src/cpu/pred/btb/decoupled_bpred.cc +++ b/src/cpu/pred/btb/decoupled_bpred.cc @@ -132,44 +132,51 @@ DecoupledBPUWithBTB::tick() return; } - // 1. Request new prediction if FSQ not full and we are idle - if (bpuState == BpuState::IDLE && !targetQueueFull()) { - if (blockPredictionPending) { - DPRINTF(Override, "Prediction blocked to prioritize resolve update\n"); - dbpBtbStats.predictionBlockedForUpdate++; - blockPredictionPending = false; - } else { - requestNewPrediction(); - bpuState = BpuState::PREDICTOR_DONE; + int predsRemainsToBeMade = enableTwoTaken ? 2 : 1; + unsigned tempNumOverrideBubbles = 0; + + while (predsRemainsToBeMade > 0) { + // 1. Request new prediction if FSQ not full and we are idle + if (bpuState == BpuState::IDLE && !targetQueueFull()) { + if (blockPredictionPending) { + DPRINTF(Override, "Prediction blocked to prioritize resolve update\n"); + dbpBtbStats.predictionBlockedForUpdate++; + blockPredictionPending = false; + } else { + requestNewPrediction(); + bpuState = BpuState::PREDICTOR_DONE; + } } - } - // 2. Handle pending prediction if available - if (bpuState == BpuState::PREDICTOR_DONE) { - DPRINTF(Override, "Generating final prediction for PC %#lx\n", s0PC); - numOverrideBubbles = generateFinalPredAndCreateBubbles(); - bpuState = BpuState::PREDICTION_OUTSTANDING; + // 2. Handle pending prediction if available + if (bpuState == BpuState::PREDICTOR_DONE) { + DPRINTF(Override, "Generating final prediction for PC %#lx\n", s0PC); + numOverrideBubbles = generateFinalPredAndCreateBubbles(); + bpuState = BpuState::PREDICTION_OUTSTANDING; - // Clear each predictor's output - for (int i = 0; i < numStages; i++) { - predsOfEachStage[i].btbEntries.clear(); + // Clear each predictor's output + for (int i = 0; i < numStages; i++) { + predsOfEachStage[i].btbEntries.clear(); + } } - } - if (bpuState == BpuState::PREDICTION_OUTSTANDING && numOverrideBubbles > 0) { - tage->dryRunCycle(s0PC); - } + if (bpuState == BpuState::PREDICTION_OUTSTANDING && numOverrideBubbles > 0) { + tage->dryRunCycle(s0PC); + } - // check if: - // 1. FSQ has space - // 2. there's no bubble - // 3. PREDICTION_OUTSTANDING - if (validateFSQEnqueue()) { - // Create new FSQ entry with the current prediction - processNewPrediction(); + // check if: + // 1. FSQ has space + // 2. there's no bubble + // 3. PREDICTION_OUTSTANDING + if (validateFSQEnqueue()) { + // Create new FSQ entry with the current prediction + processNewPrediction(); - DPRINTF(Override, "FSQ entry enqueued, prediction state reset\n"); - bpuState = BpuState::IDLE; + DPRINTF(Override, "FSQ entry enqueued, prediction state reset\n"); + bpuState = BpuState::IDLE; + } + + predsRemainsToBeMade--; } // Decrement override bubbles counter diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh index 576b0fce39..5700f6ef46 100644 --- a/src/cpu/pred/btb/decoupled_bpred.hh +++ b/src/cpu/pred/btb/decoupled_bpred.hh @@ -145,6 +145,8 @@ class DecoupledBPUWithBTB : public BPredUnit unsigned numOverrideBubbles{0}; + bool enableTwoTaken{true}; + bool validateFSQEnqueue(); void processNewPrediction(); From cb91571ed6a8520c4b04768265303f98c3077cd5 Mon Sep 17 00:00:00 2001 From: Yakkhini <59007159+Yakkhini@users.noreply.github.com> Date: Wed, 4 Feb 2026 10:46:22 +0800 Subject: [PATCH 2/4] cpu-o3: ideal 2-fetch Change-Id: Ic203a9694c093034744986309e796b9d66d6f826 --- configs/example/kmhv3.py | 1 + src/cpu/o3/fetch.cc | 48 +++++++++++++++++++++-------- src/cpu/o3/fetch.hh | 5 +-- src/cpu/pred/BranchPredictor.py | 3 ++ src/cpu/pred/btb/decoupled_bpred.cc | 2 ++ src/cpu/pred/btb/decoupled_bpred.hh | 16 ++++++++++ 6 files changed, 61 insertions(+), 14 deletions(-) diff --git a/configs/example/kmhv3.py b/configs/example/kmhv3.py index 71844d9478..97efa89117 100644 --- a/configs/example/kmhv3.py +++ b/configs/example/kmhv3.py @@ -107,6 +107,7 @@ def setKmhV3Params(args, system): cpu.branchPred.ittage.enabled = True cpu.branchPred.mgsc.enabled = False cpu.branchPred.ras.enabled = True + cpu.branchPred.enable2Fetch = True # l1 cache per core if args.caches: diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc index a5079227a0..87eaee3231 100644 --- a/src/cpu/o3/fetch.cc +++ b/src/cpu/o3/fetch.cc @@ -813,13 +813,34 @@ Fetch::lookupAndUpdateNextPC(const DynInstPtr &inst, PCStateBase &next_pc) run_out = fall_thru >= stream.predEndPC; } + bool do_2fetch = false; + // Track how many dynamic instructions were fetched for this (legacy) FTQ/FSQ entry. ftqEntryFetchedInsts[tid]++; if (run_out) { + if (predict_taken && dbpbtb->is2FetchEnabled() && dbpbtb->ftqHasNext()) { + const Addr target_pc = stream.predBranchInfo.target; + const auto &next_stream = dbpbtb->ftqNext(); + const Addr span = next_stream.predEndPC - stream.startPC; + const unsigned max_bytes = dbpbtb->getMaxFetchBytesPerCycle(); + const bool target_in_buffer = + target_pc >= fetchBuffer[tid].startPC && target_pc + 4 <= fetchBuffer[tid].startPC + fetchBufferSize; + + if (target_pc == next_stream.startPC && span <= max_bytes && target_in_buffer) { + do_2fetch = true; + DPRINTF(DecoupleBP, + "2Fetch: extend in-cycle to next FSQ entry (cur [%#lx, %#lx), next [%#lx, %#lx), span=%lu, " + "max=%u)\n", + stream.startPC, stream.predEndPC, next_stream.startPC, next_stream.predEndPC, span, max_bytes); + } + } + dbpbtb->consumeFetchTarget(ftqEntryFetchedInsts[tid]); ftqEntryFetchedInsts[tid] = 0; - fetchBuffer[tid].valid = false; - DPRINTF(DecoupleBP, "Used up fetch targets.\n"); + if (!do_2fetch) { + fetchBuffer[tid].valid = false; + DPRINTF(DecoupleBP, "Used up fetch targets.\n"); + } } inst->setLoopIteration(currentLoopIter); @@ -845,7 +866,7 @@ Fetch::lookupAndUpdateNextPC(const DynInstPtr &inst, PCStateBase &next_pc) ++fetchStats.predictedBranches; } - return predict_taken; + return predict_taken && !do_2fetch; } bool @@ -1857,6 +1878,8 @@ Fetch::checkMemoryNeeds(ThreadID tid, const PCStateBase &this_pc, fetch_pc + 4 > fetchBuffer[tid].startPC + fetchBufferSize) { DPRINTF(Fetch, "[tid:%i] PC %#x outside fetch buffer range [%#x, %#x), stalling on ICache\n", tid, fetch_pc, fetchBuffer[tid].startPC, fetchBuffer[tid].startPC + fetchBufferSize); + // Force issuing a new I-cache request. + fetchBuffer[tid].valid = false; return StallReason::IcacheStall; } @@ -1879,7 +1902,7 @@ Fetch::processSingleInstruction(ThreadID tid, PCStateBase &pc, StaticInstPtr &curMacroop) { auto *dec_ptr = decoder[tid]; - bool predictedBranch = false; + bool stopFetchThisCycle = false; bool newMacroop = false; // Create a copy of the current PC state to calculate the next PC. @@ -1936,16 +1959,17 @@ Fetch::processSingleInstruction(ThreadID tid, PCStateBase &pc, set(next_pc, pc); // Handle branch prediction and update next_pc for both modes - predictedBranch = lookupAndUpdateNextPC(instruction, *next_pc); + stopFetchThisCycle = lookupAndUpdateNextPC(instruction, *next_pc); + const bool predictedTaken = instruction->readPredTaken(); - if (predictedBranch) { + if (predictedTaken) { DPRINTF(Fetch, "[tid:%i] Branch detected with PC = %s, target = %s\n", instruction->threadNumber, pc, *next_pc); } if (isTraceMode()) { assert(traceFetch); - traceFetch->postBranchPredict(tid, instruction, traceForThisInst, pc, *next_pc, predictedBranch); + traceFetch->postBranchPredict(tid, instruction, traceForThisInst, pc, *next_pc, predictedTaken); } // A new macro-op also begins if the PC changes discontinuously. @@ -1959,7 +1983,7 @@ Fetch::processSingleInstruction(ThreadID tid, PCStateBase &pc, // Update the main PC state for the next instruction. set(pc, *next_pc); - return predictedBranch; + return stopFetchThisCycle; } void @@ -1977,7 +2001,7 @@ Fetch::performInstructionFetch(ThreadID tid) StaticInstPtr &curMacroop = macroop[tid]; // Control flags for main fetch loop - bool predictedBranch = false; + bool stopFetchThisCycle = false; DPRINTF(Fetch, "[tid:%i] Adding instructions to queue to decode.\n", tid); @@ -1985,7 +2009,7 @@ Fetch::performInstructionFetch(ThreadID tid) // For decoupled frontend (including trace mode), check FTQ availability StallReason stall = StallReason::NoStall; while (numInst < fetchWidth && fetchQueue[tid].size() < fetchQueueSize && - !predictedBranch && !ftqEmpty() && !waitForVsetvl) { + !stopFetchThisCycle && !ftqEmpty() && !waitForVsetvl) { // Check memory needs and supply bytes to decoder if required stall = checkMemoryNeeds(tid, pc_state, curMacroop); @@ -1998,7 +2022,7 @@ Fetch::performInstructionFetch(ThreadID tid) // into multiple micro-ops. do { // Process a single instruction, from decoding to PC update. - predictedBranch = processSingleInstruction(tid, pc_state, curMacroop); + stopFetchThisCycle = processSingleInstruction(tid, pc_state, curMacroop); } while (curMacroop && numInst < fetchWidth && @@ -2017,7 +2041,7 @@ Fetch::performInstructionFetch(ThreadID tid) } // Log why fetch stopped - if (predictedBranch) { + if (stopFetchThisCycle) { DPRINTF(Fetch, "[tid:%i] Done fetching, predicted branch instruction encountered.\n", tid); } else if (numInst >= fetchWidth) { DPRINTF(Fetch, "[tid:%i] Done fetching, reached fetch bandwidth for this cycle.\n", tid); diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index 65ae953797..96565f9963 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -354,7 +354,8 @@ class Fetch * Looks up the branch predictor, gets a prediction, and updates the PC. * @param inst The dynamic instruction object. * @param next_pc The PC state to update with the prediction. - * @return true if a branch was predicted taken. + * @return true if fetch should stop this cycle due to a predicted-taken + * branch (2Fetch may override and return false). */ bool lookupAndUpdateNextPC(const DynInstPtr &inst, PCStateBase &next_pc); @@ -564,7 +565,7 @@ class Fetch * @param tid The thread ID of the instruction. * @param pc The current program counter state (will be updated). * @param curMacroop The current macro-op being processed (if any). - * @return true if a branch was predicted. + * @return true if fetch should stop this cycle. */ bool processSingleInstruction(ThreadID tid, PCStateBase &pc, diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py index a5132f48af..77860e1dac 100644 --- a/src/cpu/pred/BranchPredictor.py +++ b/src/cpu/pred/BranchPredictor.py @@ -1178,3 +1178,6 @@ class DecoupledBPUWithBTB(BranchPredictor): bpDBSwitches = VectorParam.String([], "Enable which traces in the form of database") resolveBlockThreshold = Param.Unsigned(8, "Consecutive resolve dequeue failures before blocking prediction once") + + enable2Fetch = Param.Bool(False, "Enable 2fetch feature") + maxFetchBytesPerCycle = Param.Unsigned(64, "Maximum fetch bytes per cycle for 2fetch") diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc index 97d8092128..e3a244be33 100644 --- a/src/cpu/pred/btb/decoupled_bpred.cc +++ b/src/cpu/pred/btb/decoupled_bpred.cc @@ -48,6 +48,8 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p) numStages(p.numStages), historyManager(16), // TODO: fix this resolveBlockThreshold(p.resolveBlockThreshold), + enable2Fetch(p.enable2Fetch), + maxFetchBytesPerCycle(p.maxFetchBytesPerCycle), dbpBtbStats(this, p.numStages, p.fsq_size, maxInstsNum) { if (bpDBSwitches.size() > 0) { diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh index 5700f6ef46..5265ec138c 100644 --- a/src/cpu/pred/btb/decoupled_bpred.hh +++ b/src/cpu/pred/btb/decoupled_bpred.hh @@ -143,6 +143,9 @@ class DecoupledBPUWithBTB : public BPredUnit unsigned resolveDequeueFailCounter{0}; const unsigned resolveBlockThreshold; + const bool enable2Fetch; + const unsigned maxFetchBytesPerCycle; + unsigned numOverrideBubbles{0}; bool enableTwoTaken{true}; @@ -213,6 +216,13 @@ class DecoupledBPUWithBTB : public BPredUnit return fetchTargetQueue[id - fetchTargetBaseId]; } + const FetchTarget& + getTarget(FetchTargetId id) const + { + assert(hasTarget(id)); + return fetchTargetQueue[id - fetchTargetBaseId]; + } + FetchTargetId frontTargetId() const { @@ -455,6 +465,12 @@ class DecoupledBPUWithBTB : public BPredUnit FetchTargetId ftqHeadId() const { assert(ftqHasHead()); return fetchHeadFtqId; } const FetchTarget &ftqHead() { assert(ftqHasHead()); return getTarget(fetchHeadFtqId); } + bool ftqHasNext() const { return hasTarget(fetchHeadFtqId + 1); } + const FetchTarget &ftqNext() const { assert(ftqHasNext()); return getTarget(fetchHeadFtqId + 1); } + + bool is2FetchEnabled() const { return enable2Fetch; } + unsigned getMaxFetchBytesPerCycle() const { return maxFetchBytesPerCycle; } + void dumpFsq(const char *when); // Dummy overriding From fca73b1f80a259d462b35ae13caba537cf8c590b Mon Sep 17 00:00:00 2001 From: Yakkhini <59007159+Yakkhini@users.noreply.github.com> Date: Wed, 4 Feb 2026 16:24:32 +0800 Subject: [PATCH 3/4] cpu-o3: enable 2-fetch on idealkmhv3 config Change-Id: I3f0f686000b610c3bf842e62c9b9e91e7188a028 --- configs/example/idealkmhv3.py | 1 + 1 file changed, 1 insertion(+) diff --git a/configs/example/idealkmhv3.py b/configs/example/idealkmhv3.py index aebedb4f85..c2a37b645e 100644 --- a/configs/example/idealkmhv3.py +++ b/configs/example/idealkmhv3.py @@ -80,6 +80,7 @@ def setKmhV3IdealParams(args, system): if args.bp_type == 'DecoupledBPUWithBTB': cpu.branchPred.ftq_size = 256 cpu.branchPred.fsq_size = 256 + cpu.branchPred.enable2Fetch = True # l1 cache per core if args.caches: From 51bfe68be18795fdfe7e74235fd6688421a0f0ac Mon Sep 17 00:00:00 2001 From: Yakkhini <59007159+Yakkhini@users.noreply.github.com> Date: Thu, 5 Feb 2026 10:51:44 +0800 Subject: [PATCH 4/4] cpu-o3: reduce fsq size to 64 Change-Id: Ida30dab747aaf11d36b38637476b1c137e589942 --- configs/example/idealkmhv3.py | 4 ++-- configs/example/kmhv3.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/example/idealkmhv3.py b/configs/example/idealkmhv3.py index c2a37b645e..20fc93243c 100644 --- a/configs/example/idealkmhv3.py +++ b/configs/example/idealkmhv3.py @@ -78,8 +78,8 @@ def setKmhV3IdealParams(args, system): # branch predictor if args.bp_type == 'DecoupledBPUWithBTB': - cpu.branchPred.ftq_size = 256 - cpu.branchPred.fsq_size = 256 + cpu.branchPred.ftq_size = 64 + cpu.branchPred.fsq_size = 64 cpu.branchPred.enable2Fetch = True # l1 cache per core diff --git a/configs/example/kmhv3.py b/configs/example/kmhv3.py index 97efa89117..ea7809093d 100644 --- a/configs/example/kmhv3.py +++ b/configs/example/kmhv3.py @@ -92,8 +92,8 @@ def setKmhV3Params(args, system): # branch predictor if args.bp_type == 'DecoupledBPUWithBTB': - cpu.branchPred.ftq_size = 256 - cpu.branchPred.fsq_size = 256 + cpu.branchPred.ftq_size = 64 + cpu.branchPred.fsq_size = 64 cpu.branchPred.mbtb.resolvedUpdate = True cpu.branchPred.tage.resolvedUpdate = True