From f95836f92eaed9afbe67e37224765e785efc9072 Mon Sep 17 00:00:00 2001 From: Yan Yue <1131531947@qq.com> Date: Mon, 12 Jan 2026 15:11:03 +0800 Subject: [PATCH 1/3] cpu-o3: add 2Fetch features - Introduced support for 2Fetch features in the branch predictor and Fetch Stage. - Enhanced the FetchTargetQueue to manage next FTQ entries for 2Fetch functionality. - If two fetch blocks lie within the same 64-byte fetchBuffer, both can now be fetched in the same cycle. Change-Id: I3b112cc844c485d81cea1f4ed0ff221cb37d2782 --- src/cpu/o3/fetch.cc | 2 +- src/cpu/o3/fetch.hh | 17 +++- src/cpu/o3/fetch.md | 13 ++- src/cpu/pred/BranchPredictor.py | 4 + src/cpu/pred/btb/decoupled_bpred.cc | 108 +++++++++++++++++++++- src/cpu/pred/btb/decoupled_bpred.hh | 41 ++++++++ src/cpu/pred/btb/decoupled_bpred_stats.cc | 9 +- src/cpu/pred/btb/fetch_target_queue.cc | 54 +++++++++++ src/cpu/pred/btb/fetch_target_queue.hh | 23 +++++ 9 files changed, 262 insertions(+), 9 deletions(-) diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc index 93909a9186..348f885dbd 100644 --- a/src/cpu/o3/fetch.cc +++ b/src/cpu/o3/fetch.cc @@ -2070,7 +2070,7 @@ Fetch::performInstructionFetch(ThreadID tid) // Main instruction fetch loop - process until fetch width or other limits StallReason stall = StallReason::NoStall; while (numInst < fetchWidth && fetchQueue[tid].size() < fetchQueueSize && - !predictedBranch && !ftqEmpty() && !waitForVsetvl) { + !shouldStopFetchThisCycle(predictedBranch)) { // Check memory needs and supply bytes to decoder if required stall = checkMemoryNeeds(tid, pc_state, curMacroop); diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index 925527768e..27670eaa24 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -514,8 +514,21 @@ class Fetch /** Profile the reasons of fetch stall. */ void profileStall(ThreadID tid); - - bool ftqEmpty() { return isDecoupledFrontend() && usedUpFetchTargets; } + /** + * Decide whether fetch should stop for this cycle based on frontend mode. 
+ * - Decoupled: stop once usedUpFetchTargets is set; predictedBranch is ignored. + * - Non-decoupled: stop when a branch is predicted taken. + * Both modes stop while waitForVsetvl is set. */ + bool shouldStopFetchThisCycle(bool predictedBranch) + { + if (waitForVsetvl) { + return true; + } + if (isDecoupledFrontend()) { + return usedUpFetchTargets; + } + return predictedBranch; + } /** Set the reasons of all fetch stalls. */ void setAllFetchStalls(StallReason stall); diff --git a/src/cpu/o3/fetch.md b/src/cpu/o3/fetch.md index e9ba378e06..a07bdd1ae9 100644 --- a/src/cpu/o3/fetch.md +++ b/src/cpu/o3/fetch.md @@ -341,7 +341,16 @@ bool isFTBPred() { return branchPred->isFTB(); } bool isBTBPred() { return branchPred->isBTB(); } // 主要使用的预测器类型 // Track if FTQ is empty -bool ftqEmpty() { return isDecoupledFrontend() && usedUpFetchTargets; } +bool shouldStopFetchThisCycle(bool predictedBranch) +{ + if (waitForVsetvl) { + return true; + } + if (isDecoupledFrontend()) { + return usedUpFetchTargets; + } + return predictedBranch; +} ``` ### DecoupledBPUWithBTB 工作流程: @@ -573,7 +582,7 @@ void fetch(bool &status_change) { void performInstructionFetch(ThreadID tid, Addr fetch_addr, bool &status_change) { // 主循环: 处理直到fetch宽度或其他限制 while (numInst < fetchWidth && fetchQueue[tid].size() < fetchQueueSize && - !predictedBranch && !ftqEmpty() && !waitForVsetvl) { + !shouldStopFetchThisCycle(predictedBranch)) { // 1. 
检查内存需求并供给decoder stall = checkMemoryNeeds(tid, this_pc, curMacroop); diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py index 548a0a0425..ffadb63704 100644 --- a/src/cpu/pred/BranchPredictor.py +++ b/src/cpu/pred/BranchPredictor.py @@ -1181,3 +1181,7 @@ class DecoupledBPUWithBTB(BranchPredictor): enableLoopPredictor = Param.Bool(False, "Use loop predictor to predict loop exit") enableJumpAheadPredictor = Param.Bool(False, "Use jump ahead predictor to skip no-need-to-predict blocks") resolveBlockThreshold = Param.Unsigned(8, "Consecutive resolve dequeue failures before blocking prediction once") + + enable2Taken = Param.Bool(False, "Enable 2taken feature") + enable2Fetch = Param.Bool(False, "Enable 2fetch feature") + maxFetchBytesPerCycle = Param.Unsigned(64, "Maximum fetch bytes per cycle for 2fetch") diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc index bef8cb0862..8e34716200 100644 --- a/src/cpu/pred/btb/decoupled_bpred.cc +++ b/src/cpu/pred/btb/decoupled_bpred.cc @@ -1,5 +1,6 @@ #include "cpu/pred/btb/decoupled_bpred.hh" +#include #include #include "base/debug_helper.hh" @@ -27,6 +28,7 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p) enableLoopBuffer(p.enableLoopBuffer), enableLoopPredictor(p.enableLoopPredictor), enableJumpAheadPredictor(p.enableJumpAheadPredictor), + enable2Taken(p.enable2Taken), fetchTargetQueue(p.ftq_size), fetchStreamQueueSize(p.fsq_size), predictWidth(p.predictWidth), @@ -45,7 +47,9 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p) numStages(p.numStages), historyManager(16), // TODO: fix this resolveBlockThreshold(p.resolveBlockThreshold), - dbpBtbStats(this, p.numStages, p.fsq_size, maxInstsNum) + dbpBtbStats(this, p.numStages, p.fsq_size, maxInstsNum), + enable2Fetch(p.enable2Fetch), + maxFetchBytesPerCycle(p.maxFetchBytesPerCycle) { if (bpDBSwitches.size() > 0) { initDB(); @@ -160,7 +164,7 @@ 
DecoupledBPUWithBTB::tick() // Clear each predictor's output for (int i = 0; i < numStages; i++) { predsOfEachStage[i].btbEntries.clear(); - } + } } if (bpuState == BpuState::PREDICTION_OUTSTANDING && numOverrideBubbles > 0) { @@ -436,7 +440,20 @@ DecoupledBPUWithBTB::decoupledPredict(const StaticInstPtr &inst, // Increment instruction counter for current FTQ entry currentFtqEntryInstNum++; if (run_out_of_this_entry) { + // 2Fetch: when enabled and no extension is already pending, try to continue into the next FTQ entry. + // Count an attempt only when canExtendToNextFTQ() is actually evaluated, so attempts == successes + rejections. + dbpBtbStats.fetch2Attempts += (enable2Fetch && !has1Fetched) ? 1 : 0; + if (enable2Fetch && !has1Fetched && canExtendToNextFTQ(pc, target_to_fetch)) { + DPRINTF(DecoupleBP, "2Fetch: extending to next FTQ in same cycle\n"); + has1Fetched = true; + processFetchTargetCompletion(target_to_fetch); + extendToNextFTQ(pc); + // first fetchBlock is always taken, do not run out of FTQ now + return std::make_pair(true, false); + } + processFetchTargetCompletion(target_to_fetch); + has1Fetched = false; // reset 2fetch flag } DPRINTF(DecoupleBP, "Predict it %staken to %#lx\n", taken ? 
"" : "not ", @@ -1372,6 +1389,93 @@ DecoupledBPUWithBTB::recoverHistoryForSquash( } +// NEW: 2Fetch support methods implementation + +/** + * @brief Check if we can extend to next FTQ entry for 2fetch + * + * @param current_pc Current program counter + * @param current_ftq Current FTQ entry that is being completed + * @return true if 2fetch extension is possible + */ +bool +DecoupledBPUWithBTB::canExtendToNextFTQ(const PCStateBase &current_pc, const FtqEntry &current_ftq) +{ + // Early exit if 2fetch is disabled + if (!enable2Fetch) { + return false; + } + + if (!current_ftq.taken) { + DPRINTF(DecoupleBP, "2Fetch rejected: current FTQ is not taken\n"); + dbpBtbStats.fetch2FirstNotTaken++; + return false; + } + + // Check if next FTQ entry is available + if (!fetchTargetQueue.hasNext()) { + DPRINTF(DecoupleBP, "2Fetch rejected: no next FTQ entry available\n"); + dbpBtbStats.fetch2NoNextFTQ++; + return false; + } + + // Get next FTQ entry (without consuming it) + const auto &next_ftq = fetchTargetQueue.peekNext(); + // current_ftq is passed as parameter + + // The current PC must already sit at the start of the next FTQ entry + if (current_pc.instAddr() != next_ftq.startPC) { + DPRINTF(DecoupleBP, "2Fetch rejected: PC %#x not at next FTQ start %#x\n", + current_pc.instAddr(), next_ftq.startPC); + dbpBtbStats.fetch2FirstNotAtStart++; + return false; + } + + // Both FTQs must fit in [current start, current start + maxFetchBytesPerCycle); a backward target would wrap the unsigned span + Addr span = next_ftq.endPC - current_ftq.startPC; + if (next_ftq.startPC < current_ftq.startPC || span > maxFetchBytesPerCycle) { + DPRINTF(DecoupleBP, "2Fetch rejected: span %d exceeds %d bytes\n", + span, maxFetchBytesPerCycle); + dbpBtbStats.fetch2SpanTooLarge++; + return false; + } + + DPRINTF(DecoupleBP, "2Fetch enabled: extending to next FTQ [%#x, %#x), total span: %d bytes\n", + next_ftq.startPC, next_ftq.endPC, span); + return true; +} + +/** + * @brief Extend to process next FTQ entry for 2fetch + * + * @param pc Program counter reference to update + * Re-syncs the FTQ supply state to the already-advanced demand target + * and resets the per-entry instruction counter. 
+ * Sets pc/npc to the start of the new FTQ entry and counts a successful 2fetch + */ +void +DecoupledBPUWithBTB::extendToNextFTQ(PCStateBase &pc) +{ + // Re-sync FTQ supply with the demand target (already advanced by the caller) + fetchTargetQueue.advance(); + currentFtqEntryInstNum = 0; // Reset instruction counter for new FTQ + + // Get the new FTQ entry + const auto &target_to_fetch = fetchTargetQueue.getTarget(); + + DPRINTF(DecoupleBP, "Processing extended FTQ entry: [%#x, %#x)\n", + target_to_fetch.startPC, target_to_fetch.endPC); + + // Set PC to start of new FTQ + auto &rpc = pc.as(); + rpc.pc(target_to_fetch.startPC); + rpc.npc(target_to_fetch.startPC + 4); + rpc.uReset(); + + // Record successful 2fetch + dbpBtbStats.fetch2Successes++; +} + } // namespace btb_pred } // namespace branch_prediction diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh index a6f4180244..84af4d914c 100644 --- a/src/cpu/pred/btb/decoupled_bpred.hh +++ b/src/cpu/pred/btb/decoupled_bpred.hh @@ -82,6 +82,9 @@ class DecoupledBPUWithBTB : public BPredUnit JumpAheadPredictor jap; bool enableJumpAheadPredictor{false}; + // 2taken feature support + bool enable2Taken{false}; // Overwritten from the enable2Taken param (default False) + private: std::string _name; @@ -377,6 +380,14 @@ class DecoupledBPUWithBTB : public BPredUnit statistics::Scalar s3PredWrongIttage; statistics::Scalar s3PredWrongRas; + // NEW: 2Fetch statistics + statistics::Scalar fetch2Attempts; ///< Number of 2fetch attempts + statistics::Scalar fetch2Successes; ///< Number of successful 2fetch cycles + statistics::Scalar fetch2SpanTooLarge; ///< Rejected due to span > maxFetchBytes + statistics::Scalar fetch2NoNextFTQ; ///< Rejected due to no next FTQ entry + statistics::Scalar fetch2FirstNotTaken; ///< Rejected due to current FTQ is not taken + statistics::Scalar fetch2FirstNotAtStart; ///< Rejected due to current PC is not at next FTQ start + DBPBTBStats(statistics::Group* parent, unsigned numStages, unsigned fsqSize, unsigned maxInstsNum); } dbpBtbStats; @@ -999,6 +1010,36 @@ class DecoupledBPUWithBTB 
: public BPredUnit */ int currentFtqEntryInstNum{0}; + // NEW: 2Fetch support variables + /** + * @brief Enable 2fetch capability (overwritten from the enable2Fetch param, default False) + */ + bool enable2Fetch{false}; + + /** + * @brief True after a 2fetch extension until the extended-to FTQ entry completes + */ + bool has1Fetched{false}; + + /** + * @brief Maximum fetch bytes per cycle for 2fetch + */ + unsigned maxFetchBytesPerCycle{64}; + + // NEW: 2Fetch support methods + /** + * @brief Check if we can extend to next FTQ entry for 2fetch + * + * @param current_pc Current program counter + * @param current_ftq Current FTQ entry that is being completed + * @return true if extension to next FTQ is possible + */ + bool canExtendToNextFTQ(const PCStateBase &current_pc, const FtqEntry &current_ftq); + + // Extend processing to next FTQ entry for 2fetch + + void extendToNextFTQ(PCStateBase &pc); + /** * @brief Dump statistics on program exit * diff --git a/src/cpu/pred/btb/decoupled_bpred_stats.cc b/src/cpu/pred/btb/decoupled_bpred_stats.cc index ed4c2f8688..5f7066be62 100644 --- a/src/cpu/pred/btb/decoupled_bpred_stats.cc +++ b/src/cpu/pred/btb/decoupled_bpred_stats.cc @@ -505,8 +505,13 @@ DecoupledBPUWithBTB::DBPBTBStats::DBPBTBStats( ADD_STAT(s3PredWrongMbtb, statistics::units::Count::get(), "S3pred wrong blame mbtb "), ADD_STAT(s3PredWrongTage, statistics::units::Count::get(), "S3pred wrong blame tage "), ADD_STAT(s3PredWrongIttage, statistics::units::Count::get(), "S3pred wrong blame ittage "), - ADD_STAT(s3PredWrongRas, statistics::units::Count::get(), "S3pred wrong blame ras ") - + ADD_STAT(s3PredWrongRas, statistics::units::Count::get(), "S3pred wrong blame ras "), + ADD_STAT(fetch2Attempts, statistics::units::Count::get(), "Number of 2fetch attempts"), + ADD_STAT(fetch2Successes, statistics::units::Count::get(), "Number of successful 2fetch cycles"), + ADD_STAT(fetch2SpanTooLarge, statistics::units::Count::get(), "Rejected due to span > maxFetchBytes"), + ADD_STAT(fetch2NoNextFTQ, statistics::units::Count::get(), "Rejected due to no next FTQ entry"), + 
ADD_STAT(fetch2FirstNotTaken, statistics::units::Count::get(), "Rejected due to current FTQ is not taken"), + ADD_STAT(fetch2FirstNotAtStart, statistics::units::Count::get(), "Rejected due to PC is != next FTQ start") { predsOfEachStage.init(numStages); commitPredsFromEachStage.init(numStages+1); diff --git a/src/cpu/pred/btb/fetch_target_queue.cc b/src/cpu/pred/btb/fetch_target_queue.cc index 904e0e299a..8c4afea147 100644 --- a/src/cpu/pred/btb/fetch_target_queue.cc +++ b/src/cpu/pred/btb/fetch_target_queue.cc @@ -264,6 +264,60 @@ FetchTargetQueue::resetPC(Addr new_pc) fetchTargetEnqState.pc = new_pc; } +// NEW: 2Fetch support methods implementation + +/** + * @brief Check if there is a next available FTQ entry + * + * @return true if next FTQ entry is available + */ +bool +FetchTargetQueue::hasNext() const +{ + // Check if there's an entry with ID = fetchDemandTargetId + 1 + auto next_it = ftq.find(fetchDemandTargetId + 1); + return next_it != ftq.end(); +} + +/** + * @brief Peek at the next FTQ entry without consuming it + * + * @return Reference to the next FTQ entry + */ +const FtqEntry& +FetchTargetQueue::peekNext() const +{ + auto next_it = ftq.find(fetchDemandTargetId + 1); + assert(next_it != ftq.end()); + return next_it->second; +} + +/** + * @brief Point the supply state at the entry for the current demand target ID + * + * Used for 2fetch after the demand target ID has already been moved on; + * the ID itself is not changed here, and a missing entry invalidates the supply + */ +void +FetchTargetQueue::advance() +{ + // Already moved to next target ID in processFetchTargetCompletion + // Update supply state to point to new target + auto next_it = ftq.find(fetchDemandTargetId); + if (next_it != ftq.end()) { + supplyFetchTargetState.valid = true; + supplyFetchTargetState.targetId = fetchDemandTargetId; + supplyFetchTargetState.entry = &(next_it->second); + + DPRINTF(DecoupleBP, + "Advanced to next FTQ entry: ID %lu, PC [%#lx, %#lx)\n", + fetchDemandTargetId, next_it->second.startPC, next_it->second.endPC); + } else { + 
supplyFetchTargetState.valid = false; + supplyFetchTargetState.entry = nullptr; + } +} + } // namespace btb_pred } // namespace branch_prediction diff --git a/src/cpu/pred/btb/fetch_target_queue.hh b/src/cpu/pred/btb/fetch_target_queue.hh index 94b70b88b9..46377fa0c8 100644 --- a/src/cpu/pred/btb/fetch_target_queue.hh +++ b/src/cpu/pred/btb/fetch_target_queue.hh @@ -157,6 +157,29 @@ class FetchTargetQueue */ void finishCurrentFetchTarget(); + // NEW: 2Fetch support methods + /** + * @brief Check if there is a next available FTQ entry + * + * @return true if next FTQ entry is available + */ + bool hasNext() const; + + /** + * @brief Peek at the next FTQ entry without consuming it + * + * @return Reference to the next FTQ entry + */ + const FtqEntry& peekNext() const; + + /** + * @brief Point the supply state at the entry for the current demand target ID + * + * Used for 2fetch after the demand target ID has already been moved on; + * the ID itself is not changed here, and a missing entry invalidates the supply + */ + void advance(); + /** * @brief Try to supply fetch with a target matching the demand PC * From 09b46ebaf8896173cba846a1792ec67446d82d7b Mon Sep 17 00:00:00 2001 From: Yan Yue <1131531947@qq.com> Date: Mon, 12 Jan 2026 15:59:39 +0800 Subject: [PATCH 2/3] cpu-o3: fix bug for decoupled frontend 2fetch - Added handling in the fetch stage to invalidate the fetch buffer and force a new FTQ entry when the architectural fetch PC exceeds the buffered FTQ window in decoupled-frontend mode. This prevents potential deadlocks during ICache stalls. 
Change-Id: I38a6003e136a51c115f33358a62138dc987336da --- src/cpu/o3/fetch.cc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc index 348f885dbd..1c862bf91a 100644 --- a/src/cpu/o3/fetch.cc +++ b/src/cpu/o3/fetch.cc @@ -1961,6 +1961,18 @@ Fetch::checkMemoryNeeds(ThreadID tid, const PCStateBase &this_pc, fetch_pc + 4 > fetchBuffer[tid].startPC + fetchBufferSize) { DPRINTF(Fetch, "[tid:%i] PC %#x outside fetch buffer range [%#x, %#x), stalling on ICache\n", tid, fetch_pc, fetchBuffer[tid].startPC, fetchBuffer[tid].startPC + fetchBufferSize); + if (isDecoupledFrontend()) { + // In decoupled-frontend mode, the I-cache fetch address is driven by + // FTQ entries. If the architectural fetch PC escapes the currently + // buffered FTQ window (e.g., due to redirect/loop), we must force a + // new FTQ entry and invalidate the current buffer, otherwise fetch + // can deadlock in an ICache stall without issuing a new request. + usedUpFetchTargets = true; + fetchBuffer[tid].valid = false; + DPRINTF(Fetch, "[tid:%i] Decoupled frontend: invalidating fetchBuffer and " + "forcing new FTQ entry (pc=%#x, bufStart=%#x)\n", + tid, fetch_pc, fetchBuffer[tid].startPC); + } return StallReason::IcacheStall; } From 12224429066d5f5685216ac278865c472f06754b Mon Sep 17 00:00:00 2001 From: Yan Yue <1131531947@qq.com> Date: Mon, 12 Jan 2026 16:00:09 +0800 Subject: [PATCH 3/3] cpu-o3: test 2 fetch Change-Id: Iaa4a853508ae80e412a8b3bdd41dea26d9eb8c29 --- configs/example/kmhv3.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/configs/example/kmhv3.py b/configs/example/kmhv3.py index 7ae076ffb1..8442c6f5f9 100644 --- a/configs/example/kmhv3.py +++ b/configs/example/kmhv3.py @@ -108,6 +108,8 @@ def setKmhV3Params(args, system): cpu.branchPred.mgsc.enabled = False cpu.branchPred.ras.enabled = True + cpu.branchPred.enable2Fetch = True + # l1 cache per core if args.caches: cpu.icache.size = '64kB'