Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions configs/example/kmhv3.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ def setKmhV3Params(args, system):
cpu.branchPred.mgsc.enabled = False
cpu.branchPred.ras.enabled = True

cpu.branchPred.enable2Fetch = True

# l1 cache per core
if args.caches:
cpu.icache.size = '64kB'
Expand Down
14 changes: 13 additions & 1 deletion src/cpu/o3/fetch.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1961,6 +1961,18 @@ Fetch::checkMemoryNeeds(ThreadID tid, const PCStateBase &this_pc,
fetch_pc + 4 > fetchBuffer[tid].startPC + fetchBufferSize) {
DPRINTF(Fetch, "[tid:%i] PC %#x outside fetch buffer range [%#x, %#x), stalling on ICache\n",
tid, fetch_pc, fetchBuffer[tid].startPC, fetchBuffer[tid].startPC + fetchBufferSize);
if (isDecoupledFrontend()) {
// In decoupled-frontend mode, the I-cache fetch address is driven by
// FTQ entries. If the architectural fetch PC escapes the currently
// buffered FTQ window (e.g., due to redirect/loop), we must force a
// new FTQ entry and invalidate the current buffer, otherwise fetch
// can deadlock in an ICache stall without issuing a new request.
usedUpFetchTargets = true;
fetchBuffer[tid].valid = false;
DPRINTF(Fetch, "[tid:%i] Decoupled frontend: invalidating fetchBuffer and "
"forcing new FTQ entry (pc=%#x, bufStart=%#x)\n",
tid, fetch_pc, fetchBuffer[tid].startPC);
}
return StallReason::IcacheStall;
}

Expand Down Expand Up @@ -2070,7 +2082,7 @@ Fetch::performInstructionFetch(ThreadID tid)
// Main instruction fetch loop - process until fetch width or other limits
StallReason stall = StallReason::NoStall;
while (numInst < fetchWidth && fetchQueue[tid].size() < fetchQueueSize &&
!predictedBranch && !ftqEmpty() && !waitForVsetvl) {
!shouldStopFetchThisCycle(predictedBranch)) {

// Check memory needs and supply bytes to decoder if required
stall = checkMemoryNeeds(tid, pc_state, curMacroop);
Expand Down
17 changes: 15 additions & 2 deletions src/cpu/o3/fetch.hh
Original file line number Diff line number Diff line change
Expand Up @@ -514,8 +514,21 @@ class Fetch
/** Profile the reasons of fetch stall. */
void profileStall(ThreadID tid);


bool ftqEmpty() { return isDecoupledFrontend() && usedUpFetchTargets; }
/**
 * @brief Decide whether the fetch loop must halt for the current cycle.
 *
 * Vsetvl serialization always halts fetch. Otherwise the stop condition
 * depends on the frontend mode: a decoupled frontend halts once the FTQ
 * supply is exhausted, while the classic frontend halts on a predicted
 * taken branch.
 *
 * @param predictedBranch Whether a taken branch was predicted this cycle
 * @return true if fetch should stop issuing instructions this cycle
 */
bool shouldStopFetchThisCycle(bool predictedBranch)
{
    if (waitForVsetvl) {
        return true;
    }
    return isDecoupledFrontend() ? usedUpFetchTargets : predictedBranch;
}

/** Set the reasons of all fetch stalls. */
void setAllFetchStalls(StallReason stall);
Expand Down
13 changes: 11 additions & 2 deletions src/cpu/o3/fetch.md
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,16 @@ bool isFTBPred() { return branchPred->isFTB(); }
bool isBTBPred() { return branchPred->isBTB(); } // 主要使用的预测器类型

// Track if FTQ is empty
bool ftqEmpty() { return isDecoupledFrontend() && usedUpFetchTargets; }
bool shouldStopFetchThisCycle(bool predictedBranch)
{
if (waitForVsetvl) {
return true;
}
if (isDecoupledFrontend()) {
return usedUpFetchTargets;
}
return predictedBranch;
}
```

### DecoupledBPUWithBTB 工作流程:
Expand Down Expand Up @@ -573,7 +582,7 @@ void fetch(bool &status_change) {
void performInstructionFetch(ThreadID tid, Addr fetch_addr, bool &status_change) {
// 主循环: 处理直到fetch宽度或其他限制
while (numInst < fetchWidth && fetchQueue[tid].size() < fetchQueueSize &&
!predictedBranch && !ftqEmpty() && !waitForVsetvl) {
!shouldStopFetchThisCycle(predictedBranch)) {

// 1. 检查内存需求并供给decoder
stall = checkMemoryNeeds(tid, this_pc, curMacroop);
Expand Down
4 changes: 4 additions & 0 deletions src/cpu/pred/BranchPredictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1181,3 +1181,7 @@ class DecoupledBPUWithBTB(BranchPredictor):
enableLoopPredictor = Param.Bool(False, "Use loop predictor to predict loop exit")
enableJumpAheadPredictor = Param.Bool(False, "Use jump ahead predictor to skip no-need-to-predict blocks")
resolveBlockThreshold = Param.Unsigned(8, "Consecutive resolve dequeue failures before blocking prediction once")

enable2Taken = Param.Bool(False, "Enable 2taken feature")
enable2Fetch = Param.Bool(False, "Enable 2fetch feature")
maxFetchBytesPerCycle = Param.Unsigned(64, "Maximum fetch bytes per cycle for 2fetch")
108 changes: 106 additions & 2 deletions src/cpu/pred/btb/decoupled_bpred.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "cpu/pred/btb/decoupled_bpred.hh"

#include <algorithm>
#include <array>

#include "base/debug_helper.hh"
Expand Down Expand Up @@ -27,6 +28,7 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p)
enableLoopBuffer(p.enableLoopBuffer),
enableLoopPredictor(p.enableLoopPredictor),
enableJumpAheadPredictor(p.enableJumpAheadPredictor),
enable2Taken(p.enable2Taken),
fetchTargetQueue(p.ftq_size),
fetchStreamQueueSize(p.fsq_size),
predictWidth(p.predictWidth),
Expand All @@ -45,7 +47,9 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p)
numStages(p.numStages),
historyManager(16), // TODO: fix this
resolveBlockThreshold(p.resolveBlockThreshold),
dbpBtbStats(this, p.numStages, p.fsq_size, maxInstsNum)
dbpBtbStats(this, p.numStages, p.fsq_size, maxInstsNum),
enable2Fetch(p.enable2Fetch),
maxFetchBytesPerCycle(p.maxFetchBytesPerCycle)
{
if (bpDBSwitches.size() > 0) {
initDB();
Expand Down Expand Up @@ -160,7 +164,7 @@ DecoupledBPUWithBTB::tick()
// Clear each predictor's output
for (int i = 0; i < numStages; i++) {
predsOfEachStage[i].btbEntries.clear();
}
}
}

if (bpuState == BpuState::PREDICTION_OUTSTANDING && numOverrideBubbles > 0) {
Expand Down Expand Up @@ -436,7 +440,20 @@ DecoupledBPUWithBTB::decoupledPredict(const StaticInstPtr &inst,
// Increment instruction counter for current FTQ entry
currentFtqEntryInstNum++;
if (run_out_of_this_entry) {
// Check if 2fetch is enabled, not fetched first FTQ yet, and if we can extend to the next FTQ
// NEW: 2Fetch extension check - before processing completion
dbpBtbStats.fetch2Attempts++;
if (enable2Fetch && !has1Fetched && canExtendToNextFTQ(pc, target_to_fetch)) {
DPRINTF(DecoupleBP, "2Fetch: extending to next FTQ in same cycle\n");
has1Fetched = true;
processFetchTargetCompletion(target_to_fetch);
extendToNextFTQ(pc);
// first fetchBlock is always taken, do not run out of FTQ now
return std::make_pair(true, false);
}

processFetchTargetCompletion(target_to_fetch);
has1Fetched = false; // reset 2fetch flag
}
Comment on lines +443 to 457
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

fetch2Attempts is incremented even when 2Fetch is disabled or already used.

The counter fetch2Attempts is incremented unconditionally at line 445, before checking enable2Fetch and !has1Fetched. This means:

  1. Attempts are counted even when enable2Fetch is false
  2. The second FTQ completion in a 2Fetch cycle (when has1Fetched=true) also counts as an attempt

Consider moving the increment inside the condition, or after the early-exit checks in canExtendToNextFTQ.

🐛 Suggested fix
     if (run_out_of_this_entry) {
         // Check if 2fetch is enabled, not fetched first FTQ yet, and if we can extend to the next FTQ
         // NEW: 2Fetch extension check - before processing completion
-        dbpBtbStats.fetch2Attempts++;
-        if (enable2Fetch && !has1Fetched && canExtendToNextFTQ(pc, target_to_fetch)) {
+        if (enable2Fetch && !has1Fetched) {
+            dbpBtbStats.fetch2Attempts++;
+            if (canExtendToNextFTQ(pc, target_to_fetch)) {
             DPRINTF(DecoupleBP, "2Fetch: extending to next FTQ in same cycle\n");
             has1Fetched = true;
             processFetchTargetCompletion(target_to_fetch);
             extendToNextFTQ(pc);
             // first fetchBlock is always taken, do not run out of FTQ now
             return std::make_pair(true, false);
+            }
         }

         processFetchTargetCompletion(target_to_fetch);


DPRINTF(DecoupleBP, "Predict it %staken to %#lx\n", taken ? "" : "not ",
Expand Down Expand Up @@ -1372,6 +1389,93 @@ DecoupledBPUWithBTB::recoverHistoryForSquash(
}


// NEW: 2Fetch support methods implementation

/**
* @brief Check if we can extend to next FTQ entry for 2fetch
*
* @param current_pc Current program counter
* @param current_ftq Current FTQ entry that is being completed
* @return true if 2fetch extension is possible
*/
bool
DecoupledBPUWithBTB::canExtendToNextFTQ(const PCStateBase &current_pc, const FtqEntry &current_ftq)
{
// Early exit if 2fetch is disabled
if (!enable2Fetch) {
return false;
}

if (!current_ftq.taken) {
DPRINTF(DecoupleBP, "2Fetch rejected: current FTQ is not taken\n");
dbpBtbStats.fetch2FirstNotTaken++;
return false;
}

// Check if next FTQ entry is available
if (!fetchTargetQueue.hasNext()) {
DPRINTF(DecoupleBP, "2Fetch rejected: no next FTQ entry available\n");
dbpBtbStats.fetch2NoNextFTQ++;
return false;
}

// Get next FTQ entry (without consuming it)
const auto &next_ftq = fetchTargetQueue.peekNext();
// current_ftq is passed as parameter

// Check if current PC is the jump target of the next FTQ start
if (current_pc.instAddr() != next_ftq.startPC) {
DPRINTF(DecoupleBP, "2Fetch rejected: PC %#x not at next FTQ start %#x\n",
current_pc.instAddr(), next_ftq.startPC);
dbpBtbStats.fetch2FirstNotAtStart++;
return false;
}

// Check if both FTQs fit in maxFetchBytesPerCycle window
Addr span = next_ftq.endPC - current_ftq.startPC;
if (span > maxFetchBytesPerCycle) {
DPRINTF(DecoupleBP, "2Fetch rejected: span %d exceeds %d bytes\n",
span, maxFetchBytesPerCycle);
dbpBtbStats.fetch2SpanTooLarge++;
Comment on lines +1435 to +1439

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Gate 2fetch extension by actual fetch-buffer span

The 2fetch gate only checks next_ftq.endPC - current_ftq.startPC <= maxFetchBytesPerCycle, but it doesn’t guarantee that the next FTQ entry actually lies within the current fetch buffer. If maxFetchBytesPerCycle is configured larger than fetchBufferSize (or the next entry crosses the buffer), extendToNextFTQ returns (true, false) and leaves usedUpFetchTargets false; checkMemoryNeeds() will then stall because the PC is outside the buffer, yet needNewFTQEntry() won’t issue a new cache request, so fetch can hang. Consider clamping the span check to the fetch-buffer size or invalidating the buffer when extension spans beyond it.

Useful? React with 👍 / 👎.

return false;
}

DPRINTF(DecoupleBP, "2Fetch enabled: extending to next FTQ [%#x, %#x), total span: %d bytes\n",
next_ftq.startPC, next_ftq.endPC, span);
return true;
}

/**
 * @brief Consume the next FTQ entry in the same cycle (2fetch extension).
 *
 * Advances the fetch target queue to the successor entry, resets the
 * per-entry instruction counter, redirects @p pc to the new entry's start
 * address, and records a successful 2fetch in the statistics. Must only be
 * called after canExtendToNextFTQ() returned true.
 *
 * @param pc Program counter state to redirect to the new FTQ entry
 */
void
DecoupledBPUWithBTB::extendToNextFTQ(PCStateBase &pc)
{
    // Move the FTQ supply pointer to the successor entry.
    fetchTargetQueue.advance();
    currentFtqEntryInstNum = 0; // Reset instruction counter for new FTQ

    // Get the new FTQ entry
    const auto &target_to_fetch = fetchTargetQueue.getTarget();

    DPRINTF(DecoupleBP, "Processing extended FTQ entry: [%#x, %#x)\n",
            target_to_fetch.startPC, target_to_fetch.endPC);

    // Redirect PC to the start of the new FTQ entry.
    // NOTE(review): npc = startPC + 4 assumes a 4-byte first instruction;
    // confirm this is correct when compressed (e.g. RVC) instructions are
    // possible at the block start.
    auto &rpc = pc.as<GenericISA::PCStateWithNext>();
    rpc.pc(target_to_fetch.startPC);
    rpc.npc(target_to_fetch.startPC + 4);
    rpc.uReset();

    // Record successful 2fetch
    dbpBtbStats.fetch2Successes++;
}

} // namespace btb_pred

} // namespace branch_prediction
Expand Down
41 changes: 41 additions & 0 deletions src/cpu/pred/btb/decoupled_bpred.hh
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,9 @@ class DecoupledBPUWithBTB : public BPredUnit
JumpAheadPredictor jap;
bool enableJumpAheadPredictor{false};

// 2taken feature support
bool enable2Taken{true}; // Default enabled

private:
std::string _name;

Expand Down Expand Up @@ -377,6 +380,14 @@ class DecoupledBPUWithBTB : public BPredUnit
statistics::Scalar s3PredWrongIttage;
statistics::Scalar s3PredWrongRas;

// NEW: 2Fetch statistics
statistics::Scalar fetch2Attempts; ///< Number of 2fetch attempts
statistics::Scalar fetch2Successes; ///< Number of successful 2fetch cycles
statistics::Scalar fetch2SpanTooLarge; ///< Rejected due to span > maxFetchBytes
statistics::Scalar fetch2NoNextFTQ; ///< Rejected due to no next FTQ entry
statistics::Scalar fetch2FirstNotTaken; ///< Rejected due to current FTQ is not taken
statistics::Scalar fetch2FirstNotAtStart; ///< Rejected due to current PC is not at next FTQ start

DBPBTBStats(statistics::Group* parent, unsigned numStages, unsigned fsqSize, unsigned maxInstsNum);
} dbpBtbStats;

Expand Down Expand Up @@ -999,6 +1010,36 @@ class DecoupledBPUWithBTB : public BPredUnit
*/
int currentFtqEntryInstNum{0};

// NEW: 2Fetch support variables
/**
* @brief Enable 2fetch capability
*/
bool enable2Fetch{true};

/**
* @brief Whether fetched first FTQ
*/
bool has1Fetched{false};

/**
* @brief Maximum fetch bytes per cycle for 2fetch
*/
unsigned maxFetchBytesPerCycle{64};

// NEW: 2Fetch support methods
/**
* @brief Check if we can extend to next FTQ entry for 2fetch
*
* @param current_pc Current program counter
* @param current_ftq Current FTQ entry that is being completed
* @return true if extension to next FTQ is possible
*/
bool canExtendToNextFTQ(const PCStateBase &current_pc, const FtqEntry &current_ftq);

// Extend processing to next FTQ entry for 2fetch

void extendToNextFTQ(PCStateBase &pc);

/**
* @brief Dump statistics on program exit
*
Expand Down
9 changes: 7 additions & 2 deletions src/cpu/pred/btb/decoupled_bpred_stats.cc
Original file line number Diff line number Diff line change
Expand Up @@ -505,8 +505,13 @@ DecoupledBPUWithBTB::DBPBTBStats::DBPBTBStats(
ADD_STAT(s3PredWrongMbtb, statistics::units::Count::get(), "S3pred wrong blame mbtb "),
ADD_STAT(s3PredWrongTage, statistics::units::Count::get(), "S3pred wrong blame tage "),
ADD_STAT(s3PredWrongIttage, statistics::units::Count::get(), "S3pred wrong blame ittage "),
ADD_STAT(s3PredWrongRas, statistics::units::Count::get(), "S3pred wrong blame ras ")

ADD_STAT(s3PredWrongRas, statistics::units::Count::get(), "S3pred wrong blame ras "),
ADD_STAT(fetch2Attempts, statistics::units::Count::get(), "Number of 2fetch attempts"),
ADD_STAT(fetch2Successes, statistics::units::Count::get(), "Number of successful 2fetch cycles"),
ADD_STAT(fetch2SpanTooLarge, statistics::units::Count::get(), "Rejected due to span > maxFetchBytes"),
ADD_STAT(fetch2NoNextFTQ, statistics::units::Count::get(), "Rejected due to no next FTQ entry"),
ADD_STAT(fetch2FirstNotTaken, statistics::units::Count::get(), "Rejected due to current FTQ is not taken"),
ADD_STAT(fetch2FirstNotAtStart, statistics::units::Count::get(), "Rejected due to PC is != next FTQ start")
{
predsOfEachStage.init(numStages);
commitPredsFromEachStage.init(numStages+1);
Expand Down
54 changes: 54 additions & 0 deletions src/cpu/pred/btb/fetch_target_queue.cc
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,60 @@ FetchTargetQueue::resetPC(Addr new_pc)
fetchTargetEnqState.pc = new_pc;
}

// NEW: 2Fetch support methods implementation

/**
 * @brief Query whether a successor fetch target is already enqueued.
 *
 * Successor entries are keyed by the current demand-target ID plus one.
 *
 * @return true if the successor FTQ entry exists
 */
bool
FetchTargetQueue::hasNext() const
{
    // Membership test on the ID that immediately follows the demand target.
    return ftq.count(fetchDemandTargetId + 1) != 0;
}

/**
 * @brief Inspect the successor FTQ entry without dequeuing it.
 *
 * Callers must establish availability via hasNext() first; the entry
 * remains in the queue after this call.
 *
 * @return Const reference to the successor FTQ entry
 */
const FtqEntry&
FetchTargetQueue::peekNext() const
{
    assert(hasNext());
    const auto successor_it = ftq.find(fetchDemandTargetId + 1);
    return successor_it->second;
}

/**
 * @brief Refresh the supply state to the entry at the demand-target ID.
 *
 * Used for 2fetch: the demand-target ID has already been bumped by
 * processFetchTargetCompletion(), so this re-points the supply state at
 * that entry without dequeuing anything. If no such entry exists, the
 * supply state is invalidated.
 */
void
FetchTargetQueue::advance()
{
    // Demand-target ID was advanced in processFetchTargetCompletion;
    // look up the entry it now designates.
    const auto demand_it = ftq.find(fetchDemandTargetId);
    if (demand_it == ftq.end()) {
        // No entry enqueued for the new ID: mark supply as dry.
        supplyFetchTargetState.valid = false;
        supplyFetchTargetState.entry = nullptr;
        return;
    }

    supplyFetchTargetState.valid = true;
    supplyFetchTargetState.targetId = fetchDemandTargetId;
    supplyFetchTargetState.entry = &demand_it->second;

    DPRINTF(DecoupleBP,
            "Advanced to next FTQ entry: ID %lu, PC [%#lx, %#lx)\n",
            fetchDemandTargetId, demand_it->second.startPC, demand_it->second.endPC);
}

} // namespace btb_pred

} // namespace branch_prediction
Expand Down
Loading