diff --git a/configs/example/idealkmhv3.py b/configs/example/idealkmhv3.py index aebedb4f85..20fc93243c 100644 --- a/configs/example/idealkmhv3.py +++ b/configs/example/idealkmhv3.py @@ -78,8 +78,9 @@ def setKmhV3IdealParams(args, system): # branch predictor if args.bp_type == 'DecoupledBPUWithBTB': - cpu.branchPred.ftq_size = 256 - cpu.branchPred.fsq_size = 256 + cpu.branchPred.ftq_size = 64 + cpu.branchPred.fsq_size = 64 + cpu.branchPred.enable2Fetch = True # l1 cache per core if args.caches: diff --git a/configs/example/kmhv3.py b/configs/example/kmhv3.py index 71844d9478..ea7809093d 100644 --- a/configs/example/kmhv3.py +++ b/configs/example/kmhv3.py @@ -92,8 +92,8 @@ def setKmhV3Params(args, system): # branch predictor if args.bp_type == 'DecoupledBPUWithBTB': - cpu.branchPred.ftq_size = 256 - cpu.branchPred.fsq_size = 256 + cpu.branchPred.ftq_size = 64 + cpu.branchPred.fsq_size = 64 cpu.branchPred.mbtb.resolvedUpdate = True cpu.branchPred.tage.resolvedUpdate = True @@ -107,6 +107,7 @@ def setKmhV3Params(args, system): cpu.branchPred.ittage.enabled = True cpu.branchPred.mgsc.enabled = False cpu.branchPred.ras.enabled = True + cpu.branchPred.enable2Fetch = True # l1 cache per core if args.caches: diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc index a5079227a0..87eaee3231 100644 --- a/src/cpu/o3/fetch.cc +++ b/src/cpu/o3/fetch.cc @@ -813,13 +813,34 @@ Fetch::lookupAndUpdateNextPC(const DynInstPtr &inst, PCStateBase &next_pc) run_out = fall_thru >= stream.predEndPC; } + bool do_2fetch = false; + // Track how many dynamic instructions were fetched for this (legacy) FTQ/FSQ entry. ftqEntryFetchedInsts[tid]++; if (run_out) { + if (predict_taken && dbpbtb->is2FetchEnabled() && dbpbtb->ftqHasNext()) { + const Addr target_pc = stream.predBranchInfo.target; + const auto &next_stream = dbpbtb->ftqNext(); + const Addr span = next_stream.predEndPC - stream.startPC; + const unsigned max_bytes = dbpbtb->getMaxFetchBytesPerCycle(); + const bool target_in_buffer = + target_pc >= fetchBuffer[tid].startPC && target_pc + 4 <= fetchBuffer[tid].startPC + fetchBufferSize; + + if (target_pc == next_stream.startPC && span <= max_bytes && target_in_buffer) { + do_2fetch = true; + DPRINTF(DecoupleBP, + "2Fetch: extend in-cycle to next FSQ entry (cur [%#lx, %#lx), next [%#lx, %#lx), span=%lu, " + "max=%u)\n", + stream.startPC, stream.predEndPC, next_stream.startPC, next_stream.predEndPC, span, max_bytes); + } + } + dbpbtb->consumeFetchTarget(ftqEntryFetchedInsts[tid]); ftqEntryFetchedInsts[tid] = 0; - fetchBuffer[tid].valid = false; - DPRINTF(DecoupleBP, "Used up fetch targets.\n"); + if (!do_2fetch) { + fetchBuffer[tid].valid = false; + DPRINTF(DecoupleBP, "Used up fetch targets.\n"); + } } inst->setLoopIteration(currentLoopIter); @@ -845,7 +866,7 @@ Fetch::lookupAndUpdateNextPC(const DynInstPtr &inst, PCStateBase &next_pc) ++fetchStats.predictedBranches; } - return predict_taken; + return predict_taken && !do_2fetch; } bool @@ -1857,6 +1878,8 @@ Fetch::checkMemoryNeeds(ThreadID tid, const PCStateBase &this_pc, fetch_pc + 4 > fetchBuffer[tid].startPC + fetchBufferSize) { DPRINTF(Fetch, "[tid:%i] PC %#x outside fetch buffer range [%#x, %#x), stalling on ICache\n", tid, fetch_pc, fetchBuffer[tid].startPC, fetchBuffer[tid].startPC + fetchBufferSize); + // Force issuing a new I-cache request. + fetchBuffer[tid].valid = false; return StallReason::IcacheStall; } @@ -1879,7 +1902,7 @@ Fetch::processSingleInstruction(ThreadID tid, PCStateBase &pc, StaticInstPtr &curMacroop) { auto *dec_ptr = decoder[tid]; - bool predictedBranch = false; + bool stopFetchThisCycle = false; bool newMacroop = false; // Create a copy of the current PC state to calculate the next PC. @@ -1936,16 +1959,17 @@ Fetch::processSingleInstruction(ThreadID tid, PCStateBase &pc, set(next_pc, pc); // Handle branch prediction and update next_pc for both modes - predictedBranch = lookupAndUpdateNextPC(instruction, *next_pc); + stopFetchThisCycle = lookupAndUpdateNextPC(instruction, *next_pc); + const bool predictedTaken = instruction->readPredTaken(); - if (predictedBranch) { + if (predictedTaken) { DPRINTF(Fetch, "[tid:%i] Branch detected with PC = %s, target = %s\n", instruction->threadNumber, pc, *next_pc); } if (isTraceMode()) { assert(traceFetch); - traceFetch->postBranchPredict(tid, instruction, traceForThisInst, pc, *next_pc, predictedBranch); + traceFetch->postBranchPredict(tid, instruction, traceForThisInst, pc, *next_pc, predictedTaken); } // A new macro-op also begins if the PC changes discontinuously. @@ -1959,7 +1983,7 @@ Fetch::processSingleInstruction(ThreadID tid, PCStateBase &pc, // Update the main PC state for the next instruction. set(pc, *next_pc); - return predictedBranch; + return stopFetchThisCycle; } void @@ -1977,7 +2001,7 @@ Fetch::performInstructionFetch(ThreadID tid) StaticInstPtr &curMacroop = macroop[tid]; // Control flags for main fetch loop - bool predictedBranch = false; + bool stopFetchThisCycle = false; DPRINTF(Fetch, "[tid:%i] Adding instructions to queue to decode.\n", tid); @@ -1985,7 +2009,7 @@ Fetch::performInstructionFetch(ThreadID tid) // For decoupled frontend (including trace mode), check FTQ availability StallReason stall = StallReason::NoStall; while (numInst < fetchWidth && fetchQueue[tid].size() < fetchQueueSize && - !predictedBranch && !ftqEmpty() && !waitForVsetvl) { + !stopFetchThisCycle && !ftqEmpty() && !waitForVsetvl) { // Check memory needs and supply bytes to decoder if required stall = checkMemoryNeeds(tid, pc_state, curMacroop); @@ -1998,7 +2022,7 @@ Fetch::performInstructionFetch(ThreadID tid) // into multiple micro-ops. do { // Process a single instruction, from decoding to PC update. - predictedBranch = processSingleInstruction(tid, pc_state, curMacroop); + stopFetchThisCycle = processSingleInstruction(tid, pc_state, curMacroop); } while (curMacroop && numInst < fetchWidth && @@ -2017,7 +2041,7 @@ Fetch::performInstructionFetch(ThreadID tid) } // Log why fetch stopped - if (predictedBranch) { + if (stopFetchThisCycle) { DPRINTF(Fetch, "[tid:%i] Done fetching, predicted branch instruction encountered.\n", tid); } else if (numInst >= fetchWidth) { DPRINTF(Fetch, "[tid:%i] Done fetching, reached fetch bandwidth for this cycle.\n", tid); diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index 65ae953797..96565f9963 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -354,7 +354,8 @@ class Fetch * Looks up the branch predictor, gets a prediction, and updates the PC. * @param inst The dynamic instruction object. * @param next_pc The PC state to update with the prediction. - * @return true if a branch was predicted taken. + * @return true if fetch should stop this cycle due to a predicted-taken + * branch (2Fetch may override and return false). */ bool lookupAndUpdateNextPC(const DynInstPtr &inst, PCStateBase &next_pc); @@ -564,7 +565,7 @@ class Fetch * @param tid The thread ID of the instruction. * @param pc The current program counter state (will be updated). * @param curMacroop The current macro-op being processed (if any). - * @return true if a branch was predicted. + * @return true if fetch should stop this cycle. */ bool processSingleInstruction(ThreadID tid, PCStateBase &pc, diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py index a5132f48af..77860e1dac 100644 --- a/src/cpu/pred/BranchPredictor.py +++ b/src/cpu/pred/BranchPredictor.py @@ -1178,3 +1178,6 @@ class DecoupledBPUWithBTB(BranchPredictor): bpDBSwitches = VectorParam.String([], "Enable which traces in the form of database") resolveBlockThreshold = Param.Unsigned(8, "Consecutive resolve dequeue failures before blocking prediction once") + + enable2Fetch = Param.Bool(False, "Enable 2fetch feature") + maxFetchBytesPerCycle = Param.Unsigned(64, "Maximum fetch bytes per cycle for 2fetch") diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc index 8869dec427..e3a244be33 100644 --- a/src/cpu/pred/btb/decoupled_bpred.cc +++ b/src/cpu/pred/btb/decoupled_bpred.cc @@ -48,6 +48,8 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p) numStages(p.numStages), historyManager(16), // TODO: fix this resolveBlockThreshold(p.resolveBlockThreshold), + enable2Fetch(p.enable2Fetch), + maxFetchBytesPerCycle(p.maxFetchBytesPerCycle), dbpBtbStats(this, p.numStages, p.fsq_size, maxInstsNum) { if (bpDBSwitches.size() > 0) { @@ -132,44 +134,51 @@ DecoupledBPUWithBTB::tick() return; } - // 1. Request new prediction if FSQ not full and we are idle - if (bpuState == BpuState::IDLE && !targetQueueFull()) { - if (blockPredictionPending) { - DPRINTF(Override, "Prediction blocked to prioritize resolve update\n"); - dbpBtbStats.predictionBlockedForUpdate++; - blockPredictionPending = false; - } else { - requestNewPrediction(); - bpuState = BpuState::PREDICTOR_DONE; + int predsRemainsToBeMade = enableTwoTaken ? 2 : 1; + unsigned tempNumOverrideBubbles = 0; + + while (predsRemainsToBeMade > 0) { + // 1. Request new prediction if FSQ not full and we are idle + if (bpuState == BpuState::IDLE && !targetQueueFull()) { + if (blockPredictionPending) { + DPRINTF(Override, "Prediction blocked to prioritize resolve update\n"); + dbpBtbStats.predictionBlockedForUpdate++; + blockPredictionPending = false; + } else { + requestNewPrediction(); + bpuState = BpuState::PREDICTOR_DONE; + } } - } - // 2. Handle pending prediction if available - if (bpuState == BpuState::PREDICTOR_DONE) { - DPRINTF(Override, "Generating final prediction for PC %#lx\n", s0PC); - numOverrideBubbles = generateFinalPredAndCreateBubbles(); - bpuState = BpuState::PREDICTION_OUTSTANDING; + // 2. Handle pending prediction if available + if (bpuState == BpuState::PREDICTOR_DONE) { + DPRINTF(Override, "Generating final prediction for PC %#lx\n", s0PC); + numOverrideBubbles = generateFinalPredAndCreateBubbles(); + bpuState = BpuState::PREDICTION_OUTSTANDING; - // Clear each predictor's output - for (int i = 0; i < numStages; i++) { - predsOfEachStage[i].btbEntries.clear(); + // Clear each predictor's output + for (int i = 0; i < numStages; i++) { + predsOfEachStage[i].btbEntries.clear(); + } } - } - if (bpuState == BpuState::PREDICTION_OUTSTANDING && numOverrideBubbles > 0) { - tage->dryRunCycle(s0PC); - } + if (bpuState == BpuState::PREDICTION_OUTSTANDING && numOverrideBubbles > 0) { + tage->dryRunCycle(s0PC); + } - // check if: - // 1. FSQ has space - // 2. there's no bubble - // 3. PREDICTION_OUTSTANDING - if (validateFSQEnqueue()) { - // Create new FSQ entry with the current prediction - processNewPrediction(); + // check if: + // 1. FSQ has space + // 2. there's no bubble + // 3. PREDICTION_OUTSTANDING + if (validateFSQEnqueue()) { + // Create new FSQ entry with the current prediction + processNewPrediction(); - DPRINTF(Override, "FSQ entry enqueued, prediction state reset\n"); - bpuState = BpuState::IDLE; + DPRINTF(Override, "FSQ entry enqueued, prediction state reset\n"); + bpuState = BpuState::IDLE; + } + + predsRemainsToBeMade--; } // Decrement override bubbles counter diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh index 576b0fce39..5265ec138c 100644 --- a/src/cpu/pred/btb/decoupled_bpred.hh +++ b/src/cpu/pred/btb/decoupled_bpred.hh @@ -143,8 +143,13 @@ class DecoupledBPUWithBTB : public BPredUnit unsigned resolveDequeueFailCounter{0}; const unsigned resolveBlockThreshold; + const bool enable2Fetch; + const unsigned maxFetchBytesPerCycle; + unsigned numOverrideBubbles{0}; + bool enableTwoTaken{true}; + bool validateFSQEnqueue(); void processNewPrediction(); @@ -211,6 +216,13 @@ class DecoupledBPUWithBTB : public BPredUnit return fetchTargetQueue[id - fetchTargetBaseId]; } + const FetchTarget& + getTarget(FetchTargetId id) const + { + assert(hasTarget(id)); + return fetchTargetQueue[id - fetchTargetBaseId]; + } + FetchTargetId frontTargetId() const { @@ -453,6 +465,12 @@ class DecoupledBPUWithBTB : public BPredUnit FetchTargetId ftqHeadId() const { assert(ftqHasHead()); return fetchHeadFtqId; } const FetchTarget &ftqHead() { assert(ftqHasHead()); return getTarget(fetchHeadFtqId); } + bool ftqHasNext() const { return hasTarget(fetchHeadFtqId + 1); } + const FetchTarget &ftqNext() const { assert(ftqHasNext()); return getTarget(fetchHeadFtqId + 1); } + + bool is2FetchEnabled() const { return enable2Fetch; } + unsigned getMaxFetchBytesPerCycle() const { return maxFetchBytesPerCycle; } + void dumpFsq(const char *when); // Dummy overriding