diff --git a/configs/example/idealkmhv3.py b/configs/example/idealkmhv3.py
index aebedb4f85..20fc93243c 100644
--- a/configs/example/idealkmhv3.py
+++ b/configs/example/idealkmhv3.py
@@ -78,8 +78,9 @@ def setKmhV3IdealParams(args, system):
 
         # branch predictor
         if args.bp_type == 'DecoupledBPUWithBTB':
-            cpu.branchPred.ftq_size = 256
-            cpu.branchPred.fsq_size = 256
+            cpu.branchPred.ftq_size = 64
+            cpu.branchPred.fsq_size = 64
+            cpu.branchPred.enable2Fetch = True  # opt in to 2fetch; the parameter defaults to False
 
         # l1 cache per core
         if args.caches:
diff --git a/configs/example/kmhv3.py b/configs/example/kmhv3.py
index 71844d9478..ea7809093d 100644
--- a/configs/example/kmhv3.py
+++ b/configs/example/kmhv3.py
@@ -92,8 +92,8 @@ def setKmhV3Params(args, system):
 
         # branch predictor
         if args.bp_type == 'DecoupledBPUWithBTB':
-            cpu.branchPred.ftq_size = 256
-            cpu.branchPred.fsq_size = 256
+            cpu.branchPred.ftq_size = 64
+            cpu.branchPred.fsq_size = 64
 
             cpu.branchPred.mbtb.resolvedUpdate = True
             cpu.branchPred.tage.resolvedUpdate = True
@@ -107,6 +107,7 @@ def setKmhV3Params(args, system):
             cpu.branchPred.ittage.enabled = True
             cpu.branchPred.mgsc.enabled = False
             cpu.branchPred.ras.enabled = True
+            cpu.branchPred.enable2Fetch = True  # opt in to 2fetch; the parameter defaults to False
 
         # l1 cache per core
         if args.caches:
diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc
index a5079227a0..87eaee3231 100644
--- a/src/cpu/o3/fetch.cc
+++ b/src/cpu/o3/fetch.cc
@@ -813,13 +813,34 @@ Fetch::lookupAndUpdateNextPC(const DynInstPtr &inst, PCStateBase &next_pc)
         run_out = fall_thru >= stream.predEndPC;
     }
 
+    bool do_2fetch = false;  // set when fetch may continue into the next FTQ entry within this cycle
+
     // Track how many dynamic instructions were fetched for this (legacy) FTQ/FSQ entry.
     ftqEntryFetchedInsts[tid]++;
     if (run_out) {
+        if (predict_taken && dbpbtb->is2FetchEnabled() && dbpbtb->ftqHasNext()) {
+            const Addr target_pc = stream.predBranchInfo.target;
+            const auto &next_stream = dbpbtb->ftqNext();
+            const Addr span = next_stream.predEndPC - stream.startPC;  // bytes from current entry start to next entry end
+            const unsigned max_bytes = dbpbtb->getMaxFetchBytesPerCycle();
+            const bool target_in_buffer =
+                target_pc >= fetchBuffer[tid].startPC && target_pc + 4 <= fetchBuffer[tid].startPC + fetchBufferSize;
+
+            if (target_pc == next_stream.startPC && span <= max_bytes && target_in_buffer) {
+                do_2fetch = true;  // keeps fetchBuffer valid below and suppresses the stop-on-taken return value
+                DPRINTF(DecoupleBP,
+                        "2Fetch: extend in-cycle to next FSQ entry (cur [%#lx, %#lx), next [%#lx, %#lx), span=%lu, "
+                        "max=%u)\n",
+                        stream.startPC, stream.predEndPC, next_stream.startPC, next_stream.predEndPC, span, max_bytes);
+            }
+        }
+
         dbpbtb->consumeFetchTarget(ftqEntryFetchedInsts[tid]);
         ftqEntryFetchedInsts[tid] = 0;
-        fetchBuffer[tid].valid = false;
-        DPRINTF(DecoupleBP, "Used up fetch targets.\n");
+        if (!do_2fetch) {  // without 2Fetch the buffer is invalidated once the entry is used up
+            fetchBuffer[tid].valid = false;
+            DPRINTF(DecoupleBP, "Used up fetch targets.\n");
+        }
     }
 
     inst->setLoopIteration(currentLoopIter);
@@ -845,7 +866,7 @@ Fetch::lookupAndUpdateNextPC(const DynInstPtr &inst, PCStateBase &next_pc)
         ++fetchStats.predictedBranches;
     }
 
-    return predict_taken;
+    return predict_taken && !do_2fetch;  // a taken branch extended by 2Fetch does not stop fetch this cycle
 }
 
 bool
@@ -1857,6 +1878,8 @@ Fetch::checkMemoryNeeds(ThreadID tid, const PCStateBase &this_pc,
         fetch_pc + 4 > fetchBuffer[tid].startPC + fetchBufferSize) {
         DPRINTF(Fetch, "[tid:%i] PC %#x outside fetch buffer range [%#x, %#x), stalling on ICache\n",
                 tid, fetch_pc, fetchBuffer[tid].startPC, fetchBuffer[tid].startPC + fetchBufferSize);
+        // PC lies outside the buffered range: invalidate the buffer to force issuing a new I-cache request.
+        fetchBuffer[tid].valid = false;
         return StallReason::IcacheStall;
     }
 
@@ -1879,7 +1902,7 @@ Fetch::processSingleInstruction(ThreadID tid, PCStateBase &pc,
                                StaticInstPtr &curMacroop)
 {
     auto *dec_ptr = decoder[tid];
-    bool predictedBranch = false;
+    bool stopFetchThisCycle = false;
     bool newMacroop = false;
 
     // Create a copy of the current PC state to calculate the next PC.
@@ -1936,16 +1959,17 @@ Fetch::processSingleInstruction(ThreadID tid, PCStateBase &pc,
     set(next_pc, pc);
 
     // Handle branch prediction and update next_pc for both modes
-    predictedBranch = lookupAndUpdateNextPC(instruction, *next_pc);
+    stopFetchThisCycle = lookupAndUpdateNextPC(instruction, *next_pc);  // false if 2Fetch continues past a taken branch
+    const bool predictedTaken = instruction->readPredTaken();  // the prediction itself; used for logging and trace mode
 
-    if (predictedBranch) {
+    if (predictedTaken) {
         DPRINTF(Fetch, "[tid:%i] Branch detected with PC = %s, target = %s\n",
                 instruction->threadNumber, pc, *next_pc);
     }
 
     if (isTraceMode()) {
         assert(traceFetch);
-        traceFetch->postBranchPredict(tid, instruction, traceForThisInst, pc, *next_pc, predictedBranch);
+        traceFetch->postBranchPredict(tid, instruction, traceForThisInst, pc, *next_pc, predictedTaken);
     }
 
     // A new macro-op also begins if the PC changes discontinuously.
@@ -1959,7 +1983,7 @@ Fetch::processSingleInstruction(ThreadID tid, PCStateBase &pc,
     // Update the main PC state for the next instruction.
     set(pc, *next_pc);
 
-    return predictedBranch;
+    return stopFetchThisCycle;
 }
 
 void
@@ -1977,7 +2001,7 @@ Fetch::performInstructionFetch(ThreadID tid)
     StaticInstPtr &curMacroop = macroop[tid];
 
     // Control flags for main fetch loop
-    bool predictedBranch = false;
+    bool stopFetchThisCycle = false;
 
     DPRINTF(Fetch, "[tid:%i] Adding instructions to queue to decode.\n", tid);
 
@@ -1985,7 +2009,7 @@ Fetch::performInstructionFetch(ThreadID tid)
     // For decoupled frontend (including trace mode), check FTQ availability
     StallReason stall = StallReason::NoStall;
     while (numInst < fetchWidth && fetchQueue[tid].size() < fetchQueueSize &&
-           !predictedBranch && !ftqEmpty() && !waitForVsetvl) {
+           !stopFetchThisCycle && !ftqEmpty() && !waitForVsetvl) {
 
         // Check memory needs and supply bytes to decoder if required
         stall = checkMemoryNeeds(tid, pc_state, curMacroop);
@@ -1998,7 +2022,7 @@ Fetch::performInstructionFetch(ThreadID tid)
         // into multiple micro-ops.
         do {
             // Process a single instruction, from decoding to PC update.
-            predictedBranch = processSingleInstruction(tid, pc_state, curMacroop);
+            stopFetchThisCycle = processSingleInstruction(tid, pc_state, curMacroop);
 
         } while (curMacroop &&
                  numInst < fetchWidth &&
@@ -2017,7 +2041,7 @@ Fetch::performInstructionFetch(ThreadID tid)
     }
 
     // Log why fetch stopped
-    if (predictedBranch) {
+    if (stopFetchThisCycle) {
         DPRINTF(Fetch, "[tid:%i] Done fetching, predicted branch instruction encountered.\n", tid);
     } else if (numInst >= fetchWidth) {
         DPRINTF(Fetch, "[tid:%i] Done fetching, reached fetch bandwidth for this cycle.\n", tid);
diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh
index 65ae953797..96565f9963 100644
--- a/src/cpu/o3/fetch.hh
+++ b/src/cpu/o3/fetch.hh
@@ -354,7 +354,8 @@ class Fetch
      * Looks up the branch predictor, gets a prediction, and updates the PC.
      * @param inst The dynamic instruction object.
      * @param next_pc The PC state to update with the prediction.
-     * @return true if a branch was predicted taken.
+     * @return true if fetch should stop this cycle due to a predicted-taken
+     * branch (2Fetch may override and return false).
      */
     bool lookupAndUpdateNextPC(const DynInstPtr &inst, PCStateBase &next_pc);
 
@@ -564,7 +565,7 @@ class Fetch
      * @param tid The thread ID of the instruction.
      * @param pc The current program counter state (will be updated).
      * @param curMacroop The current macro-op being processed (if any).
-     * @return true if a branch was predicted.
+     * @return true if fetch should stop this cycle.
      */
     bool
     processSingleInstruction(ThreadID tid, PCStateBase &pc,
diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py
index a5132f48af..77860e1dac 100644
--- a/src/cpu/pred/BranchPredictor.py
+++ b/src/cpu/pred/BranchPredictor.py
@@ -1178,3 +1178,6 @@ class DecoupledBPUWithBTB(BranchPredictor):
 
     bpDBSwitches = VectorParam.String([], "Enable which traces in the form of database")
     resolveBlockThreshold = Param.Unsigned(8, "Consecutive resolve dequeue failures before blocking prediction once")
+
+    enable2Fetch = Param.Bool(False, "Enable 2fetch feature")  # off by default; configs opt in explicitly
+    maxFetchBytesPerCycle = Param.Unsigned(64, "Maximum fetch bytes per cycle for 2fetch")  # bounds the span of two merged FSQ entries
diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc
index 8869dec427..e3a244be33 100644
--- a/src/cpu/pred/btb/decoupled_bpred.cc
+++ b/src/cpu/pred/btb/decoupled_bpred.cc
@@ -48,6 +48,8 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p)
       numStages(p.numStages),
       historyManager(16), // TODO: fix this
       resolveBlockThreshold(p.resolveBlockThreshold),
+      enable2Fetch(p.enable2Fetch),
+      maxFetchBytesPerCycle(p.maxFetchBytesPerCycle),
       dbpBtbStats(this, p.numStages, p.fsq_size, maxInstsNum)
 {
     if (bpDBSwitches.size() > 0) {
@@ -132,44 +134,56 @@ DecoupledBPUWithBTB::tick()
         return;
     }
 
-    // 1. Request new prediction if FSQ not full and we are idle
-    if (bpuState == BpuState::IDLE && !targetQueueFull()) {
-        if (blockPredictionPending) {
-            DPRINTF(Override, "Prediction blocked to prioritize resolve update\n");
-            dbpBtbStats.predictionBlockedForUpdate++;
-            blockPredictionPending = false;
-        } else {
-            requestNewPrediction();
-            bpuState = BpuState::PREDICTOR_DONE;
+    // With two-taken enabled, up to two predictions may be generated and
+    // enqueued within a single tick; otherwise at most one.
+    int predsRemainsToBeMade = enableTwoTaken ? 2 : 1;
+
+    while (predsRemainsToBeMade > 0) {
+        // 1. Request new prediction if FSQ not full and we are idle
+        if (bpuState == BpuState::IDLE && !targetQueueFull()) {
+            if (blockPredictionPending) {
+                DPRINTF(Override, "Prediction blocked to prioritize resolve update\n");
+                dbpBtbStats.predictionBlockedForUpdate++;
+                blockPredictionPending = false;
+            } else {
+                requestNewPrediction();
+                bpuState = BpuState::PREDICTOR_DONE;
+            }
         }
-    }
 
-    // 2. Handle pending prediction if available
-    if (bpuState == BpuState::PREDICTOR_DONE) {
-        DPRINTF(Override, "Generating final prediction for PC %#lx\n", s0PC);
-        numOverrideBubbles = generateFinalPredAndCreateBubbles();
-        bpuState = BpuState::PREDICTION_OUTSTANDING;
+        // 2. Handle pending prediction if available
+        if (bpuState == BpuState::PREDICTOR_DONE) {
+            DPRINTF(Override, "Generating final prediction for PC %#lx\n", s0PC);
+            numOverrideBubbles = generateFinalPredAndCreateBubbles();
+            bpuState = BpuState::PREDICTION_OUTSTANDING;
 
-        // Clear each predictor's output
-        for (int i = 0; i < numStages; i++) {
-            predsOfEachStage[i].btbEntries.clear();
+            // Clear each predictor's output
+            for (int i = 0; i < numStages; i++) {
+                predsOfEachStage[i].btbEntries.clear();
+            }
         }
-    }
 
-    if (bpuState == BpuState::PREDICTION_OUTSTANDING && numOverrideBubbles > 0) {
-        tage->dryRunCycle(s0PC);
-    }
+        if (bpuState == BpuState::PREDICTION_OUTSTANDING && numOverrideBubbles > 0) {
+            tage->dryRunCycle(s0PC);
+        }
 
-    // check if:
-    // 1. FSQ has space
-    // 2. there's no bubble
-    // 3. PREDICTION_OUTSTANDING
-    if (validateFSQEnqueue()) {
-        // Create new FSQ entry with the current prediction
-        processNewPrediction();
+        // check if:
+        // 1. FSQ has space
+        // 2. there's no bubble
+        // 3. PREDICTION_OUTSTANDING
+        if (validateFSQEnqueue()) {
+            // Create new FSQ entry with the current prediction
+            processNewPrediction();
 
-        DPRINTF(Override, "FSQ entry enqueued, prediction state reset\n");
-        bpuState = BpuState::IDLE;
+            DPRINTF(Override, "FSQ entry enqueued, prediction state reset\n");
+            bpuState = BpuState::IDLE;
+        } else {
+            // Nothing was enqueued in this pass (queue full, override bubbles pending, or prediction
+            // blocked). A second pass would repeat dryRunCycle() or undo the one-tick prediction block.
+            break;
+        }
+
+        predsRemainsToBeMade--;
     }
 
     // Decrement override bubbles counter
diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh
index 576b0fce39..5265ec138c 100644
--- a/src/cpu/pred/btb/decoupled_bpred.hh
+++ b/src/cpu/pred/btb/decoupled_bpred.hh
@@ -143,8 +143,13 @@ class DecoupledBPUWithBTB : public BPredUnit
     unsigned resolveDequeueFailCounter{0};
     const unsigned resolveBlockThreshold;
 
+    const bool enable2Fetch;  // copied from the enable2Fetch parameter; exposed via is2FetchEnabled()
+    const unsigned maxFetchBytesPerCycle;  // copied from the maxFetchBytesPerCycle parameter
+
     unsigned numOverrideBubbles{0};
 
+    bool enableTwoTaken{true};  // NOTE(review): initialized to true with no visible config parameter; tick() then allows up to two predictions per call - confirm intended
+
     bool validateFSQEnqueue();
 
     void processNewPrediction();
@@ -211,6 +216,13 @@ class DecoupledBPUWithBTB : public BPredUnit
         return fetchTargetQueue[id - fetchTargetBaseId];
     }
 
+    const FetchTarget&
+    getTarget(FetchTargetId id) const  // read-only overload; asserts the id is present, then indexes relative to fetchTargetBaseId
+    {
+        assert(hasTarget(id));
+        return fetchTargetQueue[id - fetchTargetBaseId];
+    }
+
     FetchTargetId
     frontTargetId() const
     {
@@ -453,6 +465,12 @@ class DecoupledBPUWithBTB : public BPredUnit
     FetchTargetId ftqHeadId() const { assert(ftqHasHead()); return fetchHeadFtqId; }
     const FetchTarget &ftqHead() { assert(ftqHasHead()); return getTarget(fetchHeadFtqId); }
 
+    bool ftqHasNext() const { return hasTarget(fetchHeadFtqId + 1); }  // is the target right after the fetch head present?
+    const FetchTarget &ftqNext() const { assert(ftqHasNext()); return getTarget(fetchHeadFtqId + 1); }  // requires ftqHasNext()
+
+    bool is2FetchEnabled() const { return enable2Fetch; }  // read by fetch when deciding whether to extend past a taken branch
+    unsigned getMaxFetchBytesPerCycle() const { return maxFetchBytesPerCycle; }  // byte limit used by the 2Fetch span check
+
     void dumpFsq(const char *when);
 
     // Dummy overriding