Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 34 additions & 11 deletions tokio/src/fs/read_uring.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::fs::OpenOptions;
use crate::runtime::driver::op::Op;
use crate::runtime::driver::op::{CqeResult, Op};

use std::io;
use std::io::ErrorKind;
Expand All @@ -11,16 +11,19 @@ use std::path::Path;
const PROBE_SIZE: usize = 32;
const PROBE_SIZE_U32: u32 = PROBE_SIZE as u32;

// batch size for batched reads
const BATCH_SIZE: usize = 200;

// Max bytes we can read using io uring submission at a time
// SAFETY: cannot be higher than u32::MAX for safe cast
// Set to read max 64 KiB at a time
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment says "max 64 MiB" but MAX_READ_SIZE is now 64 * 1024 (64 KiB); can we confirm whether the constant or the comment should be updated? This value also changes batching eligibility (read_to_end_batch) and the per-op chunk size in read_to_end.

Severity: low

Fix This in Augment

🤖 Was this useful? React with 👍 or 👎, or 🚀 if it prevented an incident/outage.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The comment indicates a read size of 64 MiB, but the constant MAX_READ_SIZE is set to 64 KiB. The comment should be updated to reflect the actual value to avoid confusion.

Suggested change
// Set to read max 64 MiB at time
// Set to read max 64 KiB at time

const MAX_READ_SIZE: usize = 64 * 1024 * 1024;
pub(crate) const MAX_READ_SIZE: usize = 64 * 1024;
Comment on lines 17 to +20
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Comment/value mismatch: MAX_READ_SIZE is 64 KiB, not 64 MiB.

The comment says "Set to read max 64 MiB at time" but 64 * 1024 = 65,536 bytes = 64 KiB.

Fix comment
 // Max bytes we can read using io uring submission at a time
 // SAFETY: cannot be higher than u32::MAX for safe cast
-// Set to read max 64 MiB at time
+// Set to read max 64 KiB at a time
 pub(crate) const MAX_READ_SIZE: usize = 64 * 1024;
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@tokio/src/fs/read_uring.rs` around lines 17 - 20, The comment on
MAX_READ_SIZE incorrectly states 64 MiB while the value 64 * 1024 equals 64 KiB;
update the comment to reflect 64 KiB (or change the constant to 64 * 1024 * 1024
if the intent was 64 MiB). Specifically, modify the doc line "Set to read max 64
MiB at time" to "Set to read max 64 KiB at a time" (or adjust the constant if
you want MiB) while keeping the SAFETY note about u32::MAX consistent with the
chosen value for MAX_READ_SIZE.


pub(crate) async fn read_uring(path: &Path) -> io::Result<Vec<u8>> {
let file = OpenOptions::new().read(true).open(path).await?;

// TODO: use io uring in the future to obtain metadata
let size_hint: Option<usize> = file.metadata().await.map(|m| m.len() as usize).ok();
let size_hint = file.metadata().await.map(|m| m.len() as usize).ok();

let fd: OwnedFd = file
.try_into_std()
Expand All @@ -33,10 +36,26 @@ pub(crate) async fn read_uring(path: &Path) -> io::Result<Vec<u8>> {
buf.try_reserve(size_hint)?;
}

read_to_end_uring(fd, buf).await
read_to_end_batch(fd, buf).await
}

// Reads the file to the end, preferring the batched io_uring path when the
// destination buffer already has enough capacity to make batching worthwhile.
//
// Falls back to the sequential `read_to_end` path either when the buffer is
// small or when the batched read fails, in which case the partially-written
// buffer is cleared before retrying from the start of the file.
async fn read_to_end_batch(fd: OwnedFd, buf: Vec<u8>) -> io::Result<Vec<u8>> {
    // Small buffers go straight to the sequential path.
    if buf.capacity() <= MAX_READ_SIZE {
        return read_to_end(fd, buf).await;
    }

    match Op::read_batch_size::<BATCH_SIZE>(fd, buf).await {
        Ok(filled) => Ok(filled),
        Err((fd, mut buf)) => {
            // The batch attempt may have written partial data; discard it so
            // the sequential retry writes from the start again.
            buf.clear();
            read_to_end(fd, buf).await
        }
    }
}

async fn read_to_end_uring(mut fd: OwnedFd, mut buf: Vec<u8>) -> io::Result<Vec<u8>> {
async fn read_to_end(mut fd: OwnedFd, mut buf: Vec<u8>) -> io::Result<Vec<u8>> {
let mut offset = 0;
let start_cap = buf.capacity();

Expand Down Expand Up @@ -119,16 +138,20 @@ async fn op_read(
let (res, r_fd, r_buf) = Op::read(fd, buf, read_len, *offset).await;

match res {
Err(e) if e.kind() == ErrorKind::Interrupted => {
buf = r_buf;
fd = r_fd;
}
Err(e) => return Err(e),
Ok(size_read) => {
CqeResult::Single(Ok(size_read)) => {
*offset += size_read as u64;

return Ok((r_fd, r_buf, size_read == 0));
}
CqeResult::InitErr(e) | CqeResult::Single(Err(e)) => {
if e.kind() == ErrorKind::Interrupted {
buf = r_buf;
fd = r_fd;
} else {
return Err(e);
}
}
CqeResult::Batch(_) => return Err(ErrorKind::Unsupported.into()),
}
}
}
14 changes: 7 additions & 7 deletions tokio/src/io/uring/open.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use crate::runtime::driver::op::{CancelData, Cancellable, Completable, CqeResult

use io_uring::{opcode, types};
use std::ffi::CString;
use std::io::{self, Error};
use std::io::{self, ErrorKind};
use std::os::fd::FromRawFd;
use std::path::Path;

Expand All @@ -19,13 +19,13 @@ pub(crate) struct Open {

impl Completable for Open {
type Output = io::Result<crate::fs::File>;
fn complete(self, cqe: CqeResult) -> Self::Output {
cqe.result
.map(|fd| unsafe { crate::fs::File::from_raw_fd(fd as i32) })
}

fn complete_with_error(self, err: Error) -> Self::Output {
Err(err)
fn complete(self, cqe: CqeResult) -> Self::Output {
match cqe {
CqeResult::Single(Ok(fd)) => Ok(unsafe { crate::fs::File::from_raw_fd(fd as i32) }),
CqeResult::Batch(_) => Err(ErrorKind::Unsupported.into()),
CqeResult::InitErr(err) | CqeResult::Single(Err(err)) => Err(err),
}
}
}

Expand Down
140 changes: 130 additions & 10 deletions tokio/src/io/uring/read.rs
Original file line number Diff line number Diff line change
@@ -1,32 +1,40 @@
use crate::runtime::driver::op::{CancelData, Cancellable, Completable, CqeResult, Op};
use crate::fs::read_uring::MAX_READ_SIZE;
use crate::runtime::driver::op::{
i32_to_result, CancelData, Cancellable, Completable, CqeResult, Op,
};

use io_uring::squeue::{Entry, Flags};
use io_uring::{opcode, types};
use std::io::{self, Error};

use std::io::ErrorKind;
use std::mem::MaybeUninit;
use std::os::fd::{AsRawFd, OwnedFd};

type Output<const N: usize> = (CqeResult<N>, OwnedFd, Vec<u8>);

#[derive(Debug)]
pub(crate) struct Read {
fd: OwnedFd,
buf: Vec<u8>,
}

impl Completable for Read {
type Output = (io::Result<u32>, OwnedFd, Vec<u8>);
impl<const N: usize> Completable<N> for Read {
type Output = Output<N>;

fn complete(self, cqe: CqeResult) -> Self::Output {
fn complete(self, cqe: CqeResult<N>) -> Self::Output {
let mut buf = self.buf;

if let Ok(len) = cqe.result {
if let CqeResult::Single(Ok(len)) = cqe {
// increase length of buffer on successful
// completion
let new_len = buf.len() + len as usize;
// SAFETY: Kernel read len bytes
unsafe { buf.set_len(new_len) };
}

(cqe.result, self.fd, buf)
}
// Handle rest each batch outside

fn complete_with_error(self, err: Error) -> Self::Output {
(Err(err), self.fd, self.buf)
(cqe, self.fd, buf)
}
}

Expand Down Expand Up @@ -58,4 +66,116 @@ impl Op<Read> {
// SAFETY: Parameters are valid for the entire duration of the operation
unsafe { Op::new(read_op, Read { fd, buf }) }
}

// Split file read operations into batches of size N. Batches will be
// executed in groups of N.
//
// This function will return the final buffer of bytes in the file
pub(crate) async fn read_batch_size<const N: usize>(
mut fd: OwnedFd,
mut buf: Vec<u8>,
) -> Result<Vec<u8>, (OwnedFd, Vec<u8>)> {
// hold batch_size operations and reuse it
let mut n_ops = [const { MaybeUninit::uninit() }; N];
let mut last_len = 0;
let mut read_len = 0;
let len = buf.capacity();

for i in 0..N {
n_ops[i].write(opcode::Nop::new().build());
}

let mut n_ops = unsafe { assume_init(n_ops) };
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The custom assume_init function uses unsafe transmute_copy. A safer, idiomatic way to initialize the array is by using the safe MaybeUninit::array_assume_init function, which is stable since Rust 1.63. This avoids the need for a custom unsafe helper function.

You can remove the assume_init function entirely with this change.

        let mut n_ops = MaybeUninit::array_assume_init(n_ops);


// total number of batch entries to read the file completely
for (index, start) in (0..len).step_by(MAX_READ_SIZE).enumerate() {
let end = (start + MAX_READ_SIZE).min(len); // clamp to len for the final chunk
let array_idx = index % N;

// skip first iteration
if array_idx == 0 && index != 0 {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When array_idx == 0 && index != 0, this branch submits the previous batch but doesn't enqueue a read for the current start, so every Nth chunk is skipped and n_ops[0] may remain a stale entry for later submissions. That can truncate or corrupt the returned buffer for files that span more than N chunks.

Severity: high

Fix This in Augment

🤖 Was this useful? React with 👍 or 👎, or 🚀 if it prevented an incident/outage.

// SAFETY: Batches are valid array entries
let op = unsafe { Op::batch(n_ops.clone(), Read { fd, buf }) };
let (_, r_fd, r_buf) = uring_task(op, &mut read_len).await?;

fd = r_fd;
buf = r_buf;
} else {
let op = opcode::Read::new(
types::Fd(fd.as_raw_fd()),
buf.spare_capacity_mut()[start..].as_mut_ptr().cast(),
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

security-high high

The index start is calculated based on the buffer's capacity, but it is used to index into the buffer's spare capacity. If the buffer is not empty, buf.spare_capacity_mut()[start..] will eventually attempt to access an index out of bounds of the spare capacity, leading to a panic. This is because spare_capacity_mut().len() is capacity - len, and start can go up to capacity.

(end - start) as u32,
)
.offset(start as u64)
.build()
// link our sqes so cqes arrive in order
.flags(Flags::IO_LINK);

n_ops[array_idx] = op;
}
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Batch loop skips every Nth chunk causing data loss

High Severity

When array_idx == 0 && index != 0, the if branch submits the previous batch but the else branch that creates the read op for the current chunk is skipped entirely. This means every chunk at a batch boundary (indices N, 2N, 3N…) is silently dropped. Additionally, n_ops[0] retains the stale entry from the previous batch, causing chunk 0 to be re-read in subsequent batches. When total chunks equals kN+1, last_len ends as 0, so the final if last_len > 0 guard prevents submission of the last chunk too.

Additional Locations (1)
Fix in Cursor Fix in Web


last_len = array_idx; // save last array_idx for last batch
}
Comment on lines +91 to +118
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Off-by-one issue: First entry of subsequent batches is skipped.

When array_idx == 0 && index != 0, the code submits the current n_ops batch before setting n_ops[0] for the current iteration. This means:

  1. The first batch correctly gets entries at indices 0 to N-1
  2. When starting the second batch (index == N), array_idx is 0, so it submits the previous batch (correct), but then the current read operation for index == N is never added to n_ops—the loop continues to the next iteration

The logic should either:

  • Submit the batch, then fall through to set n_ops[array_idx], or
  • Restructure to submit after the array is filled
Suggested fix
         for (index, start) in (0..len).step_by(MAX_READ_SIZE).enumerate() {
             let end = (start + MAX_READ_SIZE).min(len); // clamp to len for the final chunk
             let array_idx = index % N;

             // skip first iteration
             if array_idx == 0 && index != 0 {
                 // SAFETY: Batches are valid array entries
                 let op = unsafe { Op::batch(n_ops.clone(), Read { fd, buf }) };
                 let (_, r_fd, r_buf) = uring_task(op, &mut read_len).await?;

                 fd = r_fd;
                 buf = r_buf;
-            } else {
-                let op = opcode::Read::new(
-                    types::Fd(fd.as_raw_fd()),
-                    buf.spare_capacity_mut()[start..].as_mut_ptr().cast(),
-                    (end - start) as u32,
-                )
-                .offset(start as u64)
-                .build()
-                // link our sqes so cqes arrive in order
-                .flags(Flags::IO_LINK);
-
-                n_ops[array_idx] = op;
             }
+
+            let op = opcode::Read::new(
+                types::Fd(fd.as_raw_fd()),
+                buf.spare_capacity_mut()[start..].as_mut_ptr().cast(),
+                (end - start) as u32,
+            )
+            .offset(start as u64)
+            .build()
+            // link our sqes so cqes arrive in order
+            .flags(Flags::IO_LINK);
+
+            n_ops[array_idx] = op;

             last_len = array_idx; // save last array_idx for last batch
         }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
for (index, start) in (0..len).step_by(MAX_READ_SIZE).enumerate() {
let end = (start + MAX_READ_SIZE).min(len); // clamp to len for the final chunk
let array_idx = index % N;
// skip first iteration
if array_idx == 0 && index != 0 {
// SAFETY: Batches are valid array entries
let op = unsafe { Op::batch(n_ops.clone(), Read { fd, buf }) };
let (_, r_fd, r_buf) = uring_task(op, &mut read_len).await?;
fd = r_fd;
buf = r_buf;
} else {
let op = opcode::Read::new(
types::Fd(fd.as_raw_fd()),
buf.spare_capacity_mut()[start..].as_mut_ptr().cast(),
(end - start) as u32,
)
.offset(start as u64)
.build()
// link our sqes so cqes arrive in order
.flags(Flags::IO_LINK);
n_ops[array_idx] = op;
}
last_len = array_idx; // save last array_idx for last batch
}
for (index, start) in (0..len).step_by(MAX_READ_SIZE).enumerate() {
let end = (start + MAX_READ_SIZE).min(len); // clamp to len for the final chunk
let array_idx = index % N;
// skip first iteration
if array_idx == 0 && index != 0 {
// SAFETY: Batches are valid array entries
let op = unsafe { Op::batch(n_ops.clone(), Read { fd, buf }) };
let (_, r_fd, r_buf) = uring_task(op, &mut read_len).await?;
fd = r_fd;
buf = r_buf;
}
let op = opcode::Read::new(
types::Fd(fd.as_raw_fd()),
buf.spare_capacity_mut()[start..].as_mut_ptr().cast(),
(end - start) as u32,
)
.offset(start as u64)
.build()
// link our sqes so cqes arrive in order
.flags(Flags::IO_LINK);
n_ops[array_idx] = op;
last_len = array_idx; // save last array_idx for last batch
}
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@tokio/src/io/uring/read.rs` around lines 91 - 118, The loop currently skips
assigning the current iteration's op when array_idx == 0 && index != 0 because
the code submits the previous batch in the if branch and uses else for
assignment; change the control flow so the batch submission (unsafe {
Op::batch(n_ops.clone(), Read { fd, buf }) } + uring_task(...)) happens first
when array_idx == 0 && index != 0 but does NOT use else — after submitting,
continue to construct the Read opcode
(opcode::Read::new(...).offset(...).build().flags(Flags::IO_LINK)) and assign it
into n_ops[array_idx]; keep updating last_len = array_idx as before so the final
batch handling still works. This ensures the first entry of each subsequent
batch is populated instead of skipped.

Comment on lines +91 to +118
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

security-critical critical

The batching logic in read_batch_size is flawed, leading to a critical information leak vulnerability. When a full batch is submitted (array_idx == 0 && index != 0), the else branch is skipped, meaning the opcode::Read for that iteration is not created and not added to n_ops. This effectively skips a chunk of the file and creates gaps in the buffer that are never written to by the kernel. Since read_len is updated with the total bytes read by the submitted operations, and buf.set_len is called at the end, the buffer's length will include these uninitialized gaps, exposing potentially sensitive data.

            if array_idx == 0 && index != 0 {
                // SAFETY: Batches are valid array entries
                let op = unsafe { Op::batch(n_ops.clone(), Read { fd, buf }) };
                let (_, r_fd, r_buf) = uring_task(op, &mut read_len).await?;

                fd = r_fd;
                buf = r_buf;
            }
            let op = opcode::Read::new(
                types::Fd(fd.as_raw_fd()),
                buf.spare_capacity_mut()[start..].as_mut_ptr().cast(),
                (end - start) as u32,
            )
            .offset(start as u64)
            .build()
            // link our sqes so cqes arrive in order
            .flags(Flags::IO_LINK);

            n_ops[array_idx] = op;


// Handle last partial batch if there is any
if last_len > 0 {
for (i, entry) in n_ops.iter_mut().enumerate() {
// no op the double counted entries
if i > last_len {
*entry = opcode::Nop::new().build();
}
}

// SAFETY: Because of the no-op loop above, we can assume double entry
// read is not possible
let op = unsafe { Op::batch(n_ops, Read { fd, buf }) };

let (_, _, r_buf) = uring_task(op, &mut read_len).await?;

buf = r_buf;
}

unsafe {
buf.set_len(buf.len() + read_len);
}

Ok(buf)
}
}

// Converts an array of `MaybeUninit<Entry>` into an array of `Entry`.
//
// `transmute_copy` is used because plain `mem::transmute` cannot relate the
// sizes of arrays whose length is a const generic parameter.
//
// SAFETY: the caller must guarantee that every one of the `N` elements of
// `n_ops` has been fully initialized before calling this function; reading
// an uninitialized `Entry` is undefined behavior.
unsafe fn assume_init<const N: usize>(n_ops: [MaybeUninit<Entry>; N]) -> [Entry; N] {
    unsafe { std::mem::transmute_copy(&n_ops) }
}

// Poll the batch operation and get the Output out of it
async fn uring_task<const N: usize>(
op: Op<Read, N>,
read_len: &mut usize,
) -> Result<Output<N>, (OwnedFd, Vec<u8>)> {
let (res, r_fd, r_buf) = op.await;

match res {
CqeResult::Batch(cqes) => {
for cqe in cqes {
let cqe = i32_to_result(cqe);

match cqe {
Ok(r_size) => match read_len.checked_add(r_size as usize) {
Some(len) => {
*read_len = len;
}
None => return Err((r_fd, r_buf)),
},
Err(e) => {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In uring_task, ErrorKind::Interrupted results are currently ignored, which can leave unread holes in the buffer while still returning Ok(buf). Consider treating Interrupted as a retry/fallback condition (similar to op_read) to avoid returning partial/corrupt data.

Severity: medium

Fix This in Augment

🤖 Was this useful? React with 👍 or 👎, or 🚀 if it prevented an incident/outage.

if e.kind() != ErrorKind::Interrupted {
return Err((r_fd, r_buf));
}
}
}
}
}
_ => return Err((r_fd, r_buf)),
};

Ok((res, r_fd, r_buf))
}
13 changes: 8 additions & 5 deletions tokio/src/io/uring/write.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use crate::runtime::driver::op::{CancelData, Cancellable, Completable, CqeResult
use crate::util::as_ref::OwnedBuf;

use io_uring::{opcode, types};
use std::io::{self, Error};
use std::io::{self, ErrorKind};
use std::os::fd::{AsRawFd, OwnedFd};

#[derive(Debug)]
Expand All @@ -13,12 +13,15 @@ pub(crate) struct Write {

impl Completable for Write {
type Output = (io::Result<u32>, OwnedBuf, OwnedFd);

fn complete(self, cqe: CqeResult) -> Self::Output {
(cqe.result, self.buf, self.fd)
}
let res = match cqe {
CqeResult::Single(cqe) => cqe,
CqeResult::Batch(_) => Err(ErrorKind::Unsupported.into()),
CqeResult::InitErr(err) => Err(err),
};

fn complete_with_error(self, err: Error) -> Self::Output {
(Err(err), self.buf, self.fd)
(res, self.buf, self.fd)
}
}

Expand Down
Loading
Loading