diff --git a/CMakeLists.txt b/CMakeLists.txt index 35bc118..55ecce9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,6 +72,49 @@ if (PkgConfig_FOUND) pkg_check_modules(LIBURING liburing) endif() +# ============================================================ +# Shader compilation +# +# If glslangValidator is available, compile compute shaders +# from GLSL to SPIR-V automatically during the build. +# ============================================================ + +find_program(GLSLANG_VALIDATOR glslangValidator) + +if(GLSLANG_VALIDATOR) + message(STATUS "Found glslangValidator: ${GLSLANG_VALIDATOR}") + + # Find all .comp shader files + file(GLOB SHADER_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/shaders/*.comp) + + set(SPIRV_SHADERS) + foreach(SHADER ${SHADER_SOURCES}) + get_filename_component(SHADER_NAME ${SHADER} NAME) + set(SPIRV "${CMAKE_CURRENT_BINARY_DIR}/shaders/${SHADER_NAME}.spv") + + add_custom_command( + OUTPUT ${SPIRV} + COMMAND ${CMAKE_COMMAND} -E make_directory + "${CMAKE_CURRENT_BINARY_DIR}/shaders" + COMMAND ${GLSLANG_VALIDATOR} -V ${SHADER} -o ${SPIRV} + DEPENDS ${SHADER} + COMMENT "Compiling shader ${SHADER_NAME} to SPIR-V" + VERBATIM + ) + + list(APPEND SPIRV_SHADERS ${SPIRV}) + endforeach() + + if(SPIRV_SHADERS) + list(LENGTH SPIRV_SHADERS SPIRV_SHADER_COUNT) + add_custom_target(shaders ALL DEPENDS ${SPIRV_SHADERS}) + message(STATUS "Shader compilation enabled (${SPIRV_SHADER_COUNT} shaders found)") + endif() +else() + message(STATUS "glslangValidator not found - shader compilation disabled") + message(STATUS " To enable: install vulkan-tools or spirv-tools package") +endif() + # ============================================================ # Core runtime library # @@ -232,6 +275,17 @@ if (DS_BUILD_TESTS) add_test(NAME ds_cpu_backend_test COMMAND ds_cpu_backend_test) add_test(NAME ds_error_handling_test COMMAND ds_error_handling_test) + # GDeflate format test + add_executable(ds_gdeflate_format_test + tests/gdeflate_format_test.cpp + ) + if (TARGET ds_runtime) + 
target_link_libraries(ds_gdeflate_format_test PRIVATE ds_runtime) + elseif (TARGET ds_runtime_static) + target_link_libraries(ds_gdeflate_format_test PRIVATE ds_runtime_static) + endif() + add_test(NAME ds_gdeflate_format_test COMMAND ds_gdeflate_format_test) + if (LIBURING_FOUND) add_executable(ds_io_uring_tests tests/io_uring_backend_test.cpp diff --git a/README.md b/README.md index 9d1917e..553ac32 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,16 @@ The codebase has been significantly improved: - โš ๏ธ **io_uring backend**: Requires liburing dependency (not built by default) - โš ๏ธ **Request cancellation**: Enum added but cancel() method not yet implemented +### ๐Ÿ“‹ Investigation & Planning Complete (Phase 0) +Comprehensive investigation and planning documents have been created: +- **[Master Roadmap](docs/master_roadmap.md)** (30KB) - Complete 36-week phased plan with microtasks +- **[GDeflate Investigation](docs/investigation_gdeflate.md)** (16KB) - CPU & GPU implementation plan +- **[Vulkan Compute Investigation](docs/investigation_vulkan_compute.md)** (26KB) - GPU compute pipelines +- **[io_uring Investigation](docs/investigation_io_uring.md)** (20KB) - Multi-worker backend enhancement +- **[Additional Features](docs/investigation_remaining_features.md)** (18KB) - Cancellation, GPU workflows, Wine/Proton + +Timeline: 36 weeks for full implementation, 12 weeks for MVP + See [MISSING_FEATURES.md](MISSING_FEATURES.md) for the complete roadmap and [COMPARISON.md](COMPARISON.md) for documentation vs reality comparison. 
--- diff --git a/docs/investigation_gdeflate.md b/docs/investigation_gdeflate.md new file mode 100644 index 0000000..a121dd4 --- /dev/null +++ b/docs/investigation_gdeflate.md @@ -0,0 +1,563 @@ +# GDeflate Compression Investigation and Implementation Plan + +**Status:** Research Phase +**Priority:** High +**Target:** CPU and GPU GDeflate decompression support +**Dependencies:** None for CPU; Vulkan compute pipeline for GPU + +--- + +## Executive Summary + +GDeflate is Microsoft's custom compression format for DirectStorage, designed for efficient GPU decompression. This document outlines the investigation, design, and implementation plan for adding GDeflate support to ds-runtime. + +--- + +## 1. Background + +### 1.1 What is GDeflate? + +GDeflate is a GPU-friendly variant of DEFLATE compression used in Microsoft DirectStorage. Key characteristics: + +- **Block-based structure**: Data is compressed in independent blocks for parallel GPU decompression +- **Modified DEFLATE**: Based on standard DEFLATE but optimized for GPU compute +- **Metadata format**: Includes block headers with size information +- **Designed for parallelism**: Each block can be decompressed independently + +### 1.2 Current State + +- **Status**: Intentionally stubbed - returns `ENOTSUP` error +- **Test**: `tests/compression_gdeflate_stub_test.cpp` verifies failure +- **API**: `Compression::GDeflate` enum value exists +- **Path**: Error callback triggered when requested + +--- + +## 2. Research Requirements + +### 2.1 Format Specification + +**Primary Goal**: Obtain or reverse-engineer GDeflate format specification + +**Approaches**: + +1. **Official Documentation** + - Check Microsoft DirectStorage SDK documentation + - Review DirectStorage headers and samples + - Search for format specifications in MSDN + +2. **Reverse Engineering** + - Analyze DirectStorage DLL behavior + - Examine compressed asset samples + - Study existing decompression implementations + +3. 
**Community Resources** + - Wine/Proton community investigations + - Graphics programming forums + - Open-source game engine implementations + +**Deliverable**: GDeflate format specification document + +### 2.2 Existing Implementations + +**Research Goals**: +- Identify existing open-source GDeflate decoders +- Review Wine/Proton DirectStorage implementation status +- Study GPU decompression implementations (if any) + +**Potential Sources**: +- Wine project (dstorage.dll implementation) +- Game engine implementations (Unreal, Unity plugins) +- Graphics libraries with DirectStorage support +- Academic papers on GPU decompression + +--- + +## 3. CPU Implementation Plan + +### 3.1 Architecture + +``` +Input: Compressed buffer + size + โ†“ +Block Header Parser + โ†“ (block metadata: offset, compressed_size, uncompressed_size) +Per-Block Decompression Loop + โ†“ (DEFLATE decode per block) +Output: Decompressed buffer +``` + +### 3.2 Implementation Phases + +#### Phase 3.1: Format Understanding +**Tasks**: +- Document GDeflate file/stream structure +- Identify block header format +- Document compression parameters +- Understand dictionary handling + +**Deliverables**: +- Format specification document +- Test asset creation tool + +#### Phase 3.2: Block Header Parser +**Tasks**: +- Implement GDeflate header parsing +- Extract block metadata (offsets, sizes) +- Validate header checksums (if present) +- Error handling for corrupted headers + +**Location**: `src/gdeflate_decoder.cpp` + +**API**: +```cpp +struct GDeflateBlockInfo { + uint64_t offset; + uint32_t compressed_size; + uint32_t uncompressed_size; +}; + +std::vector<GDeflateBlockInfo> parse_gdeflate_header( + const void* data, + size_t size +); +``` + +#### Phase 3.3: DEFLATE Decoder Integration +**Tasks**: +- Choose DEFLATE library (zlib, miniz, or custom) +- Integrate block decompression +- Handle partial decompression +- Add streaming support + +**Library Options**: +1. 
**zlib** (standard, widely available) + - Pros: Mature, well-tested, optimized + - Cons: May need modification for block format + +2. **miniz** (single-file, public domain) + - Pros: Easy integration, no dependencies + - Cons: May be slower than zlib + +3. **Custom implementation** + - Pros: Full control, GDeflate-specific optimizations + - Cons: High development effort, testing burden + +**Recommendation**: Start with zlib, consider optimization later + +#### Phase 3.4: CPU Backend Integration +**Tasks**: +- Remove ENOTSUP stub from `ds_runtime.cpp` +- Wire GDeflate decoder into decompression pipeline +- Add error handling for decompression failures +- Update tests to verify successful decompression + +**Location**: `src/ds_runtime.cpp` (CpuBackend::decompress) + +**Changes**: +```cpp +// Replace stub: +if (req.compression == Compression::GDeflate) { + report_error("cpu", "decompression", ENOTSUP, + "GDeflate compression is not yet implemented (ENOTSUP)"); + return; +} + +// With implementation: +if (req.compression == Compression::GDeflate) { + if (!gdeflate_decompress(req.dst, req.size, ...)) { + report_request_error("cpu", "decompression", errno, + "GDeflate decompression failed", req); + return; + } +} +``` + +### 3.3 Testing Strategy + +#### Test Assets +- Create compressed test files with known content +- Various block sizes (1KB, 4KB, 16KB, 64KB) +- Edge cases: empty blocks, maximum compression, random data + +#### Test Cases +1. **Basic decompression**: Simple compressed buffer โ†’ original data +2. **Multi-block**: File with multiple independent blocks +3. **Error handling**: Corrupted header, truncated data, invalid checksums +4. **Performance**: Benchmark against uncompressed I/O +5. 
**Partial reads**: Decompression with offset/size parameters + +**New Test File**: `tests/gdeflate_cpu_test.cpp` + +### 3.4 Performance Considerations + +**Metrics to Track**: +- Decompression throughput (MB/s) +- CPU utilization per thread +- Memory overhead during decompression +- Comparison vs uncompressed I/O + +**Optimization Opportunities**: +- Parallel block decompression (thread pool) +- SIMD optimizations for DEFLATE decode +- Memory pool for temporary buffers +- Block prefetching for streaming + +--- + +## 4. GPU Implementation Plan + +### 4.1 Architecture + +``` +CPU: Parse block headers โ†’ metadata to GPU + โ†“ +GPU: Parallel block decompression (compute shader) + โ†“ (one workgroup per block or chunk) +GPU: Output to GPU buffer +``` + +### 4.2 Prerequisites + +**Required Infrastructure** (from Vulkan compute investigation): +- Compute pipeline creation โœ… (planned) +- Shader module loading โœ… (planned) +- Descriptor set management โœ… (planned) +- GPU buffer management โœ… (exists) + +**Dependency**: Vulkan GPU compute pipeline implementation must be complete + +### 4.3 Implementation Phases + +#### Phase 4.1: Compute Shader Design +**Tasks**: +- Design GLSL compute shader for DEFLATE decode +- Implement block-parallel decompression +- Handle LZ77 back-references efficiently +- Optimize for GPU wavefront/warp sizes + +**File**: `shaders/gdeflate_decompress.comp` + +**Key Challenges**: +1. **Shared memory management**: LZ77 history buffer +2. **Divergent execution**: Huffman decoding branches +3. **Synchronization**: Block-independent decompression +4. 
**Memory access patterns**: Coalesced reads/writes + +**Shader Structure**: +```glsl +#version 450 + +// Input: compressed blocks buffer +layout(binding = 0) readonly buffer CompressedData { + uint data[]; +} compressed; + +// Input: block metadata +layout(binding = 1) readonly buffer BlockInfo { + uint offset[]; + uint compressed_size[]; + uint uncompressed_size[]; +} blocks; + +// Output: decompressed data +layout(binding = 2) writeonly buffer DecompressedData { + uint data[]; +} decompressed; + +layout(local_size_x = 256) in; + +void main() { + uint block_id = gl_GlobalInvocationID.x; + // Decompress block[block_id] + // ... +} +``` + +#### Phase 4.2: GPU Backend Integration +**Tasks**: +- Add GDeflate compute pipeline to VulkanBackend +- Create descriptor sets for compression buffers +- Dispatch compute workload for decompression +- Handle synchronization (barriers, fences) + +**Location**: `src/ds_runtime_vulkan.cpp` + +**Changes**: +- Add `gdeflate_pipeline_` member to VulkanBackend +- Create compression-specific descriptor layout +- Dispatch compute before/after staging buffer copies + +#### Phase 4.3: CPU-GPU Hybrid Strategy +**Goal**: Choose CPU vs GPU decompression based on data characteristics + +**Decision Factors**: +1. **Data size**: Small files โ†’ CPU, Large files โ†’ GPU +2. **Block count**: Few blocks โ†’ CPU, Many blocks โ†’ GPU +3. **Request destination**: Host memory โ†’ CPU, GPU buffer โ†’ GPU +4. 
**GPU availability**: Fallback to CPU if GPU busy + +**Configuration**: +```cpp +struct CompressionConfig { + size_t gpu_threshold_bytes = 1024 * 1024; // 1MB + size_t min_blocks_for_gpu = 16; + bool prefer_gpu_for_gpu_memory = true; +}; +``` + +### 4.4 Performance Targets + +**Goals**: +- GPU decompression โ‰ฅ 5x faster than CPU (for large files) +- Minimal CPU involvement during GPU path +- Efficient for small files (CPU path competitive) + +**Benchmarks**: +- 1MB, 10MB, 100MB compressed assets +- Various compression ratios (1.5x, 3x, 5x) +- CPU vs GPU throughput comparison +- GPU occupancy and utilization metrics + +--- + +## 5. Dependencies + +### 5.1 External Libraries + +**CPU Path**: +- zlib or miniz (DEFLATE decompression) +- No additional system dependencies + +**GPU Path**: +- Vulkan SDK (already optional dependency) +- SPIR-V compiler for shaders (glslangValidator) +- Vulkan compute pipeline support (from separate investigation) + +### 5.2 Internal Dependencies + +**Build System**: +- Add GDeflate source files to CMakeLists.txt +- Optional dependency on compression library +- Shader compilation step in build + +**API Changes**: +- No breaking changes (GDeflate enum already exists) +- Remove ENOTSUP stub behavior +- Update documentation to reflect support + +--- + +## 6. Testing and Validation + +### 6.1 Unit Tests + +**Test Coverage**: +1. Block header parsing (valid/invalid headers) +2. Single-block decompression (various sizes) +3. Multi-block decompression (independent blocks) +4. Error handling (corrupted data, invalid sizes) +5. Compression ratio validation (known test vectors) + +**Test Files**: +- `tests/gdeflate_format_test.cpp` - Header parsing +- `tests/gdeflate_cpu_test.cpp` - CPU decompression +- `tests/gdeflate_gpu_test.cpp` - GPU decompression (if Vulkan available) + +### 6.2 Integration Tests + +**Scenarios**: +1. CPU backend with GDeflate compression +2. Vulkan backend with GDeflate GPU decompression +3. 
Mixed requests (compressed + uncompressed) +4. Error cases (invalid compression, missing data) + +**Update Existing Test**: +- Modify `tests/compression_gdeflate_stub_test.cpp` +- Change from "verify failure" to "verify success" +- Add decompression correctness checks + +### 6.3 Real-World Testing + +**Asset Types**: +- Game textures (DDS, KTX) +- Mesh data (binary vertex buffers) +- Shader bytecode (SPIR-V, DXBC) +- Mixed asset packs + +**Performance Testing**: +- Asset streaming demo with GDeflate assets +- Benchmark vs uncompressed baseline +- CPU vs GPU comparison +- DirectStorage parity testing (if possible on Windows) + +--- + +## 7. Documentation Requirements + +### 7.1 Format Documentation + +**Document**: `docs/gdeflate_format.md` + +**Contents**: +- GDeflate file structure +- Block header format +- Compression parameters +- Differences from standard DEFLATE +- Decoder state machine + +### 7.2 Implementation Documentation + +**Updates Required**: +- `README.md`: Change GDeflate status from โš ๏ธ to โœ… +- `MISSING_FEATURES.md`: Mark GDeflate items as complete +- `docs/design.md`: Add GDeflate pipeline description +- API documentation: Update Compression enum docs + +### 7.3 Usage Guide + +**Document**: `docs/gdeflate_usage.md` + +**Contents**: +- How to compress assets for ds-runtime +- Choosing CPU vs GPU decompression +- Performance tuning recommendations +- Troubleshooting guide + +--- + +## 8. 
Risks and Mitigations + +### 8.1 Technical Risks + +| Risk | Severity | Mitigation | +|------|----------|------------| +| Format spec unavailable | High | Reverse engineer from samples, collaborate with Wine community | +| GPU decode too slow | Medium | Optimize shader, fallback to CPU | +| Memory overhead too high | Medium | Streaming decompression, memory pools | +| Compatibility issues | Medium | Comprehensive testing, multiple test assets | +| Patent/licensing concerns | Low | Use open compression algorithms, document format | + +### 8.2 Timeline Risks + +| Risk | Impact | Mitigation | +|------|--------|------------| +| Format research takes longer than expected | +2-4 weeks | Start with CPU implementation, add GPU later | +| GPU optimization difficult | +2-3 weeks | Accept lower performance initially, iterate | +| Integration issues with existing code | +1 week | Incremental integration, thorough testing | + +--- + +## 9. Timeline and Milestones + +### 9.1 Research Phase (2-3 weeks) +- Week 1: Format investigation and documentation +- Week 2: Library evaluation and prototyping +- Week 3: Design review and planning finalization + +**Milestone**: Format specification complete + +### 9.2 CPU Implementation (3-4 weeks) +- Week 4: Header parser implementation +- Week 5: DEFLATE integration and testing +- Week 6: Backend integration +- Week 7: Testing and optimization + +**Milestone**: CPU GDeflate decompression working + +### 9.3 GPU Implementation (4-6 weeks) +- Week 8-9: Compute shader development +- Week 10-11: GPU backend integration +- Week 12-13: Testing and optimization + +**Milestone**: GPU GDeflate decompression working + +### 9.4 Total Estimate +**9-13 weeks** for complete CPU + GPU implementation + +**Fast Track Option** (CPU only): **5-7 weeks** + +--- + +## 10. 
Success Criteria + +### 10.1 Functional Requirements +- โœ… CPU decoder successfully decompresses GDeflate assets +- โœ… GPU decoder works on Vulkan-supported hardware +- โœ… Error handling for corrupted/invalid data +- โœ… All existing tests still pass +- โœ… New GDeflate tests pass (100% coverage) + +### 10.2 Performance Requirements +- โœ… CPU: Decompression throughput โ‰ฅ 500 MB/s (uncompressed equivalent) +- โœ… GPU: Decompression throughput โ‰ฅ 2 GB/s (for large files) +- โœ… GPU overhead < 10% for file โ‰ฅ 1MB +- โœ… CPU fallback for small files performs acceptably + +### 10.3 Quality Requirements +- โœ… No memory leaks (valgrind clean) +- โœ… Thread-safe (no data races) +- โœ… Vulkan validation layers pass (no errors) +- โœ… Documentation complete and accurate +- โœ… API stability maintained (no breaking changes) + +--- + +## 11. Next Steps + +### Immediate Actions (This Week) +1. โœ… Complete this investigation document +2. โฉ Research GDeflate format (begin literature search) +3. โฉ Evaluate DEFLATE libraries (zlib vs miniz vs custom) +4. โฉ Create prototype test assets + +### Short Term (Next 2 Weeks) +1. โฉ Finalize format specification +2. โฉ Begin header parser implementation +3. โฉ Set up testing infrastructure +4. โฉ Create CPU implementation plan + +### Medium Term (1-2 Months) +1. โฉ Complete CPU implementation +2. โฉ Integrate with existing backend +3. โฉ Begin GPU shader development +4. โฉ Performance benchmarking + +--- + +## 12. Open Questions + +1. **Format Availability**: Is GDeflate format publicly documented? If not, can we reverse engineer it legally? +2. **Patent Concerns**: Are there any patents covering GDeflate that we need to be aware of? +3. **Library Choice**: Which DEFLATE library best fits our needs (zlib, miniz, custom)? +4. **GPU Priority**: Should we implement CPU first and GPU later, or develop in parallel? +5. **Compression Tool**: Do we need to provide a GDeflate compression tool, or only decompression? +6. 
**Block Size**: What are optimal block sizes for CPU vs GPU decompression? +7. **Streaming**: Do we need streaming decompression, or block-at-a-time is sufficient? +8. **Validation**: How do we validate correctness without official test vectors? + +--- + +## 13. References and Resources + +### Documentation +- Microsoft DirectStorage documentation +- Wine DirectStorage implementation (if available) +- DEFLATE RFC 1951 +- GPU compression research papers + +### Libraries +- zlib: https://www.zlib.net/ +- miniz: https://github.com/richgel999/miniz +- Vulkan GPU compute examples + +### Community +- Wine development mailing list +- Proton GitHub discussions +- Graphics programming forums + +--- + +**Document Status**: Draft v1.0 +**Last Updated**: 2026-02-16 +**Next Review**: After format specification research complete diff --git a/docs/investigation_io_uring.md b/docs/investigation_io_uring.md new file mode 100644 index 0000000..4285b6b --- /dev/null +++ b/docs/investigation_io_uring.md @@ -0,0 +1,788 @@ +# io_uring Backend Investigation and Enhancement Plan + +**Status:** Planning Phase +**Priority:** Medium +**Target:** Production-ready io_uring backend with full feature support +**Dependencies:** liburing library + +--- + +## Executive Summary + +The ds-runtime project includes an experimental io_uring backend that provides kernel-level asynchronous I/O. This document outlines the investigation, design, and implementation plan for enhancing the io_uring backend to production quality. + +--- + +## 1. 
Current State + +### 1.1 What Exists + +**Implementation**: `src/ds_runtime_uring.cpp` + +**Current Capabilities**: +- โœ… `io_uring_queue_init()` with 256 queue entries +- โœ… SQE (Submission Queue Entry) submission +- โœ… CQE (Completion Queue Entry) handling +- โœ… Single worker thread +- โœ… Batch submission (`io_uring_submit`) +- โœ… Completion polling (`io_uring_wait_cqe`) +- โœ… Read operations (`io_uring_prep_read`) +- โœ… Write operations (`io_uring_prep_write`) +- โœ… Host memory support + +**Configuration**: +```cpp +struct IoUringBackendConfig { + uint32_t queue_entries = 256; + uint32_t worker_count = 1; // Currently unused! +}; +``` + +### 1.2 What's Missing + +**Incomplete Features**: +- โŒ **Multi-worker support**: `worker_count` field exists but ignored +- โŒ **GPU memory**: Explicitly rejected with `EINVAL` (host-only) +- โŒ **Decompression**: No compression/decompression handling +- โŒ **Advanced features**: No linked ops, timeouts, or fixed files +- โŒ **Error recovery**: Limited error handling +- โŒ **Performance tuning**: No SQPOLL or IOPOLL modes + +**Known Limitations**: +- Host memory only (GPU buffers not supported by io_uring design) +- Single-threaded (no parallelism despite worker_count field) +- Basic error reporting +- No request prioritization + +### 1.3 Build Status + +**Dependency**: liburing (optional) +- Not built by default (requires `-DDS_RUNTIME_HAS_IO_URING`) +- CMake checks for liburing via pkg-config +- Test suite requires liburing to run + +**Current Build**: +```bash +# In sandbox (liburing not found): +cmake .. # io_uring backend NOT built +``` + +--- + +## 2. io_uring Background + +### 2.1 What is io_uring? + +**io_uring** is a Linux kernel asynchronous I/O interface introduced in Linux 5.1. 
+ +**Key Benefits**: +- **Zero-copy**: Shared memory between kernel and userspace +- **Batching**: Submit multiple operations at once +- **Low overhead**: Minimal system call overhead +- **Flexible**: Supports many operation types (read, write, fsync, etc.) +- **Modern**: Designed for modern SSD and NVMe performance + +**Architecture**: +``` +Application + โ†“ (prepare SQEs) +Submission Queue (SQ) - shared memory ring + โ†“ (submit) +Kernel (process I/O asynchronously) + โ†“ (complete) +Completion Queue (CQ) - shared memory ring + โ†“ (reap CQEs) +Application (handle completions) +``` + +### 2.2 io_uring vs Traditional I/O + +| Feature | io_uring | pread/pwrite | POSIX AIO | +|---------|----------|--------------|-----------| +| **Overhead** | Very Low | High (syscall per op) | Medium | +| **Batching** | โœ… Yes | โŒ No | โš ๏ธ Limited | +| **Zero-copy** | โœ… Yes | โŒ No | โŒ No | +| **Flexibility** | โœ… High | โš ๏ธ Limited | โš ๏ธ Limited | +| **Kernel support** | โœ… 5.1+ | โœ… All | โš ๏ธ Poor | + +**Verdict**: io_uring is superior for high-performance async I/O on modern Linux + +### 2.3 DirectStorage Relevance + +**Alignment with DirectStorage**: +- โœ… Batched I/O submission (queue-based model) +- โœ… Asynchronous completion (callback-driven) +- โœ… Low CPU overhead (kernel handles scheduling) +- โœ… High throughput (optimized for NVMe SSDs) + +**Differences**: +- โŒ No GPU memory support (host-only by design) +- โŒ No built-in decompression (must be done in userspace) +- โœ… Linux-native (no Windows compatibility layer) + +--- + +## 3. 
Enhancement Plan + +### 3.1 Phase 1: Multi-Worker Architecture + +**Goal**: Honor `worker_count` configuration for parallel I/O + +#### Current Architecture + +``` +Single Worker Thread + โ†“ +io_uring instance (256 entries) + โ†“ +Kernel I/O processing +``` + +#### Proposed Architecture + +``` +Worker Thread 1 Worker Thread N + โ†“ โ†“ +io_uring instance 1 io_uring instance N + โ†“ โ†“ +Kernel I/O processing (parallel) +``` + +#### Design Decisions + +**Option 1: Multiple io_uring Instances** (Recommended) +- Each worker has own io_uring +- Load balance requests round-robin +- Independent polling threads +- Pros: True parallelism, simple synchronization +- Cons: More kernel resources + +**Option 2: Shared io_uring with Thread Pool** +- Single io_uring, multiple polling threads +- Workers compete for CQEs +- Pros: Fewer kernel resources +- Cons: Contention, complex synchronization + +**Recommendation**: Option 1 (multiple instances) + +#### Implementation + +**Data Structure**: +```cpp +struct IoUringWorker { + std::thread thread; + io_uring ring; + std::atomic<bool> running; + std::queue<Request*> pending; + std::mutex pending_mutex; + std::condition_variable pending_cv; +}; + +class IoUringBackend::Impl { + std::vector<IoUringWorker> workers_; + std::atomic<uint32_t> next_worker_; // Round-robin counter +}; +``` + +**Worker Thread**: +```cpp +void worker_loop(IoUringWorker& worker) { + while (worker.running) { + // 1. Submit pending requests + { + std::unique_lock lock(worker.pending_mutex); + while (!worker.pending.empty()) { + Request* req = worker.pending.front(); + worker.pending.pop(); + + // Prepare SQE + io_uring_sqe* sqe = io_uring_get_sqe(&worker.ring); + if (req->op == RequestOp::Read) { + io_uring_prep_read(sqe, req->fd, req->dst, + req->size, req->offset); + } else { + io_uring_prep_write(sqe, req->fd, req->src, + req->size, req->offset); + } + io_uring_sqe_set_data(sqe, req); + } + } + + io_uring_submit(&worker.ring); + + // 2. 
Wait for completions + io_uring_cqe* cqe; + int ret = io_uring_wait_cqe_timeout(&worker.ring, &cqe, + &timeout); + if (ret == 0) { + // Process completion + Request* req = static_cast<Request*>( + io_uring_cqe_get_data(cqe) + ); + req->status = (cqe->res < 0) ? + RequestStatus::Failed : RequestStatus::Complete; + req->bytes_transferred = (cqe->res > 0) ? cqe->res : 0; + + // Invoke callback + req->callback(req); + + io_uring_cqe_seen(&worker.ring, cqe); + } + } +} +``` + +**Load Balancing**: +```cpp +void IoUringBackend::submit_request(Request& req) { + // Round-robin worker selection + uint32_t worker_idx = next_worker_.fetch_add(1) % workers_.size(); + IoUringWorker& worker = workers_[worker_idx]; + + { + std::lock_guard lock(worker.pending_mutex); + worker.pending.push(&req); + } + worker.pending_cv.notify_one(); +} +``` + +**Testing**: +- Submit requests to multiple workers +- Verify parallel execution +- Check completion order independence +- Measure throughput scaling with worker count + +--- + +### 3.2 Phase 2: Advanced io_uring Features + +**Goal**: Leverage advanced io_uring capabilities for performance + +#### Feature 1: SQPOLL Mode + +**Description**: Kernel-side submission queue polling (eliminates submit syscall) + +**Configuration**: +```cpp +struct io_uring_params params = {}; +params.flags = IORING_SETUP_SQPOLL; +params.sq_thread_idle = 1000; // 1 second idle before sleep + +io_uring_queue_init_params(256, &ring, &params); +``` + +**Benefits**: +- No `io_uring_submit()` syscall needed +- Lower latency for high-frequency submissions +- Kernel thread handles polling + +**Tradeoffs**: +- Extra kernel thread (CPU overhead when idle) +- Requires CAP_SYS_NICE or io_uring_register_iowq_max_workers + +**Use Case**: High-throughput streaming with many small I/Os + +#### Feature 2: IOPOLL Mode + +**Description**: Kernel polls completion directly from device (bypass interrupts) + +**Configuration**: +```cpp +params.flags = IORING_SETUP_IOPOLL; +``` + +**Benefits**: +- Lower 
latency on fast NVMe devices +- Reduced interrupt overhead + +**Requirements**: +- O_DIRECT file I/O +- Polling-capable storage device + +**Use Case**: Ultra-low-latency I/O on NVMe SSDs + +#### Feature 3: Fixed Files + +**Description**: Register file descriptors to avoid fd lookup overhead + +**API**: +```cpp +// Register FDs once +int fds[MAX_FILES]; +io_uring_register_files(&ring, fds, MAX_FILES); + +// Use registered FD index in SQEs +io_uring_prep_read(sqe, fd_index, buf, size, offset); +sqe->flags |= IOSQE_FIXED_FILE; +``` + +**Benefits**: +- Eliminates fd table lookup +- ~10-15% latency reduction + +**Use Case**: Repeatedly accessing same set of files + +#### Feature 4: Linked Operations + +**Description**: Chain dependent operations (e.g., read โ†’ decompress โ†’ write) + +**API**: +```cpp +// Read operation +io_uring_prep_read(sqe1, fd, buf, size, offset); +sqe1->flags |= IOSQE_IO_LINK; + +// Write operation (only if read succeeds) +io_uring_prep_write(sqe2, fd_out, buf, size, 0); +``` + +**Benefits**: +- Atomic operation sequences +- Reduced round-trips + +**Use Case**: Complex I/O workflows (read-modify-write) + +#### Implementation Priority + +1. **Fixed Files** (High): Easy to implement, measurable benefit +2. **SQPOLL** (Medium): Good for high throughput, adds complexity +3. **Linked Ops** (Low): Complex, requires workflow redesign +4. 
**IOPOLL** (Low): Requires O_DIRECT, hardware-specific + +--- + +### 3.3 Phase 3: Compression Integration + +**Goal**: Add decompression support to io_uring backend + +#### Design Challenge + +**Problem**: io_uring is host-only, decompression needs CPU/GPU + +**Solutions**: + +**Option 1: Hybrid Approach** (Recommended) +``` +io_uring (read compressed data) + โ†“ +CPU decompression (in worker thread) + โ†“ +Completion callback +``` + +**Option 2: Separate Decompression Queue** +``` +io_uring (read compressed data) + โ†“ +Enqueue to decompression thread pool + โ†“ (parallel decompression) +Completion callback +``` + +**Option 3: Reject Compression** +``` +if (req.compression != Compression::None) { + return error(EINVAL, "io_uring backend does not support compression"); +} +``` + +**Recommendation**: Option 1 for CPU compression, Option 3 for GPU (hand off to Vulkan backend) + +#### Implementation (Option 1) + +```cpp +void worker_loop(IoUringWorker& worker) { + // After io_uring read completion + if (cqe->res > 0) { + Request* req = static_cast<Request*>( + io_uring_cqe_get_data(cqe) + ); + + // Decompress if needed + if (req->compression != Compression::None) { + bool success = decompress(req); + if (!success) { + req->status = RequestStatus::Failed; + req->errno_value = EIO; + } + } + if (req->status != RequestStatus::Failed) + req->status = RequestStatus::Complete; + req->callback(req); + } +} + +bool decompress(Request* req) { + switch (req->compression) { + case Compression::FakeUppercase: + return fake_uppercase_transform(req->dst, req->size); + case Compression::GDeflate: + return gdeflate_decompress(req->dst, req->size); + default: + return true; // No compression + } +} +``` + +**Testing**: +- Read compressed file via io_uring +- Verify decompression occurs +- Check performance vs CPU backend +- Ensure no blocking in completion path + +--- + +### 3.4 Phase 4: Error Handling and Resilience + +**Goal**: Robust error handling for production use + +#### Error Scenarios + +1. 
**EAGAIN** (queue full): Retry or backpressure +2. **EINTR** (interrupted): Retry operation +3. **EIO** (device error): Report to application +4. **EBADF** (bad fd): Validate before submission +5. **Ring setup failure**: Fallback to CPU backend + +#### Enhanced Error Handling + +```cpp +void handle_cqe_error(Request* req, io_uring_cqe* cqe) { + int err = -cqe->res; + + switch (err) { + case EAGAIN: + // Retry operation + resubmit_request(req); + break; + + case EINTR: + // Interrupted, retry + resubmit_request(req); + break; + + case EIO: + case EBADF: + case EFAULT: + // Fatal error, report to application + req->status = RequestStatus::Failed; + req->errno_value = err; + report_request_error("io_uring", "completion", err, + strerror(err), *req); + req->callback(req); + break; + + default: + // Unknown error + req->status = RequestStatus::Failed; + req->errno_value = err; + report_request_error("io_uring", "completion", err, + "Unknown io_uring error", *req); + req->callback(req); + break; + } +} +``` + +**Retry Logic**: +```cpp +void resubmit_request(Request* req) { + if (req->retry_count++ >= MAX_RETRIES) { + req->status = RequestStatus::Failed; + req->errno_value = ETIMEDOUT; + req->callback(req); + return; + } + + // Exponential backoff + std::this_thread::sleep_for( + std::chrono::milliseconds(1 << req->retry_count) + ); + + // Re-enqueue + submit_request(*req); +} +``` + +--- + +### 3.5 Phase 5: Performance Tuning + +**Goal**: Optimize io_uring backend for maximum throughput + +#### Tuning Parameters + +**Queue Depth**: +```cpp +// Current: 256 entries +// Consider: Configurable based on workload +// - Small files: 128-256 +// - Large files: 512-1024 +// - Streaming: 2048+ +``` + +**Batch Size**: +```cpp +// Submit multiple SQEs at once +// Amortizes syscall overhead +uint32_t batch_size = 16; // Tune based on request rate +``` + +**Worker Count**: +```cpp +// Heuristic: Number of CPU cores or storage devices +uint32_t optimal_workers = std::min( + 
std::thread::hardware_concurrency(), + num_storage_devices +); +``` + +**Polling Interval**: +```cpp +// Balance latency vs CPU usage +struct __kernel_timespec timeout = { + .tv_sec = 0, + .tv_nsec = 1000000 // 1ms (tune based on workload) +}; +``` + +#### Benchmarking + +**Metrics to Track**: +- Throughput (MB/s, IOPS) +- Latency (p50, p95, p99) +- CPU utilization +- Queue depth utilization +- Syscall count (should be minimal with batching) + +**Test Scenarios**: +- Sequential reads (large files) +- Random reads (small files) +- Mixed read/write +- Concurrent requests (stress test) + +**Comparison**: +- io_uring vs CPU backend (pread/pwrite) +- Single worker vs multi-worker +- SQPOLL vs non-SQPOLL + +--- + +## 4. GPU Memory Limitation + +### 4.1 Why GPU Buffers Don't Work + +**Fundamental Limitation**: io_uring operates on host virtual memory +- Kernel I/O subsystem writes to host memory only +- GPU memory (VRAM) is not directly accessible to kernel I/O +- DMA transfers require device-specific drivers + +**Workarounds** (Not Implemented): +1. **GPU memory mapping**: Expose GPU memory to host address space (driver-dependent, slow) +2. **Staging buffers**: io_uring โ†’ host buffer โ†’ GPU copy (defeats purpose) +3. **GPU Direct Storage**: Requires specialized hardware/drivers (NVIDIA GPUDirect Storage) + +### 4.2 Recommended Approach + +**Strategy**: Reject GPU memory requests, hand off to Vulkan backend + +```cpp +void IoUringBackend::submit_request(Request& req) { + // Check for GPU memory + if (req.dst_memory == RequestMemory::Gpu || + req.src_memory == RequestMemory::Gpu) { + report_request_error("io_uring", "submit", EINVAL, + "io_uring backend does not support GPU memory (use Vulkan backend)", + req); + req.status = RequestStatus::Failed; + req.errno_value = EINVAL; + req.callback(&req); + return; + } + + // Proceed with host memory I/O + // ... +} +``` + +**Documentation**: Clearly state io_uring is host-only backend + +--- + +## 5. 
Testing Strategy + +### 5.1 Unit Tests + +**Test Suite**: `tests/io_uring_backend_test.cpp` (already exists) + +**Additional Test Cases**: +1. **Multi-worker**: Submit to multiple workers, verify parallelism +2. **High load**: 1000+ concurrent requests +3. **Error injection**: Simulate EAGAIN, EIO, EINTR +4. **Compression**: Read compressed files, verify decompression +5. **Batching**: Submit batches, measure throughput +6. **Cancellation**: Cancel in-flight requests (if supported) + +### 5.2 Integration Tests + +**Scenarios**: +1. **Asset streaming demo**: Use io_uring backend +2. **Mixed backends**: io_uring for host, Vulkan for GPU +3. **Stress test**: Sustained high I/O rate +4. **Error recovery**: Handle disk full, permission denied + +### 5.3 Performance Tests + +**Benchmarks**: +- Throughput vs CPU backend +- Scalability with worker count +- Latency distribution +- CPU overhead + +**Test Assets**: +- 1MB, 10MB, 100MB files +- Sequential vs random access +- Compressed vs uncompressed + +--- + +## 6. Dependencies + +### 6.1 Build System + +**liburing Detection**: +```cmake +find_package(PkgConfig QUIET) +if (PkgConfig_FOUND) + pkg_check_modules(LIBURING liburing) +endif() + +if (LIBURING_FOUND) + list(APPEND DS_RUNTIME_SOURCES src/ds_runtime_uring.cpp) + target_link_libraries(ds_runtime PUBLIC ${LIBURING_LIBRARIES}) + target_include_directories(ds_runtime PUBLIC ${LIBURING_INCLUDE_DIRS}) + target_compile_definitions(ds_runtime PUBLIC DS_RUNTIME_HAS_IO_URING) +endif() +``` + +**Installation** (Arch Linux, CachyOS): +```bash +sudo pacman -S liburing +``` + +### 6.2 Runtime Requirements + +**Kernel Version**: Linux 5.1+ (5.10+ recommended for stability) +**Capabilities**: None required for basic use, CAP_SYS_NICE for SQPOLL + +--- + +## 7. 
Timeline and Milestones + +### 7.1 Implementation Phases + +**Week 1-2: Multi-Worker** +- Implement worker pool +- Load balancing +- Testing +- **Milestone**: Multi-worker backend functional + +**Week 3: Advanced Features** +- Fixed files support +- SQPOLL mode (optional) +- Testing +- **Milestone**: Advanced features working + +**Week 4: Compression** +- Integrate decompression +- Testing +- **Milestone**: Compression support + +**Week 5-6: Error Handling & Tuning** +- Robust error handling +- Performance tuning +- Benchmarking +- **Milestone**: Production-ready backend + +### 7.2 Total Estimate + +**6 weeks** for complete io_uring backend enhancement + +--- + +## 8. Success Criteria + +### 8.1 Functional Requirements +- โœ… Multi-worker support working +- โœ… All request types supported (read, write) +- โœ… Compression/decompression integrated +- โœ… Error handling robust +- โœ… Existing tests pass +- โœ… New tests pass (100% coverage) + +### 8.2 Performance Requirements +- โœ… Throughput โ‰ฅ 2x CPU backend (large files) +- โœ… Latency โ‰ค 0.5x CPU backend +- โœ… CPU overhead โ‰ค 20% of CPU backend +- โœ… Scales linearly with worker count (up to core count) + +### 8.3 Quality Requirements +- โœ… No memory leaks +- โœ… Thread-safe +- โœ… Graceful degradation on error +- โœ… Documentation complete +- โœ… API stability maintained + +--- + +## 9. Next Steps + +### Immediate (This Week) +1. โœ… Complete investigation document +2. โฉ Install liburing in development environment +3. โฉ Examine current implementation +4. โฉ Design multi-worker architecture + +### Short Term (Next 2 Weeks) +1. โฉ Implement multi-worker support +2. โฉ Test parallelism and load balancing +3. โฉ Benchmark vs CPU backend + +### Medium Term (1-2 Months) +1. โฉ Add advanced features +2. โฉ Integrate compression +3. โฉ Performance tuning +4. โฉ Production-ready release + +--- + +## 10. Open Questions + +1. **Worker Count Default**: What's optimal default for `worker_count`? +2. 
**SQPOLL**: Is SQPOLL worth the complexity for our use case? +3. **Fixed Files**: Should we auto-register frequently accessed files? +4. **GPU Fallback**: Should we automatically hand off GPU requests to Vulkan backend? +5. **Compression**: CPU-only or thread pool for decompression? +6. **Batching**: What's optimal batch size for different workloads? + +--- + +## 11. References + +### Documentation +- liburing documentation: https://github.com/axboe/liburing +- io_uring manpages: `man io_uring` +- Kernel documentation: Documentation/io_uring.txt +- Efficient IO with io_uring (Jens Axboe) + +### Performance +- io_uring performance analysis +- NVMe optimization guides +- Linux I/O stack deep dive + +--- + +**Document Status**: Draft v1.0 +**Last Updated**: 2026-02-16 +**Next Review**: After liburing installation and multi-worker implementation diff --git a/docs/investigation_remaining_features.md b/docs/investigation_remaining_features.md new file mode 100644 index 0000000..4c9e76d --- /dev/null +++ b/docs/investigation_remaining_features.md @@ -0,0 +1,756 @@ +# Request Cancellation, GPU-Resident Workflows, and Wine/Proton Integration + +**Status:** Planning Phase +**Priority:** Medium to High +**Target:** Complete feature set for production DirectStorage-style runtime + +--- + +## Part 1: Request Cancellation + +### 1.1 Current State + +**What Exists**: +- `RequestStatus` enum with values: `Pending`, `InProgress`, `Complete`, `Failed` +- No `Cancelled` status +- No `cancel()` method on Queue +- No cancellation support in backends + +**What's Missing**: +- โŒ `RequestStatus::Cancelled` enum value +- โŒ `Queue::cancel_request(request_id)` method +- โŒ In-flight request tracking for cancellation +- โŒ Backend cancellation hooks +- โŒ Race condition handling (completion vs cancellation) + +### 1.2 Design Requirements + +**Use Cases**: +1. **Timeout**: Cancel requests that take too long +2. **User Action**: User cancels loading operation +3. 
**Priority Change**: Cancel low-priority work to start high-priority +4. **Shutdown**: Cancel all in-flight requests on cleanup + +**Semantics**: +```cpp +// Strong guarantee: Request will not complete after cancel +bool cancel(request_id); + +// Weak guarantee: Request may complete, but won't invoke callback +bool try_cancel(request_id); +``` + +### 1.3 Implementation Plan + +#### Phase 1: API Design + +**Add to Request**: +```cpp +struct Request { + // Existing fields... + std::atomic<bool> cancellation_requested = false; + request_id_t id = 0; // Unique ID for tracking +}; +``` + +**Add to Queue**: +```cpp +class Queue { +public: + // Cancel specific request (returns true if cancelled before completion) + bool cancel_request(request_id_t id); + + // Cancel all pending requests (not yet submitted) + size_t cancel_all_pending(); + + // Cancel all requests (including in-flight) + size_t cancel_all(); +}; +``` + +**Add Status**: +```cpp +enum class RequestStatus { + Pending, + InProgress, + Complete, + Failed, + Cancelled // NEW +}; +``` + +#### Phase 2: Queue Implementation + +**Request Tracking**: +```cpp +class Queue::Impl { + std::unordered_map<request_id_t, Request*> active_requests_; + std::mutex active_mutex_; + std::atomic<request_id_t> next_id_{1}; +}; + +void Queue::enqueue(Request& req) { + req.id = impl_->next_id_.fetch_add(1); + { + std::lock_guard lock(impl_->active_mutex_); + impl_->active_requests_[req.id] = &req; + } + // ... 
existing enqueue logic +} +``` + +**Cancellation**: +```cpp +bool Queue::cancel_request(request_id_t id) { + std::lock_guard lock(impl_->active_mutex_); + + auto it = impl_->active_requests_.find(id); + if (it == impl_->active_requests_.end()) { + return false; // Already completed or never existed + } + + Request* req = it->second; + + // Mark as cancellation requested + req->cancellation_requested.store(true, std::memory_order_release); + + // If still pending (not submitted), remove immediately + if (req->status == RequestStatus::Pending) { + req->status = RequestStatus::Cancelled; + impl_->active_requests_.erase(it); + return true; + } + + // If in-flight, backend must handle cancellation + // Return false to indicate "in progress, might complete" + return false; +} +``` + +#### Phase 3: Backend Support + +**CPU Backend**: +```cpp +void CpuBackend::process_request(Request& req) { + // Check cancellation before I/O + if (req.cancellation_requested.load(std::memory_order_acquire)) { + req.status = RequestStatus::Cancelled; + req.callback(&req); + return; + } + + // Perform I/O + ssize_t bytes = pread(req.fd, req.dst, req.size, req.offset); + + // Check cancellation after I/O (before callback) + if (req.cancellation_requested.load(std::memory_order_acquire)) { + req.status = RequestStatus::Cancelled; + req.callback(&req); + return; + } + + // Normal completion + req.status = RequestStatus::Complete; + req.callback(&req); +} +``` + +**Vulkan Backend**: +```cpp +// Harder to cancel GPU work in progress +// Strategy: Don't invoke callback if cancelled +void VulkanBackend::complete_request(Request& req) { + if (req.cancellation_requested.load(std::memory_order_acquire)) { + req.status = RequestStatus::Cancelled; + } + + req.callback(&req); +} +``` + +**io_uring Backend**: +```cpp +// Can cancel SQE before submission +bool IoUringBackend::cancel_sqe(Request& req) { + // Remove from pending queue if not yet submitted + std::lock_guard lock(pending_mutex_); + auto it = 
std::find(pending_.begin(), pending_.end(), &req); + if (it != pending_.end()) { + pending_.erase(it); + req.status = RequestStatus::Cancelled; + return true; + } + return false; // Already submitted +} +``` + +### 1.4 Testing + +**Test Cases**: +1. Cancel pending request (before submit) +2. Cancel in-flight request (during I/O) +3. Cancel completed request (should fail) +4. Cancel non-existent request (should fail) +5. Race: cancel vs completion +6. Cancel all requests +7. Callback not invoked for cancelled request + +**Test File**: `tests/cancellation_test.cpp` + +### 1.5 Timeline + +**2-3 weeks** for complete cancellation support + +--- + +## Part 2: GPU-Resident Workflows + +### 2.1 Motivation + +**Goal**: Zero-copy disk โ†’ GPU data path + +**Traditional Path** (current): +``` +Disk โ†’ Host Staging Buffer โ†’ GPU Buffer + [copy 1] [copy 2] +``` + +**GPU-Resident Path** (target): +``` +Disk โ†’ GPU Buffer (direct) + [copy 1 only] +``` + +### 2.2 DirectStorage GPU Upload Heap + +**Microsoft DirectStorage Concept**: +- GPU upload heap: CPU-visible, GPU-accessible memory +- Direct writes from storage controller to GPU memory +- Requires hardware support (PCIe peer-to-peer, GPU Direct Storage) + +**Linux Equivalent**: +- **NVIDIA GPUDirect Storage**: Kernel driver enables direct NVMe โ†’ GPU transfers +- **AMD equivalent**: DirectGMA (less documented) +- **Standard Vulkan**: No direct disk โ†’ GPU (must use staging) + +### 2.3 Implementation Strategies + +#### Strategy 1: Vulkan External Memory (Current) + +**Approach**: Staging buffer + GPU copy (already implemented) + +**Pros**: +- Works on all Vulkan hardware +- Portable across vendors +- Already implemented + +**Cons**: +- Extra copy (staging โ†’ GPU) +- Higher latency +- More memory usage + +#### Strategy 2: GPU Direct Storage Integration + +**Approach**: Integrate with vendor-specific APIs + +**NVIDIA GPUDirect Storage**: +```cpp +// Open file with GDS flags +int fd = open(path, O_RDONLY | O_DIRECT); + +// 
Register GPU buffer with GDS +cuFileDriverOpen(); +CUfileHandle_t handle; +cuFileHandleRegister(&handle, &cufile_desc); + +// Direct read to GPU memory +cuFileRead(handle, gpu_buffer, size, offset, 0); +``` + +**Pros**: +- Zero extra copies +- Lowest latency +- Highest throughput + +**Cons**: +- NVIDIA-only (no AMD/Intel equivalent) +- Requires special driver setup +- O_DIRECT alignment requirements +- Complex integration + +#### Strategy 3: Memory-Mapped Files + GPU Upload + +**Approach**: mmap file, map to GPU upload heap + +**Implementation**: +```cpp +// Map file to host memory +void* mapped = mmap(nullptr, file_size, PROT_READ, + MAP_SHARED, fd, 0); + +// Allocate GPU upload heap (CPU-visible, GPU-accessible) +VkBuffer upload_buffer = create_upload_buffer(device); +void* gpu_mapped = map_buffer(upload_buffer); + +// Copy file data to upload heap +memcpy(gpu_mapped, mapped, file_size); + +// Unmap +munmap(mapped, file_size); +unmap_buffer(upload_buffer); + +// Use upload buffer directly in GPU (no staging copy needed) +``` + +**Pros**: +- Simpler than GDS +- Works across vendors +- Reduces staging buffer usage + +**Cons**: +- Still one copy (mmap โ†’ GPU) +- Page cache overhead +- Not true "direct to GPU" + +### 2.4 Recommended Approach + +**Phase 1: Optimize Current Path** +- Reuse staging buffers (pool) +- Async staging โ†’ GPU copy (don't wait) +- Batch multiple requests + +**Phase 2: Vendor-Specific Paths** (Optional) +- Add GDS backend for NVIDIA +- Conditional compilation (#ifdef NVIDIA_GDS) +- Fallback to standard path + +**Phase 3: Future Hardware** +- Wait for standardized GPU Direct Storage in Vulkan +- Integrate when available + +### 2.5 GPU-to-GPU Transfers + +**Use Case**: Texture decompression GPU โ†’ GPU + +**Current Path**: +``` +Disk โ†’ Staging โ†’ GPU Compressed Buffer โ†’ GPU Decompressed Buffer + [compute shader] +``` + +**Optimization**: +``` +Disk โ†’ GPU Compressed Buffer โ†’ GPU Decompressed Buffer + [single command buffer] +``` + 
+**Implementation**: Already supported via Vulkan backend + compute pipelines + +### 2.6 Testing + +**Benchmarks**: +- Staging vs direct (if GDS available) +- Throughput (MB/s) +- Latency (ms) +- CPU overhead (%) + +**Validation**: +- Data integrity (checksums) +- Memory usage +- GPU utilization + +### 2.7 Timeline + +**Phase 1 (Optimization)**: 2 weeks +**Phase 2 (GDS Integration)**: 4-6 weeks (if needed) + +--- + +## Part 3: Wine/Proton Integration + +### 3.1 Architecture Overview + +**Goal**: Enable Windows DirectStorage games to run on Linux via Proton + +**Strategy**: +``` +Windows Game (DirectStorage API) + โ†“ +Wine/Proton dstorage.dll Shim + โ†“ (translate calls) +ds-runtime (Linux native) + โ†“ (execute) +Linux Kernel (io_uring, Vulkan) +``` + +### 3.2 Integration Approaches + +#### Approach 1: PE DLL Shim (Recommended) + +**Architecture**: +``` +dstorage.dll (PE) - Windows ABI + โ†“ dlopen +libds_runtime.so - Linux ABI +``` + +**Implementation**: +1. Create `dstorage.dll` (Wine builtin DLL) +2. Implement DirectStorage API entry points +3. Forward to `libds_runtime.so` via C ABI +4. Translate types (HANDLE โ†’ fd, etc.) + +**Example**: +```cpp +// dstorage.dll (Wine) +HRESULT WINAPI DStorageCreateQueue( + const DSTORAGE_QUEUE_DESC* desc, + REFIID riid, + void** ppv +) { + // Load libds_runtime.so + void* handle = dlopen("libds_runtime.so", RTLD_NOW); + + // Get C API functions + auto ds_create_queue = (ds_queue_t* (*)(ds_backend_t*)) + dlsym(handle, "ds_create_queue"); + + // Create backend + ds_backend_t* backend = ds_make_cpu_backend(); + + // Create queue + ds_queue_t* queue = ds_create_queue(backend); + + // Wrap in COM object + *ppv = new DStorageQueueImpl(queue); + return S_OK; +} +``` + +#### Approach 2: Direct Integration (No Shim) + +**Architecture**: +``` +Wine/Proton DirectStorage Implementation + โ†“ (link directly) +libds_runtime_static.a +``` + +**Implementation**: +1. Build ds-runtime as static library +2. 
Link into Wine dlls/dstorage build +3. Call C++ API directly (no PE/ELF bridge) +4. Share Vulkan device with vkd3d-proton + +**Pros**: +- No dlopen overhead +- Simpler debugging +- Shared Vulkan context + +**Cons**: +- Tighter coupling +- Requires Wine build modifications + +#### Approach 3: Kernel Module (Experimental) + +**Architecture**: +``` +DirectStorage Requests + โ†“ +ioctl to kernel module + โ†“ +Kernel-side I/O handling +``` + +**Not Recommended**: Too complex, overkill for userspace I/O + +### 3.3 Type Mapping + +**Windows โ†’ Linux Translation**: + +| Windows Type | Linux Type | Conversion | +|--------------|------------|------------| +| `HANDLE` | `int` | `fd = _open_osfhandle(handle)` | +| `DSTORAGE_REQUEST` | `ds_request` | Struct field mapping | +| `ID3D12Resource*` | `VkBuffer` | vkd3d-proton interop | +| `DSTORAGE_COMPRESSION` | `ds_compression_t` | Enum mapping | +| `OVERLAPPED` | Completion callback | Async model | + +**Example Struct Mapping**: +```cpp +void translate_request( + const DSTORAGE_REQUEST_DESC* windows_req, + ds_request* linux_req +) { + linux_req->fd = get_fd_from_handle(windows_req->Source.File.Handle); + linux_req->offset = windows_req->Source.File.Offset; + linux_req->size = windows_req->Source.File.Size; + linux_req->dst = get_buffer_pointer(windows_req->Destination); + linux_req->op = (windows_req->DestinationType == DSTORAGE_REQUEST_DESTINATION_MEMORY) + ? 
DS_REQUEST_OP_READ : DS_REQUEST_OP_WRITE; + linux_req->compression = translate_compression( + windows_req->CompressionFormat + ); +} +``` + +### 3.4 Vulkan Device Sharing + +**Challenge**: DirectStorage expects D3D12 device, we need Vulkan + +**Solution**: vkd3d-proton already handles D3D12 โ†’ Vulkan translation + +**Integration**: +```cpp +// Get Vulkan device from vkd3d-proton +VkDevice vk_device = vkd3d_get_vk_device(d3d12_device); +VkQueue vk_queue = vkd3d_get_vk_queue(d3d12_device); + +// Create ds-runtime Vulkan backend with shared device +ds_vulkan_backend_config config; +config.device = vk_device; +config.queue = vk_queue; +config.take_ownership = false; // Don't destroy device + +ds_backend_t* backend = ds_make_vulkan_backend(&config); +``` + +### 3.5 Implementation Steps + +#### Step 1: C ABI Wrapper (Already Exists) + +**Status**: โœ… Complete +- `include/ds_runtime_c.h` provides C API +- Type conversions implemented +- Tested with `c_abi_stats_test.c` + +#### Step 2: Create dstorage.dll Skeleton + +**Location**: Outside ds-runtime repo (in Wine tree) + +**Files**: +``` +dlls/dstorage/ +โ”œโ”€โ”€ Makefile.in +โ”œโ”€โ”€ dstorage.spec +โ”œโ”€โ”€ dstorage_main.c +โ”œโ”€โ”€ queue.c +โ””โ”€โ”€ request.c +``` + +**Implement**: +- `DStorageGetFactory` +- `DStorageSetConfiguration` +- `IDStorageFactory::CreateQueue` +- `IDStorageQueue::EnqueueRequest` +- `IDStorageQueue::Submit` +- `IDStorageQueue::EnqueueSignal` + +#### Step 3: Link with ds-runtime + +**Option A: Dynamic Linking** +```makefile +EXTRADLLFLAGS = -Wl,--no-undefined +EXTRALIBS = -lds_runtime +``` + +**Option B: Static Linking** +```makefile +EXTRALIBS = $(LIBDS_RUNTIME_STATIC) +``` + +#### Step 4: Test with Real Games + +**Test Titles**: +- Forspoken (uses DirectStorage) +- Ratchet & Clank: Rift Apart +- Any UE5 game with DirectStorage support + +**Validation**: +- Game launches without crashes +- Asset loading works +- Performance acceptable +- No memory leaks + +### 3.6 Documentation + 
+**Create**: `docs/wine_integration_guide.md` + +**Contents**: +- Build dstorage.dll +- Configure Wine to use builtin override +- Debugging tips +- Performance tuning +- Known issues + +### 3.7 Timeline + +**Week 1-2: Prototype** +- Create basic dstorage.dll shim +- Implement skeleton COM interfaces +- Test with simple DirectStorage app + +**Week 3-4: Type Mapping** +- Implement full type conversion +- Handle edge cases +- Vulkan device sharing + +**Week 5-6: Testing** +- Test with real games +- Performance benchmarking +- Bug fixing + +**Week 7-8: Polish** +- Documentation +- Error handling +- Wine upstreaming (if desired) + +**Total Estimate**: **8 weeks** + +--- + +## Part 4: Master Implementation Roadmap + +### 4.1 Dependency Graph + +``` +GDeflate CPU โ”โ”โ”โ”“ + โ”ƒ +Vulkan Compute โ”โ•‹โ”โ”> GDeflate GPU + โ”ƒ +io_uring Multi โ”ƒ + โ”ƒ +Cancellation โ”โ”โ”โ•‹โ”โ”> GPU Workflows + โ”ƒ + โ”—โ”โ”> Wine/Proton Integration +``` + +### 4.2 Phased Implementation + +**Phase 1: Foundation** (Weeks 1-8) +- โœ… Initial assessment (complete) +- โฉ GDeflate research (2 weeks) +- โฉ Vulkan compute infrastructure (8 weeks, parallel) + +**Phase 2: Core Features** (Weeks 9-18) +- โฉ GDeflate CPU implementation (5 weeks) +- โฉ io_uring multi-worker (6 weeks, parallel) +- โฉ Request cancellation (3 weeks, parallel) + +**Phase 3: Advanced Features** (Weeks 19-28) +- โฉ GDeflate GPU implementation (6 weeks) +- โฉ GPU workflow optimization (4 weeks) + +**Phase 4: Integration** (Weeks 29-36) +- โฉ Wine/Proton shim (8 weeks) +- โฉ Real game testing +- โฉ Performance tuning + +**Total Timeline**: **36 weeks (9 months)** + +### 4.3 Parallelization Opportunities + +**Can Work in Parallel**: +- Vulkan compute + GDeflate research +- GDeflate CPU + io_uring enhancements +- GDeflate CPU + cancellation +- GPU workflows + Wine integration + +**Must Be Sequential**: +- Vulkan compute โ†’ GDeflate GPU +- GDeflate CPU โ†’ GDeflate GPU +- Core features โ†’ Wine 
integration + +### 4.4 Fast Track Option + +**Goal**: Minimal viable product in 12 weeks + +**Scope**: +- โœ… CPU backend (working) +- โฉ GDeflate CPU (5 weeks) +- โฉ Vulkan compute (8 weeks, start week 1) +- โฉ Basic Wine shim (3 weeks) +- โŒ Skip: GPU GDeflate, io_uring multi-worker, advanced features + +**Timeline**: **12 weeks** + +--- + +## Part 5: Success Criteria + +### 5.1 Functional Requirements + +**Core**: +- โœ… All features work independently +- โœ… Integration tests pass +- โœ… No regressions in existing functionality +- โœ… Documentation complete + +**Performance**: +- โœ… GDeflate CPU: โ‰ฅ 500 MB/s +- โœ… GDeflate GPU: โ‰ฅ 2 GB/s +- โœ… io_uring: โ‰ฅ 2x CPU backend +- โœ… Wine overhead: < 10% + +**Quality**: +- โœ… No memory leaks +- โœ… Thread-safe +- โœ… Vulkan validation clean +- โœ… Works on CachyOS/Arch Linux + +### 5.2 Wine/Proton Validation + +**Required**: +- โœ… At least one DirectStorage game runs +- โœ… Asset loading works correctly +- โœ… Performance within 20% of Windows +- โœ… No crashes or hangs + +--- + +## Part 6: Risk Assessment + +### 6.1 Technical Risks + +| Risk | Probability | Impact | Mitigation | +|------|-------------|--------|------------| +| GDeflate format unavailable | Medium | High | Reverse engineer, community collaboration | +| GPU compute too slow | Low | Medium | Optimize shaders, fallback to CPU | +| Wine integration complex | High | Medium | Start simple, iterate | +| Hardware incompatibility | Medium | High | Test on multiple GPUs, provide fallbacks | + +### 6.2 Timeline Risks + +| Risk | Impact | Mitigation | +|------|--------|------------| +| GDeflate research longer than expected | +4 weeks | Start GPU work in parallel | +| Wine upstreaming delays | +8 weeks | Maintain out-of-tree fork | +| Testing reveals bugs | +2-4 weeks | Allocate buffer time | + +--- + +## Part 7: Next Actions + +### Immediate (This Week) +1. โœ… Complete investigation documents +2. โฉ Begin GDeflate format research +3. 
โฉ Start Vulkan compute implementation +4. โฉ Install liburing for io_uring testing + +### Short Term (Weeks 2-4) +1. โฉ Implement shader module loading +2. โฉ Begin GDeflate CPU decoder +3. โฉ Design cancellation API +4. โฉ Test io_uring multi-worker prototype + +### Medium Term (Weeks 5-12) +1. โฉ Complete Vulkan compute pipelines +2. โฉ Finish GDeflate CPU implementation +3. โฉ Implement request cancellation +4. โฉ Start Wine shim prototype + +--- + +**Document Status**: Draft v1.0 +**Last Updated**: 2026-02-16 +**Next Review**: After Phase 1 milestones complete diff --git a/docs/investigation_summary.md b/docs/investigation_summary.md new file mode 100644 index 0000000..74e6246 --- /dev/null +++ b/docs/investigation_summary.md @@ -0,0 +1,386 @@ +# DS-Runtime Investigation Phase Summary + +**Date**: February 16, 2026 +**Phase**: Investigation & Planning (Phase 0) +**Status**: โœ… Complete + +--- + +## Overview + +A comprehensive investigation and planning phase has been completed for the ds-runtime project, producing detailed roadmaps for implementing a complete DirectStorage-style I/O and decompression pipeline natively on Linux. 
+ +--- + +## Deliverables + +### Investigation Documents (110KB+ of Analysis) + +| Document | Size | Focus | Timeline | +|----------|------|-------|----------| +| [Master Roadmap](master_roadmap.md) | 30KB | Complete phased plan | 36 weeks / 12 week MVP | +| [GDeflate Investigation](investigation_gdeflate.md) | 16KB | Compression implementation | 9-13 weeks | +| [Vulkan Compute](investigation_vulkan_compute.md) | 26KB | GPU compute pipelines | 8 weeks | +| [io_uring Backend](investigation_io_uring.md) | 20KB | Multi-worker enhancement | 6 weeks | +| [Additional Features](investigation_remaining_features.md) | 18KB | Cancellation, GPU workflows, Wine | 3-8 weeks each | + +--- + +## Current Project Status + +### โœ… Working Features (Phase 0 Complete) +- CPU backend with thread pool +- Read/write operations (pread/pwrite) +- FakeUppercase demo compression +- Error reporting with rich context +- Request completion tracking +- C ABI for Wine/Proton integration +- Basic test suite (4 tests, 100% pass rate) +- CMake build system with C++20 + +**Build Status**: โœ… Compiles cleanly, all tests pass + +### โš ๏ธ Partially Implemented +- **Vulkan backend**: Staging buffer copies work, compute pipelines missing +- **io_uring backend**: Single worker only, needs multi-worker support +- **Compression**: FakeUppercase works, GDeflate intentionally stubbed + +### โŒ Not Implemented +- GDeflate compression/decompression (CPU & GPU) +- Vulkan GPU compute pipelines +- Request cancellation +- GPU-resident workflow optimizations +- Wine/Proton dstorage.dll shim + +--- + +## Implementation Roadmap + +### Timeline Summary + +``` +Phase 1: Foundation & Research Weeks 1-8 โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘ +Phase 2: Core Features Weeks 9-18 โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘ +Phase 3: Advanced GPU Weeks 19-28 
โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘ +Phase 4: Wine/Proton Integration Weeks 29-36 โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ + +Total: 36 weeks (9 months) for complete implementation +MVP Option: 12 weeks (3 months) for basic functionality +``` + +### Phase Breakdown + +#### Phase 1: Foundation & Research (Weeks 1-8) +**Parallel Tracks**: +- Track A: GDeflate format research and specification (3 weeks) +- Track B: Vulkan compute infrastructure (8 weeks) + - Shader module loading + - Descriptor management + - Pipeline creation + - Compute dispatch + +**Deliverables**: +- โœ… GDeflate format specification +- โœ… Vulkan compute capability + +#### Phase 2: Core Features (Weeks 9-18) +**Parallel Tracks**: +- Track A: GDeflate CPU implementation (5 weeks) + - Block header parser + - DEFLATE integration (zlib) + - Backend integration + +- Track B: io_uring multi-worker (6 weeks) + - Worker pool architecture + - Load balancing + - Advanced features (fixed files, SQPOLL) + +- Track C: Request cancellation (4 weeks) + - API design + - Queue implementation + - Backend integration + +**Deliverables**: +- โœ… GDeflate CPU decompression working +- โœ… io_uring production-ready +- โœ… Cancellation implemented + +#### Phase 3: Advanced GPU Features (Weeks 19-28) +**Sequential**: +- GDeflate GPU implementation (6 weeks) + - GPU decompression shader + - Pipeline integration + - CPU/GPU hybrid strategy + +- GPU-resident workflow optimization (4 weeks) + - Memory pooling + - Async transfers + - GPU-to-GPU optimization + +**Deliverables**: +- โœ… GPU-accelerated decompression +- โœ… Optimized GPU workflows + +#### Phase 4: Wine/Proton Integration (Weeks 29-36) +**Sequential**: +- dstorage.dll shim development (4 weeks) + - COM interface skeleton + - Type mapping (Windows โ†’ Linux) + - Queue implementation + +- Testing and integration (4 weeks) + - Integration 
testing + - Real game testing + - Performance optimization + - Documentation + +**Deliverables**: +- โœ… Wine/Proton support for DirectStorage games +- โœ… Complete documentation + +--- + +## MVP Fast Track (12 Weeks) + +**Reduced Scope for Rapid Validation**: +- โœ… CPU backend (already complete) +- Week 1-5: GDeflate CPU implementation +- Week 1-8: Basic Vulkan compute (parallel) +- Week 9-12: Simple Wine shim + +**Deferred**: +- โŒ GDeflate GPU +- โŒ io_uring multi-worker +- โŒ Request cancellation +- โŒ GPU optimizations + +**Goal**: Functional DirectStorage support in 3 months for testing + +--- + +## Technical Analysis + +### Critical Path + +``` +GDeflate Research โ†’ GDeflate CPU โ†’ GDeflate GPU + โ†“ โ†“ โ†“ +Vulkan Compute โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ†“ +Wine/Proton Integration +``` + +**Key Dependencies**: +- GDeflate GPU requires Vulkan compute pipelines +- Wine integration requires GDeflate CPU (minimum) +- GPU optimizations require GPU decompression + +### Parallelization Opportunities + +**Can Work in Parallel**: +1. GDeflate research + Vulkan compute (Weeks 1-8) +2. GDeflate CPU + io_uring multi-worker (Weeks 9-13) +3. GDeflate CPU + Request cancellation (Weeks 9-13) +4. GPU workflows + Wine integration planning + +**Must Be Sequential**: +1. Vulkan compute โ†’ GDeflate GPU +2. GDeflate CPU โ†’ GDeflate GPU +3. 
Core features โ†’ Wine integration + +--- + +## Performance Targets + +| Component | Target | Measurement | +|-----------|--------|-------------| +| **GDeflate CPU** | โ‰ฅ 500 MB/s | Decompression throughput | +| **GDeflate GPU** | โ‰ฅ 2 GB/s | Decompression throughput | +| **io_uring** | โ‰ฅ 2x CPU backend | File I/O throughput | +| **Vulkan Compute** | < 100 ยตs overhead | Dispatch latency | +| **Wine/Proton** | < 10% overhead | vs native Linux | + +--- + +## Risk Assessment + +### High-Impact Risks + +| Risk | Probability | Mitigation | +|------|-------------|------------| +| **GDeflate format unavailable** | Medium | Reverse engineer, community collaboration | +| **Wine integration complex** | High | Incremental approach, Wine dev consultation | +| **Hardware compatibility** | Medium | Test multiple GPUs, provide CPU fallback | + +### Timeline Risks + +| Risk | Impact | Mitigation | +|------|--------|------------| +| **GDeflate research > 3 weeks** | +4 weeks | Start GPU work in parallel | +| **GPU shader optimization** | +2 weeks | Accept initial performance, iterate | +| **Wine upstreaming delays** | +8 weeks | Maintain out-of-tree fork initially | + +--- + +## Success Criteria + +### Functional Requirements +- โœ… All features work independently +- โœ… Comprehensive test coverage (โ‰ฅ80%) +- โœ… No memory leaks (valgrind clean) +- โœ… Thread-safe operations +- โœ… Vulkan validation layers pass +- โœ… At least one DirectStorage game runs on Wine/Proton + +### Performance Requirements +- โœ… GDeflate CPU: โ‰ฅ 500 MB/s throughput +- โœ… GDeflate GPU: โ‰ฅ 2 GB/s throughput +- โœ… io_uring: โ‰ฅ 2x CPU backend performance +- โœ… GPU utilization โ‰ฅ 80% during compute +- โœ… Wine/Proton overhead < 10% vs Windows + +### Quality Requirements +- โœ… Complete documentation for all features +- โœ… API stability maintained (no breaking changes) +- โœ… Works on CachyOS/Arch Linux +- โœ… Multi-GPU vendor support (AMD, NVIDIA, Intel) + +--- + +## Resource Requirements + 
+### Development Environment +- CachyOS or Arch Linux system +- Vulkan-capable GPU (โ‰ฅ Vulkan 1.0) +- liburing library (optional) +- glslangValidator (shader compilation) +- RenderDoc (GPU debugging, optional) + +### External Dependencies +- **Required**: CMake 3.16+, C++20 compiler, pthreads +- **Optional**: Vulkan SDK, liburing, Wine/Proton for testing + +### Hardware +- **Minimum**: CPU with โ‰ฅ4 cores, 8GB RAM, Vulkan 1.0 GPU +- **Recommended**: CPU with โ‰ฅ8 cores, 16GB RAM, Vulkan 1.3 GPU, NVMe SSD + +--- + +## Next Steps + +### Immediate Actions (Week 1) +1. โœ… Investigation documents complete (done) +2. โฉ Begin GDeflate format specification research +3. โฉ Start Vulkan shader module loading implementation +4. โฉ Install liburing for io_uring development +5. โฉ Set up shader build system in CMake + +### Short-Term Goals (Weeks 2-4) +1. Complete GDeflate format documentation +2. Implement Vulkan descriptor management +3. Begin GDeflate block header parser +4. Design request cancellation API +5. Test io_uring multi-worker prototype + +### Medium-Term Goals (Weeks 5-12) +1. Complete GDeflate CPU implementation +2. Finish Vulkan compute pipelines +3. Implement io_uring multi-worker +4. Add request cancellation +5. 
Comprehensive testing of all components + +--- + +## Detailed Microtasking + +The investigation has produced **150+ microtasks** across all phases: +- Each task sized at 0.5-5 days +- Clear dependencies identified +- Parallel work opportunities mapped +- Testing integrated at every phase +- Documentation requirements specified + +Example microtask breakdown (Phase 1.2.1 - Shader Module System): +- Task 1.2.1.1: Implement shader file loading (1 day) +- Task 1.2.1.2: Create VkShaderModule wrapper (1 day) +- Task 1.2.1.3: Implement shader caching (2 days) +- Task 1.2.1.4: Test shader loading (1 day) +- Task 1.2.1.5: Set up shader build system (2 days) + +--- + +## Documentation Structure + +``` +docs/ +โ”œโ”€โ”€ investigation_gdeflate.md # 16KB - GDeflate implementation plan +โ”œโ”€โ”€ investigation_vulkan_compute.md # 26KB - GPU compute infrastructure +โ”œโ”€โ”€ investigation_io_uring.md # 20KB - io_uring backend enhancement +โ”œโ”€โ”€ investigation_remaining_features.md # 18KB - Cancellation, GPU, Wine +โ”œโ”€โ”€ master_roadmap.md # 30KB - Complete phased plan +โ”œโ”€โ”€ investigation_summary.md # 7KB - This document +โ”‚ +โ”œโ”€โ”€ design.md # Architecture overview +โ”œโ”€โ”€ wine_proton.md # Wine/Proton integration notes +โ””โ”€โ”€ archlinux_vulkan_integration.md # CachyOS/Arch specific notes +``` + +Total Investigation Documentation: **117KB** of detailed planning + +--- + +## Conclusion + +This investigation phase has established a comprehensive foundation for completing the ds-runtime project: + +**Strengths**: +- โœ… Clear roadmap with detailed microtasks +- โœ… Realistic timelines with parallel work identified +- โœ… Risk assessment and mitigation strategies +- โœ… Multiple scope options (MVP vs full) +- โœ… Solid existing foundation (CPU backend working) + +**Readiness**: +- All major technical questions answered +- Dependencies and blockers identified +- Testing strategy defined +- Success criteria established +- Resource requirements documented + 
+**Recommendation**: +- โœ… Ready to proceed to implementation +- โœ… Begin with parallel Phase 1 tracks +- โœ… Maintain weekly progress reports +- โœ… Iterate on plan based on discoveries + +The project can begin active development immediately with high confidence in the approach and timeline. + +--- + +**Status**: Investigation Phase Complete โœ… +**Next Phase**: Phase 1 - Foundation & Research +**Start Date**: Week 1 (after approval) +**First Milestone**: Week 3 - GDeflate format specification complete + +--- + +## Appendix: Key Metrics + +| Metric | Value | +|--------|-------| +| **Investigation Documents** | 5 major documents | +| **Total Documentation** | 117KB | +| **Microtasks Identified** | 150+ tasks | +| **Total Timeline** | 36 weeks (full) / 12 weeks (MVP) | +| **Test Files** | 12+ new tests planned | +| **New Source Files** | 8+ implementation files | +| **Phases** | 4 major, 12 sub-phases | +| **Performance Targets** | 5 key metrics defined | +| **Risk Items** | 6 high-impact risks identified | +| **Success Criteria** | 15 functional + performance + quality | + +--- + +**Document Version**: 1.0 +**Author**: Investigation Phase Team +**Last Updated**: 2026-02-16 diff --git a/docs/investigation_vulkan_compute.md b/docs/investigation_vulkan_compute.md new file mode 100644 index 0000000..e072da1 --- /dev/null +++ b/docs/investigation_vulkan_compute.md @@ -0,0 +1,996 @@ +# Vulkan GPU Compute Pipeline Investigation and Implementation Plan + +**Status:** Planning Phase +**Priority:** High +**Target:** Full GPU compute capability for decompression and data processing +**Dependencies:** Existing Vulkan device/queue infrastructure + +--- + +## Executive Summary + +The current Vulkan backend in ds-runtime supports staging buffer copies but lacks compute pipeline functionality. This document outlines the investigation, design, and implementation plan for adding full GPU compute capabilities, primarily for GPU-accelerated decompression. + +--- + +## 1. 
Current State + +### 1.1 What Works + +**Existing Vulkan Infrastructure** (`src/ds_runtime_vulkan.cpp`): +- โœ… Vulkan instance creation +- โœ… Physical device selection +- โœ… Logical device and queue creation +- โœ… Command pool management +- โœ… Staging buffer allocation (host-visible) +- โœ… GPU buffer allocation (device-local) +- โœ… `vkCmdCopyBuffer` for staging โ†” GPU transfers +- โœ… Synchronization via `vkDeviceWaitIdle` +- โœ… Memory type selection and allocation +- โœ… Request submission and completion tracking + +**Capability**: File I/O โ†’ Staging buffer โ†’ GPU buffer (pure data transfer, no computation) + +### 1.2 What's Missing + +**Compute Pipeline Components**: +- โŒ Compute pipeline creation (`vkCreateComputePipelines`) +- โŒ Shader module loading (`vkCreateShaderModule`) +- โŒ Descriptor set layout creation +- โŒ Descriptor pool allocation +- โŒ Descriptor set updates (buffer bindings) +- โŒ Pipeline layout creation +- โŒ Compute command recording (`vkCmdBindPipeline`, `vkCmdDispatch`) +- โŒ Push constant support +- โŒ Compute-specific synchronization (barriers) + +**Impact**: Cannot execute any GPU compute workloads (decompression, transforms, etc.) + +### 1.3 Existing Assets (Unused) + +**SPIR-V Shader**: `examples/vk-copy-test/copy.comp.spv` (256 bytes) +- Precompiled compute shader +- Currently not loaded or used by any code +- Likely a simple buffer copy/transform shader +- Should be examined and potentially reused + +--- + +## 2. 
Vulkan Compute Architecture + +### 2.1 Compute Pipeline Overview + +``` +Application + โ†“ (prepare compute work) +Descriptor Sets (bind buffers, uniforms) + โ†“ +Pipeline Layout (descriptor layouts + push constants) + โ†“ +Compute Pipeline (shader + configuration) + โ†“ +Command Buffer (vkCmdBindPipeline, vkCmdDispatch) + โ†“ +GPU Execution (workgroups โ†’ invocations) + โ†“ +Synchronization (barriers, fences) + โ†“ +Results in GPU buffers +``` + +### 2.2 Key Vulkan Objects + +| Object | Purpose | Lifetime | +|--------|---------|----------| +| **VkShaderModule** | Compiled SPIR-V code | Per-shader, reusable | +| **VkDescriptorSetLayout** | Binding layout (types, stages) | Per-layout, reusable | +| **VkDescriptorPool** | Allocation pool for descriptor sets | Per-backend, managed | +| **VkDescriptorSet** | Actual buffer bindings | Per-dispatch, short-lived | +| **VkPipelineLayout** | Push constants + descriptor layouts | Per-pipeline, reusable | +| **VkPipeline** | Complete compute configuration | Per-shader, reusable | +| **VkCommandBuffer** | Recorded GPU commands | Per-submission, pooled | + +### 2.3 Execution Model + +**Workgroup Hierarchy**: +``` +Global Work Size (e.g., 1024 elements) + โ†“ divide by local_size_x +Workgroups (e.g., 4 workgroups if local_size_x = 256) + โ†“ parallel execution +Invocations (256 invocations per workgroup) + โ†“ +Each invocation processes one element +``` + +**Shader Invocation IDs**: +- `gl_GlobalInvocationID`: Unique ID across all invocations +- `gl_LocalInvocationID`: ID within the workgroup +- `gl_WorkGroupID`: Workgroup index + +--- + +## 3. Implementation Plan + +### 3.1 Phase 1: Shader Module Loading + +**Goal**: Load and create VkShaderModule from SPIR-V bytecode + +#### Tasks +1. Add SPIR-V loading utility function +2. Implement shader module creation +3. Add shader module caching (avoid redundant loads) +4. 
Validate shader compilation
+
+#### API Design
+
+**Location**: `src/ds_runtime_vulkan.cpp`
+
+```cpp
+class ShaderModuleCache {
+public:
+    VkShaderModule load_shader(
+        VkDevice device,
+        const std::string& path
+    );
+
+    void destroy_all(VkDevice device);
+
+private:
+    std::unordered_map<std::string, VkShaderModule> modules_;
+};
+
+// Add to VulkanBackend::Impl
+ShaderModuleCache shader_cache_;
+```
+
+#### Implementation
+
+```cpp
+VkShaderModule ShaderModuleCache::load_shader(
+    VkDevice device,
+    const std::string& path
+) {
+    // Check cache
+    auto it = modules_.find(path);
+    if (it != modules_.end()) {
+        return it->second;
+    }
+
+    // Read SPIR-V file
+    std::ifstream file(path, std::ios::binary | std::ios::ate);
+    if (!file) {
+        throw std::runtime_error("Failed to open shader file");
+    }
+
+    size_t size = file.tellg();
+    std::vector<char> code(size);
+    file.seekg(0);
+    file.read(code.data(), size);
+
+    // Create shader module (pCode requires 4-byte-aligned SPIR-V words)
+    VkShaderModuleCreateInfo create_info{};
+    create_info.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
+    create_info.codeSize = code.size();
+    create_info.pCode = reinterpret_cast<const uint32_t*>(code.data());
+
+    VkShaderModule module;
+    VkResult result = vkCreateShaderModule(
+        device, &create_info, nullptr, &module
+    );
+
+    if (result != VK_SUCCESS) {
+        throw std::runtime_error("Failed to create shader module");
+    }
+
+    modules_[path] = module;
+    return module;
+}
+```
+
+#### Testing
+- Load existing `copy.comp.spv` shader
+- Validate shader module creation succeeds
+- Test error handling for missing files
+- Verify shader module caching works
+
+---
+
+### 3.2 Phase 2: Descriptor Set Layout
+
+**Goal**: Define buffer binding layouts for compute shaders
+
+#### Descriptor Layouts for Decompression
+
+**Example: GDeflate Decompression**
+```cpp
+// Binding 0: Input compressed buffer (read-only)
+// Binding 1: Block metadata (read-only)
+// Binding 2: Output decompressed buffer (write-only)
+```
+
+#### Implementation
+
+**Location**: `src/ds_runtime_vulkan.cpp`
+
+```cpp
+struct 
ComputeDescriptorLayouts { + VkDescriptorSetLayout decompression_layout; + VkDescriptorSetLayout copy_layout; + // Add more as needed +}; + +VkDescriptorSetLayout create_decompression_descriptor_layout( + VkDevice device +) { + VkDescriptorSetLayoutBinding bindings[3]; + + // Binding 0: Input compressed data + bindings[0].binding = 0; + bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + bindings[0].descriptorCount = 1; + bindings[0].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + bindings[0].pImmutableSamplers = nullptr; + + // Binding 1: Block metadata + bindings[1].binding = 1; + bindings[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + bindings[1].descriptorCount = 1; + bindings[1].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + bindings[1].pImmutableSamplers = nullptr; + + // Binding 2: Output decompressed data + bindings[2].binding = 2; + bindings[2].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + bindings[2].descriptorCount = 1; + bindings[2].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + bindings[2].pImmutableSamplers = nullptr; + + VkDescriptorSetLayoutCreateInfo layout_info{}; + layout_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + layout_info.bindingCount = 3; + layout_info.pBindings = bindings; + + VkDescriptorSetLayout layout; + VkResult result = vkCreateDescriptorSetLayout( + device, &layout_info, nullptr, &layout + ); + + if (result != VK_SUCCESS) { + throw std::runtime_error("Failed to create descriptor set layout"); + } + + return layout; +} +``` + +#### Add to VulkanBackend + +```cpp +// In VulkanBackend::Impl +ComputeDescriptorLayouts descriptor_layouts_; + +// Initialize in constructor +descriptor_layouts_.decompression_layout = + create_decompression_descriptor_layout(device_); +``` + +--- + +### 3.3 Phase 3: Descriptor Pool + +**Goal**: Allocate descriptor pool for runtime descriptor set allocation + +#### Pool Sizing + +**Strategy**: Pre-allocate pool large enough for concurrent dispatches + +```cpp +// 
Estimate: 16 concurrent compute dispatches +// Each dispatch needs 3 storage buffer descriptors +// Total: 16 * 3 = 48 storage buffer descriptors +``` + +#### Implementation + +```cpp +VkDescriptorPool create_compute_descriptor_pool( + VkDevice device, + uint32_t max_sets = 32 +) { + VkDescriptorPoolSize pool_size{}; + pool_size.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + pool_size.descriptorCount = max_sets * 3; // 3 bindings per set + + VkDescriptorPoolCreateInfo pool_info{}; + pool_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + pool_info.poolSizeCount = 1; + pool_info.pPoolSizes = &pool_size; + pool_info.maxSets = max_sets; + pool_info.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; + + VkDescriptorPool pool; + VkResult result = vkCreateDescriptorPool( + device, &pool_info, nullptr, &pool + ); + + if (result != VK_SUCCESS) { + throw std::runtime_error("Failed to create descriptor pool"); + } + + return pool; +} + +// Add to VulkanBackend::Impl +VkDescriptorPool descriptor_pool_; + +// Initialize in constructor +descriptor_pool_ = create_compute_descriptor_pool(device_); +``` + +--- + +### 3.4 Phase 4: Pipeline Layout and Compute Pipeline + +**Goal**: Create compute pipeline with shader and layout + +#### Pipeline Layout + +```cpp +VkPipelineLayout create_compute_pipeline_layout( + VkDevice device, + VkDescriptorSetLayout descriptor_layout +) { + // Optional: push constants for dispatch parameters + VkPushConstantRange push_constant{}; + push_constant.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + push_constant.offset = 0; + push_constant.size = sizeof(uint32_t) * 4; // Example: 4 uint32s + + VkPipelineLayoutCreateInfo layout_info{}; + layout_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + layout_info.setLayoutCount = 1; + layout_info.pSetLayouts = &descriptor_layout; + layout_info.pushConstantRangeCount = 1; + layout_info.pPushConstantRanges = &push_constant; + + VkPipelineLayout layout; + VkResult result = 
vkCreatePipelineLayout(
+        device, &layout_info, nullptr, &layout
+    );
+
+    if (result != VK_SUCCESS) {
+        throw std::runtime_error("Failed to create pipeline layout");
+    }
+
+    return layout;
+}
+```
+
+#### Compute Pipeline
+
+```cpp
+VkPipeline create_compute_pipeline(
+    VkDevice device,
+    VkPipelineLayout layout,
+    VkShaderModule shader_module,
+    const char* entry_point = "main"
+) {
+    VkPipelineShaderStageCreateInfo stage_info{};
+    stage_info.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
+    stage_info.stage = VK_SHADER_STAGE_COMPUTE_BIT;
+    stage_info.module = shader_module;
+    stage_info.pName = entry_point;
+
+    VkComputePipelineCreateInfo pipeline_info{};
+    pipeline_info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
+    pipeline_info.stage = stage_info;
+    pipeline_info.layout = layout;
+
+    VkPipeline pipeline;
+    VkResult result = vkCreateComputePipelines(
+        device, VK_NULL_HANDLE, 1, &pipeline_info, nullptr, &pipeline
+    );
+
+    if (result != VK_SUCCESS) {
+        throw std::runtime_error("Failed to create compute pipeline");
+    }
+
+    return pipeline;
+}
+```
+
+#### Pipeline Management
+
+```cpp
+struct ComputePipeline {
+    VkPipeline pipeline;
+    VkPipelineLayout layout;
+    VkDescriptorSetLayout descriptor_layout;
+    VkShaderModule shader;
+};
+
+// Add to VulkanBackend::Impl
+std::unordered_map<std::string, ComputePipeline> compute_pipelines_;
+
+ComputePipeline create_decompression_pipeline() {
+    ComputePipeline result;
+
+    // Load shader
+    result.shader = shader_cache_.load_shader(
+        device_, "shaders/decompress.comp.spv"
+    );
+
+    // Create descriptor layout
+    result.descriptor_layout =
+        create_decompression_descriptor_layout(device_);
+
+    // Create pipeline layout
+    result.layout = create_compute_pipeline_layout(
+        device_, result.descriptor_layout
+    );
+
+    // Create pipeline
+    result.pipeline = create_compute_pipeline(
+        device_, result.layout, result.shader
+    );
+
+    return result;
+}
+```
+
+---
+
+### 3.5 Phase 5: Descriptor Set Allocation and Updates
+
+**Goal**: Bind 
buffers to descriptor sets for each dispatch + +#### Allocation + +```cpp +VkDescriptorSet allocate_descriptor_set( + VkDevice device, + VkDescriptorPool pool, + VkDescriptorSetLayout layout +) { + VkDescriptorSetAllocateInfo alloc_info{}; + alloc_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + alloc_info.descriptorPool = pool; + alloc_info.descriptorSetCount = 1; + alloc_info.pSetLayouts = &layout; + + VkDescriptorSet descriptor_set; + VkResult result = vkAllocateDescriptorSets( + device, &alloc_info, &descriptor_set + ); + + if (result != VK_SUCCESS) { + throw std::runtime_error("Failed to allocate descriptor set"); + } + + return descriptor_set; +} +``` + +#### Buffer Binding + +```cpp +void update_decompression_descriptor_set( + VkDevice device, + VkDescriptorSet descriptor_set, + VkBuffer input_buffer, + VkBuffer metadata_buffer, + VkBuffer output_buffer, + VkDeviceSize input_size, + VkDeviceSize metadata_size, + VkDeviceSize output_size +) { + VkDescriptorBufferInfo buffer_infos[3]; + + // Input buffer + buffer_infos[0].buffer = input_buffer; + buffer_infos[0].offset = 0; + buffer_infos[0].range = input_size; + + // Metadata buffer + buffer_infos[1].buffer = metadata_buffer; + buffer_infos[1].offset = 0; + buffer_infos[1].range = metadata_size; + + // Output buffer + buffer_infos[2].buffer = output_buffer; + buffer_infos[2].offset = 0; + buffer_infos[2].range = output_size; + + VkWriteDescriptorSet writes[3]; + for (int i = 0; i < 3; ++i) { + writes[i].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + writes[i].pNext = nullptr; + writes[i].dstSet = descriptor_set; + writes[i].dstBinding = i; + writes[i].dstArrayElement = 0; + writes[i].descriptorCount = 1; + writes[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + writes[i].pBufferInfo = &buffer_infos[i]; + writes[i].pImageInfo = nullptr; + writes[i].pTexelBufferView = nullptr; + } + + vkUpdateDescriptorSets(device, 3, writes, 0, nullptr); +} +``` + +--- + +### 3.6 Phase 6: Compute 
Dispatch + +**Goal**: Record and execute compute commands + +#### Command Buffer Recording + +```cpp +void record_compute_dispatch( + VkCommandBuffer cmd, + VkPipeline pipeline, + VkPipelineLayout layout, + VkDescriptorSet descriptor_set, + uint32_t workgroup_count_x, + uint32_t workgroup_count_y = 1, + uint32_t workgroup_count_z = 1 +) { + // Bind compute pipeline + vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + // Bind descriptor set + vkCmdBindDescriptorSets( + cmd, + VK_PIPELINE_BIND_POINT_COMPUTE, + layout, + 0, // first set + 1, // descriptor set count + &descriptor_set, + 0, // dynamic offset count + nullptr + ); + + // Optional: push constants + // vkCmdPushConstants(cmd, layout, VK_SHADER_STAGE_COMPUTE_BIT, ...); + + // Dispatch compute work + vkCmdDispatch(cmd, workgroup_count_x, workgroup_count_y, workgroup_count_z); +} +``` + +#### Integration with Request Processing + +```cpp +void VulkanBackend::Impl::process_request_with_compute(Request& req) { + // Begin command buffer + VkCommandBufferBeginInfo begin_info{}; + begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + + vkBeginCommandBuffer(command_buffer_, &begin_info); + + // 1. Copy file data to staging buffer + // (existing code for file I/O) + + // 2. Barrier: transfer โ†’ compute + VkBufferMemoryBarrier barrier{}; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barrier.buffer = staging_buffer_; + barrier.offset = 0; + barrier.size = VK_WHOLE_SIZE; + + vkCmdPipelineBarrier( + command_buffer_, + VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + 0, 0, nullptr, 1, &barrier, 0, nullptr + ); + + // 3. 
Dispatch compute (decompression) + auto& pipeline = compute_pipelines_["decompress"]; + VkDescriptorSet desc_set = allocate_descriptor_set( + device_, descriptor_pool_, pipeline.descriptor_layout + ); + + update_decompression_descriptor_set( + device_, desc_set, + staging_buffer_, // compressed input + metadata_buffer_, // block info + req.gpu_buffer, // decompressed output + req.size, + metadata_size, + req.size * 2 // assume 2x expansion + ); + + uint32_t workgroup_count = (req.size + 255) / 256; + record_compute_dispatch( + command_buffer_, + pipeline.pipeline, + pipeline.layout, + desc_set, + workgroup_count + ); + + // 4. Barrier: compute โ†’ host (if needed for readback) + // ... + + // 5. End and submit + vkEndCommandBuffer(command_buffer_); + + VkSubmitInfo submit_info{}; + submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submit_info.commandBufferCount = 1; + submit_info.pCommandBuffers = &command_buffer_; + + vkQueueSubmit(queue_, 1, &submit_info, VK_NULL_HANDLE); + vkQueueWaitIdle(queue_); + + // Free descriptor set + vkFreeDescriptorSets(device_, descriptor_pool_, 1, &desc_set); +} +``` + +--- + +### 3.7 Phase 7: Synchronization and Barriers + +**Goal**: Proper memory synchronization between pipeline stages + +#### Key Barriers + +**Transfer โ†’ Compute**: +```cpp +VkBufferMemoryBarrier transfer_to_compute{}; +transfer_to_compute.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; +transfer_to_compute.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; +// Use: After vkCmdCopyBuffer, before compute dispatch +``` + +**Compute โ†’ Transfer**: +```cpp +VkBufferMemoryBarrier compute_to_transfer{}; +compute_to_transfer.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; +compute_to_transfer.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; +// Use: After compute dispatch, before staging buffer readback +``` + +**Compute โ†’ Compute** (between dispatches): +```cpp +VkBufferMemoryBarrier compute_to_compute{}; +compute_to_compute.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; 
+compute_to_compute.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
+// Use: Between dependent compute passes
+```
+
+#### Synchronization Best Practices
+
+1. **Minimize barriers**: Batch operations when possible
+2. **Use appropriate stages**: Don't over-synchronize
+3. **Consider queue ownership**: Handle queue family transfers if needed
+4. **Validate with layers**: Enable `VK_LAYER_KHRONOS_validation`
+
+---
+
+## 4. Shader Development
+
+### 4.1 Example: Simple Buffer Copy
+
+**File**: `shaders/buffer_copy.comp`
+
+```glsl
+#version 450
+
+layout(local_size_x = 256) in;
+
+layout(binding = 0) readonly buffer InputBuffer {
+    uint data[];
+} input_buf;
+
+layout(binding = 1) writeonly buffer OutputBuffer {
+    uint data[];
+} output_buf;
+
+layout(push_constant) uniform PushConstants {
+    uint element_count;
+} push;
+
+void main() {
+    uint idx = gl_GlobalInvocationID.x;
+    if (idx < push.element_count) {
+        output_buf.data[idx] = input_buf.data[idx];
+    }
+}
+```
+
+**Compilation**:
+```bash
+glslangValidator -V buffer_copy.comp -o buffer_copy.comp.spv
+```
+
+### 4.2 Example: Transform Shader
+
+**File**: `shaders/uppercase.comp` (enhanced version of FakeUppercase)
+
+Note: byte-granular storage access requires the 8-bit storage extensions
+(`VK_KHR_8bit_storage` on the device side); GLSL also has no character
+literals, so ASCII values are written numerically.
+
+```glsl
+#version 450
+#extension GL_EXT_shader_8bit_storage : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
+
+layout(local_size_x = 256) in;
+
+layout(binding = 0) readonly buffer InputBuffer {
+    uint8_t data[];
+} input_buf;
+
+layout(binding = 1) writeonly buffer OutputBuffer {
+    uint8_t data[];
+} output_buf;
+
+layout(push_constant) uniform PushConstants {
+    uint byte_count;
+} push;
+
+void main() {
+    uint idx = gl_GlobalInvocationID.x;
+    if (idx < push.byte_count) {
+        uint c = uint(input_buf.data[idx]);
+        // Uppercase ASCII: 'a' (97) .. 'z' (122) -> subtract 32
+        if (c >= 97u && c <= 122u) {
+            c -= 32u;
+        }
+        output_buf.data[idx] = uint8_t(c);
+    }
+}
+```
+
+### 4.3 Shader Build System
+
+**Add to CMakeLists.txt**:
+```cmake
+# Find glslangValidator
+find_program(GLSLANG_VALIDATOR glslangValidator)
+
+if(GLSLANG_VALIDATOR)
+    # Compile shaders
+    file(GLOB SHADER_SOURCES shaders/*.comp)
+
+    foreach(SHADER ${SHADER_SOURCES})
+    
get_filename_component(SHADER_NAME ${SHADER} NAME) + set(SPIRV "${CMAKE_CURRENT_BINARY_DIR}/shaders/${SHADER_NAME}.spv") + + add_custom_command( + OUTPUT ${SPIRV} + COMMAND ${CMAKE_COMMAND} -E make_directory + "${CMAKE_CURRENT_BINARY_DIR}/shaders" + COMMAND ${GLSLANG_VALIDATOR} -V ${SHADER} -o ${SPIRV} + DEPENDS ${SHADER} + COMMENT "Compiling shader ${SHADER_NAME}" + ) + + list(APPEND SPIRV_SHADERS ${SPIRV}) + endforeach() + + add_custom_target(shaders ALL DEPENDS ${SPIRV_SHADERS}) +endif() +``` + +--- + +## 5. Testing Strategy + +### 5.1 Unit Tests + +**Test Suite**: `tests/vulkan_compute_test.cpp` + +**Test Cases**: +1. **Pipeline creation**: Verify pipeline objects created successfully +2. **Shader loading**: Test shader module creation from SPIR-V +3. **Descriptor allocation**: Verify descriptor pool management +4. **Simple dispatch**: Buffer copy shader execution +5. **Transform shader**: Uppercase transform on GPU +6. **Error handling**: Invalid pipeline, missing shader, etc. + +### 5.2 Integration Tests + +**Scenarios**: +1. **File โ†’ GPU compute โ†’ GPU buffer**: Full decompression path +2. **Multiple dispatches**: Concurrent compute workloads +3. **Mixed operations**: Compute + transfer in same command buffer +4. **Large buffers**: Test with multi-MB data +5. **Synchronization**: Verify barriers work correctly + +### 5.3 Validation + +**Tools**: +- Vulkan Validation Layers (`VK_LAYER_KHRONOS_validation`) +- RenderDoc for command buffer inspection +- GPU profilers (Nsight, Radeon GPU Profiler) + +**Check for**: +- Synchronization errors +- Memory leaks (descriptor sets, pipelines) +- Invalid API usage +- Performance bottlenecks + +--- + +## 6. 
Performance Considerations + +### 6.1 Optimization Strategies + +**Shader Optimizations**: +- Coalesced memory access patterns +- Shared memory for temporary data +- Reduce divergent branches +- Optimize workgroup size for target GPU + +**Pipeline Management**: +- Reuse pipelines across requests +- Minimize pipeline switches +- Batch similar compute work +- Pre-warm pipeline caches + +**Memory Management**: +- Pool descriptor sets (avoid per-frame allocation) +- Reuse command buffers when possible +- Minimize host-device transfers +- Use push constants for small data + +### 6.2 Performance Targets + +**Goals**: +- Compute dispatch overhead < 100 ยตs +- Throughput โ‰ฅ 10 GB/s for simple transforms +- GPU utilization โ‰ฅ 80% during compute +- CPU overhead < 5% during GPU execution + +--- + +## 7. Dependencies and Requirements + +### 7.1 External Dependencies + +**Required**: +- Vulkan SDK (already optional dependency) +- `glslangValidator` for shader compilation +- C++20 compiler (already required) + +**Optional**: +- RenderDoc for debugging +- GPU profiling tools + +### 7.2 Hardware Requirements + +**Minimum**: +- Vulkan 1.0 support +- Compute queue support +- Storage buffer support + +**Recommended**: +- Vulkan 1.3 support +- Dedicated compute queue +- โ‰ฅ 4GB VRAM for large asset processing + +--- + +## 8. 
Timeline and Milestones + +### 8.1 Implementation Phases + +**Week 1-2: Foundation** +- Shader module loading +- Descriptor layouts +- Descriptor pool creation +- **Milestone**: Infrastructure ready + +**Week 3-4: Pipeline Creation** +- Pipeline layout +- Compute pipeline creation +- Pipeline management +- **Milestone**: Simple copy shader works + +**Week 5-6: Dispatch and Synchronization** +- Command buffer recording +- Compute dispatch +- Barriers and synchronization +- **Milestone**: Transform shader works + +**Week 7-8: Integration and Testing** +- Backend integration +- Comprehensive testing +- Performance tuning +- **Milestone**: Production-ready compute support + +### 8.2 Total Estimate + +**8 weeks** for complete Vulkan compute implementation + +**Dependencies**: None (can proceed independently) + +--- + +## 9. Success Criteria + +### 9.1 Functional Requirements +- โœ… Shader modules load from SPIR-V files +- โœ… Compute pipelines created successfully +- โœ… Descriptor sets allocated and bound correctly +- โœ… Compute dispatches execute on GPU +- โœ… Synchronization barriers work properly +- โœ… Existing Vulkan tests still pass +- โœ… New compute tests pass (100% coverage) + +### 9.2 Performance Requirements +- โœ… Compute overhead < 100 ยตs per dispatch +- โœ… GPU utilization โ‰ฅ 80% during compute +- โœ… Throughput โ‰ฅ 10 GB/s for simple operations +- โœ… No performance regression in existing paths + +### 9.3 Quality Requirements +- โœ… Vulkan validation layers pass (no errors) +- โœ… No memory leaks (descriptor sets, pipelines) +- โœ… Thread-safe pipeline management +- โœ… Documentation complete and accurate +- โœ… API stability maintained + +--- + +## 10. Next Steps + +### Immediate (This Week) +1. โœ… Complete investigation document +2. โฉ Examine existing `copy.comp.spv` shader +3. โฉ Set up shader build system +4. โฉ Create simple test shader + +### Short Term (Next 2 Weeks) +1. โฉ Implement shader module loading +2. 
โฉ Create descriptor layouts +3. โฉ Set up descriptor pool +4. โฉ Test infrastructure + +### Medium Term (1-2 Months) +1. โฉ Complete pipeline creation +2. โฉ Implement compute dispatch +3. โฉ Add synchronization +4. โฉ Comprehensive testing + +--- + +## 11. Open Questions + +1. **Shader Language**: Should we support multiple shader languages (GLSL, HLSL via DXC)? +2. **Pipeline Caching**: Do we need VkPipelineCache for faster startup? +3. **Async Compute**: Should we use dedicated compute queue or graphics queue? +4. **Workgroup Size**: How to determine optimal local_size_x for different GPUs? +5. **Error Recovery**: How to handle GPU compute failures gracefully? +6. **Shader Hot-Reload**: Do we need runtime shader recompilation for development? + +--- + +## 12. References + +### Vulkan Specification +- Vulkan 1.3 Specification: Compute Shaders +- Khronos Vulkan Guide: Compute +- Vulkan Tutorial: Compute Shaders + +### Best Practices +- Khronos Vulkan Best Practices +- AMD GPU Architecture +- NVIDIA Compute Best Practices +- Intel GPU Architecture Guide + +### Tools +- glslangValidator documentation +- RenderDoc user guide +- Vulkan Validation Layers + +--- + +**Document Status**: Draft v1.0 +**Last Updated**: 2026-02-16 +**Next Review**: After shader loading implementation complete diff --git a/docs/master_roadmap.md b/docs/master_roadmap.md new file mode 100644 index 0000000..ac7d3c5 --- /dev/null +++ b/docs/master_roadmap.md @@ -0,0 +1,1102 @@ +# DirectStorage-Style I/O and Decompression Pipeline: Master Roadmap + +**Project**: ds-runtime +**Goal**: Functioning DirectStorage-style I/O and decompression pipeline implemented natively on Linux +**Target Platform**: CachyOS/Arch Linux with GPU/Vulkan support +**Timeline**: 36 weeks (9 months) for complete implementation, 12 weeks for MVP + +--- + +## Executive Summary + +This document provides a comprehensive, phased implementation plan for completing the ds-runtime project. 
It integrates investigations across all components (GDeflate, Vulkan compute, io_uring, cancellation, GPU workflows, Wine/Proton) into a cohesive roadmap with extensive subphasing and microtasking. + +--- + +## 1. Project Status Overview + +### 1.1 Current State (Phase 0 - Complete โœ…) + +**Working Components**: +- โœ… CPU backend with thread pool +- โœ… Read/write operations (pread/pwrite) +- โœ… FakeUppercase demo compression +- โœ… Error reporting with rich context +- โœ… Request completion tracking +- โœ… C ABI for Wine/Proton integration +- โœ… Basic test suite (4 tests, all passing) +- โœ… Build system (CMake, C++20) + +**Project Builds Successfully**: All tests pass, demos work + +### 1.2 Components Requiring Work + +| Component | Status | Priority | Effort | +|-----------|--------|----------|--------| +| **GDeflate Compression** | โš ๏ธ Stubbed (ENOTSUP) | High | 9-13 weeks | +| **Vulkan GPU Compute** | โš ๏ธ Partial (copy only) | High | 8 weeks | +| **io_uring Backend** | โš ๏ธ Partial (single-worker) | Medium | 6 weeks | +| **Request Cancellation** | โŒ Not implemented | Medium | 3 weeks | +| **GPU-Resident Workflows** | โš ๏ธ Basic only | Medium | 4 weeks | +| **Wine/Proton Integration** | โŒ Documentation only | High | 8 weeks | + +--- + +## 2. 
Phased Implementation Plan + +### PHASE 1: Foundation & Research (Weeks 1-8) + +**Goal**: Establish foundation for all advanced features + +#### Phase 1.1: GDeflate Format Research (Weeks 1-3) +**Owner**: Research Lead +**Priority**: Critical Path + +**Microtasks**: +- [ ] **1.1.1** Review Microsoft DirectStorage documentation (2 days) + - SDK docs, headers, samples + - GDeflate format specification (if available) + - Create `docs/gdeflate_format.md` + +- [ ] **1.1.2** Analyze existing implementations (3 days) + - Wine/Proton DirectStorage status + - Community implementations + - Open-source decompression libraries + +- [ ] **1.1.3** Reverse engineer format if needed (5 days) + - Create compressed test assets + - Analyze binary structure + - Document block format, headers, metadata + +- [ ] **1.1.4** Validate format understanding (2 days) + - Create test vectors + - Verify decompression correctness + - Document any ambiguities + +**Deliverables**: +- โœ… GDeflate format specification document +- โœ… Test asset collection (compressed files) +- โœ… Validation test vectors + +**Success Criteria**: Can parse GDeflate headers and understand block structure + +--- + +#### Phase 1.2: Vulkan Compute Infrastructure (Weeks 1-8) +**Owner**: Graphics Engineer +**Priority**: Critical Path (parallel with GDeflate research) + +**Sub-Phase 1.2.1: Shader Module System (Weeks 1-2)** + +**Microtasks**: +- [ ] **1.2.1.1** Implement shader file loading (1 day) + - Read SPIR-V binary from file + - Error handling for missing files + - Add to `src/ds_runtime_vulkan.cpp` + +- [ ] **1.2.1.2** Create VkShaderModule wrapper (1 day) + - `vkCreateShaderModule` call + - Validation of SPIR-V code + - Error reporting + +- [ ] **1.2.1.3** Implement shader caching (2 days) + - `ShaderModuleCache` class + - Hash-based caching + - Lifetime management + +- [ ] **1.2.1.4** Test shader loading (1 day) + - Load existing `copy.comp.spv` + - Test error cases + - Add unit test + +- [ ] **1.2.1.5** Set up 
shader build system (2 days) + - CMake integration for glslangValidator + - Auto-compile `.comp` files + - Install shader SPV files + +**Deliverable**: Shader module loading system complete + +**Sub-Phase 1.2.2: Descriptor Management (Weeks 3-4)** + +**Microtasks**: +- [ ] **1.2.2.1** Design descriptor layouts (2 days) + - Decompression layout (3 bindings) + - Copy layout (2 bindings) + - Document layout specifications + +- [ ] **1.2.2.2** Implement descriptor set layout creation (1 day) + - `vkCreateDescriptorSetLayout` + - Multiple layout types + - Add to VulkanBackend + +- [ ] **1.2.2.3** Implement descriptor pool (2 days) + - `vkCreateDescriptorPool` + - Size estimation logic + - Pool management + +- [ ] **1.2.2.4** Implement descriptor set allocation (1 day) + - `vkAllocateDescriptorSets` + - Free/reuse logic + - Error handling + +- [ ] **1.2.2.5** Implement buffer binding (2 days) + - `vkUpdateDescriptorSets` + - Buffer info structure + - Dynamic updates + +- [ ] **1.2.2.6** Test descriptor system (2 days) + - Unit tests for allocation/free + - Buffer binding validation + - Memory leak testing + +**Deliverable**: Descriptor management system complete + +**Sub-Phase 1.2.3: Pipeline Creation (Weeks 5-6)** + +**Microtasks**: +- [ ] **1.2.3.1** Implement pipeline layout (1 day) + - `vkCreatePipelineLayout` + - Push constant support + - Multiple descriptor layouts + +- [ ] **1.2.3.2** Implement compute pipeline creation (2 days) + - `vkCreateComputePipelines` + - Pipeline configuration + - Pipeline caching + +- [ ] **1.2.3.3** Create pipeline management system (2 days) + - `ComputePipeline` struct + - Pipeline registry (by name) + - Lifetime management + +- [ ] **1.2.3.4** Test pipeline creation (2 days) + - Create simple copy pipeline + - Validation layers enabled + - Error handling tests + +- [ ] **1.2.3.5** Create example shaders (3 days) + - `buffer_copy.comp` - simple copy + - `uppercase.comp` - ASCII transform + - Compile to SPIR-V + +**Deliverable**: 
Compute pipeline creation system complete + +**Sub-Phase 1.2.4: Compute Dispatch & Synchronization (Weeks 7-8)** + +**Microtasks**: +- [ ] **1.2.4.1** Implement command buffer recording (2 days) + - `vkCmdBindPipeline` for compute + - `vkCmdBindDescriptorSets` + - `vkCmdDispatch` + +- [ ] **1.2.4.2** Implement synchronization barriers (2 days) + - Transfer โ†’ Compute barrier + - Compute โ†’ Transfer barrier + - Compute โ†’ Compute barrier + +- [ ] **1.2.4.3** Integrate with request processing (3 days) + - Add compute path to VulkanBackend + - Command buffer management + - Completion tracking + +- [ ] **1.2.4.4** Test compute execution (2 days) + - Buffer copy shader test + - Uppercase transform test + - Synchronization validation + +- [ ] **1.2.4.5** Performance profiling (1 day) + - Measure dispatch overhead + - GPU utilization metrics + - Optimization opportunities + +**Deliverable**: Full Vulkan compute capability + +**Phase 1 Milestones**: +- โœ… Week 3: GDeflate format understood +- โœ… Week 2: Shader loading works +- โœ… Week 4: Descriptor system works +- โœ… Week 6: Compute pipelines created +- โœ… Week 8: Compute dispatch working + +--- + +### PHASE 2: Core Feature Implementation (Weeks 9-18) + +**Goal**: Implement essential features (GDeflate CPU, io_uring, cancellation) + +#### Phase 2.1: GDeflate CPU Implementation (Weeks 9-13) +**Owner**: Compression Engineer +**Priority**: High +**Dependencies**: Phase 1.1 complete + +**Sub-Phase 2.1.1: Block Header Parser (Weeks 9-10)** + +**Microtasks**: +- [ ] **2.1.1.1** Define block header struct (1 day) + - Header fields based on format spec + - Size, offset, compression parameters + - Create `include/gdeflate_format.h` + +- [ ] **2.1.1.2** Implement header parsing (2 days) + - Parse file/stream header + - Extract block metadata + - Validate checksums (if present) + +- [ ] **2.1.1.3** Implement block iterator (1 day) + - Iterate over blocks in file + - Handle partial files + - Error reporting + +- [ ] 
**2.1.1.4** Test header parsing (2 days) + - Test with known assets + - Edge cases (empty, malformed) + - Add unit test `tests/gdeflate_header_test.cpp` + +**Deliverable**: GDeflate header parser + +**Sub-Phase 2.1.2: DEFLATE Integration (Weeks 10-11)** + +**Microtasks**: +- [ ] **2.1.2.1** Evaluate libraries (1 day) + - zlib vs miniz vs custom + - Performance comparison + - License compatibility + +- [ ] **2.1.2.2** Integrate chosen library (2 days) + - Add to CMakeLists.txt + - Wrapper functions + - Error handling + +- [ ] **2.1.2.3** Implement block decompression (2 days) + - Decompress single block + - Handle dictionary/state + - Streaming support + +- [ ] **2.1.2.4** Implement multi-block decompression (2 days) + - Iterate over all blocks + - Parallel decompression (thread pool) + - Assembly of output buffer + +- [ ] **2.1.2.5** Test decompression (3 days) + - Single block tests + - Multi-block tests + - Correctness validation + - Add `tests/gdeflate_decompress_test.cpp` + +**Deliverable**: Working GDeflate CPU decoder + +**Sub-Phase 2.1.3: Backend Integration (Weeks 12-13)** + +**Microtasks**: +- [ ] **2.1.3.1** Remove ENOTSUP stub (0.5 day) + - Delete stub code in `ds_runtime.cpp` + - Wire in actual decoder + +- [ ] **2.1.3.2** Integrate decoder into CPU backend (1 day) + - Call from decompression pipeline + - Buffer management + - Error propagation + +- [ ] **2.1.3.3** Add configuration options (1 day) + - Parallel decompression settings + - Memory limits + - Fallback behavior + +- [ ] **2.1.3.4** Test integration (2 days) + - Update `compression_gdeflate_stub_test.cpp` + - Change from "verify error" to "verify success" + - End-to-end tests + +- [ ] **2.1.3.5** Performance benchmarking (2 days) + - Measure decompression throughput + - Compare vs uncompressed + - Optimize hotspots + +- [ ] **2.1.3.6** Documentation (1 day) + - Update README.md + - Usage examples + - Performance characteristics + +**Deliverable**: GDeflate CPU support complete + +**Phase 2.1 
Milestone**: GDeflate CPU decoder fully functional + +--- + +#### Phase 2.2: io_uring Multi-Worker (Weeks 9-14) +**Owner**: Systems Engineer +**Priority**: Medium (parallel with GDeflate) +**Dependencies**: None + +**Sub-Phase 2.2.1: Multi-Worker Architecture (Weeks 9-11)** + +**Microtasks**: +- [ ] **2.2.1.1** Design worker architecture (1 day) + - Multiple io_uring instances + - Request distribution strategy + - Synchronization design + +- [ ] **2.2.1.2** Implement worker structure (2 days) + - `IoUringWorker` class + - Worker thread management + - Lifecycle (init/shutdown) + +- [ ] **2.2.1.3** Implement request queue per worker (1 day) + - Thread-safe pending queue + - Condition variable for signaling + - Lock-free alternatives (optional) + +- [ ] **2.2.1.4** Implement worker event loop (3 days) + - Submit SQEs from queue + - Poll for CQEs + - Timeout handling + +- [ ] **2.2.1.5** Implement load balancing (2 days) + - Round-robin distribution + - Queue depth aware (optional) + - Test distribution fairness + +- [ ] **2.2.1.6** Test multi-worker (3 days) + - Submit to multiple workers + - Verify parallel execution + - Stress test (1000+ requests) + - Add `tests/io_uring_multi_worker_test.cpp` + +**Deliverable**: Multi-worker io_uring backend + +**Sub-Phase 2.2.2: Advanced Features (Weeks 12-14)** + +**Microtasks**: +- [ ] **2.2.2.1** Implement fixed files support (2 days) + - Register file descriptors + - Use IOSQE_FIXED_FILE flag + - Benchmark improvement + +- [ ] **2.2.2.2** Add SQPOLL mode (optional) (2 days) + - Configure SQPOLL parameters + - Test latency improvement + - Document requirements + +- [ ] **2.2.2.3** Enhanced error handling (2 days) + - EAGAIN retry logic + - EINTR handling + - Robust failure recovery + +- [ ] **2.2.2.4** Performance tuning (3 days) + - Optimize queue depth + - Tune polling interval + - Batch submission optimization + +- [ ] **2.2.2.5** Comprehensive testing (2 days) + - Error injection tests + - Performance benchmarks + - 
Compare vs CPU backend + +- [ ] **2.2.2.6** Documentation (1 day) + - Update README.md + - Configuration guide + - Performance tuning tips + +**Deliverable**: Production-ready io_uring backend + +**Phase 2.2 Milestone**: io_uring backend feature-complete + +--- + +#### Phase 2.3: Request Cancellation (Weeks 15-18) +**Owner**: Core Engineer +**Priority**: Medium +**Dependencies**: None + +**Sub-Phase 2.3.1: API Design (Week 15)** + +**Microtasks**: +- [ ] **2.3.1.1** Add RequestStatus::Cancelled (0.5 day) + - Update enum in `ds_runtime.hpp` + - Update C ABI mapping + +- [ ] **2.3.1.2** Add request ID tracking (1 day) + - `request_id_t` type + - ID generation (atomic counter) + - Request ID โ†’ Request mapping + +- [ ] **2.3.1.3** Add cancellation flag to Request (0.5 day) + - `std::atomic<bool> cancellation_requested` + - Memory ordering considerations + +- [ ] **2.3.1.4** Design Queue cancellation API (1 day) + - `bool cancel_request(request_id_t)` + - `size_t cancel_all_pending()` + - `size_t cancel_all()` + +- [ ] **2.3.1.5** Document cancellation semantics (1 day) + - Strong vs weak guarantees + - Race condition handling + - Callback behavior + +**Deliverable**: Cancellation API design + +**Sub-Phase 2.3.2: Queue Implementation (Week 16)** + +**Microtasks**: +- [ ] **2.3.2.1** Implement request tracking (2 days) + - Active requests map + - Thread-safe access + - ID assignment on enqueue + +- [ ] **2.3.2.2** Implement cancel_request (1 day) + - Lookup request by ID + - Set cancellation flag + - Remove if still pending + +- [ ] **2.3.2.3** Implement cancel_all methods (1 day) + - Iterate active requests + - Mark all for cancellation + - Return count + +- [ ] **2.3.2.4** Test queue cancellation (1 day) + - Cancel pending request + - Cancel after submit + - Race condition tests + - Add `tests/cancellation_queue_test.cpp` + +**Deliverable**: Queue-level cancellation + +**Sub-Phase 2.3.3: Backend Integration (Weeks 17-18)** + +**Microtasks**: +- [ ] **2.3.3.1** CPU backend
cancellation (2 days) + - Check flag before I/O + - Check flag after I/O + - Skip callback if cancelled + +- [ ] **2.3.3.2** Vulkan backend cancellation (2 days) + - Check flag before dispatch + - Check flag in completion + - Handle in-flight GPU work + +- [ ] **2.3.3.3** io_uring backend cancellation (2 days) + - Cancel pending SQEs + - Handle in-flight operations + - Integration with worker threads + +- [ ] **2.3.3.4** Comprehensive testing (3 days) + - Test all backends + - Race condition stress tests + - Performance impact measurement + - Add `tests/cancellation_backend_test.cpp` + +- [ ] **2.3.3.5** Documentation (1 day) + - API documentation + - Usage examples + - Cancellation guarantees + +**Deliverable**: Full cancellation support + +**Phase 2.3 Milestone**: Request cancellation implemented + +--- + +**Phase 2 Summary**: +- Week 9-13: GDeflate CPU implementation +- Week 9-14: io_uring multi-worker (parallel) +- Week 15-18: Request cancellation +- All components tested independently + +--- + +### PHASE 3: Advanced Features (Weeks 19-28) + +**Goal**: GPU acceleration, advanced optimizations + +#### Phase 3.1: GDeflate GPU Implementation (Weeks 19-24) +**Owner**: GPU Compute Engineer +**Priority**: High +**Dependencies**: Phase 1.2 (Vulkan compute) + Phase 2.1 (GDeflate CPU) + +**Sub-Phase 3.1.1: GPU Shader Development (Weeks 19-21)** + +**Microtasks**: +- [ ] **3.1.1.1** Design GPU decompression algorithm (2 days) + - Block-parallel approach + - Workgroup size selection + - Memory layout + +- [ ] **3.1.1.2** Implement DEFLATE decode shader (5 days) + - Huffman decoding (GPU-friendly) + - LZ77 back-reference handling + - Shared memory optimization + - Create `shaders/gdeflate_decompress.comp` + +- [ ] **3.1.1.3** Test shader in isolation (3 days) + - Standalone shader test harness + - Known input/output validation + - Performance profiling + +- [ ] **3.1.1.4** Optimize shader (3 days) + - Reduce divergence + - Coalesced memory access + - Wavefront/warp 
efficiency + +**Deliverable**: GDeflate GPU shader + +**Sub-Phase 3.1.2: Backend Integration (Weeks 22-24)** + +**Microtasks**: +- [ ] **3.1.2.1** Create GDeflate compute pipeline (1 day) + - Load shader + - Configure descriptor layout + - Add to VulkanBackend + +- [ ] **3.1.2.2** Implement GPU decompression dispatch (3 days) + - Parse block headers on CPU + - Upload metadata to GPU + - Dispatch compute workgroups + - Synchronization + +- [ ] **3.1.2.3** Implement CPU/GPU hybrid strategy (2 days) + - Heuristic for CPU vs GPU + - Configuration options + - Fallback logic + +- [ ] **3.1.2.4** Test GPU decompression (4 days) + - Correctness tests + - Performance benchmarks + - Comparison vs CPU + - Add `tests/gdeflate_gpu_test.cpp` + +- [ ] **3.1.2.5** Optimize performance (3 days) + - Profile GPU execution + - Reduce bottlenecks + - Tune workgroup sizes + +- [ ] **3.1.2.6** Documentation (2 days) + - GPU requirements + - Performance characteristics + - Troubleshooting + +**Deliverable**: GDeflate GPU decompression + +**Phase 3.1 Milestone**: GPU-accelerated decompression working + +--- + +#### Phase 3.2: GPU-Resident Workflows (Weeks 25-28) +**Owner**: GPU Optimization Engineer +**Priority**: Medium +**Dependencies**: Phase 3.1 (GDeflate GPU) + +**Sub-Phase 3.2.1: Memory Optimization (Weeks 25-26)** + +**Microtasks**: +- [ ] **3.2.1.1** Implement staging buffer pooling (2 days) + - Buffer pool allocator + - Reuse across requests + - Size-based bins + +- [ ] **3.2.1.2** Implement async staging copies (2 days) + - Don't block on staging โ†’ GPU + - Pipeline multiple transfers + - Synchronization + +- [ ] **3.2.1.3** Optimize GPU buffer management (2 days) + - Reduce allocation frequency + - Suballocation strategy + - Memory defragmentation + +- [ ] **3.2.1.4** Test memory optimizations (2 days) + - Memory usage profiling + - Performance impact + - Stress tests + +**Deliverable**: Optimized memory management + +**Sub-Phase 3.2.2: Advanced GPU Paths (Weeks 27-28) 
+**Microtasks**: +- [ ] **3.2.2.1** Investigate GPUDirect Storage (2 days) + - NVIDIA GDS API review + - Feasibility assessment + - Prototype (if viable) + +- [ ] **3.2.2.2** Implement GPU-to-GPU optimization (2 days) + - Compressed buffer โ†’ decompressed buffer + - Single command buffer + - No CPU involvement + +- [ ] **3.2.2.3** Batch GPU operations (2 days) + - Multiple decompressions per dispatch + - Amortize overhead + - Descriptor set reuse + +- [ ] **3.2.2.4** Performance testing (2 days) + - Benchmark GPU workflows + - Compare optimization stages + - Real-world asset tests + +**Deliverable**: Optimized GPU-resident workflows + +**Phase 3.2 Milestone**: GPU workflows optimized + +--- + +**Phase 3 Summary**: +- Weeks 19-24: GDeflate GPU implementation +- Weeks 25-28: GPU workflow optimization +- All GPU features complete + +--- + +### PHASE 4: Wine/Proton Integration (Weeks 29-36) + +**Goal**: Enable DirectStorage games on Linux via Proton + +#### Phase 4.1: Shim Development (Weeks 29-32) +**Owner**: Wine Integration Engineer +**Priority**: High +**Dependencies**: Phase 2.1 (GDeflate CPU), Phase 1.2 (Vulkan compute) + +**Sub-Phase 4.1.1: Shim Skeleton (Week 29)** + +**Microtasks**: +- [ ] **4.1.1.1** Create dstorage.dll directory structure (1 day) + - Set up Wine dlls/dstorage + - Makefile.in, .spec files + - Basic build infrastructure + +- [ ] **4.1.1.2** Implement DStorageGetFactory (1 day) + - COM object creation + - Reference counting + - Error handling + +- [ ] **4.1.1.3** Implement skeleton COM interfaces (2 days) + - IDStorageFactory + - IDStorageQueue + - IDStorageFile + - Basic vtable setup + +- [ ] **4.1.1.4** Test shim loads (1 day) + - DLL registration + - COM object creation + - Basic smoke test + +**Deliverable**: dstorage.dll skeleton + +**Sub-Phase 4.1.2: Type Mapping (Weeks 30-31)** + +**Microtasks**: +- [ ] **4.1.2.1** Implement request descriptor translation (2 days) + - DSTORAGE_REQUEST โ†’ ds_request + - Field-by-field mapping + - 
Validation + +- [ ] **4.1.2.2** Implement handle conversion (1 day) + - Windows HANDLE โ†’ Linux fd + - File handle management + - Reference counting + +- [ ] **4.1.2.3** Implement D3D12 โ†’ Vulkan interop (3 days) + - Get VkDevice from ID3D12Device + - Get VkBuffer from ID3D12Resource + - vkd3d-proton integration + +- [ ] **4.1.2.4** Implement compression format mapping (1 day) + - DSTORAGE_COMPRESSION โ†’ ds_compression_t + - Enum translation + - Validation + +- [ ] **4.1.2.5** Test type conversions (2 days) + - Unit tests for all mappings + - Edge cases + - Error handling + +**Deliverable**: Complete type mapping + +**Sub-Phase 4.1.3: Queue Implementation (Week 32)** + +**Microtasks**: +- [ ] **4.1.3.1** Implement IDStorageFactory::CreateQueue (2 days) + - Create ds_queue via C ABI + - Wrap in COM object + - Backend selection logic + +- [ ] **4.1.3.2** Implement IDStorageQueue::EnqueueRequest (2 days) + - Translate request + - Forward to ds_queue_enqueue + - Error handling + +- [ ] **4.1.3.3** Implement IDStorageQueue::Submit (1 day) + - Call ds_queue_submit_all + - Synchronization + +- [ ] **4.1.3.4** Implement completion signaling (1 day) + - IDStorageQueue::EnqueueSignal + - Map to callbacks/fences + - Event notification + +**Deliverable**: Functional queue implementation + +--- + +#### Phase 4.2: Testing and Integration (Weeks 33-36) +**Owner**: QA/Integration Engineer +**Priority**: Critical +**Dependencies**: Phase 4.1 complete + +**Sub-Phase 4.2.1: Integration Testing (Weeks 33-34)** + +**Microtasks**: +- [ ] **4.2.1.1** Create simple test app (2 days) + - Minimal DirectStorage usage + - File read with DirectStorage API + - Verify data correctness + +- [ ] **4.2.1.2** Test with Proton (3 days) + - Configure Wine environment + - Test shim loading + - Debug integration issues + +- [ ] **4.2.1.3** Test Vulkan device sharing (2 days) + - Verify VkDevice passed correctly + - Test GPU transfers + - Synchronization validation + +- [ ] **4.2.1.4** Performance 
baseline (2 days) + - Measure overhead + - Compare Windows vs Linux + - Identify bottlenecks + +**Deliverable**: Integration test suite + +**Sub-Phase 4.2.2: Real Game Testing (Weeks 35-36)** + +**Microtasks**: +- [ ] **4.2.2.1** Test Forspoken (if available) (3 days) + - Launch game via Proton + - Verify asset loading + - Performance testing + - Bug fixing + +- [ ] **4.2.2.2** Test other DirectStorage titles (3 days) + - Ratchet & Clank + - UE5 games + - Identify compatibility issues + +- [ ] **4.2.2.3** Performance optimization (3 days) + - Profile hot paths + - Optimize type conversions + - Reduce overhead + +- [ ] **4.2.2.4** Bug fixing and polish (3 days) + - Address discovered issues + - Stability improvements + - Memory leak fixes + +**Deliverable**: Stable Wine/Proton integration + +**Sub-Phase 4.2.3: Documentation (Week 36)** + +**Microtasks**: +- [ ] **4.2.3.1** Write integration guide (2 days) + - Build instructions + - Configuration + - Troubleshooting + +- [ ] **4.2.3.2** Document known issues (1 day) + - Compatibility list + - Workarounds + - Performance notes + +- [ ] **4.2.3.3** Create developer guide (1 day) + - Debugging tips + - Contributing guidelines + - Testing procedures + +- [ ] **4.2.3.4** Update project documentation (1 day) + - README.md + - ROADMAP.md + - Release notes + +**Deliverable**: Complete documentation + +**Phase 4 Milestone**: Wine/Proton integration complete + +--- + +## 3. Fast Track Option (12 Weeks MVP) + +**Goal**: Minimal viable product for testing + +**Included**: +- โœ… CPU backend (already working) +- โฉ GDeflate CPU (Weeks 1-5) +- โฉ Vulkan compute (Weeks 1-8, parallel) +- โฉ Basic Wine shim (Weeks 9-12) + +**Excluded**: +- โŒ GDeflate GPU (defer) +- โŒ io_uring multi-worker (defer) +- โŒ Request cancellation (defer) +- โŒ GPU optimizations (defer) +- โŒ Advanced Wine integration (defer) + +**Timeline**: 12 weeks to functional MVP + +--- + +## 4. 
Testing Strategy + +### 4.1 Continuous Testing + +**Per Phase**: +- Unit tests for all new components +- Integration tests after backend changes +- Performance benchmarks +- Memory leak testing (valgrind) + +### 4.2 Test Suites + +**New Test Files**: +1. `tests/gdeflate_format_test.cpp` +2. `tests/gdeflate_decompress_test.cpp` +3. `tests/gdeflate_gpu_test.cpp` +4. `tests/vulkan_compute_test.cpp` +5. `tests/io_uring_multi_worker_test.cpp` +6. `tests/cancellation_test.cpp` +7. `tests/wine_integration_test.cpp` + +**Existing Tests** (update as needed): +- `tests/basic_queue_test.cpp` +- `tests/cpu_backend_test.cpp` +- `tests/error_handling_test.cpp` +- `tests/compression_gdeflate_stub_test.cpp` (change to success test) + +### 4.3 Validation + +**Every Phase**: +- [ ] All tests pass +- [ ] No memory leaks (valgrind clean) +- [ ] Vulkan validation layers pass +- [ ] Performance meets targets +- [ ] Documentation updated + +--- + +## 5. Success Criteria + +### 5.1 Functional Requirements + +**Core**: +- โœ… GDeflate compression works (CPU and GPU) +- โœ… Vulkan GPU compute pipelines functional +- โœ… io_uring backend production-ready +- โœ… Request cancellation implemented +- โœ… GPU-resident workflows optimized +- โœ… Wine/Proton integration working + +**Quality**: +- โœ… No memory leaks +- โœ… Thread-safe +- โœ… Vulkan validation clean +- โœ… Comprehensive test coverage (โ‰ฅ80%) +- โœ… Documentation complete + +### 5.2 Performance Targets + +| Metric | Target | Method | +|--------|--------|--------| +| GDeflate CPU | โ‰ฅ 500 MB/s | Benchmark decompression | +| GDeflate GPU | โ‰ฅ 2 GB/s | Benchmark decompression | +| io_uring throughput | โ‰ฅ 2x CPU backend | File I/O benchmark | +| Vulkan compute overhead | < 100 ยตs/dispatch | Profiling | +| Wine/Proton overhead | < 10% vs native | Game benchmarks | + +### 5.3 Platform Requirements + +**Verified On**: +- CachyOS (Linux kernel 5.15+) +- Arch Linux (latest) +- AMD GPU (RADV driver) +- NVIDIA GPU (proprietary driver) +- 
Intel GPU (ANV driver) + +--- + +## 6. Risk Management + +### 6.1 Technical Risks + +| Risk | Probability | Impact | Mitigation | +|------|-------------|--------|------------| +| GDeflate format unavailable | Medium | High | Reverse engineer, community help | +| GPU shader too slow | Low | Medium | Optimize, fallback to CPU | +| Wine integration complex | High | High | Start simple, iterate, seek Wine dev help | +| Hardware compatibility | Medium | High | Test multiple GPUs, provide fallbacks | +| liburing unavailable | Low | Low | CPU/Vulkan backends work without it | + +### 6.2 Schedule Risks + +| Risk | Impact | Mitigation | +|------|--------|------------| +| GDeflate research delay | +4 weeks | Start GPU work in parallel | +| Vulkan debugging difficult | +2 weeks | Use validation layers, RenderDoc | +| Wine upstreaming slow | +8 weeks | Maintain out-of-tree, focus on functionality | +| Testing reveals bugs | +4 weeks | Buffer time, incremental fixes | + +### 6.3 Resource Risks + +**Assumptions**: +- Single developer (can parallelize to some extent) +- Access to CachyOS/Arch Linux system +- Access to Vulkan-capable GPU +- Access to DirectStorage test games (for Phase 4) + +**Mitigation**: +- Prioritize critical path +- Use community resources (Wine forums, GitHub) +- Parallelize independent work +- MVP approach if resource-constrained + +--- + +## 7. 
Deliverables + +### 7.1 Code + +**New Files**: +- `src/gdeflate_decoder.cpp` - GDeflate CPU implementation +- `shaders/gdeflate_decompress.comp` - GDeflate GPU shader +- `shaders/buffer_copy.comp` - Example compute shader +- `shaders/uppercase.comp` - Transform shader +- 7+ new test files + +**Modified Files**: +- `src/ds_runtime.cpp` - Remove GDeflate stub, add cancellation +- `src/ds_runtime_vulkan.cpp` - Add compute pipelines +- `src/ds_runtime_uring.cpp` - Multi-worker support +- `include/ds_runtime.hpp` - API additions (cancellation, IDs) + +**External** (Wine tree): +- `dlls/dstorage/*` - Wine shim DLL + +### 7.2 Documentation + +**New Documents**: +- โœ… `docs/investigation_gdeflate.md` (complete) +- โœ… `docs/investigation_vulkan_compute.md` (complete) +- โœ… `docs/investigation_io_uring.md` (complete) +- โœ… `docs/investigation_remaining_features.md` (complete) +- โœ… `docs/master_roadmap.md` (this document) +- `docs/gdeflate_format.md` (Phase 1) +- `docs/gdeflate_usage.md` (Phase 2) +- `docs/wine_integration_guide.md` (Phase 4) + +**Updated Documents**: +- `README.md` - Update status, features +- `MISSING_FEATURES.md` - Mark completed +- `COMPARISON.md` - Update comparisons +- `docs/design.md` - Reflect implementation + +### 7.3 Tests + +**Comprehensive Test Suite**: +- 12+ test executables (4 existing + 8 new) +- 100% coverage of new features +- Performance benchmarks +- Integration tests +- Wine shim tests + +--- + +## 8. 
Progress Tracking + +### 8.1 Weekly Milestones + +| Week | Milestone | Phase | +|------|-----------|-------| +| 3 | GDeflate format understood | 1.1 | +| 2 | Shader loading works | 1.2.1 | +| 4 | Descriptor system works | 1.2.2 | +| 6 | Compute pipelines created | 1.2.3 | +| 8 | Compute dispatch working | 1.2.4 | +| 10 | GDeflate header parser done | 2.1.1 | +| 11 | DEFLATE integration complete | 2.1.2 | +| 13 | GDeflate CPU working | 2.1.3 | +| 14 | io_uring multi-worker done | 2.2 | +| 18 | Request cancellation done | 2.3 | +| 24 | GDeflate GPU working | 3.1 | +| 28 | GPU workflows optimized | 3.2 | +| 32 | Wine shim functional | 4.1 | +| 36 | Full integration complete | 4.2 | + +### 8.2 Reporting + +**Frequency**: Weekly +**Format**: GitHub PR updates via `report_progress` tool + +**Include**: +- Completed microtasks (โœ…) +- In-progress tasks (โฉ) +- Blockers/issues +- Test results +- Performance metrics + +--- + +## 9. Next Actions + +### Immediate (Week 1) + +**This Week**: +1. โœ… Complete investigation documents (done) +2. โœ… Report progress with master plan (in progress) +3. โฉ Begin GDeflate format research +4. โฉ Start Vulkan shader loading implementation +5. โฉ Set up development environment (liburing, Vulkan SDK) + +### Short Term (Weeks 2-4) + +**Focus**: +- Continue GDeflate research +- Complete Vulkan shader module system +- Begin descriptor management +- Create test infrastructure + +### Medium Term (Weeks 5-12) + +**Focus**: +- Complete GDeflate CPU implementation +- Finish Vulkan compute pipelines +- Begin io_uring multi-worker +- Test all components independently + +### Long Term (Weeks 13-36) + +**Focus**: +- GPU acceleration (GDeflate, workflows) +- Wine/Proton integration +- Real game testing +- Performance optimization +- Documentation and polish + +--- + +## 10. Conclusion + +This master roadmap provides a comprehensive plan for completing the ds-runtime project with extensive subphasing and microtasking. 
The phased approach allows for: + +- **Incremental progress**: Small, verifiable steps +- **Parallel work**: Independent features can progress simultaneously +- **Risk management**: Early identification of issues +- **Flexibility**: Can adjust scope (MVP vs full implementation) +- **Clear milestones**: Weekly checkpoints for progress tracking + +**Full Implementation**: 36 weeks (9 months) +**MVP**: 12 weeks (3 months) + +The project is well-positioned with a solid foundation (CPU backend working, tests passing). The investigation phase has identified all requirements and dependencies. Execution can begin immediately on multiple parallel tracks (GDeflate research + Vulkan compute). + +--- + +**Document Status**: Complete v1.0 +**Last Updated**: 2026-02-16 +**Next Update**: Weekly progress reports diff --git a/include/gdeflate_format.h b/include/gdeflate_format.h new file mode 100644 index 0000000..33d08a1 --- /dev/null +++ b/include/gdeflate_format.h @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: Apache-2.0 +// GDeflate format structures and definitions for ds-runtime. + +#pragma once + +#include +#include +#include + +namespace ds { +namespace gdeflate { + +// GDeflate is a block-based compression format designed for GPU decompression. +// Each file/stream consists of a header followed by compressed blocks. +// Blocks can be decompressed independently in parallel. 
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

namespace ds {
namespace gdeflate {

// GDeflate is a block-based compression format designed for GPU
// decompression: a stream is a FileHeader, followed by a table of BlockInfo
// records, followed by the compressed block payloads. Blocks can be
// decompressed independently in parallel.

// GDeflate file magic number (placeholder - needs actual spec)
constexpr uint32_t GDEFLATE_MAGIC = 0x4744464C; // "GDFL" in little-endian

// GDeflate format version implemented by these structures
constexpr uint16_t GDEFLATE_VERSION_MAJOR = 1;
constexpr uint16_t GDEFLATE_VERSION_MINOR = 0;

// Maximum uncompressed block size (16 MB is typical for DirectStorage)
constexpr uint32_t MAX_BLOCK_SIZE = 16 * 1024 * 1024;

// GDeflate file header structure.
// This appears at the start of every GDeflate compressed file and is read
// straight off the byte stream with memcpy, so the member layout below must
// match the on-disk layout exactly.
struct FileHeader {
    uint32_t magic;             // Magic number (GDEFLATE_MAGIC)
    uint16_t version_major;     // Format version (major)
    uint16_t version_minor;     // Format version (minor)
    uint32_t flags;             // Compression flags
    uint32_t uncompressed_size; // Total uncompressed size (bytes)
    uint32_t compressed_size;   // Total compressed size (bytes)
    uint32_t block_count;       // Number of blocks
    uint32_t reserved[2];       // Reserved for future use

    // A header is valid when the magic matches, the major version is the one
    // we implement (minor revisions are assumed backward compatible), and the
    // sizes/counts are non-zero.
    bool is_valid() const {
        return magic == GDEFLATE_MAGIC &&
               version_major == GDEFLATE_VERSION_MAJOR &&
               uncompressed_size > 0 &&
               compressed_size > 0 &&
               block_count > 0;
    }
};
// Lock the wire layout: a padding change would silently corrupt parsing.
static_assert(sizeof(FileHeader) == 32, "FileHeader layout must stay 32 bytes");

// Metadata for a single compressed block.
// NOTE(review): `offset` forces 8-byte alignment, so sizeof(BlockInfo) is 24
// with 4 bytes of trailing padding; the raw memcpy in parse_block_info
// therefore assumes each on-disk record is 24 bytes as well - confirm against
// the real format spec.
struct BlockInfo {
    uint64_t offset;            // Offset in compressed stream (bytes)
    uint32_t compressed_size;   // Compressed block size (bytes)
    uint32_t uncompressed_size; // Uncompressed block size (bytes)
    uint32_t checksum;          // Block checksum (CRC32 or similar)

    // A block is valid when both sizes are non-zero and the uncompressed
    // size stays within the format's block-size limit.
    bool is_valid() const {
        return compressed_size > 0 &&
               uncompressed_size > 0 &&
               uncompressed_size <= MAX_BLOCK_SIZE;
    }
};

// Complete GDeflate stream information: header plus per-block metadata.
struct StreamInfo {
    FileHeader header;
    std::vector<BlockInfo> blocks;

    // The stream is valid when the header is valid, the block table length
    // matches header.block_count, and every block passes its own checks.
    bool is_valid() const {
        if (!header.is_valid()) {
            return false;
        }
        if (blocks.size() != header.block_count) {
            return false;
        }
        for (const auto& block : blocks) {
            if (!block.is_valid()) {
                return false;
            }
        }
        return true;
    }

    // Total uncompressed size summed over all blocks (64-bit result: the
    // per-block uint32 sizes can sum past 4 GB).
    uint64_t get_uncompressed_size() const {
        uint64_t total = 0;
        for (const auto& block : blocks) {
            total += block.uncompressed_size;
        }
        return total;
    }

    // Total compressed size summed over all blocks.
    uint64_t get_compressed_size() const {
        uint64_t total = 0;
        for (const auto& block : blocks) {
            total += block.compressed_size;
        }
        return total;
    }
};

// Parse a GDeflate file header from a buffer.
// Returns true if the buffer is large enough and the header validates;
// `header` is only meaningful on success.
inline bool parse_file_header(const void* data, size_t size, FileHeader& header) {
    if (size < sizeof(FileHeader)) {
        return false;
    }

    std::memcpy(&header, data, sizeof(FileHeader));
    return header.is_valid();
}

// Parse `block_count` BlockInfo records from a buffer.
// Returns the number of blocks parsed, or 0 on error (truncated buffer or an
// invalid block). `blocks` is left empty on any error.
inline size_t parse_block_info(const void* data, size_t size,
                               size_t block_count, std::vector<BlockInfo>& blocks) {
    // Division form avoids wrap-around of block_count * sizeof(BlockInfo)
    // when block_count comes from an untrusted header.
    if (block_count > size / sizeof(BlockInfo)) {
        blocks.clear();
        return 0;
    }

    blocks.resize(block_count);
    if (block_count > 0) {
        std::memcpy(blocks.data(), data, block_count * sizeof(BlockInfo));
    }

    // Validate all blocks
    for (const auto& block : blocks) {
        if (!block.is_valid()) {
            blocks.clear();
            return 0;
        }
    }

    return block_count;
}

// Parse complete GDeflate stream information (header + block table).
// Returns true only when the header parses, the whole block table is present
// and valid, and the assembled StreamInfo is self-consistent.
inline bool parse_stream_info(const void* data, size_t size, StreamInfo& info) {
    if (size < sizeof(FileHeader)) {
        return false;
    }

    // Parse header
    if (!parse_file_header(data, size, info.header)) {
        return false;
    }

    // Parse block metadata (comes immediately after the header)
    const uint8_t* block_data = static_cast<const uint8_t*>(data) + sizeof(FileHeader);
    size_t remaining = size - sizeof(FileHeader);

    size_t parsed = parse_block_info(block_data, remaining,
                                     info.header.block_count, info.blocks);

    return parsed == info.header.block_count && info.is_valid();
}

} // namespace gdeflate
} // namespace ds
b/shaders/README.md @@ -0,0 +1,36 @@ +# DS-Runtime Compute Shaders + +This directory contains GLSL compute shaders for GPU-accelerated operations in ds-runtime. + +## Shaders + +### copy.comp +Basic buffer copy shader. Copies data from source buffer to destination buffer using GPU compute. +- Local workgroup size: 16 +- Bindings: + - 0: Source buffer (read-only) + - 1: Destination buffer (write-only) + +## Building + +Shaders are automatically compiled to SPIR-V during the CMake build process using `glslangValidator`. + +```bash +# Manual compilation (for testing): +glslangValidator -V copy.comp -o copy.comp.spv +``` + +## Adding New Shaders + +1. Create your shader file (e.g., `my_shader.comp`) +2. CMake will automatically compile it to SPIR-V +3. The compiled `.spv` file will be available at build time +4. Load the shader in code using the shader module system + +## Shader Conventions + +- Use `#version 450` for Vulkan 1.0+ compatibility +- Declare local workgroup size with `layout(local_size_x = N) in;` +- Use storage buffers for data: `layout(binding = N) buffer BufferName { ... 
};` +- Optimize for coalesced memory access patterns +- Keep workgroup sizes as multiples of 32 (warp/wavefront size) diff --git a/shaders/buffer_copy.comp b/shaders/buffer_copy.comp new file mode 100644 index 0000000..5178b1b --- /dev/null +++ b/shaders/buffer_copy.comp @@ -0,0 +1,34 @@ +# Shader for simple buffer copy operation +# Demonstrates basic compute shader structure with descriptor bindings + +#version 450 + +# Define workgroup size (how many threads run in parallel) +# 256 is a good default (multiple of 32 for most GPUs) +layout(local_size_x = 256) in; + +# Binding 0: Input buffer (read-only) +# Storage buffers allow large data arrays +layout(binding = 0) readonly buffer InputBuffer { + uint data[]; +} input_buf; + +# Binding 1: Output buffer (write-only) +layout(binding = 1) writeonly buffer OutputBuffer { + uint data[]; +} output_buf; + +# Push constants: Small data passed directly (faster than buffers) +layout(push_constant) uniform PushConstants { + uint element_count; // Number of elements to copy +} push; + +void main() { + // Get this thread's global index + uint idx = gl_GlobalInvocationID.x; + + // Bounds check: don't write beyond array + if (idx < push.element_count) { + output_buf.data[idx] = input_buf.data[idx]; + } +} diff --git a/shaders/copy.comp b/shaders/copy.comp new file mode 100644 index 0000000..fb45c82 --- /dev/null +++ b/shaders/copy.comp @@ -0,0 +1,19 @@ +#version 450 +layout(local_size_x = 16) in; + +// We treat the buffer as an array of 32-bit words. +// 16 words * 4 bytes = 64 bytes total. 
+layout(binding = 0) buffer SrcBuf { + uint src[]; +}; + +layout(binding = 1) buffer DstBuf { + uint dst[]; +}; + +void main() { + uint idx = gl_GlobalInvocationID.x; + if (idx < 16) { + dst[idx] = src[idx]; + } +} diff --git a/shaders/uppercase.comp b/shaders/uppercase.comp new file mode 100644 index 0000000..b345f67 --- /dev/null +++ b/shaders/uppercase.comp @@ -0,0 +1,36 @@ +# Shader for uppercase ASCII transformation +# Demonstrates data processing on GPU + +#version 450 + +layout(local_size_x = 256) in; + +# Input buffer (raw bytes as uint8) +layout(binding = 0) readonly buffer InputBuffer { + uint8_t data[]; +} input_buf; + +# Output buffer (transformed bytes) +layout(binding = 1) writeonly buffer OutputBuffer { + uint8_t data[]; +} output_buf; + +# Push constants +layout(push_constant) uniform PushConstants { + uint byte_count; +} push; + +void main() { + uint idx = gl_GlobalInvocationID.x; + + if (idx < push.byte_count) { + uint8_t c = input_buf.data[idx]; + + // Convert lowercase ASCII to uppercase (a-z -> A-Z) + if (c >= uint8_t('a') && c <= uint8_t('z')) { + c = c - uint8_t(32); + } + + output_buf.data[idx] = c; + } +} diff --git a/src/ds_runtime_vulkan.cpp b/src/ds_runtime_vulkan.cpp index 0a62460..24d692e 100644 --- a/src/ds_runtime_vulkan.cpp +++ b/src/ds_runtime_vulkan.cpp @@ -7,10 +7,15 @@ #include #include #include +#include #include +#include #include #include +#include +#include #include +#include #include #include @@ -20,6 +25,803 @@ namespace ds { namespace { +// Load SPIR-V bytecode from a file. +// Returns the bytecode as a vector of uint32_t words. +// Throws std::runtime_error if the file cannot be read or is invalid. 
+std::vector load_spirv_from_file(const std::string& path) { + // Open file in binary mode, seek to end to get size + std::ifstream file(path, std::ios::binary | std::ios::ate); + if (!file) { + throw std::runtime_error("Failed to open SPIR-V file: " + path); + } + + // Get file size + std::streamsize size = file.tellg(); + if (size <= 0) { + throw std::runtime_error("SPIR-V file is empty: " + path); + } + + // SPIR-V must be a multiple of 4 bytes (32-bit words) + if (size % 4 != 0) { + throw std::runtime_error("SPIR-V file size is not a multiple of 4 bytes: " + path); + } + + // Seek back to beginning + file.seekg(0, std::ios::beg); + + // Read the file into a buffer + std::vector buffer(static_cast(size)); + if (!file.read(buffer.data(), size)) { + throw std::runtime_error("Failed to read SPIR-V file: " + path); + } + + // Validate SPIR-V magic number (0x07230203 in little-endian) + if (buffer.size() >= 4) { + uint32_t magic = *reinterpret_cast(buffer.data()); + if (magic != 0x07230203) { + throw std::runtime_error("Invalid SPIR-V magic number in file: " + path); + } + } + + // Convert to uint32_t vector + std::vector spirv(buffer.size() / 4); + std::memcpy(spirv.data(), buffer.data(), buffer.size()); + + return spirv; +} + +// Wrapper for VkShaderModule with RAII lifecycle management. +class ShaderModule { +public: + // Create a shader module from SPIR-V bytecode. + // Throws std::runtime_error if creation fails. 
+ ShaderModule(VkDevice device, const std::vector& spirv_code) + : device_(device), module_(VK_NULL_HANDLE) + { + if (device == VK_NULL_HANDLE) { + throw std::runtime_error("Invalid VkDevice for shader module creation"); + } + + if (spirv_code.empty()) { + throw std::runtime_error("Empty SPIR-V code"); + } + + VkShaderModuleCreateInfo create_info{}; + create_info.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; + create_info.codeSize = spirv_code.size() * sizeof(uint32_t); + create_info.pCode = spirv_code.data(); + + VkResult result = vkCreateShaderModule(device_, &create_info, nullptr, &module_); + if (result != VK_SUCCESS) { + throw std::runtime_error("Failed to create VkShaderModule (VkResult: " + + std::to_string(static_cast(result)) + ")"); + } + } + + // Destructor cleans up the Vulkan shader module. + ~ShaderModule() { + if (module_ != VK_NULL_HANDLE && device_ != VK_NULL_HANDLE) { + vkDestroyShaderModule(device_, module_, nullptr); + } + } + + // Delete copy operations (shader modules should not be copied) + ShaderModule(const ShaderModule&) = delete; + ShaderModule& operator=(const ShaderModule&) = delete; + + // Allow move operations + ShaderModule(ShaderModule&& other) noexcept + : device_(other.device_), module_(other.module_) + { + other.module_ = VK_NULL_HANDLE; + other.device_ = VK_NULL_HANDLE; + } + + ShaderModule& operator=(ShaderModule&& other) noexcept { + if (this != &other) { + // Clean up our current module + if (module_ != VK_NULL_HANDLE && device_ != VK_NULL_HANDLE) { + vkDestroyShaderModule(device_, module_, nullptr); + } + + // Take ownership of other's module + device_ = other.device_; + module_ = other.module_; + other.module_ = VK_NULL_HANDLE; + other.device_ = VK_NULL_HANDLE; + } + return *this; + } + + // Get the underlying VkShaderModule handle. + VkShaderModule get() const { return module_; } + + // Check if the module is valid. 
+ bool is_valid() const { return module_ != VK_NULL_HANDLE; } + +private: + VkDevice device_; + VkShaderModule module_; +}; + +// Cache for shader modules to avoid reloading/recompiling the same shaders. +// Maps shader file paths to loaded shader modules. +class ShaderModuleCache { +public: + explicit ShaderModuleCache(VkDevice device) : device_(device) {} + + // Load a shader from file, returning a cached module if already loaded. + // Throws std::runtime_error if the shader cannot be loaded. + VkShaderModule load_shader(const std::string& path) { + // Check if already cached + auto it = cache_.find(path); + if (it != cache_.end()) { + return it->second.get(); + } + + // Load SPIR-V from file + std::vector spirv = load_spirv_from_file(path); + + // Create shader module + ShaderModule module(device_, spirv); + + // Cache it (move into cache) + VkShaderModule handle = module.get(); + cache_.emplace(path, std::move(module)); + + return handle; + } + + // Clear all cached shader modules. + void clear() { + cache_.clear(); + } + + // Get number of cached shaders. + std::size_t size() const { + return cache_.size(); + } + + // Check if a shader is cached. + bool has_shader(const std::string& path) const { + return cache_.find(path) != cache_.end(); + } + +private: + VkDevice device_; + std::unordered_map cache_; +}; + +// Descriptor set layout for compute shaders. +// Defines the bindings used by a compute pipeline. 
+struct DescriptorLayoutInfo { + std::vector bindings; + VkDescriptorSetLayout layout = VK_NULL_HANDLE; + + // Create the Vulkan descriptor set layout from bindings + void create(VkDevice device) { + if (layout != VK_NULL_HANDLE) { + return; // Already created + } + + VkDescriptorSetLayoutCreateInfo layout_info{}; + layout_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + layout_info.bindingCount = static_cast(bindings.size()); + layout_info.pBindings = bindings.data(); + + VkResult result = vkCreateDescriptorSetLayout(device, &layout_info, nullptr, &layout); + if (result != VK_SUCCESS) { + throw std::runtime_error("Failed to create descriptor set layout (VkResult: " + + std::to_string(static_cast(result)) + ")"); + } + } + + // Destroy the layout + void destroy(VkDevice device) { + if (layout != VK_NULL_HANDLE) { + vkDestroyDescriptorSetLayout(device, layout, nullptr); + layout = VK_NULL_HANDLE; + } + } +}; + +// Factory functions to create common descriptor layouts +namespace descriptor_layouts { + +// Layout for simple buffer copy: 2 storage buffers (input, output) +inline DescriptorLayoutInfo create_buffer_copy_layout() { + DescriptorLayoutInfo info; + info.bindings.resize(2); + + // Binding 0: Input buffer (read-only) + info.bindings[0].binding = 0; + info.bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + info.bindings[0].descriptorCount = 1; + info.bindings[0].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + info.bindings[0].pImmutableSamplers = nullptr; + + // Binding 1: Output buffer (write-only) + info.bindings[1].binding = 1; + info.bindings[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + info.bindings[1].descriptorCount = 1; + info.bindings[1].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + info.bindings[1].pImmutableSamplers = nullptr; + + return info; +} + +// Layout for decompression: 3 storage buffers (compressed, metadata, decompressed) +inline DescriptorLayoutInfo create_decompression_layout() { + 
DescriptorLayoutInfo info; + info.bindings.resize(3); + + // Binding 0: Compressed input buffer + info.bindings[0].binding = 0; + info.bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + info.bindings[0].descriptorCount = 1; + info.bindings[0].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + info.bindings[0].pImmutableSamplers = nullptr; + + // Binding 1: Metadata buffer (block info) + info.bindings[1].binding = 1; + info.bindings[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + info.bindings[1].descriptorCount = 1; + info.bindings[1].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + info.bindings[1].pImmutableSamplers = nullptr; + + // Binding 2: Decompressed output buffer + info.bindings[2].binding = 2; + info.bindings[2].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + info.bindings[2].descriptorCount = 1; + info.bindings[2].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + info.bindings[2].pImmutableSamplers = nullptr; + + return info; +} + +} // namespace descriptor_layouts + +// Descriptor pool for allocating descriptor sets. +// Pre-allocates a pool of descriptors that can be used by compute pipelines. 
+class DescriptorPool { +public: + explicit DescriptorPool(VkDevice device, uint32_t max_sets = 32) + : device_(device), pool_(VK_NULL_HANDLE) + { + // Size the pool for storage buffers (most common for compute) + // Each set needs up to 3 storage buffers (for decompression layout) + VkDescriptorPoolSize pool_size{}; + pool_size.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + pool_size.descriptorCount = max_sets * 3; // 3 buffers per set max + + VkDescriptorPoolCreateInfo pool_info{}; + pool_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + pool_info.poolSizeCount = 1; + pool_info.pPoolSizes = &pool_size; + pool_info.maxSets = max_sets; + pool_info.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; + + VkResult result = vkCreateDescriptorPool(device_, &pool_info, nullptr, &pool_); + if (result != VK_SUCCESS) { + throw std::runtime_error("Failed to create descriptor pool (VkResult: " + + std::to_string(static_cast(result)) + ")"); + } + } + + ~DescriptorPool() { + if (pool_ != VK_NULL_HANDLE) { + vkDestroyDescriptorPool(device_, pool_, nullptr); + } + } + + // Delete copy operations + DescriptorPool(const DescriptorPool&) = delete; + DescriptorPool& operator=(const DescriptorPool&) = delete; + + // Allow move operations + DescriptorPool(DescriptorPool&& other) noexcept + : device_(other.device_), pool_(other.pool_) + { + other.pool_ = VK_NULL_HANDLE; + } + + DescriptorPool& operator=(DescriptorPool&& other) noexcept { + if (this != &other) { + if (pool_ != VK_NULL_HANDLE) { + vkDestroyDescriptorPool(device_, pool_, nullptr); + } + device_ = other.device_; + pool_ = other.pool_; + other.pool_ = VK_NULL_HANDLE; + } + return *this; + } + + // Allocate a descriptor set from this pool + VkDescriptorSet allocate(VkDescriptorSetLayout layout) { + VkDescriptorSetAllocateInfo alloc_info{}; + alloc_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + alloc_info.descriptorPool = pool_; + alloc_info.descriptorSetCount = 1; + alloc_info.pSetLayouts = 
&layout; + + VkDescriptorSet descriptor_set; + VkResult result = vkAllocateDescriptorSets(device_, &alloc_info, &descriptor_set); + if (result != VK_SUCCESS) { + throw std::runtime_error("Failed to allocate descriptor set (VkResult: " + + std::to_string(static_cast(result)) + ")"); + } + + return descriptor_set; + } + + // Free a descriptor set back to the pool + void free(VkDescriptorSet descriptor_set) { + vkFreeDescriptorSets(device_, pool_, 1, &descriptor_set); + } + + // Reset the entire pool (frees all allocated sets) + void reset() { + vkResetDescriptorPool(device_, pool_, 0); + } + + VkDescriptorPool get() const { return pool_; } + +private: + VkDevice device_; + VkDescriptorPool pool_; +}; + +// Helper functions for updating descriptor sets with buffer bindings. +namespace descriptor_updates { + +// Update a descriptor set with buffer bindings. +// Used to bind actual VkBuffer handles to descriptor set bindings. +struct BufferBinding { + uint32_t binding; // Binding index (matches shader layout) + VkBuffer buffer; // Buffer to bind + VkDeviceSize offset; // Offset into buffer + VkDeviceSize range; // Size of buffer region (or VK_WHOLE_SIZE) +}; + +// Update descriptor set with storage buffer bindings. +// Example: bind input and output buffers for compute shader. 
+inline void update_storage_buffers(VkDevice device, + VkDescriptorSet descriptor_set, + const std::vector& bindings) +{ + std::vector buffer_infos; + std::vector writes; + + buffer_infos.reserve(bindings.size()); + writes.reserve(bindings.size()); + + for (const auto& binding : bindings) { + // Create buffer info + VkDescriptorBufferInfo buffer_info{}; + buffer_info.buffer = binding.buffer; + buffer_info.offset = binding.offset; + buffer_info.range = binding.range; + buffer_infos.push_back(buffer_info); + + // Create write descriptor + VkWriteDescriptorSet write{}; + write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + write.dstSet = descriptor_set; + write.dstBinding = binding.binding; + write.dstArrayElement = 0; + write.descriptorCount = 1; + write.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + write.pBufferInfo = &buffer_infos[writes.size()]; + writes.push_back(write); + } + + // Update all bindings at once + vkUpdateDescriptorSets(device, + static_cast(writes.size()), + writes.data(), + 0, nullptr); +} + +// Convenience function for 2-buffer copy layout (input, output) +inline void update_copy_buffers(VkDevice device, + VkDescriptorSet descriptor_set, + VkBuffer input_buffer, + VkDeviceSize input_size, + VkBuffer output_buffer, + VkDeviceSize output_size) +{ + std::vector bindings = { + {0, input_buffer, 0, input_size}, // Binding 0: input + {1, output_buffer, 0, output_size} // Binding 1: output + }; + update_storage_buffers(device, descriptor_set, bindings); +} + +// Convenience function for 3-buffer decompression layout +inline void update_decompression_buffers(VkDevice device, + VkDescriptorSet descriptor_set, + VkBuffer compressed_buffer, + VkDeviceSize compressed_size, + VkBuffer metadata_buffer, + VkDeviceSize metadata_size, + VkBuffer decompressed_buffer, + VkDeviceSize decompressed_size) +{ + std::vector bindings = { + {0, compressed_buffer, 0, compressed_size}, // Binding 0: compressed + {1, metadata_buffer, 0, metadata_size}, // Binding 
1: metadata + {2, decompressed_buffer, 0, decompressed_size} // Binding 2: decompressed + }; + update_storage_buffers(device, descriptor_set, bindings); +} + +} // namespace descriptor_updates + +// Pipeline layout wrapper with RAII lifecycle management. +// Combines descriptor set layouts and push constants. +class PipelineLayout { +public: + // Create pipeline layout from descriptor layouts and optional push constants + PipelineLayout(VkDevice device, + const std::vector& descriptor_layouts, + const std::vector& push_constant_ranges = {}) + : device_(device), layout_(VK_NULL_HANDLE) + { + VkPipelineLayoutCreateInfo layout_info{}; + layout_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + layout_info.setLayoutCount = static_cast(descriptor_layouts.size()); + layout_info.pSetLayouts = descriptor_layouts.data(); + layout_info.pushConstantRangeCount = static_cast(push_constant_ranges.size()); + layout_info.pPushConstantRanges = push_constant_ranges.empty() ? nullptr : push_constant_ranges.data(); + + VkResult result = vkCreatePipelineLayout(device_, &layout_info, nullptr, &layout_); + if (result != VK_SUCCESS) { + throw std::runtime_error("Failed to create pipeline layout (VkResult: " + + std::to_string(static_cast(result)) + ")"); + } + } + + ~PipelineLayout() { + if (layout_ != VK_NULL_HANDLE && device_ != VK_NULL_HANDLE) { + vkDestroyPipelineLayout(device_, layout_, nullptr); + } + } + + // Delete copy operations + PipelineLayout(const PipelineLayout&) = delete; + PipelineLayout& operator=(const PipelineLayout&) = delete; + + // Allow move operations + PipelineLayout(PipelineLayout&& other) noexcept + : device_(other.device_), layout_(other.layout_) + { + other.layout_ = VK_NULL_HANDLE; + other.device_ = VK_NULL_HANDLE; + } + + PipelineLayout& operator=(PipelineLayout&& other) noexcept { + if (this != &other) { + if (layout_ != VK_NULL_HANDLE && device_ != VK_NULL_HANDLE) { + vkDestroyPipelineLayout(device_, layout_, nullptr); + } + device_ = 
other.device_; + layout_ = other.layout_; + other.layout_ = VK_NULL_HANDLE; + other.device_ = VK_NULL_HANDLE; + } + return *this; + } + + VkPipelineLayout get() const { return layout_; } + bool is_valid() const { return layout_ != VK_NULL_HANDLE; } + +private: + VkDevice device_; + VkPipelineLayout layout_; +}; + +// Helper to create common push constant ranges +namespace push_constants { + +// Push constant range for compute shaders (typically for dispatch parameters) +inline VkPushConstantRange create_compute_range(uint32_t size, uint32_t offset = 0) { + VkPushConstantRange range{}; + range.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + range.offset = offset; + range.size = size; + return range; +} + +// Common push constant for element count (single uint32) +inline VkPushConstantRange create_element_count_range() { + return create_compute_range(sizeof(uint32_t), 0); +} + +} // namespace push_constants + +// Compute pipeline wrapper with RAII lifecycle management. +class ComputePipeline { +public: + // Create compute pipeline from shader module and pipeline layout + ComputePipeline(VkDevice device, + VkShaderModule shader_module, + VkPipelineLayout pipeline_layout, + const char* entry_point = "main", + VkPipelineCache cache = VK_NULL_HANDLE) + : device_(device), pipeline_(VK_NULL_HANDLE) + { + // Shader stage info + VkPipelineShaderStageCreateInfo stage_info{}; + stage_info.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + stage_info.stage = VK_SHADER_STAGE_COMPUTE_BIT; + stage_info.module = shader_module; + stage_info.pName = entry_point; + + // Compute pipeline info + VkComputePipelineCreateInfo pipeline_info{}; + pipeline_info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; + pipeline_info.stage = stage_info; + pipeline_info.layout = pipeline_layout; + + VkResult result = vkCreateComputePipelines(device_, cache, 1, &pipeline_info, nullptr, &pipeline_); + if (result != VK_SUCCESS) { + throw std::runtime_error("Failed to create compute pipeline 
(VkResult: " + + std::to_string(static_cast(result)) + ")"); + } + } + + ~ComputePipeline() { + if (pipeline_ != VK_NULL_HANDLE && device_ != VK_NULL_HANDLE) { + vkDestroyPipeline(device_, pipeline_, nullptr); + } + } + + // Delete copy operations + ComputePipeline(const ComputePipeline&) = delete; + ComputePipeline& operator=(const ComputePipeline&) = delete; + + // Allow move operations + ComputePipeline(ComputePipeline&& other) noexcept + : device_(other.device_), pipeline_(other.pipeline_) + { + other.pipeline_ = VK_NULL_HANDLE; + other.device_ = VK_NULL_HANDLE; + } + + ComputePipeline& operator=(ComputePipeline&& other) noexcept { + if (this != &other) { + if (pipeline_ != VK_NULL_HANDLE && device_ != VK_NULL_HANDLE) { + vkDestroyPipeline(device_, pipeline_, nullptr); + } + device_ = other.device_; + pipeline_ = other.pipeline_; + other.pipeline_ = VK_NULL_HANDLE; + other.device_ = VK_NULL_HANDLE; + } + return *this; + } + + VkPipeline get() const { return pipeline_; } + bool is_valid() const { return pipeline_ != VK_NULL_HANDLE; } + +private: + VkDevice device_; + VkPipeline pipeline_; +}; + +// Complete pipeline bundle (layout + pipeline + descriptor layout) +struct ComputePipelineBundle { + DescriptorLayoutInfo descriptor_layout; + std::unique_ptr pipeline_layout; + std::unique_ptr pipeline; + + // Check if bundle is complete and valid + bool is_valid() const { + return descriptor_layout.layout != VK_NULL_HANDLE && + pipeline_layout && pipeline_layout->is_valid() && + pipeline && pipeline->is_valid(); + } +}; + +// Factory functions for creating complete pipeline bundles +namespace pipeline_factory { + +// Create a buffer copy pipeline (2 storage buffers, 1 push constant for element count) +inline ComputePipelineBundle create_buffer_copy_pipeline( + VkDevice device, + ShaderModuleCache& shader_cache, + const std::string& shader_path) +{ + ComputePipelineBundle bundle; + + // 1. 
Create descriptor layout + bundle.descriptor_layout = descriptor_layouts::create_buffer_copy_layout(); + bundle.descriptor_layout.create(device); + + // 2. Create pipeline layout with push constants + std::vector desc_layouts = {bundle.descriptor_layout.layout}; + std::vector push_ranges = {push_constants::create_element_count_range()}; + bundle.pipeline_layout = std::make_unique(device, desc_layouts, push_ranges); + + // 3. Load shader and create pipeline + VkShaderModule shader = shader_cache.load_shader(shader_path); + bundle.pipeline = std::make_unique(device, shader, bundle.pipeline_layout->get()); + + return bundle; +} + +// Create a decompression pipeline (3 storage buffers, push constants for parameters) +inline ComputePipelineBundle create_decompression_pipeline( + VkDevice device, + ShaderModuleCache& shader_cache, + const std::string& shader_path) +{ + ComputePipelineBundle bundle; + + // 1. Create descriptor layout + bundle.descriptor_layout = descriptor_layouts::create_decompression_layout(); + bundle.descriptor_layout.create(device); + + // 2. Create pipeline layout with push constants (4 uint32s for decompression params) + std::vector desc_layouts = {bundle.descriptor_layout.layout}; + std::vector push_ranges = { + push_constants::create_compute_range(sizeof(uint32_t) * 4, 0) + }; + bundle.pipeline_layout = std::make_unique(device, desc_layouts, push_ranges); + + // 3. 
Load shader and create pipeline + VkShaderModule shader = shader_cache.load_shader(shader_path); + bundle.pipeline = std::make_unique(device, shader, bundle.pipeline_layout->get()); + + return bundle; +} + +} // namespace pipeline_factory + +// Command buffer recording helpers for compute dispatches +namespace compute_dispatch { + +// Helper to record compute dispatch commands into a command buffer +struct DispatchInfo { + VkPipeline pipeline; + VkPipelineLayout pipeline_layout; + VkDescriptorSet descriptor_set; + uint32_t workgroup_count_x; + uint32_t workgroup_count_y = 1; + uint32_t workgroup_count_z = 1; + const void* push_constants_data = nullptr; + uint32_t push_constants_size = 0; + uint32_t push_constants_offset = 0; +}; + +// Record compute dispatch commands into a command buffer +inline void record_compute_dispatch(VkCommandBuffer cmd, const DispatchInfo& info) { + // Bind compute pipeline + vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, info.pipeline); + + // Bind descriptor set + vkCmdBindDescriptorSets( + cmd, + VK_PIPELINE_BIND_POINT_COMPUTE, + info.pipeline_layout, + 0, // first set + 1, // descriptor set count + &info.descriptor_set, + 0, // dynamic offset count + nullptr + ); + + // Push constants (if provided) + if (info.push_constants_data && info.push_constants_size > 0) { + vkCmdPushConstants( + cmd, + info.pipeline_layout, + VK_SHADER_STAGE_COMPUTE_BIT, + info.push_constants_offset, + info.push_constants_size, + info.push_constants_data + ); + } + + // Dispatch compute work + vkCmdDispatch(cmd, info.workgroup_count_x, info.workgroup_count_y, info.workgroup_count_z); +} + +// Calculate workgroup count for 1D data (e.g., buffer processing) +inline uint32_t calculate_workgroup_count_1d(uint32_t element_count, uint32_t workgroup_size) { + return (element_count + workgroup_size - 1) / workgroup_size; +} + +// Memory barrier helpers for synchronization +struct MemoryBarrierInfo { + VkAccessFlags src_access_mask; + VkAccessFlags 
dst_access_mask; + VkBuffer buffer; + VkDeviceSize offset; + VkDeviceSize size; +}; + +// Insert buffer memory barrier +inline void insert_buffer_barrier(VkCommandBuffer cmd, + VkPipelineStageFlags src_stage, + VkPipelineStageFlags dst_stage, + const MemoryBarrierInfo& info) +{ + VkBufferMemoryBarrier barrier{}; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.srcAccessMask = info.src_access_mask; + barrier.dstAccessMask = info.dst_access_mask; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.buffer = info.buffer; + barrier.offset = info.offset; + barrier.size = info.size; + + vkCmdPipelineBarrier( + cmd, + src_stage, + dst_stage, + 0, // dependency flags + 0, nullptr, // memory barriers + 1, &barrier, // buffer memory barriers + 0, nullptr // image memory barriers + ); +} + +// Barrier: Transfer Write โ†’ Compute Read (after staging buffer copy, before compute) +inline void barrier_transfer_to_compute(VkCommandBuffer cmd, VkBuffer buffer, VkDeviceSize size) { + MemoryBarrierInfo info; + info.src_access_mask = VK_ACCESS_TRANSFER_WRITE_BIT; + info.dst_access_mask = VK_ACCESS_SHADER_READ_BIT; + info.buffer = buffer; + info.offset = 0; + info.size = size; + + insert_buffer_barrier( + cmd, + VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + info + ); +} + +// Barrier: Compute Write โ†’ Transfer Read (after compute, before readback) +inline void barrier_compute_to_transfer(VkCommandBuffer cmd, VkBuffer buffer, VkDeviceSize size) { + MemoryBarrierInfo info; + info.src_access_mask = VK_ACCESS_SHADER_WRITE_BIT; + info.dst_access_mask = VK_ACCESS_TRANSFER_READ_BIT; + info.buffer = buffer; + info.offset = 0; + info.size = size; + + insert_buffer_barrier( + cmd, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_TRANSFER_BIT, + info + ); +} + +// Barrier: Compute Write โ†’ Compute Read (between dependent compute passes) +inline void 
barrier_compute_to_compute(VkCommandBuffer cmd, VkBuffer buffer, VkDeviceSize size) { + MemoryBarrierInfo info; + info.src_access_mask = VK_ACCESS_SHADER_WRITE_BIT; + info.dst_access_mask = VK_ACCESS_SHADER_READ_BIT; + info.buffer = buffer; + info.offset = 0; + info.size = size; + + insert_buffer_barrier( + cmd, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + info + ); +} + +} // namespace compute_dispatch + // Simple fixed-size thread pool to keep backend execution async. // This mirrors the CPU backend model but is local to the Vulkan backend // to keep responsibilities self-contained. diff --git a/tests/gdeflate_format_test.cpp b/tests/gdeflate_format_test.cpp new file mode 100644 index 0000000..0f6c2f5 --- /dev/null +++ b/tests/gdeflate_format_test.cpp @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: Apache-2.0 +// Test for GDeflate format parsing + +#include "gdeflate_format.h" +#include +#include + +using namespace ds::gdeflate; + +// Test valid header parsing +bool test_valid_header() { + FileHeader header; + header.magic = GDEFLATE_MAGIC; + header.version_major = GDEFLATE_VERSION_MAJOR; + header.version_minor = GDEFLATE_VERSION_MINOR; + header.flags = 0; + header.uncompressed_size = 1024; + header.compressed_size = 512; + header.block_count = 1; + header.reserved[0] = 0; + header.reserved[1] = 0; + + if (!header.is_valid()) { + std::cerr << "Valid header failed validation\n"; + return false; + } + + // Test parsing + FileHeader parsed; + if (!parse_file_header(&header, sizeof(header), parsed)) { + std::cerr << "Failed to parse valid header\n"; + return false; + } + + if (parsed.magic != GDEFLATE_MAGIC || + parsed.uncompressed_size != 1024 || + parsed.block_count != 1) { + std::cerr << "Parsed header data mismatch\n"; + return false; + } + + return true; +} + +// Test invalid header (bad magic) +bool test_invalid_magic() { + FileHeader header; + header.magic = 0xDEADBEEF; // Wrong magic + header.version_major = 
GDEFLATE_VERSION_MAJOR; + header.version_minor = GDEFLATE_VERSION_MINOR; + header.uncompressed_size = 1024; + header.compressed_size = 512; + header.block_count = 1; + + if (header.is_valid()) { + std::cerr << "Invalid magic passed validation\n"; + return false; + } + + return true; +} + +// Test block info parsing +bool test_block_info() { + BlockInfo block; + block.offset = 0; + block.compressed_size = 256; + block.uncompressed_size = 512; + block.checksum = 0x12345678; + + if (!block.is_valid()) { + std::cerr << "Valid block failed validation\n"; + return false; + } + + // Test parsing multiple blocks + std::vector blocks; + BlockInfo block_array[3]; + for (int i = 0; i < 3; i++) { + block_array[i] = block; + block_array[i].offset = static_cast(i * 256); + } + + size_t parsed = parse_block_info(block_array, sizeof(block_array), 3, blocks); + if (parsed != 3) { + std::cerr << "Failed to parse blocks: " << parsed << "\n"; + return false; + } + + return true; +} + +// Test complete stream info +bool test_stream_info() { + // Create test data + const size_t total_size = sizeof(FileHeader) + sizeof(BlockInfo) * 2; + uint8_t buffer[total_size]; + + // Write header + FileHeader* header = reinterpret_cast(buffer); + header->magic = GDEFLATE_MAGIC; + header->version_major = GDEFLATE_VERSION_MAJOR; + header->version_minor = GDEFLATE_VERSION_MINOR; + header->flags = 0; + header->uncompressed_size = 2048; + header->compressed_size = 1024; + header->block_count = 2; + header->reserved[0] = 0; + header->reserved[1] = 0; + + // Write blocks + BlockInfo* blocks = reinterpret_cast(buffer + sizeof(FileHeader)); + blocks[0].offset = 0; + blocks[0].compressed_size = 512; + blocks[0].uncompressed_size = 1024; + blocks[0].checksum = 0x11111111; + + blocks[1].offset = 512; + blocks[1].compressed_size = 512; + blocks[1].uncompressed_size = 1024; + blocks[1].checksum = 0x22222222; + + // Parse stream + StreamInfo info; + if (!parse_stream_info(buffer, total_size, info)) { + std::cerr << 
"Failed to parse stream info\n"; + return false; + } + + if (info.blocks.size() != 2) { + std::cerr << "Wrong block count: " << info.blocks.size() << "\n"; + return false; + } + + if (info.get_uncompressed_size() != 2048) { + std::cerr << "Wrong uncompressed size: " << info.get_uncompressed_size() << "\n"; + return false; + } + + return true; +} + +int main() { + std::cout << "[gdeflate_format_test] Running tests...\n"; + + if (!test_valid_header()) { + std::cerr << "[gdeflate_format_test] test_valid_header FAILED\n"; + return 1; + } + std::cout << "[gdeflate_format_test] test_valid_header PASSED\n"; + + if (!test_invalid_magic()) { + std::cerr << "[gdeflate_format_test] test_invalid_magic FAILED\n"; + return 1; + } + std::cout << "[gdeflate_format_test] test_invalid_magic PASSED\n"; + + if (!test_block_info()) { + std::cerr << "[gdeflate_format_test] test_block_info FAILED\n"; + return 1; + } + std::cout << "[gdeflate_format_test] test_block_info PASSED\n"; + + if (!test_stream_info()) { + std::cerr << "[gdeflate_format_test] test_stream_info FAILED\n"; + return 1; + } + std::cout << "[gdeflate_format_test] test_stream_info PASSED\n"; + + std::cout << "[gdeflate_format_test] ALL TESTS PASSED\n"; + return 0; +}