diff --git a/CMakeLists.txt b/CMakeLists.txt index 35bc118..55ecce9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,6 +72,49 @@ if (PkgConfig_FOUND) pkg_check_modules(LIBURING liburing) endif() +# ============================================================ +# Shader compilation +# +# If glslangValidator is available, compile compute shaders +# from GLSL to SPIR-V automatically during the build. +# ============================================================ + +find_program(GLSLANG_VALIDATOR glslangValidator) + +if(GLSLANG_VALIDATOR) + message(STATUS "Found glslangValidator: ${GLSLANG_VALIDATOR}") + + # Find all .comp shader files + file(GLOB SHADER_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/shaders/*.comp) + + set(SPIRV_SHADERS) + foreach(SHADER ${SHADER_SOURCES}) + get_filename_component(SHADER_NAME ${SHADER} NAME) + set(SPIRV "${CMAKE_CURRENT_BINARY_DIR}/shaders/${SHADER_NAME}.spv") + + add_custom_command( + OUTPUT ${SPIRV} + COMMAND ${CMAKE_COMMAND} -E make_directory + "${CMAKE_CURRENT_BINARY_DIR}/shaders" + COMMAND ${GLSLANG_VALIDATOR} -V ${SHADER} -o ${SPIRV} + DEPENDS ${SHADER} + COMMENT "Compiling shader ${SHADER_NAME} to SPIR-V" + VERBATIM + ) + + list(APPEND SPIRV_SHADERS ${SPIRV}) + endforeach() + + if(SPIRV_SHADERS) + list(LENGTH SPIRV_SHADERS SPIRV_SHADER_COUNT) + add_custom_target(shaders ALL DEPENDS ${SPIRV_SHADERS}) + message(STATUS "Shader compilation enabled (${SPIRV_SHADER_COUNT} shaders found)") + endif() +else() + message(STATUS "glslangValidator not found - shader compilation disabled") + message(STATUS " To enable: install vulkan-tools or spirv-tools package") +endif() + # ============================================================ # Core runtime library # @@ -232,6 +275,17 @@ if (DS_BUILD_TESTS) add_test(NAME ds_cpu_backend_test COMMAND ds_cpu_backend_test) add_test(NAME ds_error_handling_test COMMAND ds_error_handling_test) + # GDeflate format test + add_executable(ds_gdeflate_format_test + tests/gdeflate_format_test.cpp + ) + if (TARGET ds_runtime) + 
target_link_libraries(ds_gdeflate_format_test PRIVATE ds_runtime) + elseif (TARGET ds_runtime_static) + target_link_libraries(ds_gdeflate_format_test PRIVATE ds_runtime_static) + endif() + add_test(NAME ds_gdeflate_format_test COMMAND ds_gdeflate_format_test) + if (LIBURING_FOUND) add_executable(ds_io_uring_tests tests/io_uring_backend_test.cpp diff --git a/README.md b/README.md index 9d1917e..553ac32 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,16 @@ The codebase has been significantly improved: - โš ๏ธ **io_uring backend**: Requires liburing dependency (not built by default) - โš ๏ธ **Request cancellation**: Enum added but cancel() method not yet implemented +### ๐Ÿ“‹ Investigation & Planning Complete (Phase 0) +Comprehensive investigation and planning documents have been created: +- **[Master Roadmap](docs/master_roadmap.md)** (30KB) - Complete 36-week phased plan with microtasks +- **[GDeflate Investigation](docs/investigation_gdeflate.md)** (16KB) - CPU & GPU implementation plan +- **[Vulkan Compute Investigation](docs/investigation_vulkan_compute.md)** (26KB) - GPU compute pipelines +- **[io_uring Investigation](docs/investigation_io_uring.md)** (20KB) - Multi-worker backend enhancement +- **[Additional Features](docs/investigation_remaining_features.md)** (18KB) - Cancellation, GPU workflows, Wine/Proton + +Timeline: 36 weeks for full implementation, 12 weeks for MVP + See [MISSING_FEATURES.md](MISSING_FEATURES.md) for the complete roadmap and [COMPARISON.md](COMPARISON.md) for documentation vs reality comparison. 
--- diff --git a/docs/investigation_gdeflate.md b/docs/investigation_gdeflate.md new file mode 100644 index 0000000..a121dd4 --- /dev/null +++ b/docs/investigation_gdeflate.md @@ -0,0 +1,563 @@ +# GDeflate Compression Investigation and Implementation Plan + +**Status:** Research Phase +**Priority:** High +**Target:** CPU and GPU GDeflate decompression support +**Dependencies:** None for CPU; Vulkan compute pipeline for GPU + +--- + +## Executive Summary + +GDeflate is Microsoft's custom compression format for DirectStorage, designed for efficient GPU decompression. This document outlines the investigation, design, and implementation plan for adding GDeflate support to ds-runtime. + +--- + +## 1. Background + +### 1.1 What is GDeflate? + +GDeflate is a GPU-friendly variant of DEFLATE compression used in Microsoft DirectStorage. Key characteristics: + +- **Block-based structure**: Data is compressed in independent blocks for parallel GPU decompression +- **Modified DEFLATE**: Based on standard DEFLATE but optimized for GPU compute +- **Metadata format**: Includes block headers with size information +- **Designed for parallelism**: Each block can be decompressed independently + +### 1.2 Current State + +- **Status**: Intentionally stubbed - returns `ENOTSUP` error +- **Test**: `tests/compression_gdeflate_stub_test.cpp` verifies failure +- **API**: `Compression::GDeflate` enum value exists +- **Path**: Error callback triggered when requested + +--- + +## 2. Research Requirements + +### 2.1 Format Specification + +**Primary Goal**: Obtain or reverse-engineer GDeflate format specification + +**Approaches**: + +1. **Official Documentation** + - Check Microsoft DirectStorage SDK documentation + - Review DirectStorage headers and samples + - Search for format specifications in MSDN + +2. **Reverse Engineering** + - Analyze DirectStorage DLL behavior + - Examine compressed asset samples + - Study existing decompression implementations + +3. 
**Community Resources** + - Wine/Proton community investigations + - Graphics programming forums + - Open-source game engine implementations + +**Deliverable**: GDeflate format specification document + +### 2.2 Existing Implementations + +**Research Goals**: +- Identify existing open-source GDeflate decoders +- Review Wine/Proton DirectStorage implementation status +- Study GPU decompression implementations (if any) + +**Potential Sources**: +- Wine project (dstorage.dll implementation) +- Game engine implementations (Unreal, Unity plugins) +- Graphics libraries with DirectStorage support +- Academic papers on GPU decompression + +--- + +## 3. CPU Implementation Plan + +### 3.1 Architecture + +``` +Input: Compressed buffer + size + โ†“ +Block Header Parser + โ†“ (block metadata: offset, compressed_size, uncompressed_size) +Per-Block Decompression Loop + โ†“ (DEFLATE decode per block) +Output: Decompressed buffer +``` + +### 3.2 Implementation Phases + +#### Phase 3.1: Format Understanding +**Tasks**: +- Document GDeflate file/stream structure +- Identify block header format +- Document compression parameters +- Understand dictionary handling + +**Deliverables**: +- Format specification document +- Test asset creation tool + +#### Phase 3.2: Block Header Parser +**Tasks**: +- Implement GDeflate header parsing +- Extract block metadata (offsets, sizes) +- Validate header checksums (if present) +- Error handling for corrupted headers + +**Location**: `src/gdeflate_decoder.cpp` + +**API**: +```cpp +struct GDeflateBlockInfo { + uint64_t offset; + uint32_t compressed_size; + uint32_t uncompressed_size; +}; + +std::vector<GDeflateBlockInfo> parse_gdeflate_header( + const void* data, + size_t size +); +``` + +#### Phase 3.3: DEFLATE Decoder Integration +**Tasks**: +- Choose DEFLATE library (zlib, miniz, or custom) +- Integrate block decompression +- Handle partial decompression +- Add streaming support + +**Library Options**: +1. 
**zlib** (standard, widely available) + - Pros: Mature, well-tested, optimized + - Cons: May need modification for block format + +2. **miniz** (single-file, public domain) + - Pros: Easy integration, no dependencies + - Cons: May be slower than zlib + +3. **Custom implementation** + - Pros: Full control, GDeflate-specific optimizations + - Cons: High development effort, testing burden + +**Recommendation**: Start with zlib, consider optimization later + +#### Phase 3.4: CPU Backend Integration +**Tasks**: +- Remove ENOTSUP stub from `ds_runtime.cpp` +- Wire GDeflate decoder into decompression pipeline +- Add error handling for decompression failures +- Update tests to verify successful decompression + +**Location**: `src/ds_runtime.cpp` (CpuBackend::decompress) + +**Changes**: +```cpp +// Replace stub: +if (req.compression == Compression::GDeflate) { + report_error("cpu", "decompression", ENOTSUP, + "GDeflate compression is not yet implemented (ENOTSUP)"); + return; +} + +// With implementation: +if (req.compression == Compression::GDeflate) { + if (!gdeflate_decompress(req.dst, req.size, ...)) { + report_request_error("cpu", "decompression", errno, + "GDeflate decompression failed", req); + return; + } +} +``` + +### 3.3 Testing Strategy + +#### Test Assets +- Create compressed test files with known content +- Various block sizes (1KB, 4KB, 16KB, 64KB) +- Edge cases: empty blocks, maximum compression, random data + +#### Test Cases +1. **Basic decompression**: Simple compressed buffer โ†’ original data +2. **Multi-block**: File with multiple independent blocks +3. **Error handling**: Corrupted header, truncated data, invalid checksums +4. **Performance**: Benchmark against uncompressed I/O +5. 
**Partial reads**: Decompression with offset/size parameters + +**New Test File**: `tests/gdeflate_cpu_test.cpp` + +### 3.4 Performance Considerations + +**Metrics to Track**: +- Decompression throughput (MB/s) +- CPU utilization per thread +- Memory overhead during decompression +- Comparison vs uncompressed I/O + +**Optimization Opportunities**: +- Parallel block decompression (thread pool) +- SIMD optimizations for DEFLATE decode +- Memory pool for temporary buffers +- Block prefetching for streaming + +--- + +## 4. GPU Implementation Plan + +### 4.1 Architecture + +``` +CPU: Parse block headers โ†’ metadata to GPU + โ†“ +GPU: Parallel block decompression (compute shader) + โ†“ (one workgroup per block or chunk) +GPU: Output to GPU buffer +``` + +### 4.2 Prerequisites + +**Required Infrastructure** (from Vulkan compute investigation): +- Compute pipeline creation โœ… (planned) +- Shader module loading โœ… (planned) +- Descriptor set management โœ… (planned) +- GPU buffer management โœ… (exists) + +**Dependency**: Vulkan GPU compute pipeline implementation must be complete + +### 4.3 Implementation Phases + +#### Phase 4.1: Compute Shader Design +**Tasks**: +- Design GLSL compute shader for DEFLATE decode +- Implement block-parallel decompression +- Handle LZ77 back-references efficiently +- Optimize for GPU wavefront/warp sizes + +**File**: `shaders/gdeflate_decompress.comp` + +**Key Challenges**: +1. **Shared memory management**: LZ77 history buffer +2. **Divergent execution**: Huffman decoding branches +3. **Synchronization**: Block-independent decompression +4. 
**Memory access patterns**: Coalesced reads/writes + +**Shader Structure**: +```glsl +#version 450 + +// Input: compressed blocks buffer +layout(binding = 0) readonly buffer CompressedData { + uint data[]; +} compressed; + +// Input: block metadata +layout(binding = 1) readonly buffer BlockInfo { + uint offset[]; + uint compressed_size[]; + uint uncompressed_size[]; +} blocks; + +// Output: decompressed data +layout(binding = 2) writeonly buffer DecompressedData { + uint data[]; +} decompressed; + +layout(local_size_x = 256) in; + +void main() { + uint block_id = gl_GlobalInvocationID.x; + // Decompress block[block_id] + // ... +} +``` + +#### Phase 4.2: GPU Backend Integration +**Tasks**: +- Add GDeflate compute pipeline to VulkanBackend +- Create descriptor sets for compression buffers +- Dispatch compute workload for decompression +- Handle synchronization (barriers, fences) + +**Location**: `src/ds_runtime_vulkan.cpp` + +**Changes**: +- Add `gdeflate_pipeline_` member to VulkanBackend +- Create compression-specific descriptor layout +- Dispatch compute before/after staging buffer copies + +#### Phase 4.3: CPU-GPU Hybrid Strategy +**Goal**: Choose CPU vs GPU decompression based on data characteristics + +**Decision Factors**: +1. **Data size**: Small files โ†’ CPU, Large files โ†’ GPU +2. **Block count**: Few blocks โ†’ CPU, Many blocks โ†’ GPU +3. **Request destination**: Host memory โ†’ CPU, GPU buffer โ†’ GPU +4. 
**GPU availability**: Fallback to CPU if GPU busy + +**Configuration**: +```cpp +struct CompressionConfig { + size_t gpu_threshold_bytes = 1024 * 1024; // 1MB + size_t min_blocks_for_gpu = 16; + bool prefer_gpu_for_gpu_memory = true; +}; +``` + +### 4.4 Performance Targets + +**Goals**: +- GPU decompression โ‰ฅ 5x faster than CPU (for large files) +- Minimal CPU involvement during GPU path +- Efficient for small files (CPU path competitive) + +**Benchmarks**: +- 1MB, 10MB, 100MB compressed assets +- Various compression ratios (1.5x, 3x, 5x) +- CPU vs GPU throughput comparison +- GPU occupancy and utilization metrics + +--- + +## 5. Dependencies + +### 5.1 External Libraries + +**CPU Path**: +- zlib or miniz (DEFLATE decompression) +- No additional system dependencies + +**GPU Path**: +- Vulkan SDK (already optional dependency) +- SPIR-V compiler for shaders (glslangValidator) +- Vulkan compute pipeline support (from separate investigation) + +### 5.2 Internal Dependencies + +**Build System**: +- Add GDeflate source files to CMakeLists.txt +- Optional dependency on compression library +- Shader compilation step in build + +**API Changes**: +- No breaking changes (GDeflate enum already exists) +- Remove ENOTSUP stub behavior +- Update documentation to reflect support + +--- + +## 6. Testing and Validation + +### 6.1 Unit Tests + +**Test Coverage**: +1. Block header parsing (valid/invalid headers) +2. Single-block decompression (various sizes) +3. Multi-block decompression (independent blocks) +4. Error handling (corrupted data, invalid sizes) +5. Compression ratio validation (known test vectors) + +**Test Files**: +- `tests/gdeflate_format_test.cpp` - Header parsing +- `tests/gdeflate_cpu_test.cpp` - CPU decompression +- `tests/gdeflate_gpu_test.cpp` - GPU decompression (if Vulkan available) + +### 6.2 Integration Tests + +**Scenarios**: +1. CPU backend with GDeflate compression +2. Vulkan backend with GDeflate GPU decompression +3. 
Mixed requests (compressed + uncompressed) +4. Error cases (invalid compression, missing data) + +**Update Existing Test**: +- Modify `tests/compression_gdeflate_stub_test.cpp` +- Change from "verify failure" to "verify success" +- Add decompression correctness checks + +### 6.3 Real-World Testing + +**Asset Types**: +- Game textures (DDS, KTX) +- Mesh data (binary vertex buffers) +- Shader bytecode (SPIR-V, DXBC) +- Mixed asset packs + +**Performance Testing**: +- Asset streaming demo with GDeflate assets +- Benchmark vs uncompressed baseline +- CPU vs GPU comparison +- DirectStorage parity testing (if possible on Windows) + +--- + +## 7. Documentation Requirements + +### 7.1 Format Documentation + +**Document**: `docs/gdeflate_format.md` + +**Contents**: +- GDeflate file structure +- Block header format +- Compression parameters +- Differences from standard DEFLATE +- Decoder state machine + +### 7.2 Implementation Documentation + +**Updates Required**: +- `README.md`: Change GDeflate status from โš ๏ธ to โœ… +- `MISSING_FEATURES.md`: Mark GDeflate items as complete +- `docs/design.md`: Add GDeflate pipeline description +- API documentation: Update Compression enum docs + +### 7.3 Usage Guide + +**Document**: `docs/gdeflate_usage.md` + +**Contents**: +- How to compress assets for ds-runtime +- Choosing CPU vs GPU decompression +- Performance tuning recommendations +- Troubleshooting guide + +--- + +## 8. 
Risks and Mitigations + +### 8.1 Technical Risks + +| Risk | Severity | Mitigation | +|------|----------|------------| +| Format spec unavailable | High | Reverse engineer from samples, collaborate with Wine community | +| GPU decode too slow | Medium | Optimize shader, fallback to CPU | +| Memory overhead too high | Medium | Streaming decompression, memory pools | +| Compatibility issues | Medium | Comprehensive testing, multiple test assets | +| Patent/licensing concerns | Low | Use open compression algorithms, document format | + +### 8.2 Timeline Risks + +| Risk | Impact | Mitigation | +|------|--------|------------| +| Format research takes longer than expected | +2-4 weeks | Start with CPU implementation, add GPU later | +| GPU optimization difficult | +2-3 weeks | Accept lower performance initially, iterate | +| Integration issues with existing code | +1 week | Incremental integration, thorough testing | + +--- + +## 9. Timeline and Milestones + +### 9.1 Research Phase (2-3 weeks) +- Week 1: Format investigation and documentation +- Week 2: Library evaluation and prototyping +- Week 3: Design review and planning finalization + +**Milestone**: Format specification complete + +### 9.2 CPU Implementation (3-4 weeks) +- Week 4: Header parser implementation +- Week 5: DEFLATE integration and testing +- Week 6: Backend integration +- Week 7: Testing and optimization + +**Milestone**: CPU GDeflate decompression working + +### 9.3 GPU Implementation (4-6 weeks) +- Week 8-9: Compute shader development +- Week 10-11: GPU backend integration +- Week 12-13: Testing and optimization + +**Milestone**: GPU GDeflate decompression working + +### 9.4 Total Estimate +**9-13 weeks** for complete CPU + GPU implementation + +**Fast Track Option** (CPU only): **5-7 weeks** + +--- + +## 10. 
Success Criteria + +### 10.1 Functional Requirements +- โœ… CPU decoder successfully decompresses GDeflate assets +- โœ… GPU decoder works on Vulkan-supported hardware +- โœ… Error handling for corrupted/invalid data +- โœ… All existing tests still pass +- โœ… New GDeflate tests pass (100% coverage) + +### 10.2 Performance Requirements +- โœ… CPU: Decompression throughput โ‰ฅ 500 MB/s (uncompressed equivalent) +- โœ… GPU: Decompression throughput โ‰ฅ 2 GB/s (for large files) +- โœ… GPU overhead < 10% for file โ‰ฅ 1MB +- โœ… CPU fallback for small files performs acceptably + +### 10.3 Quality Requirements +- โœ… No memory leaks (valgrind clean) +- โœ… Thread-safe (no data races) +- โœ… Vulkan validation layers pass (no errors) +- โœ… Documentation complete and accurate +- โœ… API stability maintained (no breaking changes) + +--- + +## 11. Next Steps + +### Immediate Actions (This Week) +1. โœ… Complete this investigation document +2. โฉ Research GDeflate format (begin literature search) +3. โฉ Evaluate DEFLATE libraries (zlib vs miniz vs custom) +4. โฉ Create prototype test assets + +### Short Term (Next 2 Weeks) +1. โฉ Finalize format specification +2. โฉ Begin header parser implementation +3. โฉ Set up testing infrastructure +4. โฉ Create CPU implementation plan + +### Medium Term (1-2 Months) +1. โฉ Complete CPU implementation +2. โฉ Integrate with existing backend +3. โฉ Begin GPU shader development +4. โฉ Performance benchmarking + +--- + +## 12. Open Questions + +1. **Format Availability**: Is GDeflate format publicly documented? If not, can we reverse engineer it legally? +2. **Patent Concerns**: Are there any patents covering GDeflate that we need to be aware of? +3. **Library Choice**: Which DEFLATE library best fits our needs (zlib, miniz, custom)? +4. **GPU Priority**: Should we implement CPU first and GPU later, or develop in parallel? +5. **Compression Tool**: Do we need to provide a GDeflate compression tool, or only decompression? +6. 
**Block Size**: What are optimal block sizes for CPU vs GPU decompression? +7. **Streaming**: Do we need streaming decompression, or block-at-a-time is sufficient? +8. **Validation**: How do we validate correctness without official test vectors? + +--- + +## 13. References and Resources + +### Documentation +- Microsoft DirectStorage documentation +- Wine DirectStorage implementation (if available) +- DEFLATE RFC 1951 +- GPU compression research papers + +### Libraries +- zlib: https://www.zlib.net/ +- miniz: https://github.com/richgel999/miniz +- Vulkan GPU compute examples + +### Community +- Wine development mailing list +- Proton GitHub discussions +- Graphics programming forums + +--- + +**Document Status**: Draft v1.0 +**Last Updated**: 2026-02-16 +**Next Review**: After format specification research complete diff --git a/docs/investigation_io_uring.md b/docs/investigation_io_uring.md new file mode 100644 index 0000000..4285b6b --- /dev/null +++ b/docs/investigation_io_uring.md @@ -0,0 +1,788 @@ +# io_uring Backend Investigation and Enhancement Plan + +**Status:** Planning Phase +**Priority:** Medium +**Target:** Production-ready io_uring backend with full feature support +**Dependencies:** liburing library + +--- + +## Executive Summary + +The ds-runtime project includes an experimental io_uring backend that provides kernel-level asynchronous I/O. This document outlines the investigation, design, and implementation plan for enhancing the io_uring backend to production quality. + +--- + +## 1. 
Current State + +### 1.1 What Exists + +**Implementation**: `src/ds_runtime_uring.cpp` + +**Current Capabilities**: +- โœ… `io_uring_queue_init()` with 256 queue entries +- โœ… SQE (Submission Queue Entry) submission +- โœ… CQE (Completion Queue Entry) handling +- โœ… Single worker thread +- โœ… Batch submission (`io_uring_submit`) +- โœ… Completion polling (`io_uring_wait_cqe`) +- โœ… Read operations (`io_uring_prep_read`) +- โœ… Write operations (`io_uring_prep_write`) +- โœ… Host memory support + +**Configuration**: +```cpp +struct IoUringBackendConfig { + uint32_t queue_entries = 256; + uint32_t worker_count = 1; // Currently unused! +}; +``` + +### 1.2 What's Missing + +**Incomplete Features**: +- โŒ **Multi-worker support**: `worker_count` field exists but ignored +- โŒ **GPU memory**: Explicitly rejected with `EINVAL` (host-only) +- โŒ **Decompression**: No compression/decompression handling +- โŒ **Advanced features**: No linked ops, timeouts, or fixed files +- โŒ **Error recovery**: Limited error handling +- โŒ **Performance tuning**: No SQPOLL or IOPOLL modes + +**Known Limitations**: +- Host memory only (GPU buffers not supported by io_uring design) +- Single-threaded (no parallelism despite worker_count field) +- Basic error reporting +- No request prioritization + +### 1.3 Build Status + +**Dependency**: liburing (optional) +- Not built by default (requires `-DDS_RUNTIME_HAS_IO_URING`) +- CMake checks for liburing via pkg-config +- Test suite requires liburing to run + +**Current Build**: +```bash +# In sandbox (liburing not found): +cmake .. # io_uring backend NOT built +``` + +--- + +## 2. io_uring Background + +### 2.1 What is io_uring? + +**io_uring** is a Linux kernel asynchronous I/O interface introduced in Linux 5.1. 
+ +**Key Benefits**: +- **Zero-copy**: Shared memory between kernel and userspace +- **Batching**: Submit multiple operations at once +- **Low overhead**: Minimal system call overhead +- **Flexible**: Supports many operation types (read, write, fsync, etc.) +- **Modern**: Designed for modern SSD and NVMe performance + +**Architecture**: +``` +Application + โ†“ (prepare SQEs) +Submission Queue (SQ) - shared memory ring + โ†“ (submit) +Kernel (process I/O asynchronously) + โ†“ (complete) +Completion Queue (CQ) - shared memory ring + โ†“ (reap CQEs) +Application (handle completions) +``` + +### 2.2 io_uring vs Traditional I/O + +| Feature | io_uring | pread/pwrite | POSIX AIO | +|---------|----------|--------------|-----------| +| **Overhead** | Very Low | High (syscall per op) | Medium | +| **Batching** | โœ… Yes | โŒ No | โš ๏ธ Limited | +| **Zero-copy** | โœ… Yes | โŒ No | โŒ No | +| **Flexibility** | โœ… High | โš ๏ธ Limited | โš ๏ธ Limited | +| **Kernel support** | โœ… 5.1+ | โœ… All | โš ๏ธ Poor | + +**Verdict**: io_uring is superior for high-performance async I/O on modern Linux + +### 2.3 DirectStorage Relevance + +**Alignment with DirectStorage**: +- โœ… Batched I/O submission (queue-based model) +- โœ… Asynchronous completion (callback-driven) +- โœ… Low CPU overhead (kernel handles scheduling) +- โœ… High throughput (optimized for NVMe SSDs) + +**Differences**: +- โŒ No GPU memory support (host-only by design) +- โŒ No built-in decompression (must be done in userspace) +- โœ… Linux-native (no Windows compatibility layer) + +--- + +## 3. 
Enhancement Plan + +### 3.1 Phase 1: Multi-Worker Architecture + +**Goal**: Honor `worker_count` configuration for parallel I/O + +#### Current Architecture + +``` +Single Worker Thread + โ†“ +io_uring instance (256 entries) + โ†“ +Kernel I/O processing +``` + +#### Proposed Architecture + +``` +Worker Thread 1 Worker Thread N + โ†“ โ†“ +io_uring instance 1 io_uring instance N + โ†“ โ†“ +Kernel I/O processing (parallel) +``` + +#### Design Decisions + +**Option 1: Multiple io_uring Instances** (Recommended) +- Each worker has own io_uring +- Load balance requests round-robin +- Independent polling threads +- Pros: True parallelism, simple synchronization +- Cons: More kernel resources + +**Option 2: Shared io_uring with Thread Pool** +- Single io_uring, multiple polling threads +- Workers compete for CQEs +- Pros: Fewer kernel resources +- Cons: Contention, complex synchronization + +**Recommendation**: Option 1 (multiple instances) + +#### Implementation + +**Data Structure**: +```cpp +struct IoUringWorker { + std::thread thread; + io_uring ring; + std::atomic<bool> running; + std::queue<Request*> pending; + std::mutex pending_mutex; + std::condition_variable pending_cv; +}; + +class IoUringBackend::Impl { + std::vector<IoUringWorker> workers_; + std::atomic<uint32_t> next_worker_; // Round-robin counter +}; +``` + +**Worker Thread**: +```cpp +void worker_loop(IoUringWorker& worker) { + while (worker.running) { + // 1. Submit pending requests + { + std::unique_lock lock(worker.pending_mutex); + while (!worker.pending.empty()) { + Request* req = worker.pending.front(); + worker.pending.pop(); + + // Prepare SQE + io_uring_sqe* sqe = io_uring_get_sqe(&worker.ring); + if (req->op == RequestOp::Read) { + io_uring_prep_read(sqe, req->fd, req->dst, + req->size, req->offset); + } else { + io_uring_prep_write(sqe, req->fd, req->src, + req->size, req->offset); + } + io_uring_sqe_set_data(sqe, req); + } + } + + io_uring_submit(&worker.ring); + + // 2. 
Wait for completions + io_uring_cqe* cqe; + int ret = io_uring_wait_cqe_timeout(&worker.ring, &cqe, + &timeout); + if (ret == 0) { + // Process completion + Request* req = static_cast<Request*>( + io_uring_cqe_get_data(cqe) + ); + req->status = (cqe->res < 0) ? + RequestStatus::Failed : RequestStatus::Complete; + req->bytes_transferred = (cqe->res > 0) ? cqe->res : 0; + + // Invoke callback + req->callback(req); + + io_uring_cqe_seen(&worker.ring, cqe); + } + } +} +``` + +**Load Balancing**: +```cpp +void IoUringBackend::submit_request(Request& req) { + // Round-robin worker selection + uint32_t worker_idx = next_worker_.fetch_add(1) % workers_.size(); + IoUringWorker& worker = workers_[worker_idx]; + + { + std::lock_guard lock(worker.pending_mutex); + worker.pending.push(&req); + } + worker.pending_cv.notify_one(); +} +``` + +**Testing**: +- Submit requests to multiple workers +- Verify parallel execution +- Check completion order independence +- Measure throughput scaling with worker count + +--- + +### 3.2 Phase 2: Advanced io_uring Features + +**Goal**: Leverage advanced io_uring capabilities for performance + +#### Feature 1: SQPOLL Mode + +**Description**: Kernel-side submission queue polling (eliminates submit syscall) + +**Configuration**: +```cpp +struct io_uring_params params = {}; +params.flags = IORING_SETUP_SQPOLL; +params.sq_thread_idle = 1000; // 1 second idle before sleep + +io_uring_queue_init_params(256, &ring, &params); +``` + +**Benefits**: +- No `io_uring_submit()` syscall needed +- Lower latency for high-frequency submissions +- Kernel thread handles polling + +**Tradeoffs**: +- Extra kernel thread (CPU overhead when idle) +- Requires CAP_SYS_NICE or io_uring_register_iowq_max_workers + +**Use Case**: High-throughput streaming with many small I/Os + +#### Feature 2: IOPOLL Mode + +**Description**: Kernel polls completion directly from device (bypass interrupts) + +**Configuration**: +```cpp +params.flags = IORING_SETUP_IOPOLL; +``` + +**Benefits**: +- Lower 
latency on fast NVMe devices +- Reduced interrupt overhead + +**Requirements**: +- O_DIRECT file I/O +- Polling-capable storage device + +**Use Case**: Ultra-low-latency I/O on NVMe SSDs + +#### Feature 3: Fixed Files + +**Description**: Register file descriptors to avoid fd lookup overhead + +**API**: +```cpp +// Register FDs once +int fds[MAX_FILES]; +io_uring_register_files(&ring, fds, MAX_FILES); + +// Use registered FD index in SQEs +io_uring_prep_read(sqe, fd_index, buf, size, offset); +sqe->flags |= IOSQE_FIXED_FILE; +``` + +**Benefits**: +- Eliminates fd table lookup +- ~10-15% latency reduction + +**Use Case**: Repeatedly accessing same set of files + +#### Feature 4: Linked Operations + +**Description**: Chain dependent operations (e.g., read โ†’ decompress โ†’ write) + +**API**: +```cpp +// Read operation +io_uring_prep_read(sqe1, fd, buf, size, offset); +sqe1->flags |= IOSQE_IO_LINK; + +// Write operation (only if read succeeds) +io_uring_prep_write(sqe2, fd_out, buf, size, 0); +``` + +**Benefits**: +- Atomic operation sequences +- Reduced round-trips + +**Use Case**: Complex I/O workflows (read-modify-write) + +#### Implementation Priority + +1. **Fixed Files** (High): Easy to implement, measurable benefit +2. **SQPOLL** (Medium): Good for high throughput, adds complexity +3. **Linked Ops** (Low): Complex, requires workflow redesign +4. 
**IOPOLL** (Low): Requires O_DIRECT, hardware-specific + +--- + +### 3.3 Phase 3: Compression Integration + +**Goal**: Add decompression support to io_uring backend + +#### Design Challenge + +**Problem**: io_uring is host-only, decompression needs CPU/GPU + +**Solutions**: + +**Option 1: Hybrid Approach** (Recommended) +``` +io_uring (read compressed data) + โ†“ +CPU decompression (in worker thread) + โ†“ +Completion callback +``` + +**Option 2: Separate Decompression Queue** +``` +io_uring (read compressed data) + โ†“ +Enqueue to decompression thread pool + โ†“ (parallel decompression) +Completion callback +``` + +**Option 3: Reject Compression** +``` +if (req.compression != Compression::None) { + return error(EINVAL, "io_uring backend does not support compression"); +} +``` + +**Recommendation**: Option 1 for CPU compression, Option 3 for GPU (hand off to Vulkan backend) + +#### Implementation (Option 1) + +```cpp +void worker_loop(IoUringWorker& worker) { + // After io_uring read completion + if (cqe->res > 0) { + Request* req = static_cast<Request*>( + io_uring_cqe_get_data(cqe) + ); + + // Decompress if needed + if (req->compression != Compression::None) { + bool success = decompress(req); + if (!success) { + req->status = RequestStatus::Failed; + req->errno_value = EIO; + } + } + if (req->status != RequestStatus::Failed) + req->status = RequestStatus::Complete; + req->callback(req); + } +} + +bool decompress(Request* req) { + switch (req->compression) { + case Compression::FakeUppercase: + return fake_uppercase_transform(req->dst, req->size); + case Compression::GDeflate: + return gdeflate_decompress(req->dst, req->size); + default: + return true; // No compression + } +} +``` + +**Testing**: +- Read compressed file via io_uring +- Verify decompression occurs +- Check performance vs CPU backend +- Ensure no blocking in completion path + +--- + +### 3.4 Phase 4: Error Handling and Resilience + +**Goal**: Robust error handling for production use + +#### Error Scenarios + +1. 
**EAGAIN** (queue full): Retry or backpressure +2. **EINTR** (interrupted): Retry operation +3. **EIO** (device error): Report to application +4. **EBADF** (bad fd): Validate before submission +5. **Ring setup failure**: Fallback to CPU backend + +#### Enhanced Error Handling + +```cpp +void handle_cqe_error(Request* req, io_uring_cqe* cqe) { + int err = -cqe->res; + + switch (err) { + case EAGAIN: + // Retry operation + resubmit_request(req); + break; + + case EINTR: + // Interrupted, retry + resubmit_request(req); + break; + + case EIO: + case EBADF: + case EFAULT: + // Fatal error, report to application + req->status = RequestStatus::Failed; + req->errno_value = err; + report_request_error("io_uring", "completion", err, + strerror(err), *req); + req->callback(req); + break; + + default: + // Unknown error + req->status = RequestStatus::Failed; + req->errno_value = err; + report_request_error("io_uring", "completion", err, + "Unknown io_uring error", *req); + req->callback(req); + break; + } +} +``` + +**Retry Logic**: +```cpp +void resubmit_request(Request* req) { + if (req->retry_count++ >= MAX_RETRIES) { + req->status = RequestStatus::Failed; + req->errno_value = ETIMEDOUT; + req->callback(req); + return; + } + + // Exponential backoff + std::this_thread::sleep_for( + std::chrono::milliseconds(1 << req->retry_count) + ); + + // Re-enqueue + submit_request(*req); +} +``` + +--- + +### 3.5 Phase 5: Performance Tuning + +**Goal**: Optimize io_uring backend for maximum throughput + +#### Tuning Parameters + +**Queue Depth**: +```cpp +// Current: 256 entries +// Consider: Configurable based on workload +// - Small files: 128-256 +// - Large files: 512-1024 +// - Streaming: 2048+ +``` + +**Batch Size**: +```cpp +// Submit multiple SQEs at once +// Amortizes syscall overhead +uint32_t batch_size = 16; // Tune based on request rate +``` + +**Worker Count**: +```cpp +// Heuristic: Number of CPU cores or storage devices +uint32_t optimal_workers = std::min( + 
std::thread::hardware_concurrency(), + num_storage_devices +); +``` + +**Polling Interval**: +```cpp +// Balance latency vs CPU usage +struct __kernel_timespec timeout = { + .tv_sec = 0, + .tv_nsec = 1000000 // 1ms (tune based on workload) +}; +``` + +#### Benchmarking + +**Metrics to Track**: +- Throughput (MB/s, IOPS) +- Latency (p50, p95, p99) +- CPU utilization +- Queue depth utilization +- Syscall count (should be minimal with batching) + +**Test Scenarios**: +- Sequential reads (large files) +- Random reads (small files) +- Mixed read/write +- Concurrent requests (stress test) + +**Comparison**: +- io_uring vs CPU backend (pread/pwrite) +- Single worker vs multi-worker +- SQPOLL vs non-SQPOLL + +--- + +## 4. GPU Memory Limitation + +### 4.1 Why GPU Buffers Don't Work + +**Fundamental Limitation**: io_uring operates on host virtual memory +- Kernel I/O subsystem writes to host memory only +- GPU memory (VRAM) is not directly accessible to kernel I/O +- DMA transfers require device-specific drivers + +**Workarounds** (Not Implemented): +1. **GPU memory mapping**: Expose GPU memory to host address space (driver-dependent, slow) +2. **Staging buffers**: io_uring โ†’ host buffer โ†’ GPU copy (defeats purpose) +3. **GPU Direct Storage**: Requires specialized hardware/drivers (NVIDIA GPUDirect Storage) + +### 4.2 Recommended Approach + +**Strategy**: Reject GPU memory requests, hand off to Vulkan backend + +```cpp +void IoUringBackend::submit_request(Request& req) { + // Check for GPU memory + if (req.dst_memory == RequestMemory::Gpu || + req.src_memory == RequestMemory::Gpu) { + report_request_error("io_uring", "submit", EINVAL, + "io_uring backend does not support GPU memory (use Vulkan backend)", + req); + req.status = RequestStatus::Failed; + req.errno_value = EINVAL; + req.callback(&req); + return; + } + + // Proceed with host memory I/O + // ... +} +``` + +**Documentation**: Clearly state io_uring is host-only backend + +--- + +## 5. 
Testing Strategy + +### 5.1 Unit Tests + +**Test Suite**: `tests/io_uring_backend_test.cpp` (already exists) + +**Additional Test Cases**: +1. **Multi-worker**: Submit to multiple workers, verify parallelism +2. **High load**: 1000+ concurrent requests +3. **Error injection**: Simulate EAGAIN, EIO, EINTR +4. **Compression**: Read compressed files, verify decompression +5. **Batching**: Submit batches, measure throughput +6. **Cancellation**: Cancel in-flight requests (if supported) + +### 5.2 Integration Tests + +**Scenarios**: +1. **Asset streaming demo**: Use io_uring backend +2. **Mixed backends**: io_uring for host, Vulkan for GPU +3. **Stress test**: Sustained high I/O rate +4. **Error recovery**: Handle disk full, permission denied + +### 5.3 Performance Tests + +**Benchmarks**: +- Throughput vs CPU backend +- Scalability with worker count +- Latency distribution +- CPU overhead + +**Test Assets**: +- 1MB, 10MB, 100MB files +- Sequential vs random access +- Compressed vs uncompressed + +--- + +## 6. Dependencies + +### 6.1 Build System + +**liburing Detection**: +```cmake +find_package(PkgConfig QUIET) +if (PkgConfig_FOUND) + pkg_check_modules(LIBURING liburing) +endif() + +if (LIBURING_FOUND) + list(APPEND DS_RUNTIME_SOURCES src/ds_runtime_uring.cpp) + target_link_libraries(ds_runtime PUBLIC ${LIBURING_LIBRARIES}) + target_include_directories(ds_runtime PUBLIC ${LIBURING_INCLUDE_DIRS}) + target_compile_definitions(ds_runtime PUBLIC DS_RUNTIME_HAS_IO_URING) +endif() +``` + +**Installation** (Arch Linux, CachyOS): +```bash +sudo pacman -S liburing +``` + +### 6.2 Runtime Requirements + +**Kernel Version**: Linux 5.1+ (5.10+ recommended for stability) +**Capabilities**: None required for basic use, CAP_SYS_NICE for SQPOLL + +--- + +## 7. 
Timeline and Milestones + +### 7.1 Implementation Phases + +**Week 1-2: Multi-Worker** +- Implement worker pool +- Load balancing +- Testing +- **Milestone**: Multi-worker backend functional + +**Week 3: Advanced Features** +- Fixed files support +- SQPOLL mode (optional) +- Testing +- **Milestone**: Advanced features working + +**Week 4: Compression** +- Integrate decompression +- Testing +- **Milestone**: Compression support + +**Week 5-6: Error Handling & Tuning** +- Robust error handling +- Performance tuning +- Benchmarking +- **Milestone**: Production-ready backend + +### 7.2 Total Estimate + +**6 weeks** for complete io_uring backend enhancement + +--- + +## 8. Success Criteria + +### 8.1 Functional Requirements +- โœ… Multi-worker support working +- โœ… All request types supported (read, write) +- โœ… Compression/decompression integrated +- โœ… Error handling robust +- โœ… Existing tests pass +- โœ… New tests pass (100% coverage) + +### 8.2 Performance Requirements +- โœ… Throughput โ‰ฅ 2x CPU backend (large files) +- โœ… Latency โ‰ค 0.5x CPU backend +- โœ… CPU overhead โ‰ค 20% of CPU backend +- โœ… Scales linearly with worker count (up to core count) + +### 8.3 Quality Requirements +- โœ… No memory leaks +- โœ… Thread-safe +- โœ… Graceful degradation on error +- โœ… Documentation complete +- โœ… API stability maintained + +--- + +## 9. Next Steps + +### Immediate (This Week) +1. โœ… Complete investigation document +2. โฉ Install liburing in development environment +3. โฉ Examine current implementation +4. โฉ Design multi-worker architecture + +### Short Term (Next 2 Weeks) +1. โฉ Implement multi-worker support +2. โฉ Test parallelism and load balancing +3. โฉ Benchmark vs CPU backend + +### Medium Term (1-2 Months) +1. โฉ Add advanced features +2. โฉ Integrate compression +3. โฉ Performance tuning +4. โฉ Production-ready release + +--- + +## 10. Open Questions + +1. **Worker Count Default**: What's optimal default for `worker_count`? +2. 
**SQPOLL**: Is SQPOLL worth the complexity for our use case? +3. **Fixed Files**: Should we auto-register frequently accessed files? +4. **GPU Fallback**: Should we automatically hand off GPU requests to Vulkan backend? +5. **Compression**: CPU-only or thread pool for decompression? +6. **Batching**: What's optimal batch size for different workloads? + +--- + +## 11. References + +### Documentation +- liburing documentation: https://github.com/axboe/liburing +- io_uring manpages: `man io_uring` +- Kernel documentation: Documentation/io_uring.txt +- Efficient IO with io_uring (Jens Axboe) + +### Performance +- io_uring performance analysis +- NVMe optimization guides +- Linux I/O stack deep dive + +--- + +**Document Status**: Draft v1.0 +**Last Updated**: 2026-02-16 +**Next Review**: After liburing installation and multi-worker implementation diff --git a/docs/investigation_remaining_features.md b/docs/investigation_remaining_features.md new file mode 100644 index 0000000..4c9e76d --- /dev/null +++ b/docs/investigation_remaining_features.md @@ -0,0 +1,756 @@ +# Request Cancellation, GPU-Resident Workflows, and Wine/Proton Integration + +**Status:** Planning Phase +**Priority:** Medium to High +**Target:** Complete feature set for production DirectStorage-style runtime + +--- + +## Part 1: Request Cancellation + +### 1.1 Current State + +**What Exists**: +- `RequestStatus` enum with values: `Pending`, `InProgress`, `Complete`, `Failed` +- No `Cancelled` status +- No `cancel()` method on Queue +- No cancellation support in backends + +**What's Missing**: +- โŒ `RequestStatus::Cancelled` enum value +- โŒ `Queue::cancel_request(request_id)` method +- โŒ In-flight request tracking for cancellation +- โŒ Backend cancellation hooks +- โŒ Race condition handling (completion vs cancellation) + +### 1.2 Design Requirements + +**Use Cases**: +1. **Timeout**: Cancel requests that take too long +2. **User Action**: User cancels loading operation +3. 
**Priority Change**: Cancel low-priority work to start high-priority +4. **Shutdown**: Cancel all in-flight requests on cleanup + +**Semantics**: +```cpp +// Strong guarantee: Request will not complete after cancel +bool cancel(request_id); + +// Weak guarantee: Request may complete, but won't invoke callback +bool try_cancel(request_id); +``` + +### 1.3 Implementation Plan + +#### Phase 1: API Design + +**Add to Request**: +```cpp +struct Request { + // Existing fields... + std::atomic<bool> cancellation_requested = false; + request_id_t id = 0; // Unique ID for tracking +}; +``` + +**Add to Queue**: +```cpp +class Queue { +public: + // Cancel specific request (returns true if cancelled before completion) + bool cancel_request(request_id_t id); + + // Cancel all pending requests (not yet submitted) + size_t cancel_all_pending(); + + // Cancel all requests (including in-flight) + size_t cancel_all(); +}; +``` + +**Add Status**: +```cpp +enum class RequestStatus { + Pending, + InProgress, + Complete, + Failed, + Cancelled // NEW +}; +``` + +#### Phase 2: Queue Implementation + +**Request Tracking**: +```cpp +class Queue::Impl { + std::unordered_map<request_id_t, Request*> active_requests_; + std::mutex active_mutex_; + std::atomic<request_id_t> next_id_{1}; +}; + +void Queue::enqueue(Request& req) { + req.id = impl_->next_id_.fetch_add(1); + { + std::lock_guard lock(impl_->active_mutex_); + impl_->active_requests_[req.id] = &req; + } + // ... 
existing enqueue logic +} +``` + +**Cancellation**: +```cpp +bool Queue::cancel_request(request_id_t id) { + std::lock_guard lock(impl_->active_mutex_); + + auto it = impl_->active_requests_.find(id); + if (it == impl_->active_requests_.end()) { + return false; // Already completed or never existed + } + + Request* req = it->second; + + // Mark as cancellation requested + req->cancellation_requested.store(true, std::memory_order_release); + + // If still pending (not submitted), remove immediately + if (req->status == RequestStatus::Pending) { + req->status = RequestStatus::Cancelled; + impl_->active_requests_.erase(it); + return true; + } + + // If in-flight, backend must handle cancellation + // Return false to indicate "in progress, might complete" + return false; +} +``` + +#### Phase 3: Backend Support + +**CPU Backend**: +```cpp +void CpuBackend::process_request(Request& req) { + // Check cancellation before I/O + if (req.cancellation_requested.load(std::memory_order_acquire)) { + req.status = RequestStatus::Cancelled; + req.callback(&req); + return; + } + + // Perform I/O + ssize_t bytes = pread(req.fd, req.dst, req.size, req.offset); + + // Check cancellation after I/O (before callback) + if (req.cancellation_requested.load(std::memory_order_acquire)) { + req.status = RequestStatus::Cancelled; + req.callback(&req); + return; + } + + // Normal completion + req.status = RequestStatus::Complete; + req.callback(&req); +} +``` + +**Vulkan Backend**: +```cpp +// Harder to cancel GPU work in progress +// Strategy: Don't invoke callback if cancelled +void VulkanBackend::complete_request(Request& req) { + if (req.cancellation_requested.load(std::memory_order_acquire)) { + req.status = RequestStatus::Cancelled; + } + + req.callback(&req); +} +``` + +**io_uring Backend**: +```cpp +// Can cancel SQE before submission +bool IoUringBackend::cancel_sqe(Request& req) { + // Remove from pending queue if not yet submitted + std::lock_guard lock(pending_mutex_); + auto it = 
std::find(pending_.begin(), pending_.end(), &req); + if (it != pending_.end()) { + pending_.erase(it); + req.status = RequestStatus::Cancelled; + return true; + } + return false; // Already submitted +} +``` + +### 1.4 Testing + +**Test Cases**: +1. Cancel pending request (before submit) +2. Cancel in-flight request (during I/O) +3. Cancel completed request (should fail) +4. Cancel non-existent request (should fail) +5. Race: cancel vs completion +6. Cancel all requests +7. Callback not invoked for cancelled request + +**Test File**: `tests/cancellation_test.cpp` + +### 1.5 Timeline + +**2-3 weeks** for complete cancellation support + +--- + +## Part 2: GPU-Resident Workflows + +### 2.1 Motivation + +**Goal**: Zero-copy disk โ†’ GPU data path + +**Traditional Path** (current): +``` +Disk โ†’ Host Staging Buffer โ†’ GPU Buffer + [copy 1] [copy 2] +``` + +**GPU-Resident Path** (target): +``` +Disk โ†’ GPU Buffer (direct) + [copy 1 only] +``` + +### 2.2 DirectStorage GPU Upload Heap + +**Microsoft DirectStorage Concept**: +- GPU upload heap: CPU-visible, GPU-accessible memory +- Direct writes from storage controller to GPU memory +- Requires hardware support (PCIe peer-to-peer, GPU Direct Storage) + +**Linux Equivalent**: +- **NVIDIA GPUDirect Storage**: Kernel driver enables direct NVMe โ†’ GPU transfers +- **AMD equivalent**: DirectGMA (less documented) +- **Standard Vulkan**: No direct disk โ†’ GPU (must use staging) + +### 2.3 Implementation Strategies + +#### Strategy 1: Vulkan External Memory (Current) + +**Approach**: Staging buffer + GPU copy (already implemented) + +**Pros**: +- Works on all Vulkan hardware +- Portable across vendors +- Already implemented + +**Cons**: +- Extra copy (staging โ†’ GPU) +- Higher latency +- More memory usage + +#### Strategy 2: GPU Direct Storage Integration + +**Approach**: Integrate with vendor-specific APIs + +**NVIDIA GPUDirect Storage**: +```cpp +// Open file with GDS flags +int fd = open(path, O_RDONLY | O_DIRECT); + +// 
Register GPU buffer with GDS +cuFileDriverOpen(); +CUfileHandle_t handle; +cuFileHandleRegister(&handle, &cufile_desc); + +// Direct read to GPU memory +cuFileRead(handle, gpu_buffer, size, offset, 0); +``` + +**Pros**: +- Zero extra copies +- Lowest latency +- Highest throughput + +**Cons**: +- NVIDIA-only (no AMD/Intel equivalent) +- Requires special driver setup +- O_DIRECT alignment requirements +- Complex integration + +#### Strategy 3: Memory-Mapped Files + GPU Upload + +**Approach**: mmap file, map to GPU upload heap + +**Implementation**: +```cpp +// Map file to host memory +void* mapped = mmap(nullptr, file_size, PROT_READ, + MAP_SHARED, fd, 0); + +// Allocate GPU upload heap (CPU-visible, GPU-accessible) +VkBuffer upload_buffer = create_upload_buffer(device); +void* gpu_mapped = map_buffer(upload_buffer); + +// Copy file data to upload heap +memcpy(gpu_mapped, mapped, file_size); + +// Unmap +munmap(mapped, file_size); +unmap_buffer(upload_buffer); + +// Use upload buffer directly in GPU (no staging copy needed) +``` + +**Pros**: +- Simpler than GDS +- Works across vendors +- Reduces staging buffer usage + +**Cons**: +- Still one copy (mmap โ†’ GPU) +- Page cache overhead +- Not true "direct to GPU" + +### 2.4 Recommended Approach + +**Phase 1: Optimize Current Path** +- Reuse staging buffers (pool) +- Async staging โ†’ GPU copy (don't wait) +- Batch multiple requests + +**Phase 2: Vendor-Specific Paths** (Optional) +- Add GDS backend for NVIDIA +- Conditional compilation (#ifdef NVIDIA_GDS) +- Fallback to standard path + +**Phase 3: Future Hardware** +- Wait for standardized GPU Direct Storage in Vulkan +- Integrate when available + +### 2.5 GPU-to-GPU Transfers + +**Use Case**: Texture decompression GPU โ†’ GPU + +**Current Path**: +``` +Disk โ†’ Staging โ†’ GPU Compressed Buffer โ†’ GPU Decompressed Buffer + [compute shader] +``` + +**Optimization**: +``` +Disk โ†’ GPU Compressed Buffer โ†’ GPU Decompressed Buffer + [single command buffer] +``` + 
+**Implementation**: Already supported via Vulkan backend + compute pipelines + +### 2.6 Testing + +**Benchmarks**: +- Staging vs direct (if GDS available) +- Throughput (MB/s) +- Latency (ms) +- CPU overhead (%) + +**Validation**: +- Data integrity (checksums) +- Memory usage +- GPU utilization + +### 2.7 Timeline + +**Phase 1 (Optimization)**: 2 weeks +**Phase 2 (GDS Integration)**: 4-6 weeks (if needed) + +--- + +## Part 3: Wine/Proton Integration + +### 3.1 Architecture Overview + +**Goal**: Enable Windows DirectStorage games to run on Linux via Proton + +**Strategy**: +``` +Windows Game (DirectStorage API) + โ†“ +Wine/Proton dstorage.dll Shim + โ†“ (translate calls) +ds-runtime (Linux native) + โ†“ (execute) +Linux Kernel (io_uring, Vulkan) +``` + +### 3.2 Integration Approaches + +#### Approach 1: PE DLL Shim (Recommended) + +**Architecture**: +``` +dstorage.dll (PE) - Windows ABI + โ†“ dlopen +libds_runtime.so - Linux ABI +``` + +**Implementation**: +1. Create `dstorage.dll` (Wine builtin DLL) +2. Implement DirectStorage API entry points +3. Forward to `libds_runtime.so` via C ABI +4. Translate types (HANDLE โ†’ fd, etc.) + +**Example**: +```cpp +// dstorage.dll (Wine) +HRESULT WINAPI DStorageCreateQueue( + const DSTORAGE_QUEUE_DESC* desc, + REFIID riid, + void** ppv +) { + // Load libds_runtime.so + void* handle = dlopen("libds_runtime.so", RTLD_NOW); + + // Get C API functions + auto ds_create_queue = (ds_queue_t* (*)(ds_backend_t*)) + dlsym(handle, "ds_create_queue"); + + // Create backend + ds_backend_t* backend = ds_make_cpu_backend(); + + // Create queue + ds_queue_t* queue = ds_create_queue(backend); + + // Wrap in COM object + *ppv = new DStorageQueueImpl(queue); + return S_OK; +} +``` + +#### Approach 2: Direct Integration (No Shim) + +**Architecture**: +``` +Wine/Proton DirectStorage Implementation + โ†“ (link directly) +libds_runtime_static.a +``` + +**Implementation**: +1. Build ds-runtime as static library +2. 
Link into Wine dlls/dstorage build +3. Call C++ API directly (no PE/ELF bridge) +4. Share Vulkan device with vkd3d-proton + +**Pros**: +- No dlopen overhead +- Simpler debugging +- Shared Vulkan context + +**Cons**: +- Tighter coupling +- Requires Wine build modifications + +#### Approach 3: Kernel Module (Experimental) + +**Architecture**: +``` +DirectStorage Requests + โ†“ +ioctl to kernel module + โ†“ +Kernel-side I/O handling +``` + +**Not Recommended**: Too complex, overkill for userspace I/O + +### 3.3 Type Mapping + +**Windows โ†’ Linux Translation**: + +| Windows Type | Linux Type | Conversion | +|--------------|------------|------------| +| `HANDLE` | `int` | `fd = _open_osfhandle(handle)` | +| `DSTORAGE_REQUEST` | `ds_request` | Struct field mapping | +| `ID3D12Resource*` | `VkBuffer` | vkd3d-proton interop | +| `DSTORAGE_COMPRESSION` | `ds_compression_t` | Enum mapping | +| `OVERLAPPED` | Completion callback | Async model | + +**Example Struct Mapping**: +```cpp +void translate_request( + const DSTORAGE_REQUEST_DESC* windows_req, + ds_request* linux_req +) { + linux_req->fd = get_fd_from_handle(windows_req->Source.File.Handle); + linux_req->offset = windows_req->Source.File.Offset; + linux_req->size = windows_req->Source.File.Size; + linux_req->dst = get_buffer_pointer(windows_req->Destination); + linux_req->op = (windows_req->DestinationType == DSTORAGE_REQUEST_DESTINATION_MEMORY) + ? 
DS_REQUEST_OP_READ : DS_REQUEST_OP_WRITE; + linux_req->compression = translate_compression( + windows_req->CompressionFormat + ); +} +``` + +### 3.4 Vulkan Device Sharing + +**Challenge**: DirectStorage expects D3D12 device, we need Vulkan + +**Solution**: vkd3d-proton already handles D3D12 โ†’ Vulkan translation + +**Integration**: +```cpp +// Get Vulkan device from vkd3d-proton +VkDevice vk_device = vkd3d_get_vk_device(d3d12_device); +VkQueue vk_queue = vkd3d_get_vk_queue(d3d12_device); + +// Create ds-runtime Vulkan backend with shared device +ds_vulkan_backend_config config; +config.device = vk_device; +config.queue = vk_queue; +config.take_ownership = false; // Don't destroy device + +ds_backend_t* backend = ds_make_vulkan_backend(&config); +``` + +### 3.5 Implementation Steps + +#### Step 1: C ABI Wrapper (Already Exists) + +**Status**: โœ… Complete +- `include/ds_runtime_c.h` provides C API +- Type conversions implemented +- Tested with `c_abi_stats_test.c` + +#### Step 2: Create dstorage.dll Skeleton + +**Location**: Outside ds-runtime repo (in Wine tree) + +**Files**: +``` +dlls/dstorage/ +โ”œโ”€โ”€ Makefile.in +โ”œโ”€โ”€ dstorage.spec +โ”œโ”€โ”€ dstorage_main.c +โ”œโ”€โ”€ queue.c +โ””โ”€โ”€ request.c +``` + +**Implement**: +- `DStorageGetFactory` +- `DStorageSetConfiguration` +- `IDStorageFactory::CreateQueue` +- `IDStorageQueue::EnqueueRequest` +- `IDStorageQueue::Submit` +- `IDStorageQueue::EnqueueSignal` + +#### Step 3: Link with ds-runtime + +**Option A: Dynamic Linking** +```makefile +EXTRADLLFLAGS = -Wl,--no-undefined +EXTRALIBS = -lds_runtime +``` + +**Option B: Static Linking** +```makefile +EXTRALIBS = $(LIBDS_RUNTIME_STATIC) +``` + +#### Step 4: Test with Real Games + +**Test Titles**: +- Forspoken (uses DirectStorage) +- Ratchet & Clank: Rift Apart +- Any UE5 game with DirectStorage support + +**Validation**: +- Game launches without crashes +- Asset loading works +- Performance acceptable +- No memory leaks + +### 3.6 Documentation + 
+**Create**: `docs/wine_integration_guide.md` + +**Contents**: +- Build dstorage.dll +- Configure Wine to use builtin override +- Debugging tips +- Performance tuning +- Known issues + +### 3.7 Timeline + +**Week 1-2: Prototype** +- Create basic dstorage.dll shim +- Implement skeleton COM interfaces +- Test with simple DirectStorage app + +**Week 3-4: Type Mapping** +- Implement full type conversion +- Handle edge cases +- Vulkan device sharing + +**Week 5-6: Testing** +- Test with real games +- Performance benchmarking +- Bug fixing + +**Week 7-8: Polish** +- Documentation +- Error handling +- Wine upstreaming (if desired) + +**Total Estimate**: **8 weeks** + +--- + +## Part 4: Master Implementation Roadmap + +### 4.1 Dependency Graph + +``` +GDeflate CPU โ”โ”โ”โ”“ + โ”ƒ +Vulkan Compute โ”โ•‹โ”โ”> GDeflate GPU + โ”ƒ +io_uring Multi โ”ƒ + โ”ƒ +Cancellation โ”โ”โ”โ•‹โ”โ”> GPU Workflows + โ”ƒ + โ”—โ”โ”> Wine/Proton Integration +``` + +### 4.2 Phased Implementation + +**Phase 1: Foundation** (Weeks 1-8) +- โœ… Initial assessment (complete) +- โฉ GDeflate research (2 weeks) +- โฉ Vulkan compute infrastructure (8 weeks, parallel) + +**Phase 2: Core Features** (Weeks 9-18) +- โฉ GDeflate CPU implementation (5 weeks) +- โฉ io_uring multi-worker (6 weeks, parallel) +- โฉ Request cancellation (3 weeks, parallel) + +**Phase 3: Advanced Features** (Weeks 19-28) +- โฉ GDeflate GPU implementation (6 weeks) +- โฉ GPU workflow optimization (4 weeks) + +**Phase 4: Integration** (Weeks 29-36) +- โฉ Wine/Proton shim (8 weeks) +- โฉ Real game testing +- โฉ Performance tuning + +**Total Timeline**: **36 weeks (9 months)** + +### 4.3 Parallelization Opportunities + +**Can Work in Parallel**: +- Vulkan compute + GDeflate research +- GDeflate CPU + io_uring enhancements +- GDeflate CPU + cancellation +- GPU workflows + Wine integration + +**Must Be Sequential**: +- Vulkan compute โ†’ GDeflate GPU +- GDeflate CPU โ†’ GDeflate GPU +- Core features โ†’ Wine 
integration + +### 4.4 Fast Track Option + +**Goal**: Minimal viable product in 12 weeks + +**Scope**: +- โœ… CPU backend (working) +- โฉ GDeflate CPU (5 weeks) +- โฉ Vulkan compute (8 weeks, start week 1) +- โฉ Basic Wine shim (3 weeks) +- โŒ Skip: GPU GDeflate, io_uring multi-worker, advanced features + +**Timeline**: **12 weeks** + +--- + +## Part 5: Success Criteria + +### 5.1 Functional Requirements + +**Core**: +- โœ… All features work independently +- โœ… Integration tests pass +- โœ… No regressions in existing functionality +- โœ… Documentation complete + +**Performance**: +- โœ… GDeflate CPU: โ‰ฅ 500 MB/s +- โœ… GDeflate GPU: โ‰ฅ 2 GB/s +- โœ… io_uring: โ‰ฅ 2x CPU backend +- โœ… Wine overhead: < 10% + +**Quality**: +- โœ… No memory leaks +- โœ… Thread-safe +- โœ… Vulkan validation clean +- โœ… Works on CachyOS/Arch Linux + +### 5.2 Wine/Proton Validation + +**Required**: +- โœ… At least one DirectStorage game runs +- โœ… Asset loading works correctly +- โœ… Performance within 20% of Windows +- โœ… No crashes or hangs + +--- + +## Part 6: Risk Assessment + +### 6.1 Technical Risks + +| Risk | Probability | Impact | Mitigation | +|------|-------------|--------|------------| +| GDeflate format unavailable | Medium | High | Reverse engineer, community collaboration | +| GPU compute too slow | Low | Medium | Optimize shaders, fallback to CPU | +| Wine integration complex | High | Medium | Start simple, iterate | +| Hardware incompatibility | Medium | High | Test on multiple GPUs, provide fallbacks | + +### 6.2 Timeline Risks + +| Risk | Impact | Mitigation | +|------|--------|------------| +| GDeflate research longer than expected | +4 weeks | Start GPU work in parallel | +| Wine upstreaming delays | +8 weeks | Maintain out-of-tree fork | +| Testing reveals bugs | +2-4 weeks | Allocate buffer time | + +--- + +## Part 7: Next Actions + +### Immediate (This Week) +1. โœ… Complete investigation documents +2. โฉ Begin GDeflate format research +3. 
โฉ Start Vulkan compute implementation +4. โฉ Install liburing for io_uring testing + +### Short Term (Weeks 2-4) +1. โฉ Implement shader module loading +2. โฉ Begin GDeflate CPU decoder +3. โฉ Design cancellation API +4. โฉ Test io_uring multi-worker prototype + +### Medium Term (Weeks 5-12) +1. โฉ Complete Vulkan compute pipelines +2. โฉ Finish GDeflate CPU implementation +3. โฉ Implement request cancellation +4. โฉ Start Wine shim prototype + +--- + +**Document Status**: Draft v1.0 +**Last Updated**: 2026-02-16 +**Next Review**: After Phase 1 milestones complete diff --git a/docs/investigation_summary.md b/docs/investigation_summary.md new file mode 100644 index 0000000..74e6246 --- /dev/null +++ b/docs/investigation_summary.md @@ -0,0 +1,386 @@ +# DS-Runtime Investigation Phase Summary + +**Date**: February 16, 2026 +**Phase**: Investigation & Planning (Phase 0) +**Status**: โœ… Complete + +--- + +## Overview + +A comprehensive investigation and planning phase has been completed for the ds-runtime project, producing detailed roadmaps for implementing a complete DirectStorage-style I/O and decompression pipeline natively on Linux. 
+ +--- + +## Deliverables + +### Investigation Documents (110KB+ of Analysis) + +| Document | Size | Focus | Timeline | +|----------|------|-------|----------| +| [Master Roadmap](master_roadmap.md) | 30KB | Complete phased plan | 36 weeks / 12 week MVP | +| [GDeflate Investigation](investigation_gdeflate.md) | 16KB | Compression implementation | 9-13 weeks | +| [Vulkan Compute](investigation_vulkan_compute.md) | 26KB | GPU compute pipelines | 8 weeks | +| [io_uring Backend](investigation_io_uring.md) | 20KB | Multi-worker enhancement | 6 weeks | +| [Additional Features](investigation_remaining_features.md) | 18KB | Cancellation, GPU workflows, Wine | 3-8 weeks each | + +--- + +## Current Project Status + +### โœ… Working Features (Phase 0 Complete) +- CPU backend with thread pool +- Read/write operations (pread/pwrite) +- FakeUppercase demo compression +- Error reporting with rich context +- Request completion tracking +- C ABI for Wine/Proton integration +- Basic test suite (4 tests, 100% pass rate) +- CMake build system with C++20 + +**Build Status**: โœ… Compiles cleanly, all tests pass + +### โš ๏ธ Partially Implemented +- **Vulkan backend**: Staging buffer copies work, compute pipelines missing +- **io_uring backend**: Single worker only, needs multi-worker support +- **Compression**: FakeUppercase works, GDeflate intentionally stubbed + +### โŒ Not Implemented +- GDeflate compression/decompression (CPU & GPU) +- Vulkan GPU compute pipelines +- Request cancellation +- GPU-resident workflow optimizations +- Wine/Proton dstorage.dll shim + +--- + +## Implementation Roadmap + +### Timeline Summary + +``` +Phase 1: Foundation & Research Weeks 1-8 โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘ +Phase 2: Core Features Weeks 9-18 โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘ +Phase 3: Advanced GPU Weeks 19-28 
โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘ +Phase 4: Wine/Proton Integration Weeks 29-36 โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ + +Total: 36 weeks (9 months) for complete implementation +MVP Option: 12 weeks (3 months) for basic functionality +``` + +### Phase Breakdown + +#### Phase 1: Foundation & Research (Weeks 1-8) +**Parallel Tracks**: +- Track A: GDeflate format research and specification (3 weeks) +- Track B: Vulkan compute infrastructure (8 weeks) + - Shader module loading + - Descriptor management + - Pipeline creation + - Compute dispatch + +**Deliverables**: +- โœ… GDeflate format specification +- โœ… Vulkan compute capability + +#### Phase 2: Core Features (Weeks 9-18) +**Parallel Tracks**: +- Track A: GDeflate CPU implementation (5 weeks) + - Block header parser + - DEFLATE integration (zlib) + - Backend integration + +- Track B: io_uring multi-worker (6 weeks) + - Worker pool architecture + - Load balancing + - Advanced features (fixed files, SQPOLL) + +- Track C: Request cancellation (4 weeks) + - API design + - Queue implementation + - Backend integration + +**Deliverables**: +- โœ… GDeflate CPU decompression working +- โœ… io_uring production-ready +- โœ… Cancellation implemented + +#### Phase 3: Advanced GPU Features (Weeks 19-28) +**Sequential**: +- GDeflate GPU implementation (6 weeks) + - GPU decompression shader + - Pipeline integration + - CPU/GPU hybrid strategy + +- GPU-resident workflow optimization (4 weeks) + - Memory pooling + - Async transfers + - GPU-to-GPU optimization + +**Deliverables**: +- โœ… GPU-accelerated decompression +- โœ… Optimized GPU workflows + +#### Phase 4: Wine/Proton Integration (Weeks 29-36) +**Sequential**: +- dstorage.dll shim development (4 weeks) + - COM interface skeleton + - Type mapping (Windows โ†’ Linux) + - Queue implementation + +- Testing and integration (4 weeks) + - Integration 
testing + - Real game testing + - Performance optimization + - Documentation + +**Deliverables**: +- โœ… Wine/Proton support for DirectStorage games +- โœ… Complete documentation + +--- + +## MVP Fast Track (12 Weeks) + +**Reduced Scope for Rapid Validation**: +- โœ… CPU backend (already complete) +- Week 1-5: GDeflate CPU implementation +- Week 1-8: Basic Vulkan compute (parallel) +- Week 9-12: Simple Wine shim + +**Deferred**: +- โŒ GDeflate GPU +- โŒ io_uring multi-worker +- โŒ Request cancellation +- โŒ GPU optimizations + +**Goal**: Functional DirectStorage support in 3 months for testing + +--- + +## Technical Analysis + +### Critical Path + +``` +GDeflate Research โ†’ GDeflate CPU โ†’ GDeflate GPU + โ†“ โ†“ โ†“ +Vulkan Compute โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ†“ +Wine/Proton Integration +``` + +**Key Dependencies**: +- GDeflate GPU requires Vulkan compute pipelines +- Wine integration requires GDeflate CPU (minimum) +- GPU optimizations require GPU decompression + +### Parallelization Opportunities + +**Can Work in Parallel**: +1. GDeflate research + Vulkan compute (Weeks 1-8) +2. GDeflate CPU + io_uring multi-worker (Weeks 9-13) +3. GDeflate CPU + Request cancellation (Weeks 9-13) +4. GPU workflows + Wine integration planning + +**Must Be Sequential**: +1. Vulkan compute โ†’ GDeflate GPU +2. GDeflate CPU โ†’ GDeflate GPU +3. 
Core features โ†’ Wine integration + +--- + +## Performance Targets + +| Component | Target | Measurement | +|-----------|--------|-------------| +| **GDeflate CPU** | โ‰ฅ 500 MB/s | Decompression throughput | +| **GDeflate GPU** | โ‰ฅ 2 GB/s | Decompression throughput | +| **io_uring** | โ‰ฅ 2x CPU backend | File I/O throughput | +| **Vulkan Compute** | < 100 ยตs overhead | Dispatch latency | +| **Wine/Proton** | < 10% overhead | vs native Linux | + +--- + +## Risk Assessment + +### High-Impact Risks + +| Risk | Probability | Mitigation | +|------|-------------|------------| +| **GDeflate format unavailable** | Medium | Reverse engineer, community collaboration | +| **Wine integration complex** | High | Incremental approach, Wine dev consultation | +| **Hardware compatibility** | Medium | Test multiple GPUs, provide CPU fallback | + +### Timeline Risks + +| Risk | Impact | Mitigation | +|------|--------|------------| +| **GDeflate research > 3 weeks** | +4 weeks | Start GPU work in parallel | +| **GPU shader optimization** | +2 weeks | Accept initial performance, iterate | +| **Wine upstreaming delays** | +8 weeks | Maintain out-of-tree fork initially | + +--- + +## Success Criteria + +### Functional Requirements +- โœ… All features work independently +- โœ… Comprehensive test coverage (โ‰ฅ80%) +- โœ… No memory leaks (valgrind clean) +- โœ… Thread-safe operations +- โœ… Vulkan validation layers pass +- โœ… At least one DirectStorage game runs on Wine/Proton + +### Performance Requirements +- โœ… GDeflate CPU: โ‰ฅ 500 MB/s throughput +- โœ… GDeflate GPU: โ‰ฅ 2 GB/s throughput +- โœ… io_uring: โ‰ฅ 2x CPU backend performance +- โœ… GPU utilization โ‰ฅ 80% during compute +- โœ… Wine/Proton overhead < 10% vs Windows + +### Quality Requirements +- โœ… Complete documentation for all features +- โœ… API stability maintained (no breaking changes) +- โœ… Works on CachyOS/Arch Linux +- โœ… Multi-GPU vendor support (AMD, NVIDIA, Intel) + +--- + +## Resource Requirements + 
+### Development Environment +- CachyOS or Arch Linux system +- Vulkan-capable GPU (โ‰ฅ Vulkan 1.0) +- liburing library (optional) +- glslangValidator (shader compilation) +- RenderDoc (GPU debugging, optional) + +### External Dependencies +- **Required**: CMake 3.16+, C++20 compiler, pthreads +- **Optional**: Vulkan SDK, liburing, Wine/Proton for testing + +### Hardware +- **Minimum**: CPU with โ‰ฅ4 cores, 8GB RAM, Vulkan 1.0 GPU +- **Recommended**: CPU with โ‰ฅ8 cores, 16GB RAM, Vulkan 1.3 GPU, NVMe SSD + +--- + +## Next Steps + +### Immediate Actions (Week 1) +1. โœ… Investigation documents complete (done) +2. โฉ Begin GDeflate format specification research +3. โฉ Start Vulkan shader module loading implementation +4. โฉ Install liburing for io_uring development +5. โฉ Set up shader build system in CMake + +### Short-Term Goals (Weeks 2-4) +1. Complete GDeflate format documentation +2. Implement Vulkan descriptor management +3. Begin GDeflate block header parser +4. Design request cancellation API +5. Test io_uring multi-worker prototype + +### Medium-Term Goals (Weeks 5-12) +1. Complete GDeflate CPU implementation +2. Finish Vulkan compute pipelines +3. Implement io_uring multi-worker +4. Add request cancellation +5. 
Comprehensive testing of all components + +--- + +## Detailed Microtasking + +The investigation has produced **150+ microtasks** across all phases: +- Each task sized at 0.5-5 days +- Clear dependencies identified +- Parallel work opportunities mapped +- Testing integrated at every phase +- Documentation requirements specified + +Example microtask breakdown (Phase 1.2.1 - Shader Module System): +- Task 1.2.1.1: Implement shader file loading (1 day) +- Task 1.2.1.2: Create VkShaderModule wrapper (1 day) +- Task 1.2.1.3: Implement shader caching (2 days) +- Task 1.2.1.4: Test shader loading (1 day) +- Task 1.2.1.5: Set up shader build system (2 days) + +--- + +## Documentation Structure + +``` +docs/ +โ”œโ”€โ”€ investigation_gdeflate.md # 16KB - GDeflate implementation plan +โ”œโ”€โ”€ investigation_vulkan_compute.md # 26KB - GPU compute infrastructure +โ”œโ”€โ”€ investigation_io_uring.md # 20KB - io_uring backend enhancement +โ”œโ”€โ”€ investigation_remaining_features.md # 18KB - Cancellation, GPU, Wine +โ”œโ”€โ”€ master_roadmap.md # 30KB - Complete phased plan +โ”œโ”€โ”€ investigation_summary.md # 7KB - This document +โ”‚ +โ”œโ”€โ”€ design.md # Architecture overview +โ”œโ”€โ”€ wine_proton.md # Wine/Proton integration notes +โ””โ”€โ”€ archlinux_vulkan_integration.md # CachyOS/Arch specific notes +``` + +Total Investigation Documentation: **117KB** of detailed planning + +--- + +## Conclusion + +This investigation phase has established a comprehensive foundation for completing the ds-runtime project: + +**Strengths**: +- โœ… Clear roadmap with detailed microtasks +- โœ… Realistic timelines with parallel work identified +- โœ… Risk assessment and mitigation strategies +- โœ… Multiple scope options (MVP vs full) +- โœ… Solid existing foundation (CPU backend working) + +**Readiness**: +- All major technical questions answered +- Dependencies and blockers identified +- Testing strategy defined +- Success criteria established +- Resource requirements documented + 
+**Recommendation**: +- โœ… Ready to proceed to implementation +- โœ… Begin with parallel Phase 1 tracks +- โœ… Maintain weekly progress reports +- โœ… Iterate on plan based on discoveries + +The project can begin active development immediately with high confidence in the approach and timeline. + +--- + +**Status**: Investigation Phase Complete โœ… +**Next Phase**: Phase 1 - Foundation & Research +**Start Date**: Week 1 (after approval) +**First Milestone**: Week 3 - GDeflate format specification complete + +--- + +## Appendix: Key Metrics + +| Metric | Value | +|--------|-------| +| **Investigation Documents** | 5 major documents | +| **Total Documentation** | 117KB | +| **Microtasks Identified** | 150+ tasks | +| **Total Timeline** | 36 weeks (full) / 12 weeks (MVP) | +| **Test Files** | 12+ new tests planned | +| **New Source Files** | 8+ implementation files | +| **Phases** | 4 major, 12 sub-phases | +| **Performance Targets** | 5 key metrics defined | +| **Risk Items** | 6 high-impact risks identified | +| **Success Criteria** | 15 functional + performance + quality | + +--- + +**Document Version**: 1.0 +**Author**: Investigation Phase Team +**Last Updated**: 2026-02-16 diff --git a/docs/investigation_vulkan_compute.md b/docs/investigation_vulkan_compute.md new file mode 100644 index 0000000..e072da1 --- /dev/null +++ b/docs/investigation_vulkan_compute.md @@ -0,0 +1,996 @@ +# Vulkan GPU Compute Pipeline Investigation and Implementation Plan + +**Status:** Planning Phase +**Priority:** High +**Target:** Full GPU compute capability for decompression and data processing +**Dependencies:** Existing Vulkan device/queue infrastructure + +--- + +## Executive Summary + +The current Vulkan backend in ds-runtime supports staging buffer copies but lacks compute pipeline functionality. This document outlines the investigation, design, and implementation plan for adding full GPU compute capabilities, primarily for GPU-accelerated decompression. + +--- + +## 1. 
Current State + +### 1.1 What Works + +**Existing Vulkan Infrastructure** (`src/ds_runtime_vulkan.cpp`): +- โœ… Vulkan instance creation +- โœ… Physical device selection +- โœ… Logical device and queue creation +- โœ… Command pool management +- โœ… Staging buffer allocation (host-visible) +- โœ… GPU buffer allocation (device-local) +- โœ… `vkCmdCopyBuffer` for staging โ†” GPU transfers +- โœ… Synchronization via `vkDeviceWaitIdle` +- โœ… Memory type selection and allocation +- โœ… Request submission and completion tracking + +**Capability**: File I/O โ†’ Staging buffer โ†’ GPU buffer (pure data transfer, no computation) + +### 1.2 What's Missing + +**Compute Pipeline Components**: +- โŒ Compute pipeline creation (`vkCreateComputePipelines`) +- โŒ Shader module loading (`vkCreateShaderModule`) +- โŒ Descriptor set layout creation +- โŒ Descriptor pool allocation +- โŒ Descriptor set updates (buffer bindings) +- โŒ Pipeline layout creation +- โŒ Compute command recording (`vkCmdBindPipeline`, `vkCmdDispatch`) +- โŒ Push constant support +- โŒ Compute-specific synchronization (barriers) + +**Impact**: Cannot execute any GPU compute workloads (decompression, transforms, etc.) + +### 1.3 Existing Assets (Unused) + +**SPIR-V Shader**: `examples/vk-copy-test/copy.comp.spv` (256 bytes) +- Precompiled compute shader +- Currently not loaded or used by any code +- Likely a simple buffer copy/transform shader +- Should be examined and potentially reused + +--- + +## 2. 
Vulkan Compute Architecture + +### 2.1 Compute Pipeline Overview + +``` +Application + โ†“ (prepare compute work) +Descriptor Sets (bind buffers, uniforms) + โ†“ +Pipeline Layout (descriptor layouts + push constants) + โ†“ +Compute Pipeline (shader + configuration) + โ†“ +Command Buffer (vkCmdBindPipeline, vkCmdDispatch) + โ†“ +GPU Execution (workgroups โ†’ invocations) + โ†“ +Synchronization (barriers, fences) + โ†“ +Results in GPU buffers +``` + +### 2.2 Key Vulkan Objects + +| Object | Purpose | Lifetime | +|--------|---------|----------| +| **VkShaderModule** | Compiled SPIR-V code | Per-shader, reusable | +| **VkDescriptorSetLayout** | Binding layout (types, stages) | Per-layout, reusable | +| **VkDescriptorPool** | Allocation pool for descriptor sets | Per-backend, managed | +| **VkDescriptorSet** | Actual buffer bindings | Per-dispatch, short-lived | +| **VkPipelineLayout** | Push constants + descriptor layouts | Per-pipeline, reusable | +| **VkPipeline** | Complete compute configuration | Per-shader, reusable | +| **VkCommandBuffer** | Recorded GPU commands | Per-submission, pooled | + +### 2.3 Execution Model + +**Workgroup Hierarchy**: +``` +Global Work Size (e.g., 1024 elements) + โ†“ divide by local_size_x +Workgroups (e.g., 4 workgroups if local_size_x = 256) + โ†“ parallel execution +Invocations (256 invocations per workgroup) + โ†“ +Each invocation processes one element +``` + +**Shader Invocation IDs**: +- `gl_GlobalInvocationID`: Unique ID across all invocations +- `gl_LocalInvocationID`: ID within the workgroup +- `gl_WorkGroupID`: Workgroup index + +--- + +## 3. Implementation Plan + +### 3.1 Phase 1: Shader Module Loading + +**Goal**: Load and create VkShaderModule from SPIR-V bytecode + +#### Tasks +1. Add SPIR-V loading utility function +2. Implement shader module creation +3. Add shader module caching (avoid redundant loads) +4. 
Validate shader compilation
+
+#### API Design
+
+**Location**: `src/ds_runtime_vulkan.cpp`
+
+```cpp
+class ShaderModuleCache {
+public:
+    VkShaderModule load_shader(
+        VkDevice device,
+        const std::string& path
+    );
+
+    void destroy_all(VkDevice device);
+
+private:
+    std::unordered_map<std::string, VkShaderModule> modules_;
+};
+
+// Add to VulkanBackend::Impl
+ShaderModuleCache shader_cache_;
+```
+
+#### Implementation
+
+```cpp
+VkShaderModule ShaderModuleCache::load_shader(
+    VkDevice device,
+    const std::string& path
+) {
+    // Check cache
+    auto it = modules_.find(path);
+    if (it != modules_.end()) {
+        return it->second;
+    }
+
+    // Read SPIR-V file
+    std::ifstream file(path, std::ios::binary | std::ios::ate);
+    if (!file) {
+        throw std::runtime_error("Failed to open shader file");
+    }
+
+    size_t size = file.tellg();
+    std::vector<char> code(size);
+    file.seekg(0);
+    file.read(code.data(), size);
+
+    // Create shader module (pCode requires 4-byte-aligned SPIR-V words)
+    VkShaderModuleCreateInfo create_info{};
+    create_info.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
+    create_info.codeSize = code.size();
+    create_info.pCode = reinterpret_cast<const uint32_t*>(code.data());
+
+    VkShaderModule module;
+    VkResult result = vkCreateShaderModule(
+        device, &create_info, nullptr, &module
+    );
+
+    if (result != VK_SUCCESS) {
+        throw std::runtime_error("Failed to create shader module");
+    }
+
+    modules_[path] = module;
+    return module;
+}
+```
+
+#### Testing
+- Load existing `copy.comp.spv` shader
+- Validate shader module creation succeeds
+- Test error handling for missing files
+- Verify shader module caching works
+
+---
+
+### 3.2 Phase 2: Descriptor Set Layout
+
+**Goal**: Define buffer binding layouts for compute shaders
+
+#### Descriptor Layouts for Decompression
+
+**Example: GDeflate Decompression**
+```cpp
+// Binding 0: Input compressed buffer (read-only)
+// Binding 1: Block metadata (read-only)
+// Binding 2: Output decompressed buffer (write-only)
+```
+
+#### Implementation
+
+**Location**: `src/ds_runtime_vulkan.cpp`
+
+```cpp
+struct 
ComputeDescriptorLayouts { + VkDescriptorSetLayout decompression_layout; + VkDescriptorSetLayout copy_layout; + // Add more as needed +}; + +VkDescriptorSetLayout create_decompression_descriptor_layout( + VkDevice device +) { + VkDescriptorSetLayoutBinding bindings[3]; + + // Binding 0: Input compressed data + bindings[0].binding = 0; + bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + bindings[0].descriptorCount = 1; + bindings[0].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + bindings[0].pImmutableSamplers = nullptr; + + // Binding 1: Block metadata + bindings[1].binding = 1; + bindings[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + bindings[1].descriptorCount = 1; + bindings[1].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + bindings[1].pImmutableSamplers = nullptr; + + // Binding 2: Output decompressed data + bindings[2].binding = 2; + bindings[2].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + bindings[2].descriptorCount = 1; + bindings[2].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + bindings[2].pImmutableSamplers = nullptr; + + VkDescriptorSetLayoutCreateInfo layout_info{}; + layout_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + layout_info.bindingCount = 3; + layout_info.pBindings = bindings; + + VkDescriptorSetLayout layout; + VkResult result = vkCreateDescriptorSetLayout( + device, &layout_info, nullptr, &layout + ); + + if (result != VK_SUCCESS) { + throw std::runtime_error("Failed to create descriptor set layout"); + } + + return layout; +} +``` + +#### Add to VulkanBackend + +```cpp +// In VulkanBackend::Impl +ComputeDescriptorLayouts descriptor_layouts_; + +// Initialize in constructor +descriptor_layouts_.decompression_layout = + create_decompression_descriptor_layout(device_); +``` + +--- + +### 3.3 Phase 3: Descriptor Pool + +**Goal**: Allocate descriptor pool for runtime descriptor set allocation + +#### Pool Sizing + +**Strategy**: Pre-allocate pool large enough for concurrent dispatches + +```cpp +// 
Estimate: 16 concurrent compute dispatches +// Each dispatch needs 3 storage buffer descriptors +// Total: 16 * 3 = 48 storage buffer descriptors +``` + +#### Implementation + +```cpp +VkDescriptorPool create_compute_descriptor_pool( + VkDevice device, + uint32_t max_sets = 32 +) { + VkDescriptorPoolSize pool_size{}; + pool_size.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + pool_size.descriptorCount = max_sets * 3; // 3 bindings per set + + VkDescriptorPoolCreateInfo pool_info{}; + pool_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + pool_info.poolSizeCount = 1; + pool_info.pPoolSizes = &pool_size; + pool_info.maxSets = max_sets; + pool_info.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; + + VkDescriptorPool pool; + VkResult result = vkCreateDescriptorPool( + device, &pool_info, nullptr, &pool + ); + + if (result != VK_SUCCESS) { + throw std::runtime_error("Failed to create descriptor pool"); + } + + return pool; +} + +// Add to VulkanBackend::Impl +VkDescriptorPool descriptor_pool_; + +// Initialize in constructor +descriptor_pool_ = create_compute_descriptor_pool(device_); +``` + +--- + +### 3.4 Phase 4: Pipeline Layout and Compute Pipeline + +**Goal**: Create compute pipeline with shader and layout + +#### Pipeline Layout + +```cpp +VkPipelineLayout create_compute_pipeline_layout( + VkDevice device, + VkDescriptorSetLayout descriptor_layout +) { + // Optional: push constants for dispatch parameters + VkPushConstantRange push_constant{}; + push_constant.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + push_constant.offset = 0; + push_constant.size = sizeof(uint32_t) * 4; // Example: 4 uint32s + + VkPipelineLayoutCreateInfo layout_info{}; + layout_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + layout_info.setLayoutCount = 1; + layout_info.pSetLayouts = &descriptor_layout; + layout_info.pushConstantRangeCount = 1; + layout_info.pPushConstantRanges = &push_constant; + + VkPipelineLayout layout; + VkResult result = 
vkCreatePipelineLayout(
+        device, &layout_info, nullptr, &layout
+    );
+
+    if (result != VK_SUCCESS) {
+        throw std::runtime_error("Failed to create pipeline layout");
+    }
+
+    return layout;
+}
+```
+
+#### Compute Pipeline
+
+```cpp
+VkPipeline create_compute_pipeline(
+    VkDevice device,
+    VkPipelineLayout layout,
+    VkShaderModule shader_module,
+    const char* entry_point = "main"
+) {
+    VkPipelineShaderStageCreateInfo stage_info{};
+    stage_info.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
+    stage_info.stage = VK_SHADER_STAGE_COMPUTE_BIT;
+    stage_info.module = shader_module;
+    stage_info.pName = entry_point;
+
+    VkComputePipelineCreateInfo pipeline_info{};
+    pipeline_info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
+    pipeline_info.stage = stage_info;
+    pipeline_info.layout = layout;
+
+    VkPipeline pipeline;
+    VkResult result = vkCreateComputePipelines(
+        device, VK_NULL_HANDLE, 1, &pipeline_info, nullptr, &pipeline
+    );
+
+    if (result != VK_SUCCESS) {
+        throw std::runtime_error("Failed to create compute pipeline");
+    }
+
+    return pipeline;
+}
+```
+
+#### Pipeline Management
+
+```cpp
+struct ComputePipeline {
+    VkPipeline pipeline;
+    VkPipelineLayout layout;
+    VkDescriptorSetLayout descriptor_layout;
+    VkShaderModule shader;
+};
+
+// Add to VulkanBackend::Impl
+std::unordered_map<std::string, ComputePipeline> compute_pipelines_;
+
+ComputePipeline create_decompression_pipeline() {
+    ComputePipeline result;
+
+    // Load shader
+    result.shader = shader_cache_.load_shader(
+        device_, "shaders/decompress.comp.spv"
+    );
+
+    // Create descriptor layout
+    result.descriptor_layout =
+        create_decompression_descriptor_layout(device_);
+
+    // Create pipeline layout
+    result.layout = create_compute_pipeline_layout(
+        device_, result.descriptor_layout
+    );
+
+    // Create pipeline
+    result.pipeline = create_compute_pipeline(
+        device_, result.layout, result.shader
+    );
+
+    return result;
+}
+```
+
+---
+
+### 3.5 Phase 5: Descriptor Set Allocation and Updates
+
+**Goal**: Bind 
buffers to descriptor sets for each dispatch + +#### Allocation + +```cpp +VkDescriptorSet allocate_descriptor_set( + VkDevice device, + VkDescriptorPool pool, + VkDescriptorSetLayout layout +) { + VkDescriptorSetAllocateInfo alloc_info{}; + alloc_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + alloc_info.descriptorPool = pool; + alloc_info.descriptorSetCount = 1; + alloc_info.pSetLayouts = &layout; + + VkDescriptorSet descriptor_set; + VkResult result = vkAllocateDescriptorSets( + device, &alloc_info, &descriptor_set + ); + + if (result != VK_SUCCESS) { + throw std::runtime_error("Failed to allocate descriptor set"); + } + + return descriptor_set; +} +``` + +#### Buffer Binding + +```cpp +void update_decompression_descriptor_set( + VkDevice device, + VkDescriptorSet descriptor_set, + VkBuffer input_buffer, + VkBuffer metadata_buffer, + VkBuffer output_buffer, + VkDeviceSize input_size, + VkDeviceSize metadata_size, + VkDeviceSize output_size +) { + VkDescriptorBufferInfo buffer_infos[3]; + + // Input buffer + buffer_infos[0].buffer = input_buffer; + buffer_infos[0].offset = 0; + buffer_infos[0].range = input_size; + + // Metadata buffer + buffer_infos[1].buffer = metadata_buffer; + buffer_infos[1].offset = 0; + buffer_infos[1].range = metadata_size; + + // Output buffer + buffer_infos[2].buffer = output_buffer; + buffer_infos[2].offset = 0; + buffer_infos[2].range = output_size; + + VkWriteDescriptorSet writes[3]; + for (int i = 0; i < 3; ++i) { + writes[i].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + writes[i].pNext = nullptr; + writes[i].dstSet = descriptor_set; + writes[i].dstBinding = i; + writes[i].dstArrayElement = 0; + writes[i].descriptorCount = 1; + writes[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + writes[i].pBufferInfo = &buffer_infos[i]; + writes[i].pImageInfo = nullptr; + writes[i].pTexelBufferView = nullptr; + } + + vkUpdateDescriptorSets(device, 3, writes, 0, nullptr); +} +``` + +--- + +### 3.6 Phase 6: Compute 
Dispatch + +**Goal**: Record and execute compute commands + +#### Command Buffer Recording + +```cpp +void record_compute_dispatch( + VkCommandBuffer cmd, + VkPipeline pipeline, + VkPipelineLayout layout, + VkDescriptorSet descriptor_set, + uint32_t workgroup_count_x, + uint32_t workgroup_count_y = 1, + uint32_t workgroup_count_z = 1 +) { + // Bind compute pipeline + vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + // Bind descriptor set + vkCmdBindDescriptorSets( + cmd, + VK_PIPELINE_BIND_POINT_COMPUTE, + layout, + 0, // first set + 1, // descriptor set count + &descriptor_set, + 0, // dynamic offset count + nullptr + ); + + // Optional: push constants + // vkCmdPushConstants(cmd, layout, VK_SHADER_STAGE_COMPUTE_BIT, ...); + + // Dispatch compute work + vkCmdDispatch(cmd, workgroup_count_x, workgroup_count_y, workgroup_count_z); +} +``` + +#### Integration with Request Processing + +```cpp +void VulkanBackend::Impl::process_request_with_compute(Request& req) { + // Begin command buffer + VkCommandBufferBeginInfo begin_info{}; + begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + + vkBeginCommandBuffer(command_buffer_, &begin_info); + + // 1. Copy file data to staging buffer + // (existing code for file I/O) + + // 2. Barrier: transfer โ†’ compute + VkBufferMemoryBarrier barrier{}; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barrier.buffer = staging_buffer_; + barrier.offset = 0; + barrier.size = VK_WHOLE_SIZE; + + vkCmdPipelineBarrier( + command_buffer_, + VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + 0, 0, nullptr, 1, &barrier, 0, nullptr + ); + + // 3. 
Dispatch compute (decompression) + auto& pipeline = compute_pipelines_["decompress"]; + VkDescriptorSet desc_set = allocate_descriptor_set( + device_, descriptor_pool_, pipeline.descriptor_layout + ); + + update_decompression_descriptor_set( + device_, desc_set, + staging_buffer_, // compressed input + metadata_buffer_, // block info + req.gpu_buffer, // decompressed output + req.size, + metadata_size, + req.size * 2 // assume 2x expansion + ); + + uint32_t workgroup_count = (req.size + 255) / 256; + record_compute_dispatch( + command_buffer_, + pipeline.pipeline, + pipeline.layout, + desc_set, + workgroup_count + ); + + // 4. Barrier: compute โ†’ host (if needed for readback) + // ... + + // 5. End and submit + vkEndCommandBuffer(command_buffer_); + + VkSubmitInfo submit_info{}; + submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submit_info.commandBufferCount = 1; + submit_info.pCommandBuffers = &command_buffer_; + + vkQueueSubmit(queue_, 1, &submit_info, VK_NULL_HANDLE); + vkQueueWaitIdle(queue_); + + // Free descriptor set + vkFreeDescriptorSets(device_, descriptor_pool_, 1, &desc_set); +} +``` + +--- + +### 3.7 Phase 7: Synchronization and Barriers + +**Goal**: Proper memory synchronization between pipeline stages + +#### Key Barriers + +**Transfer โ†’ Compute**: +```cpp +VkBufferMemoryBarrier transfer_to_compute{}; +transfer_to_compute.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; +transfer_to_compute.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; +// Use: After vkCmdCopyBuffer, before compute dispatch +``` + +**Compute โ†’ Transfer**: +```cpp +VkBufferMemoryBarrier compute_to_transfer{}; +compute_to_transfer.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; +compute_to_transfer.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; +// Use: After compute dispatch, before staging buffer readback +``` + +**Compute โ†’ Compute** (between dispatches): +```cpp +VkBufferMemoryBarrier compute_to_compute{}; +compute_to_compute.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; 
+compute_to_compute.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
+// Use: Between dependent compute passes
+```
+
+#### Synchronization Best Practices
+
+1. **Minimize barriers**: Batch operations when possible
+2. **Use appropriate stages**: Don't over-synchronize
+3. **Consider queue ownership**: Handle queue family transfers if needed
+4. **Validate with layers**: Enable `VK_LAYER_KHRONOS_validation`
+
+---
+
+## 4. Shader Development
+
+### 4.1 Example: Simple Buffer Copy
+
+**File**: `shaders/buffer_copy.comp`
+
+```glsl
+#version 450
+
+layout(local_size_x = 256) in;
+
+layout(binding = 0) readonly buffer InputBuffer {
+    uint data[];
+} input_buf;
+
+layout(binding = 1) writeonly buffer OutputBuffer {
+    uint data[];
+} output_buf;
+
+layout(push_constant) uniform PushConstants {
+    uint element_count;
+} push;
+
+void main() {
+    uint idx = gl_GlobalInvocationID.x;
+    if (idx < push.element_count) {
+        output_buf.data[idx] = input_buf.data[idx];
+    }
+}
+```
+
+**Compilation**:
+```bash
+glslangValidator -V buffer_copy.comp -o buffer_copy.comp.spv
+```
+
+### 4.2 Example: Transform Shader
+
+**File**: `shaders/uppercase.comp` (enhanced version of FakeUppercase)
+
+Note: byte-granular storage access requires the 8-bit storage extensions
+(`VK_KHR_8bit_storage` on the device side); GLSL also has no character
+literals, so ASCII values are written numerically.
+
+```glsl
+#version 450
+#extension GL_EXT_shader_8bit_storage : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
+
+layout(local_size_x = 256) in;
+
+layout(binding = 0) readonly buffer InputBuffer {
+    uint8_t data[];
+} input_buf;
+
+layout(binding = 1) writeonly buffer OutputBuffer {
+    uint8_t data[];
+} output_buf;
+
+layout(push_constant) uniform PushConstants {
+    uint byte_count;
+} push;
+
+void main() {
+    uint idx = gl_GlobalInvocationID.x;
+    if (idx < push.byte_count) {
+        uint c = uint(input_buf.data[idx]);
+        // Uppercase ASCII: 'a' (97) .. 'z' (122) -> subtract 32
+        if (c >= 97u && c <= 122u) {
+            c -= 32u;
+        }
+        output_buf.data[idx] = uint8_t(c);
+    }
+}
+```
+
+### 4.3 Shader Build System
+
+**Add to CMakeLists.txt**:
+```cmake
+# Find glslangValidator
+find_program(GLSLANG_VALIDATOR glslangValidator)
+
+if(GLSLANG_VALIDATOR)
+    # Compile shaders
+    file(GLOB SHADER_SOURCES shaders/*.comp)
+
+    foreach(SHADER ${SHADER_SOURCES})
+    
get_filename_component(SHADER_NAME ${SHADER} NAME) + set(SPIRV "${CMAKE_CURRENT_BINARY_DIR}/shaders/${SHADER_NAME}.spv") + + add_custom_command( + OUTPUT ${SPIRV} + COMMAND ${CMAKE_COMMAND} -E make_directory + "${CMAKE_CURRENT_BINARY_DIR}/shaders" + COMMAND ${GLSLANG_VALIDATOR} -V ${SHADER} -o ${SPIRV} + DEPENDS ${SHADER} + COMMENT "Compiling shader ${SHADER_NAME}" + ) + + list(APPEND SPIRV_SHADERS ${SPIRV}) + endforeach() + + add_custom_target(shaders ALL DEPENDS ${SPIRV_SHADERS}) +endif() +``` + +--- + +## 5. Testing Strategy + +### 5.1 Unit Tests + +**Test Suite**: `tests/vulkan_compute_test.cpp` + +**Test Cases**: +1. **Pipeline creation**: Verify pipeline objects created successfully +2. **Shader loading**: Test shader module creation from SPIR-V +3. **Descriptor allocation**: Verify descriptor pool management +4. **Simple dispatch**: Buffer copy shader execution +5. **Transform shader**: Uppercase transform on GPU +6. **Error handling**: Invalid pipeline, missing shader, etc. + +### 5.2 Integration Tests + +**Scenarios**: +1. **File โ†’ GPU compute โ†’ GPU buffer**: Full decompression path +2. **Multiple dispatches**: Concurrent compute workloads +3. **Mixed operations**: Compute + transfer in same command buffer +4. **Large buffers**: Test with multi-MB data +5. **Synchronization**: Verify barriers work correctly + +### 5.3 Validation + +**Tools**: +- Vulkan Validation Layers (`VK_LAYER_KHRONOS_validation`) +- RenderDoc for command buffer inspection +- GPU profilers (Nsight, Radeon GPU Profiler) + +**Check for**: +- Synchronization errors +- Memory leaks (descriptor sets, pipelines) +- Invalid API usage +- Performance bottlenecks + +--- + +## 6. 
Performance Considerations + +### 6.1 Optimization Strategies + +**Shader Optimizations**: +- Coalesced memory access patterns +- Shared memory for temporary data +- Reduce divergent branches +- Optimize workgroup size for target GPU + +**Pipeline Management**: +- Reuse pipelines across requests +- Minimize pipeline switches +- Batch similar compute work +- Pre-warm pipeline caches + +**Memory Management**: +- Pool descriptor sets (avoid per-frame allocation) +- Reuse command buffers when possible +- Minimize host-device transfers +- Use push constants for small data + +### 6.2 Performance Targets + +**Goals**: +- Compute dispatch overhead < 100 ยตs +- Throughput โ‰ฅ 10 GB/s for simple transforms +- GPU utilization โ‰ฅ 80% during compute +- CPU overhead < 5% during GPU execution + +--- + +## 7. Dependencies and Requirements + +### 7.1 External Dependencies + +**Required**: +- Vulkan SDK (already optional dependency) +- `glslangValidator` for shader compilation +- C++20 compiler (already required) + +**Optional**: +- RenderDoc for debugging +- GPU profiling tools + +### 7.2 Hardware Requirements + +**Minimum**: +- Vulkan 1.0 support +- Compute queue support +- Storage buffer support + +**Recommended**: +- Vulkan 1.3 support +- Dedicated compute queue +- โ‰ฅ 4GB VRAM for large asset processing + +--- + +## 8. 
Timeline and Milestones + +### 8.1 Implementation Phases + +**Week 1-2: Foundation** +- Shader module loading +- Descriptor layouts +- Descriptor pool creation +- **Milestone**: Infrastructure ready + +**Week 3-4: Pipeline Creation** +- Pipeline layout +- Compute pipeline creation +- Pipeline management +- **Milestone**: Simple copy shader works + +**Week 5-6: Dispatch and Synchronization** +- Command buffer recording +- Compute dispatch +- Barriers and synchronization +- **Milestone**: Transform shader works + +**Week 7-8: Integration and Testing** +- Backend integration +- Comprehensive testing +- Performance tuning +- **Milestone**: Production-ready compute support + +### 8.2 Total Estimate + +**8 weeks** for complete Vulkan compute implementation + +**Dependencies**: None (can proceed independently) + +--- + +## 9. Success Criteria + +### 9.1 Functional Requirements +- โœ… Shader modules load from SPIR-V files +- โœ… Compute pipelines created successfully +- โœ… Descriptor sets allocated and bound correctly +- โœ… Compute dispatches execute on GPU +- โœ… Synchronization barriers work properly +- โœ… Existing Vulkan tests still pass +- โœ… New compute tests pass (100% coverage) + +### 9.2 Performance Requirements +- โœ… Compute overhead < 100 ยตs per dispatch +- โœ… GPU utilization โ‰ฅ 80% during compute +- โœ… Throughput โ‰ฅ 10 GB/s for simple operations +- โœ… No performance regression in existing paths + +### 9.3 Quality Requirements +- โœ… Vulkan validation layers pass (no errors) +- โœ… No memory leaks (descriptor sets, pipelines) +- โœ… Thread-safe pipeline management +- โœ… Documentation complete and accurate +- โœ… API stability maintained + +--- + +## 10. Next Steps + +### Immediate (This Week) +1. โœ… Complete investigation document +2. โฉ Examine existing `copy.comp.spv` shader +3. โฉ Set up shader build system +4. โฉ Create simple test shader + +### Short Term (Next 2 Weeks) +1. โฉ Implement shader module loading +2. 
โฉ Create descriptor layouts +3. โฉ Set up descriptor pool +4. โฉ Test infrastructure + +### Medium Term (1-2 Months) +1. โฉ Complete pipeline creation +2. โฉ Implement compute dispatch +3. โฉ Add synchronization +4. โฉ Comprehensive testing + +--- + +## 11. Open Questions + +1. **Shader Language**: Should we support multiple shader languages (GLSL, HLSL via DXC)? +2. **Pipeline Caching**: Do we need VkPipelineCache for faster startup? +3. **Async Compute**: Should we use dedicated compute queue or graphics queue? +4. **Workgroup Size**: How to determine optimal local_size_x for different GPUs? +5. **Error Recovery**: How to handle GPU compute failures gracefully? +6. **Shader Hot-Reload**: Do we need runtime shader recompilation for development? + +--- + +## 12. References + +### Vulkan Specification +- Vulkan 1.3 Specification: Compute Shaders +- Khronos Vulkan Guide: Compute +- Vulkan Tutorial: Compute Shaders + +### Best Practices +- Khronos Vulkan Best Practices +- AMD GPU Architecture +- NVIDIA Compute Best Practices +- Intel GPU Architecture Guide + +### Tools +- glslangValidator documentation +- RenderDoc user guide +- Vulkan Validation Layers + +--- + +**Document Status**: Draft v1.0 +**Last Updated**: 2026-02-16 +**Next Review**: After shader loading implementation complete diff --git a/docs/master_roadmap.md b/docs/master_roadmap.md new file mode 100644 index 0000000..ac7d3c5 --- /dev/null +++ b/docs/master_roadmap.md @@ -0,0 +1,1102 @@ +# DirectStorage-Style I/O and Decompression Pipeline: Master Roadmap + +**Project**: ds-runtime +**Goal**: Functioning DirectStorage-style I/O and decompression pipeline implemented natively on Linux +**Target Platform**: CachyOS/Arch Linux with GPU/Vulkan support +**Timeline**: 36 weeks (9 months) for complete implementation, 12 weeks for MVP + +--- + +## Executive Summary + +This document provides a comprehensive, phased implementation plan for completing the ds-runtime project. 
It integrates investigations across all components (GDeflate, Vulkan compute, io_uring, cancellation, GPU workflows, Wine/Proton) into a cohesive roadmap with extensive subphasing and microtasking. + +--- + +## 1. Project Status Overview + +### 1.1 Current State (Phase 0 - Complete โœ…) + +**Working Components**: +- โœ… CPU backend with thread pool +- โœ… Read/write operations (pread/pwrite) +- โœ… FakeUppercase demo compression +- โœ… Error reporting with rich context +- โœ… Request completion tracking +- โœ… C ABI for Wine/Proton integration +- โœ… Basic test suite (4 tests, all passing) +- โœ… Build system (CMake, C++20) + +**Project Builds Successfully**: All tests pass, demos work + +### 1.2 Components Requiring Work + +| Component | Status | Priority | Effort | +|-----------|--------|----------|--------| +| **GDeflate Compression** | โš ๏ธ Stubbed (ENOTSUP) | High | 9-13 weeks | +| **Vulkan GPU Compute** | โš ๏ธ Partial (copy only) | High | 8 weeks | +| **io_uring Backend** | โš ๏ธ Partial (single-worker) | Medium | 6 weeks | +| **Request Cancellation** | โŒ Not implemented | Medium | 3 weeks | +| **GPU-Resident Workflows** | โš ๏ธ Basic only | Medium | 4 weeks | +| **Wine/Proton Integration** | โŒ Documentation only | High | 8 weeks | + +--- + +## 2. 
Phased Implementation Plan + +### PHASE 1: Foundation & Research (Weeks 1-8) + +**Goal**: Establish foundation for all advanced features + +#### Phase 1.1: GDeflate Format Research (Weeks 1-3) +**Owner**: Research Lead +**Priority**: Critical Path + +**Microtasks**: +- [ ] **1.1.1** Review Microsoft DirectStorage documentation (2 days) + - SDK docs, headers, samples + - GDeflate format specification (if available) + - Create `docs/gdeflate_format.md` + +- [ ] **1.1.2** Analyze existing implementations (3 days) + - Wine/Proton DirectStorage status + - Community implementations + - Open-source decompression libraries + +- [ ] **1.1.3** Reverse engineer format if needed (5 days) + - Create compressed test assets + - Analyze binary structure + - Document block format, headers, metadata + +- [ ] **1.1.4** Validate format understanding (2 days) + - Create test vectors + - Verify decompression correctness + - Document any ambiguities + +**Deliverables**: +- โœ… GDeflate format specification document +- โœ… Test asset collection (compressed files) +- โœ… Validation test vectors + +**Success Criteria**: Can parse GDeflate headers and understand block structure + +--- + +#### Phase 1.2: Vulkan Compute Infrastructure (Weeks 1-8) +**Owner**: Graphics Engineer +**Priority**: Critical Path (parallel with GDeflate research) + +**Sub-Phase 1.2.1: Shader Module System (Weeks 1-2)** + +**Microtasks**: +- [ ] **1.2.1.1** Implement shader file loading (1 day) + - Read SPIR-V binary from file + - Error handling for missing files + - Add to `src/ds_runtime_vulkan.cpp` + +- [ ] **1.2.1.2** Create VkShaderModule wrapper (1 day) + - `vkCreateShaderModule` call + - Validation of SPIR-V code + - Error reporting + +- [ ] **1.2.1.3** Implement shader caching (2 days) + - `ShaderModuleCache` class + - Hash-based caching + - Lifetime management + +- [ ] **1.2.1.4** Test shader loading (1 day) + - Load existing `copy.comp.spv` + - Test error cases + - Add unit test + +- [ ] **1.2.1.5** Set up 
shader build system (2 days) + - CMake integration for glslangValidator + - Auto-compile `.comp` files + - Install shader SPV files + +**Deliverable**: Shader module loading system complete + +**Sub-Phase 1.2.2: Descriptor Management (Weeks 3-4)** + +**Microtasks**: +- [ ] **1.2.2.1** Design descriptor layouts (2 days) + - Decompression layout (3 bindings) + - Copy layout (2 bindings) + - Document layout specifications + +- [ ] **1.2.2.2** Implement descriptor set layout creation (1 day) + - `vkCreateDescriptorSetLayout` + - Multiple layout types + - Add to VulkanBackend + +- [ ] **1.2.2.3** Implement descriptor pool (2 days) + - `vkCreateDescriptorPool` + - Size estimation logic + - Pool management + +- [ ] **1.2.2.4** Implement descriptor set allocation (1 day) + - `vkAllocateDescriptorSets` + - Free/reuse logic + - Error handling + +- [ ] **1.2.2.5** Implement buffer binding (2 days) + - `vkUpdateDescriptorSets` + - Buffer info structure + - Dynamic updates + +- [ ] **1.2.2.6** Test descriptor system (2 days) + - Unit tests for allocation/free + - Buffer binding validation + - Memory leak testing + +**Deliverable**: Descriptor management system complete + +**Sub-Phase 1.2.3: Pipeline Creation (Weeks 5-6)** + +**Microtasks**: +- [ ] **1.2.3.1** Implement pipeline layout (1 day) + - `vkCreatePipelineLayout` + - Push constant support + - Multiple descriptor layouts + +- [ ] **1.2.3.2** Implement compute pipeline creation (2 days) + - `vkCreateComputePipelines` + - Pipeline configuration + - Pipeline caching + +- [ ] **1.2.3.3** Create pipeline management system (2 days) + - `ComputePipeline` struct + - Pipeline registry (by name) + - Lifetime management + +- [ ] **1.2.3.4** Test pipeline creation (2 days) + - Create simple copy pipeline + - Validation layers enabled + - Error handling tests + +- [ ] **1.2.3.5** Create example shaders (3 days) + - `buffer_copy.comp` - simple copy + - `uppercase.comp` - ASCII transform + - Compile to SPIR-V + +**Deliverable**: 
Compute pipeline creation system complete + +**Sub-Phase 1.2.4: Compute Dispatch & Synchronization (Weeks 7-8)** + +**Microtasks**: +- [ ] **1.2.4.1** Implement command buffer recording (2 days) + - `vkCmdBindPipeline` for compute + - `vkCmdBindDescriptorSets` + - `vkCmdDispatch` + +- [ ] **1.2.4.2** Implement synchronization barriers (2 days) + - Transfer โ†’ Compute barrier + - Compute โ†’ Transfer barrier + - Compute โ†’ Compute barrier + +- [ ] **1.2.4.3** Integrate with request processing (3 days) + - Add compute path to VulkanBackend + - Command buffer management + - Completion tracking + +- [ ] **1.2.4.4** Test compute execution (2 days) + - Buffer copy shader test + - Uppercase transform test + - Synchronization validation + +- [ ] **1.2.4.5** Performance profiling (1 day) + - Measure dispatch overhead + - GPU utilization metrics + - Optimization opportunities + +**Deliverable**: Full Vulkan compute capability + +**Phase 1 Milestones**: +- โœ… Week 3: GDeflate format understood +- โœ… Week 2: Shader loading works +- โœ… Week 4: Descriptor system works +- โœ… Week 6: Compute pipelines created +- โœ… Week 8: Compute dispatch working + +--- + +### PHASE 2: Core Feature Implementation (Weeks 9-18) + +**Goal**: Implement essential features (GDeflate CPU, io_uring, cancellation) + +#### Phase 2.1: GDeflate CPU Implementation (Weeks 9-13) +**Owner**: Compression Engineer +**Priority**: High +**Dependencies**: Phase 1.1 complete + +**Sub-Phase 2.1.1: Block Header Parser (Weeks 9-10)** + +**Microtasks**: +- [ ] **2.1.1.1** Define block header struct (1 day) + - Header fields based on format spec + - Size, offset, compression parameters + - Create `include/gdeflate_format.h` + +- [ ] **2.1.1.2** Implement header parsing (2 days) + - Parse file/stream header + - Extract block metadata + - Validate checksums (if present) + +- [ ] **2.1.1.3** Implement block iterator (1 day) + - Iterate over blocks in file + - Handle partial files + - Error reporting + +- [ ] 
**2.1.1.4** Test header parsing (2 days) + - Test with known assets + - Edge cases (empty, malformed) + - Add unit test `tests/gdeflate_header_test.cpp` + +**Deliverable**: GDeflate header parser + +**Sub-Phase 2.1.2: DEFLATE Integration (Weeks 10-11)** + +**Microtasks**: +- [ ] **2.1.2.1** Evaluate libraries (1 day) + - zlib vs miniz vs custom + - Performance comparison + - License compatibility + +- [ ] **2.1.2.2** Integrate chosen library (2 days) + - Add to CMakeLists.txt + - Wrapper functions + - Error handling + +- [ ] **2.1.2.3** Implement block decompression (2 days) + - Decompress single block + - Handle dictionary/state + - Streaming support + +- [ ] **2.1.2.4** Implement multi-block decompression (2 days) + - Iterate over all blocks + - Parallel decompression (thread pool) + - Assembly of output buffer + +- [ ] **2.1.2.5** Test decompression (3 days) + - Single block tests + - Multi-block tests + - Correctness validation + - Add `tests/gdeflate_decompress_test.cpp` + +**Deliverable**: Working GDeflate CPU decoder + +**Sub-Phase 2.1.3: Backend Integration (Weeks 12-13)** + +**Microtasks**: +- [ ] **2.1.3.1** Remove ENOTSUP stub (0.5 day) + - Delete stub code in `ds_runtime.cpp` + - Wire in actual decoder + +- [ ] **2.1.3.2** Integrate decoder into CPU backend (1 day) + - Call from decompression pipeline + - Buffer management + - Error propagation + +- [ ] **2.1.3.3** Add configuration options (1 day) + - Parallel decompression settings + - Memory limits + - Fallback behavior + +- [ ] **2.1.3.4** Test integration (2 days) + - Update `compression_gdeflate_stub_test.cpp` + - Change from "verify error" to "verify success" + - End-to-end tests + +- [ ] **2.1.3.5** Performance benchmarking (2 days) + - Measure decompression throughput + - Compare vs uncompressed + - Optimize hotspots + +- [ ] **2.1.3.6** Documentation (1 day) + - Update README.md + - Usage examples + - Performance characteristics + +**Deliverable**: GDeflate CPU support complete + +**Phase 2.1 
Milestone**: GDeflate CPU decoder fully functional + +--- + +#### Phase 2.2: io_uring Multi-Worker (Weeks 9-14) +**Owner**: Systems Engineer +**Priority**: Medium (parallel with GDeflate) +**Dependencies**: None + +**Sub-Phase 2.2.1: Multi-Worker Architecture (Weeks 9-11)** + +**Microtasks**: +- [ ] **2.2.1.1** Design worker architecture (1 day) + - Multiple io_uring instances + - Request distribution strategy + - Synchronization design + +- [ ] **2.2.1.2** Implement worker structure (2 days) + - `IoUringWorker` class + - Worker thread management + - Lifecycle (init/shutdown) + +- [ ] **2.2.1.3** Implement request queue per worker (1 day) + - Thread-safe pending queue + - Condition variable for signaling + - Lock-free alternatives (optional) + +- [ ] **2.2.1.4** Implement worker event loop (3 days) + - Submit SQEs from queue + - Poll for CQEs + - Timeout handling + +- [ ] **2.2.1.5** Implement load balancing (2 days) + - Round-robin distribution + - Queue depth aware (optional) + - Test distribution fairness + +- [ ] **2.2.1.6** Test multi-worker (3 days) + - Submit to multiple workers + - Verify parallel execution + - Stress test (1000+ requests) + - Add `tests/io_uring_multi_worker_test.cpp` + +**Deliverable**: Multi-worker io_uring backend + +**Sub-Phase 2.2.2: Advanced Features (Weeks 12-14)** + +**Microtasks**: +- [ ] **2.2.2.1** Implement fixed files support (2 days) + - Register file descriptors + - Use IOSQE_FIXED_FILE flag + - Benchmark improvement + +- [ ] **2.2.2.2** Add SQPOLL mode (optional) (2 days) + - Configure SQPOLL parameters + - Test latency improvement + - Document requirements + +- [ ] **2.2.2.3** Enhanced error handling (2 days) + - EAGAIN retry logic + - EINTR handling + - Robust failure recovery + +- [ ] **2.2.2.4** Performance tuning (3 days) + - Optimize queue depth + - Tune polling interval + - Batch submission optimization + +- [ ] **2.2.2.5** Comprehensive testing (2 days) + - Error injection tests + - Performance benchmarks + - 
Compare vs CPU backend + +- [ ] **2.2.2.6** Documentation (1 day) + - Update README.md + - Configuration guide + - Performance tuning tips + +**Deliverable**: Production-ready io_uring backend + +**Phase 2.2 Milestone**: io_uring backend feature-complete + +--- + +#### Phase 2.3: Request Cancellation (Weeks 15-18) +**Owner**: Core Engineer +**Priority**: Medium +**Dependencies**: None + +**Sub-Phase 2.3.1: API Design (Week 15)** + +**Microtasks**: +- [ ] **2.3.1.1** Add RequestStatus::Cancelled (0.5 day) + - Update enum in `ds_runtime.hpp` + - Update C ABI mapping + +- [ ] **2.3.1.2** Add request ID tracking (1 day) + - `request_id_t` type + - ID generation (atomic counter) + - Request ID โ†’ Request mapping + +- [ ] **2.3.1.3** Add cancellation flag to Request (0.5 day) + - `std::atomic<bool> cancellation_requested` + - Memory ordering considerations + +- [ ] **2.3.1.4** Design Queue cancellation API (1 day) + - `bool cancel_request(request_id_t)` + - `size_t cancel_all_pending()` + - `size_t cancel_all()` + +- [ ] **2.3.1.5** Document cancellation semantics (1 day) + - Strong vs weak guarantees + - Race condition handling + - Callback behavior + +**Deliverable**: Cancellation API design + +**Sub-Phase 2.3.2: Queue Implementation (Week 16)** + +**Microtasks**: +- [ ] **2.3.2.1** Implement request tracking (2 days) + - Active requests map + - Thread-safe access + - ID assignment on enqueue + +- [ ] **2.3.2.2** Implement cancel_request (1 day) + - Lookup request by ID + - Set cancellation flag + - Remove if still pending + +- [ ] **2.3.2.3** Implement cancel_all methods (1 day) + - Iterate active requests + - Mark all for cancellation + - Return count + +- [ ] **2.3.2.4** Test queue cancellation (1 day) + - Cancel pending request + - Cancel after submit + - Race condition tests + - Add `tests/cancellation_queue_test.cpp` + +**Deliverable**: Queue-level cancellation + +**Sub-Phase 2.3.3: Backend Integration (Weeks 17-18)** + +**Microtasks**: +- [ ] **2.3.3.1** CPU backend
cancellation (2 days) + - Check flag before I/O + - Check flag after I/O + - Skip callback if cancelled + +- [ ] **2.3.3.2** Vulkan backend cancellation (2 days) + - Check flag before dispatch + - Check flag in completion + - Handle in-flight GPU work + +- [ ] **2.3.3.3** io_uring backend cancellation (2 days) + - Cancel pending SQEs + - Handle in-flight operations + - Integration with worker threads + +- [ ] **2.3.3.4** Comprehensive testing (3 days) + - Test all backends + - Race condition stress tests + - Performance impact measurement + - Add `tests/cancellation_backend_test.cpp` + +- [ ] **2.3.3.5** Documentation (1 day) + - API documentation + - Usage examples + - Cancellation guarantees + +**Deliverable**: Full cancellation support + +**Phase 2.3 Milestone**: Request cancellation implemented + +--- + +**Phase 2 Summary**: +- Week 9-13: GDeflate CPU implementation +- Week 9-14: io_uring multi-worker (parallel) +- Week 15-18: Request cancellation +- All components tested independently + +--- + +### PHASE 3: Advanced Features (Weeks 19-28) + +**Goal**: GPU acceleration, advanced optimizations + +#### Phase 3.1: GDeflate GPU Implementation (Weeks 19-24) +**Owner**: GPU Compute Engineer +**Priority**: High +**Dependencies**: Phase 1.2 (Vulkan compute) + Phase 2.1 (GDeflate CPU) + +**Sub-Phase 3.1.1: GPU Shader Development (Weeks 19-21)** + +**Microtasks**: +- [ ] **3.1.1.1** Design GPU decompression algorithm (2 days) + - Block-parallel approach + - Workgroup size selection + - Memory layout + +- [ ] **3.1.1.2** Implement DEFLATE decode shader (5 days) + - Huffman decoding (GPU-friendly) + - LZ77 back-reference handling + - Shared memory optimization + - Create `shaders/gdeflate_decompress.comp` + +- [ ] **3.1.1.3** Test shader in isolation (3 days) + - Standalone shader test harness + - Known input/output validation + - Performance profiling + +- [ ] **3.1.1.4** Optimize shader (3 days) + - Reduce divergence + - Coalesced memory access + - Wavefront/warp 
efficiency + +**Deliverable**: GDeflate GPU shader + +**Sub-Phase 3.1.2: Backend Integration (Weeks 22-24)** + +**Microtasks**: +- [ ] **3.1.2.1** Create GDeflate compute pipeline (1 day) + - Load shader + - Configure descriptor layout + - Add to VulkanBackend + +- [ ] **3.1.2.2** Implement GPU decompression dispatch (3 days) + - Parse block headers on CPU + - Upload metadata to GPU + - Dispatch compute workgroups + - Synchronization + +- [ ] **3.1.2.3** Implement CPU/GPU hybrid strategy (2 days) + - Heuristic for CPU vs GPU + - Configuration options + - Fallback logic + +- [ ] **3.1.2.4** Test GPU decompression (4 days) + - Correctness tests + - Performance benchmarks + - Comparison vs CPU + - Add `tests/gdeflate_gpu_test.cpp` + +- [ ] **3.1.2.5** Optimize performance (3 days) + - Profile GPU execution + - Reduce bottlenecks + - Tune workgroup sizes + +- [ ] **3.1.2.6** Documentation (2 days) + - GPU requirements + - Performance characteristics + - Troubleshooting + +**Deliverable**: GDeflate GPU decompression + +**Phase 3.1 Milestone**: GPU-accelerated decompression working + +--- + +#### Phase 3.2: GPU-Resident Workflows (Weeks 25-28) +**Owner**: GPU Optimization Engineer +**Priority**: Medium +**Dependencies**: Phase 3.1 (GDeflate GPU) + +**Sub-Phase 3.2.1: Memory Optimization (Weeks 25-26)** + +**Microtasks**: +- [ ] **3.2.1.1** Implement staging buffer pooling (2 days) + - Buffer pool allocator + - Reuse across requests + - Size-based bins + +- [ ] **3.2.1.2** Implement async staging copies (2 days) + - Don't block on staging โ†’ GPU + - Pipeline multiple transfers + - Synchronization + +- [ ] **3.2.1.3** Optimize GPU buffer management (2 days) + - Reduce allocation frequency + - Suballocation strategy + - Memory defragmentation + +- [ ] **3.2.1.4** Test memory optimizations (2 days) + - Memory usage profiling + - Performance impact + - Stress tests + +**Deliverable**: Optimized memory management + +**Sub-Phase 3.2.2: Advanced GPU Paths (Weeks 27-28) 
+**Microtasks**: +- [ ] **3.2.2.1** Investigate GPUDirect Storage (2 days) + - NVIDIA GDS API review + - Feasibility assessment + - Prototype (if viable) + +- [ ] **3.2.2.2** Implement GPU-to-GPU optimization (2 days) + - Compressed buffer โ†’ decompressed buffer + - Single command buffer + - No CPU involvement + +- [ ] **3.2.2.3** Batch GPU operations (2 days) + - Multiple decompressions per dispatch + - Amortize overhead + - Descriptor set reuse + +- [ ] **3.2.2.4** Performance testing (2 days) + - Benchmark GPU workflows + - Compare optimization stages + - Real-world asset tests + +**Deliverable**: Optimized GPU-resident workflows + +**Phase 3.2 Milestone**: GPU workflows optimized + +--- + +**Phase 3 Summary**: +- Weeks 19-24: GDeflate GPU implementation +- Weeks 25-28: GPU workflow optimization +- All GPU features complete + +--- + +### PHASE 4: Wine/Proton Integration (Weeks 29-36) + +**Goal**: Enable DirectStorage games on Linux via Proton + +#### Phase 4.1: Shim Development (Weeks 29-32) +**Owner**: Wine Integration Engineer +**Priority**: High +**Dependencies**: Phase 2.1 (GDeflate CPU), Phase 1.2 (Vulkan compute) + +**Sub-Phase 4.1.1: Shim Skeleton (Week 29)** + +**Microtasks**: +- [ ] **4.1.1.1** Create dstorage.dll directory structure (1 day) + - Set up Wine dlls/dstorage + - Makefile.in, .spec files + - Basic build infrastructure + +- [ ] **4.1.1.2** Implement DStorageGetFactory (1 day) + - COM object creation + - Reference counting + - Error handling + +- [ ] **4.1.1.3** Implement skeleton COM interfaces (2 days) + - IDStorageFactory + - IDStorageQueue + - IDStorageFile + - Basic vtable setup + +- [ ] **4.1.1.4** Test shim loads (1 day) + - DLL registration + - COM object creation + - Basic smoke test + +**Deliverable**: dstorage.dll skeleton + +**Sub-Phase 4.1.2: Type Mapping (Weeks 30-31)** + +**Microtasks**: +- [ ] **4.1.2.1** Implement request descriptor translation (2 days) + - DSTORAGE_REQUEST โ†’ ds_request + - Field-by-field mapping + - 
Validation + +- [ ] **4.1.2.2** Implement handle conversion (1 day) + - Windows HANDLE โ†’ Linux fd + - File handle management + - Reference counting + +- [ ] **4.1.2.3** Implement D3D12 โ†’ Vulkan interop (3 days) + - Get VkDevice from ID3D12Device + - Get VkBuffer from ID3D12Resource + - vkd3d-proton integration + +- [ ] **4.1.2.4** Implement compression format mapping (1 day) + - DSTORAGE_COMPRESSION โ†’ ds_compression_t + - Enum translation + - Validation + +- [ ] **4.1.2.5** Test type conversions (2 days) + - Unit tests for all mappings + - Edge cases + - Error handling + +**Deliverable**: Complete type mapping + +**Sub-Phase 4.1.3: Queue Implementation (Week 32)** + +**Microtasks**: +- [ ] **4.1.3.1** Implement IDStorageFactory::CreateQueue (2 days) + - Create ds_queue via C ABI + - Wrap in COM object + - Backend selection logic + +- [ ] **4.1.3.2** Implement IDStorageQueue::EnqueueRequest (2 days) + - Translate request + - Forward to ds_queue_enqueue + - Error handling + +- [ ] **4.1.3.3** Implement IDStorageQueue::Submit (1 day) + - Call ds_queue_submit_all + - Synchronization + +- [ ] **4.1.3.4** Implement completion signaling (1 day) + - IDStorageQueue::EnqueueSignal + - Map to callbacks/fences + - Event notification + +**Deliverable**: Functional queue implementation + +--- + +#### Phase 4.2: Testing and Integration (Weeks 33-36) +**Owner**: QA/Integration Engineer +**Priority**: Critical +**Dependencies**: Phase 4.1 complete + +**Sub-Phase 4.2.1: Integration Testing (Weeks 33-34)** + +**Microtasks**: +- [ ] **4.2.1.1** Create simple test app (2 days) + - Minimal DirectStorage usage + - File read with DirectStorage API + - Verify data correctness + +- [ ] **4.2.1.2** Test with Proton (3 days) + - Configure Wine environment + - Test shim loading + - Debug integration issues + +- [ ] **4.2.1.3** Test Vulkan device sharing (2 days) + - Verify VkDevice passed correctly + - Test GPU transfers + - Synchronization validation + +- [ ] **4.2.1.4** Performance 
baseline (2 days) + - Measure overhead + - Compare Windows vs Linux + - Identify bottlenecks + +**Deliverable**: Integration test suite + +**Sub-Phase 4.2.2: Real Game Testing (Weeks 35-36)** + +**Microtasks**: +- [ ] **4.2.2.1** Test Forspoken (if available) (3 days) + - Launch game via Proton + - Verify asset loading + - Performance testing + - Bug fixing + +- [ ] **4.2.2.2** Test other DirectStorage titles (3 days) + - Ratchet & Clank + - UE5 games + - Identify compatibility issues + +- [ ] **4.2.2.3** Performance optimization (3 days) + - Profile hot paths + - Optimize type conversions + - Reduce overhead + +- [ ] **4.2.2.4** Bug fixing and polish (3 days) + - Address discovered issues + - Stability improvements + - Memory leak fixes + +**Deliverable**: Stable Wine/Proton integration + +**Sub-Phase 4.2.3: Documentation (Week 36)** + +**Microtasks**: +- [ ] **4.2.3.1** Write integration guide (2 days) + - Build instructions + - Configuration + - Troubleshooting + +- [ ] **4.2.3.2** Document known issues (1 day) + - Compatibility list + - Workarounds + - Performance notes + +- [ ] **4.2.3.3** Create developer guide (1 day) + - Debugging tips + - Contributing guidelines + - Testing procedures + +- [ ] **4.2.3.4** Update project documentation (1 day) + - README.md + - ROADMAP.md + - Release notes + +**Deliverable**: Complete documentation + +**Phase 4 Milestone**: Wine/Proton integration complete + +--- + +## 3. Fast Track Option (12 Weeks MVP) + +**Goal**: Minimal viable product for testing + +**Included**: +- โœ… CPU backend (already working) +- โฉ GDeflate CPU (Weeks 1-5) +- โฉ Vulkan compute (Weeks 1-8, parallel) +- โฉ Basic Wine shim (Weeks 9-12) + +**Excluded**: +- โŒ GDeflate GPU (defer) +- โŒ io_uring multi-worker (defer) +- โŒ Request cancellation (defer) +- โŒ GPU optimizations (defer) +- โŒ Advanced Wine integration (defer) + +**Timeline**: 12 weeks to functional MVP + +--- + +## 4. 
Testing Strategy + +### 4.1 Continuous Testing + +**Per Phase**: +- Unit tests for all new components +- Integration tests after backend changes +- Performance benchmarks +- Memory leak testing (valgrind) + +### 4.2 Test Suites + +**New Test Files**: +1. `tests/gdeflate_format_test.cpp` +2. `tests/gdeflate_decompress_test.cpp` +3. `tests/gdeflate_gpu_test.cpp` +4. `tests/vulkan_compute_test.cpp` +5. `tests/io_uring_multi_worker_test.cpp` +6. `tests/cancellation_test.cpp` +7. `tests/wine_integration_test.cpp` + +**Existing Tests** (update as needed): +- `tests/basic_queue_test.cpp` +- `tests/cpu_backend_test.cpp` +- `tests/error_handling_test.cpp` +- `tests/compression_gdeflate_stub_test.cpp` (change to success test) + +### 4.3 Validation + +**Every Phase**: +- [ ] All tests pass +- [ ] No memory leaks (valgrind clean) +- [ ] Vulkan validation layers pass +- [ ] Performance meets targets +- [ ] Documentation updated + +--- + +## 5. Success Criteria + +### 5.1 Functional Requirements + +**Core**: +- โœ… GDeflate compression works (CPU and GPU) +- โœ… Vulkan GPU compute pipelines functional +- โœ… io_uring backend production-ready +- โœ… Request cancellation implemented +- โœ… GPU-resident workflows optimized +- โœ… Wine/Proton integration working + +**Quality**: +- โœ… No memory leaks +- โœ… Thread-safe +- โœ… Vulkan validation clean +- โœ… Comprehensive test coverage (โ‰ฅ80%) +- โœ… Documentation complete + +### 5.2 Performance Targets + +| Metric | Target | Method | +|--------|--------|--------| +| GDeflate CPU | โ‰ฅ 500 MB/s | Benchmark decompression | +| GDeflate GPU | โ‰ฅ 2 GB/s | Benchmark decompression | +| io_uring throughput | โ‰ฅ 2x CPU backend | File I/O benchmark | +| Vulkan compute overhead | < 100 ยตs/dispatch | Profiling | +| Wine/Proton overhead | < 10% vs native | Game benchmarks | + +### 5.3 Platform Requirements + +**Verified On**: +- CachyOS (Linux kernel 5.15+) +- Arch Linux (latest) +- AMD GPU (RADV driver) +- NVIDIA GPU (proprietary driver) +- 
Intel GPU (ANV driver) + +--- + +## 6. Risk Management + +### 6.1 Technical Risks + +| Risk | Probability | Impact | Mitigation | +|------|-------------|--------|------------| +| GDeflate format unavailable | Medium | High | Reverse engineer, community help | +| GPU shader too slow | Low | Medium | Optimize, fallback to CPU | +| Wine integration complex | High | High | Start simple, iterate, seek Wine dev help | +| Hardware compatibility | Medium | High | Test multiple GPUs, provide fallbacks | +| liburing unavailable | Low | Low | CPU/Vulkan backends work without it | + +### 6.2 Schedule Risks + +| Risk | Impact | Mitigation | +|------|--------|------------| +| GDeflate research delay | +4 weeks | Start GPU work in parallel | +| Vulkan debugging difficult | +2 weeks | Use validation layers, RenderDoc | +| Wine upstreaming slow | +8 weeks | Maintain out-of-tree, focus on functionality | +| Testing reveals bugs | +4 weeks | Buffer time, incremental fixes | + +### 6.3 Resource Risks + +**Assumptions**: +- Single developer (can parallelize to some extent) +- Access to CachyOS/Arch Linux system +- Access to Vulkan-capable GPU +- Access to DirectStorage test games (for Phase 4) + +**Mitigation**: +- Prioritize critical path +- Use community resources (Wine forums, GitHub) +- Parallelize independent work +- MVP approach if resource-constrained + +--- + +## 7. 
Deliverables + +### 7.1 Code + +**New Files**: +- `src/gdeflate_decoder.cpp` - GDeflate CPU implementation +- `shaders/gdeflate_decompress.comp` - GDeflate GPU shader +- `shaders/buffer_copy.comp` - Example compute shader +- `shaders/uppercase.comp` - Transform shader +- 7+ new test files + +**Modified Files**: +- `src/ds_runtime.cpp` - Remove GDeflate stub, add cancellation +- `src/ds_runtime_vulkan.cpp` - Add compute pipelines +- `src/ds_runtime_uring.cpp` - Multi-worker support +- `include/ds_runtime.hpp` - API additions (cancellation, IDs) + +**External** (Wine tree): +- `dlls/dstorage/*` - Wine shim DLL + +### 7.2 Documentation + +**New Documents**: +- โœ… `docs/investigation_gdeflate.md` (complete) +- โœ… `docs/investigation_vulkan_compute.md` (complete) +- โœ… `docs/investigation_io_uring.md` (complete) +- โœ… `docs/investigation_remaining_features.md` (complete) +- โœ… `docs/master_roadmap.md` (this document) +- `docs/gdeflate_format.md` (Phase 1) +- `docs/gdeflate_usage.md` (Phase 2) +- `docs/wine_integration_guide.md` (Phase 4) + +**Updated Documents**: +- `README.md` - Update status, features +- `MISSING_FEATURES.md` - Mark completed +- `COMPARISON.md` - Update comparisons +- `docs/design.md` - Reflect implementation + +### 7.3 Tests + +**Comprehensive Test Suite**: +- 12+ test executables (4 existing + 8 new) +- 100% coverage of new features +- Performance benchmarks +- Integration tests +- Wine shim tests + +--- + +## 8. 
Progress Tracking + +### 8.1 Weekly Milestones + +| Week | Milestone | Phase | +|------|-----------|-------| +| 3 | GDeflate format understood | 1.1 | +| 2 | Shader loading works | 1.2.1 | +| 4 | Descriptor system works | 1.2.2 | +| 6 | Compute pipelines created | 1.2.3 | +| 8 | Compute dispatch working | 1.2.4 | +| 10 | GDeflate header parser done | 2.1.1 | +| 11 | DEFLATE integration complete | 2.1.2 | +| 13 | GDeflate CPU working | 2.1.3 | +| 14 | io_uring multi-worker done | 2.2 | +| 18 | Request cancellation done | 2.3 | +| 24 | GDeflate GPU working | 3.1 | +| 28 | GPU workflows optimized | 3.2 | +| 32 | Wine shim functional | 4.1 | +| 36 | Full integration complete | 4.2 | + +### 8.2 Reporting + +**Frequency**: Weekly +**Format**: GitHub PR updates via `report_progress` tool + +**Include**: +- Completed microtasks (โœ…) +- In-progress tasks (โฉ) +- Blockers/issues +- Test results +- Performance metrics + +--- + +## 9. Next Actions + +### Immediate (Week 1) + +**This Week**: +1. โœ… Complete investigation documents (done) +2. โœ… Report progress with master plan (in progress) +3. โฉ Begin GDeflate format research +4. โฉ Start Vulkan shader loading implementation +5. โฉ Set up development environment (liburing, Vulkan SDK) + +### Short Term (Weeks 2-4) + +**Focus**: +- Continue GDeflate research +- Complete Vulkan shader module system +- Begin descriptor management +- Create test infrastructure + +### Medium Term (Weeks 5-12) + +**Focus**: +- Complete GDeflate CPU implementation +- Finish Vulkan compute pipelines +- Begin io_uring multi-worker +- Test all components independently + +### Long Term (Weeks 13-36) + +**Focus**: +- GPU acceleration (GDeflate, workflows) +- Wine/Proton integration +- Real game testing +- Performance optimization +- Documentation and polish + +--- + +## 10. Conclusion + +This master roadmap provides a comprehensive plan for completing the ds-runtime project with extensive subphasing and microtasking. 
The phased approach allows for: + +- **Incremental progress**: Small, verifiable steps +- **Parallel work**: Independent features can progress simultaneously +- **Risk management**: Early identification of issues +- **Flexibility**: Can adjust scope (MVP vs full implementation) +- **Clear milestones**: Weekly checkpoints for progress tracking + +**Full Implementation**: 36 weeks (9 months) +**MVP**: 12 weeks (3 months) + +The project is well-positioned with a solid foundation (CPU backend working, tests passing). The investigation phase has identified all requirements and dependencies. Execution can begin immediately on multiple parallel tracks (GDeflate research + Vulkan compute). + +--- + +**Document Status**: Complete v1.0 +**Last Updated**: 2026-02-16 +**Next Update**: Weekly progress reports diff --git a/include/gdeflate_format.h b/include/gdeflate_format.h new file mode 100644 index 0000000..33d08a1 --- /dev/null +++ b/include/gdeflate_format.h @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: Apache-2.0 +// GDeflate format structures and definitions for ds-runtime. + +#pragma once + +#include +#include +#include + +namespace ds { +namespace gdeflate { + +// GDeflate is a block-based compression format designed for GPU decompression. +// Each file/stream consists of a header followed by compressed blocks. +// Blocks can be decompressed independently in parallel. 
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

namespace ds {
namespace gdeflate {

// GDeflate is a block-based compression format designed for GPU
// decompression: a stream is a FileHeader, followed by a table of BlockInfo
// records, followed by the compressed block payloads. Blocks can be
// decompressed independently in parallel.

// GDeflate file magic number (placeholder - needs actual spec)
constexpr uint32_t GDEFLATE_MAGIC = 0x4744464C; // "GDFL" in little-endian

// GDeflate format version implemented by these structures
constexpr uint16_t GDEFLATE_VERSION_MAJOR = 1;
constexpr uint16_t GDEFLATE_VERSION_MINOR = 0;

// Maximum uncompressed block size (16 MB is typical for DirectStorage)
constexpr uint32_t MAX_BLOCK_SIZE = 16 * 1024 * 1024;

// GDeflate file header structure.
// This appears at the start of every GDeflate compressed file and is read
// straight off the byte stream with memcpy, so the member layout below must
// match the on-disk layout exactly.
struct FileHeader {
    uint32_t magic;             // Magic number (GDEFLATE_MAGIC)
    uint16_t version_major;     // Format version (major)
    uint16_t version_minor;     // Format version (minor)
    uint32_t flags;             // Compression flags
    uint32_t uncompressed_size; // Total uncompressed size (bytes)
    uint32_t compressed_size;   // Total compressed size (bytes)
    uint32_t block_count;       // Number of blocks
    uint32_t reserved[2];       // Reserved for future use

    // A header is valid when the magic matches, the major version is the one
    // we implement (minor revisions are assumed backward compatible), and the
    // sizes/counts are non-zero.
    bool is_valid() const {
        return magic == GDEFLATE_MAGIC &&
               version_major == GDEFLATE_VERSION_MAJOR &&
               uncompressed_size > 0 &&
               compressed_size > 0 &&
               block_count > 0;
    }
};
// Lock the wire layout: a padding change would silently corrupt parsing.
static_assert(sizeof(FileHeader) == 32, "FileHeader layout must stay 32 bytes");

// Metadata for a single compressed block.
// NOTE(review): `offset` forces 8-byte alignment, so sizeof(BlockInfo) is 24
// with 4 bytes of trailing padding; the raw memcpy in parse_block_info
// therefore assumes each on-disk record is 24 bytes as well - confirm against
// the real format spec.
struct BlockInfo {
    uint64_t offset;            // Offset in compressed stream (bytes)
    uint32_t compressed_size;   // Compressed block size (bytes)
    uint32_t uncompressed_size; // Uncompressed block size (bytes)
    uint32_t checksum;          // Block checksum (CRC32 or similar)

    // A block is valid when both sizes are non-zero and the uncompressed
    // size stays within the format's block-size limit.
    bool is_valid() const {
        return compressed_size > 0 &&
               uncompressed_size > 0 &&
               uncompressed_size <= MAX_BLOCK_SIZE;
    }
};

// Complete GDeflate stream information: header plus per-block metadata.
struct StreamInfo {
    FileHeader header;
    std::vector<BlockInfo> blocks;

    // The stream is valid when the header is valid, the block table length
    // matches header.block_count, and every block passes its own checks.
    bool is_valid() const {
        if (!header.is_valid()) {
            return false;
        }
        if (blocks.size() != header.block_count) {
            return false;
        }
        for (const auto& block : blocks) {
            if (!block.is_valid()) {
                return false;
            }
        }
        return true;
    }

    // Total uncompressed size summed over all blocks (64-bit result: the
    // per-block uint32 sizes can sum past 4 GB).
    uint64_t get_uncompressed_size() const {
        uint64_t total = 0;
        for (const auto& block : blocks) {
            total += block.uncompressed_size;
        }
        return total;
    }

    // Total compressed size summed over all blocks.
    uint64_t get_compressed_size() const {
        uint64_t total = 0;
        for (const auto& block : blocks) {
            total += block.compressed_size;
        }
        return total;
    }
};

// Parse a GDeflate file header from a buffer.
// Returns true if the buffer is large enough and the header validates;
// `header` is only meaningful on success.
inline bool parse_file_header(const void* data, size_t size, FileHeader& header) {
    if (size < sizeof(FileHeader)) {
        return false;
    }

    std::memcpy(&header, data, sizeof(FileHeader));
    return header.is_valid();
}

// Parse `block_count` BlockInfo records from a buffer.
// Returns the number of blocks parsed, or 0 on error (truncated buffer or an
// invalid block). `blocks` is left empty on any error.
inline size_t parse_block_info(const void* data, size_t size,
                               size_t block_count, std::vector<BlockInfo>& blocks) {
    // Division form avoids wrap-around of block_count * sizeof(BlockInfo)
    // when block_count comes from an untrusted header.
    if (block_count > size / sizeof(BlockInfo)) {
        blocks.clear();
        return 0;
    }

    blocks.resize(block_count);
    if (block_count > 0) {
        std::memcpy(blocks.data(), data, block_count * sizeof(BlockInfo));
    }

    // Validate all blocks
    for (const auto& block : blocks) {
        if (!block.is_valid()) {
            blocks.clear();
            return 0;
        }
    }

    return block_count;
}

// Parse complete GDeflate stream information (header + block table).
// Returns true only when the header parses, the whole block table is present
// and valid, and the assembled StreamInfo is self-consistent.
inline bool parse_stream_info(const void* data, size_t size, StreamInfo& info) {
    if (size < sizeof(FileHeader)) {
        return false;
    }

    // Parse header
    if (!parse_file_header(data, size, info.header)) {
        return false;
    }

    // Parse block metadata (comes immediately after the header)
    const uint8_t* block_data = static_cast<const uint8_t*>(data) + sizeof(FileHeader);
    size_t remaining = size - sizeof(FileHeader);

    size_t parsed = parse_block_info(block_data, remaining,
                                     info.header.block_count, info.blocks);

    return parsed == info.header.block_count && info.is_valid();
}

} // namespace gdeflate
} // namespace ds
b/shaders/README.md @@ -0,0 +1,36 @@ +# DS-Runtime Compute Shaders + +This directory contains GLSL compute shaders for GPU-accelerated operations in ds-runtime. + +## Shaders + +### copy.comp +Basic buffer copy shader. Copies data from source buffer to destination buffer using GPU compute. +- Local workgroup size: 16 +- Bindings: + - 0: Source buffer (read-only) + - 1: Destination buffer (write-only) + +## Building + +Shaders are automatically compiled to SPIR-V during the CMake build process using `glslangValidator`. + +```bash +# Manual compilation (for testing): +glslangValidator -V copy.comp -o copy.comp.spv +``` + +## Adding New Shaders + +1. Create your shader file (e.g., `my_shader.comp`) +2. CMake will automatically compile it to SPIR-V +3. The compiled `.spv` file will be available at build time +4. Load the shader in code using the shader module system + +## Shader Conventions + +- Use `#version 450` for Vulkan 1.0+ compatibility +- Declare local workgroup size with `layout(local_size_x = N) in;` +- Use storage buffers for data: `layout(binding = N) buffer BufferName { ... 
};` +- Optimize for coalesced memory access patterns +- Keep workgroup sizes as multiples of 32 (warp/wavefront size) diff --git a/shaders/buffer_copy.comp b/shaders/buffer_copy.comp new file mode 100644 index 0000000..5178b1b --- /dev/null +++ b/shaders/buffer_copy.comp @@ -0,0 +1,34 @@ +# Shader for simple buffer copy operation +# Demonstrates basic compute shader structure with descriptor bindings + +#version 450 + +# Define workgroup size (how many threads run in parallel) +# 256 is a good default (multiple of 32 for most GPUs) +layout(local_size_x = 256) in; + +# Binding 0: Input buffer (read-only) +# Storage buffers allow large data arrays +layout(binding = 0) readonly buffer InputBuffer { + uint data[]; +} input_buf; + +# Binding 1: Output buffer (write-only) +layout(binding = 1) writeonly buffer OutputBuffer { + uint data[]; +} output_buf; + +# Push constants: Small data passed directly (faster than buffers) +layout(push_constant) uniform PushConstants { + uint element_count; // Number of elements to copy +} push; + +void main() { + // Get this thread's global index + uint idx = gl_GlobalInvocationID.x; + + // Bounds check: don't write beyond array + if (idx < push.element_count) { + output_buf.data[idx] = input_buf.data[idx]; + } +} diff --git a/shaders/copy.comp b/shaders/copy.comp new file mode 100644 index 0000000..fb45c82 --- /dev/null +++ b/shaders/copy.comp @@ -0,0 +1,19 @@ +#version 450 +layout(local_size_x = 16) in; + +// We treat the buffer as an array of 32-bit words. +// 16 words * 4 bytes = 64 bytes total. 
+layout(binding = 0) buffer SrcBuf { + uint src[]; +}; + +layout(binding = 1) buffer DstBuf { + uint dst[]; +}; + +void main() { + uint idx = gl_GlobalInvocationID.x; + if (idx < 16) { + dst[idx] = src[idx]; + } +} diff --git a/shaders/uppercase.comp b/shaders/uppercase.comp new file mode 100644 index 0000000..b345f67 --- /dev/null +++ b/shaders/uppercase.comp @@ -0,0 +1,36 @@ +# Shader for uppercase ASCII transformation +# Demonstrates data processing on GPU + +#version 450 + +layout(local_size_x = 256) in; + +# Input buffer (raw bytes as uint8) +layout(binding = 0) readonly buffer InputBuffer { + uint8_t data[]; +} input_buf; + +# Output buffer (transformed bytes) +layout(binding = 1) writeonly buffer OutputBuffer { + uint8_t data[]; +} output_buf; + +# Push constants +layout(push_constant) uniform PushConstants { + uint byte_count; +} push; + +void main() { + uint idx = gl_GlobalInvocationID.x; + + if (idx < push.byte_count) { + uint8_t c = input_buf.data[idx]; + + // Convert lowercase ASCII to uppercase (a-z -> A-Z) + if (c >= uint8_t('a') && c <= uint8_t('z')) { + c = c - uint8_t(32); + } + + output_buf.data[idx] = c; + } +} diff --git a/src/ds_runtime_vulkan.cpp b/src/ds_runtime_vulkan.cpp index 0a62460..24d692e 100644 --- a/src/ds_runtime_vulkan.cpp +++ b/src/ds_runtime_vulkan.cpp @@ -7,10 +7,15 @@ #include #include #include +#include #include +#include #include #include +#include +#include #include +#include #include #include @@ -20,6 +25,803 @@ namespace ds { namespace { +// Load SPIR-V bytecode from a file. +// Returns the bytecode as a vector of uint32_t words. +// Throws std::runtime_error if the file cannot be read or is invalid. 
+std::vector load_spirv_from_file(const std::string& path) { + // Open file in binary mode, seek to end to get size + std::ifstream file(path, std::ios::binary | std::ios::ate); + if (!file) { + throw std::runtime_error("Failed to open SPIR-V file: " + path); + } + + // Get file size + std::streamsize size = file.tellg(); + if (size <= 0) { + throw std::runtime_error("SPIR-V file is empty: " + path); + } + + // SPIR-V must be a multiple of 4 bytes (32-bit words) + if (size % 4 != 0) { + throw std::runtime_error("SPIR-V file size is not a multiple of 4 bytes: " + path); + } + + // Seek back to beginning + file.seekg(0, std::ios::beg); + + // Read the file into a buffer + std::vector buffer(static_cast(size)); + if (!file.read(buffer.data(), size)) { + throw std::runtime_error("Failed to read SPIR-V file: " + path); + } + + // Validate SPIR-V magic number (0x07230203 in little-endian) + if (buffer.size() >= 4) { + uint32_t magic = *reinterpret_cast(buffer.data()); + if (magic != 0x07230203) { + throw std::runtime_error("Invalid SPIR-V magic number in file: " + path); + } + } + + // Convert to uint32_t vector + std::vector spirv(buffer.size() / 4); + std::memcpy(spirv.data(), buffer.data(), buffer.size()); + + return spirv; +} + +// Wrapper for VkShaderModule with RAII lifecycle management. +class ShaderModule { +public: + // Create a shader module from SPIR-V bytecode. + // Throws std::runtime_error if creation fails. 
+ ShaderModule(VkDevice device, const std::vector& spirv_code) + : device_(device), module_(VK_NULL_HANDLE) + { + if (device == VK_NULL_HANDLE) { + throw std::runtime_error("Invalid VkDevice for shader module creation"); + } + + if (spirv_code.empty()) { + throw std::runtime_error("Empty SPIR-V code"); + } + + VkShaderModuleCreateInfo create_info{}; + create_info.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; + create_info.codeSize = spirv_code.size() * sizeof(uint32_t); + create_info.pCode = spirv_code.data(); + + VkResult result = vkCreateShaderModule(device_, &create_info, nullptr, &module_); + if (result != VK_SUCCESS) { + throw std::runtime_error("Failed to create VkShaderModule (VkResult: " + + std::to_string(static_cast(result)) + ")"); + } + } + + // Destructor cleans up the Vulkan shader module. + ~ShaderModule() { + if (module_ != VK_NULL_HANDLE && device_ != VK_NULL_HANDLE) { + vkDestroyShaderModule(device_, module_, nullptr); + } + } + + // Delete copy operations (shader modules should not be copied) + ShaderModule(const ShaderModule&) = delete; + ShaderModule& operator=(const ShaderModule&) = delete; + + // Allow move operations + ShaderModule(ShaderModule&& other) noexcept + : device_(other.device_), module_(other.module_) + { + other.module_ = VK_NULL_HANDLE; + other.device_ = VK_NULL_HANDLE; + } + + ShaderModule& operator=(ShaderModule&& other) noexcept { + if (this != &other) { + // Clean up our current module + if (module_ != VK_NULL_HANDLE && device_ != VK_NULL_HANDLE) { + vkDestroyShaderModule(device_, module_, nullptr); + } + + // Take ownership of other's module + device_ = other.device_; + module_ = other.module_; + other.module_ = VK_NULL_HANDLE; + other.device_ = VK_NULL_HANDLE; + } + return *this; + } + + // Get the underlying VkShaderModule handle. + VkShaderModule get() const { return module_; } + + // Check if the module is valid. 
+ bool is_valid() const { return module_ != VK_NULL_HANDLE; } + +private: + VkDevice device_; + VkShaderModule module_; +}; + +// Cache for shader modules to avoid reloading/recompiling the same shaders. +// Maps shader file paths to loaded shader modules. +class ShaderModuleCache { +public: + explicit ShaderModuleCache(VkDevice device) : device_(device) {} + + // Load a shader from file, returning a cached module if already loaded. + // Throws std::runtime_error if the shader cannot be loaded. + VkShaderModule load_shader(const std::string& path) { + // Check if already cached + auto it = cache_.find(path); + if (it != cache_.end()) { + return it->second.get(); + } + + // Load SPIR-V from file + std::vector spirv = load_spirv_from_file(path); + + // Create shader module + ShaderModule module(device_, spirv); + + // Cache it (move into cache) + VkShaderModule handle = module.get(); + cache_.emplace(path, std::move(module)); + + return handle; + } + + // Clear all cached shader modules. + void clear() { + cache_.clear(); + } + + // Get number of cached shaders. + std::size_t size() const { + return cache_.size(); + } + + // Check if a shader is cached. + bool has_shader(const std::string& path) const { + return cache_.find(path) != cache_.end(); + } + +private: + VkDevice device_; + std::unordered_map cache_; +}; + +// Descriptor set layout for compute shaders. +// Defines the bindings used by a compute pipeline. 
+struct DescriptorLayoutInfo { + std::vector bindings; + VkDescriptorSetLayout layout = VK_NULL_HANDLE; + + // Create the Vulkan descriptor set layout from bindings + void create(VkDevice device) { + if (layout != VK_NULL_HANDLE) { + return; // Already created + } + + VkDescriptorSetLayoutCreateInfo layout_info{}; + layout_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + layout_info.bindingCount = static_cast(bindings.size()); + layout_info.pBindings = bindings.data(); + + VkResult result = vkCreateDescriptorSetLayout(device, &layout_info, nullptr, &layout); + if (result != VK_SUCCESS) { + throw std::runtime_error("Failed to create descriptor set layout (VkResult: " + + std::to_string(static_cast(result)) + ")"); + } + } + + // Destroy the layout + void destroy(VkDevice device) { + if (layout != VK_NULL_HANDLE) { + vkDestroyDescriptorSetLayout(device, layout, nullptr); + layout = VK_NULL_HANDLE; + } + } +}; + +// Factory functions to create common descriptor layouts +namespace descriptor_layouts { + +// Layout for simple buffer copy: 2 storage buffers (input, output) +inline DescriptorLayoutInfo create_buffer_copy_layout() { + DescriptorLayoutInfo info; + info.bindings.resize(2); + + // Binding 0: Input buffer (read-only) + info.bindings[0].binding = 0; + info.bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + info.bindings[0].descriptorCount = 1; + info.bindings[0].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + info.bindings[0].pImmutableSamplers = nullptr; + + // Binding 1: Output buffer (write-only) + info.bindings[1].binding = 1; + info.bindings[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + info.bindings[1].descriptorCount = 1; + info.bindings[1].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + info.bindings[1].pImmutableSamplers = nullptr; + + return info; +} + +// Layout for decompression: 3 storage buffers (compressed, metadata, decompressed) +inline DescriptorLayoutInfo create_decompression_layout() { + 
DescriptorLayoutInfo info; + info.bindings.resize(3); + + // Binding 0: Compressed input buffer + info.bindings[0].binding = 0; + info.bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + info.bindings[0].descriptorCount = 1; + info.bindings[0].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + info.bindings[0].pImmutableSamplers = nullptr; + + // Binding 1: Metadata buffer (block info) + info.bindings[1].binding = 1; + info.bindings[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + info.bindings[1].descriptorCount = 1; + info.bindings[1].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + info.bindings[1].pImmutableSamplers = nullptr; + + // Binding 2: Decompressed output buffer + info.bindings[2].binding = 2; + info.bindings[2].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + info.bindings[2].descriptorCount = 1; + info.bindings[2].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + info.bindings[2].pImmutableSamplers = nullptr; + + return info; +} + +} // namespace descriptor_layouts + +// Descriptor pool for allocating descriptor sets. +// Pre-allocates a pool of descriptors that can be used by compute pipelines. 
+class DescriptorPool { +public: + explicit DescriptorPool(VkDevice device, uint32_t max_sets = 32) + : device_(device), pool_(VK_NULL_HANDLE) + { + // Size the pool for storage buffers (most common for compute) + // Each set needs up to 3 storage buffers (for decompression layout) + VkDescriptorPoolSize pool_size{}; + pool_size.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + pool_size.descriptorCount = max_sets * 3; // 3 buffers per set max + + VkDescriptorPoolCreateInfo pool_info{}; + pool_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + pool_info.poolSizeCount = 1; + pool_info.pPoolSizes = &pool_size; + pool_info.maxSets = max_sets; + pool_info.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; + + VkResult result = vkCreateDescriptorPool(device_, &pool_info, nullptr, &pool_); + if (result != VK_SUCCESS) { + throw std::runtime_error("Failed to create descriptor pool (VkResult: " + + std::to_string(static_cast(result)) + ")"); + } + } + + ~DescriptorPool() { + if (pool_ != VK_NULL_HANDLE) { + vkDestroyDescriptorPool(device_, pool_, nullptr); + } + } + + // Delete copy operations + DescriptorPool(const DescriptorPool&) = delete; + DescriptorPool& operator=(const DescriptorPool&) = delete; + + // Allow move operations + DescriptorPool(DescriptorPool&& other) noexcept + : device_(other.device_), pool_(other.pool_) + { + other.pool_ = VK_NULL_HANDLE; + } + + DescriptorPool& operator=(DescriptorPool&& other) noexcept { + if (this != &other) { + if (pool_ != VK_NULL_HANDLE) { + vkDestroyDescriptorPool(device_, pool_, nullptr); + } + device_ = other.device_; + pool_ = other.pool_; + other.pool_ = VK_NULL_HANDLE; + } + return *this; + } + + // Allocate a descriptor set from this pool + VkDescriptorSet allocate(VkDescriptorSetLayout layout) { + VkDescriptorSetAllocateInfo alloc_info{}; + alloc_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + alloc_info.descriptorPool = pool_; + alloc_info.descriptorSetCount = 1; + alloc_info.pSetLayouts = 
&layout; + + VkDescriptorSet descriptor_set; + VkResult result = vkAllocateDescriptorSets(device_, &alloc_info, &descriptor_set); + if (result != VK_SUCCESS) { + throw std::runtime_error("Failed to allocate descriptor set (VkResult: " + + std::to_string(static_cast(result)) + ")"); + } + + return descriptor_set; + } + + // Free a descriptor set back to the pool + void free(VkDescriptorSet descriptor_set) { + vkFreeDescriptorSets(device_, pool_, 1, &descriptor_set); + } + + // Reset the entire pool (frees all allocated sets) + void reset() { + vkResetDescriptorPool(device_, pool_, 0); + } + + VkDescriptorPool get() const { return pool_; } + +private: + VkDevice device_; + VkDescriptorPool pool_; +}; + +// Helper functions for updating descriptor sets with buffer bindings. +namespace descriptor_updates { + +// Update a descriptor set with buffer bindings. +// Used to bind actual VkBuffer handles to descriptor set bindings. +struct BufferBinding { + uint32_t binding; // Binding index (matches shader layout) + VkBuffer buffer; // Buffer to bind + VkDeviceSize offset; // Offset into buffer + VkDeviceSize range; // Size of buffer region (or VK_WHOLE_SIZE) +}; + +// Update descriptor set with storage buffer bindings. +// Example: bind input and output buffers for compute shader. 
+inline void update_storage_buffers(VkDevice device, + VkDescriptorSet descriptor_set, + const std::vector& bindings) +{ + std::vector buffer_infos; + std::vector writes; + + buffer_infos.reserve(bindings.size()); + writes.reserve(bindings.size()); + + for (const auto& binding : bindings) { + // Create buffer info + VkDescriptorBufferInfo buffer_info{}; + buffer_info.buffer = binding.buffer; + buffer_info.offset = binding.offset; + buffer_info.range = binding.range; + buffer_infos.push_back(buffer_info); + + // Create write descriptor + VkWriteDescriptorSet write{}; + write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + write.dstSet = descriptor_set; + write.dstBinding = binding.binding; + write.dstArrayElement = 0; + write.descriptorCount = 1; + write.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + write.pBufferInfo = &buffer_infos[writes.size()]; + writes.push_back(write); + } + + // Update all bindings at once + vkUpdateDescriptorSets(device, + static_cast(writes.size()), + writes.data(), + 0, nullptr); +} + +// Convenience function for 2-buffer copy layout (input, output) +inline void update_copy_buffers(VkDevice device, + VkDescriptorSet descriptor_set, + VkBuffer input_buffer, + VkDeviceSize input_size, + VkBuffer output_buffer, + VkDeviceSize output_size) +{ + std::vector bindings = { + {0, input_buffer, 0, input_size}, // Binding 0: input + {1, output_buffer, 0, output_size} // Binding 1: output + }; + update_storage_buffers(device, descriptor_set, bindings); +} + +// Convenience function for 3-buffer decompression layout +inline void update_decompression_buffers(VkDevice device, + VkDescriptorSet descriptor_set, + VkBuffer compressed_buffer, + VkDeviceSize compressed_size, + VkBuffer metadata_buffer, + VkDeviceSize metadata_size, + VkBuffer decompressed_buffer, + VkDeviceSize decompressed_size) +{ + std::vector bindings = { + {0, compressed_buffer, 0, compressed_size}, // Binding 0: compressed + {1, metadata_buffer, 0, metadata_size}, // Binding 
1: metadata + {2, decompressed_buffer, 0, decompressed_size} // Binding 2: decompressed + }; + update_storage_buffers(device, descriptor_set, bindings); +} + +} // namespace descriptor_updates + +// Pipeline layout wrapper with RAII lifecycle management. +// Combines descriptor set layouts and push constants. +class PipelineLayout { +public: + // Create pipeline layout from descriptor layouts and optional push constants + PipelineLayout(VkDevice device, + const std::vector& descriptor_layouts, + const std::vector& push_constant_ranges = {}) + : device_(device), layout_(VK_NULL_HANDLE) + { + VkPipelineLayoutCreateInfo layout_info{}; + layout_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + layout_info.setLayoutCount = static_cast(descriptor_layouts.size()); + layout_info.pSetLayouts = descriptor_layouts.data(); + layout_info.pushConstantRangeCount = static_cast(push_constant_ranges.size()); + layout_info.pPushConstantRanges = push_constant_ranges.empty() ? nullptr : push_constant_ranges.data(); + + VkResult result = vkCreatePipelineLayout(device_, &layout_info, nullptr, &layout_); + if (result != VK_SUCCESS) { + throw std::runtime_error("Failed to create pipeline layout (VkResult: " + + std::to_string(static_cast(result)) + ")"); + } + } + + ~PipelineLayout() { + if (layout_ != VK_NULL_HANDLE && device_ != VK_NULL_HANDLE) { + vkDestroyPipelineLayout(device_, layout_, nullptr); + } + } + + // Delete copy operations + PipelineLayout(const PipelineLayout&) = delete; + PipelineLayout& operator=(const PipelineLayout&) = delete; + + // Allow move operations + PipelineLayout(PipelineLayout&& other) noexcept + : device_(other.device_), layout_(other.layout_) + { + other.layout_ = VK_NULL_HANDLE; + other.device_ = VK_NULL_HANDLE; + } + + PipelineLayout& operator=(PipelineLayout&& other) noexcept { + if (this != &other) { + if (layout_ != VK_NULL_HANDLE && device_ != VK_NULL_HANDLE) { + vkDestroyPipelineLayout(device_, layout_, nullptr); + } + device_ = 
other.device_; + layout_ = other.layout_; + other.layout_ = VK_NULL_HANDLE; + other.device_ = VK_NULL_HANDLE; + } + return *this; + } + + VkPipelineLayout get() const { return layout_; } + bool is_valid() const { return layout_ != VK_NULL_HANDLE; } + +private: + VkDevice device_; + VkPipelineLayout layout_; +}; + +// Helper to create common push constant ranges +namespace push_constants { + +// Push constant range for compute shaders (typically for dispatch parameters) +inline VkPushConstantRange create_compute_range(uint32_t size, uint32_t offset = 0) { + VkPushConstantRange range{}; + range.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + range.offset = offset; + range.size = size; + return range; +} + +// Common push constant for element count (single uint32) +inline VkPushConstantRange create_element_count_range() { + return create_compute_range(sizeof(uint32_t), 0); +} + +} // namespace push_constants + +// Compute pipeline wrapper with RAII lifecycle management. +class ComputePipeline { +public: + // Create compute pipeline from shader module and pipeline layout + ComputePipeline(VkDevice device, + VkShaderModule shader_module, + VkPipelineLayout pipeline_layout, + const char* entry_point = "main", + VkPipelineCache cache = VK_NULL_HANDLE) + : device_(device), pipeline_(VK_NULL_HANDLE) + { + // Shader stage info + VkPipelineShaderStageCreateInfo stage_info{}; + stage_info.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + stage_info.stage = VK_SHADER_STAGE_COMPUTE_BIT; + stage_info.module = shader_module; + stage_info.pName = entry_point; + + // Compute pipeline info + VkComputePipelineCreateInfo pipeline_info{}; + pipeline_info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; + pipeline_info.stage = stage_info; + pipeline_info.layout = pipeline_layout; + + VkResult result = vkCreateComputePipelines(device_, cache, 1, &pipeline_info, nullptr, &pipeline_); + if (result != VK_SUCCESS) { + throw std::runtime_error("Failed to create compute pipeline 
(VkResult: " + + std::to_string(static_cast(result)) + ")"); + } + } + + ~ComputePipeline() { + if (pipeline_ != VK_NULL_HANDLE && device_ != VK_NULL_HANDLE) { + vkDestroyPipeline(device_, pipeline_, nullptr); + } + } + + // Delete copy operations + ComputePipeline(const ComputePipeline&) = delete; + ComputePipeline& operator=(const ComputePipeline&) = delete; + + // Allow move operations + ComputePipeline(ComputePipeline&& other) noexcept + : device_(other.device_), pipeline_(other.pipeline_) + { + other.pipeline_ = VK_NULL_HANDLE; + other.device_ = VK_NULL_HANDLE; + } + + ComputePipeline& operator=(ComputePipeline&& other) noexcept { + if (this != &other) { + if (pipeline_ != VK_NULL_HANDLE && device_ != VK_NULL_HANDLE) { + vkDestroyPipeline(device_, pipeline_, nullptr); + } + device_ = other.device_; + pipeline_ = other.pipeline_; + other.pipeline_ = VK_NULL_HANDLE; + other.device_ = VK_NULL_HANDLE; + } + return *this; + } + + VkPipeline get() const { return pipeline_; } + bool is_valid() const { return pipeline_ != VK_NULL_HANDLE; } + +private: + VkDevice device_; + VkPipeline pipeline_; +}; + +// Complete pipeline bundle (layout + pipeline + descriptor layout) +struct ComputePipelineBundle { + DescriptorLayoutInfo descriptor_layout; + std::unique_ptr pipeline_layout; + std::unique_ptr pipeline; + + // Check if bundle is complete and valid + bool is_valid() const { + return descriptor_layout.layout != VK_NULL_HANDLE && + pipeline_layout && pipeline_layout->is_valid() && + pipeline && pipeline->is_valid(); + } +}; + +// Factory functions for creating complete pipeline bundles +namespace pipeline_factory { + +// Create a buffer copy pipeline (2 storage buffers, 1 push constant for element count) +inline ComputePipelineBundle create_buffer_copy_pipeline( + VkDevice device, + ShaderModuleCache& shader_cache, + const std::string& shader_path) +{ + ComputePipelineBundle bundle; + + // 1. 
Create descriptor layout + bundle.descriptor_layout = descriptor_layouts::create_buffer_copy_layout(); + bundle.descriptor_layout.create(device); + + // 2. Create pipeline layout with push constants + std::vector desc_layouts = {bundle.descriptor_layout.layout}; + std::vector push_ranges = {push_constants::create_element_count_range()}; + bundle.pipeline_layout = std::make_unique(device, desc_layouts, push_ranges); + + // 3. Load shader and create pipeline + VkShaderModule shader = shader_cache.load_shader(shader_path); + bundle.pipeline = std::make_unique(device, shader, bundle.pipeline_layout->get()); + + return bundle; +} + +// Create a decompression pipeline (3 storage buffers, push constants for parameters) +inline ComputePipelineBundle create_decompression_pipeline( + VkDevice device, + ShaderModuleCache& shader_cache, + const std::string& shader_path) +{ + ComputePipelineBundle bundle; + + // 1. Create descriptor layout + bundle.descriptor_layout = descriptor_layouts::create_decompression_layout(); + bundle.descriptor_layout.create(device); + + // 2. Create pipeline layout with push constants (4 uint32s for decompression params) + std::vector desc_layouts = {bundle.descriptor_layout.layout}; + std::vector push_ranges = { + push_constants::create_compute_range(sizeof(uint32_t) * 4, 0) + }; + bundle.pipeline_layout = std::make_unique(device, desc_layouts, push_ranges); + + // 3. 
Load shader and create pipeline + VkShaderModule shader = shader_cache.load_shader(shader_path); + bundle.pipeline = std::make_unique(device, shader, bundle.pipeline_layout->get()); + + return bundle; +} + +} // namespace pipeline_factory + +// Command buffer recording helpers for compute dispatches +namespace compute_dispatch { + +// Helper to record compute dispatch commands into a command buffer +struct DispatchInfo { + VkPipeline pipeline; + VkPipelineLayout pipeline_layout; + VkDescriptorSet descriptor_set; + uint32_t workgroup_count_x; + uint32_t workgroup_count_y = 1; + uint32_t workgroup_count_z = 1; + const void* push_constants_data = nullptr; + uint32_t push_constants_size = 0; + uint32_t push_constants_offset = 0; +}; + +// Record compute dispatch commands into a command buffer +inline void record_compute_dispatch(VkCommandBuffer cmd, const DispatchInfo& info) { + // Bind compute pipeline + vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, info.pipeline); + + // Bind descriptor set + vkCmdBindDescriptorSets( + cmd, + VK_PIPELINE_BIND_POINT_COMPUTE, + info.pipeline_layout, + 0, // first set + 1, // descriptor set count + &info.descriptor_set, + 0, // dynamic offset count + nullptr + ); + + // Push constants (if provided) + if (info.push_constants_data && info.push_constants_size > 0) { + vkCmdPushConstants( + cmd, + info.pipeline_layout, + VK_SHADER_STAGE_COMPUTE_BIT, + info.push_constants_offset, + info.push_constants_size, + info.push_constants_data + ); + } + + // Dispatch compute work + vkCmdDispatch(cmd, info.workgroup_count_x, info.workgroup_count_y, info.workgroup_count_z); +} + +// Calculate workgroup count for 1D data (e.g., buffer processing) +inline uint32_t calculate_workgroup_count_1d(uint32_t element_count, uint32_t workgroup_size) { + return (element_count + workgroup_size - 1) / workgroup_size; +} + +// Memory barrier helpers for synchronization +struct MemoryBarrierInfo { + VkAccessFlags src_access_mask; + VkAccessFlags 
dst_access_mask; + VkBuffer buffer; + VkDeviceSize offset; + VkDeviceSize size; +}; + +// Insert buffer memory barrier +inline void insert_buffer_barrier(VkCommandBuffer cmd, + VkPipelineStageFlags src_stage, + VkPipelineStageFlags dst_stage, + const MemoryBarrierInfo& info) +{ + VkBufferMemoryBarrier barrier{}; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.srcAccessMask = info.src_access_mask; + barrier.dstAccessMask = info.dst_access_mask; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.buffer = info.buffer; + barrier.offset = info.offset; + barrier.size = info.size; + + vkCmdPipelineBarrier( + cmd, + src_stage, + dst_stage, + 0, // dependency flags + 0, nullptr, // memory barriers + 1, &barrier, // buffer memory barriers + 0, nullptr // image memory barriers + ); +} + +// Barrier: Transfer Write โ†’ Compute Read (after staging buffer copy, before compute) +inline void barrier_transfer_to_compute(VkCommandBuffer cmd, VkBuffer buffer, VkDeviceSize size) { + MemoryBarrierInfo info; + info.src_access_mask = VK_ACCESS_TRANSFER_WRITE_BIT; + info.dst_access_mask = VK_ACCESS_SHADER_READ_BIT; + info.buffer = buffer; + info.offset = 0; + info.size = size; + + insert_buffer_barrier( + cmd, + VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + info + ); +} + +// Barrier: Compute Write โ†’ Transfer Read (after compute, before readback) +inline void barrier_compute_to_transfer(VkCommandBuffer cmd, VkBuffer buffer, VkDeviceSize size) { + MemoryBarrierInfo info; + info.src_access_mask = VK_ACCESS_SHADER_WRITE_BIT; + info.dst_access_mask = VK_ACCESS_TRANSFER_READ_BIT; + info.buffer = buffer; + info.offset = 0; + info.size = size; + + insert_buffer_barrier( + cmd, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_TRANSFER_BIT, + info + ); +} + +// Barrier: Compute Write โ†’ Compute Read (between dependent compute passes) +inline void 
barrier_compute_to_compute(VkCommandBuffer cmd, VkBuffer buffer, VkDeviceSize size) { + MemoryBarrierInfo info; + info.src_access_mask = VK_ACCESS_SHADER_WRITE_BIT; + info.dst_access_mask = VK_ACCESS_SHADER_READ_BIT; + info.buffer = buffer; + info.offset = 0; + info.size = size; + + insert_buffer_barrier( + cmd, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + info + ); +} + +} // namespace compute_dispatch + // Simple fixed-size thread pool to keep backend execution async. // This mirrors the CPU backend model but is local to the Vulkan backend // to keep responsibilities self-contained. diff --git a/tests/gdeflate_format_test.cpp b/tests/gdeflate_format_test.cpp new file mode 100644 index 0000000..0f6c2f5 --- /dev/null +++ b/tests/gdeflate_format_test.cpp @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: Apache-2.0 +// Test for GDeflate format parsing + +#include "gdeflate_format.h" +#include +#include + +using namespace ds::gdeflate; + +// Test valid header parsing +bool test_valid_header() { + FileHeader header; + header.magic = GDEFLATE_MAGIC; + header.version_major = GDEFLATE_VERSION_MAJOR; + header.version_minor = GDEFLATE_VERSION_MINOR; + header.flags = 0; + header.uncompressed_size = 1024; + header.compressed_size = 512; + header.block_count = 1; + header.reserved[0] = 0; + header.reserved[1] = 0; + + if (!header.is_valid()) { + std::cerr << "Valid header failed validation\n"; + return false; + } + + // Test parsing + FileHeader parsed; + if (!parse_file_header(&header, sizeof(header), parsed)) { + std::cerr << "Failed to parse valid header\n"; + return false; + } + + if (parsed.magic != GDEFLATE_MAGIC || + parsed.uncompressed_size != 1024 || + parsed.block_count != 1) { + std::cerr << "Parsed header data mismatch\n"; + return false; + } + + return true; +} + +// Test invalid header (bad magic) +bool test_invalid_magic() { + FileHeader header; + header.magic = 0xDEADBEEF; // Wrong magic + header.version_major = 
GDEFLATE_VERSION_MAJOR; + header.version_minor = GDEFLATE_VERSION_MINOR; + header.uncompressed_size = 1024; + header.compressed_size = 512; + header.block_count = 1; + + if (header.is_valid()) { + std::cerr << "Invalid magic passed validation\n"; + return false; + } + + return true; +} + +// Test block info parsing +bool test_block_info() { + BlockInfo block; + block.offset = 0; + block.compressed_size = 256; + block.uncompressed_size = 512; + block.checksum = 0x12345678; + + if (!block.is_valid()) { + std::cerr << "Valid block failed validation\n"; + return false; + } + + // Test parsing multiple blocks + std::vector blocks; + BlockInfo block_array[3]; + for (int i = 0; i < 3; i++) { + block_array[i] = block; + block_array[i].offset = static_cast(i * 256); + } + + size_t parsed = parse_block_info(block_array, sizeof(block_array), 3, blocks); + if (parsed != 3) { + std::cerr << "Failed to parse blocks: " << parsed << "\n"; + return false; + } + + return true; +} + +// Test complete stream info +bool test_stream_info() { + // Create test data + const size_t total_size = sizeof(FileHeader) + sizeof(BlockInfo) * 2; + uint8_t buffer[total_size]; + + // Write header + FileHeader* header = reinterpret_cast(buffer); + header->magic = GDEFLATE_MAGIC; + header->version_major = GDEFLATE_VERSION_MAJOR; + header->version_minor = GDEFLATE_VERSION_MINOR; + header->flags = 0; + header->uncompressed_size = 2048; + header->compressed_size = 1024; + header->block_count = 2; + header->reserved[0] = 0; + header->reserved[1] = 0; + + // Write blocks + BlockInfo* blocks = reinterpret_cast(buffer + sizeof(FileHeader)); + blocks[0].offset = 0; + blocks[0].compressed_size = 512; + blocks[0].uncompressed_size = 1024; + blocks[0].checksum = 0x11111111; + + blocks[1].offset = 512; + blocks[1].compressed_size = 512; + blocks[1].uncompressed_size = 1024; + blocks[1].checksum = 0x22222222; + + // Parse stream + StreamInfo info; + if (!parse_stream_info(buffer, total_size, info)) { + std::cerr << 
"Failed to parse stream info\n"; + return false; + } + + if (info.blocks.size() != 2) { + std::cerr << "Wrong block count: " << info.blocks.size() << "\n"; + return false; + } + + if (info.get_uncompressed_size() != 2048) { + std::cerr << "Wrong uncompressed size: " << info.get_uncompressed_size() << "\n"; + return false; + } + + return true; +} + +int main() { + std::cout << "[gdeflate_format_test] Running tests...\n"; + + if (!test_valid_header()) { + std::cerr << "[gdeflate_format_test] test_valid_header FAILED\n"; + return 1; + } + std::cout << "[gdeflate_format_test] test_valid_header PASSED\n"; + + if (!test_invalid_magic()) { + std::cerr << "[gdeflate_format_test] test_invalid_magic FAILED\n"; + return 1; + } + std::cout << "[gdeflate_format_test] test_invalid_magic PASSED\n"; + + if (!test_block_info()) { + std::cerr << "[gdeflate_format_test] test_block_info FAILED\n"; + return 1; + } + std::cout << "[gdeflate_format_test] test_block_info PASSED\n"; + + if (!test_stream_info()) { + std::cerr << "[gdeflate_format_test] test_stream_info FAILED\n"; + return 1; + } + std::cout << "[gdeflate_format_test] test_stream_info PASSED\n"; + + std::cout << "[gdeflate_format_test] ALL TESTS PASSED\n"; + return 0; +}