diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..a0f1d9d --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,81 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + go: + name: Go + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-go@v5 + with: + go-version: '1.23' + - run: cd go && go build ./shard/... + - run: cd go && go test ./shard/... + + python: + name: Python + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + - run: cd py && pip install -e ".[test]" + - run: cd py && pytest tests/ + + typescript: + name: TypeScript + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version: '20' + - run: cd ts && npm ci + - run: cd ts && npm test + + rust: + name: Rust + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - run: cd rs && cargo build + - run: cd rs && cargo test + + c: + name: C + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - run: sudo apt-get update && sudo apt-get install -y libzstd-dev liblz4-dev + - run: cd c && make clean && make test + + publish-gate: + name: Publish Gate + needs: [go, python, typescript, rust, c] + runs-on: ubuntu-latest + if: always() + steps: + - name: Check all language tests passed + run: | + if [[ "${{ needs.go.result }}" != "success" || \ + "${{ needs.python.result }}" != "success" || \ + "${{ needs.typescript.result }}" != "success" || \ + "${{ needs.rust.result }}" != "success" || \ + "${{ needs.c.result }}" != "success" ]]; then + echo "One or more language test suites failed:" + echo " Go: ${{ needs.go.result }}" + echo " Python: ${{ needs.python.result }}" + echo " TypeScript: ${{ needs.typescript.result }}" + echo " Rust: ${{ needs.rust.result }}" + echo " C: ${{ needs.c.result }}" + exit 1 + fi + echo "All 5 language test suites passed. Safe to publish." diff --git a/c/shard_v2.c b/c/shard_v2.c index ca83853..afa1c95 100644 --- a/c/shard_v2.c +++ b/c/shard_v2.c @@ -95,6 +95,21 @@ static inline void write_le64(uint8_t* p, uint64_t v) { p[7] = (uint8_t)(v >> 56); } +static bool checked_u64_add(uint64_t a, uint64_t b, uint64_t* out) { + if (!out) return false; + if (UINT64_MAX - a < b) return false; + *out = a + b; + return true; +} + +static bool range_within_size(size_t total, uint64_t offset, uint64_t len, uint64_t* end_out) { + uint64_t end = 0; + if (!checked_u64_add(offset, len, &end)) return false; + if (end > (uint64_t)total) return false; + if (end_out) *end_out = end; + return true; +} + /* ============================================================ * CRC32C (Castagnoli, polynomial 0x82F63B78) * ============================================================ */ @@ -358,6 +373,9 @@ static shard_v2_reader_t* reader_parse(shard_v2_reader_t* r) { if (r->header.total_file_size != (uint64_t)r->data_len) { goto fail; } + if (r->header.schema_offset > (uint64_t)r->data_len) { + goto fail; + } n = r->header.entry_count; r->entry_count = n; @@ -389,8 +407,11 @@ static shard_v2_reader_t* reader_parse(shard_v2_reader_t* r) { for (uint32_t i = 0; i < n; i++) { uint32_t name_off = r->entries[i].name_offset; uint16_t name_len = r->entries[i].name_len; - uint64_t abs_off = st_off + name_off; - if (abs_off + name_len > r->data_len) goto fail; + uint64_t abs_off = 0; + if (!checked_u64_add(st_off, name_off, &abs_off) || + !range_within_size(r->data_len, abs_off, name_len, NULL)) { + goto fail; + } r->names[i] = (char*)malloc(name_len + 1); if (!r->names[i]) goto fail; memcpy(r->names[i], r->data + abs_off, name_len); @@ -404,10 +425,11 @@ static shard_v2_reader_t* reader_parse(shard_v2_reader_t* r) { for (uint32_t i = 0; i < n; i++) { uint64_t off = r->entries[i].data_offset; uint64_t size = r->entries[i].disk_size; + uint64_t end = 0; if (off < ds_off) goto fail; - if (off + size > (uint64_t)r->data_len) goto fail; + if (!range_within_size(r->data_len, off, size, &end)) goto fail; if (i > 0 && off < prev_end) goto fail; - prev_end = off + size; + prev_end = end; } } @@ -606,7 +628,7 @@ const uint8_t* shard_v2_read_entry(const shard_v2_reader_t* r, uint32_t i, size_ if (!r || i >= r->entry_count) return NULL; const shard_v2_index_entry_t* e = &r->entries[i]; - if (e->data_offset + e->disk_size > r->data_len) return NULL; + if (!range_within_size(r->data_len, e->data_offset, e->disk_size, NULL)) return NULL; if (shard_v2_is_compressed(e)) { /* Compressed path: decompress into cache on first access. */ @@ -691,7 +713,7 @@ const uint8_t* shard_v2_read_entry_prefix(const shard_v2_reader_t* r, uint32_t i if (shard_v2_is_compressed(e)) return NULL; size_t sz = (size_t)e->disk_size; if (max_bytes < sz) sz = max_bytes; - if (e->data_offset + sz > r->data_len) return NULL; + if (!range_within_size(r->data_len, e->data_offset, sz, NULL)) return NULL; if (out_size) *out_size = sz; return r->data + e->data_offset; } diff --git a/c/test_metadata.c b/c/test_metadata.c index d9cd6de..62a1c80 100644 --- a/c/test_metadata.c +++ b/c/test_metadata.c @@ -263,7 +263,9 @@ TEST(join_path_multi) { * list_children tests * ============================================================ */ -static shard_v2_reader_t* make_layered_reader(void) { +/* make_layered_reader returns reader + buffer. Caller must free buf AFTER closing reader, + * because shard_v2_from_buffer borrows the pointer (owns_buf=false). */ +static shard_v2_reader_t* make_layered_reader(uint8_t** out_buf) { test_entry_t entries[] = { {"layer.0/weight", (const uint8_t*)"w0", 2}, {"layer.0/bias", (const uint8_t*)"b0", 2}, @@ -271,10 +273,8 @@ static shard_v2_reader_t* make_layered_reader(void) { {"embed", (const uint8_t*)"tok", 3}, }; size_t len; - uint8_t* buf = build_shard(entries, 4, 64, &len); - shard_v2_reader_t* r = shard_v2_from_buffer(buf, len); - free(buf); - return r; + *out_buf = build_shard(entries, 4, 64, &len); + return shard_v2_from_buffer(*out_buf, len); } /* Helper: check that array `arr` of `n` strings contains `needle`. */ @@ -286,7 +286,8 @@ static bool arr_contains(char** arr, uint32_t n, const char* needle) { } TEST(list_children_exact_prefix_with_slash) { - shard_v2_reader_t* r = make_layered_reader(); + uint8_t* buf; + shard_v2_reader_t* r = make_layered_reader(&buf); assert(r != NULL); uint32_t count = 0; char** children = shard_v2_list_children(r, "layer.0/", &count); @@ -296,10 +297,12 @@ TEST(list_children_exact_prefix_with_slash) { assert(arr_contains(children, count, "layer.0/bias")); shard_v2_list_children_free(children, count); shard_v2_close(r); + free(buf); } TEST(list_children_empty_prefix_returns_top_level) { - shard_v2_reader_t* r = make_layered_reader(); + uint8_t* buf; + shard_v2_reader_t* r = make_layered_reader(&buf); assert(r != NULL); uint32_t count = 0; char** children = shard_v2_list_children(r, "", &count); @@ -311,10 +314,12 @@ TEST(list_children_empty_prefix_returns_top_level) { assert(count == 3); shard_v2_list_children_free(children, count); shard_v2_close(r); + free(buf); } TEST(list_children_partial_prefix) { - shard_v2_reader_t* r = make_layered_reader(); + uint8_t* buf; + shard_v2_reader_t* r = make_layered_reader(&buf); assert(r != NULL); uint32_t count = 0; char** children = shard_v2_list_children(r, "layer.", &count); @@ -324,10 +329,12 @@ TEST(list_children_partial_prefix) { assert(arr_contains(children, count, "layer.1/")); shard_v2_list_children_free(children, count); shard_v2_close(r); + free(buf); } TEST(list_children_nonexistent_prefix) { - shard_v2_reader_t* r = make_layered_reader(); + uint8_t* buf; + shard_v2_reader_t* r = make_layered_reader(&buf); assert(r != NULL); uint32_t count = 0; char** children = shard_v2_list_children(r, "nonexistent/", &count); @@ -335,6 +342,7 @@ TEST(list_children_nonexistent_prefix) { assert(count == 0); shard_v2_list_children_free(children, count); shard_v2_close(r); + free(buf); } TEST(list_children_deduplicated_directories) { @@ -346,7 +354,6 @@ TEST(list_children_deduplicated_directories) { size_t len; uint8_t* buf = build_shard(entries, 3, 64, &len); shard_v2_reader_t* r = shard_v2_from_buffer(buf, len); - free(buf); assert(r != NULL); uint32_t count = 0; @@ -360,6 +367,7 @@ TEST(list_children_deduplicated_directories) { assert(a_count == 1); shard_v2_list_children_free(children, count); shard_v2_close(r); + free(buf); } TEST(list_children_hierarchical_three_levels) { @@ -372,7 +380,6 @@ TEST(list_children_hierarchical_three_levels) { size_t len; uint8_t* buf = build_shard(entries, 4, 64, &len); shard_v2_reader_t* r = shard_v2_from_buffer(buf, len); - free(buf); assert(r != NULL); /* Top-level */ @@ -399,6 +406,7 @@ TEST(list_children_hierarchical_three_levels) { shard_v2_list_children_free(under_ab, count); shard_v2_close(r); + free(buf); } /* ============================================================ @@ -415,7 +423,6 @@ TEST(read_entry_prefix_first_n_bytes) { size_t buf_len; uint8_t* buf = build_shard(entries, 1, 64, &buf_len); shard_v2_reader_t* r = shard_v2_from_buffer(buf, buf_len); - free(buf); assert(r != NULL); size_t out_size = 0; @@ -424,6 +431,7 @@ TEST(read_entry_prefix_first_n_bytes) { assert(out_size == 5); assert(memcmp(got, "Hello", 5) == 0); shard_v2_close(r); + free(buf); } TEST(read_entry_prefix_full_entry) { @@ -433,7 +441,6 @@ TEST(read_entry_prefix_full_entry) { size_t buf_len; uint8_t* buf = build_shard(entries, 1, 64, &buf_len); shard_v2_reader_t* r = shard_v2_from_buffer(buf, buf_len); - free(buf); assert(r != NULL); size_t out_size = 0; @@ -442,6 +449,7 @@ TEST(read_entry_prefix_full_entry) { assert(out_size == PAYLOAD_LEN); assert(memcmp(got, PAYLOAD, PAYLOAD_LEN) == 0); shard_v2_close(r); + free(buf); } TEST(read_entry_prefix_exceeds_entry_length) { @@ -451,7 +459,6 @@ TEST(read_entry_prefix_exceeds_entry_length) { size_t buf_len; uint8_t* buf = build_shard(entries, 1, 64, &buf_len); shard_v2_reader_t* r = shard_v2_from_buffer(buf, buf_len); - free(buf); assert(r != NULL); size_t out_size = 0; @@ -460,6 +467,7 @@ TEST(read_entry_prefix_exceeds_entry_length) { assert(out_size == PAYLOAD_LEN); assert(memcmp(got, PAYLOAD, PAYLOAD_LEN) == 0); shard_v2_close(r); + free(buf); } TEST(read_entry_prefix_zero_bytes) { @@ -469,7 +477,6 @@ TEST(read_entry_prefix_zero_bytes) { size_t buf_len; uint8_t* buf = build_shard(entries, 1, 64, &buf_len); shard_v2_reader_t* r = shard_v2_from_buffer(buf, buf_len); - free(buf); assert(r != NULL); size_t out_size = 99; @@ -477,6 +484,7 @@ TEST(read_entry_prefix_zero_bytes) { assert(got != NULL); assert(out_size == 0); shard_v2_close(r); + free(buf); } TEST(read_entry_prefix_one_byte) { @@ -486,7 +494,6 @@ TEST(read_entry_prefix_one_byte) { size_t buf_len; uint8_t* buf = build_shard(entries, 1, 64, &buf_len); shard_v2_reader_t* r = shard_v2_from_buffer(buf, buf_len); - free(buf); assert(r != NULL); size_t out_size = 0; @@ -495,6 +502,7 @@ TEST(read_entry_prefix_one_byte) { assert(out_size == 1); assert(got[0] == 'H'); shard_v2_close(r); + free(buf); } /* ============================================================ diff --git a/c/test_safety.c b/c/test_safety.c index 337c4eb..0fefad9 100644 --- a/c/test_safety.c +++ b/c/test_safety.c @@ -49,6 +49,29 @@ static void make_path(char* out, size_t outsz, const char* dir, const char* file snprintf(out, outsz, "%s/%s", dir, file); } +static void write_u16_le(uint8_t* p, uint16_t v) { + p[0] = (uint8_t)(v & 0xFFu); + p[1] = (uint8_t)((v >> 8) & 0xFFu); +} + +static void write_u32_le(uint8_t* p, uint32_t v) { + p[0] = (uint8_t)(v & 0xFFu); + p[1] = (uint8_t)((v >> 8) & 0xFFu); + p[2] = (uint8_t)((v >> 16) & 0xFFu); + p[3] = (uint8_t)((v >> 24) & 0xFFu); +} + +static void write_u64_le(uint8_t* p, uint64_t v) { + p[0] = (uint8_t)(v & 0xFFu); + p[1] = (uint8_t)((v >> 8) & 0xFFu); + p[2] = (uint8_t)((v >> 16) & 0xFFu); + p[3] = (uint8_t)((v >> 24) & 0xFFu); + p[4] = (uint8_t)((v >> 32) & 0xFFu); + p[5] = (uint8_t)((v >> 40) & 0xFFu); + p[6] = (uint8_t)((v >> 48) & 0xFFu); + p[7] = (uint8_t)((v >> 56) & 0xFFu); +} + /* ============================================================ * Signal-based crash guard for corrupt shard tests * @@ -361,6 +384,46 @@ static void test_corrupt_all_zeros(void) { printf("\n"); } +static void test_corrupt_overflowing_entry_range(void) { + printf("[ corrupt_overflowing_entry_range (synthetic) ]\n"); + + uint8_t buf[SHARD_HEADER_SIZE + SHARD_INDEX_ENTRY_SIZE + 2]; + memset(buf, 0, sizeof(buf)); + + memcpy(buf, SHARD_MAGIC, 4); + buf[4] = SHARD_VERSION2; + buf[5] = ROLE_UNKNOWN; + write_u16_le(buf + 6, FLAG_LITTLE_ENDIAN); + buf[8] = ALIGN_NONE; + buf[9] = COMPRESS_NONE; + write_u16_le(buf + 10, SHARD_INDEX_ENTRY_SIZE); + write_u32_le(buf + 12, 1); + write_u64_le(buf + 16, SHARD_HEADER_SIZE + SHARD_INDEX_ENTRY_SIZE); + write_u64_le(buf + 24, SHARD_HEADER_SIZE + SHARD_INDEX_ENTRY_SIZE + 2); + write_u64_le(buf + 32, 0); + write_u64_le(buf + 40, sizeof(buf)); + + uint8_t* entry = buf + SHARD_HEADER_SIZE; + write_u64_le(entry + 0, 0); + write_u32_le(entry + 8, 0); + write_u16_le(entry + 12, 1); + write_u16_le(entry + 14, 0); + write_u64_le(entry + 16, UINT64_MAX - 7); + write_u64_le(entry + 24, 16); + write_u64_le(entry + 32, 16); + write_u32_le(entry + 40, 0); + write_u32_le(entry + 44, 0); + + buf[SHARD_HEADER_SIZE + SHARD_INDEX_ENTRY_SIZE] = 'x'; + buf[SHARD_HEADER_SIZE + SHARD_INDEX_ENTRY_SIZE + 1] = '\0'; + + shard_v2_reader_t* r = shard_v2_from_buffer(buf, sizeof(buf)); + check(r == NULL, "corrupt_overflowing_entry_range: open returns NULL (overflowing data range rejected)"); + if (r) shard_v2_close(r); + + printf("\n"); +} + /* ============================================================ * Concurrent reads test (pthreads) * @@ -590,6 +653,7 @@ int main(int argc, char* argv[]) { test_corrupt_wrong_magic(); test_corrupt_random_bytes(); test_corrupt_all_zeros(); + test_corrupt_overflowing_entry_range(); /* ============================================================ * Concurrent reads test diff --git a/go/cmd/ucodec/run.go b/go/cmd/ucodec/run.go index 00959e4..df22551 100644 --- a/go/cmd/ucodec/run.go +++ b/go/cmd/ucodec/run.go @@ -19,6 +19,17 @@ import ( "github.com/phenomenon0/Agent-GO/pkg/quant" ) +const ( + maxGenerateBodyBytes = 1 << 20 + maxGenerateTokens = 4096 +) + +type generateRequest struct { + Prompt string `json:"prompt"` + MaxTokens int `json:"max_tokens"` + Temperature float64 `json:"temperature"` +} + func runRun(args []string) error { fs := flag.NewFlagSet("run", flag.ExitOnError) umshPath := fs.String("umsh", "", "Path to UMSH/MoSH model file (.umsh or .mosh)") @@ -368,30 +379,18 @@ func runInferenceServer(model *inference.LLaMAModel, tokenizer llm.Tokenizer, po fmt.Printf(" POST /api/generate - Generate text\n") fmt.Printf(" GET /api/health - Health check\n\n") - http.HandleFunc("/api/generate", func(w http.ResponseWriter, r *http.Request) { + mux := http.NewServeMux() + mux.HandleFunc("/api/generate", func(w http.ResponseWriter, r *http.Request) { if r.Method != "POST" { http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) return } - var req struct { - Prompt string `json:"prompt"` - MaxTokens int `json:"max_tokens"` - Temperature float64 `json:"temperature"` - } - - if err := json.NewDecoder(r.Body).Decode(&req); err != nil { - http.Error(w, err.Error(), http.StatusBadRequest) + req, ok := decodeGenerateRequest(w, r) + if !ok { return } - if req.MaxTokens <= 0 { - req.MaxTokens = 100 - } - if req.Temperature <= 0 { - req.Temperature = 0.7 - } - // Encode tokens32, err := tokenizer.Encode(req.Prompt) if err != nil { @@ -451,12 +450,12 @@ func runInferenceServer(model *inference.LLaMAModel, tokenizer llm.Tokenizer, po json.NewEncoder(w).Encode(resp) }) - http.HandleFunc("/api/health", func(w http.ResponseWriter, r *http.Request) { + mux.HandleFunc("/api/health", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(map[string]string{"status": "ok"}) }) - return http.ListenAndServe(fmt.Sprintf(":%d", port), nil) + return newHTTPServer(fmt.Sprintf(":%d", port), mux).ListenAndServe() } // logMemoryUsage logs current memory statistics @@ -562,32 +561,20 @@ func runLegacyInteractive(engine *quant.InferenceEngine, maxTokens int, temperat func runLegacyServer(engine *quant.InferenceEngine, port int) error { fmt.Printf("Starting legacy server on port %d\n", port) - http.HandleFunc("/api/generate", func(w http.ResponseWriter, r *http.Request) { + mux := http.NewServeMux() + mux.HandleFunc("/api/generate", func(w http.ResponseWriter, r *http.Request) { if r.Method != "POST" { http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) return } - var req struct { - Prompt string `json:"prompt"` - MaxTokens int `json:"max_tokens"` - Temperature float32 `json:"temperature"` - } - - if err := json.NewDecoder(r.Body).Decode(&req); err != nil { - http.Error(w, err.Error(), http.StatusBadRequest) + req, ok := decodeGenerateRequest(w, r) + if !ok { return } - if req.MaxTokens <= 0 { - req.MaxTokens = 100 - } - if req.Temperature <= 0 { - req.Temperature = 0.7 - } - tokens := simpleTokenize(req.Prompt) - output, err := engine.Generate(tokens, req.MaxTokens, req.Temperature) + output, err := engine.Generate(tokens, req.MaxTokens, float32(req.Temperature)) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return @@ -607,12 +594,49 @@ func runLegacyServer(engine *quant.InferenceEngine, port int) error { json.NewEncoder(w).Encode(resp) }) - http.HandleFunc("/api/info", func(w http.ResponseWriter, r *http.Request) { + mux.HandleFunc("/api/info", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/plain") fmt.Fprint(w, engine.ModelInfo()) }) - return http.ListenAndServe(fmt.Sprintf(":%d", port), nil) + return newHTTPServer(fmt.Sprintf(":%d", port), mux).ListenAndServe() +} + +func decodeGenerateRequest(w http.ResponseWriter, r *http.Request) (generateRequest, bool) { + r.Body = http.MaxBytesReader(w, r.Body, maxGenerateBodyBytes) + defer r.Body.Close() + + dec := json.NewDecoder(r.Body) + dec.DisallowUnknownFields() + + var req generateRequest + if err := dec.Decode(&req); err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return generateRequest{}, false + } + if req.MaxTokens <= 0 { + req.MaxTokens = 100 + } + if req.MaxTokens > maxGenerateTokens { + http.Error(w, fmt.Sprintf("max_tokens too large: %d > %d", req.MaxTokens, maxGenerateTokens), http.StatusBadRequest) + return generateRequest{}, false + } + if req.Temperature <= 0 { + req.Temperature = 0.7 + } + + return req, true +} + +func newHTTPServer(addr string, handler http.Handler) *http.Server { + return &http.Server{ + Addr: addr, + Handler: handler, + ReadHeaderTimeout: 5 * time.Second, + ReadTimeout: 15 * time.Second, + WriteTimeout: 2 * time.Minute, + IdleTimeout: 60 * time.Second, + } } // Placeholder tokenizer for legacy engine diff --git a/rs/src/lib.rs b/rs/src/lib.rs index 37562b0..8211ef3 100644 --- a/rs/src/lib.rs +++ b/rs/src/lib.rs @@ -33,6 +33,7 @@ use std::collections::HashMap; use std::fs; use std::fs::File; +use std::io::Read; use std::path::Path; use memmap2::Mmap; @@ -176,7 +177,7 @@ impl std::fmt::Display for ShardError { ShardError::EntryNotFound(name) => write!(f, "entry not found: {}", name), ShardError::CompressionNotSupported => write!(f, "compression not supported"), ShardError::DecompressTooLarge(size) => { - write!(f, "decompressed size {} exceeds 1 GB limit", size) + write!(f, "decompressed size {} exceeds allowed limit", size) } ShardError::DecompressFailed(msg) => write!(f, "decompression failed: {}", msg), ShardError::SecurityLimitExceeded(msg) => write!(f, "security limit exceeded: {}", msg), @@ -216,6 +217,77 @@ pub fn compute_xxhash64(name: &str) -> u64 { xxhash_rust::xxh64::xxh64(name.as_bytes(), 0) } +fn entry_data_range(data_len: usize, offset: u64, size: u64) -> Result, ShardError> { + let offset = usize::try_from(offset).map_err(|_| { + ShardError::Io(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + "entry data offset does not fit in memory address space", + )) + })?; + let size = usize::try_from(size).map_err(|_| { + ShardError::Io(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + "entry data size does not fit in memory address space", + )) + })?; + let end = offset.checked_add(size).ok_or_else(|| { + ShardError::Io(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + "entry data range overflows address space", + )) + })?; + if end > data_len { + return Err(ShardError::Io(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + "entry data extends past end of file", + ))); + } + Ok(offset..end) +} + +fn decompress_zstd_bounded(raw: &[u8], expected_size: u64) -> Result, ShardError> { + if expected_size > MAX_DECOMPRESS_SIZE { + return Err(ShardError::DecompressTooLarge(expected_size)); + } + + let mut decoder = zstd::stream::read::Decoder::new(raw) + .map_err(|e| ShardError::DecompressFailed(e.to_string()))?; + let mut limited = decoder.by_ref().take(expected_size.saturating_add(1)); + let capacity = usize::try_from(expected_size).unwrap_or(usize::MAX).min(64 * 1024); + let mut out = Vec::with_capacity(capacity); + limited + .read_to_end(&mut out) + .map_err(|e| ShardError::DecompressFailed(e.to_string()))?; + + if out.len() as u64 > expected_size { + return Err(ShardError::DecompressTooLarge(out.len() as u64)); + } + if out.len() as u64 != expected_size { + return Err(ShardError::DecompressFailed(format!( + "zstd size mismatch: got {}, expected {}", + out.len(), + expected_size + ))); + } + + Ok(out) +} + +fn decompress_entry_data(raw: &[u8], entry: &IndexEntryV2) -> Result, ShardError> { + if entry.orig_size > MAX_DECOMPRESS_SIZE { + return Err(ShardError::DecompressTooLarge(entry.orig_size)); + } + + if entry.flags & ENTRY_FLAG_ZSTD != 0 { + decompress_zstd_bounded(raw, entry.orig_size) + } else if entry.flags & ENTRY_FLAG_LZ4 != 0 { + lz4_flex::decompress(raw, entry.orig_size as usize) + .map_err(|e| ShardError::DecompressFailed(e.to_string())) + } else { + Err(ShardError::CompressionNotSupported) + } +} + /// Align `offset` upward to the given `alignment`. Returns `offset` when `alignment` is 0. fn align_up(offset: usize, alignment: u8) -> usize { if alignment == 0 { @@ -225,6 +297,66 @@ fn align_up(offset: usize, alignment: u8) -> usize { (offset + a - 1) & !(a - 1) } +fn max_entry_data_end(entries: &[IndexEntryV2], data_section_offset: usize) -> Result { + let mut max_end = data_section_offset; + for (index, entry) in entries.iter().enumerate() { + let offset = usize::try_from(entry.data_offset) + .map_err(|_| ShardError::EntryOffsetOutOfRange { index })?; + let size = usize::try_from(entry.disk_size) + .map_err(|_| ShardError::EntryOffsetOutOfRange { index })?; + let end = offset + .checked_add(size) + .ok_or(ShardError::EntryOffsetOutOfRange { index })?; + max_end = max_end.max(end); + } + Ok(max_end) +} + +fn validate_schema_offset( + header: &ShardV2Header, + file_size: usize, + min_schema_offset: usize, +) -> Result, ShardError> { + let schema_offset = usize::try_from(header.schema_offset).map_err(|_| { + ShardError::SecurityLimitExceeded(format!( + "schema_offset {} does not fit in usize", + header.schema_offset + )) + })?; + + if schema_offset == 0 { + return Ok(None); + } + if schema_offset > file_size { + return Err(ShardError::SecurityLimitExceeded(format!( + "schema_offset {} beyond file size {}", + schema_offset, file_size + ))); + } + if schema_offset < min_schema_offset { + return Err(ShardError::SecurityLimitExceeded(format!( + "schema_offset {} overlaps data ending at {}", + schema_offset, min_schema_offset + ))); + } + + Ok(Some(schema_offset)) +} + +fn metadata_slice<'a>( + buf: &'a [u8], + header: &ShardV2Header, + entries: &[IndexEntryV2], +) -> Result, ShardError> { + let min_schema_offset = max_entry_data_end(entries, header.data_section_offset as usize)?; + let Some(schema_offset) = validate_schema_offset(header, buf.len(), min_schema_offset)? else { + return Ok(None); + }; + + let total_file_size = header.total_file_size as usize; + Ok(Some(&buf[schema_offset..total_file_size])) +} + // ============================================================ // Metadata Structures // ============================================================ @@ -492,37 +624,6 @@ impl ShardV2Reader { let header = ShardV2Header::from_bytes(&data[..HEADER_SIZE])?; - // Security: validate entry_count - let entry_count = header.entry_count as usize; - if entry_count > MAX_ENTRY_COUNT { - return Err(ShardError::SecurityLimitExceeded(format!( - "entry_count {} exceeds MAX_ENTRY_COUNT {}", - entry_count, MAX_ENTRY_COUNT - ))); - } - - // Security: validate index size - let index_size = entry_count * INDEX_ENTRY_SIZE; - if index_size > MAX_INDEX_SIZE { - return Err(ShardError::SecurityLimitExceeded(format!( - "index size {} exceeds MAX_INDEX_SIZE {}", - index_size, MAX_INDEX_SIZE - ))); - } - - // Security: validate string table size - let string_table_offset = header.string_table_offset as usize; - let data_section_offset = header.data_section_offset as usize; - if data_section_offset >= string_table_offset { - let st_size = data_section_offset - string_table_offset; - if st_size > MAX_STRING_TABLE_SIZE { - return Err(ShardError::SecurityLimitExceeded(format!( - "string table size {} exceeds MAX_STRING_TABLE_SIZE {}", - st_size, MAX_STRING_TABLE_SIZE - ))); - } - } - // Security: validate total_file_size if header.total_file_size != data.len() as u64 { return Err(ShardError::FileSizeMismatch { @@ -530,76 +631,7 @@ impl ShardV2Reader { actual: data.len() as u64, }); } - - // Parse index entries (immediately after header). - let index_start = HEADER_SIZE; - let index_end = index_start + entry_count * INDEX_ENTRY_SIZE; - - if data.len() < index_end { - return Err(ShardError::Io(std::io::Error::new( - std::io::ErrorKind::UnexpectedEof, - "file too small for index", - ))); - } - - if string_table_offset > data.len() || data_section_offset > data.len() { - return Err(ShardError::SecurityLimitExceeded(format!( - "string table or data section beyond file: string_table_offset={}, data_section_offset={}, file_size={}", - string_table_offset, data_section_offset, data.len() - ))); - } - - if data_section_offset < string_table_offset { - return Err(ShardError::SecurityLimitExceeded(format!( - "data_section_offset {} < string_table_offset {}", - data_section_offset, string_table_offset - ))); - } - - let string_table = &data[string_table_offset..data_section_offset]; - - // Parse each index entry and resolve name from string table. - let mut entries = Vec::with_capacity(entry_count); - for i in 0..entry_count { - let off = index_start + i * INDEX_ENTRY_SIZE; - let mut entry = IndexEntryV2::from_bytes(&data[off..off + INDEX_ENTRY_SIZE]); - - // Resolve name from string table. - let name_off = entry.name_offset as usize; - let name_len = entry.name_len as usize; - if name_off + name_len <= string_table.len() { - entry.name = - String::from_utf8_lossy(&string_table[name_off..name_off + name_len]) - .into_owned(); - } - - entries.push(entry); - } - - // Security: validate entry data offsets - let file_size = data.len(); - let mut prev_end: usize = 0; - for (i, entry) in entries.iter().enumerate() { - let offset = entry.data_offset as usize; - let size = entry.disk_size as usize; - - if offset < data_section_offset { - return Err(ShardError::EntryOffsetOutOfRange { index: i }); - } - if offset.saturating_add(size) > file_size { - return Err(ShardError::EntryOffsetOutOfRange { index: i }); - } - if i > 0 && offset < prev_end { - return Err(ShardError::EntryOffsetOutOfRange { index: i }); - } - prev_end = offset + size; - } - - // Build name → index map. - let mut name_to_index = HashMap::with_capacity(entry_count); - for (i, e) in entries.iter().enumerate() { - name_to_index.insert(e.name.clone(), i); - } + let (entries, name_to_index) = parse_index_from_bytes(&data, &header)?; Ok(ShardV2Reader { data, @@ -651,31 +683,11 @@ impl ShardV2Reader { /// against the decompressed content. pub fn read_entry(&self, i: usize) -> Result, ShardError> { let entry = &self.entries[i]; - let offset = entry.data_offset as usize; - let size = entry.disk_size as usize; - - if offset + size > self.data.len() { - return Err(ShardError::Io(std::io::Error::new( - std::io::ErrorKind::UnexpectedEof, - "entry data extends past end of file", - ))); - } - - let raw = &self.data[offset..offset + size]; + let range = entry_data_range(self.data.len(), entry.data_offset, entry.disk_size)?; + let raw = &self.data[range]; let data = if entry.compressed() { - if entry.orig_size > MAX_DECOMPRESS_SIZE { - return Err(ShardError::DecompressTooLarge(entry.orig_size)); - } - if entry.flags & ENTRY_FLAG_ZSTD != 0 { - zstd::decode_all(raw) - .map_err(|e| ShardError::DecompressFailed(e.to_string()))? - } else if entry.flags & ENTRY_FLAG_LZ4 != 0 { - lz4_flex::decompress(raw, entry.orig_size as usize) - .map_err(|e| ShardError::DecompressFailed(e.to_string()))? - } else { - return Err(ShardError::CompressionNotSupported); - } + decompress_entry_data(raw, entry)? } else { raw.to_vec() }; @@ -773,12 +785,9 @@ impl ShardV2Reader { /// /// Returns `None` if no schema (`schema_offset == 0`). pub fn read_metadata(&self) -> Result, ShardError> { - let schema_offset = self.header.schema_offset as usize; - if schema_offset == 0 { + let Some(meta_bytes) = metadata_slice(&self.data, &self.header, &self.entries)? else { return Ok(None); - } - let total_file_size = self.header.total_file_size as usize; - let meta_bytes = &self.data[schema_offset..total_file_size]; + }; let meta: ShardMetadata = serde_json::from_slice(meta_bytes) .map_err(|e| ShardError::JsonError(e.to_string()))?; Ok(Some(meta)) @@ -878,19 +887,25 @@ fn parse_index_from_bytes( for (i, entry) in entries.iter().enumerate() { let offset = entry.data_offset as usize; let size = entry.disk_size as usize; + let end = offset + .checked_add(size) + .ok_or(ShardError::EntryOffsetOutOfRange { index: i })?; if offset < data_section_offset { return Err(ShardError::EntryOffsetOutOfRange { index: i }); } - if offset.saturating_add(size) > file_size { + if end > file_size { return Err(ShardError::EntryOffsetOutOfRange { index: i }); } if i > 0 && offset < prev_end { return Err(ShardError::EntryOffsetOutOfRange { index: i }); } - prev_end = offset + size; + prev_end = end; } + let min_schema_offset = prev_end.max(data_section_offset); + let _ = validate_schema_offset(header, file_size, min_schema_offset)?; + // Build name → index map. let mut name_to_index = HashMap::with_capacity(entry_count); for (i, e) in entries.iter().enumerate() { @@ -1049,31 +1064,11 @@ impl MmapShardV2Reader { /// CRC32C is verified against the decompressed content. pub fn read_entry_decompressed(&self, i: usize) -> Result, ShardError> { let entry = &self.entries[i]; - let offset = entry.data_offset as usize; - let size = entry.disk_size as usize; - - if offset + size > self.mmap.len() { - return Err(ShardError::Io(std::io::Error::new( - std::io::ErrorKind::UnexpectedEof, - "entry data extends past end of file", - ))); - } - - let raw = &self.mmap[offset..offset + size]; + let range = entry_data_range(self.mmap.len(), entry.data_offset, entry.disk_size)?; + let raw = &self.mmap[range]; let data = if entry.compressed() { - if entry.orig_size > MAX_DECOMPRESS_SIZE { - return Err(ShardError::DecompressTooLarge(entry.orig_size)); - } - if entry.flags & ENTRY_FLAG_ZSTD != 0 { - zstd::decode_all(raw) - .map_err(|e| ShardError::DecompressFailed(e.to_string()))? - } else if entry.flags & ENTRY_FLAG_LZ4 != 0 { - lz4_flex::decompress(raw, entry.orig_size as usize) - .map_err(|e| ShardError::DecompressFailed(e.to_string()))? - } else { - return Err(ShardError::CompressionNotSupported); - } + decompress_entry_data(raw, entry)? } else { raw.to_vec() }; @@ -1134,12 +1129,9 @@ impl MmapShardV2Reader { /// Read JSON metadata from schema section, if present. pub fn read_metadata(&self) -> Result, ShardError> { - let schema_offset = self.header.schema_offset as usize; - if schema_offset == 0 { + let Some(meta_bytes) = metadata_slice(&self.mmap, &self.header, &self.entries)? else { return Ok(None); - } - let total_file_size = self.header.total_file_size as usize; - let meta_bytes = &self.mmap[schema_offset..total_file_size]; + }; let meta: ShardMetadata = serde_json::from_slice(meta_bytes) .map_err(|e| ShardError::JsonError(e.to_string()))?; Ok(Some(meta)) diff --git a/rs/tests/compression_test.rs b/rs/tests/compression_test.rs index 69e8970..bd86b23 100644 --- a/rs/tests/compression_test.rs +++ b/rs/tests/compression_test.rs @@ -1,8 +1,8 @@ //! Compression roundtrip and edge-case tests for Shard v2. use shard_format::{ - compute_crc32c, ShardV2Reader, ShardV2Writer, - COMPRESS_LZ4, COMPRESS_NONE, COMPRESS_ZSTD, ROLE_MOSH, + compute_crc32c, ShardError, ShardV2Reader, ShardV2Writer, + COMPRESS_LZ4, COMPRESS_NONE, COMPRESS_ZSTD, HEADER_SIZE, ROLE_MOSH, }; #[test] @@ -156,3 +156,27 @@ fn test_incompressible_data_stored_uncompressed() { // Whether compressed or not, the decompressed data must round-trip correctly. assert_eq!(r.read_entry(0).unwrap(), data); } + +#[test] +fn test_zstd_declared_orig_size_is_enforced() { + let payload = vec![0xABu8; 4096]; + let mut w = ShardV2Writer::new(ROLE_MOSH); + w.set_compression(COMPRESS_ZSTD); + w.write_entry_compressed("bomb", &payload); + let mut bytes = w.to_bytes(); + + let info = ShardV2Reader::from_bytes(bytes.clone()).unwrap().get_entry_info(0).clone(); + assert!(info.compressed(), "payload should be stored compressed for this test"); + + let declared_size = 64u64; + let orig_size_off = HEADER_SIZE + 32; + bytes[orig_size_off..orig_size_off + 8].copy_from_slice(&declared_size.to_le_bytes()); + + let r = ShardV2Reader::from_bytes(bytes).unwrap(); + let err = r.read_entry(0).unwrap_err(); + assert!( + matches!(err, ShardError::DecompressTooLarge(size) if size > declared_size), + "expected bounded zstd decode failure, got: {:?}", + err + ); +} diff --git a/rs/tests/fuzz_seeds_test.rs b/rs/tests/fuzz_seeds_test.rs index 7fae723..01899c9 100644 --- a/rs/tests/fuzz_seeds_test.rs +++ b/rs/tests/fuzz_seeds_test.rs @@ -422,6 +422,46 @@ fn test_fuzz_decompress_huge_orig_size_triggers_limit() { } } +#[test] +fn test_fuzz_decompress_zstd_declared_orig_size_mismatch_is_bounded() { + let payload = vec![b'A'; 4096]; + let compressed = zstd::stream::encode_all(&payload[..], 1).expect("zstd encode"); + + let st_offset: usize = HEADER_SIZE + INDEX_ENTRY_SIZE; + let data_offset: usize = st_offset + 4; + let total: usize = data_offset + compressed.len(); + let mut buf = vec![0u8; total]; + + buf[0..4].copy_from_slice(SHARD_MAGIC); + buf[4] = SHARD_VERSION2; + buf[5] = ROLE_MOSH; + buf[6..8].copy_from_slice(&FLAG_HAS_CHECKSUMS.to_le_bytes()); + buf[10..12].copy_from_slice(&(INDEX_ENTRY_SIZE as u16).to_le_bytes()); + buf[12..16].copy_from_slice(&1u32.to_le_bytes()); + buf[16..24].copy_from_slice(&(st_offset as u64).to_le_bytes()); + buf[24..32].copy_from_slice(&(data_offset as u64).to_le_bytes()); + buf[40..48].copy_from_slice(&(total as u64).to_le_bytes()); + + let flags: u16 = ENTRY_FLAG_COMPRESSED | ENTRY_FLAG_ZSTD; + buf[HEADER_SIZE + 14..HEADER_SIZE + 16].copy_from_slice(&flags.to_le_bytes()); + buf[HEADER_SIZE + 16..HEADER_SIZE + 24].copy_from_slice(&(data_offset as u64).to_le_bytes()); + buf[HEADER_SIZE + 24..HEADER_SIZE + 32] + .copy_from_slice(&(compressed.len() as u64).to_le_bytes()); + buf[HEADER_SIZE + 32..HEADER_SIZE + 40].copy_from_slice(&32u64.to_le_bytes()); + + buf[st_offset] = b'e'; + buf[st_offset + 1] = 0; + buf[HEADER_SIZE + 12..HEADER_SIZE + 14].copy_from_slice(&1u16.to_le_bytes()); + buf[data_offset..data_offset + compressed.len()].copy_from_slice(&compressed); + + let reader = ShardV2Reader::from_bytes(buf).expect("reader"); + let err = reader.read_entry(0).unwrap_err(); + match err { + shard_format::ShardError::DecompressTooLarge(size) => assert_eq!(size, 33), + other => panic!("expected DecompressTooLarge, got {:?}", other), + } +} + // ============================================================ // FuzzReadEntryByName seeds // ============================================================ diff --git a/rs/tests/metadata_test.rs b/rs/tests/metadata_test.rs index 29ede12..96b1c24 100644 --- a/rs/tests/metadata_test.rs +++ b/rs/tests/metadata_test.rs @@ -329,6 +329,33 @@ fn metadata_default_schema_version() { assert_eq!(got.schema_version, "shard-v2.1"); } +#[test] +fn security_schema_offset_beyond_file_is_rejected() { + let mut buf = make_shard_with_meta(&[("data", b"hello")], ShardMetadata::default()); + let invalid = (buf.len() as u64) + 1; + buf[32..40].copy_from_slice(&invalid.to_le_bytes()); + + let err = ShardV2Reader::from_bytes(buf).unwrap_err(); + let msg = format!("{}", err); + assert!(msg.contains("schema_offset"), "expected schema_offset error, got: {}", msg); +} + +#[test] +fn security_schema_offset_overlapping_data_is_rejected() { + let mut buf = make_shard_with_meta(&[("data", b"hello world")], ShardMetadata::default()); + let entry_data_offset_off = HEADER_SIZE + 16; + let data_offset = u64::from_le_bytes( + buf[entry_data_offset_off..entry_data_offset_off + 8] + .try_into() + .unwrap(), + ); + buf[32..40].copy_from_slice(&data_offset.to_le_bytes()); + + let err = ShardV2Reader::from_bytes(buf).unwrap_err(); + let msg = format!("{}", err); + assert!(msg.contains("schema_offset"), "expected schema_offset error, got: {}", msg); +} + // ============================================================ // Security limits // ============================================================ diff --git a/rs/tests/mmap_test.rs b/rs/tests/mmap_test.rs index 876ce1a..1e63f78 100644 --- a/rs/tests/mmap_test.rs +++ b/rs/tests/mmap_test.rs @@ -3,8 +3,8 @@ use serde::Deserialize; use sha2::{Digest, Sha256}; use shard_format::{ - MmapShardV2Reader, ShardV2Writer, - ROLE_MOSH, FLAG_HAS_CHECKSUMS, + MmapShardV2Reader, ShardError, ShardV2Writer, + ROLE_MOSH, FLAG_HAS_CHECKSUMS, HEADER_SIZE, }; use std::path::{Path, PathBuf}; use tempfile::TempDir; @@ -363,6 +363,35 @@ fn test_mmap_compressed_entry_lz4() { } } +#[test] +fn test_mmap_zstd_declared_orig_size_is_enforced() { + let dir = TempDir::new().unwrap(); + let path = dir.path().join("bounded_zstd.shard"); + + let payload = vec![0xAAu8; 4096]; + let mut w = ShardV2Writer::new(ROLE_MOSH); + w.set_compression(shard_format::COMPRESS_ZSTD); + w.write_entry_compressed("bomb", &payload); + w.write_to_file(&path).unwrap(); + + let info = MmapShardV2Reader::open(&path).unwrap().get_entry_info(0).clone(); + assert!(info.compressed(), "payload should be stored compressed for this test"); + + let mut bytes = std::fs::read(&path).unwrap(); + let declared_size = 64u64; + let orig_size_off = HEADER_SIZE + 32; + bytes[orig_size_off..orig_size_off + 8].copy_from_slice(&declared_size.to_le_bytes()); + std::fs::write(&path, bytes).unwrap(); + + let r = MmapShardV2Reader::open(&path).unwrap(); + let err = r.read_entry_decompressed(0).unwrap_err(); + assert!( + matches!(err, ShardError::DecompressTooLarge(size) if size > declared_size), + "expected bounded zstd decode failure, got: {:?}", + err + ); +} + // ============================================================ // Golden file parity tests // ============================================================ diff --git a/ts/src/__tests__/compression.test.ts b/ts/src/__tests__/compression.test.ts index 9badd28..a4423f9 100644 --- a/ts/src/__tests__/compression.test.ts +++ b/ts/src/__tests__/compression.test.ts @@ -131,6 +131,23 @@ describe('Zstd Compression', () => { const r = new ShardV2Reader(buf); expect(Buffer.compare(r.readEntry(0), data)).toBe(0); }); + + it('rejects zstd size mismatches against the shard index', () => { + const data = makeRepetitiveData(4096); + + const w = new ShardV2Writer(); + w.setCompression(COMPRESS_ZSTD); + w.writeEntryCompressed('big', data); + + const buf = Buffer.from(w.toBuffer()); + const indexOffset = 64; + buf.writeUInt32LE(1, indexOffset + 32); + buf.writeUInt32LE(0, indexOffset + 36); + + const r = new ShardV2Reader(buf); + expect(r.getEntryInfo(0).compressed).toBe(true); + expect(() => r.readEntry(0)).toThrow(/does not match expected size 1/); + }); }); // ============================================================ diff --git a/ts/src/index.ts b/ts/src/index.ts index 8b994bc..0726cb9 100644 --- a/ts/src/index.ts +++ b/ts/src/index.ts @@ -53,6 +53,7 @@ async function getXxhashModule() { // ============================================================ let _zstdReady = false; +const ZSTD_FRAME_MAGIC = 0xFD2FB528; /** * Initialize compression libraries (zstd WASM). @@ -74,11 +75,122 @@ function zstdCompress(data: Uint8Array): Uint8Array { } /** Decompress zstd data. */ -function zstdDecompress(data: Uint8Array): Uint8Array { +function zstdDecompress( + data: Uint8Array, + expectedSize?: number, + maxOutputSize: number = MAX_DECOMPRESS_SIZE, +): Uint8Array { if (!_zstdReady) { throw new Error('zstd not initialized — call await initCompression() first'); } - return zstdWasmDecompress(data); + + if (expectedSize !== undefined && expectedSize > maxOutputSize) { + throw new Error(`decompressed size ${expectedSize} exceeds limit ${maxOutputSize}`); + } + + const declaredSize = zstdFrameContentSize(data); + if (declaredSize !== null) { + if (declaredSize > maxOutputSize) { + throw new Error(`zstd frame content size ${declaredSize} exceeds limit ${maxOutputSize}`); + } + if (expectedSize !== undefined && declaredSize !== expectedSize) { + throw new Error( + `zstd frame content size ${declaredSize} does not match expected size ${expectedSize}`, + ); + } + } + + const output = zstdWasmDecompress(data); + if (output.length > maxOutputSize) { + throw new Error(`zstd output size ${output.length} exceeds limit ${maxOutputSize}`); + } + if (expectedSize !== undefined && output.length !== expectedSize) { + throw new Error(`zstd output size ${output.length} does not match expected size ${expectedSize}`); + } + return output; +} + +function zstdFrameContentSize(data: Uint8Array): number | null { + const buf = Buffer.from(data.buffer, data.byteOffset, data.byteLength); + if (buf.length < 5) { + throw new Error('truncated zstd frame'); + } + if (buf.readUInt32LE(0) !== ZSTD_FRAME_MAGIC) { + throw new Error('invalid zstd frame magic'); + } + + const descriptor = buf[4]; + const frameContentSizeFlag = descriptor >> 6; + const singleSegment = (descriptor & 0x20) !== 0; + const dictIdFlag = descriptor & 0x03; + + let offset = 5; + if (!singleSegment) { + if (buf.length < offset + 1) { + throw new Error('truncated zstd window descriptor'); + } + offset += 1; + } + + switch (dictIdFlag) { + case 0: + break; + case 1: + offset += 1; + break; + case 2: + offset += 2; + break; + case 3: + offset += 4; + break; + } + + let fcsSize = 0; + switch (frameContentSizeFlag) { + case 0: + fcsSize = singleSegment ? 1 : 0; + break; + case 1: + fcsSize = 2; + break; + case 2: + fcsSize = 4; + break; + case 3: + fcsSize = 8; + break; + } + + if (fcsSize === 0) { + return null; + } + if (buf.length < offset + fcsSize) { + throw new Error('truncated zstd frame content size'); + } + + let size: bigint; + switch (fcsSize) { + case 1: + size = BigInt(buf[offset]); + break; + case 2: + size = BigInt(buf.readUInt16LE(offset) + 256); + break; + case 4: + size = BigInt(buf.readUInt32LE(offset)); + break; + case 8: + size = readUint64LE(buf, offset); + break; + default: + return null; + } + + if (size > BigInt(Number.MAX_SAFE_INTEGER)) { + throw new Error(`zstd frame content size ${size.toString()} exceeds Number.MAX_SAFE_INTEGER`); + } + return Number(size); } /** @@ -684,7 +796,7 @@ export class ShardV2Reader { throw new Error(`decompressed size ${origSize} exceeds limit ${MAX_DECOMPRESS_SIZE}`); } if (entry.flags & ENTRY_FLAG_ZSTD) { - data = Buffer.from(zstdDecompress(data)); + data = Buffer.from(zstdDecompress(data, origSize, MAX_DECOMPRESS_SIZE)); } else if (entry.flags & ENTRY_FLAG_LZ4) { data = lz4BlockDecompress(data, origSize); } else { diff --git a/ucodec/testdata/generate_golden.go b/ucodec/testdata/generate_golden.go new file mode 100644 index 0000000..60dfc34 --- /dev/null +++ b/ucodec/testdata/generate_golden.go @@ -0,0 +1,271 @@ +// +build ignore + +// generate_golden.go produces deterministic golden shard files for cross-language testing. +// +// Run: go run ./testdata/generate_golden.go +// +// Produces: +// testdata/golden_basic.shard - 3 uncompressed entries, align 64, MoSH role +// testdata/golden_types.shard - entries with different content types +// testdata/golden_align16.shard - alignment 16 +// testdata/golden_noalign.shard - no alignment +// testdata/golden_manifest.json - JSON manifest of all expected values +package main + +import ( + "crypto/sha256" + "encoding/json" + "fmt" + "os" + "path/filepath" + + "github.com/phenomenon0/Agent-GO/cowrie/ucodec" +) + +// GoldenManifest describes all golden files and their expected contents. +type GoldenManifest struct { + Files []GoldenFile `json:"files"` +} + +type GoldenFile struct { + Filename string `json:"filename"` + SHA256 string `json:"sha256"` + Header GoldenHeader `json:"header"` + Entries []GoldenEntry `json:"entries"` +} + +type GoldenHeader struct { + Version int `json:"version"` + Role int `json:"role"` + Flags int `json:"flags"` + Alignment int `json:"alignment"` + CompressionDefault int `json:"compression_default"` + EntryCount int `json:"entry_count"` + IndexEntrySize int `json:"index_entry_size"` + StringTableOffset uint64 `json:"string_table_offset"` + DataSectionOffset uint64 `json:"data_section_offset"` + SchemaOffset uint64 `json:"schema_offset"` + TotalFileSize uint64 `json:"total_file_size"` +} + +type GoldenEntry struct { + Name string `json:"name"` + NameHash uint64 `json:"name_hash"` + ContentType int `json:"content_type"` + OrigSize uint64 `json:"orig_size"` + DiskSize uint64 `json:"disk_size"` + Checksum uint32 `json:"checksum"` + Compressed bool `json:"compressed"` + DataSHA256 string `json:"data_sha256"` +} + +func main() { + dir := filepath.Dir(os.Args[0]) + if dir == "" || dir == "." { + dir = "." + } + // Always write to the testdata directory relative to where the source lives + outDir := filepath.Join("cowrie", "ucodec", "testdata") + if _, err := os.Stat(outDir); err != nil { + // Try current directory + outDir = "testdata" + if _, err := os.Stat(outDir); err != nil { + outDir = "." + } + } + + var manifest GoldenManifest + + // === golden_basic.shard === + { + path := filepath.Join(outDir, "golden_basic.shard") + w, err := ucodec.NewShardV2Writer(path, ucodec.ShardRoleMoSH) + must(err) + + // Entry 1: simple text + data1 := []byte("hello world") + must(w.WriteEntryTyped("greeting", data1, ucodec.ContentTypeText)) + + // Entry 2: binary blob (deterministic pattern) + data2 := make([]byte, 1024) + for i := range data2 { + data2[i] = byte(i % 256) + } + must(w.WriteEntryTyped("pattern/1k", data2, ucodec.ContentTypeBlob)) + + // Entry 3: JSON + data3 := []byte(`{"model":"qwen2.5","params":7000000000,"quantization":"Q4_K_M"}`) + must(w.WriteEntryTyped("metadata/model", data3, ucodec.ContentTypeJSON)) + + must(w.Close()) + + gf := readGoldenFile(path) + manifest.Files = append(manifest.Files, gf) + } + + // === golden_types.shard === + { + path := filepath.Join(outDir, "golden_types.shard") + w, err := ucodec.NewShardV2Writer(path, ucodec.ShardRoleSample) + must(err) + + // Various content types + must(w.WriteEntryTyped("tensor/weights", makePattern(4096, 0xAA), ucodec.ContentTypeTensor)) + must(w.WriteEntryTyped("config.json", []byte(`{"layers":32,"hidden":4096}`), ucodec.ContentTypeJSON)) + must(w.WriteEntryTyped("config.glyph", []byte("layers=32\nhidden=4096\n"), ucodec.ContentTypeGLYPH)) + must(w.WriteEntryTyped("readme.txt", []byte("This is a sample shard for testing."), ucodec.ContentTypeText)) + must(w.WriteEntryTyped("image/thumbnail", makePattern(256, 0x42), ucodec.ContentTypeImage)) + + must(w.Close()) + + gf := readGoldenFile(path) + manifest.Files = append(manifest.Files, gf) + } + + // === golden_align16.shard === + { + path := filepath.Join(outDir, "golden_align16.shard") + w, err := ucodec.NewShardV2Writer(path, ucodec.ShardRoleMoSH) + must(err) + must(w.SetAlignment(ucodec.Align16)) + + must(w.WriteEntryTyped("a", []byte("short"), ucodec.ContentTypeText)) + must(w.WriteEntryTyped("b", makePattern(100, 0x55), ucodec.ContentTypeBlob)) + + must(w.Close()) + + gf := readGoldenFile(path) + manifest.Files = append(manifest.Files, gf) + } + + // === golden_noalign.shard === + { + path := filepath.Join(outDir, "golden_noalign.shard") + w, err := ucodec.NewShardV2Writer(path, ucodec.ShardRoleMoSH) + must(err) + must(w.SetAlignment(ucodec.AlignNone)) + + must(w.WriteEntryTyped("x", []byte{0xDE, 0xAD, 0xBE, 0xEF}, ucodec.ContentTypeBlob)) + must(w.WriteEntryTyped("y", []byte{0xCA, 0xFE, 0xBA, 0xBE, 0x00, 0x01, 0x02, 0x03}, ucodec.ContentTypeBlob)) + + must(w.Close()) + + gf := readGoldenFile(path) + manifest.Files = append(manifest.Files, gf) + } + + // === golden_wshard.shard (WShard role) === + { + path := filepath.Join(outDir, "golden_wshard.shard") + w, err := ucodec.NewShardV2Writer(path, ucodec.ShardRoleWShard) + must(err) + + must(w.WriteEntryTyped("signal/imu", makePattern(600, 0x11), ucodec.ContentTypeBlob)) + must(w.WriteEntryTyped("omen/imu/mlp_v1", makePattern(600, 0x22), ucodec.ContentTypeBlob)) + must(w.WriteEntryTyped("residual/imu/sign2nddiff", makePattern(75, 0x33), ucodec.ContentTypeBlob)) + + must(w.Close()) + + gf := readGoldenFile(path) + manifest.Files = append(manifest.Files, gf) + } + + // === golden_hierarchical.shard (deep paths) === + { + path := filepath.Join(outDir, "golden_hierarchical.shard") + w, err := ucodec.NewShardV2Writer(path, ucodec.ShardRoleMoSH) + must(err) + + must(w.WriteEntryTyped("layer.0/attention/q_proj/weight", makePattern(512, 0x01), ucodec.ContentTypeTensor)) + must(w.WriteEntryTyped("layer.0/attention/k_proj/weight", makePattern(512, 0x02), ucodec.ContentTypeTensor)) + must(w.WriteEntryTyped("layer.0/attention/v_proj/weight", makePattern(512, 0x03), ucodec.ContentTypeTensor)) + must(w.WriteEntryTyped("layer.0/attention/o_proj/weight", makePattern(512, 0x04), ucodec.ContentTypeTensor)) + must(w.WriteEntryTyped("layer.0/ffn/gate/weight", makePattern(2048, 0x05), ucodec.ContentTypeTensor)) + must(w.WriteEntryTyped("layer.0/ffn/up/weight", makePattern(2048, 0x06), ucodec.ContentTypeTensor)) + must(w.WriteEntryTyped("layer.0/ffn/down/weight", makePattern(2048, 0x07), ucodec.ContentTypeTensor)) + must(w.WriteEntryTyped("layer.0/norm", makePattern(128, 0x08), ucodec.ContentTypeTensor)) + + must(w.Close()) + + gf := readGoldenFile(path) + manifest.Files = append(manifest.Files, gf) + } + + // Write manifest + manifestPath := filepath.Join(outDir, "golden_manifest.json") + manifestData, err := json.MarshalIndent(manifest, "", " ") + must(err) + must(os.WriteFile(manifestPath, manifestData, 0644)) + + fmt.Printf("Generated %d golden files + manifest in %s\n", len(manifest.Files), outDir) + for _, f := range manifest.Files { + fmt.Printf(" %s: %d entries, %d bytes\n", f.Filename, len(f.Entries), f.Header.TotalFileSize) + } +} + +func makePattern(size int, seed byte) []byte { + data := make([]byte, size) + for i := range data { + data[i] = byte((int(seed) + i) % 256) + } + return data +} + +func readGoldenFile(path string) GoldenFile { + // Read the file bytes for SHA256 + fileData, err := os.ReadFile(path) + must(err) + fileHash := sha256.Sum256(fileData) + + // Open with reader + r, err := ucodec.OpenShardV2(path) + must(err) + defer r.Close() + + h := r.Header() + gf := GoldenFile{ + Filename: filepath.Base(path), + SHA256: fmt.Sprintf("%x", fileHash), + Header: GoldenHeader{ + Version: int(h.Version), + Role: int(h.Role), + Flags: int(h.Flags), + Alignment: int(h.Alignment), + CompressionDefault: int(h.CompressionDefault), + EntryCount: int(h.EntryCount), + IndexEntrySize: int(h.IndexEntrySize), + StringTableOffset: h.StringTableOffset, + DataSectionOffset: h.DataSectionOffset, + SchemaOffset: h.SchemaOffset, + TotalFileSize: h.TotalFileSize, + }, + } + + for i := 0; i < r.EntryCount(); i++ { + info := r.GetEntryInfo(i) + data, err := r.ReadEntry(i) + must(err) + + dataHash := sha256.Sum256(data) + ge := GoldenEntry{ + Name: r.EntryName(i), + NameHash: info.NameHash, + ContentType: int(info.Reserved & 0xFFFF), + OrigSize: info.OrigSize, + DiskSize: info.DiskSize, + Checksum: info.Checksum, + Compressed: info.Flags&ucodec.EntryFlagCompressed != 0, + DataSHA256: fmt.Sprintf("%x", dataHash), + } + gf.Entries = append(gf.Entries, ge) + } + + return gf +} + +func must(err error) { + if err != nil { + fmt.Fprintf(os.Stderr, "FATAL: %v\n", err) + os.Exit(1) + } +} diff --git a/ucodec/testdata/golden_align16.shard b/ucodec/testdata/golden_align16.shard new file mode 100644 index 0000000..7aef771 Binary files /dev/null and b/ucodec/testdata/golden_align16.shard differ diff --git a/ucodec/testdata/golden_basic.shard b/ucodec/testdata/golden_basic.shard new file mode 100644 index 0000000..77fd47d Binary files /dev/null and b/ucodec/testdata/golden_basic.shard differ diff --git a/ucodec/testdata/golden_hierarchical.shard b/ucodec/testdata/golden_hierarchical.shard new file mode 100644 index 0000000..0d626ad Binary files /dev/null and b/ucodec/testdata/golden_hierarchical.shard differ diff --git a/ucodec/testdata/golden_manifest.json b/ucodec/testdata/golden_manifest.json new file mode 100644 index 0000000..3ab0bb6 --- /dev/null +++ b/ucodec/testdata/golden_manifest.json @@ -0,0 +1,348 @@ +{ + "files": [ + { + "filename": "golden_basic.shard", + "sha256": "487228df14dc2eff0ed316ec55c759ea7459ab33be61a29a1708411cd13fd67f", + "header": { + "version": 2, + "role": 1, + "flags": 162, + "alignment": 64, + "compression_default": 0, + "entry_count": 3, + "index_entry_size": 48, + "string_table_offset": 208, + "data_section_offset": 256, + "schema_offset": 0, + "total_file_size": 1407 + }, + "entries": [ + { + "name": "greeting", + "name_hash": 16700977181434310015, + "content_type": 5, + "orig_size": 11, + "disk_size": 11, + "checksum": 3381945770, + "compressed": false, + "data_sha256": "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9" + }, + { + "name": "pattern/1k", + "name_hash": 11449517142474649281, + "content_type": 10, + "orig_size": 1024, + "disk_size": 1024, + "checksum": 752840335, + "compressed": false, + "data_sha256": "785b0751fc2c53dc14a4ce3d800e69ef9ce1009eb327ccf458afe09c242c26c9" + }, + { + "name": "metadata/model", + "name_hash": 7724208215623586332, + "content_type": 2, + "orig_size": 63, + "disk_size": 63, + "checksum": 4262061395, + "compressed": false, + "data_sha256": "1b4c9b4a3b92d7b9e6f8cf692cb886a9cbd9fb12f7c6750a420371b968b30e17" + } + ] + }, + { + "filename": "golden_types.shard", + "sha256": "cf9d193f9ca96c7d94230b80aa4a7e3ca4774266c576c23daeb44eeb994a9523", + "header": { + "version": 2, + "role": 2, + "flags": 162, + "alignment": 64, + "compression_default": 0, + "entry_count": 5, + "index_entry_size": 48, + "string_table_offset": 304, + "data_section_offset": 384, + "schema_offset": 0, + "total_file_size": 4928 + }, + "entries": [ + { + "name": "tensor/weights", + "name_hash": 14268158869293769122, + "content_type": 1, + "orig_size": 4096, + "disk_size": 4096, + "checksum": 1817819105, + "compressed": false, + "data_sha256": "fb612d9b9b653549700a30f29e6cef04bcd35f7a122f91a8fa61c79bf605347a" + }, + { + "name": "config.json", + "name_hash": 4010899755935844745, + "content_type": 2, + "orig_size": 27, + "disk_size": 27, + "checksum": 2923727015, + "compressed": false, + "data_sha256": "ef478e2fad8508eafab8776872f59fbd658353d9f97f860bfcd58daedcbd580b" + }, + { + "name": "config.glyph", + "name_hash": 8537223467359895312, + "content_type": 4, + "orig_size": 22, + "disk_size": 22, + "checksum": 2175825978, + "compressed": false, + "data_sha256": "849eb0bf68979141a52432cbc998a767cd37eb304f5e8fc0a0bd5d9001611263" + }, + { + "name": "readme.txt", + "name_hash": 7447530330435853781, + "content_type": 5, + "orig_size": 35, + "disk_size": 35, + "checksum": 436240408, + "compressed": false, + "data_sha256": "9b9484324c91a5a001dc746f0cf31d8176cef85db8405aee0781b8511adf1dc7" + }, + { + "name": "image/thumbnail", + "name_hash": 12685321276006778552, + "content_type": 6, + "orig_size": 256, + "disk_size": 256, + "checksum": 4184433400, + "compressed": false, + "data_sha256": "8868762847d186a2fd707ac60cf0069b8caae69c5d90310848ab94e033c61db4" + } + ] + }, + { + "filename": "golden_align16.shard", + "sha256": "e0056ea463fc94ab58967fef9fd464fc3ec96bc649e643e3377223fc85c585ce", + "header": { + "version": 2, + "role": 1, + "flags": 162, + "alignment": 16, + "compression_default": 0, + "entry_count": 2, + "index_entry_size": 48, + "string_table_offset": 160, + "data_section_offset": 176, + "schema_offset": 0, + "total_file_size": 292 + }, + "entries": [ + { + "name": "a", + "name_hash": 15154266338359012955, + "content_type": 5, + "orig_size": 5, + "disk_size": 5, + "checksum": 1623414395, + "compressed": false, + "data_sha256": "f9b0078b5df596d2ea19010c001bbd009e651de2c57e8fb7e355f31eb9d3f739" + }, + { + "name": "b", + "name_hash": 8666379929374662555, + "content_type": 10, + "orig_size": 100, + "disk_size": 100, + "checksum": 3235686185, + "compressed": false, + "data_sha256": "a5cf82ae547e6976caebaff6d459598385fa73084c685782c7edb12da34d1b3d" + } + ] + }, + { + "filename": "golden_noalign.shard", + "sha256": "7e3b4acbdea04d981ed1c635a5e571ccfce1db8d82aed108c9937ba60cab9ead", + "header": { + "version": 2, + "role": 1, + "flags": 162, + "alignment": 0, + "compression_default": 0, + "entry_count": 2, + "index_entry_size": 48, + "string_table_offset": 160, + "data_section_offset": 164, + "schema_offset": 0, + "total_file_size": 176 + }, + "entries": [ + { + "name": "x", + "name_hash": 6665539201184043299, + "content_type": 10, + "orig_size": 4, + "disk_size": 4, + "checksum": 4057757582, + "compressed": false, + "data_sha256": "5f78c33274e43fa9de5659265c1d917e25c03722dcb0b8d27db8d5feaa813953" + }, + { + "name": "y", + "name_hash": 13923454618160480178, + "content_type": 10, + "orig_size": 8, + "disk_size": 8, + "checksum": 3145601971, + "compressed": false, + "data_sha256": "117638e8b13b2ae24ea0bc41a271dbeede26f6a6b5e5c2daef495c0d1d7b5568" + } + ] + }, + { + "filename": "golden_wshard.shard", + "sha256": "43b3474161442989a2938a235010e5cf2a0e153a4b12b68a59e2931e6bf21328", + "header": { + "version": 2, + "role": 5, + "flags": 162, + "alignment": 64, + "compression_default": 0, + "entry_count": 3, + "index_entry_size": 48, + "string_table_offset": 208, + "data_section_offset": 320, + "schema_offset": 0, + "total_file_size": 1675 + }, + "entries": [ + { + "name": "signal/imu", + "name_hash": 4930816044968525988, + "content_type": 10, + "orig_size": 600, + "disk_size": 600, + "checksum": 3613665374, + "compressed": false, + "data_sha256": "bd2b96fab3834f67e55e3fec0c7df78b1274159085e4cd6b18602558d14740ef" + }, + { + "name": "omen/imu/mlp_v1", + "name_hash": 16966689604114254082, + "content_type": 10, + "orig_size": 600, + "disk_size": 600, + "checksum": 4013429539, + "compressed": false, + "data_sha256": "e20146da666455879b899668b08160152cd2eca558549e98676402f71e287825" + }, + { + "name": "residual/imu/sign2nddiff", + "name_hash": 4414458669179889149, + "content_type": 10, + "orig_size": 75, + "disk_size": 75, + "checksum": 4037150595, + "compressed": false, + "data_sha256": "af6f764b02da66ff0151cc883b97b61afc314c85cc3ff5d2e57fd579e53cf053" + } + ] + }, + { + "filename": "golden_hierarchical.shard", + "sha256": "aea0e2a1239a59ec9d2260c540ff755b96351b9517d60688eae95b30041583f1", + "header": { + "version": 2, + "role": 1, + "flags": 162, + "alignment": 64, + "compression_default": 0, + "entry_count": 8, + "index_entry_size": 48, + "string_table_offset": 448, + "data_section_offset": 704, + "schema_offset": 0, + "total_file_size": 9024 + }, + "entries": [ + { + "name": "layer.0/attention/q_proj/weight", + "name_hash": 5849259052093732705, + "content_type": 1, + "orig_size": 512, + "disk_size": 512, + "checksum": 3374896620, + "compressed": false, + "data_sha256": "28398ff046bc535a237de195155297befb0482729ae810c6238564f440be76a1" + }, + { + "name": "layer.0/attention/k_proj/weight", + "name_hash": 14695571574228440880, + "content_type": 1, + "orig_size": 512, + "disk_size": 512, + "checksum": 3600458702, + "compressed": false, + "data_sha256": "2272ae691b6ae9a5e6a2e73399ac0f4940a92d89bef1656332176cf9fe37d8a3" + }, + { + "name": "layer.0/attention/v_proj/weight", + "name_hash": 8985939025931988576, + "content_type": 1, + "orig_size": 512, + "disk_size": 512, + "checksum": 2404250664, + "compressed": false, + "data_sha256": "72c4a922e03dc0b0b47e1b9b8f84568e4d1fbcb7b4126c399a1474dd3232fadc" + }, + { + "name": "layer.0/attention/o_proj/weight", + "name_hash": 4427070529554003615, + "content_type": 1, + "orig_size": 512, + "disk_size": 512, + "checksum": 3252991925, + "compressed": false, + "data_sha256": "88b9475b5af74e27d7a222713694e964899f9e57e240807d1131e1efcd92e340" + }, + { + "name": "layer.0/ffn/gate/weight", + "name_hash": 7777247805686235758, + "content_type": 1, + "orig_size": 2048, + "disk_size": 2048, + "checksum": 3801602145, + "compressed": false, + "data_sha256": "a04ab91887f01a03f81447c52555daf8bdd12b05820fdeee5bf51034fd5f582f" + }, + { + "name": "layer.0/ffn/up/weight", + "name_hash": 5945135977106567835, + "content_type": 1, + "orig_size": 2048, + "disk_size": 2048, + "checksum": 1273217590, + "compressed": false, + "data_sha256": "2761e1042571671571a4ed60fd051780facb7ff339f5f62f95a9ade9b5ec9a0b" + }, + { + "name": "layer.0/ffn/down/weight", + "name_hash": 9799028502101065133, + "content_type": 1, + "orig_size": 2048, + "disk_size": 2048, + "checksum": 324665330, + "compressed": false, + "data_sha256": "45ecbadc7ec56f219b77bfddc2ea8af29d941fba75c80542362c00b18ec29f79" + }, + { + "name": "layer.0/norm", + "name_hash": 5070234111195826601, + "content_type": 1, + "orig_size": 128, + "disk_size": 128, + "checksum": 412440727, + "compressed": false, + "data_sha256": "d1b27fdf2fa02f6d6a716b42c0036008b698de461afc05cc274aeeb69bb5a3bf" + } + ] + } + ] +} \ No newline at end of file diff --git a/ucodec/testdata/golden_noalign.shard b/ucodec/testdata/golden_noalign.shard new file mode 100644 index 0000000..d2c94ad Binary files /dev/null and b/ucodec/testdata/golden_noalign.shard differ diff --git a/ucodec/testdata/golden_sampleshard.smpl b/ucodec/testdata/golden_sampleshard.smpl new file mode 100644 index 0000000..4f1ceee Binary files /dev/null and b/ucodec/testdata/golden_sampleshard.smpl differ diff --git a/ucodec/testdata/golden_types.shard b/ucodec/testdata/golden_types.shard new file mode 100644 index 0000000..0919271 Binary files /dev/null and b/ucodec/testdata/golden_types.shard differ diff --git a/ucodec/testdata/golden_wshard.shard b/ucodec/testdata/golden_wshard.shard new file mode 100644 index 0000000..eeaac4d Binary files /dev/null and b/ucodec/testdata/golden_wshard.shard differ diff --git a/ucodec/testdata/safety/corrupt_bad_magic.shard b/ucodec/testdata/safety/corrupt_bad_magic.shard new file mode 100644 index 0000000..6e34127 Binary files /dev/null and b/ucodec/testdata/safety/corrupt_bad_magic.shard differ diff --git a/ucodec/testdata/safety/corrupt_crc_mismatch.shard b/ucodec/testdata/safety/corrupt_crc_mismatch.shard new file mode 100644 index 0000000..cb3dd6d Binary files /dev/null and b/ucodec/testdata/safety/corrupt_crc_mismatch.shard differ diff --git a/ucodec/testdata/safety/corrupt_entry_count_overflow.shard b/ucodec/testdata/safety/corrupt_entry_count_overflow.shard new file mode 100644 index 0000000..ab896f2 Binary files /dev/null and b/ucodec/testdata/safety/corrupt_entry_count_overflow.shard differ diff --git a/ucodec/testdata/safety/corrupt_huge_string_table.shard b/ucodec/testdata/safety/corrupt_huge_string_table.shard new file mode 100644 index 0000000..a6bd061 Binary files /dev/null and b/ucodec/testdata/safety/corrupt_huge_string_table.shard differ diff --git a/ucodec/testdata/safety/corrupt_truncated_data.shard b/ucodec/testdata/safety/corrupt_truncated_data.shard new file mode 100644 index 0000000..c9d3574 Binary files /dev/null and b/ucodec/testdata/safety/corrupt_truncated_data.shard differ diff --git a/ucodec/testdata/safety/corrupt_truncated_header.shard b/ucodec/testdata/safety/corrupt_truncated_header.shard new file mode 100644 index 0000000..91b92b1 Binary files /dev/null and b/ucodec/testdata/safety/corrupt_truncated_header.shard differ diff --git a/ucodec/testdata/safety/safety_manifest.json b/ucodec/testdata/safety/safety_manifest.json new file mode 100644 index 0000000..b9fb56e --- /dev/null +++ b/ucodec/testdata/safety/safety_manifest.json @@ -0,0 +1,143 @@ +{ + "valid": [ + { + "filename": "valid_basic.shard", + "sha256": "90af10d5379d70e975232c8928456f3adf8fad8143facd1588732cb71b530e50", + "entry_count": 3, + "entries": [ + { + "name": "tensor_a", + "data_sha256": "477e0dc549fdaf0f7a6e00a1c693454096d0aecbc6610ec820b377e70e712b17", + "size": 1024 + }, + { + "name": "tensor_b", + "data_sha256": "347e635d9369f762d476eee696313d7073a0536d89f2dfe1c0c5712b1a719ed1", + "size": 2048 + }, + { + "name": "tensor_c", + "data_sha256": "2034f7bb7a3a79f0b9651811aafd266b605c6d4c817c3fc4ce94e367dff216d8", + "size": 512 + } + ] + }, + { + "filename": "valid_compressed.shard", + "sha256": "ba5a0fff66d6738c85e12263ab6b6da28fd0311678c32bb6c5a22010902ac74b", + "entry_count": 3, + "entries": [ + { + "name": "comp_a", + "data_sha256": "c622005493c4cb75f3e08eda4cc0bfe172e2c5eeca661ec4908c5490fc3d6994", + "size": 4096 + }, + { + "name": "comp_b", + "data_sha256": "1082f036b4e470e2bb00ff9499e9aa8069fbbb3c7f1f3c3561de41ccd07aa7f6", + "size": 8192 + }, + { + "name": "comp_c", + "data_sha256": "5dec520a7765b98483edad2fa2efc130ee94987ad8e2e0e394339a24ec5bb197", + "size": 2048 + } + ] + }, + { + "filename": "valid_empty_entry.shard", + "sha256": "1a4788a4072fb12e16e9b2e31380710cf28df7ca407b0e1be6541f7935cb2efb", + "entry_count": 1, + "entries": [ + { + "name": "empty", + "data_sha256": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "size": 0 + } + ] + }, + { + "filename": "valid_long_name.shard", + "sha256": "ea5ed054abc6a7f6f85f7e664309bdb0c6a060b72980806e8db0bcb38d73d251", + "entry_count": 1, + "entries": [ + { + "name": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "data_sha256": "a37ee4c3cfe84cb853359a1cff12324456dee4bebee08c55a4e8952725004177", + "size": 21 + } + ] + }, + { + "filename": "valid_unicode_name.shard", + "sha256": "5709721282fcf8bd02e3369f13bd4ad74676f698df50c51f1dc2282897e92753", + "entry_count": 4, + "entries": [ + { + "name": "日本語", + "data_sha256": "3712f83343c539d2fc101eb0a6f053f7e2a32188a6972d3ec8877995457222a6", + "size": 12 + }, + { + "name": "émoji_😀", + "data_sha256": "620ae50ca1986dc18f016d2324c41756536a28345dbf36b812c181505126bf97", + "size": 11 + }, + { + "name": "中文/层/权重", + "data_sha256": "dba0c6c1ffcbdc39cbeee07dbb6ebd9021a905218ba93c627f07ee0fbaff3699", + "size": 12 + }, + { + "name": "АБВГ", + "data_sha256": "53ba3680cddbbd70cee9ba8d0822d77ec607fc4680b3ab9dda594c042338f2e2", + "size": 13 + } + ] + }, + { + "filename": "valid_binary_data.shard", + "sha256": "c9ab1e93e8c9948afcca5db06c22e4481c1f4959bba155e6bf18dcd337dc1518", + "entry_count": 1, + "entries": [ + { + "name": "all_256_bytes", + "data_sha256": "40aff2e9d2d8922e47afd4648e6967497158785fbd1da870e7110266bf944880", + "size": 256 + } + ] + } + ], + "corrupt": [ + { + "filename": "corrupt_bad_magic.shard", + "expected_error": "invalid magic", + "description": "First 4 bytes are XXXX instead of SHRD" + }, + { + "filename": "corrupt_truncated_header.shard", + "expected_error": "header too short", + "description": "Only 32 bytes, header requires 64" + }, + { + "filename": "corrupt_crc_mismatch.shard", + "expected_error": "checksum mismatch", + "description": "Valid shard with first entry CRC32 bits flipped" + }, + { + "filename": "corrupt_truncated_data.shard", + "expected_error": "truncated data", + "description": "Valid header and index but data section cut short by 100 bytes" + }, + { + "filename": "corrupt_entry_count_overflow.shard", + "expected_error": "entry count overflow", + "description": "Header claims 0xFFFFFFFF entries (4 billion+), exceeds MaxV2EntryCount" + }, + { + "filename": "corrupt_huge_string_table.shard", + "expected_error": "string table beyond file", + "description": "String table offset is 10x the actual file size" + } + ] +} \ No newline at end of file diff --git a/ucodec/testdata/safety/valid_basic.shard b/ucodec/testdata/safety/valid_basic.shard new file mode 100644 index 0000000..23c29bb Binary files /dev/null and b/ucodec/testdata/safety/valid_basic.shard differ diff --git a/ucodec/testdata/safety/valid_binary_data.shard b/ucodec/testdata/safety/valid_binary_data.shard new file mode 100644 index 0000000..d3083ea Binary files /dev/null and b/ucodec/testdata/safety/valid_binary_data.shard differ diff --git a/ucodec/testdata/safety/valid_compressed.shard b/ucodec/testdata/safety/valid_compressed.shard new file mode 100644 index 0000000..780fd0e Binary files /dev/null and b/ucodec/testdata/safety/valid_compressed.shard differ diff --git a/ucodec/testdata/safety/valid_empty_entry.shard b/ucodec/testdata/safety/valid_empty_entry.shard new file mode 100644 index 0000000..1c50b54 Binary files /dev/null and b/ucodec/testdata/safety/valid_empty_entry.shard differ diff --git a/ucodec/testdata/safety/valid_long_name.shard b/ucodec/testdata/safety/valid_long_name.shard new file mode 100644 index 0000000..940b211 Binary files /dev/null and b/ucodec/testdata/safety/valid_long_name.shard differ diff --git a/ucodec/testdata/safety/valid_unicode_name.shard b/ucodec/testdata/safety/valid_unicode_name.shard new file mode 100644 index 0000000..e273646 Binary files /dev/null and b/ucodec/testdata/safety/valid_unicode_name.shard differ