diff --git a/package.json b/package.json index f234cb67..55f31691 100644 --- a/package.json +++ b/package.json @@ -58,11 +58,11 @@ "build": "rm -rf dist && tsc && pnpm build:lib && pnpm build:lib:cjs && pnpm build:browser && pnpm build:cli && pnpm build:shell && pnpm build:worker && pnpm build:clean && cp dist/index.d.ts dist/index.d.cts && sed '1,/^-->/d' AGENTS.npm.md > dist/AGENTS.md", "build:clean": "find dist -name '*.test.js' -delete && find dist -name '*.test.d.ts' -delete", "build:worker": "esbuild src/commands/python3/worker.ts --bundle --platform=node --format=esm --outfile=src/commands/python3/worker.js --external:../../../vendor/cpython-emscripten/* && cp src/commands/python3/worker.js dist/commands/python3/worker.js && mkdir -p dist/bin/chunks && cp src/commands/python3/worker.js dist/bin/chunks/worker.js && mkdir -p dist/bundle/chunks && cp src/commands/python3/worker.js dist/bundle/chunks/worker.js && esbuild src/commands/js-exec/worker.ts --bundle --platform=node --format=esm --outfile=src/commands/js-exec/worker.js --external:quickjs-emscripten && cp src/commands/js-exec/worker.js dist/commands/js-exec/worker.js && cp src/commands/js-exec/worker.js dist/bin/chunks/js-exec-worker.js && cp src/commands/js-exec/worker.js dist/bundle/chunks/js-exec-worker.js", - "build:lib": "esbuild dist/index.js --bundle --splitting --platform=node --format=esm --minify --outdir=dist/bundle --chunk-names=chunks/[name]-[hash] --external:diff --external:minimatch --external:sprintf-js --external:turndown --external:sql.js --external:quickjs-emscripten --external:@mongodb-js/zstd --external:node-liblzma --external:compressjs", - "build:lib:cjs": "esbuild dist/index.js --bundle --platform=node --format=cjs --minify --outfile=dist/bundle/index.cjs --external:diff --external:minimatch --external:sprintf-js --external:turndown --external:sql.js --external:quickjs-emscripten --external:@mongodb-js/zstd --external:node-liblzma --external:compressjs", - "build:browser": 
"esbuild dist/browser.js --bundle --platform=browser --format=esm --minify --outfile=dist/bundle/browser.js --external:diff --external:minimatch --external:sprintf-js --external:turndown --external:node:zlib --external:@mongodb-js/zstd --external:node-liblzma --external:compressjs --define:__BROWSER__=true --alias:node:dns=./src/shims/browser-unsupported.js", - "build:cli": "esbuild dist/cli/just-bash.js --bundle --splitting --platform=node --format=esm --minify --outdir=dist/bin --entry-names=[name] --chunk-names=chunks/[name]-[hash] --banner:js='#!/usr/bin/env node' --external:sql.js --external:quickjs-emscripten --external:@mongodb-js/zstd --external:node-liblzma --external:compressjs", - "build:shell": "esbuild dist/cli/shell.js --bundle --splitting --platform=node --format=esm --minify --outdir=dist/bin/shell --entry-names=[name] --chunk-names=chunks/[name]-[hash] --banner:js='#!/usr/bin/env node' --external:sql.js --external:quickjs-emscripten --external:@mongodb-js/zstd --external:node-liblzma --external:compressjs", + "build:lib": "esbuild dist/index.js --bundle --splitting --platform=node --format=esm --minify --outdir=dist/bundle --chunk-names=chunks/[name]-[hash] --external:diff --external:minimatch --external:sprintf-js --external:turndown --external:sql.js --external:quickjs-emscripten --external:@mongodb-js/zstd --external:node-liblzma --external:seek-bzip", + "build:lib:cjs": "esbuild dist/index.js --bundle --platform=node --format=cjs --minify --outfile=dist/bundle/index.cjs --external:diff --external:minimatch --external:sprintf-js --external:turndown --external:sql.js --external:quickjs-emscripten --external:@mongodb-js/zstd --external:node-liblzma --external:seek-bzip", + "build:browser": "esbuild dist/browser.js --bundle --platform=browser --format=esm --minify --outfile=dist/bundle/browser.js --external:diff --external:minimatch --external:sprintf-js --external:turndown --external:node:zlib --external:@mongodb-js/zstd --external:node-liblzma 
--external:seek-bzip --define:__BROWSER__=true --alias:node:dns=./src/shims/browser-unsupported.js", + "build:cli": "esbuild dist/cli/just-bash.js --bundle --splitting --platform=node --format=esm --minify --outdir=dist/bin --entry-names=[name] --chunk-names=chunks/[name]-[hash] --banner:js='#!/usr/bin/env node' --external:sql.js --external:quickjs-emscripten --external:@mongodb-js/zstd --external:node-liblzma --external:seek-bzip", + "build:shell": "esbuild dist/cli/shell.js --bundle --splitting --platform=node --format=esm --minify --outdir=dist/bin/shell --entry-names=[name] --chunk-names=chunks/[name]-[hash] --banner:js='#!/usr/bin/env node' --external:sql.js --external:quickjs-emscripten --external:@mongodb-js/zstd --external:node-liblzma --external:seek-bzip", "prepublishOnly": "pnpm validate", "validate": "pnpm lint && pnpm knip && pnpm typecheck && pnpm build && pnpm check:worker-sync && pnpm test:run && pnpm test:wasm && pnpm test:dist && pnpm test:examples", "test:examples": "cd examples/cjs-consumer && pnpm install --no-frozen-lockfile && npx tsc --noEmit", @@ -105,7 +105,7 @@ "vitest": "^4.0.16" }, "dependencies": { - "compressjs": "^1.0.3", + "seek-bzip": "^2.0.0", "diff": "^8.0.2", "fast-xml-parser": "^5.3.3", "file-type": "^21.2.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 29be728d..102b615e 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -5,9 +5,6 @@ settings: excludeLinksFromLockfile: false dependencies: - compressjs: - specifier: ^1.0.3 - version: 1.0.3 diff: specifier: ^8.0.2 version: 8.0.2 @@ -35,6 +32,9 @@ dependencies: re2js: specifier: ^1.2.1 version: 1.2.1 + seek-bzip: + specifier: ^2.0.0 + version: 2.0.0 smol-toml: specifier: ^1.6.0 version: 1.6.0 @@ -1093,11 +1093,6 @@ packages: tinyrainbow: 3.0.3 dev: true - /amdefine@1.0.1: - resolution: {integrity: sha512-S2Hw0TtNkMJhIabBwIojKL9YHO5T0n5eNqWJ7Lrlel/zDbftQpxpapi8tZs3X1HWa+u+QeydGmzzNU0m09+Rcg==} - engines: {node: '>=0.4.2'} - dev: false - /argparse@2.0.1: resolution: 
{integrity: sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==} dev: true @@ -1158,19 +1153,9 @@ packages: dev: false optional: true - /commander@2.8.1: - resolution: {integrity: sha512-+pJLBFVk+9ZZdlAOB5WuIElVPPth47hILFkmGym57aq8kwxsowvByvB0DHs1vQAhyMZzdcpTtF0VDKGkSDR4ZQ==} - engines: {node: '>= 0.6.x'} - dependencies: - graceful-readlink: 1.0.1 - dev: false - - /compressjs@1.0.3: - resolution: {integrity: sha512-jpKJjBTretQACTGLNuvnozP1JdP2ZLrjdGdBgk/tz1VfXlUcBhhSZW6vEsuThmeot/yjvSrPQKEgfF3X2Lpi8Q==} - hasBin: true - dependencies: - amdefine: 1.0.1 - commander: 2.8.1 + /commander@6.2.1: + resolution: {integrity: sha512-U7VdrJFnJgo4xjrHpTzu0yrHPGImdsmD95ZlgYSEajAn2JKzDhDTPG9kBTefmObL2w/ngeZnilk+OV9CG3d7UA==} + engines: {node: '>= 6'} dev: false /debug@4.4.3: @@ -1380,10 +1365,6 @@ packages: is-glob: 4.0.3 dev: true - /graceful-readlink@1.0.1: - resolution: {integrity: sha512-8tLu60LgxF6XpdbK8OW3FA+IfTNBn1ZHGHKF4KQbEeSkajYw5PlYJcKluntgegDPTg8UkHjpet1T82vk6TQ68w==} - dev: false - /has-flag@4.0.0: resolution: {integrity: sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==} engines: {node: '>=8'} @@ -1803,6 +1784,13 @@ packages: dev: false optional: true + /seek-bzip@2.0.0: + resolution: {integrity: sha512-SMguiTnYrhpLdk3PwfzHeotrcwi8bNV4iemL9tx9poR/yeaMYwB9VzR1w7b57DuWpuqR8n6oZboi0hj3AxZxQg==} + hasBin: true + dependencies: + commander: 6.2.1 + dev: false + /semver@7.7.3: resolution: {integrity: sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q==} engines: {node: '>=10'} diff --git a/src/commands/browser-excluded.ts b/src/commands/browser-excluded.ts index 4cb7fd33..5ef272ad 100644 --- a/src/commands/browser-excluded.ts +++ b/src/commands/browser-excluded.ts @@ -10,7 +10,7 @@ * is not available. 
*/ export const BROWSER_EXCLUDED_COMMANDS: readonly string[] = [ - "tar", // Uses native compression modules (@mongodb-js/zstd, node-liblzma, compressjs) + "tar", // Uses native compression modules (@mongodb-js/zstd, node-liblzma, seek-bzip) "yq", // Requires fast-xml-parser and other Node.js-specific parsing "xan", // Complex CSV/data processing with Node.js dependencies "sqlite3", // Uses sql.js (WASM) which requires Node.js worker threads diff --git a/src/commands/tar/archive.ts b/src/commands/tar/archive.ts index 261a4d7b..ab73efc0 100644 --- a/src/commands/tar/archive.ts +++ b/src/commands/tar/archive.ts @@ -5,8 +5,6 @@ * with optional gzip, bzip2, and xz compression. */ -// @ts-expect-error - compressjs doesn't have types -import compressjs from "compressjs"; import { createGzipDecoder, createGzipEncoder, @@ -16,7 +14,10 @@ import { type TarHeader, unpackTar, } from "modern-tar"; +// @ts-expect-error - seek-bzip doesn't have types +import seekBzip from "seek-bzip"; import { DefenseInDepthBox } from "../../security/defense-in-depth-box.js"; +import { bzip2Compress } from "./bzip2-compress.js"; // Lazy load node-liblzma since it requires native compilation // that may fail on some systems (e.g., missing liblzma-dev) @@ -353,25 +354,18 @@ export function isXzCompressed(data: Uint8Array): boolean { } /** - * bzip2 decompression using compressjs + * bzip2 decompression using seek-bzip (MIT licensed) */ async function decompressBzip2(data: Uint8Array): Promise { - const Bzip2 = compressjs.Bzip2; - // decompressFile accepts an array/buffer directly - const decompressed = Bzip2.decompressFile(Array.from(data)); + const decompressed: Buffer = seekBzip.decode(Buffer.from(data)); return new Uint8Array(decompressed); } /** - * bzip2 compression using compressjs + * bzip2 compression using our pure-JS implementation */ async function compressBzip2(data: Uint8Array): Promise { - const Bzip2 = compressjs.Bzip2; - // compressFile accepts input array and output array directly 
- // coerceInputStream/coerceOutputStream handle the conversion - const output: number[] = []; - Bzip2.compressFile(Array.from(data), output, 9); // block size level 9 - return new Uint8Array(output); + return bzip2Compress(data, 9); } /** diff --git a/src/commands/tar/bzip2-compress.test.ts b/src/commands/tar/bzip2-compress.test.ts new file mode 100644 index 00000000..532943fb --- /dev/null +++ b/src/commands/tar/bzip2-compress.test.ts @@ -0,0 +1,397 @@ +// @ts-expect-error - seek-bzip doesn't have types +import seekBzip from "seek-bzip"; +import { describe, expect, it } from "vitest"; +import { bzip2Compress } from "./bzip2-compress.js"; + +/** + * Helper: compress with our implementation, decompress with seek-bzip (MIT), + * and verify the roundtrip matches the original input. + */ +function roundtrip(input: Uint8Array): Uint8Array { + const compressed = bzip2Compress(input); + const decompressed = seekBzip.decode(Buffer.from(compressed)); + return new Uint8Array(decompressed); +} + +function expectRoundtrip(input: Uint8Array): void { + const output = roundtrip(input); + expect(Buffer.from(output).equals(Buffer.from(input))).toBe(true); +} + +describe("bzip2-compress", () => { + describe("basic roundtrip", () => { + it("should compress and decompress a single byte", () => { + expectRoundtrip(new Uint8Array([65])); + }); + + it("should compress and decompress a short ASCII string", () => { + expectRoundtrip(Buffer.from("hello")); + }); + + it("should compress and decompress a longer ASCII string", () => { + expectRoundtrip( + Buffer.from("The quick brown fox jumps over the lazy dog."), + ); + }); + + it("should compress and decompress repeated characters", () => { + expectRoundtrip(Buffer.from("AAAA")); + }); + + it("should compress and decompress the full printable ASCII range", () => { + const chars: number[] = []; + for (let i = 32; i < 127; i++) chars.push(i); + expectRoundtrip(new Uint8Array(chars)); + }); + }); + + describe("binary data", () => { + 
it("should handle all 256 byte values", () => { + const data = new Uint8Array(256); + for (let i = 0; i < 256; i++) data[i] = i; + expectRoundtrip(data); + }); + + it("should handle all 256 byte values in reverse order", () => { + const data = new Uint8Array(256); + for (let i = 0; i < 256; i++) data[i] = 255 - i; + expectRoundtrip(data); + }); + + it("should handle null bytes", () => { + expectRoundtrip(new Uint8Array([0, 0, 0, 0, 0])); + }); + + it("should handle 0xFF bytes", () => { + expectRoundtrip(new Uint8Array([255, 255, 255, 255, 255])); + }); + + it("should handle alternating 0x00 and 0xFF", () => { + const data = new Uint8Array(100); + for (let i = 0; i < 100; i++) data[i] = i % 2 === 0 ? 0x00 : 0xff; + expectRoundtrip(data); + }); + + it("should handle random-looking binary data", () => { + // Deterministic pseudo-random via LCG + const data = new Uint8Array(1000); + let seed = 12345; + for (let i = 0; i < data.length; i++) { + seed = (seed * 1103515245 + 12345) & 0x7fffffff; + data[i] = seed & 0xff; + } + expectRoundtrip(data); + }); + + it("should handle binary data with only two distinct byte values", () => { + const data = new Uint8Array(500); + for (let i = 0; i < 500; i++) data[i] = i % 3 === 0 ? 
0xab : 0xcd; + expectRoundtrip(data); + }); + }); + + describe("run-length edge cases (RLE1)", () => { + it("should handle exactly 3 repeated bytes (below RLE1 threshold)", () => { + expectRoundtrip(Buffer.from("aaabbb")); + }); + + it("should handle exactly 4 repeated bytes (RLE1 boundary)", () => { + expectRoundtrip(new Uint8Array(4).fill(42)); + }); + + it("should handle exactly 5 repeated bytes", () => { + expectRoundtrip(new Uint8Array(5).fill(42)); + }); + + it("should handle 255 repeated bytes (max RLE1 run)", () => { + expectRoundtrip(new Uint8Array(255).fill(99)); + }); + + it("should handle 256 repeated bytes (exceeds single RLE1 run)", () => { + expectRoundtrip(new Uint8Array(256).fill(99)); + }); + + it("should handle alternating runs of different bytes", () => { + const data: number[] = []; + for (let i = 0; i < 10; i++) { + for (let j = 0; j < 10; j++) data.push(i); + } + expectRoundtrip(new Uint8Array(data)); + }); + + it("should handle many short runs interspersed", () => { + const data: number[] = []; + for (let i = 0; i < 50; i++) { + data.push(i & 0xff, i & 0xff, i & 0xff); // runs of 3 + } + expectRoundtrip(new Uint8Array(data)); + }); + }); + + describe("BWT edge cases", () => { + it("should handle single distinct byte repeated", () => { + expectRoundtrip(new Uint8Array(100).fill(0)); + }); + + it("should handle already-sorted data", () => { + const data = new Uint8Array(100); + for (let i = 0; i < 100; i++) data[i] = i; + expectRoundtrip(data); + }); + + it("should handle reverse-sorted data", () => { + const data = new Uint8Array(100); + for (let i = 0; i < 100; i++) data[i] = 99 - i; + expectRoundtrip(data); + }); + + it("should handle periodic data (short period)", () => { + const pattern = [1, 2, 3]; + const data = new Uint8Array(300); + for (let i = 0; i < 300; i++) data[i] = pattern[i % pattern.length]; + expectRoundtrip(data); + }); + + it("should handle data with long identical prefix then different suffix", () => { + const data = new 
Uint8Array(200); + data.fill(65); + data[199] = 66; + expectRoundtrip(data); + }); + }); + + describe("MTF / Huffman edge cases", () => { + it("should handle data producing many MTF zeros (high compressibility)", () => { + // Sorted data → BWT produces many runs of same byte → MTF zeros + const data = Buffer.from("aaaaabbbbbcccccdddddeeeee"); + expectRoundtrip(data); + }); + + it("should handle data producing few MTF zeros (low compressibility)", () => { + // Random-looking data → fewer MTF zeros + const data = Buffer.from("qwertyuiopasdfghjklzxcvbnm1234567890"); + expectRoundtrip(data); + }); + + it("should handle data with exactly one unique symbol", () => { + expectRoundtrip(new Uint8Array(50).fill(0x42)); + }); + + it("should handle data with exactly two unique symbols", () => { + const data = new Uint8Array(100); + for (let i = 0; i < 100; i++) data[i] = i < 50 ? 0x41 : 0x42; + expectRoundtrip(data); + }); + + it("should handle data where all 256 byte values appear", () => { + const data = new Uint8Array(512); + for (let i = 0; i < 512; i++) data[i] = i & 0xff; + expectRoundtrip(data); + }); + }); + + describe("larger data", () => { + it("should handle 10 KB of text", () => { + const text = "The quick brown fox jumps over the lazy dog. 
"; + const repeated = text.repeat(Math.ceil(10240 / text.length)); + expectRoundtrip(Buffer.from(repeated.slice(0, 10240))); + }); + + it("should handle 50 KB of mixed content", () => { + const data = new Uint8Array(50 * 1024); + let seed = 42; + for (let i = 0; i < data.length; i++) { + seed = (seed * 1103515245 + 12345) & 0x7fffffff; + data[i] = seed & 0xff; + } + expectRoundtrip(data); + }); + + it("should handle 100 KB of highly compressible data", () => { + const data = new Uint8Array(100 * 1024); + for (let i = 0; i < data.length; i++) { + data[i] = i % 4; // only 4 distinct values + } + expectRoundtrip(data); + }); + + it("should handle data spanning multiple bzip2 blocks (block size 1)", () => { + // Block size 1 = 100KB blocks, so 150KB should span 2 blocks + const data = new Uint8Array(150 * 1024); + let seed = 7; + for (let i = 0; i < data.length; i++) { + seed = (seed * 1103515245 + 12345) & 0x7fffffff; + data[i] = seed & 0xff; + } + const compressed = bzip2Compress(data, 1); + const decompressed = seekBzip.decode(Buffer.from(compressed)); + expect(Buffer.from(decompressed).equals(Buffer.from(data))).toBe(true); + }); + }); + + describe("size limits", () => { + it("should reject input exceeding 10MB", () => { + const data = new Uint8Array(10 * 1024 * 1024 + 1); + expect(() => bzip2Compress(data)).toThrow("Input too large"); + }); + + it("should accept input at exactly 10MB", () => { + // Just verify it doesn't throw — don't actually compress 10MB in tests + // (would be too slow). Instead test with a smaller size to verify the + // limit check logic works. + const data = new Uint8Array(1); + expect(() => bzip2Compress(data)).not.toThrow(); + }); + }); + + describe("block size levels", () => { + const testData = Buffer.from("Block size test data! 
".repeat(100)); + + it("should work with block size level 1", () => { + const compressed = bzip2Compress(testData, 1); + const decompressed = seekBzip.decode(Buffer.from(compressed)); + expect(Buffer.from(decompressed).equals(testData)).toBe(true); + }); + + it("should work with block size level 5", () => { + const compressed = bzip2Compress(testData, 5); + const decompressed = seekBzip.decode(Buffer.from(compressed)); + expect(Buffer.from(decompressed).equals(testData)).toBe(true); + }); + + it("should work with block size level 9", () => { + const compressed = bzip2Compress(testData, 9); + const decompressed = seekBzip.decode(Buffer.from(compressed)); + expect(Buffer.from(decompressed).equals(testData)).toBe(true); + }); + + it("should reject block size level 0", () => { + expect(() => bzip2Compress(testData, 0)).toThrow( + "Block size level must be 1-9", + ); + }); + + it("should reject block size level 10", () => { + expect(() => bzip2Compress(testData, 10)).toThrow( + "Block size level must be 1-9", + ); + }); + }); + + describe("bzip2 format compliance", () => { + it("should produce valid bzip2 header", () => { + const compressed = bzip2Compress(Buffer.from("test"), 9); + // BZh9 header + expect(compressed[0]).toBe(0x42); // 'B' + expect(compressed[1]).toBe(0x5a); // 'Z' + expect(compressed[2]).toBe(0x68); // 'h' + expect(compressed[3]).toBe(0x39); // '9' (block size level) + }); + + it("should encode block size level in header", () => { + for (let level = 1; level <= 9; level++) { + const compressed = bzip2Compress(Buffer.from("x"), level); + expect(compressed[3]).toBe(0x30 + level); + } + }); + + it("should produce valid block magic bytes", () => { + const compressed = bzip2Compress(Buffer.from("test"), 9); + // Block magic: 0x314159265359 + expect(compressed[4]).toBe(0x31); + expect(compressed[5]).toBe(0x41); + expect(compressed[6]).toBe(0x59); + expect(compressed[7]).toBe(0x26); + expect(compressed[8]).toBe(0x53); + expect(compressed[9]).toBe(0x59); + }); 
+ + it("should produce output compatible with system bzip2 decompressor", () => { + // seek-bzip is a well-tested bzip2 decompressor — if it can decode + // our output, we're producing valid bzip2. + const inputs = [ + Buffer.from(""), + Buffer.from("a"), + Buffer.from("Hello, World!"), + new Uint8Array(1000).fill(0), + ]; + for (const input of inputs) { + if (input.length === 0) continue; // bzip2 doesn't compress empty + const compressed = bzip2Compress(input); + const decompressed = seekBzip.decode(Buffer.from(compressed)); + expect(Buffer.from(decompressed).equals(Buffer.from(input))).toBe(true); + } + }); + }); + + describe("special content patterns", () => { + it("should handle newlines and carriage returns", () => { + expectRoundtrip(Buffer.from("line1\nline2\r\nline3\rline4\n")); + }); + + it("should handle null-terminated strings", () => { + expectRoundtrip(Buffer.from("hello\x00world\x00")); + }); + + it("should handle UTF-8 multibyte sequences", () => { + expectRoundtrip(Buffer.from("こんにちは世界 🌍 café résumé")); + }); + + it("should handle data that looks like bzip2 headers (no confusion)", () => { + // Data containing BZh magic and block magic bytes + const data = Buffer.from("BZh9\x31\x41\x59\x26\x53\x59fake"); + expectRoundtrip(data); + }); + + it("should handle tar-like data (512-byte aligned blocks)", () => { + // Simulate a small tar header block + const data = new Uint8Array(1024); + // Fill with typical tar header pattern: name + nulls + mode bytes + Buffer.from("test-file.txt").copy(Buffer.from(data.buffer), 0); + Buffer.from("0000644\x00").copy(Buffer.from(data.buffer), 100); + Buffer.from("0001750\x00").copy(Buffer.from(data.buffer), 108); + expectRoundtrip(data); + }); + + it("should handle highly repetitive JSON", () => { + const json = JSON.stringify( + Array.from({ length: 100 }, (_, i) => ({ + id: i, + name: "test", + value: 42, + })), + ); + expectRoundtrip(Buffer.from(json)); + }); + + it("should handle data with long runs then random data 
then long runs", () => { + const data = new Uint8Array(1000); + // First 300 bytes: all 'A' + data.fill(65, 0, 300); + // Middle 400 bytes: pseudo-random + let seed = 999; + for (let i = 300; i < 700; i++) { + seed = (seed * 1103515245 + 12345) & 0x7fffffff; + data[i] = seed & 0xff; + } + // Last 300 bytes: all 'Z' + data.fill(90, 700, 1000); + expectRoundtrip(data); + }); + }); + + describe("decompression with seek-bzip", () => { + it("should decompress output from system bzip2", () => { + // Pre-computed bzip2 of "AAAA" using macOS system bzip2 + // (verified via: printf 'AAAA' | /usr/bin/bzip2 -c | xxd -i) + const systemCompressed = new Uint8Array([ + 0x42, 0x5a, 0x68, 0x39, 0x31, 0x41, 0x59, 0x26, 0x53, 0x59, 0xe1, 0x6e, + 0x65, 0x71, 0x00, 0x00, 0x02, 0x44, 0x00, 0x40, 0x00, 0x20, 0x00, 0x20, + 0x00, 0x21, 0x00, 0x82, 0x0b, 0x17, 0x72, 0x45, 0x38, 0x50, 0x90, 0xe1, + 0x6e, 0x65, 0x71, + ]); + const decoded = seekBzip.decode(Buffer.from(systemCompressed)); + expect(decoded.toString()).toBe("AAAA"); + }); + }); +}); diff --git a/src/commands/tar/bzip2-compress.ts b/src/commands/tar/bzip2-compress.ts new file mode 100644 index 00000000..1d81f491 --- /dev/null +++ b/src/commands/tar/bzip2-compress.ts @@ -0,0 +1,760 @@ +/** + * Pure JavaScript bzip2 compressor. + * + * Implements the bzip2 compression algorithm (public domain, Julian Seward 1996). + * Pipeline: RLE1 → BWT → MTF → RLE2 (RUNA/RUNB) → Huffman → bitstream output. + * + * This exists because no permissively-licensed JS bzip2 compressor is available + * on npm. Decompression uses the MIT-licensed `seek-bzip` package instead. + */ + +// ---------- CRC32 for bzip2 (same polynomial as standard CRC32) ---------- + +const CRC32_TABLE: Uint32Array = (() => { + const table = new Uint32Array(256); + for (let i = 0; i < 256; i++) { + let c = i << 24; + for (let j = 0; j < 8; j++) { + c = c & 0x80000000 ? 
(c << 1) ^ 0x04c11db7 : c << 1; + } + table[i] = c >>> 0; + } + return table; +})(); + +function crc32Update(crc: number, byte: number): number { + return ((crc << 8) ^ CRC32_TABLE[((crc >>> 24) ^ byte) & 0xff]) >>> 0; +} + +// ---------- Bit writer ---------- + +class BitWriter { + private buffer: number[] = []; + private current = 0; + private bitCount = 0; + + writeBits(n: number, value: number): void { + for (let i = n - 1; i >= 0; i--) { + this.current = (this.current << 1) | ((value >>> i) & 1); + this.bitCount++; + if (this.bitCount === 8) { + this.buffer.push(this.current); + this.current = 0; + this.bitCount = 0; + } + } + } + + writeBit(value: number): void { + this.current = (this.current << 1) | (value & 1); + this.bitCount++; + if (this.bitCount === 8) { + this.buffer.push(this.current); + this.current = 0; + this.bitCount = 0; + } + } + + finish(): Uint8Array { + if (this.bitCount > 0) { + this.buffer.push(this.current << (8 - this.bitCount)); + } + return new Uint8Array(this.buffer); + } +} + +// ---------- RLE1: Initial run-length encoding ---------- +// Runs of 4+ identical bytes become: byte byte byte byte (count-4) +// where count-4 is 0..251 + +function rle1Encode(data: Uint8Array): Uint8Array { + const out: number[] = []; + let i = 0; + while (i < data.length) { + const ch = data[i]; + let runLen = 1; + while ( + i + runLen < data.length && + data[i + runLen] === ch && + runLen < 255 + ) { + runLen++; + } + if (runLen >= 4) { + out.push(ch, ch, ch, ch); + out.push(runLen - 4); + i += runLen; + } else { + out.push(ch); + i++; + } + } + return new Uint8Array(out); +} + +// ---------- Burrows-Wheeler Transform ---------- +// Uses prefix-doubling suffix array construction: O(n log² n) time, O(n) space. 
+ +function bwt(data: Uint8Array): { transformed: Uint8Array; pointer: number } { + const n = data.length; + if (n === 0) { + return { transformed: new Uint8Array(0), pointer: 0 }; + } + + // Build suffix array of the doubled data (rotation = suffix of doubled string) + // using prefix-doubling algorithm. + const sa = buildSuffixArrayForRotations(data); + + const transformed = new Uint8Array(n); + let pointer = 0; + for (let i = 0; i < n; i++) { + if (sa[i] === 0) { + pointer = i; + transformed[i] = data[n - 1]; + } else { + transformed[i] = data[sa[i] - 1]; + } + } + + return { transformed, pointer }; +} + +/** + * Build a suffix array for circular rotations using prefix doubling. + * O(n log² n) time with O(n) space — fast enough for bzip2 blocks up to 900KB. + */ +function buildSuffixArrayForRotations(data: Uint8Array): Int32Array { + const n = data.length; + const sa = new Int32Array(n); + const rank = new Int32Array(n); + + // Initialize ranks from byte values + for (let i = 0; i < n; i++) { + sa[i] = i; + rank[i] = data[i]; + } + + // Prefix doubling: sort by first 2^k characters of each rotation + for (let gap = 1; gap < n; gap *= 2) { + // Sort by (rank[i], rank[(i+gap)%n]) + // Use a comparison-based sort with the current rank array + const r = rank.slice(); // snapshot ranks before sorting + sa.sort((a, b) => { + if (r[a] !== r[b]) return r[a] - r[b]; + return r[(a + gap) % n] - r[(b + gap) % n]; + }); + + // Recompute ranks based on sorted order + rank[sa[0]] = 0; + for (let i = 1; i < n; i++) { + // Same rank if both primary and secondary keys match + if ( + r[sa[i]] === r[sa[i - 1]] && + r[(sa[i] + gap) % n] === r[(sa[i - 1] + gap) % n] + ) { + rank[sa[i]] = rank[sa[i - 1]]; + } else { + rank[sa[i]] = rank[sa[i - 1]] + 1; + } + } + + // If all ranks are unique, we're done + if (rank[sa[n - 1]] === n - 1) break; + } + + return sa; +} + +// ---------- Move-to-Front transform ---------- + +function mtfEncode( + data: Uint8Array, + symbolsInUse: boolean[], 
+): { encoded: Uint16Array; length: number } { + // Build initial MTF list from symbols actually in use + const mtfList: number[] = []; + for (let i = 0; i < 256; i++) { + if (symbolsInUse[i]) { + mtfList.push(i); + } + } + + // Map byte values to their index in the used-symbols list + const byteToIndex: number[] = new Array(256).fill(-1); + for (let i = 0; i < mtfList.length; i++) { + byteToIndex[mtfList[i]] = i; + } + + const encoded = new Uint16Array(data.length); + const list = mtfList.slice(); // working copy + + for (let i = 0; i < data.length; i++) { + const byte = data[i]; + // Find position of this byte in the MTF list + let pos = 0; + while (list[pos] !== byte) pos++; + + encoded[i] = pos; + + // Move to front + if (pos > 0) { + const val = list[pos]; + for (let j = pos; j > 0; j--) { + list[j] = list[j - 1]; + } + list[0] = val; + } + } + + return { encoded, length: data.length }; +} + +// ---------- RLE2: Zero run-length encoding (RUNA/RUNB) ---------- +// Zeros are encoded as base-2 sequences of RUNA(0) and RUNB(1) +// Non-zero values are incremented by 1 + +function rle2Encode( + mtfData: Uint16Array, + mtfLen: number, + numSymbolsInUse: number, +): { symbols: Uint16Array; length: number; eob: number } { + const RUNA = 0; + const RUNB = 1; + const eob = numSymbolsInUse + 1; // End-of-block symbol + + const symbols: number[] = []; + let i = 0; + + while (i < mtfLen) { + if (mtfData[i] === 0) { + // Count run of zeros + let runLen = 0; + while (i < mtfLen && mtfData[i] === 0) { + runLen++; + i++; + } + // Encode run length as RUNA/RUNB sequence (bijective base-2) + // runLen = sum of (digit+1) * 2^position + // 1 → RUNA, 2 → RUNB, 3 → RUNA RUNA, 4 → RUNB RUNA, etc. 
+ let n = runLen; + while (n > 0) { + n--; + if (n & 1) { + symbols.push(RUNB); + } else { + symbols.push(RUNA); + } + n >>>= 1; + } + } else { + // Non-zero MTF values are incremented by 1 (to make room for RUNA/RUNB) + symbols.push(mtfData[i] + 1); + i++; + } + } + + symbols.push(eob); + + const result = new Uint16Array(symbols.length); + for (let j = 0; j < symbols.length; j++) { + result[j] = symbols[j]; + } + + return { symbols: result, length: symbols.length, eob }; +} + +// ---------- Huffman coding ---------- + +interface HuffmanTable { + codeLengths: number[]; + maxLen: number; + minLen: number; +} + +function buildHuffmanTable( + freqs: number[], + numSymbols: number, + maxCodeLen: number, +): HuffmanTable { + // Build Huffman tree using package-merge algorithm for length-limited codes + // First, use a simple approach: build standard Huffman, then limit lengths + + if (numSymbols <= 1) { + const lengths = new Array(freqs.length).fill(0); + for (let i = 0; i < freqs.length; i++) { + if (freqs[i] > 0) lengths[i] = 1; + } + return { codeLengths: lengths, maxLen: 1, minLen: 1 }; + } + + // Standard Huffman tree construction + interface HNode { + freq: number; + symbol: number; + left: HNode | null; + right: HNode | null; + } + + const nodes: HNode[] = []; + for (let i = 0; i < freqs.length; i++) { + if (freqs[i] > 0) { + nodes.push({ freq: freqs[i], symbol: i, left: null, right: null }); + } + } + + if (nodes.length === 0) { + return { + codeLengths: new Array(freqs.length).fill(0), + maxLen: 0, + minLen: 0, + }; + } + + if (nodes.length === 1) { + const lengths = new Array(freqs.length).fill(0); + lengths[nodes[0].symbol] = 1; + return { codeLengths: lengths, maxLen: 1, minLen: 1 }; + } + + // Build tree + while (nodes.length > 1) { + nodes.sort((a, b) => a.freq - b.freq); + // nodes.length > 1 guarantees both shifts return a value + const left = nodes.shift() as HNode; + const right = nodes.shift() as HNode; + nodes.push({ + freq: left.freq + right.freq, + 
symbol: -1, + left, + right, + }); + } + + // Extract code lengths + const codeLengths = new Array(freqs.length).fill(0); + function traverse(node: HNode, depth: number): void { + if (!node.left && !node.right) { + codeLengths[node.symbol] = depth; + return; + } + if (node.left) traverse(node.left, depth + 1); + if (node.right) traverse(node.right, depth + 1); + } + traverse(nodes[0], 0); + + // Limit code lengths to maxCodeLen using the heuristic approach + let changed = true; + while (changed) { + changed = false; + for (let i = 0; i < codeLengths.length; i++) { + if (codeLengths[i] > maxCodeLen) { + codeLengths[i] = maxCodeLen; + changed = true; + } + } + if (changed) { + // Verify Kraft inequality and adjust if needed + let kraft = 0; + for (let i = 0; i < codeLengths.length; i++) { + if (codeLengths[i] > 0) { + kraft += 1.0 / (1 << codeLengths[i]); + } + } + if (kraft > 1.0) { + // Need to increase some shorter codes + // Find the shortest code and increase it + for (let len = 1; len < maxCodeLen && kraft > 1.0; len++) { + for (let i = 0; i < codeLengths.length && kraft > 1.0; i++) { + if (codeLengths[i] === len) { + codeLengths[i]++; + kraft -= 1.0 / (1 << len) - 1.0 / (1 << (len + 1)); + } + } + } + } + changed = false; // We've adjusted, check again + for (let i = 0; i < codeLengths.length; i++) { + if (codeLengths[i] > maxCodeLen) { + changed = true; + break; + } + } + } + } + + let minLen = maxCodeLen; + let actualMaxLen = 0; + for (let i = 0; i < codeLengths.length; i++) { + if (codeLengths[i] > 0) { + if (codeLengths[i] < minLen) minLen = codeLengths[i]; + if (codeLengths[i] > actualMaxLen) actualMaxLen = codeLengths[i]; + } + } + + return { codeLengths, maxLen: actualMaxLen, minLen }; +} + +function generateCanonicalCodes( + codeLengths: number[], + numSymbols: number, +): { codes: number[]; lengths: number[] } { + const codes = new Array(numSymbols).fill(0); + const lengths = codeLengths.slice(0, numSymbols); + + // Count codes of each length + const 
/**
 * Encode one block of input and append it to `writer`.
 *
 * Stages, in the order their output reaches the stream: block magic, block
 * CRC, randomized flag, BWT pointer, symbol-usage map, Huffman table count,
 * selector count, MTF/unary-coded selectors, delta-coded code lengths, and
 * finally the Huffman-coded symbol stream. The symbol stream is produced by
 * RLE1 -> BWT -> MTF -> RLE2, and is coded in runs of 50 symbols, each run
 * using whichever table is cheapest for it.
 *
 * @param writer - Bit sink that receives the encoded block
 * @param blockData - Raw (pre-RLE1) bytes of this block
 * @param blockCRC - CRC of `blockData`, written verbatim into the block header
 */
function compressBlock(
  writer: BitWriter,
  blockData: Uint8Array,
  blockCRC: number,
): void {
  const GROUP_SIZE = 50; // symbols covered by one selector
  const MAX_CODE_LEN = 20; // longest Huffman code requested from buildHuffmanTable
  const REFINEMENT_PASSES = 4;

  // Block header magic 0x314159265359 (pi), block CRC, then the randomized
  // flag (always 0 in modern bzip2).
  writer.writeBits(24, 0x314159);
  writer.writeBits(24, 0x265359);
  writer.writeBits(32, blockCRC);
  writer.writeBit(0);

  // RLE1 followed by the Burrows-Wheeler transform; the pointer takes 24 bits.
  const { transformed, pointer } = bwt(rle1Encode(blockData));
  writer.writeBits(24, pointer);

  // Which byte values occur in the transformed block.
  const symbolsInUse: boolean[] = new Array(256).fill(false);
  for (let i = 0; i < transformed.length; i++) {
    symbolsInUse[transformed[i]] = true;
  }

  // Symbol map: 16 "row has symbols" bits, then 16 bits for every such row.
  let numSymbolsInUse = 0;
  const rowHasSymbols: boolean[] = [];
  for (let row = 0; row < 16; row++) {
    const usedInRow = symbolsInUse
      .slice(row * 16, row * 16 + 16)
      .filter((flag) => flag).length;
    numSymbolsInUse += usedInRow;
    rowHasSymbols.push(usedInRow > 0);
  }
  for (const hasSymbols of rowHasSymbols) {
    writer.writeBit(hasSymbols ? 1 : 0);
  }
  rowHasSymbols.forEach((hasSymbols, row) => {
    if (!hasSymbols) return;
    for (let col = 0; col < 16; col++) {
      writer.writeBit(symbolsInUse[row * 16 + col] ? 1 : 0);
    }
  });

  // Move-to-front, then the second run-length stage.
  const mtf = mtfEncode(transformed, symbolsInUse);
  const rle2 = rle2Encode(mtf.encoded, mtf.length, numSymbolsInUse);
  const symbols = rle2.symbols;
  const symLen = rle2.length;

  // Alphabet: RUNA, RUNB, symbols 1..n, EOB.
  const alphaSize = numSymbolsInUse + 2;
  const nSelectors = Math.ceil(symLen / GROUP_SIZE);

  // Table count grows with the amount of data (bzip2 uses 2-6 tables), but
  // never exceeds the number of selectors unless that would drop it below 2.
  const sizeLimits = [200, 600, 1200, 2400];
  const limitIdx = sizeLimits.findIndex((limit) => symLen < limit);
  let nGroups = limitIdx === -1 ? 6 : limitIdx + 2;
  if (nGroups > nSelectors) {
    nGroups = Math.max(2, nSelectors);
  }

  // Half-open symbol range [from, to) covered by selector `s`.
  const chunkBounds = (s: number): [number, number] => {
    const from = s * GROUP_SIZE;
    return [from, Math.min(from + GROUP_SIZE, symLen)];
  };

  // Start by dealing chunks out to the tables round-robin and counting the
  // symbol frequencies each table would see.
  const groupFreqs: number[][] = Array.from({ length: nGroups }, () =>
    new Array<number>(alphaSize).fill(0),
  );
  const selectors = new Int32Array(nSelectors);
  for (let s = 0; s < nSelectors; s++) {
    selectors[s] = s % nGroups;
    const [from, to] = chunkBounds(s);
    const freqs = groupFreqs[selectors[s]];
    for (let i = from; i < to; i++) {
      freqs[symbols[i]]++;
    }
  }

  // Build one table per group from the current frequencies. Every symbol is
  // given a frequency of at least 1 first, because bzip2 requires every
  // symbol in the alphabet to have a valid code (1-20).
  const buildTables = (): HuffmanTable[] =>
    groupFreqs.map((freqs) => {
      for (let sym = 0; sym < alphaSize; sym++) {
        if (freqs[sym] < 1) freqs[sym] = 1;
      }
      return buildHuffmanTable(freqs, alphaSize, MAX_CODE_LEN);
    });

  let tables = buildTables();

  // Refinement: give each chunk to the table that encodes it in the fewest
  // bits, recount, rebuild the tables, repeat.
  for (let pass = 0; pass < REFINEMENT_PASSES; pass++) {
    for (const freqs of groupFreqs) {
      freqs.fill(0);
    }

    for (let s = 0; s < nSelectors; s++) {
      const [from, to] = chunkBounds(s);

      let cheapest = 0;
      let cheapestCost = Infinity;
      for (let t = 0; t < nGroups; t++) {
        const lens = tables[t].codeLengths;
        let cost = 0;
        for (let i = from; i < to; i++) {
          cost += lens[symbols[i]] || MAX_CODE_LEN;
        }
        // Strict comparison: on a tie the lowest-numbered table wins.
        if (cost < cheapestCost) {
          cheapestCost = cost;
          cheapest = t;
        }
      }

      selectors[s] = cheapest;
      const freqs = groupFreqs[cheapest];
      for (let i = from; i < to; i++) {
        freqs[symbols[i]]++;
      }
    }

    tables = buildTables();
  }

  // Number of Huffman tables (3 bits) and number of selectors (15 bits).
  writer.writeBits(3, nGroups);
  writer.writeBits(15, nSelectors);

  // Selectors: move-to-front rank of the table index, written in unary
  // (`rank` one-bits followed by a zero-bit).
  const recency: number[] = Array.from({ length: nGroups }, (_, g) => g);
  for (let s = 0; s < nSelectors; s++) {
    const tableIdx = selectors[s];
    const rank = recency.indexOf(tableIdx);
    for (let ones = 0; ones < rank; ones++) {
      writer.writeBit(1);
    }
    writer.writeBit(0);
    if (rank > 0) {
      recency.splice(rank, 1);
      recency.unshift(tableIdx);
    }
  }

  // Code lengths per table: 5-bit starting length, then for every symbol a
  // run of "10" (increment) / "11" (decrement) steps closed by a "0".
  for (const table of tables) {
    const lens = table.codeLengths;
    let cursor = lens[0];
    writer.writeBits(5, cursor);

    for (let sym = 0; sym < alphaSize; sym++) {
      const wanted = lens[sym];
      for (; cursor < wanted; cursor++) {
        writer.writeBit(1);
        writer.writeBit(0);
      }
      for (; cursor > wanted; cursor--) {
        writer.writeBit(1);
        writer.writeBit(1);
      }
      writer.writeBit(0);
    }
  }

  // Symbol stream: each chunk is coded with the canonical codes of the table
  // its selector picked.
  const encoders = tables.map((table) =>
    generateCanonicalCodes(table.codeLengths, alphaSize),
  );
  for (let s = 0; s < nSelectors; s++) {
    const encoder = encoders[selectors[s]];
    const [from, to] = chunkBounds(s);
    for (let i = from; i < to; i++) {
      const sym = symbols[i];
      const bitCount = encoder.lengths[sym];
      if (bitCount > 0) {
        writer.writeBits(bitCount, encoder.codes[sym]);
      }
    }
  }
}
// Default maximum input size to prevent runaway compute (10MB).
// BWT is O(n log² n) which is acceptable up to this limit.
const DEFAULT_MAX_COMPRESS_SIZE = 10 * 1024 * 1024;

/**
 * Compress data using the bzip2 algorithm.
 *
 * Output layout: the 4-byte stream header `BZh<level>`, one compressed block
 * per `blockSizeLevel * 100000` input bytes, the stream footer magic, and the
 * combined CRC of all blocks. Empty input produces a stream with no blocks
 * and a combined CRC of 0.
 *
 * @param data - Input data to compress
 * @param blockSizeLevel - Block size level, an integer 1-9 (x 100KB), default 9
 * @param maxSize - Maximum input size in bytes (default 10MB)
 * @returns Compressed bzip2 data
 * @throws Error if `blockSizeLevel` is not an integer in 1-9, or if `data`
 *         is longer than `maxSize`
 */
export function bzip2Compress(
  data: Uint8Array,
  blockSizeLevel: number = 9,
  maxSize: number = DEFAULT_MAX_COMPRESS_SIZE,
): Uint8Array {
  // Number.isInteger also rejects NaN and +/-Infinity. A plain range check
  // lets NaN through (both comparisons are false), which makes the block size
  // NaN and ends the block loop after a single empty slice; a fractional
  // level would put a non-digit into the stream header.
  if (
    !Number.isInteger(blockSizeLevel) ||
    blockSizeLevel < 1 ||
    blockSizeLevel > 9
  ) {
    throw new Error("Block size level must be 1-9");
  }
  if (data.length > maxSize) {
    throw new Error(
      `Input too large for bzip2 compression (${data.length} bytes, max ${maxSize})`,
    );
  }

  // NOTE(review): blocks are cut from the raw input, and compressBlock runs
  // RLE1 before the BWT. Whether the post-RLE1 block always fits the size
  // announced in the header depends on rle1Encode - confirm.
  const blockSize = blockSizeLevel * 100000;
  const writer = new BitWriter();

  // Stream header: "BZh" followed by the level as an ASCII digit.
  writer.writeBits(8, 0x42); // 'B'
  writer.writeBits(8, 0x5a); // 'Z'
  writer.writeBits(8, 0x68); // 'h'
  writer.writeBits(8, 0x30 + blockSizeLevel); // '0' + level

  let combinedCRC = 0;

  for (let offset = 0; offset < data.length; offset += blockSize) {
    const end = Math.min(offset + blockSize, data.length);
    const blockData = data.subarray(offset, end);

    // Block CRC over the raw block bytes: start at all ones, invert at the end.
    let blockCRC = 0xffffffff;
    for (let i = 0; i < blockData.length; i++) {
      blockCRC = crc32Update(blockCRC, blockData[i]);
    }
    blockCRC = ~blockCRC >>> 0;

    // Combined CRC: rotate left by one bit, then XOR in this block's CRC.
    // `>>> 0` keeps the value an unsigned 32-bit integer.
    combinedCRC = ((combinedCRC << 1) | (combinedCRC >>> 31)) >>> 0;
    combinedCRC = (combinedCRC ^ blockCRC) >>> 0;

    compressBlock(writer, blockData, blockCRC);
  }

  // Stream footer magic: 0x177245385090 (sqrt(pi)), then the combined CRC.
  writer.writeBits(24, 0x177245);
  writer.writeBits(24, 0x385090);
  writer.writeBits(32, combinedCRC);

  return writer.finish();
}