diff --git a/src/commands/base64/base64.binary.test.ts b/src/commands/base64/base64.binary.test.ts
index 0c673d99..be0b8c9d 100644
--- a/src/commands/base64/base64.binary.test.ts
+++ b/src/commands/base64/base64.binary.test.ts
@@ -130,5 +130,72 @@ describe("base64 with binary data", () => {
 
       expect(result.stdout).toBe("test content");
     });
+
+    it("should handle large binary files (1MB+)", async () => {
+      // Create a 1MB binary file with all byte values repeated
+      const size = 1024 * 1024; // 1MB
+      const data = new Uint8Array(size);
+      for (let i = 0; i < size; i++) {
+        data[i] = i % 256;
+      }
+
+      const env = new Bash({
+        files: {
+          "/large.bin": data,
+        },
+      });
+
+      // Encode the large file
+      await env.exec("base64 /large.bin > /encoded.txt");
+
+      // Decode it back
+      await env.exec("base64 -d /encoded.txt > /decoded.bin");
+
+      // Verify the decoded file matches the original
+      const decoded = await env.fs.readFileBuffer(
+        env.fs.resolvePath("/", "/decoded.bin"),
+      );
+
+      expect(decoded.length).toBe(size);
+      // Check first, middle, and last bytes
+      expect(decoded[0]).toBe(0);
+      expect(decoded[255]).toBe(255);
+      expect(decoded[size / 2]).toBe((size / 2) % 256);
+      expect(decoded[size - 1]).toBe((size - 1) % 256);
+
+      // Verify a sample of bytes throughout the file
+      for (let i = 0; i < size; i += 10000) {
+        expect(decoded[i]).toBe(i % 256);
+      }
+    });
+
+    it("should handle large files via pipe", async () => {
+      // Create a 512KB binary file
+      const size = 512 * 1024;
+      const data = new Uint8Array(size);
+      for (let i = 0; i < size; i++) {
+        data[i] = (i * 7) % 256; // Different pattern
+      }
+
+      const env = new Bash({
+        files: {
+          "/medium.bin": data,
+        },
+      });
+
+      // Round-trip through pipe
+      await env.exec("cat /medium.bin | base64 | base64 -d > /output.bin");
+
+      // Verify the output matches the original
+      const output = await env.fs.readFileBuffer(
+        env.fs.resolvePath("/", "/output.bin"),
+      );
+
+      expect(output.length).toBe(size);
+      // Check a sample of bytes
+      for (let i = 0; i < size; i += 5000) {
+        expect(output[i]).toBe((i * 7) % 256);
+      }
+    });
   });
 });
diff --git a/src/commands/base64/base64.ts b/src/commands/base64/base64.ts
index e6661365..f68cf2e7 100644
--- a/src/commands/base64/base64.ts
+++ b/src/commands/base64/base64.ts
@@ -93,7 +93,19 @@ export const base64Command: Command = {
         // For decoding, read as text and strip whitespace
         const readResult = await readBinary(ctx, files, "base64");
         if (!readResult.ok) return readResult.error;
-        // Use binary string (latin1) to preserve bytes for input
+
+        // Use Buffer if available (Node.js) for better large file handling
+        if (typeof Buffer !== "undefined") {
+          const buffer = Buffer.from(readResult.data);
+          const cleaned = buffer.toString("utf8").replace(/\s/g, "");
+          const decoded = Buffer.from(cleaned, "base64");
+          // Convert to binary string (each char code = byte value)
+          // Use Buffer's latin1 encoding which treats each byte as a character
+          const result = decoded.toString("latin1");
+          return { stdout: result, stderr: "", exitCode: 0 };
+        }
+
+        // Browser fallback - use binary string (latin1) to preserve bytes for input
         const input = String.fromCharCode(...readResult.data);
         const cleaned = input.replace(/\s/g, "");
         // Decode base64 to binary string (each char code = byte value)
@@ -105,8 +117,15 @@ export const base64Command: Command = {
       const readResult = await readBinary(ctx, files, "base64");
       if (!readResult.ok) return readResult.error;
 
-      // Convert binary to base64
-      let encoded = btoa(String.fromCharCode(...readResult.data));
+      // Use Buffer if available (Node.js) for better large file handling
+      let encoded: string;
+      if (typeof Buffer !== "undefined") {
+        const buffer = Buffer.from(readResult.data);
+        encoded = buffer.toString("base64");
+      } else {
+        // Browser fallback - convert binary to base64
+        encoded = btoa(String.fromCharCode(...readResult.data));
+      }
 
       if (wrapCols > 0) {
         const lines: string[] = [];
diff --git a/src/fs/encoding.ts b/src/fs/encoding.ts
--- a/src/fs/encoding.ts
+++ b/src/fs/encoding.ts
@@ -36,7 +36,14 @@ export function toBuffer(
     return bytes;
   }
   if (encoding === "binary" || encoding === "latin1") {
-    return Uint8Array.from(content, (c) => c.charCodeAt(0));
+    // One byte per UTF-16 code unit; storing into a Uint8Array keeps only the
+    // low 8 bits. A single indexed loop is used for every input size so short
+    // and long strings are converted identically.
+    const result = new Uint8Array(content.length);
+    for (let i = 0; i < content.length; i++) {
+      result[i] = content.charCodeAt(i);
+    }
+    return result;
   }
   // Default to UTF-8 for text content
   return textEncoder.encode(content);
@@ -58,7 +65,21 @@ export function fromBuffer(
       .join("");
   }
   if (encoding === "binary" || encoding === "latin1") {
-    return String.fromCharCode(...buffer);
+    // Use Buffer if available (Node.js) - much more efficient and avoids spread operator limits
+    if (typeof Buffer !== "undefined") {
+      return Buffer.from(buffer).toString(encoding);
+    }
+
+    // Browser fallback - spreading a whole large buffer into
+    // String.fromCharCode exceeds the engine's argument limit, so convert in
+    // chunks. 32K stays below the 65536-argument cap of JavaScriptCore.
+    const chunkSize = 0x8000;
+    let result = "";
+    for (let i = 0; i < buffer.length; i += chunkSize) {
+      const chunk = buffer.subarray(i, i + chunkSize);
+      result += String.fromCharCode(...chunk);
+    }
+    return result;
   }
   // Default to UTF-8 for text content
   return textDecoder.decode(buffer);
diff --git a/src/fs/read-write-fs/read-write-fs.piping.test.ts b/src/fs/read-write-fs/read-write-fs.piping.test.ts
new file mode 100644
index 00000000..e10c5fc1
--- /dev/null
+++ b/src/fs/read-write-fs/read-write-fs.piping.test.ts
@@ -0,0 +1,164 @@
+import { mkdtemp, rm } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import { afterAll, beforeAll, describe, expect, it } from "vitest";
+import { Bash } from "../../Bash.js";
+import { ReadWriteFs } from "./read-write-fs.js";
+
+/**
+ * Test piping with ReadWriteFs (real filesystem)
+ * This test suite validates that just-bash can handle large data through pipes
+ * when using ReadWriteFs backed by the real filesystem.
+ */
+describe("ReadWriteFs - Piping with large data", () => {
+  let tempDir: string;
+  let fs: ReadWriteFs;
+  let bash: Bash;
+
+  beforeAll(async () => {
+    // Create a real temp directory
+    tempDir = await mkdtemp(join(tmpdir(), "bash-test-"));
+    console.log("Created temp dir:", tempDir);
+
+    // Use ReadWriteFs with real filesystem
+    fs = new ReadWriteFs({ root: tempDir });
+    bash = new Bash({ fs });
+  });
+
+  afterAll(async () => {
+    // Cleanup
+    if (tempDir) {
+      await rm(tempDir, { recursive: true, force: true });
+      console.log("Cleaned up temp dir:", tempDir);
+    }
+  });
+
+  it("should handle large data with wc -l using ReadWriteFs", async () => {
+    // Create large text data with trailing newline (standard for text files)
+    const lines = Array.from({ length: 50000 }, (_, i) => `Line ${i + 1}`);
+    const largeText = `${lines.join("\n")}\n`;
+
+    console.log(
+      `Generated text size: ${(largeText.length / 1024 / 1024).toFixed(2)}MB`,
+    );
+    console.log(`Line count: ${lines.length}`);
+
+    // Write to file
+    await fs.writeFile("/data.txt", largeText);
+
+    // Test piping through cat
+    const result = await bash.exec("cat /data.txt | wc -l");
+
+    console.log("Result stdout:", result.stdout.trim());
+    console.log("Result stderr:", result.stderr);
+    console.log("Result exitCode:", result.exitCode);
+
+    expect(result.exitCode).toBe(0);
+    expect(result.stdout.trim()).toBe("50000");
+  }, 30000);
+
+  it("should handle large data with wc -l FILENAME using ReadWriteFs", async () => {
+    // Create large text data with trailing newline
+    const lines = Array.from({ length: 50000 }, (_, i) => `Line ${i + 1}`);
+    const largeText = `${lines.join("\n")}\n`;
+
+    // Write to file
+    await fs.writeFile("/data2.txt", largeText);
+
+    // Test direct file access
+    const result = await bash.exec("wc -l /data2.txt");
+
+    console.log("Result stdout:", result.stdout.trim());
+    console.log("Result exitCode:", result.exitCode);
+
+    expect(result.exitCode).toBe(0);
+    expect(result.stdout.trim()).toContain("50000");
+  }, 30000);
+
+  it("should handle small data with wc -l using ReadWriteFs", async () => {
+    // Create small text data with trailing newline
+    const lines = Array.from({ length: 100 }, (_, i) => `Line ${i + 1}`);
+    const smallText = `${lines.join("\n")}\n`;
+
+    // Write to file
+    await fs.writeFile("/small.txt", smallText);
+
+    // Test piping through cat
+    const result = await bash.exec("cat /small.txt | wc -l");
+
+    console.log("Result stdout:", result.stdout.trim());
+    console.log("Result exitCode:", result.exitCode);
+
+    expect(result.exitCode).toBe(0);
+    expect(result.stdout.trim()).toBe("100");
+  }, 30000);
+
+  it("should handle medium data with multiple pipes", async () => {
+    // Create medium text data with some repeated lines
+    const lines = Array.from({ length: 10000 }, (_, i) => {
+      // Create some duplicates
+      const lineNum = Math.floor(i / 2);
+      return `Line ${lineNum}`;
+    });
+    const mediumText = lines.join("\n");
+
+    // Write to file
+    await fs.writeFile("/medium.txt", mediumText);
+
+    // Test piping through multiple commands
+    const result = await bash.exec("cat /medium.txt | sort | uniq | wc -l");
+
+    console.log("Result stdout:", result.stdout.trim());
+    console.log("Result exitCode:", result.exitCode);
+
+    expect(result.exitCode).toBe(0);
+    // Should have 5000 unique lines (0-4999)
+    expect(result.stdout.trim()).toBe("5000");
+  }, 30000);
+
+  it("should handle grep with large files", async () => {
+    // Create large text data with specific patterns
+    const lines = Array.from({ length: 20000 }, (_, i) => {
+      if (i % 3 === 0) {
+        return `MATCH Line ${i}`;
+      }
+      return `Other Line ${i}`;
+    });
+    const largeText = lines.join("\n");
+
+    // Write to file
+    await fs.writeFile("/grep-test.txt", largeText);
+
+    // Test grep with wc
+    const result = await bash.exec("grep MATCH /grep-test.txt | wc -l");
+
+    console.log("Result stdout:", result.stdout.trim());
+    console.log("Result exitCode:", result.exitCode);
+
+    expect(result.exitCode).toBe(0);
+    // Should match every 3rd line: 20000/3 = 6667 (rounded up)
+    expect(result.stdout.trim()).toBe("6667");
+  }, 30000);
+
+  it("should handle binary data correctly", async () => {
+    // Create binary data
+    const binaryData = new Uint8Array(10000);
+    for (let i = 0; i < binaryData.length; i++) {
+      binaryData[i] = i % 256;
+    }
+
+    // Write binary file
+    await fs.writeFile("/binary.bin", binaryData);
+
+    // Test wc -c (byte count)
+    const result = await bash.exec("wc -c /binary.bin");
+
+    console.log("Result stdout:", result.stdout.trim());
+    console.log("Result exitCode:", result.exitCode);
+
+    expect(result.exitCode).toBe(0);
+    expect(result.stdout.trim()).toContain("10000");
+  }, 30000);
+});
+
+// Made with Bob
diff --git a/src/interpreter/redirections.ts b/src/interpreter/redirections.ts
--- a/src/interpreter/redirections.ts
+++ b/src/interpreter/redirections.ts
@@ -53,9 +53,12 @@ async function checkOutputRedirectTarget(
  * Determine the encoding to use for file I/O.
  * If all character codes are <= 255, use binary encoding (byte data).
  * Otherwise, use UTF-8 encoding (text with Unicode characters).
+ * The whole string is scanned on purpose: checking only a prefix would treat
+ * text whose first character above 255 appears later as binary, and such
+ * characters cannot be represented as single bytes.
  */
 function getFileEncoding(content: string): "binary" | "utf8" {
   for (let i = 0; i < content.length; i++) {
     if (content.charCodeAt(i) > 255) {
       return "utf8";
     }