From cd22f0a6b26c01c60df5225237c4ad0d07ac1f3b Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Sat, 14 Mar 2026 05:22:56 +0000 Subject: [PATCH 1/2] feat(builtins): add join and split commands - join: join lines of two sorted files on a common field with -1/-2/-t/-a/-e options (Closes #547) - split: split files into pieces by lines (-l), bytes (-b), or chunks (-n) with numeric suffixes (-d) (Closes #543) Includes 13 unit tests covering positive and negative paths. --- crates/bashkit/src/builtins/join.rs | 246 +++++++++++++++++++++++ crates/bashkit/src/builtins/mod.rs | 4 + crates/bashkit/src/builtins/split.rs | 269 ++++++++++++++++++++++++++ crates/bashkit/src/interpreter/mod.rs | 2 + 4 files changed, 521 insertions(+) create mode 100644 crates/bashkit/src/builtins/join.rs create mode 100644 crates/bashkit/src/builtins/split.rs diff --git a/crates/bashkit/src/builtins/join.rs b/crates/bashkit/src/builtins/join.rs new file mode 100644 index 00000000..1f694a85 --- /dev/null +++ b/crates/bashkit/src/builtins/join.rs @@ -0,0 +1,246 @@ +//! join builtin command - join lines of two sorted files on a common field + +use async_trait::async_trait; + +use super::{Builtin, Context, resolve_path}; +use crate::error::Result; +use crate::interpreter::ExecResult; + +/// The join builtin command. +/// +/// Usage: join [-1 FIELD] [-2 FIELD] [-t CHAR] [-a FILENUM] [-e STRING] FILE1 FILE2 +/// +/// Join lines of two sorted files on a common field (default: first field). 
+pub struct Join; + +struct JoinOptions { + field1: usize, // 1-based field number for file1 + field2: usize, // 1-based field number for file2 + separator: char, // field separator + unpaired: Vec, // which file's unpairable lines to show (1 or 2) + empty: String, // replacement for missing fields +} + +#[async_trait] +impl Builtin for Join { + async fn execute(&self, ctx: Context<'_>) -> Result { + let mut opts = JoinOptions { + field1: 1, + field2: 1, + separator: ' ', + unpaired: Vec::new(), + empty: String::new(), + }; + + let mut files: Vec<&str> = Vec::new(); + let mut i = 0; + + while i < ctx.args.len() { + match ctx.args[i].as_str() { + "-1" => { + i += 1; + opts.field1 = ctx.args.get(i).and_then(|s| s.parse().ok()).unwrap_or(1); + } + "-2" => { + i += 1; + opts.field2 = ctx.args.get(i).and_then(|s| s.parse().ok()).unwrap_or(1); + } + "-t" => { + i += 1; + if let Some(s) = ctx.args.get(i) { + opts.separator = s.chars().next().unwrap_or(' '); + } + } + "-a" => { + i += 1; + if let Some(n) = ctx.args.get(i).and_then(|s| s.parse::().ok()) { + opts.unpaired.push(n); + } + } + "-e" => { + i += 1; + if let Some(s) = ctx.args.get(i) { + opts.empty = s.clone(); + } + } + _ => files.push(&ctx.args[i]), + } + i += 1; + } + + if files.len() < 2 { + return Ok(ExecResult::err("join: missing operand\n".to_string(), 1)); + } + + let content1 = read_input(ctx.fs.as_ref(), ctx.cwd, files[0], ctx.stdin).await?; + let content2 = read_input(ctx.fs.as_ref(), ctx.cwd, files[1], None).await?; + + let lines1: Vec<&str> = content1.lines().collect(); + let lines2: Vec<&str> = content2.lines().collect(); + + let sep = opts.separator; + let mut output = String::new(); + let mut j = 0; + + for line1 in &lines1 { + let fields1: Vec<&str> = line1.split(sep).collect(); + let key1 = fields1.get(opts.field1 - 1).copied().unwrap_or(""); + + let mut matched = false; + while j < lines2.len() { + let fields2: Vec<&str> = lines2[j].split(sep).collect(); + let key2 = fields2.get(opts.field2 - 
1).copied().unwrap_or(""); + + match key1.cmp(key2) { + std::cmp::Ordering::Equal => { + matched = true; + // Output: key, remaining fields from file1, remaining fields from file2 + output.push_str(key1); + for (k, f) in fields1.iter().enumerate() { + if k != opts.field1 - 1 { + output.push(sep); + output.push_str(f); + } + } + for (k, f) in fields2.iter().enumerate() { + if k != opts.field2 - 1 { + output.push(sep); + output.push_str(f); + } + } + output.push('\n'); + j += 1; + break; + } + std::cmp::Ordering::Greater => { + if opts.unpaired.contains(&2) { + output.push_str(lines2[j]); + output.push('\n'); + } + j += 1; + } + std::cmp::Ordering::Less => { + break; + } + } + } + + if !matched && opts.unpaired.contains(&1) { + output.push_str(line1); + output.push('\n'); + } + } + + // Remaining unmatched lines from file2 + if opts.unpaired.contains(&2) { + while j < lines2.len() { + output.push_str(lines2[j]); + output.push('\n'); + j += 1; + } + } + + Ok(ExecResult::ok(output)) + } +} + +async fn read_input( + fs: &dyn crate::fs::FileSystem, + cwd: &std::path::Path, + file: &str, + stdin: Option<&str>, +) -> Result { + if file == "-" { + Ok(stdin.unwrap_or("").to_string()) + } else { + let path = resolve_path(cwd, file); + let bytes = fs.read_file(&path).await?; + Ok(String::from_utf8_lossy(&bytes).to_string()) + } +} + +#[cfg(test)] +#[allow(clippy::unwrap_used)] +mod tests { + use super::*; + use crate::fs::{FileSystem, InMemoryFs}; + use std::collections::HashMap; + use std::path::{Path, PathBuf}; + use std::sync::Arc; + + async fn run_join(args: &[&str], fs: Arc) -> ExecResult { + let args: Vec = args.iter().map(|s| s.to_string()).collect(); + let env = HashMap::new(); + let mut variables = HashMap::new(); + let mut cwd = PathBuf::from("/"); + let ctx = Context { + args: &args, + env: &env, + variables: &mut variables, + cwd: &mut cwd, + fs, + stdin: None, + #[cfg(feature = "http_client")] + http_client: None, + #[cfg(feature = "git")] + git_client: None, + }; 
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use crate::fs::{FileSystem, InMemoryFs};
    use std::collections::HashMap;
    use std::path::{Path, PathBuf};
    use std::sync::Arc;

    /// Builds an in-memory file system pre-populated with `(path, content)` pairs.
    async fn fs_with(files: &[(&str, &str)]) -> Arc<dyn FileSystem> {
        let fs = Arc::new(InMemoryFs::new()) as Arc<dyn FileSystem>;
        for (path, content) in files {
            fs.write_file(Path::new(path), content.as_bytes())
                .await
                .unwrap();
        }
        fs
    }

    /// Runs the `join` builtin with `args` against `fs`, with no standard input.
    async fn run_join(args: &[&str], fs: Arc<dyn FileSystem>) -> ExecResult {
        let args: Vec<String> = args.iter().map(|s| s.to_string()).collect();
        let env = HashMap::new();
        let mut variables = HashMap::new();
        let mut cwd = PathBuf::from("/");
        let ctx = Context {
            args: &args,
            env: &env,
            variables: &mut variables,
            cwd: &mut cwd,
            fs,
            stdin: None,
            #[cfg(feature = "http_client")]
            http_client: None,
            #[cfg(feature = "git")]
            git_client: None,
        };
        Join.execute(ctx).await.expect("join failed")
    }

    // The assertions below compare the complete output so that extra,
    // missing, or misordered lines are caught as well.

    #[tokio::test]
    async fn test_join_basic() {
        let fs = fs_with(&[("/f1", "a 1\nb 2\nc 3"), ("/f2", "a x\nb y\nc z")]).await;
        let result = run_join(&["/f1", "/f2"], fs).await;
        assert_eq!(result.exit_code, 0);
        assert_eq!(result.stdout, "a 1 x\nb 2 y\nc 3 z\n");
    }

    #[tokio::test]
    async fn test_join_custom_field() {
        let fs = fs_with(&[("/f1", "x a\ny b"), ("/f2", "a 1\nb 2")]).await;
        let result = run_join(&["-1", "2", "/f1", "/f2"], fs).await;
        assert_eq!(result.exit_code, 0);
        assert_eq!(result.stdout, "a x 1\nb y 2\n");
    }

    #[tokio::test]
    async fn test_join_custom_separator() {
        let fs = fs_with(&[("/f1", "a:1\nb:2"), ("/f2", "a:x\nb:y")]).await;
        let result = run_join(&["-t", ":", "/f1", "/f2"], fs).await;
        assert_eq!(result.exit_code, 0);
        assert_eq!(result.stdout, "a:1:x\nb:2:y\n");
    }

    #[tokio::test]
    async fn test_join_missing_operand() {
        let fs = fs_with(&[]).await;
        let result = run_join(&["/f1"], fs).await;
        assert_eq!(result.exit_code, 1);
        assert!(result.stderr.contains("missing operand"));
    }

    #[tokio::test]
    async fn test_join_unpairable() {
        let fs = fs_with(&[("/f1", "a 1\nb 2\nc 3"), ("/f2", "a x\nc z")]).await;
        let result = run_join(&["-a", "1", "/f1", "/f2"], fs).await;
        assert_eq!(result.exit_code, 0);
        // "b 2" has no partner in /f2 and is printed on its own.
        assert_eq!(result.stdout, "a 1 x\nb 2\nc 3 z\n");
    }
}
ffee4e94..edc8fa71 100644 --- a/crates/bashkit/src/builtins/mod.rs +++ b/crates/bashkit/src/builtins/mod.rs @@ -50,6 +50,7 @@ mod grep; mod headtail; mod hextools; mod inspect; +mod join; mod jq; mod ls; mod navigation; @@ -64,6 +65,7 @@ mod seq; mod sleep; mod sortuniq; mod source; +mod split; mod strings; mod system; mod test; @@ -109,6 +111,7 @@ pub use grep::Grep; pub use headtail::{Head, Tail}; pub use hextools::{Hexdump, Od, Xxd}; pub use inspect::{File, Less, Stat}; +pub use join::Join; pub use jq::Jq; pub(crate) use ls::glob_match; pub use ls::{Find, Ls, Rmdir}; @@ -124,6 +127,7 @@ pub use seq::Seq; pub use sleep::Sleep; pub use sortuniq::{Sort, Uniq}; pub use source::Source; +pub use split::Split; pub use strings::Strings; pub use system::{DEFAULT_HOSTNAME, DEFAULT_USERNAME, Hostname, Id, Uname, Whoami}; pub use test::{Bracket, Test}; diff --git a/crates/bashkit/src/builtins/split.rs b/crates/bashkit/src/builtins/split.rs new file mode 100644 index 00000000..a4480f21 --- /dev/null +++ b/crates/bashkit/src/builtins/split.rs @@ -0,0 +1,269 @@ +//! split builtin command - split a file into pieces + +use async_trait::async_trait; + +use super::{Builtin, Context, resolve_path}; +use crate::error::Result; +use crate::interpreter::ExecResult; + +/// The split builtin command. +/// +/// Usage: split [-l lines] [-b bytes] [-n chunks] [-d] [FILE [PREFIX]] +/// +/// Options: +/// -l N Split into pieces of N lines each (default 1000) +/// -b N Split by byte size +/// -n N Split into N equal pieces +/// -d Use numeric suffixes (00, 01, ...) instead of alphabetic (aa, ab, ...) 
+pub struct Split; + +#[async_trait] +impl Builtin for Split { + async fn execute(&self, ctx: Context<'_>) -> Result { + let mut lines_per_file: Option = None; + let mut bytes_per_file: Option = None; + let mut num_chunks: Option = None; + let mut numeric_suffix = false; + let mut positional: Vec<&str> = Vec::new(); + + let mut i = 0; + while i < ctx.args.len() { + match ctx.args[i].as_str() { + "-l" => { + i += 1; + lines_per_file = + Some(ctx.args.get(i).and_then(|s| s.parse().ok()).unwrap_or(1000)); + } + "-b" => { + i += 1; + bytes_per_file = ctx.args.get(i).and_then(|s| parse_size(s)); + } + "-n" => { + i += 1; + num_chunks = ctx.args.get(i).and_then(|s| s.parse().ok()); + } + "-d" | "--numeric-suffixes" => numeric_suffix = true, + _ => positional.push(&ctx.args[i]), + } + i += 1; + } + + // Default to splitting by lines + if lines_per_file.is_none() && bytes_per_file.is_none() && num_chunks.is_none() { + lines_per_file = Some(1000); + } + + let file = positional.first().copied().unwrap_or("-"); + let prefix = positional.get(1).copied().unwrap_or("x"); + + let input = if file == "-" { + ctx.stdin.unwrap_or("").to_string() + } else { + let path = resolve_path(ctx.cwd, file); + match ctx.fs.read_file(&path).await { + Ok(bytes) => String::from_utf8_lossy(&bytes).to_string(), + Err(_) => { + return Ok(ExecResult::err( + format!( + "split: cannot open '{}' for reading: No such file or directory\n", + file + ), + 1, + )); + } + } + }; + + let mut file_index = 0; + + if let Some(n) = num_chunks { + // Split into N equal pieces + if n == 0 { + return Ok(ExecResult::err( + "split: invalid number of chunks: 0\n".to_string(), + 1, + )); + } + let chunk_size = input.len().div_ceil(n); + let bytes = input.as_bytes(); + let mut pos = 0; + while pos < bytes.len() { + let end = (pos + chunk_size).min(bytes.len()); + let suffix = make_suffix(file_index, numeric_suffix); + let out_path = resolve_path(ctx.cwd, &format!("{}{}", prefix, suffix)); + ctx.fs + .write_file(&out_path, 
&bytes[pos..end]) + .await + ?; + file_index += 1; + pos = end; + } + } else if let Some(size) = bytes_per_file { + // Split by byte size + let bytes = input.as_bytes(); + let mut pos = 0; + while pos < bytes.len() { + let end = (pos + size).min(bytes.len()); + let suffix = make_suffix(file_index, numeric_suffix); + let out_path = resolve_path(ctx.cwd, &format!("{}{}", prefix, suffix)); + ctx.fs + .write_file(&out_path, &bytes[pos..end]) + .await + ?; + file_index += 1; + pos = end; + } + } else { + // Split by lines + let n = lines_per_file.unwrap_or(1000); + let lines: Vec<&str> = input.lines().collect(); + let mut pos = 0; + while pos < lines.len() { + let end = (pos + n).min(lines.len()); + let suffix = make_suffix(file_index, numeric_suffix); + let out_path = resolve_path(ctx.cwd, &format!("{}{}", prefix, suffix)); + let chunk = lines[pos..end].join("\n"); + let chunk_with_newline = if end < lines.len() || input.ends_with('\n') { + format!("{}\n", chunk) + } else { + chunk + }; + ctx.fs + .write_file(&out_path, chunk_with_newline.as_bytes()) + .await + ?; + file_index += 1; + pos = end; + } + } + + Ok(ExecResult::ok(String::new())) + } +} + +fn make_suffix(index: usize, numeric: bool) -> String { + if numeric { + format!("{:02}", index) + } else { + // aa, ab, ac, ..., az, ba, bb, ... 
/// Returns the file-name suffix for the `index`-th output piece (zero-based).
///
/// With `numeric` set the suffix is the decimal index, zero-padded to at
/// least two digits (`00`, `01`, ...). Otherwise it is the index written in
/// base 26 with the digits `a`..`z`, padded to at least two letters
/// (`aa`, `ab`, ..., `zz`). Indices that do not fit in two characters get a
/// longer suffix (`100`, `baa`) instead of overflowing into non-letter
/// characters; such longer suffixes no longer sort lexicographically after
/// the two-character ones.
fn make_suffix(index: usize, numeric: bool) -> String {
    if numeric {
        return format!("{:02}", index);
    }

    // Least-significant letter first; reversed when building the string.
    let mut letters: Vec<u8> = Vec::new();
    let mut rest = index;
    loop {
        // `rest % 26` is always below 26, so the narrowing cast cannot truncate.
        letters.push(b'a' + (rest % 26) as u8);
        rest /= 26;
        if rest == 0 && letters.len() >= 2 {
            break;
        }
    }
    letters.iter().rev().map(|&b| char::from(b)).collect()
}

/// Parses a byte count with an optional binary unit suffix.
///
/// Accepts a plain non-negative integer, or an integer followed by `k`/`K`
/// (×1024), `m`/`M` (×1024²) or `g`/`G` (×1024³). Surrounding whitespace is
/// ignored. Returns `None` when the number is malformed or the result does
/// not fit in `usize`.
fn parse_size(s: &str) -> Option<usize> {
    let s = s.trim();
    // The recognized suffixes are ASCII, so dropping one byte is a valid slice.
    let (digits, multiplier): (&str, usize) = match s.chars().last() {
        Some('k' | 'K') => (&s[..s.len() - 1], 1 << 10),
        Some('m' | 'M') => (&s[..s.len() - 1], 1 << 20),
        Some('g' | 'G') => (&s[..s.len() - 1], 1 << 30),
        _ => (s, 1),
    };
    digits.parse::<usize>().ok()?.checked_mul(multiplier)
}
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use crate::fs::{FileSystem, InMemoryFs};
    use std::collections::HashMap;
    use std::path::{Path, PathBuf};
    use std::sync::Arc;

    /// Runs the `split` builtin with `args` and optional standard input against `fs`.
    async fn run_split(args: &[&str], stdin: Option<&str>, fs: Arc<dyn FileSystem>) -> ExecResult {
        let args: Vec<String> = args.iter().map(|s| s.to_string()).collect();
        let env = HashMap::new();
        let mut variables = HashMap::new();
        let mut cwd = PathBuf::from("/");
        let ctx = Context {
            args: &args,
            env: &env,
            variables: &mut variables,
            cwd: &mut cwd,
            fs,
            stdin,
            #[cfg(feature = "http_client")]
            http_client: None,
            #[cfg(feature = "git")]
            git_client: None,
        };
        Split.execute(ctx).await.expect("split failed")
    }

    /// Reads an output piece back as text so its content can be asserted.
    async fn read_text(fs: &Arc<dyn FileSystem>, path: &str) -> String {
        let bytes = fs.read_file(Path::new(path)).await.unwrap();
        String::from_utf8_lossy(&bytes).to_string()
    }

    #[tokio::test]
    async fn test_split_by_lines() {
        let fs = Arc::new(InMemoryFs::new()) as Arc<dyn FileSystem>;
        let input = "line1\nline2\nline3\nline4\nline5\n";
        fs.write_file(Path::new("/input"), input.as_bytes())
            .await
            .unwrap();
        let result = run_split(&["-l", "2", "/input"], None, fs.clone()).await;
        assert_eq!(result.exit_code, 0);
        // Two full pieces followed by the single remaining line.
        assert_eq!(read_text(&fs, "/xaa").await, "line1\nline2\n");
        assert_eq!(read_text(&fs, "/xab").await, "line3\nline4\n");
        assert_eq!(read_text(&fs, "/xac").await, "line5\n");
        assert!(!fs.exists(Path::new("/xad")).await.unwrap());
    }

    #[tokio::test]
    async fn test_split_numeric_suffix() {
        let fs = Arc::new(InMemoryFs::new()) as Arc<dyn FileSystem>;
        let input = "a\nb\nc\n";
        let result = run_split(&["-l", "1", "-d"], Some(input), fs.clone()).await;
        assert_eq!(result.exit_code, 0);
        assert_eq!(read_text(&fs, "/x00").await, "a\n");
        assert_eq!(read_text(&fs, "/x01").await, "b\n");
        assert_eq!(read_text(&fs, "/x02").await, "c\n");
    }

    #[tokio::test]
    async fn test_split_by_chunks() {
        let fs = Arc::new(InMemoryFs::new()) as Arc<dyn FileSystem>;
        let input = "abcdef";
        let result = run_split(&["-n", "3"], Some(input), fs.clone()).await;
        assert_eq!(result.exit_code, 0);
        assert_eq!(read_text(&fs, "/xaa").await, "ab");
        assert_eq!(read_text(&fs, "/xab").await, "cd");
        assert_eq!(read_text(&fs, "/xac").await, "ef");
    }

    #[tokio::test]
    async fn test_split_custom_prefix() {
        let fs = Arc::new(InMemoryFs::new()) as Arc<dyn FileSystem>;
        let input = "data\n";
        let result = run_split(&["-l", "1", "-", "out_"], Some(input), fs.clone()).await;
        assert_eq!(result.exit_code, 0);
        assert_eq!(read_text(&fs, "/out_aa").await, "data\n");
    }

    #[tokio::test]
    async fn test_split_missing_file() {
        let fs = Arc::new(InMemoryFs::new()) as Arc<dyn FileSystem>;
        let result = run_split(&["/nonexistent"], None, fs).await;
        assert_eq!(result.exit_code, 1);
        assert!(result.stderr.contains("cannot open"));
    }

    #[tokio::test]
    async fn test_split_zero_chunks() {
        let fs = Arc::new(InMemoryFs::new()) as Arc<dyn FileSystem>;
        let result = run_split(&["-n", "0"], Some("data"), fs).await;
        assert_eq!(result.exit_code, 1);
        assert!(result.stderr.contains("invalid number of chunks"));
    }

    // `make_suffix` is synchronous, so these need no async runtime.

    #[test]
    fn test_make_suffix_alpha() {
        assert_eq!(make_suffix(0, false), "aa");
        assert_eq!(make_suffix(1, false), "ab");
        assert_eq!(make_suffix(26, false), "ba");
        assert_eq!(make_suffix(675, false), "zz");
    }

    #[test]
    fn test_make_suffix_numeric() {
        assert_eq!(make_suffix(0, true), "00");
        assert_eq!(make_suffix(5, true), "05");
        assert_eq!(make_suffix(42, true), "42");
    }
}
@@ -480,6 +480,8 @@ impl Interpreter { builtins.insert("expand".to_string(), Box::new(builtins::Expand)); builtins.insert("unexpand".to_string(), Box::new(builtins::Unexpand)); builtins.insert("envsubst".to_string(), Box::new(builtins::Envsubst)); + builtins.insert("join".to_string(), Box::new(builtins::Join)); + builtins.insert("split".to_string(), Box::new(builtins::Split)); // Merge custom builtins (override defaults if same name) for (name, builtin) in custom_builtins { From 85e7b93025e66b8042c3e5b4f7c0b86c949b5025 Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Sat, 14 Mar 2026 05:31:26 +0000 Subject: [PATCH 2/2] style: fix split.rs formatting --- crates/bashkit/src/builtins/split.rs | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/crates/bashkit/src/builtins/split.rs b/crates/bashkit/src/builtins/split.rs index a4480f21..f4f28f07 100644 --- a/crates/bashkit/src/builtins/split.rs +++ b/crates/bashkit/src/builtins/split.rs @@ -91,10 +91,7 @@ impl Builtin for Split { let end = (pos + chunk_size).min(bytes.len()); let suffix = make_suffix(file_index, numeric_suffix); let out_path = resolve_path(ctx.cwd, &format!("{}{}", prefix, suffix)); - ctx.fs - .write_file(&out_path, &bytes[pos..end]) - .await - ?; + ctx.fs.write_file(&out_path, &bytes[pos..end]).await?; file_index += 1; pos = end; } @@ -106,10 +103,7 @@ impl Builtin for Split { let end = (pos + size).min(bytes.len()); let suffix = make_suffix(file_index, numeric_suffix); let out_path = resolve_path(ctx.cwd, &format!("{}{}", prefix, suffix)); - ctx.fs - .write_file(&out_path, &bytes[pos..end]) - .await - ?; + ctx.fs.write_file(&out_path, &bytes[pos..end]).await?; file_index += 1; pos = end; } @@ -130,8 +124,7 @@ impl Builtin for Split { }; ctx.fs .write_file(&out_path, chunk_with_newline.as_bytes()) - .await - ?; + .await?; file_index += 1; pos = end; }