Skip to content

Commit ce2ab0e

Browse files
authored
feat(builtins): add join and split commands (#584)
## Summary

- `join`: Join lines of two sorted files on a common field (`-1`, `-2`, `-t`, `-a`, `-e`)
- `split`: Split files into pieces by lines (`-l`), bytes (`-b`), or chunks (`-n`), with `-d` for numeric suffixes

## Test plan

- [x] 5 join tests: basic, custom field, custom separator, missing operand, unpairable lines
- [x] 8 split tests: by lines, numeric suffix, by chunks, custom prefix, missing file, zero chunks, suffix generation
- [x] clippy clean, 1653 tests pass

Closes #547, Closes #543
1 parent 0b0e887 commit ce2ab0e

File tree

4 files changed

+514
-0
lines changed

4 files changed

+514
-0
lines changed
Lines changed: 246 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,246 @@
1+
//! join builtin command - join lines of two sorted files on a common field
2+
3+
use async_trait::async_trait;
4+
5+
use super::{Builtin, Context, resolve_path};
6+
use crate::error::Result;
7+
use crate::interpreter::ExecResult;
8+
9+
/// The join builtin command.
10+
///
11+
/// Usage: join [-1 FIELD] [-2 FIELD] [-t CHAR] [-a FILENUM] [-e STRING] FILE1 FILE2
12+
///
13+
/// Join lines of two sorted files on a common field (default: first field).
///
/// Options, as handled by the `Builtin::execute` implementation in this file:
///
/// - `-1 FIELD` / `-2 FIELD`: 1-based number of the join field in FILE1 / FILE2.
/// - `-t CHAR`: field separator; only the first character of the argument is used.
/// - `-a FILENUM`: additionally print unpairable lines from file FILENUM (1 or 2).
/// - `-e STRING`: accepted and stored.
///   NOTE(review): the stored value is not read when output lines are built.
///
/// Any other argument is taken as a file operand. With fewer than two operands
/// the command reports `join: missing operand` and exits with status 1.
/// A FILE1 operand of `-` is read from standard input.
14+
pub struct Join;
15+
16+
/// Parsed command-line options for one `join` invocation.
///
/// Built with defaults in `execute` and then overwritten as flags are scanned.
struct JoinOptions {
17+
field1: usize, // 1-based join field number for file1 (`-1`), initialised to 1
18+
field2: usize, // 1-based join field number for file2 (`-2`), initialised to 1
19+
separator: char, // field separator (`-t`), initialised to a space
20+
unpaired: Vec<usize>, // file numbers given via `-a`; only 1 and 2 are looked up when printing unpairable lines
21+
empty: String, // `-e` argument (replacement for missing fields); NOTE(review): stored but not read in this file
22+
}
23+
24+
#[async_trait]
25+
impl Builtin for Join {
26+
async fn execute(&self, ctx: Context<'_>) -> Result<ExecResult> {
27+
let mut opts = JoinOptions {
28+
field1: 1,
29+
field2: 1,
30+
separator: ' ',
31+
unpaired: Vec::new(),
32+
empty: String::new(),
33+
};
34+
35+
let mut files: Vec<&str> = Vec::new();
36+
let mut i = 0;
37+
38+
while i < ctx.args.len() {
39+
match ctx.args[i].as_str() {
40+
"-1" => {
41+
i += 1;
42+
opts.field1 = ctx.args.get(i).and_then(|s| s.parse().ok()).unwrap_or(1);
43+
}
44+
"-2" => {
45+
i += 1;
46+
opts.field2 = ctx.args.get(i).and_then(|s| s.parse().ok()).unwrap_or(1);
47+
}
48+
"-t" => {
49+
i += 1;
50+
if let Some(s) = ctx.args.get(i) {
51+
opts.separator = s.chars().next().unwrap_or(' ');
52+
}
53+
}
54+
"-a" => {
55+
i += 1;
56+
if let Some(n) = ctx.args.get(i).and_then(|s| s.parse::<usize>().ok()) {
57+
opts.unpaired.push(n);
58+
}
59+
}
60+
"-e" => {
61+
i += 1;
62+
if let Some(s) = ctx.args.get(i) {
63+
opts.empty = s.clone();
64+
}
65+
}
66+
_ => files.push(&ctx.args[i]),
67+
}
68+
i += 1;
69+
}
70+
71+
if files.len() < 2 {
72+
return Ok(ExecResult::err("join: missing operand\n".to_string(), 1));
73+
}
74+
75+
let content1 = read_input(ctx.fs.as_ref(), ctx.cwd, files[0], ctx.stdin).await?;
76+
let content2 = read_input(ctx.fs.as_ref(), ctx.cwd, files[1], None).await?;
77+
78+
let lines1: Vec<&str> = content1.lines().collect();
79+
let lines2: Vec<&str> = content2.lines().collect();
80+
81+
let sep = opts.separator;
82+
let mut output = String::new();
83+
let mut j = 0;
84+
85+
for line1 in &lines1 {
86+
let fields1: Vec<&str> = line1.split(sep).collect();
87+
let key1 = fields1.get(opts.field1 - 1).copied().unwrap_or("");
88+
89+
let mut matched = false;
90+
while j < lines2.len() {
91+
let fields2: Vec<&str> = lines2[j].split(sep).collect();
92+
let key2 = fields2.get(opts.field2 - 1).copied().unwrap_or("");
93+
94+
match key1.cmp(key2) {
95+
std::cmp::Ordering::Equal => {
96+
matched = true;
97+
// Output: key, remaining fields from file1, remaining fields from file2
98+
output.push_str(key1);
99+
for (k, f) in fields1.iter().enumerate() {
100+
if k != opts.field1 - 1 {
101+
output.push(sep);
102+
output.push_str(f);
103+
}
104+
}
105+
for (k, f) in fields2.iter().enumerate() {
106+
if k != opts.field2 - 1 {
107+
output.push(sep);
108+
output.push_str(f);
109+
}
110+
}
111+
output.push('\n');
112+
j += 1;
113+
break;
114+
}
115+
std::cmp::Ordering::Greater => {
116+
if opts.unpaired.contains(&2) {
117+
output.push_str(lines2[j]);
118+
output.push('\n');
119+
}
120+
j += 1;
121+
}
122+
std::cmp::Ordering::Less => {
123+
break;
124+
}
125+
}
126+
}
127+
128+
if !matched && opts.unpaired.contains(&1) {
129+
output.push_str(line1);
130+
output.push('\n');
131+
}
132+
}
133+
134+
// Remaining unmatched lines from file2
135+
if opts.unpaired.contains(&2) {
136+
while j < lines2.len() {
137+
output.push_str(lines2[j]);
138+
output.push('\n');
139+
j += 1;
140+
}
141+
}
142+
143+
Ok(ExecResult::ok(output))
144+
}
145+
}
146+
147+
async fn read_input(
148+
fs: &dyn crate::fs::FileSystem,
149+
cwd: &std::path::Path,
150+
file: &str,
151+
stdin: Option<&str>,
152+
) -> Result<String> {
153+
if file == "-" {
154+
Ok(stdin.unwrap_or("").to_string())
155+
} else {
156+
let path = resolve_path(cwd, file);
157+
let bytes = fs.read_file(&path).await?;
158+
Ok(String::from_utf8_lossy(&bytes).to_string())
159+
}
160+
}
161+
162+
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use crate::fs::{FileSystem, InMemoryFs};
    use std::collections::HashMap;
    use std::path::{Path, PathBuf};
    use std::sync::Arc;

    /// Creates an in-memory filesystem holding the given `(path, contents)` pairs.
    async fn fs_with(entries: &[(&str, &str)]) -> Arc<dyn FileSystem> {
        let fs = Arc::new(InMemoryFs::new()) as Arc<dyn FileSystem>;
        for (path, contents) in entries {
            fs.write_file(Path::new(path), contents.as_bytes())
                .await
                .unwrap();
        }
        fs
    }

    /// Executes `Join` with `args` against `fs`, rooted at `/` and without stdin.
    async fn run_join(args: &[&str], fs: Arc<dyn FileSystem>) -> ExecResult {
        let owned_args: Vec<String> = args.iter().map(|s| s.to_string()).collect();
        let env = HashMap::new();
        let mut variables = HashMap::new();
        let mut cwd = PathBuf::from("/");
        let ctx = Context {
            args: &owned_args,
            env: &env,
            variables: &mut variables,
            cwd: &mut cwd,
            fs,
            stdin: None,
            #[cfg(feature = "http_client")]
            http_client: None,
            #[cfg(feature = "git")]
            git_client: None,
        };
        Join.execute(ctx).await.expect("join failed")
    }

    #[tokio::test]
    async fn test_join_basic() {
        let fs = fs_with(&[("/f1", "a 1\nb 2\nc 3"), ("/f2", "a x\nb y\nc z")]).await;
        let outcome = run_join(&["/f1", "/f2"], fs).await;
        assert_eq!(outcome.exit_code, 0);
        assert!(outcome.stdout.contains("a 1 x"));
        assert!(outcome.stdout.contains("b 2 y"));
        assert!(outcome.stdout.contains("c 3 z"));
    }

    #[tokio::test]
    async fn test_join_custom_field() {
        let fs = fs_with(&[("/f1", "x a\ny b"), ("/f2", "a 1\nb 2")]).await;
        let outcome = run_join(&["-1", "2", "/f1", "/f2"], fs).await;
        assert_eq!(outcome.exit_code, 0);
        assert!(outcome.stdout.contains("a x 1"));
    }

    #[tokio::test]
    async fn test_join_custom_separator() {
        let fs = fs_with(&[("/f1", "a:1\nb:2"), ("/f2", "a:x\nb:y")]).await;
        let outcome = run_join(&["-t", ":", "/f1", "/f2"], fs).await;
        assert_eq!(outcome.exit_code, 0);
        assert!(outcome.stdout.contains("a:1:x"));
    }

    #[tokio::test]
    async fn test_join_missing_operand() {
        // Only one operand is supplied, so no files need to exist.
        let fs = fs_with(&[]).await;
        let outcome = run_join(&["/f1"], fs).await;
        assert_eq!(outcome.exit_code, 1);
    }

    #[tokio::test]
    async fn test_join_unpairable() {
        let fs = fs_with(&[("/f1", "a 1\nb 2\nc 3"), ("/f2", "a x\nc z")]).await;
        let outcome = run_join(&["-a", "1", "/f1", "/f2"], fs).await;
        assert_eq!(outcome.exit_code, 0);
        // "b 2" has no partner in file2 and must be echoed because of `-a 1`.
        assert!(outcome.stdout.contains("b 2"));
    }
}

crates/bashkit/src/builtins/mod.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ mod grep;
5050
mod headtail;
5151
mod hextools;
5252
mod inspect;
53+
mod join;
5354
mod jq;
5455
mod ls;
5556
mod navigation;
@@ -64,6 +65,7 @@ mod seq;
6465
mod sleep;
6566
mod sortuniq;
6667
mod source;
68+
mod split;
6769
mod strings;
6870
mod system;
6971
mod test;
@@ -109,6 +111,7 @@ pub use grep::Grep;
109111
pub use headtail::{Head, Tail};
110112
pub use hextools::{Hexdump, Od, Xxd};
111113
pub use inspect::{File, Less, Stat};
114+
pub use join::Join;
112115
pub use jq::Jq;
113116
pub(crate) use ls::glob_match;
114117
pub use ls::{Find, Ls, Rmdir};
@@ -124,6 +127,7 @@ pub use seq::Seq;
124127
pub use sleep::Sleep;
125128
pub use sortuniq::{Sort, Uniq};
126129
pub use source::Source;
130+
pub use split::Split;
127131
pub use strings::Strings;
128132
pub use system::{DEFAULT_HOSTNAME, DEFAULT_USERNAME, Hostname, Id, Uname, Whoami};
129133
pub use test::{Bracket, Test};

0 commit comments

Comments
 (0)