Skip to content

Commit 669f91d

Browse files
committed
feat(builtins): full sort -k KEYDEF parsing with multi-key support
Parse the complete GNU sort key spec: start/end fields (-k2,3), character positions (-k2.3,3.4), per-key flags (-k2n,2 -k3r,3), and multiple keys for cascading sort priority. Closes #906
1 parent 865ee37 commit 669f91d

File tree

2 files changed

+235
-51
lines changed

2 files changed

+235
-51
lines changed

crates/bashkit/src/builtins/sortuniq.rs

Lines changed: 189 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -25,22 +25,152 @@ use crate::interpreter::ExecResult;
2525
/// -o Write output to FILE
2626
pub struct Sort;
2727

28-
/// Extract the sort key from a line based on field delimiter and key spec
29-
fn extract_key(line: &str, delimiter: Option<char>, key_field: usize) -> String {
28+
/// A parsed sort key definition from `-k KEYDEF`.
///
/// Format: `START[.CHAR][FLAGS][,END[.CHAR][FLAGS]]`.
/// Field and character positions are stored exactly as typed (1-based);
/// `0` means "not given" and is interpreted per field as noted below.
#[derive(Clone, Debug)]
struct KeySpec {
    /// First field of the key (1-based); 0 when no field number was parsed.
    start_field: usize,
    /// First character within the start field (1-based); 0 = whole field.
    start_char: usize,
    /// Last field of the key (1-based); 0 = end of line.
    end_field: usize,
    /// Last character within the end field (1-based); 0 = end of field.
    end_char: usize,
    /// `n` flag present in the key's flag letters.
    numeric: bool,
    /// `r` flag present in the key's flag letters.
    reverse: bool,
    /// `f` flag present in the key's flag letters.
    fold_case: bool,
    /// `h` flag present in the key's flag letters.
    human_numeric: bool,
    /// `M` flag present in the key's flag letters.
    month_sort: bool,
    /// `V` flag present in the key's flag letters.
    #[allow(dead_code)] // Used when combined with sort -V feature
    version_sort: bool,
}

impl KeySpec {
    /// Parse a KEYDEF string like "2", "2,3", "2.3,3.4", "2n,2", "2nr".
    ///
    /// The spec is split at the first comma into a start part and an optional
    /// end part. Flag letters may follow either part and are merged: a flag is
    /// set if it appears in the trailing text of the start OR the end part.
    /// Unrecognised trailing characters are ignored; a missing or unparsable
    /// number becomes 0.
    fn parse(spec: &str) -> Self {
        let (start_part, end_part) = match spec.split_once(',') {
            Some((start, end)) => (start, Some(end)),
            None => (spec, None),
        };

        let (start_field, start_char, start_flags) = Self::parse_field_spec(start_part);
        let (end_field, end_char, end_flags) = match end_part {
            Some(part) => Self::parse_field_spec(part),
            // No ",END": key runs to end of line, no extra flags.
            None => (0, 0, String::new()),
        };

        // A flag counts if it appears after either half of the spec; checking
        // both strings avoids building a merged copy.
        let has_flag = |flag: char| start_flags.contains(flag) || end_flags.contains(flag);

        KeySpec {
            start_field,
            start_char,
            end_field,
            end_char,
            numeric: has_flag('n'),
            reverse: has_flag('r'),
            fold_case: has_flag('f'),
            human_numeric: has_flag('h'),
            month_sort: has_flag('M'),
            version_sort: has_flag('V'),
        }
    }

    /// Parse "FIELD[.CHAR][FLAGS]" → (field, char_pos, flags_string)
    ///
    /// `field` and `char_pos` are 0 when the corresponding digits are absent
    /// or do not fit in `usize`. Everything after the numeric prefix is
    /// returned verbatim as the flags string.
    fn parse_field_spec(s: &str) -> (usize, usize, String) {
        // Byte length of the leading run of ASCII digits. ASCII digits are
        // single-byte, so the result is always a valid `str` split point.
        fn digit_run(text: &str) -> usize {
            text.bytes().take_while(u8::is_ascii_digit).count()
        }

        let (field_digits, rest) = s.split_at(digit_run(s));
        let field: usize = field_digits.parse().unwrap_or(0);

        // Optional ".CHAR": the dot is consumed even when no digits follow it.
        let (char_pos, flags) = match rest.strip_prefix('.') {
            Some(after_dot) => {
                let (char_digits, flags) = after_dot.split_at(digit_run(after_dot));
                (char_digits.parse::<usize>().unwrap_or(0), flags)
            }
            None => (0, rest),
        };

        (field, char_pos, flags.to_string())
    }
}
105+
106+
/// Split a line into fields using the given delimiter
107+
fn split_fields(line: &str, delimiter: Option<char>) -> Vec<&str> {
30108
if let Some(delim) = delimiter {
31-
line.split(delim)
32-
.nth(key_field.saturating_sub(1))
33-
.unwrap_or("")
34-
.to_string()
109+
line.split(delim).collect()
35110
} else {
36-
// Default: whitespace-separated fields
37-
line.split_whitespace()
38-
.nth(key_field.saturating_sub(1))
39-
.unwrap_or("")
40-
.to_string()
111+
line.split_whitespace().collect()
41112
}
42113
}
43114

115+
/// Extract the sort key from a line based on field delimiter and key spec
116+
fn extract_key_spec(line: &str, delimiter: Option<char>, key: &KeySpec) -> String {
117+
let fields = split_fields(line, delimiter);
118+
if fields.is_empty() || key.start_field == 0 {
119+
return line.to_string();
120+
}
121+
122+
let start_idx = key.start_field.saturating_sub(1);
123+
if start_idx >= fields.len() {
124+
return String::new();
125+
}
126+
127+
let end_idx = if key.end_field == 0 {
128+
fields.len() - 1
129+
} else {
130+
(key.end_field.saturating_sub(1)).min(fields.len() - 1)
131+
};
132+
133+
if start_idx > end_idx {
134+
return String::new();
135+
}
136+
137+
if start_idx == end_idx {
138+
let field = fields[start_idx];
139+
let start_c = if key.start_char > 0 {
140+
(key.start_char - 1).min(field.len())
141+
} else {
142+
0
143+
};
144+
let end_c = if key.end_char > 0 {
145+
key.end_char.min(field.len())
146+
} else {
147+
field.len()
148+
};
149+
if start_c >= end_c {
150+
return String::new();
151+
}
152+
return field[start_c..end_c].to_string();
153+
}
154+
155+
// Multi-field key
156+
let mut result = String::new();
157+
for (i, field) in fields.iter().enumerate().take(end_idx + 1).skip(start_idx) {
158+
if i > start_idx {
159+
result.push(delimiter.unwrap_or(' '));
160+
}
161+
if i == start_idx && key.start_char > 0 {
162+
let sc = (key.start_char - 1).min(field.len());
163+
result.push_str(&field[sc..]);
164+
} else if i == end_idx && key.end_char > 0 {
165+
let ec = key.end_char.min(field.len());
166+
result.push_str(&field[..ec]);
167+
} else {
168+
result.push_str(field);
169+
}
170+
}
171+
result
172+
}
173+
44174
/// Extract leading numeric prefix from a string for `sort -n`.
45175
/// Real coreutils `sort -n` parses the leading numeric portion (optional sign,
46176
/// digits, optional decimal point and digits) and treats the rest as non-numeric.
@@ -124,7 +254,7 @@ impl Builtin for Sort {
124254
let mut month_sort = false;
125255
let mut merge = false;
126256
let mut delimiter: Option<char> = None;
127-
let mut key_field: Option<usize> = None;
257+
let mut key_specs: Vec<KeySpec> = Vec::new();
128258
let mut output_file: Option<String> = None;
129259
let mut zero_terminated = false;
130260
let mut files = Vec::new();
@@ -134,15 +264,7 @@ impl Builtin for Sort {
134264
if let Some(val) = p.flag_value_opt("-t") {
135265
delimiter = val.chars().next();
136266
} else if let Some(val) = p.flag_value_opt("-k") {
137-
// Parse key: "2" or "2,2" or "2n"
138-
let field_str: String = val.chars().take_while(|c| c.is_ascii_digit()).collect();
139-
key_field = field_str.parse().ok();
140-
if val.contains('n') {
141-
numeric = true;
142-
}
143-
if val.contains('r') {
144-
reverse = true;
145-
}
267+
key_specs.push(KeySpec::parse(val));
146268
} else if let Some(val) = p.flag_value_opt("-o") {
147269
output_file = Some(val.to_string());
148270
} else {
@@ -280,43 +402,59 @@ impl Builtin for Sort {
280402
}
281403

282404
// Get the key extractor
283-
let get_key = |line: &str| -> String {
284-
if let Some(kf) = key_field {
285-
extract_key(line, delimiter, kf)
405+
/// Compare two keys using the specified sort mode flags.
///
/// Exactly one comparison mode is applied, chosen by the first flag that is
/// set, in this order: `is_human` (values from `parse_human_numeric`),
/// `is_month` (values from `month_ordinal`), `is_numeric` (values from
/// `extract_numeric_prefix`), `is_fold_case` (both keys lowercased first).
/// With no flag set the keys are compared as plain strings.
/// In the human and numeric modes a `partial_cmp` that yields `None` is
/// treated as `Ordering::Equal`. Reversal is not handled here.
406+
fn compare_keys(
407+
ka: &str,
408+
kb: &str,
409+
is_numeric: bool,
410+
is_human: bool,
411+
is_month: bool,
412+
is_fold_case: bool,
413+
) -> std::cmp::Ordering {
414+
if is_human {
415+
let na = parse_human_numeric(ka);
416+
let nb = parse_human_numeric(kb);
417+
na.partial_cmp(&nb).unwrap_or(std::cmp::Ordering::Equal)
418+
} else if is_month {
419+
month_ordinal(ka).cmp(&month_ordinal(kb))
420+
} else if is_numeric {
421+
let na = extract_numeric_prefix(ka);
422+
let nb = extract_numeric_prefix(kb);
423+
na.partial_cmp(&nb).unwrap_or(std::cmp::Ordering::Equal)
424+
} else if is_fold_case {
425+
ka.to_lowercase().cmp(&kb.to_lowercase())
286426
} else {
287-
line.to_string()
427+
ka.cmp(kb)
288428
}
289-
};
429+
}
290430

291431
// Sort the lines
292432
let sort_fn = |a: &String, b: &String| -> std::cmp::Ordering {
293-
let ka = get_key(a);
294-
let kb = get_key(b);
295-
if human_numeric {
296-
let na = parse_human_numeric(&ka);
297-
let nb = parse_human_numeric(&kb);
298-
na.partial_cmp(&nb).unwrap_or(std::cmp::Ordering::Equal)
299-
} else if month_sort {
300-
let ma = month_ordinal(&ka);
301-
let mb = month_ordinal(&kb);
302-
ma.cmp(&mb)
303-
} else if numeric {
304-
let na = extract_numeric_prefix(&ka);
305-
let nb = extract_numeric_prefix(&kb);
306-
match na.partial_cmp(&nb).unwrap_or(std::cmp::Ordering::Equal) {
307-
std::cmp::Ordering::Equal => a.cmp(b),
308-
ord => ord,
309-
}
310-
} else if fold_case {
311-
let ord = ka.to_lowercase().cmp(&kb.to_lowercase());
312-
if ord == std::cmp::Ordering::Equal && key_field.is_some() {
313-
a.cmp(b)
314-
} else {
315-
ord
433+
if !key_specs.is_empty() {
434+
// Multi-key sort: compare by each key spec in order
435+
for key in &key_specs {
436+
let ka = extract_key_spec(a, delimiter, key);
437+
let kb = extract_key_spec(b, delimiter, key);
438+
// Per-key flags are OR-ed with the global flags: a key can enable a mode, but cannot switch off one that was set globally
439+
let ord = compare_keys(
440+
&ka,
441+
&kb,
442+
key.numeric || numeric,
443+
key.human_numeric || human_numeric,
444+
key.month_sort || month_sort,
445+
key.fold_case || fold_case,
446+
);
447+
let ord = if key.reverse { ord.reverse() } else { ord };
448+
if ord != std::cmp::Ordering::Equal {
449+
return ord;
450+
}
316451
}
452+
// All keys equal — fall back to full-line comparison
453+
a.cmp(b)
317454
} else {
318-
let ord = ka.cmp(&kb);
319-
if ord == std::cmp::Ordering::Equal && key_field.is_some() {
455+
// No key specs — use global flags on whole line
456+
let ord = compare_keys(a, b, numeric, human_numeric, month_sort, fold_case);
457+
if ord == std::cmp::Ordering::Equal {
320458
a.cmp(b)
321459
} else {
322460
ord

crates/bashkit/tests/spec_cases/bash/sortuniq.test.sh

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,3 +332,49 @@ printf '003\n010\n001\n' | sort -n
332332
003
333333
010
334334
### end
335+
336+
### sort_key_end_field
337+
# sort -k with start,end field spec
338+
printf 'c 3\na 1\nb 2\n' | sort -k2,2
339+
### expect
340+
a 1
341+
b 2
342+
c 3
343+
### end
344+
345+
### sort_key_numeric_per_key
346+
# sort -k with per-key numeric flag
347+
printf 'a 10\nb 2\nc 1\n' | sort -k2n,2
348+
### expect
349+
c 1
350+
b 2
351+
a 10
352+
### end
353+
354+
### sort_key_multiple
355+
# sort with multiple -k keys (primary lexical, secondary numeric)
356+
printf 'b 2\na 10\na 2\nb 1\n' | sort -k1,1 -k2n,2
357+
### expect
358+
a 2
359+
a 10
360+
b 1
361+
b 2
362+
### end
363+
364+
### sort_key_reverse_per_key
365+
# sort -k with per-key reverse flag
366+
printf 'a 1\nb 2\nc 3\n' | sort -k2r,2
367+
### expect
368+
c 3
369+
b 2
370+
a 1
371+
### end
372+
373+
### sort_key_char_position
374+
# sort -k with character positions
375+
printf 'abc\naab\nabc\n' | sort -k1.2,1.2
376+
### expect
377+
aab
378+
abc
379+
abc
380+
### end

0 commit comments

Comments
 (0)