Skip to content

Commit 1296499

Browse files
authored
feat(builtins): full sort -k KEYDEF parsing with multi-key support (#920)
Closes #906
1 parent 9339490 commit 1296499

File tree

2 files changed

+283
-60
lines changed

2 files changed

+283
-60
lines changed

crates/bashkit/src/builtins/sortuniq.rs

Lines changed: 237 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -25,20 +25,151 @@ use crate::interpreter::ExecResult;
2525
/// -o Write output to FILE
2626
pub struct Sort;
2727

28-
/// Extract the sort key from a line based on field delimiter and key spec
29-
fn extract_key(line: &str, delimiter: Option<char>, key_field: usize) -> String {
28+
/// A parsed sort key definition from `-k KEYDEF`.
29+
/// Format: `START[.CHAR][FLAGS][,END[.CHAR][FLAGS]]`
30+
#[derive(Clone, Debug)]
31+
struct KeySpec {
32+
start_field: usize,
33+
start_char: usize, // 0 = whole field
34+
end_field: usize, // 0 = end of line
35+
end_char: usize, // 0 = end of field
36+
numeric: bool,
37+
reverse: bool,
38+
fold_case: bool,
39+
human_numeric: bool,
40+
month_sort: bool,
41+
#[allow(dead_code)] // Used when combined with sort -V feature
42+
version_sort: bool,
43+
}
44+
45+
impl KeySpec {
46+
/// Parse a KEYDEF string like "2", "2,3", "2.3,3.4", "2n,2", "2nr"
47+
fn parse(spec: &str) -> Self {
48+
let (start_part, end_part) = if let Some(comma) = spec.find(',') {
49+
(&spec[..comma], Some(&spec[comma + 1..]))
50+
} else {
51+
(spec, None)
52+
};
53+
54+
let (start_field, start_char, start_flags) = Self::parse_field_spec(start_part);
55+
let (end_field, end_char, end_flags) = if let Some(ep) = end_part {
56+
let (f, c, fl) = Self::parse_field_spec(ep);
57+
(f, c, fl)
58+
} else {
59+
// No comma: -k2 means "field 2 only" (same as -k2,2)
60+
(start_field, 0, String::new())
61+
};
62+
63+
// Merge flags from both start and end parts
64+
let all_flags: String = format!("{}{}", start_flags, end_flags);
65+
66+
KeySpec {
67+
start_field,
68+
start_char,
69+
end_field,
70+
end_char,
71+
numeric: all_flags.contains('n'),
72+
reverse: all_flags.contains('r'),
73+
fold_case: all_flags.contains('f'),
74+
human_numeric: all_flags.contains('h'),
75+
month_sort: all_flags.contains('M'),
76+
version_sort: all_flags.contains('V'),
77+
}
78+
}
79+
80+
/// Parse "FIELD[.CHAR][FLAGS]" → (field, char_pos, flags_string)
81+
fn parse_field_spec(s: &str) -> (usize, usize, String) {
82+
let mut i = 0;
83+
let chars: Vec<char> = s.chars().collect();
84+
// Parse field number
85+
while i < chars.len() && chars[i].is_ascii_digit() {
86+
i += 1;
87+
}
88+
let field: usize = s[..i].parse().unwrap_or(0);
89+
90+
// Parse optional .CHAR
91+
let mut char_pos = 0;
92+
if i < chars.len() && chars[i] == '.' {
93+
i += 1;
94+
let start = i;
95+
while i < chars.len() && chars[i].is_ascii_digit() {
96+
i += 1;
97+
}
98+
char_pos = s[start..i].parse().unwrap_or(0);
99+
}
100+
101+
// Remaining chars are flags
102+
let flags = s[i..].to_string();
103+
(field, char_pos, flags)
104+
}
105+
}
106+
107+
/// Split a line into fields using the given delimiter
108+
fn split_fields(line: &str, delimiter: Option<char>) -> Vec<&str> {
30109
if let Some(delim) = delimiter {
31-
line.split(delim)
32-
.nth(key_field.saturating_sub(1))
33-
.unwrap_or("")
34-
.to_string()
110+
line.split(delim).collect()
35111
} else {
36-
// Default: whitespace-separated fields
37-
line.split_whitespace()
38-
.nth(key_field.saturating_sub(1))
39-
.unwrap_or("")
40-
.to_string()
112+
line.split_whitespace().collect()
113+
}
114+
}
115+
116+
/// Extract the sort key from a line based on field delimiter and key spec
117+
fn extract_key_spec(line: &str, delimiter: Option<char>, key: &KeySpec) -> String {
118+
let fields = split_fields(line, delimiter);
119+
if fields.is_empty() || key.start_field == 0 {
120+
return line.to_string();
121+
}
122+
123+
let start_idx = key.start_field.saturating_sub(1);
124+
if start_idx >= fields.len() {
125+
return String::new();
126+
}
127+
128+
let end_idx = if key.end_field == 0 {
129+
fields.len() - 1
130+
} else {
131+
(key.end_field.saturating_sub(1)).min(fields.len() - 1)
132+
};
133+
134+
if start_idx > end_idx {
135+
return String::new();
136+
}
137+
138+
if start_idx == end_idx {
139+
let field = fields[start_idx];
140+
let start_c = if key.start_char > 0 {
141+
(key.start_char - 1).min(field.len())
142+
} else {
143+
0
144+
};
145+
let end_c = if key.end_char > 0 {
146+
key.end_char.min(field.len())
147+
} else {
148+
field.len()
149+
};
150+
if start_c >= end_c {
151+
return String::new();
152+
}
153+
return field[start_c..end_c].to_string();
41154
}
155+
156+
// Multi-field key
157+
let mut result = String::new();
158+
for (i, field) in fields.iter().enumerate().take(end_idx + 1).skip(start_idx) {
159+
if i > start_idx {
160+
result.push(delimiter.unwrap_or(' '));
161+
}
162+
if i == start_idx && key.start_char > 0 {
163+
let sc = (key.start_char - 1).min(field.len());
164+
result.push_str(&field[sc..]);
165+
} else if i == end_idx && key.end_char > 0 {
166+
let ec = key.end_char.min(field.len());
167+
result.push_str(&field[..ec]);
168+
} else {
169+
result.push_str(field);
170+
}
171+
}
172+
result
42173
}
43174

44175
/// Extract leading numeric prefix from a string for `sort -n`.
@@ -194,7 +325,7 @@ impl Builtin for Sort {
194325
let mut version_sort = false;
195326
let mut merge = false;
196327
let mut delimiter: Option<char> = None;
197-
let mut key_field: Option<usize> = None;
328+
let mut key_specs: Vec<KeySpec> = Vec::new();
198329
let mut output_file: Option<String> = None;
199330
let mut zero_terminated = false;
200331
let mut files = Vec::new();
@@ -204,15 +335,7 @@ impl Builtin for Sort {
204335
if let Some(val) = p.flag_value_opt("-t") {
205336
delimiter = val.chars().next();
206337
} else if let Some(val) = p.flag_value_opt("-k") {
207-
// Parse key: "2" or "2,2" or "2n"
208-
let field_str: String = val.chars().take_while(|c| c.is_ascii_digit()).collect();
209-
key_field = field_str.parse().ok();
210-
if val.contains('n') {
211-
numeric = true;
212-
}
213-
if val.contains('r') {
214-
reverse = true;
215-
}
338+
key_specs.push(KeySpec::parse(val));
216339
} else if let Some(val) = p.flag_value_opt("-o") {
217340
output_file = Some(val.to_string());
218341
} else {
@@ -351,48 +474,75 @@ impl Builtin for Sort {
351474
}
352475

353476
// Get the key extractor
354-
let get_key = |line: &str| -> String {
355-
if let Some(kf) = key_field {
356-
extract_key(line, delimiter, kf)
477+
/// Compare two keys using the specified sort mode flags
478+
fn compare_keys(
479+
ka: &str,
480+
kb: &str,
481+
is_numeric: bool,
482+
is_human: bool,
483+
is_month: bool,
484+
is_fold_case: bool,
485+
) -> std::cmp::Ordering {
486+
if is_human {
487+
let na = parse_human_numeric(ka);
488+
let nb = parse_human_numeric(kb);
489+
na.partial_cmp(&nb).unwrap_or(std::cmp::Ordering::Equal)
490+
} else if is_month {
491+
month_ordinal(ka).cmp(&month_ordinal(kb))
492+
} else if is_numeric {
493+
let na = extract_numeric_prefix(ka);
494+
let nb = extract_numeric_prefix(kb);
495+
na.partial_cmp(&nb).unwrap_or(std::cmp::Ordering::Equal)
496+
} else if is_fold_case {
497+
ka.to_lowercase().cmp(&kb.to_lowercase())
357498
} else {
358-
line.to_string()
499+
ka.cmp(kb)
359500
}
360-
};
501+
}
361502

362503
// Sort the lines
363504
let sort_fn = |a: &String, b: &String| -> std::cmp::Ordering {
364-
let ka = get_key(a);
365-
let kb = get_key(b);
366-
if version_sort {
367-
version_cmp(&ka, &kb)
368-
} else if human_numeric {
369-
let na = parse_human_numeric(&ka);
370-
let nb = parse_human_numeric(&kb);
371-
na.partial_cmp(&nb).unwrap_or(std::cmp::Ordering::Equal)
372-
} else if month_sort {
373-
let ma = month_ordinal(&ka);
374-
let mb = month_ordinal(&kb);
375-
ma.cmp(&mb)
376-
} else if numeric {
377-
let na = extract_numeric_prefix(&ka);
378-
let nb = extract_numeric_prefix(&kb);
379-
match na.partial_cmp(&nb).unwrap_or(std::cmp::Ordering::Equal) {
380-
std::cmp::Ordering::Equal => a.cmp(b),
381-
ord => ord,
382-
}
383-
} else if fold_case {
384-
let ord = ka.to_lowercase().cmp(&kb.to_lowercase());
385-
if ord == std::cmp::Ordering::Equal && key_field.is_some() {
386-
a.cmp(b)
387-
} else {
388-
ord
505+
if !key_specs.is_empty() {
506+
// Multi-key sort: compare by each key spec in order
507+
for key in &key_specs {
508+
let ka = extract_key_spec(a, delimiter, key);
509+
let kb = extract_key_spec(b, delimiter, key);
510+
// Per-key flags override global flags
511+
let ord = if key.version_sort || version_sort {
512+
version_cmp(&ka, &kb)
513+
} else {
514+
compare_keys(
515+
&ka,
516+
&kb,
517+
key.numeric || numeric,
518+
key.human_numeric || human_numeric,
519+
key.month_sort || month_sort,
520+
key.fold_case || fold_case,
521+
)
522+
};
523+
let ord = if key.reverse { ord.reverse() } else { ord };
524+
if ord != std::cmp::Ordering::Equal {
525+
return ord;
526+
}
389527
}
528+
// All keys equal — fall back to full-line comparison
529+
a.cmp(b)
390530
} else {
391-
let ord = ka.cmp(&kb);
392-
if ord == std::cmp::Ordering::Equal && key_field.is_some() {
393-
a.cmp(b)
531+
// No key specs — use global flags on whole line
532+
if version_sort {
533+
let ord = version_cmp(a, b);
534+
if ord == std::cmp::Ordering::Equal {
535+
a.cmp(b)
536+
} else {
537+
ord
538+
}
394539
} else {
395-
ord
540+
let ord = compare_keys(a, b, numeric, human_numeric, month_sort, fold_case);
541+
if ord == std::cmp::Ordering::Equal {
542+
a.cmp(b)
543+
} else {
544+
ord
545+
}
396546
}
397547
}
398548
};
@@ -761,11 +911,38 @@ mod tests {
761911
}
762912

763913
#[tokio::test]
764-
async fn test_extract_key() {
765-
assert_eq!(extract_key("a:b:c", Some(':'), 2), "b");
766-
assert_eq!(extract_key("hello world", None, 1), "hello");
767-
assert_eq!(extract_key("hello world", None, 2), "world");
768-
assert_eq!(extract_key("x", None, 5), "");
914+
async fn test_extract_key_spec() {
915+
let key = KeySpec::parse("2");
916+
assert_eq!(extract_key_spec("a:b:c", Some(':'), &key), "b");
917+
let key1 = KeySpec::parse("1");
918+
assert_eq!(extract_key_spec("hello world", None, &key1), "hello");
919+
let key2 = KeySpec::parse("2");
920+
assert_eq!(extract_key_spec("hello world", None, &key2), "world");
921+
let key5 = KeySpec::parse("5");
922+
assert_eq!(extract_key_spec("x", None, &key5), "");
923+
}
924+
925+
#[tokio::test]
926+
async fn test_keyspec_parse() {
927+
let k = KeySpec::parse("2n,3r");
928+
assert_eq!(k.start_field, 2);
929+
assert_eq!(k.end_field, 3);
930+
assert!(k.numeric);
931+
assert!(k.reverse);
932+
933+
let k2 = KeySpec::parse("1.2,1.5f");
934+
assert_eq!(k2.start_field, 1);
935+
assert_eq!(k2.start_char, 2);
936+
assert_eq!(k2.end_field, 1);
937+
assert_eq!(k2.end_char, 5);
938+
assert!(k2.fold_case);
939+
}
940+
941+
#[tokio::test]
942+
async fn test_version_cmp() {
943+
assert_eq!(version_cmp("1.2", "1.10"), std::cmp::Ordering::Less);
944+
assert_eq!(version_cmp("2.0", "1.9"), std::cmp::Ordering::Greater);
945+
assert_eq!(version_cmp("1.0", "1.0"), std::cmp::Ordering::Equal);
769946
}
770947

771948
#[tokio::test]

crates/bashkit/tests/spec_cases/bash/sortuniq.test.sh

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -379,3 +379,49 @@ a2
379379
a10
380380
a20
381381
### end
382+
383+
### sort_key_end_field
384+
# sort -k with start,end field spec
385+
printf 'c 3\na 1\nb 2\n' | sort -k2,2
386+
### expect
387+
a 1
388+
b 2
389+
c 3
390+
### end
391+
392+
### sort_key_numeric_per_key
393+
# sort -k with per-key numeric flag
394+
printf 'a 10\nb 2\nc 1\n' | sort -k2n,2
395+
### expect
396+
c 1
397+
b 2
398+
a 10
399+
### end
400+
401+
### sort_key_multiple
402+
# sort with multiple -k keys (primary lexical, secondary numeric)
403+
printf 'b 2\na 10\na 2\nb 1\n' | sort -k1,1 -k2n,2
404+
### expect
405+
a 2
406+
a 10
407+
b 1
408+
b 2
409+
### end
410+
411+
### sort_key_reverse_per_key
412+
# sort -k with per-key reverse flag
413+
printf 'a 1\nb 2\nc 3\n' | sort -k2r,2
414+
### expect
415+
c 3
416+
b 2
417+
a 1
418+
### end
419+
420+
### sort_key_char_position
421+
# sort -k with character positions
422+
printf 'abc\naab\nabc\n' | sort -k1.2,1.2
423+
### expect
424+
aab
425+
abc
426+
abc
427+
### end

0 commit comments

Comments
 (0)