chore: merge main, resolve eval-tasks.jsonl conflict

claude · claude · commit 7b43a409ebbc · 2026-02-27T04:51:44.000Z
Keep both our new tasks (code_search, environment, etc.) and main's new tasks (database_operations, config_management, build_simulation). Dataset now has 58 tasks across 15 categories. https://claude.ai/code/session_0158eammVU6hRXeg9VJZuCJz
diff --git a/crates/bashkit-eval/data/eval-tasks.jsonl b/crates/bashkit-eval/data/eval-tasks.jsonl
@@ -50,3 +50,9 @@
 {"id":"complex_test_output","category":"complex_tasks","description":"Parse test results to extract failures and generate summary report","prompt":"Read /data/test-results.txt which contains test output in a standard format. Parse it to: 1) Count total tests, passed, and failed. 2) Extract the names of all failing tests. 3) Generate a summary report at /reports/test-summary.md with a header '# Test Summary', a line 'Total: N | Passed: N | Failed: N', and a '## Failures' section listing each failed test. Print the summary.","files":{"/data/test-results.txt":"PASS test_login_valid\nPASS test_login_invalid_password\nFAIL test_login_expired_token\nPASS test_signup_new_user\nFAIL test_signup_duplicate_email\nPASS test_logout\nPASS test_password_reset\nFAIL test_session_timeout\nPASS test_profile_update\nPASS test_profile_delete\nPASS test_api_rate_limit\nPASS test_api_auth_header\n"},"expectations":[{"check":"file_exists:/reports/test-summary.md"},{"check":"file_contains:/reports/test-summary.md:# Test Summary"},{"check":"file_contains:/reports/test-summary.md:Total: 12"},{"check":"file_contains:/reports/test-summary.md:Passed: 9"},{"check":"file_contains:/reports/test-summary.md:Failed: 3"},{"check":"file_contains:/reports/test-summary.md:test_login_expired_token"},{"check":"file_contains:/reports/test-summary.md:test_signup_duplicate_email"},{"check":"file_contains:/reports/test-summary.md:test_session_timeout"},{"check":"stdout_contains:Failed: 3"},{"check":"exit_code:0"}]}
 {"id":"complex_debug_script","category":"complex_tasks","description":"Debug and fix a broken script using bash debugging features","prompt":"The script /scripts/broken.sh has bugs. Run it first to see the errors. Then examine the script, identify the bugs, fix them, and run the fixed version. The script should compute the factorial of 5 and print 'factorial(5) = 120'. The script should exit 0. Write the fixed version back to /scripts/broken.sh.","files":{"/scripts/broken.sh":"#!/bin/bash\n\nfactorial() {\n  local n=$1\n  if [ $n -le 1 ]; then\n    echo 1\n  else\n    local sub=$(factorial $((n-1)))\n    echo $((n * sub))\n  fi\n}\n\nresult=$(factorial 5\necho \"factorial(5) = $result\"\nexit 0\n"},"expectations":[{"check":"stdout_contains:factorial(5) = 120"},{"check":"file_exists:/scripts/broken.sh"},{"check":"exit_code:0"},{"check":"tool_calls_min:2"}]}
 {"id":"data_regex_extract","category":"data_transformation","description":"Extract structured data from log entries using regex and BASH_REMATCH","prompt":"Read /data/access.log where each line has format '[TIMESTAMP] STATUS_CODE METHOD URL DURATION_MS'. Use bash regex matching ([[ =~ ]]) with BASH_REMATCH to extract each field. Find all requests that took longer than 500ms. Print them as 'SLOW: METHOD URL took DURATIONms (STATUS)'. At the end, print 'Slow requests: N of M total'.","files":{"/data/access.log":"[2024-01-15T10:00:01] 200 GET /api/users 150\n[2024-01-15T10:00:02] 200 POST /api/orders 850\n[2024-01-15T10:00:03] 404 GET /api/missing 50\n[2024-01-15T10:00:04] 200 GET /api/reports 1200\n[2024-01-15T10:00:05] 500 POST /api/payments 2000\n[2024-01-15T10:00:06] 200 GET /api/health 30\n[2024-01-15T10:00:07] 200 GET /api/products 450\n[2024-01-15T10:00:08] 200 PUT /api/users 620\n"},"expectations":[{"check":"stdout_contains:/api/orders"},{"check":"stdout_contains:/api/reports"},{"check":"stdout_contains:/api/payments"},{"check":"stdout_contains:620"},{"check":"stdout_regex:4.*8|4 of 8|4 slow"},{"check":"exit_code:0"}]}
+{"id":"db_csv_group_by","category":"database_operations","description":"GROUP BY with aggregation on CSV data","prompt":"Read /data/sales.csv with columns: region, product, amount. Compute total amount per region (like SQL GROUP BY region, SUM(amount)). Print results as 'region: total' sorted by total descending.","files":{"/data/sales.csv":"region,product,amount\nnorth,widgets,500\nsouth,gadgets,300\nnorth,bolts,200\neast,widgets,400\nsouth,widgets,350\nnorth,gadgets,150\neast,bolts,250\nsouth,bolts,100\n"},"expectations":[{"check":"stdout_contains:north"},{"check":"stdout_contains:850"},{"check":"stdout_contains:south"},{"check":"stdout_contains:750"},{"check":"stdout_contains:east"},{"check":"stdout_contains:650"},{"check":"exit_code:0"}]}
+{"id":"db_csv_join_aggregate","category":"database_operations","description":"Join two CSVs and compute per-group statistics","prompt":"Join /data/orders.csv and /data/products.csv on product_id. For each category, compute the total revenue (quantity * price). Print 'category: total_revenue' sorted by revenue descending.","files":{"/data/orders.csv":"order_id,product_id,quantity\n1,101,3\n2,102,5\n3,101,2\n4,103,1\n5,102,4\n6,103,3\n","/data/products.csv":"product_id,name,category,price\n101,Widget,hardware,25\n102,Gadget,electronics,50\n103,Bolt,hardware,10\n"},"expectations":[{"check":"stdout_contains:electronics"},{"check":"stdout_contains:450"},{"check":"stdout_contains:hardware"},{"check":"stdout_contains:165"},{"check":"exit_code:0"}]}
+{"id":"config_env_template","category":"config_management","description":"Generate .env file from template with defaults","prompt":"Read /config/template.env which has lines like 'KEY=${VALUE:-default}'. For each line, check if the key exists in /config/overrides.txt (format: KEY=value). If it does, use the override value; otherwise use the default from the template. Write the final KEY=value pairs to /app/.env and print the result.","files":{"/config/template.env":"DB_HOST=${DB_HOST:-localhost}\nDB_PORT=${DB_PORT:-5432}\nDB_NAME=${DB_NAME:-myapp}\nREDIS_URL=${REDIS_URL:-redis://localhost:6379}\nLOG_LEVEL=${LOG_LEVEL:-info}\n","/config/overrides.txt":"DB_HOST=db.prod.internal\nLOG_LEVEL=warn\n"},"expectations":[{"check":"file_exists:/app/.env"},{"check":"file_contains:/app/.env:DB_HOST=db.prod.internal"},{"check":"file_contains:/app/.env:DB_PORT=5432"},{"check":"file_contains:/app/.env:DB_NAME=myapp"},{"check":"file_contains:/app/.env:LOG_LEVEL=warn"},{"check":"stdout_contains:db.prod.internal"},{"check":"exit_code:0"}]}
+{"id":"config_ini_merge","category":"config_management","description":"Merge INI config files with section-aware override","prompt":"Merge /config/defaults.ini and /config/custom.ini. Custom values should override defaults within the same section. Keys only in defaults should be preserved. Write the merged result to /config/merged.ini and print it. Sections are denoted by [section_name] headers.","files":{"/config/defaults.ini":"[server]\nhost=0.0.0.0\nport=8080\nworkers=4\n\n[database]\nhost=localhost\nport=5432\npool_size=5\n\n[logging]\nlevel=info\nformat=text\n","/config/custom.ini":"[server]\nport=9090\nworkers=8\n\n[logging]\nlevel=debug\nformat=json\n"},"expectations":[{"check":"file_exists:/config/merged.ini"},{"check":"file_contains:/config/merged.ini:port=9090"},{"check":"file_contains:/config/merged.ini:workers=8"},{"check":"file_contains:/config/merged.ini:host=0.0.0.0"},{"check":"file_contains:/config/merged.ini:pool_size=5"},{"check":"file_contains:/config/merged.ini:level=debug"},{"check":"exit_code:0"}]}
+{"id":"build_multi_stage","category":"build_simulation","description":"Multi-stage build pipeline with dependency checking","prompt":"Simulate a build pipeline: 1) Check that /src/main.c and /src/utils.c exist, 2) 'Compile' each .c file by copying it to /build/ with a .o extension and prepending '// compiled', 3) 'Link' by concatenating all .o files into /build/program with a '// linked' header, 4) 'Package' by creating a tar.gz of /build/ at /dist/release.tar.gz. Print status for each stage. If any stage fails, stop and report the error.","files":{"/src/main.c":"int main() { return helper(); }\n","/src/utils.c":"int helper() { return 0; }\n"},"expectations":[{"check":"file_exists:/build/main.o"},{"check":"file_exists:/build/utils.o"},{"check":"file_exists:/build/program"},{"check":"file_contains:/build/program:compiled"},{"check":"file_exists:/dist/release.tar.gz"},{"check":"exit_code:0"}]}
+{"id":"build_script_generator","category":"build_simulation","description":"Generate a Makefile-like build script from dependency spec","prompt":"Read /project/deps.txt which lists build targets and their dependencies (format: 'target: dep1 dep2'). Generate /project/build.sh that builds targets in correct dependency order (dependencies before dependents). Then run the build script, which should create each target as a file in /project/out/ containing 'built: <target>'. Print the build order.","files":{"/project/deps.txt":"app: lib utils\nlib: core\nutils: core\ncore:\n","/project/src/core":"core source\n","/project/src/lib":"lib source\n","/project/src/utils":"utils source\n","/project/src/app":"app source\n"},"expectations":[{"check":"file_exists:/project/build.sh"},{"check":"file_exists:/project/out/core"},{"check":"file_exists:/project/out/lib"},{"check":"file_exists:/project/out/app"},{"check":"exit_code:0"}]}
diff --git a/crates/bashkit/src/builtins/wc.rs b/crates/bashkit/src/builtins/wc.rs
@@ -78,6 +78,20 @@ impl WcFlags {
             max_line_length,
         }
     }
+
+    /// Number of count flags (lines, words, bytes, chars, max line length) that are enabled.
+    fn active_count(&self) -> usize {
+        [
+            self.lines,
+            self.words,
+            self.bytes,
+            self.chars,
+            self.max_line_length,
+        ]
+        .iter()
+        .filter(|&&b| b)
+        .count()
+    }
 }
 
 #[async_trait]
@@ -102,7 +116,9 @@ impl Builtin for Wc {
             // Read from stdin
             if let Some(stdin) = ctx.stdin {
                 let counts = count_text(stdin);
-                output.push_str(&format_counts(&counts, &flags, None));
+                // Real bash: no padding for single-value stdin, padded for multiple values
+                let padded = flags.active_count() > 1;
+                output.push_str(&format_counts(&counts, &flags, None, padded));
                 output.push('\n');
             }
         } else {
@@ -127,7 +143,7 @@ impl Builtin for Wc {
                             total_max_line = counts.max_line_length;
                         }
 
-                        output.push_str(&format_counts(&counts, &flags, Some(file)));
+                        output.push_str(&format_counts(&counts, &flags, Some(file), true));
                         output.push('\n');
                     }
                     Err(e) => {
@@ -145,7 +161,12 @@ impl Builtin for Wc {
                     chars: total_chars,
                     max_line_length: total_max_line,
                 };
-                output.push_str(&format_counts(&totals, &flags, Some(&"total".to_string())));
+                output.push_str(&format_counts(
+                    &totals,
+                    &flags,
+                    Some(&"total".to_string()),
+                    true,
+                ));
                 output.push('\n');
             }
         }
@@ -178,32 +199,47 @@ fn count_text(text: &str) -> TextCounts {
     }
 }
 
-/// Format counts for output
-fn format_counts(counts: &TextCounts, flags: &WcFlags, filename: Option<&String>) -> String {
-    let mut parts = Vec::new();
+/// Format counts for output.
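+/// When `padded` is true, right-align each number in a 7-char field, joined by a single space (used for file output and multi-count stdin output).
+/// When `padded` is false, emit the bare numbers joined by a single space (used for single-count stdin output, matching real bash).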
+fn format_counts(
+    counts: &TextCounts,
+    flags: &WcFlags,
+    filename: Option<&String>,
+    padded: bool,
+) -> String {
+    let mut values: Vec<usize> = Vec::new();
 
     if flags.lines {
-        parts.push(format!("{:>8}", counts.lines));
+        values.push(counts.lines);
     }
     if flags.words {
-        parts.push(format!("{:>8}", counts.words));
+        values.push(counts.words);
     }
     if flags.bytes {
-        parts.push(format!("{:>8}", counts.bytes));
+        values.push(counts.bytes);
     }
     if flags.chars {
-        parts.push(format!("{:>8}", counts.chars));
+        values.push(counts.chars);
     }
     if flags.max_line_length {
-        parts.push(format!("{:>8}", counts.max_line_length));
+        values.push(counts.max_line_length);
     }
 
-    let mut result = parts.join("");
+    let result = if padded {
+        // Real bash uses 7-char wide fields separated by a space
+        let parts: Vec<String> = values.iter().map(|v| format!("{:>7}", v)).collect();
+        parts.join(" ")
+    } else {
+        let parts: Vec<String> = values.iter().map(|v| v.to_string()).collect();
+        parts.join(" ")
+    };
+
     if let Some(name) = filename {
-        result.push(' ');
-        result.push_str(name);
+        format!("{} {}", result, name)
+    } else {
+        result
     }
-    result
 }
 
 #[cfg(test)]
diff --git a/crates/bashkit/src/error.rs b/crates/bashkit/src/error.rs
@@ -49,6 +49,10 @@ pub enum Error {
     #[error("network error: {0}")]
     Network(String),
 
+    /// Regex compilation or matching error.
+    #[error("regex error: {0}")]
+    Regex(#[from] regex::Error),
+
     /// Internal error for unexpected failures.
     ///
     /// THREAT[TM-INT-002]: Unexpected internal failures should not crash the interpreter.
diff --git a/crates/bashkit/src/tool.rs b/crates/bashkit/src/tool.rs
@@ -42,8 +42,21 @@ use std::sync::{Arc, Mutex};
 /// Library version from Cargo.toml
 pub const VERSION: &str = env!("CARGO_PKG_VERSION");
 
-/// List of built-in commands
-const BUILTINS: &str = "echo cat grep sed awk jq curl head tail sort uniq cut tr wc date sleep mkdir rm cp mv touch chmod printf test [ true false exit cd pwd ls find xargs basename dirname env export read";
+/// List of built-in commands (organized by category)
+const BUILTINS: &str = "\
+echo printf cat read \
+grep sed awk jq head tail sort uniq cut tr wc nl paste column comm diff strings tac rev \
+cd pwd ls find mkdir mktemp rm rmdir cp mv touch chmod chown ln \
+file stat less tar gzip gunzip du df \
+test [ true false exit return break continue \
+export set unset local shift source eval declare typeset readonly shopt getopts \
+sleep date seq expr yes wait timeout xargs tee watch \
+basename dirname realpath \
+pushd popd dirs \
+whoami hostname uname id env printenv history \
+curl wget \
+od xxd hexdump base64 \
+kill";
 
 /// Base help documentation template (generic help format)
 const BASE_HELP: &str = r#"BASH(1)                          User Commands                         BASH(1)
@@ -62,10 +75,22 @@ DESCRIPTION
        loops, conditionals, functions, and arrays.
 
 BUILTINS
-       echo, cat, grep, sed, awk, jq, curl, head, tail, sort, uniq, cut, tr,
-       wc, date, sleep, mkdir, rm, cp, mv, touch, chmod, printf, test, [,
-       true, false, exit, cd, pwd, ls, find, xargs, basename, dirname, env,
-       export, read
+   Core I/O:        echo, printf, cat, read
+   Text Processing: grep, sed, awk, jq, head, tail, sort, uniq, cut, tr, wc,
+                     nl, paste, column, comm, diff, strings, tac, rev
+   File Operations: cd, pwd, ls, find, mkdir, mktemp, rm, rmdir, cp, mv,
+                     touch, chmod, chown, ln
+   File Inspection: file, stat, less, tar, gzip, gunzip, du, df
+   Flow Control:    test, [, true, false, exit, return, break, continue
+   Shell/Variables:  export, set, unset, local, shift, source, eval, declare,
+                     typeset, readonly, shopt, getopts
+   Utilities:       sleep, date, seq, expr, yes, wait, timeout, xargs, tee,
+                     watch, basename, dirname, realpath
+   Dir Stack:       pushd, popd, dirs
+   System Info:     whoami, hostname, uname, id, env, printenv, history
+   Network:         curl, wget
+   Binary/Hex:      od, xxd, hexdump, base64
+   Signals:         kill
 
 INPUT
        commands    Bash commands to execute (like bash -c "commands")
@@ -671,6 +696,7 @@ fn error_kind(e: &Error) -> String {
         Error::CommandNotFound(_) => "command_not_found".to_string(),
         Error::ResourceLimit(_) => "resource_limit".to_string(),
         Error::Network(_) => "network_error".to_string(),
+        Error::Regex(_) => "regex_error".to_string(),
         Error::Internal(_) => "internal_error".to_string(),
     }
 }
diff --git a/crates/bashkit/tests/spec_cases/bash/wc.test.sh b/crates/bashkit/tests/spec_cases/bash/wc.test.sh
@@ -1,158 +1,139 @@
 ### wc_lines_only
-### bash_diff: Bashkit wc uses fixed-width padding for stdin, real bash uses no padding
 # Count lines with -l
 printf 'a\nb\nc\n' | wc -l
 ### expect
-       3
+3
 ### end
 
 ### wc_words_only
-### bash_diff: Bashkit wc uses fixed-width padding for stdin, real bash uses no padding
 # Count words with -w
 printf 'one two three four five' | wc -w
 ### expect
-       5
+5
 ### end
 
 ### wc_bytes_only
-### bash_diff: Bashkit wc uses fixed-width padding for stdin, real bash uses no padding
 # Count bytes with -c
 printf 'hello' | wc -c
 ### expect
-       5
+5
 ### end
 
 ### wc_empty
-### bash_diff: Bashkit wc uses fixed-width padding for stdin, real bash uses no padding
 # Empty input
 printf '' | wc -l
 ### expect
-       0
+0
 ### end
 
 ### wc_all_flags
-### bash_diff: Bashkit wc uses fixed-width padding for stdin, real bash uses no padding
 # All counts (default)
 printf 'hello world\n' | wc
 ### expect
-       1       2      12
+      1       2      12
 ### end
 
 ### wc_multiple_lines
-### bash_diff: Bashkit wc uses fixed-width padding for stdin, real bash uses no padding
 # Multiple lines
 printf 'one\ntwo\nthree\n' | wc -l
 ### expect
-       3
+3
 ### end
 
 ### wc_chars_m_flag
-### bash_diff: Bashkit wc uses fixed-width padding for stdin, real bash uses no padding
 # Count characters with -m
 printf 'hello' | wc -m
 ### expect
-       5
+5
 ### end
 
 ### wc_lines_words
-### bash_diff: Bashkit wc uses fixed-width padding for stdin, real bash uses no padding
 # Lines and words combined
 printf 'one two\nthree four\n' | wc -lw
 ### expect
-       2       4
+      2       4
 ### end
 
 ### wc_no_newline_at_end
-### bash_diff: Bashkit wc uses fixed-width padding for stdin, real bash uses no padding
 # Input without trailing newline
 printf 'hello world' | wc -w
 ### expect
-       2
+2
 ### end
 
 ### wc_multiple_spaces
-### bash_diff: Bashkit wc uses fixed-width padding for stdin, real bash uses no padding
 # Multiple spaces between words
 printf 'hello   world' | wc -w
 ### expect
-       2
+2
 ### end
 
 ### wc_tabs_count
-### bash_diff: Bashkit wc uses fixed-width padding for stdin, real bash uses no padding
 # Tabs in input
 printf 'a\tb\tc' | wc -w
 ### expect
-       3
+3
 ### end
 
 ### wc_single_word
-### bash_diff: Bashkit wc uses fixed-width padding for stdin, real bash uses no padding
 # Single word
 printf 'word' | wc -w
 ### expect
-       1
+1
 ### end
 
 ### wc_only_whitespace
-### bash_diff: Bashkit wc uses fixed-width padding for stdin, real bash uses no padding
 # Only whitespace
 printf '   \t   ' | wc -w
 ### expect
-       0
+0
 ### end
 
 ### wc_max_line_length
-### bash_diff: Bashkit wc uses fixed-width padding for stdin, real bash uses no padding
 printf 'short\nlongerline\n' | wc -L
 ### expect
-      10
+10
 ### end
 
 ### wc_long_flags
-### bash_diff: Bashkit wc uses fixed-width padding for stdin, real bash uses no padding
 # Long flag --lines
 printf 'a\nb\n' | wc --lines
 ### expect
-       2
+2
 ### end
 
 ### wc_long_words
-### bash_diff: Bashkit wc uses fixed-width padding for stdin, real bash uses no padding
 # Long flag --words
 printf 'one two three' | wc --words
 ### expect
-       3
+3
 ### end
 
 ### wc_long_bytes
-### bash_diff: Bashkit wc uses fixed-width padding for stdin, real bash uses no padding
 # Long flag --bytes
 printf 'hello' | wc --bytes
 ### expect
-       5
+5
 ### end
 
 ### wc_bytes_vs_chars
-### bash_diff: Bashkit wc uses fixed-width padding for stdin, real bash uses no padding
 # Bytes vs chars for ASCII
 printf 'hello' | wc -c && printf 'hello' | wc -m
 ### expect
-       5
-       5
+5
+5
 ### end
 
 ### wc_unicode_chars
-### bash_diff: Bashkit wc uses fixed-width padding for stdin, real bash uses no padding
+### bash_diff: locale-dependent; real bash wc -m may count bytes in C locale
 printf 'héllo' | wc -m
 ### expect
-       5
+5
 ### end
 
 ### wc_unicode_bytes
-### bash_diff: Bashkit wc uses fixed-width padding for stdin, real bash uses no padding
 # Unicode byte count
 printf 'héllo' | wc -c
 ### expect
-       6
+6
 ### end
diff --git a/crates/bashkit/tests/spec_tests.rs b/crates/bashkit/tests/spec_tests.rs