From 71e4c1c56558fb2c22c9f0c2bf32675023d9f965 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 18 Jan 2026 02:14:18 -0500 Subject: [PATCH 01/25] feat(output): add JSON, table, and YARA formatters Signed-off-by: UncleSp1d3r --- src/lib.rs | 3 + src/output/json.rs | 9 ++ src/output/mod.rs | 305 +++++++++++++++++++++++++++++++++++++++++++- src/output/table.rs | 9 ++ src/output/yara.rs | 9 ++ 5 files changed, 334 insertions(+), 1 deletion(-) create mode 100644 src/output/json.rs create mode 100644 src/output/table.rs create mode 100644 src/output/yara.rs diff --git a/src/lib.rs b/src/lib.rs index 8dfb54b..afdfc3b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -84,3 +84,6 @@ pub use extraction::{ AsciiExtractionConfig, BasicExtractor, CanonicalString, ExtractionConfig, StringExtractor, StringOccurrence, Utf16ExtractionConfig, deduplicate, }; + +// Re-export output infrastructure types +pub use output::{OutputFormat, OutputMetadata, format_output}; diff --git a/src/output/json.rs b/src/output/json.rs new file mode 100644 index 0000000..e183a25 --- /dev/null +++ b/src/output/json.rs @@ -0,0 +1,9 @@ +use crate::types::{FoundString, Result}; + +use super::OutputMetadata; + +/// Format strings as JSONL output, one object per line. +pub fn format_json(_strings: &[FoundString], _metadata: &OutputMetadata) -> Result { + // TODO: Implement JSON formatter in a subsequent phase. + Ok(String::new()) +} diff --git a/src/output/mod.rs b/src/output/mod.rs index 34403c4..bf97cb9 100644 --- a/src/output/mod.rs +++ b/src/output/mod.rs @@ -1 +1,304 @@ -// Output formatting +//! Output formatting infrastructure for Stringy. +//! +//! This module provides the core dispatch logic and shared metadata for output +//! formatters. Concrete formatters live in submodules and are selected via the +//! `OutputFormat` enum. +//! +//! Supported formats: +//! - Table (human-readable, TTY-friendly) +//! - JSON (JSONL, one object per line) +//! - YARA (rule template output) +//! +//! ## Example +//! 
+//! ```rust +//! use stringy::{format_output, FoundString, OutputFormat, OutputMetadata}; +//! use stringy::types::{Encoding, StringSource}; +//! +//! let strings = vec![FoundString::new( +//! "example".to_string(), +//! Encoding::Ascii, +//! 0, +//! 7, +//! StringSource::SectionData, +//! )]; +//! +//! let metadata = OutputMetadata::new( +//! "sample.bin".to_string(), +//! OutputFormat::Table, +//! strings.len(), +//! strings.len(), +//! ); +//! +//! let output = format_output(&strings, &metadata)?; +//! # Ok::<(), stringy::StringyError>(()) +//! ``` + +use crate::types::{FoundString, Result}; + +pub mod json; +pub mod table; +pub mod yara; + +pub use json::format_json; +pub use table::format_table; +pub use yara::format_yara; + +/// Output format selection for Stringy formatters. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum OutputFormat { + /// Human-readable table format with TTY detection. + Table, + /// JSONL output, one JSON object per line. + Json, + /// YARA rule template output. + Yara, +} + +/// Metadata describing the output context. +/// +/// This struct is marked `#[non_exhaustive]` to allow adding new fields without +/// breaking downstream code. Use `OutputMetadata::new()` to construct instances. +#[non_exhaustive] +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct OutputMetadata { + /// Name of the analyzed binary file. + pub binary_name: String, + /// Output format to be used. + pub format: OutputFormat, + /// Total number of strings extracted. + pub total_strings: usize, + /// Number of strings after filtering. + pub filtered_strings: usize, +} + +impl OutputMetadata { + /// Create a new `OutputMetadata` instance. + #[must_use] + pub fn new( + binary_name: String, + format: OutputFormat, + total_strings: usize, + filtered_strings: usize, + ) -> Self { + Self { + binary_name, + format, + total_strings, + filtered_strings, + } + } +} + +/// Format output strings using the requested output format. 
+/// +/// # Arguments +/// +/// * `strings` - The extracted strings to format. +/// * `metadata` - Output context and format selection. +/// +/// # Returns +/// +/// A formatted output string on success. +pub fn format_output(strings: &[FoundString], metadata: &OutputMetadata) -> Result { + format_output_with(strings, metadata, format_table, format_json, format_yara) +} + +fn format_output_with< + FTable: Fn(&[FoundString], &OutputMetadata) -> Result, + FJson: Fn(&[FoundString], &OutputMetadata) -> Result, + FYara: Fn(&[FoundString], &OutputMetadata) -> Result, +>( + strings: &[FoundString], + metadata: &OutputMetadata, + table_formatter: FTable, + json_formatter: FJson, + yara_formatter: FYara, +) -> Result { + match metadata.format { + OutputFormat::Table => table_formatter(strings, metadata), + OutputFormat::Json => json_formatter(strings, metadata), + OutputFormat::Yara => yara_formatter(strings, metadata), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::types::{Encoding, StringSource, StringyError}; + + fn build_found_string(text: &str) -> FoundString { + FoundString::new( + text.to_string(), + Encoding::Ascii, + 0, + text.len() as u32, + StringSource::SectionData, + ) + } + + #[test] + fn test_output_format_enum_properties() { + let table = OutputFormat::Table; + let json = OutputFormat::Json; + let yara = OutputFormat::Yara; + + let copied = table; + let cloned = json; + + assert_eq!(copied, OutputFormat::Table); + assert_eq!(cloned, OutputFormat::Json); + assert_ne!(table, json); + assert_ne!(json, yara); + assert_ne!(table, yara); + + let debug = format!("{:?}", OutputFormat::Yara); + assert!(!debug.is_empty(), "Debug output should not be empty"); + } + + #[test] + fn test_output_metadata_construction() { + let metadata = OutputMetadata::new("sample.bin".to_string(), OutputFormat::Table, 12, 9); + + assert_eq!(metadata.binary_name, "sample.bin"); + assert_eq!(metadata.format, OutputFormat::Table); + assert_eq!(metadata.total_strings, 12); 
+ assert_eq!(metadata.filtered_strings, 9); + + let other = OutputMetadata::new("other.exe".to_string(), OutputFormat::Json, 1, 1); + + assert_eq!(other.binary_name, "other.exe"); + assert_eq!(other.format, OutputFormat::Json); + assert_eq!(other.total_strings, 1); + assert_eq!(other.filtered_strings, 1); + } + + #[test] + fn test_dispatch_logic_for_each_format() { + let strings = vec![build_found_string("alpha")]; + let metadata = OutputMetadata::new( + "sample.bin".to_string(), + OutputFormat::Table, + strings.len(), + strings.len(), + ); + + let result = format_output_with( + &strings, + &metadata, + |_, _| Ok("table".to_string()), + |_, _| Ok("json".to_string()), + |_, _| Ok("yara".to_string()), + ) + .expect("Dispatch should succeed"); + + assert_eq!(result, "table"); + + let json_metadata = OutputMetadata::new( + "sample.bin".to_string(), + OutputFormat::Json, + strings.len(), + strings.len(), + ); + + let json_result = format_output_with( + &strings, + &json_metadata, + |_, _| Ok("table".to_string()), + |_, _| Ok("json".to_string()), + |_, _| Ok("yara".to_string()), + ) + .expect("Dispatch should succeed"); + + assert_eq!(json_result, "json"); + + let yara_metadata = OutputMetadata::new( + "sample.bin".to_string(), + OutputFormat::Yara, + strings.len(), + strings.len(), + ); + + let yara_result = format_output_with( + &strings, + &yara_metadata, + |_, _| Ok("table".to_string()), + |_, _| Ok("json".to_string()), + |_, _| Ok("yara".to_string()), + ) + .expect("Dispatch should succeed"); + + assert_eq!(yara_result, "yara"); + } + + #[test] + fn test_edge_cases() { + // Use injected stubs to validate dispatch on edge-case metadata without + // depending on placeholder formatter output. 
+ let empty: Vec = Vec::new(); + let metadata = OutputMetadata::new("empty.bin".to_string(), OutputFormat::Table, 0, 0); + + let output = format_output_with( + &empty, + &metadata, + |_, _| Ok("table".to_string()), + |_, _| Ok("json".to_string()), + |_, _| Ok("yara".to_string()), + ) + .expect("Formatting should succeed"); + assert_eq!(output, "table"); + + let single = vec![build_found_string("x")]; + let single_metadata = + OutputMetadata::new("single.bin".to_string(), OutputFormat::Json, 1, 1); + + let single_output = format_output_with( + &single, + &single_metadata, + |_, _| Ok("table".to_string()), + |_, _| Ok("json".to_string()), + |_, _| Ok("yara".to_string()), + ) + .expect("Formatting should succeed"); + assert_eq!(single_output, "json"); + + let long_name = "a".repeat(512); + let long_metadata = OutputMetadata::new(long_name, OutputFormat::Yara, 1, 0); + let long_output = format_output_with( + &single, + &long_metadata, + |_, _| Ok("table".to_string()), + |_, _| Ok("json".to_string()), + |_, _| Ok("yara".to_string()), + ) + .expect("Formatting should succeed"); + assert_eq!(long_output, "yara"); + } + + #[test] + fn test_error_propagation() { + let strings = vec![build_found_string("err")]; + let metadata = OutputMetadata::new( + "sample.bin".to_string(), + OutputFormat::Json, + strings.len(), + strings.len(), + ); + + let error = format_output_with( + &strings, + &metadata, + |_, _| Ok("table".to_string()), + |_, _| Err(StringyError::ConfigError("formatter failed".to_string())), + |_, _| Ok("yara".to_string()), + ) + .expect_err("Formatter errors should propagate"); + + match error { + StringyError::ConfigError(message) => { + assert_eq!(message, "formatter failed"); + } + _ => panic!("Unexpected error type"), + } + } +} diff --git a/src/output/table.rs b/src/output/table.rs new file mode 100644 index 0000000..d34c71e --- /dev/null +++ b/src/output/table.rs @@ -0,0 +1,9 @@ +use crate::types::{FoundString, Result}; + +use super::OutputMetadata; + +/// 
Format strings in a human-readable table format. +pub fn format_table(_strings: &[FoundString], _metadata: &OutputMetadata) -> Result { + // TODO: Implement table formatter in a subsequent phase. + Ok(String::new()) +} diff --git a/src/output/yara.rs b/src/output/yara.rs new file mode 100644 index 0000000..8e367b8 --- /dev/null +++ b/src/output/yara.rs @@ -0,0 +1,9 @@ +use crate::types::{FoundString, Result}; + +use super::OutputMetadata; + +/// Format strings as YARA rule templates. +pub fn format_yara(_strings: &[FoundString], _metadata: &OutputMetadata) -> Result { + // TODO: Implement YARA formatter in a subsequent phase. + Ok(String::new()) +} From 4c4c955ad568921b671eb10b688b2a04ea413786 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 18 Jan 2026 02:35:07 -0500 Subject: [PATCH 02/25] Implement table output formatter with TTY and plain modes Adds a human-readable table output formatter for extracted strings, with automatic TTY detection. In TTY mode, outputs an aligned table with headers and truncated columns; in non-TTY mode, outputs plain string text, one per line. Includes dynamic column width calculation, tag formatting, Unicode-safe truncation, and comprehensive integration and unit tests with snapshot verification. 
--- src/lib.rs | 2 +- src/output/mod.rs | 2 +- src/output/table.rs | 707 +++++++++++++++++- tests/output_table_integration.rs | 396 ++++++++++ ..._integration__edge_empty_section_name.snap | 7 + ...integration__edge_many_tags_truncated.snap | 7 + ...able_integration__edge_string_sources.snap | 9 + ...e_integration__edge_very_short_string.snap | 9 + ...ut_table_integration__edge_zero_score.snap | 7 + ...able_integration__plain_empty_strings.snap | 5 + ...ion__plain_long_strings_not_truncated.snap | 5 + ...e_integration__plain_multiple_strings.snap | 7 + ...n__plain_preserves_special_characters.snap | 8 + ...able_integration__plain_single_string.snap | 5 + ...le_integration__plain_unicode_strings.snap | 7 + ..._table_integration__tty_all_tag_types.snap | 16 + ..._table_integration__tty_empty_strings.snap | 5 + ...ut_table_integration__tty_high_scores.snap | 8 + ...e_integration__tty_long_section_names.snap | 9 + ...tegration__tty_long_strings_truncated.snap | 8 + ...egration__tty_missing_optional_fields.snap | 9 + ...ble_integration__tty_multiple_strings.snap | 10 + ..._table_integration__tty_single_string.snap | 7 + ...e_integration__tty_special_characters.snap | 9 + ...ation__tty_strings_with_multiple_tags.snap | 8 + ...le_integration__tty_various_encodings.snap | 9 + 26 files changed, 1275 insertions(+), 6 deletions(-) create mode 100644 tests/output_table_integration.rs create mode 100644 tests/snapshots/output_table_integration__edge_empty_section_name.snap create mode 100644 tests/snapshots/output_table_integration__edge_many_tags_truncated.snap create mode 100644 tests/snapshots/output_table_integration__edge_string_sources.snap create mode 100644 tests/snapshots/output_table_integration__edge_very_short_string.snap create mode 100644 tests/snapshots/output_table_integration__edge_zero_score.snap create mode 100644 tests/snapshots/output_table_integration__plain_empty_strings.snap create mode 100644 
tests/snapshots/output_table_integration__plain_long_strings_not_truncated.snap create mode 100644 tests/snapshots/output_table_integration__plain_multiple_strings.snap create mode 100644 tests/snapshots/output_table_integration__plain_preserves_special_characters.snap create mode 100644 tests/snapshots/output_table_integration__plain_single_string.snap create mode 100644 tests/snapshots/output_table_integration__plain_unicode_strings.snap create mode 100644 tests/snapshots/output_table_integration__tty_all_tag_types.snap create mode 100644 tests/snapshots/output_table_integration__tty_empty_strings.snap create mode 100644 tests/snapshots/output_table_integration__tty_high_scores.snap create mode 100644 tests/snapshots/output_table_integration__tty_long_section_names.snap create mode 100644 tests/snapshots/output_table_integration__tty_long_strings_truncated.snap create mode 100644 tests/snapshots/output_table_integration__tty_missing_optional_fields.snap create mode 100644 tests/snapshots/output_table_integration__tty_multiple_strings.snap create mode 100644 tests/snapshots/output_table_integration__tty_single_string.snap create mode 100644 tests/snapshots/output_table_integration__tty_special_characters.snap create mode 100644 tests/snapshots/output_table_integration__tty_strings_with_multiple_tags.snap create mode 100644 tests/snapshots/output_table_integration__tty_various_encodings.snap diff --git a/src/lib.rs b/src/lib.rs index afdfc3b..58a931d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -86,4 +86,4 @@ pub use extraction::{ }; // Re-export output infrastructure types -pub use output::{OutputFormat, OutputMetadata, format_output}; +pub use output::{OutputFormat, OutputMetadata, format_output, format_table_with_mode}; diff --git a/src/output/mod.rs b/src/output/mod.rs index bf97cb9..419c129 100644 --- a/src/output/mod.rs +++ b/src/output/mod.rs @@ -41,7 +41,7 @@ pub mod table; pub mod yara; pub use json::format_json; -pub use table::format_table; +pub use 
table::{format_table, format_table_with_mode}; pub use yara::format_yara; /// Output format selection for Stringy formatters. diff --git a/src/output/table.rs b/src/output/table.rs index d34c71e..4844082 100644 --- a/src/output/table.rs +++ b/src/output/table.rs @@ -1,9 +1,708 @@ -use crate::types::{FoundString, Result}; +//! Table output formatter for Stringy. +//! +//! This module provides human-readable table output with automatic TTY detection. +//! When output is directed to a terminal (TTY), strings are displayed in an aligned +//! table with headers showing String, Tags, Score, and Section columns. When output +//! is piped or redirected (non-TTY), only the raw string text is emitted, one per line, +//! for seamless integration with other command-line tools. +//! +//! # TTY Mode Example +//! +//! ```text +//! String | Tags | Score | Section +//! -------------------------------------------------------------|--------------|-------|-------- +//! https://malware.example.com/beacon | url | 150 | .rdata +//! C:\Windows\System32\cmd.exe | filepath | 120 | .data +//! GetProcAddress | import | 80 | +//! ``` +//! +//! # Non-TTY Mode Example +//! +//! ```text +//! https://malware.example.com/beacon +//! C:\Windows\System32\cmd.exe +//! GetProcAddress +//! ``` +//! +//! # Column Layout +//! +//! - **String**: Up to 60 characters, truncated with `...` if longer +//! - **Tags**: First 2-3 tags, comma-separated, max 20 characters +//! - **Score**: Right-aligned integer score +//! - **Section**: Section name where the string was found + +use std::io::IsTerminal; + +use crate::classification::ranking::RankingConfig; +use crate::types::{FoundString, Result, Tag}; use super::OutputMetadata; +/// Maximum width for the string column before truncation. +const STRING_COLUMN_WIDTH: usize = 60; + +/// Maximum width for the tags column. +const TAGS_COLUMN_WIDTH: usize = 20; + +/// Maximum width for the score column. 
+const SCORE_COLUMN_WIDTH: usize = 6; + +/// Maximum width for the section column. +const SECTION_COLUMN_WIDTH: usize = 15; + /// Format strings in a human-readable table format. -pub fn format_table(_strings: &[FoundString], _metadata: &OutputMetadata) -> Result { - // TODO: Implement table formatter in a subsequent phase. - Ok(String::new()) +/// +/// Automatically detects whether output is going to a TTY (terminal) and adjusts +/// the format accordingly. In TTY mode, outputs an aligned table with headers. +/// In non-TTY mode (piped/redirected), outputs plain strings one per line. +/// +/// # Arguments +/// +/// * `strings` - The extracted strings to format +/// * `metadata` - Output context (currently unused but reserved for future features) +/// +/// # Returns +/// +/// A formatted string ready for output. +pub fn format_table(strings: &[FoundString], metadata: &OutputMetadata) -> Result { + let is_tty = std::io::stdout().is_terminal(); + format_table_with_mode(strings, metadata, is_tty) +} + +/// Format table with explicit TTY mode specification. +/// +/// This function allows explicit control over the output mode, useful for testing +/// and programmatic control over output format. +/// +/// # Arguments +/// +/// * `strings` - The extracted strings to format +/// * `metadata` - Output context +/// * `is_tty` - Whether to use TTY mode (true) or plain mode (false) +pub fn format_table_with_mode( + strings: &[FoundString], + metadata: &OutputMetadata, + is_tty: bool, +) -> Result { + if is_tty { + format_table_tty(strings, metadata) + } else { + format_table_plain(strings) + } +} + +/// Format strings as an aligned table for TTY output. 
+/// +/// Creates a table with headers and aligned columns showing: +/// - String text (truncated if necessary) +/// - Tags (comma-separated, limited count) +/// - Score (right-aligned) +/// - Section name +fn format_table_tty(strings: &[FoundString], _metadata: &OutputMetadata) -> Result { + if strings.is_empty() { + return Ok(String::new()); + } + + let mut output = String::new(); + + // Calculate dynamic column widths based on content + let section_width = calculate_section_width(strings); + let tags_width = calculate_tags_width(strings); + + // Build header + let header = format!( + "{} | {} | {} | {}", + pad_string("String", STRING_COLUMN_WIDTH, Alignment::Left), + pad_string("Tags", tags_width, Alignment::Left), + pad_string("Score", SCORE_COLUMN_WIDTH, Alignment::Right), + pad_string("Section", section_width, Alignment::Left), + ); + output.push_str(&header); + output.push('\n'); + + // Build separator line + let separator = format!( + "{}-|-{}-|-{}-|-{}", + "-".repeat(STRING_COLUMN_WIDTH), + "-".repeat(tags_width), + "-".repeat(SCORE_COLUMN_WIDTH), + "-".repeat(section_width), + ); + output.push_str(&separator); + output.push('\n'); + + // Build rows + for found_string in strings { + let truncated_text = truncate_string(&found_string.text, STRING_COLUMN_WIDTH); + let tags_display = format_tags(&found_string.tags); + let section_display = found_string.section.as_deref().unwrap_or(""); + + let row = format!( + "{} | {} | {} | {}", + pad_string(&truncated_text, STRING_COLUMN_WIDTH, Alignment::Left), + pad_string(&tags_display, tags_width, Alignment::Left), + pad_string( + &found_string.score.to_string(), + SCORE_COLUMN_WIDTH, + Alignment::Right + ), + pad_string(section_display, section_width, Alignment::Left), + ); + output.push_str(&row); + output.push('\n'); + } + + // Remove trailing newline for consistency + if output.ends_with('\n') { + output.pop(); + } + + Ok(output) +} + +/// Format strings as plain text for non-TTY output. 
+/// +/// Outputs only the string text, one per line, suitable for piping to other tools. +fn format_table_plain(strings: &[FoundString]) -> Result { + let lines: Vec = strings + .iter() + .map(|s| sanitize_plain_text(&s.text)) + .collect(); + Ok(lines.join("\n")) +} + +/// Calculate the optimal width for the section column based on content. +fn calculate_section_width(strings: &[FoundString]) -> usize { + let max_section_len = strings + .iter() + .filter_map(|s| s.section.as_ref()) + .map(|s| s.len()) + .max() + .unwrap_or(0); + + // Minimum width is "Section" header length, maximum is SECTION_COLUMN_WIDTH + max_section_len.clamp("Section".len(), SECTION_COLUMN_WIDTH) +} + +/// Calculate the optimal width for the tags column based on content. +fn calculate_tags_width(strings: &[FoundString]) -> usize { + let max_tags_len = strings + .iter() + .map(|s| format_tags(&s.tags).len()) + .max() + .unwrap_or(0); + + // Minimum width is "Tags" header length, maximum is TAGS_COLUMN_WIDTH + max_tags_len.clamp("Tags".len(), TAGS_COLUMN_WIDTH) +} + +/// Format tags for display in the table. +/// +/// Converts tags to their display format using serde rename values where applicable. +/// Limits output to `MAX_TAGS_DISPLAY` tags to prevent overflow. +/// +/// # Arguments +/// +/// * `tags` - Slice of tags to format +/// +/// # Returns +/// +/// Comma-separated string of tag names, or empty string if no tags. 
+/// +/// # Examples +/// +/// ```ignore +/// let tags = vec![Tag::IPv4, Tag::FilePath]; +/// assert_eq!(format_tags(&tags), "ipv4, filepath"); +/// ``` +pub fn format_tags(tags: &[Tag]) -> String { + if tags.is_empty() { + return String::new(); + } + + let config = RankingConfig::default(); + let max_boost = tags + .iter() + .map(|tag| tag_boost_value(tag, &config)) + .max() + .unwrap_or(0); + + let tag_strings: Vec = tags + .iter() + .filter(|tag| tag_boost_value(tag, &config) == max_boost) + .map(tag_to_display_string) + .collect(); + + let result = tag_strings.join(", "); + + // Truncate if still too long + if result.len() > TAGS_COLUMN_WIDTH { + truncate_string(&result, TAGS_COLUMN_WIDTH) + } else { + result + } +} + +/// Sanitize plain text output so each string renders as a single line. +/// +/// Replaces CRLF, LF, and CR with escaped sequences to preserve content +/// while keeping output line-based. +fn sanitize_plain_text(text: &str) -> String { + text.replace("\r\n", "\\r\\n") + .replace('\n', "\\n") + .replace('\r', "\\r") +} + +/// Get the ranking boost value for a tag using the provided config. +fn tag_boost_value(tag: &Tag, config: &RankingConfig) -> i32 { + config.tag_boosts.get(tag).copied().unwrap_or(0) +} + +/// Convert a single tag to its display string. +/// +/// Uses the serde rename value where defined, otherwise uses lowercase Debug format. 
+fn tag_to_display_string(tag: &Tag) -> String { + match tag { + Tag::Url => "url".to_string(), + Tag::Domain => "domain".to_string(), + Tag::IPv4 => "ipv4".to_string(), + Tag::IPv6 => "ipv6".to_string(), + Tag::FilePath => "filepath".to_string(), + Tag::RegistryPath => "regpath".to_string(), + Tag::Guid => "guid".to_string(), + Tag::Email => "email".to_string(), + Tag::Base64 => "b64".to_string(), + Tag::FormatString => "fmt".to_string(), + Tag::UserAgent => "user-agent-ish".to_string(), + Tag::DemangledSymbol => "demangled".to_string(), + Tag::Import => "import".to_string(), + Tag::Export => "export".to_string(), + Tag::Version => "version".to_string(), + Tag::Manifest => "manifest".to_string(), + Tag::Resource => "resource".to_string(), + Tag::DylibPath => "dylib-path".to_string(), + Tag::Rpath => "rpath".to_string(), + Tag::RpathVariable => "rpath-var".to_string(), + Tag::FrameworkPath => "framework-path".to_string(), + } +} + +/// Truncate a string to the specified maximum length. +/// +/// If the string exceeds the maximum length, it is truncated and `...` is appended. +/// Handles Unicode correctly by truncating at character boundaries. +/// +/// # Arguments +/// +/// * `s` - The string to truncate +/// * `max_len` - Maximum length including the ellipsis +/// +/// # Returns +/// +/// The original string if it fits, or a truncated version with `...` appended. 
+/// +/// # Examples +/// +/// ```ignore +/// assert_eq!(truncate_string("hello", 10), "hello"); +/// assert_eq!(truncate_string("hello world", 8), "hello..."); +/// ``` +pub fn truncate_string(s: &str, max_len: usize) -> String { + if s.len() <= max_len { + return s.to_string(); + } + + if max_len <= 3 { + return ".".repeat(max_len); + } + + // Find a valid character boundary for truncation + let truncate_at = max_len - 3; + let mut end_index = truncate_at; + + // Ensure we don't split a multi-byte character + for (idx, _) in s.char_indices() { + if idx <= truncate_at { + end_index = idx; + } else { + break; + } + } + + // Handle case where we need to include at least one character + if end_index == 0 && !s.is_empty() { + if let Some((idx, _)) = s.char_indices().nth(1) { + end_index = idx; + } else { + end_index = s.len(); + } + } + + format!("{}...", &s[..end_index]) +} + +/// Text alignment for padding. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Alignment { + /// Left-align text (pad on right). + Left, + /// Right-align text (pad on left). + Right, +} + +/// Pad a string to a fixed width with the specified alignment. +/// +/// # Arguments +/// +/// * `s` - The string to pad +/// * `width` - Target width +/// * `alignment` - Left or right alignment +/// +/// # Returns +/// +/// The padded string. 
+pub fn pad_string(s: &str, width: usize, alignment: Alignment) -> String { + match alignment { + Alignment::Left => format!("{: format!("{:>width$}", s, width = width), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::output::OutputFormat; + use crate::types::{Encoding, StringSource}; + + fn make_test_string(text: &str) -> FoundString { + FoundString::new( + text.to_string(), + Encoding::Ascii, + 0x1000, + text.len() as u32, + StringSource::SectionData, + ) + } + + fn make_metadata() -> OutputMetadata { + OutputMetadata::new("test.bin".to_string(), OutputFormat::Table, 10, 10) + } + + // Tests for format_tags + mod format_tags_tests { + use super::*; + + #[test] + fn empty_tags() { + assert_eq!(format_tags(&[]), ""); + } + + #[test] + fn single_tag() { + assert_eq!(format_tags(&[Tag::Url]), "url"); + assert_eq!(format_tags(&[Tag::IPv4]), "ipv4"); + assert_eq!(format_tags(&[Tag::FilePath]), "filepath"); + } + + #[test] + fn two_tags() { + assert_eq!(format_tags(&[Tag::Url, Tag::Domain]), "url"); + assert_eq!(format_tags(&[Tag::IPv4, Tag::FilePath]), "ipv4"); + } + + #[test] + fn three_tags() { + assert_eq!(format_tags(&[Tag::Url, Tag::Domain, Tag::IPv4]), "url"); + } + + #[test] + fn more_than_max_tags_truncated() { + let tags = vec![ + Tag::Url, + Tag::Domain, + Tag::IPv4, + Tag::FilePath, + Tag::RegistryPath, + ]; + assert_eq!(format_tags(&tags), "url"); + } + + #[test] + fn multiple_tags_same_priority() { + assert_eq!(format_tags(&[Tag::Import, Tag::Export]), "import, export"); + } + + #[test] + fn all_tag_variants_have_display() { + // Ensure all tag variants produce valid output + let all_tags = vec![ + Tag::Url, + Tag::Domain, + Tag::IPv4, + Tag::IPv6, + Tag::FilePath, + Tag::RegistryPath, + Tag::Guid, + Tag::Email, + Tag::Base64, + Tag::FormatString, + Tag::UserAgent, + Tag::DemangledSymbol, + Tag::Import, + Tag::Export, + Tag::Version, + Tag::Manifest, + Tag::Resource, + Tag::DylibPath, + Tag::Rpath, + Tag::RpathVariable, + 
Tag::FrameworkPath, + ]; + + for tag in all_tags { + let display = tag_to_display_string(&tag); + assert!(!display.is_empty(), "Tag {:?} should have display", tag); + assert!(display.is_ascii(), "Tag display should be ASCII"); + } + } + } + + // Tests for truncate_string + mod truncate_string_tests { + use super::*; + + #[test] + fn short_string_unchanged() { + assert_eq!(truncate_string("hello", 10), "hello"); + assert_eq!(truncate_string("", 10), ""); + } + + #[test] + fn exact_length_unchanged() { + assert_eq!(truncate_string("hello", 5), "hello"); + } + + #[test] + fn long_string_truncated() { + assert_eq!(truncate_string("hello world", 8), "hello..."); + } + + #[test] + fn very_short_max_length() { + assert_eq!(truncate_string("hello", 3), "..."); + assert_eq!(truncate_string("hello", 2), ".."); + assert_eq!(truncate_string("hello", 1), "."); + } + + #[test] + fn unicode_string_safe_truncation() { + // Ensure we don't split multi-byte characters + let unicode = "hello\u{1F600}world"; // emoji in the middle + let truncated = truncate_string(unicode, 8); + // Should truncate before the emoji to avoid splitting it + assert!(truncated.ends_with("...")); + assert!(truncated.len() <= 8); + } + + #[test] + fn unicode_at_boundary() { + let text = "\u{4E2D}\u{6587}\u{6D4B}\u{8BD5}"; // Chinese characters + let truncated = truncate_string(text, 6); + assert!(truncated.is_char_boundary(truncated.len() - 3)); + } + } + + // Tests for pad_string + mod pad_string_tests { + use super::*; + + #[test] + fn left_alignment() { + assert_eq!(pad_string("hi", 5, Alignment::Left), "hi "); + assert_eq!(pad_string("hello", 5, Alignment::Left), "hello"); + } + + #[test] + fn right_alignment() { + assert_eq!(pad_string("hi", 5, Alignment::Right), " hi"); + assert_eq!(pad_string("hello", 5, Alignment::Right), "hello"); + } + + #[test] + fn exact_width() { + assert_eq!(pad_string("exact", 5, Alignment::Left), "exact"); + assert_eq!(pad_string("exact", 5, Alignment::Right), "exact"); + } + 
+ #[test] + fn empty_string() { + assert_eq!(pad_string("", 5, Alignment::Left), " "); + assert_eq!(pad_string("", 5, Alignment::Right), " "); + } + } + + // Tests for format_table + mod format_table_tests { + use super::*; + + #[test] + fn empty_strings_returns_empty() { + let result = format_table_with_mode(&[], &make_metadata(), true).unwrap(); + assert_eq!(result, ""); + } + + #[test] + fn single_string_tty_mode() { + let strings = vec![make_test_string("test string")]; + let result = format_table_with_mode(&strings, &make_metadata(), true).unwrap(); + + // Should have header, separator, and one data row + let lines: Vec<&str> = result.lines().collect(); + assert_eq!(lines.len(), 3); + assert!(lines[0].contains("String")); + assert!(lines[0].contains("Tags")); + assert!(lines[0].contains("Score")); + assert!(lines[0].contains("Section")); + assert!(lines[1].contains("---")); + assert!(lines[2].contains("test string")); + } + + #[test] + fn single_string_plain_mode() { + let strings = vec![make_test_string("test string")]; + let result = format_table_with_mode(&strings, &make_metadata(), false).unwrap(); + + assert_eq!(result, "test string"); + } + + #[test] + fn multiple_strings_plain_mode() { + let strings = vec![ + make_test_string("first"), + make_test_string("second"), + make_test_string("third"), + ]; + let result = format_table_with_mode(&strings, &make_metadata(), false).unwrap(); + + assert_eq!(result, "first\nsecond\nthird"); + } + + #[test] + fn string_with_tags_displayed() { + let mut found = make_test_string("http://example.com"); + found.tags = vec![Tag::Url, Tag::Domain]; + + let result = format_table_with_mode(&[found], &make_metadata(), true).unwrap(); + assert!(result.contains("url")); + } + + #[test] + fn string_with_section_displayed() { + let found = make_test_string("test").with_section(".rodata".to_string()); + + let result = format_table_with_mode(&[found], &make_metadata(), true).unwrap(); + assert!(result.contains(".rodata")); + } + + 
#[test] + fn string_with_score_displayed() { + let found = make_test_string("test").with_score(150); + + let result = format_table_with_mode(&[found], &make_metadata(), true).unwrap(); + assert!(result.contains("150")); + } + + #[test] + fn long_string_truncated_in_tty() { + let long_text = "a".repeat(100); + let strings = vec![make_test_string(&long_text)]; + let result = format_table_with_mode(&strings, &make_metadata(), true).unwrap(); + + // Should contain truncated version with ... + assert!(result.contains("...")); + // Should not contain the full 100 character string + assert!(!result.contains(&long_text)); + } + + #[test] + fn long_string_not_truncated_in_plain() { + let long_text = "a".repeat(100); + let strings = vec![make_test_string(&long_text)]; + let result = format_table_with_mode(&strings, &make_metadata(), false).unwrap(); + + // Plain mode should have full string + assert_eq!(result, long_text); + } + + #[test] + fn missing_optional_fields_handled() { + // String with no section, no tags, default score + let found = make_test_string("minimal"); + + let result = format_table_with_mode(&[found], &make_metadata(), true).unwrap(); + // Should not crash and should contain the string + assert!(result.contains("minimal")); + } + + #[test] + fn special_characters_in_string() { + let strings = vec![make_test_string("tab\there"), make_test_string("pipe|here")]; + let result = format_table_with_mode(&strings, &make_metadata(), false).unwrap(); + + // Each string should be on its own line in output + let lines: Vec<&str> = result.lines().collect(); + assert_eq!(lines.len(), 2); + assert!(lines[0].contains("tab\there")); + assert!(lines[1].contains("pipe|here")); + } + + #[test] + fn string_with_embedded_newline() { + let strings = vec![make_test_string("line1\nline2")]; + let result = format_table_with_mode(&strings, &make_metadata(), false).unwrap(); + assert_eq!(result, "line1\\nline2"); + } + } + + // Tests for column width calculation + mod 
column_width_tests { + use super::*; + + #[test] + fn section_width_minimum() { + let strings = vec![make_test_string("test")]; + let width = calculate_section_width(&strings); + assert_eq!(width, "Section".len()); + } + + #[test] + fn section_width_from_content() { + let strings = vec![make_test_string("test").with_section(".rodata.str1.1".to_string())]; + let width = calculate_section_width(&strings); + assert_eq!(width, ".rodata.str1.1".len()); + } + + #[test] + fn section_width_capped_at_max() { + let long_section = "a".repeat(50); + let strings = vec![make_test_string("test").with_section(long_section)]; + let width = calculate_section_width(&strings); + assert_eq!(width, SECTION_COLUMN_WIDTH); + } + + #[test] + fn tags_width_minimum() { + let strings = vec![make_test_string("test")]; + let width = calculate_tags_width(&strings); + assert_eq!(width, "Tags".len()); + } + + #[test] + fn tags_width_from_content() { + let mut found = make_test_string("test"); + found.tags = vec![Tag::Url, Tag::Domain]; + let width = calculate_tags_width(&[found]); + assert_eq!(width, "Tags".len()); + } + } } diff --git a/tests/output_table_integration.rs b/tests/output_table_integration.rs new file mode 100644 index 0000000..3446464 --- /dev/null +++ b/tests/output_table_integration.rs @@ -0,0 +1,396 @@ +//! Integration tests for table output formatter. +//! +//! Uses insta snapshots to verify output format consistency. + +use insta::assert_snapshot; +use stringy::output::{OutputFormat, OutputMetadata, format_table_with_mode}; +use stringy::types::{Encoding, FoundString, StringSource, Tag}; + +/// Create a test FoundString with common defaults. +fn make_string(text: &str) -> FoundString { + FoundString::new( + text.to_string(), + Encoding::Ascii, + 0x1000, + text.len() as u32, + StringSource::SectionData, + ) +} + +/// Create OutputMetadata for tests. 
+fn make_metadata(count: usize) -> OutputMetadata { + OutputMetadata::new( + "test_binary.exe".to_string(), + OutputFormat::Table, + count, + count, + ) +} + +// TTY mode tests + +#[test] +fn test_tty_empty_strings() { + let result = format_table_with_mode(&[], &make_metadata(0), true).unwrap(); + assert_snapshot!(result); +} + +#[test] +fn test_tty_single_string() { + let strings = vec![make_string("GetProcAddress")]; + let result = format_table_with_mode(&strings, &make_metadata(1), true).unwrap(); + assert_snapshot!(result); +} + +#[test] +fn test_tty_multiple_strings() { + let strings = vec![ + make_string("https://malware.example.com/beacon") + .with_tags(vec![Tag::Url]) + .with_score(150) + .with_section(".rdata".to_string()), + make_string("C:\\Windows\\System32\\cmd.exe") + .with_tags(vec![Tag::FilePath]) + .with_score(120) + .with_section(".data".to_string()), + make_string("GetProcAddress") + .with_tags(vec![Tag::Import]) + .with_score(80), + make_string("192.168.1.100") + .with_tags(vec![Tag::IPv4]) + .with_score(100) + .with_section(".rodata".to_string()), + ]; + let result = format_table_with_mode(&strings, &make_metadata(4), true).unwrap(); + assert_snapshot!(result); +} + +#[test] +fn test_tty_strings_with_multiple_tags() { + let strings = vec![ + make_string("http://evil.com/download.exe") + .with_tags(vec![Tag::Url, Tag::Domain, Tag::FilePath]) + .with_score(200) + .with_section(".rdata".to_string()), + make_string("user@example.com") + .with_tags(vec![Tag::Email, Tag::Domain]) + .with_score(90) + .with_section(".data".to_string()), + ]; + let result = format_table_with_mode(&strings, &make_metadata(2), true).unwrap(); + assert_snapshot!(result); +} + +#[test] +fn test_tty_long_strings_truncated() { + let long_url = format!( + "https://very-long-subdomain.malware-domain.example.com/path/to/beacon?id={}", + "x".repeat(50) + ); + let long_path = format!( + "C:\\Users\\Administrator\\AppData\\Local\\Temp\\{}.exe", + "a".repeat(60) + ); + + let strings 
= vec![ + make_string(&long_url) + .with_tags(vec![Tag::Url]) + .with_score(150) + .with_section(".rdata".to_string()), + make_string(&long_path) + .with_tags(vec![Tag::FilePath]) + .with_score(120) + .with_section(".data".to_string()), + ]; + let result = format_table_with_mode(&strings, &make_metadata(2), true).unwrap(); + assert_snapshot!(result); +} + +#[test] +fn test_tty_missing_optional_fields() { + let strings = vec![ + // No section + make_string("kernel32.dll") + .with_tags(vec![Tag::Import]) + .with_score(50), + // No tags + make_string("mysterious string") + .with_score(10) + .with_section(".text".to_string()), + // No tags, no section, default score + make_string("bare minimum"), + ]; + let result = format_table_with_mode(&strings, &make_metadata(3), true).unwrap(); + assert_snapshot!(result); +} + +#[test] +fn test_tty_special_characters() { + let strings = vec![ + make_string("string with\ttab") + .with_score(10) + .with_section(".data".to_string()), + make_string("pipe|character") + .with_score(10) + .with_section(".data".to_string()), + make_string("backslash\\here") + .with_tags(vec![Tag::FilePath]) + .with_score(20) + .with_section(".rdata".to_string()), + ]; + let result = format_table_with_mode(&strings, &make_metadata(3), true).unwrap(); + assert_snapshot!(result); +} + +#[test] +fn test_tty_various_encodings() { + let strings = vec![ + FoundString::new( + "ASCII string".to_string(), + Encoding::Ascii, + 0x1000, + 12, + StringSource::SectionData, + ) + .with_score(50) + .with_section(".rodata".to_string()), + FoundString::new( + "UTF-8 string".to_string(), + Encoding::Utf8, + 0x2000, + 12, + StringSource::SectionData, + ) + .with_score(50) + .with_section(".rodata".to_string()), + FoundString::new( + "UTF-16LE string".to_string(), + Encoding::Utf16Le, + 0x3000, + 30, + StringSource::SectionData, + ) + .with_score(50) + .with_section(".data".to_string()), + ]; + let result = format_table_with_mode(&strings, &make_metadata(3), true).unwrap(); + 
assert_snapshot!(result); +} + +#[test] +fn test_tty_high_scores() { + let strings = vec![ + make_string("critical IOC") + .with_tags(vec![Tag::Url, Tag::IPv4]) + .with_score(9999) + .with_section(".rdata".to_string()), + make_string("negative score") + .with_score(-50) + .with_section(".text".to_string()), + ]; + let result = format_table_with_mode(&strings, &make_metadata(2), true).unwrap(); + assert_snapshot!(result); +} + +#[test] +fn test_tty_all_tag_types() { + // Test a variety of tag types to ensure they all display correctly + let strings = vec![ + make_string("https://example.com") + .with_tags(vec![Tag::Url]) + .with_score(100), + make_string("example.com") + .with_tags(vec![Tag::Domain]) + .with_score(80), + make_string("192.168.1.1") + .with_tags(vec![Tag::IPv4]) + .with_score(90), + make_string("::1").with_tags(vec![Tag::IPv6]).with_score(90), + make_string("/etc/passwd") + .with_tags(vec![Tag::FilePath]) + .with_score(85), + make_string("HKLM\\Software") + .with_tags(vec![Tag::RegistryPath]) + .with_score(85), + make_string("{12345678-1234-1234-1234-123456789012}") + .with_tags(vec![Tag::Guid]) + .with_score(70), + make_string("user@domain.com") + .with_tags(vec![Tag::Email]) + .with_score(75), + make_string("SGVsbG8gV29ybGQ=") + .with_tags(vec![Tag::Base64]) + .with_score(60), + make_string("%s %d %x") + .with_tags(vec![Tag::FormatString]) + .with_score(50), + ]; + let result = format_table_with_mode(&strings, &make_metadata(10), true).unwrap(); + assert_snapshot!(result); +} + +#[test] +fn test_tty_long_section_names() { + let strings = vec![ + make_string("string one") + .with_score(10) + .with_section(".rodata.str1.1".to_string()), + make_string("string two") + .with_score(20) + .with_section(".data.rel.ro".to_string()), + make_string("string three") + .with_score(30) + .with_section(".text".to_string()), + ]; + let result = format_table_with_mode(&strings, &make_metadata(3), true).unwrap(); + assert_snapshot!(result); +} + +// Non-TTY (plain) 
mode tests + +#[test] +fn test_plain_empty_strings() { + let result = format_table_with_mode(&[], &make_metadata(0), false).unwrap(); + assert_snapshot!(result); +} + +#[test] +fn test_plain_single_string() { + let strings = vec![make_string("GetProcAddress")]; + let result = format_table_with_mode(&strings, &make_metadata(1), false).unwrap(); + assert_snapshot!(result); +} + +#[test] +fn test_plain_multiple_strings() { + let strings = vec![ + make_string("https://malware.example.com/beacon") + .with_tags(vec![Tag::Url]) + .with_score(150), + make_string("C:\\Windows\\System32\\cmd.exe") + .with_tags(vec![Tag::FilePath]) + .with_score(120), + make_string("GetProcAddress") + .with_tags(vec![Tag::Import]) + .with_score(80), + ]; + let result = format_table_with_mode(&strings, &make_metadata(4), false).unwrap(); + assert_snapshot!(result); +} + +#[test] +fn test_plain_long_strings_not_truncated() { + let long_string = "a".repeat(200); + let strings = vec![make_string(&long_string)]; + let result = format_table_with_mode(&strings, &make_metadata(1), false).unwrap(); + assert_snapshot!(result); +} + +#[test] +fn test_plain_preserves_special_characters() { + let strings = vec![ + make_string("tab\there"), + make_string("pipe|here"), + make_string("quote\"here"), + make_string("line1\nline2"), + ]; + let result = format_table_with_mode(&strings, &make_metadata(3), false).unwrap(); + assert_snapshot!(result); +} + +#[test] +fn test_plain_unicode_strings() { + let strings = vec![ + make_string("\u{4E2D}\u{6587}\u{5B57}\u{7B26}\u{4E32}"), // Chinese characters + make_string("\u{0420}\u{0443}\u{0441}\u{0441}\u{043A}\u{0438}\u{0439}"), // Russian + make_string("\u{1F600}\u{1F601}\u{1F602}"), // Emojis + ]; + let result = format_table_with_mode(&strings, &make_metadata(3), false).unwrap(); + assert_snapshot!(result); +} + +// Edge case tests + +#[test] +fn test_edge_many_tags_truncated() { + let strings = vec![ + make_string("multi-tagged") + .with_tags(vec![ + Tag::Url, + 
Tag::Domain, + Tag::IPv4, + Tag::FilePath, + Tag::RegistryPath, + ]) + .with_score(100) + .with_section(".data".to_string()), + ]; + let result = format_table_with_mode(&strings, &make_metadata(1), true).unwrap(); + // Should only show first 3 tags + assert_snapshot!(result); +} + +#[test] +fn test_edge_zero_score() { + let strings = vec![ + make_string("zero score string") + .with_score(0) + .with_section(".data".to_string()), + ]; + let result = format_table_with_mode(&strings, &make_metadata(1), true).unwrap(); + assert_snapshot!(result); +} + +#[test] +fn test_edge_empty_section_name() { + // Section explicitly set to empty string vs None + let strings = vec![make_string("with empty section").with_section(String::new())]; + let result = format_table_with_mode(&strings, &make_metadata(1), true).unwrap(); + assert_snapshot!(result); +} + +#[test] +fn test_edge_very_short_string() { + let strings = vec![ + make_string("a").with_score(10), + make_string("ab").with_score(20), + make_string("abc").with_score(30), + ]; + let result = format_table_with_mode(&strings, &make_metadata(3), true).unwrap(); + assert_snapshot!(result); +} + +#[test] +fn test_edge_string_sources() { + let strings = vec![ + FoundString::new( + "import_func".to_string(), + Encoding::Ascii, + 0x1000, + 11, + StringSource::ImportName, + ) + .with_tags(vec![Tag::Import]) + .with_score(80), + FoundString::new( + "export_func".to_string(), + Encoding::Ascii, + 0x2000, + 11, + StringSource::ExportName, + ) + .with_tags(vec![Tag::Export]) + .with_score(80), + FoundString::new( + "resource string".to_string(), + Encoding::Utf16Le, + 0x3000, + 30, + StringSource::ResourceString, + ) + .with_tags(vec![Tag::Resource]) + .with_score(60), + ]; + let result = format_table_with_mode(&strings, &make_metadata(3), true).unwrap(); + assert_snapshot!(result); +} diff --git a/tests/snapshots/output_table_integration__edge_empty_section_name.snap 
b/tests/snapshots/output_table_integration__edge_empty_section_name.snap new file mode 100644 index 0000000..be098ad --- /dev/null +++ b/tests/snapshots/output_table_integration__edge_empty_section_name.snap @@ -0,0 +1,7 @@ +--- +source: tests/output_table_integration.rs +expression: result +--- +String | Tags | Score | Section +-------------------------------------------------------------|------|--------|-------- +with empty section | | 0 | diff --git a/tests/snapshots/output_table_integration__edge_many_tags_truncated.snap b/tests/snapshots/output_table_integration__edge_many_tags_truncated.snap new file mode 100644 index 0000000..eded6bc --- /dev/null +++ b/tests/snapshots/output_table_integration__edge_many_tags_truncated.snap @@ -0,0 +1,7 @@ +--- +source: tests/output_table_integration.rs +expression: result +--- +String | Tags | Score | Section +-------------------------------------------------------------|------|--------|-------- +multi-tagged | url | 100 | .data diff --git a/tests/snapshots/output_table_integration__edge_string_sources.snap b/tests/snapshots/output_table_integration__edge_string_sources.snap new file mode 100644 index 0000000..d86c3f3 --- /dev/null +++ b/tests/snapshots/output_table_integration__edge_string_sources.snap @@ -0,0 +1,9 @@ +--- +source: tests/output_table_integration.rs +expression: result +--- +String | Tags | Score | Section +-------------------------------------------------------------|----------|--------|-------- +import_func | import | 80 | +export_func | export | 80 | +resource string | resource | 60 | diff --git a/tests/snapshots/output_table_integration__edge_very_short_string.snap b/tests/snapshots/output_table_integration__edge_very_short_string.snap new file mode 100644 index 0000000..25d3c61 --- /dev/null +++ b/tests/snapshots/output_table_integration__edge_very_short_string.snap @@ -0,0 +1,9 @@ +--- +source: tests/output_table_integration.rs +expression: result +--- +String | Tags | Score | Section 
+-------------------------------------------------------------|------|--------|-------- +a | | 10 | +ab | | 20 | +abc | | 30 | diff --git a/tests/snapshots/output_table_integration__edge_zero_score.snap b/tests/snapshots/output_table_integration__edge_zero_score.snap new file mode 100644 index 0000000..3803bf2 --- /dev/null +++ b/tests/snapshots/output_table_integration__edge_zero_score.snap @@ -0,0 +1,7 @@ +--- +source: tests/output_table_integration.rs +expression: result +--- +String | Tags | Score | Section +-------------------------------------------------------------|------|--------|-------- +zero score string | | 0 | .data diff --git a/tests/snapshots/output_table_integration__plain_empty_strings.snap b/tests/snapshots/output_table_integration__plain_empty_strings.snap new file mode 100644 index 0000000..c900bf2 --- /dev/null +++ b/tests/snapshots/output_table_integration__plain_empty_strings.snap @@ -0,0 +1,5 @@ +--- +source: tests/output_table_integration.rs +expression: result +--- + diff --git a/tests/snapshots/output_table_integration__plain_long_strings_not_truncated.snap b/tests/snapshots/output_table_integration__plain_long_strings_not_truncated.snap new file mode 100644 index 0000000..6372697 --- /dev/null +++ b/tests/snapshots/output_table_integration__plain_long_strings_not_truncated.snap @@ -0,0 +1,5 @@ +--- +source: tests/output_table_integration.rs +expression: result +--- +aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa diff --git a/tests/snapshots/output_table_integration__plain_multiple_strings.snap b/tests/snapshots/output_table_integration__plain_multiple_strings.snap new file mode 100644 index 0000000..f7c8c0f --- /dev/null +++ b/tests/snapshots/output_table_integration__plain_multiple_strings.snap @@ -0,0 +1,7 @@ +--- +source: tests/output_table_integration.rs +expression: result +--- 
+https://malware.example.com/beacon +C:\Windows\System32\cmd.exe +GetProcAddress diff --git a/tests/snapshots/output_table_integration__plain_preserves_special_characters.snap b/tests/snapshots/output_table_integration__plain_preserves_special_characters.snap new file mode 100644 index 0000000..d41e9ba --- /dev/null +++ b/tests/snapshots/output_table_integration__plain_preserves_special_characters.snap @@ -0,0 +1,8 @@ +--- +source: tests/output_table_integration.rs +expression: result +--- +tab here +pipe|here +quote"here +line1\nline2 diff --git a/tests/snapshots/output_table_integration__plain_single_string.snap b/tests/snapshots/output_table_integration__plain_single_string.snap new file mode 100644 index 0000000..dbeff49 --- /dev/null +++ b/tests/snapshots/output_table_integration__plain_single_string.snap @@ -0,0 +1,5 @@ +--- +source: tests/output_table_integration.rs +expression: result +--- +GetProcAddress diff --git a/tests/snapshots/output_table_integration__plain_unicode_strings.snap b/tests/snapshots/output_table_integration__plain_unicode_strings.snap new file mode 100644 index 0000000..a44510c --- /dev/null +++ b/tests/snapshots/output_table_integration__plain_unicode_strings.snap @@ -0,0 +1,7 @@ +--- +source: tests/output_table_integration.rs +expression: result +--- +中文字符串 +Русский +😀😁😂 diff --git a/tests/snapshots/output_table_integration__tty_all_tag_types.snap b/tests/snapshots/output_table_integration__tty_all_tag_types.snap new file mode 100644 index 0000000..5f6612e --- /dev/null +++ b/tests/snapshots/output_table_integration__tty_all_tag_types.snap @@ -0,0 +1,16 @@ +--- +source: tests/output_table_integration.rs +expression: result +--- +String | Tags | Score | Section +-------------------------------------------------------------|----------|--------|-------- +https://example.com | url | 100 | +example.com | domain | 80 | +192.168.1.1 | ipv4 | 90 | +::1 | ipv6 | 90 | +/etc/passwd | filepath | 85 | +HKLM\Software | regpath | 85 | 
+{12345678-1234-1234-1234-123456789012} | guid | 70 | +user@domain.com | email | 75 | +SGVsbG8gV29ybGQ= | b64 | 60 | +%s %d %x | fmt | 50 | diff --git a/tests/snapshots/output_table_integration__tty_empty_strings.snap b/tests/snapshots/output_table_integration__tty_empty_strings.snap new file mode 100644 index 0000000..c900bf2 --- /dev/null +++ b/tests/snapshots/output_table_integration__tty_empty_strings.snap @@ -0,0 +1,5 @@ +--- +source: tests/output_table_integration.rs +expression: result +--- + diff --git a/tests/snapshots/output_table_integration__tty_high_scores.snap b/tests/snapshots/output_table_integration__tty_high_scores.snap new file mode 100644 index 0000000..fa3a32a --- /dev/null +++ b/tests/snapshots/output_table_integration__tty_high_scores.snap @@ -0,0 +1,8 @@ +--- +source: tests/output_table_integration.rs +expression: result +--- +String | Tags | Score | Section +-------------------------------------------------------------|------|--------|-------- +critical IOC | url | 9999 | .rdata +negative score | | -50 | .text diff --git a/tests/snapshots/output_table_integration__tty_long_section_names.snap b/tests/snapshots/output_table_integration__tty_long_section_names.snap new file mode 100644 index 0000000..8cbd810 --- /dev/null +++ b/tests/snapshots/output_table_integration__tty_long_section_names.snap @@ -0,0 +1,9 @@ +--- +source: tests/output_table_integration.rs +expression: result +--- +String | Tags | Score | Section +-------------------------------------------------------------|------|--------|--------------- +string one | | 10 | .rodata.str1.1 +string two | | 20 | .data.rel.ro +string three | | 30 | .text diff --git a/tests/snapshots/output_table_integration__tty_long_strings_truncated.snap b/tests/snapshots/output_table_integration__tty_long_strings_truncated.snap new file mode 100644 index 0000000..643d930 --- /dev/null +++ b/tests/snapshots/output_table_integration__tty_long_strings_truncated.snap @@ -0,0 +1,8 @@ +--- +source: 
tests/output_table_integration.rs +expression: result +--- +String | Tags | Score | Section +-------------------------------------------------------------|----------|--------|-------- +https://very-long-subdomain.malware-domain.example.com/pa... | url | 150 | .rdata +C:\Users\Administrator\AppData\Local\Temp\aaaaaaaaaaaaaaa... | filepath | 120 | .data diff --git a/tests/snapshots/output_table_integration__tty_missing_optional_fields.snap b/tests/snapshots/output_table_integration__tty_missing_optional_fields.snap new file mode 100644 index 0000000..8e6e113 --- /dev/null +++ b/tests/snapshots/output_table_integration__tty_missing_optional_fields.snap @@ -0,0 +1,9 @@ +--- +source: tests/output_table_integration.rs +expression: result +--- +String | Tags | Score | Section +-------------------------------------------------------------|--------|--------|-------- +kernel32.dll | import | 50 | +mysterious string | | 10 | .text +bare minimum | | 0 | diff --git a/tests/snapshots/output_table_integration__tty_multiple_strings.snap b/tests/snapshots/output_table_integration__tty_multiple_strings.snap new file mode 100644 index 0000000..f7a8eca --- /dev/null +++ b/tests/snapshots/output_table_integration__tty_multiple_strings.snap @@ -0,0 +1,10 @@ +--- +source: tests/output_table_integration.rs +expression: result +--- +String | Tags | Score | Section +-------------------------------------------------------------|----------|--------|-------- +https://malware.example.com/beacon | url | 150 | .rdata +C:\Windows\System32\cmd.exe | filepath | 120 | .data +GetProcAddress | import | 80 | +192.168.1.100 | ipv4 | 100 | .rodata diff --git a/tests/snapshots/output_table_integration__tty_single_string.snap b/tests/snapshots/output_table_integration__tty_single_string.snap new file mode 100644 index 0000000..28cbea8 --- /dev/null +++ b/tests/snapshots/output_table_integration__tty_single_string.snap @@ -0,0 +1,7 @@ +--- +source: tests/output_table_integration.rs +expression: result +--- 
+String | Tags | Score | Section +-------------------------------------------------------------|------|--------|-------- +GetProcAddress | | 0 | diff --git a/tests/snapshots/output_table_integration__tty_special_characters.snap b/tests/snapshots/output_table_integration__tty_special_characters.snap new file mode 100644 index 0000000..2ebce1e --- /dev/null +++ b/tests/snapshots/output_table_integration__tty_special_characters.snap @@ -0,0 +1,9 @@ +--- +source: tests/output_table_integration.rs +expression: result +--- +String | Tags | Score | Section +-------------------------------------------------------------|----------|--------|-------- +string with tab | | 10 | .data +pipe|character | | 10 | .data +backslash\here | filepath | 20 | .rdata diff --git a/tests/snapshots/output_table_integration__tty_strings_with_multiple_tags.snap b/tests/snapshots/output_table_integration__tty_strings_with_multiple_tags.snap new file mode 100644 index 0000000..8be72b8 --- /dev/null +++ b/tests/snapshots/output_table_integration__tty_strings_with_multiple_tags.snap @@ -0,0 +1,8 @@ +--- +source: tests/output_table_integration.rs +expression: result +--- +String | Tags | Score | Section +-------------------------------------------------------------|--------|--------|-------- +http://evil.com/download.exe | url | 200 | .rdata +user@example.com | domain | 90 | .data diff --git a/tests/snapshots/output_table_integration__tty_various_encodings.snap b/tests/snapshots/output_table_integration__tty_various_encodings.snap new file mode 100644 index 0000000..eade21f --- /dev/null +++ b/tests/snapshots/output_table_integration__tty_various_encodings.snap @@ -0,0 +1,9 @@ +--- +source: tests/output_table_integration.rs +expression: result +--- +String | Tags | Score | Section +-------------------------------------------------------------|------|--------|-------- +ASCII string | | 50 | .rodata +UTF-8 string | | 50 | .rodata +UTF-16LE string | | 50 | .data From 
b8bcc8e99950dae10956e762e3ee6510c3d2dc17 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 18 Jan 2026 02:55:30 -0500 Subject: [PATCH 03/25] Implement JSON and YARA output formatters Added full implementations for `format_json` and `format_yara` functions, including serialization logic, escaping, and tag/group handling. Updated documentation and examples to reflect new UTF-16 extraction API and output formatting. Added comprehensive integration and snapshot tests for both output formats to ensure correctness and stability. --- src/classification/mod.rs | 24 +- src/classification/semantic.rs | 24 +- src/classification/symbols.rs | 49 +-- src/extraction/ascii.rs | 2 +- src/extraction/mod.rs | 11 +- src/extraction/pe_resources.rs | 7 + src/lib.rs | 10 +- src/output/json.rs | 279 +++++++++++++++- src/output/yara.rs | 298 +++++++++++++++++- tests/output_json_integration.rs | 266 ++++++++++++++++ tests/output_yara_integration.rs | 196 ++++++++++++ ..._json_integration__json_all_encodings.snap | 8 + ...ntegration__json_all_fields_populated.snap | 5 + ...ut_json_integration__json_all_sources.snap | 10 + ...utput_json_integration__json_all_tags.snap | 5 + ...t_json_integration__json_debug_fields.snap | 5 + ..._json_integration__json_empty_strings.snap | 5 + ...t_json_integration__json_long_strings.snap | 5 + ...on_integration__json_multiple_strings.snap | 7 + ...ntegration__json_optional_fields_none.snap | 5 + ..._json_integration__json_original_text.snap | 5 + ..._json_integration__json_single_string.snap | 5 + ..._integration__json_special_characters.snap | 5 + ...son_integration__json_unicode_content.snap | 5 + ..._yara_integration__yara_all_tag_types.snap | 80 +++++ ...ration__yara_binary_name_sanitization.snap | 20 ++ ...a_integration__yara_edge_case_names-2.snap | 20 ++ ...ara_integration__yara_edge_case_names.snap | 20 ++ ..._yara_integration__yara_empty_strings.snap | 16 + ..._integration__yara_encoding_modifiers.snap | 23 ++ 
...ut_yara_integration__yara_high_scores.snap | 23 ++ ...ntegration__yara_long_strings_skipped.snap | 19 ++ ...ara_integration__yara_mixed_encodings.snap | 26 ++ ..._yara_multiple_strings_different_tags.snap | 26 ++ ...ation__yara_multiple_strings_same_tag.snap | 22 ++ ...output_yara_integration__yara_no_tags.snap | 22 ++ ..._yara_integration__yara_single_string.snap | 20 ++ ..._integration__yara_special_characters.snap | 20 ++ ..._integration__yara_unicode_in_strings.snap | 20 ++ 39 files changed, 1539 insertions(+), 79 deletions(-) create mode 100644 tests/output_json_integration.rs create mode 100644 tests/output_yara_integration.rs create mode 100644 tests/snapshots/output_json_integration__json_all_encodings.snap create mode 100644 tests/snapshots/output_json_integration__json_all_fields_populated.snap create mode 100644 tests/snapshots/output_json_integration__json_all_sources.snap create mode 100644 tests/snapshots/output_json_integration__json_all_tags.snap create mode 100644 tests/snapshots/output_json_integration__json_debug_fields.snap create mode 100644 tests/snapshots/output_json_integration__json_empty_strings.snap create mode 100644 tests/snapshots/output_json_integration__json_long_strings.snap create mode 100644 tests/snapshots/output_json_integration__json_multiple_strings.snap create mode 100644 tests/snapshots/output_json_integration__json_optional_fields_none.snap create mode 100644 tests/snapshots/output_json_integration__json_original_text.snap create mode 100644 tests/snapshots/output_json_integration__json_single_string.snap create mode 100644 tests/snapshots/output_json_integration__json_special_characters.snap create mode 100644 tests/snapshots/output_json_integration__json_unicode_content.snap create mode 100644 tests/snapshots/output_yara_integration__yara_all_tag_types.snap create mode 100644 tests/snapshots/output_yara_integration__yara_binary_name_sanitization.snap create mode 100644 
tests/snapshots/output_yara_integration__yara_edge_case_names-2.snap create mode 100644 tests/snapshots/output_yara_integration__yara_edge_case_names.snap create mode 100644 tests/snapshots/output_yara_integration__yara_empty_strings.snap create mode 100644 tests/snapshots/output_yara_integration__yara_encoding_modifiers.snap create mode 100644 tests/snapshots/output_yara_integration__yara_high_scores.snap create mode 100644 tests/snapshots/output_yara_integration__yara_long_strings_skipped.snap create mode 100644 tests/snapshots/output_yara_integration__yara_mixed_encodings.snap create mode 100644 tests/snapshots/output_yara_integration__yara_multiple_strings_different_tags.snap create mode 100644 tests/snapshots/output_yara_integration__yara_multiple_strings_same_tag.snap create mode 100644 tests/snapshots/output_yara_integration__yara_no_tags.snap create mode 100644 tests/snapshots/output_yara_integration__yara_single_string.snap create mode 100644 tests/snapshots/output_yara_integration__yara_special_characters.snap create mode 100644 tests/snapshots/output_yara_integration__yara_unicode_in_strings.snap diff --git a/src/classification/mod.rs b/src/classification/mod.rs index 704ac76..f425aa9 100644 --- a/src/classification/mod.rs +++ b/src/classification/mod.rs @@ -26,22 +26,14 @@ //! use stringy::types::{FoundString, Encoding, StringSource, Tag}; //! //! let classifier = SemanticClassifier::new(); -//! let found_string = FoundString { -//! text: "C:\\Windows\\System32\\cmd.exe".to_string(), -//! original_text: None, -//! encoding: Encoding::Ascii, -//! offset: 0, -//! rva: None, -//! section: None, -//! length: 27, -//! tags: Vec::new(), -//! score: 0, -//! section_weight: None, -//! semantic_boost: None, -//! noise_penalty: None, -//! source: StringSource::SectionData, -//! confidence: 1.0, -//! }; +//! let text = "C:\\Windows\\System32\\cmd.exe"; +//! let found_string = FoundString::new( +//! text.to_string(), +//! Encoding::Ascii, +//! 0, +//! 
text.len() as u32, +//! StringSource::SectionData, +//! ); //! //! let tags = classifier.classify(&found_string); //! assert!(tags.contains(&Tag::FilePath)); diff --git a/src/classification/semantic.rs b/src/classification/semantic.rs index c6df7a7..0ad913f 100644 --- a/src/classification/semantic.rs +++ b/src/classification/semantic.rs @@ -23,22 +23,14 @@ //! use stringy::types::{FoundString, Encoding, StringSource}; //! //! let classifier = SemanticClassifier::new(); -//! let found_string = FoundString { -//! text: "https://example.com/api".to_string(), -//! original_text: None, -//! encoding: Encoding::Ascii, -//! offset: 0, -//! rva: None, -//! section: None, -//! length: 24, -//! tags: Vec::new(), -//! score: 0, -//! section_weight: None, -//! semantic_boost: None, -//! noise_penalty: None, -//! source: StringSource::SectionData, -//! confidence: 1.0, -//! }; +//! let text = "https://example.com/api"; +//! let found_string = FoundString::new( +//! text.to_string(), +//! Encoding::Ascii, +//! 0, +//! text.len() as u32, +//! StringSource::SectionData, +//! ); //! //! let tags = classifier.classify(&found_string); //! assert_eq!(tags.len(), 1); diff --git a/src/classification/symbols.rs b/src/classification/symbols.rs index 27b7cd2..b69ae31 100644 --- a/src/classification/symbols.rs +++ b/src/classification/symbols.rs @@ -18,24 +18,17 @@ //! use stringy::types::{FoundString, Encoding, StringSource, Tag}; //! //! let demangler = SymbolDemangler::new(); -//! let mut found_string = FoundString { -//! text: "_ZN4core3fmt5Write9write_str17h1234567890abcdefE".to_string(), -//! original_text: None, -//! encoding: Encoding::Ascii, -//! offset: 0, -//! rva: None, -//! section: None, -//! length: 47, -//! tags: Vec::new(), -//! score: 0, -//! section_weight: None, -//! semantic_boost: None, -//! noise_penalty: None, -//! source: StringSource::ImportName, -//! confidence: 1.0, -//! }; +//! let text = "_ZN4core3fmt5Write9write_str17h1234567890abcdefE"; +//! 
let mut found_string = FoundString::new( +//! text.to_string(), +//! Encoding::Ascii, +//! 0, +//! text.len() as u32, +//! StringSource::ImportName, +//! ); //! //! demangler.demangle(&mut found_string); +//! assert!(found_string.tags.contains(&Tag::DemangledSymbol)); //! // found_string.text now contains the demangled symbol //! // found_string.original_text contains the original mangled form //! // found_string.tags contains Tag::DemangledSymbol @@ -129,22 +122,14 @@ impl SymbolDemangler { /// use stringy::types::{FoundString, Encoding, StringSource, Tag}; /// /// let demangler = SymbolDemangler::new(); - /// let mut found_string = FoundString { - /// text: "_ZN4core3fmt5Write9write_str17h1234567890abcdefE".to_string(), - /// original_text: None, - /// encoding: Encoding::Ascii, - /// offset: 0, - /// rva: None, - /// section: None, - /// length: 47, - /// tags: Vec::new(), - /// score: 0, - /// section_weight: None, - /// semantic_boost: None, - /// noise_penalty: None, - /// source: StringSource::ImportName, - /// confidence: 1.0, - /// }; + /// let text = "_ZN4core3fmt5Write9write_str17h1234567890abcdefE"; + /// let mut found_string = FoundString::new( + /// text.to_string(), + /// Encoding::Ascii, + /// 0, + /// text.len() as u32, + /// StringSource::ImportName, + /// ); /// /// demangler.demangle(&mut found_string); /// assert!(found_string.tags.contains(&Tag::DemangledSymbol)); diff --git a/src/extraction/ascii.rs b/src/extraction/ascii.rs index 9f9d82f..2025348 100644 --- a/src/extraction/ascii.rs +++ b/src/extraction/ascii.rs @@ -26,7 +26,7 @@ //! is_writable: false, //! weight: 1.0, //! }; -//! let strings = extract_from_section(§ion, data, &config); +//! let strings = extract_from_section(§ion, data, &config, None, false, 0.5); //! ``` use crate::extraction::config::NoiseFilterConfig; diff --git a/src/extraction/mod.rs b/src/extraction/mod.rs index 6b3a85f..af814f8 100644 --- a/src/extraction/mod.rs +++ b/src/extraction/mod.rs @@ -40,7 +40,7 @@ //! 
and noise filtering. It implements byte-level scanning for contiguous UTF-16LE character //! sequences, following the pattern established in the ASCII extractor. //! -//! - `extract_utf16le_strings()`: Basic byte-level UTF-16LE string scanning +//! - `extract_utf16_strings()`: Basic byte-level UTF-16 string scanning //! - `extract_from_section()`: Section-aware extraction with proper metadata population //! - `Utf16ExtractionConfig`: Configuration for minimum/maximum character count and confidence thresholds //! @@ -89,6 +89,7 @@ //! use stringy::extraction::{BasicExtractor, ExtractionConfig, StringExtractor}; //! use stringy::container::{detect_format, create_parser}; //! +//! # fn example() -> stringy::Result<()> { //! let data = std::fs::read("example.exe")?; //! let format = detect_format(&data); //! let parser = create_parser(format)?; @@ -100,7 +101,7 @@ //! //! // Format-specific extractors //! use stringy::extraction::{ -//! extract_ascii_strings, extract_utf16le_strings, extract_load_command_strings, extract_resources, +//! extract_ascii_strings, extract_utf16_strings, extract_load_command_strings, extract_resources, //! extract_resource_strings, AsciiExtractionConfig, Utf16ExtractionConfig, //! }; //! @@ -108,9 +109,9 @@ //! let ascii_config = AsciiExtractionConfig::default(); //! let ascii_strings = extract_ascii_strings(&data, &ascii_config); //! -//! // UTF-16LE extraction +//! // UTF-16 extraction //! let utf16_config = Utf16ExtractionConfig::default(); -//! let utf16le_strings = extract_utf16le_strings(&data, &utf16_config); +//! let utf16_strings = extract_utf16_strings(&data, &utf16_config); //! //! // Phase 1: Get resource metadata //! let metadata = extract_resources(&data); @@ -121,6 +122,8 @@ //! // Mach-O load command extraction //! let macho_data = std::fs::read("example.dylib")?; //! let load_command_strings = extract_load_command_strings(&macho_data); +//! # Ok(()) +//! # } //! 
``` use crate::classification::{SemanticClassifier, SymbolDemangler}; diff --git a/src/extraction/pe_resources.rs b/src/extraction/pe_resources.rs index 7938667..211b085 100644 --- a/src/extraction/pe_resources.rs +++ b/src/extraction/pe_resources.rs @@ -48,7 +48,9 @@ //! //! ```rust //! use stringy::extraction::pe_resources::extract_resources; +//! use stringy::types::ResourceType; //! +//! # fn example() -> stringy::Result<()> { //! let pe_data = std::fs::read("example.exe")?; //! let resources = extract_resources(&pe_data); //! @@ -65,6 +67,8 @@ //! _ => {} //! } //! } +//! # Ok(()) +//! # } //! ``` //! //! ## Phase 2: Resource String Extraction @@ -73,6 +77,7 @@ //! use stringy::extraction::pe_resources::extract_resource_strings; //! use stringy::types::Tag; //! +//! # fn example() -> stringy::Result<()> { //! let pe_data = std::fs::read("example.exe")?; //! let strings = extract_resource_strings(&pe_data); //! @@ -85,6 +90,8 @@ //! let ui_strings: Vec<_> = strings.iter() //! .filter(|s| s.tags.contains(&Tag::Resource) && !s.tags.contains(&Tag::Version)) //! .collect(); +//! # Ok(()) +//! # } //! ``` use crate::types::{ diff --git a/src/lib.rs b/src/lib.rs index 58a931d..d340897 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -42,10 +42,10 @@ //! println!("Found {} ASCII strings", ascii_strings.len()); //! //! // UTF-16LE string extraction (Windows PE binaries) -//! use stringy::extraction::{extract_utf16le_strings, Utf16ExtractionConfig}; +//! use stringy::extraction::{extract_utf16_strings, Utf16ExtractionConfig}; //! let utf16_config = Utf16ExtractionConfig::default(); -//! let utf16le_strings = extract_utf16le_strings(&data, &utf16_config); -//! println!("Found {} UTF-16LE strings", utf16le_strings.len()); +//! let utf16_strings = extract_utf16_strings(&data, &utf16_config); +//! println!("Found {} UTF-16 strings", utf16_strings.len()); //! # Ok(()) //! # } //! 
``` @@ -86,4 +86,6 @@ pub use extraction::{ }; // Re-export output infrastructure types -pub use output::{OutputFormat, OutputMetadata, format_output, format_table_with_mode}; +pub use output::{ + OutputFormat, OutputMetadata, format_json, format_output, format_table_with_mode, format_yara, +}; diff --git a/src/output/json.rs b/src/output/json.rs index e183a25..635d8aa 100644 --- a/src/output/json.rs +++ b/src/output/json.rs @@ -1,9 +1,282 @@ -use crate::types::{FoundString, Result}; +use crate::types::{FoundString, Result, StringyError}; use super::OutputMetadata; /// Format strings as JSONL output, one object per line. pub fn format_json(_strings: &[FoundString], _metadata: &OutputMetadata) -> Result { - // TODO: Implement JSON formatter in a subsequent phase. - Ok(String::new()) + if _strings.is_empty() { + return Ok(String::new()); + } + + let mut lines = Vec::with_capacity(_strings.len()); + for item in _strings { + if !item.confidence.is_finite() { + return Err(StringyError::ConfigError( + "JSON serialization failed: non-finite confidence".to_string(), + )); + } + let line = serde_json::to_string(item).map_err(|err| { + StringyError::ConfigError(format!("JSON serialization failed: {}", err)) + })?; + lines.push(line); + } + + Ok(lines.join("\n")) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::output::{OutputFormat, OutputMetadata}; + use crate::types::{Encoding, FoundString, StringSource, Tag}; + use serde_json::Value; + + fn make_metadata(count: usize) -> OutputMetadata { + OutputMetadata::new("test.bin".to_string(), OutputFormat::Json, count, count) + } + + fn make_string(text: &str) -> FoundString { + FoundString::new( + text.to_string(), + Encoding::Ascii, + 0x1000, + text.len() as u32, + StringSource::SectionData, + ) + } + + fn parse_line(line: &str) -> Value { + serde_json::from_str(line).expect("JSON should parse") + } + + #[test] + fn test_empty_strings_returns_empty_output() { + let output = format_json(&[], 
&make_metadata(0)).expect("Formatting should succeed"); + assert!(output.is_empty()); + } + + #[test] + fn test_single_string_serialization() { + let strings = vec![make_string("alpha")]; + let output = format_json(&strings, &make_metadata(1)).expect("Formatting should succeed"); + let value = parse_line(&output); + assert_eq!(value["text"], "alpha"); + assert_eq!(value["encoding"], "Ascii"); + } + + #[test] + fn test_multiple_strings_jsonl_format() { + let strings = vec![make_string("one"), make_string("two")]; + let output = format_json(&strings, &make_metadata(2)).expect("Formatting should succeed"); + let lines: Vec<&str> = output.lines().collect(); + assert_eq!(lines.len(), 2); + assert_eq!(parse_line(lines[0])["text"], "one"); + assert_eq!(parse_line(lines[1])["text"], "two"); + } + + #[test] + fn test_optional_fields_excluded_when_none() { + let strings = vec![make_string("no-optional")]; + let output = format_json(&strings, &make_metadata(1)).expect("Formatting should succeed"); + assert!(!output.contains("original_text")); + assert!(!output.contains("section_weight")); + assert!(!output.contains("semantic_boost")); + assert!(!output.contains("noise_penalty")); + } + + #[test] + fn test_optional_fields_included_when_some() { + let strings = vec![ + make_string("with-optional") + .with_original_text("orig".to_string()) + .with_section_weight(10) + .with_semantic_boost(5) + .with_noise_penalty(-2), + ]; + let output = format_json(&strings, &make_metadata(1)).expect("Formatting should succeed"); + assert!(output.contains("original_text")); + assert!(output.contains("section_weight")); + assert!(output.contains("semantic_boost")); + assert!(output.contains("noise_penalty")); + } + + #[test] + fn test_special_characters_are_escaped() { + let strings = vec![make_string("quote\" backslash\\ line\n tab\t")]; + let output = format_json(&strings, &make_metadata(1)).expect("Formatting should succeed"); + assert!(output.contains("\\\"")); + 
assert!(output.contains("\\\\")); + assert!(output.contains("\\n")); + assert!(output.contains("\\t")); + } + + #[test] + fn test_all_encodings_serialize_correctly() { + let strings = vec![ + FoundString::new( + "a".to_string(), + Encoding::Ascii, + 0, + 1, + StringSource::SectionData, + ), + FoundString::new( + "b".to_string(), + Encoding::Utf8, + 1, + 1, + StringSource::SectionData, + ), + FoundString::new( + "c".to_string(), + Encoding::Utf16Le, + 2, + 2, + StringSource::SectionData, + ), + FoundString::new( + "d".to_string(), + Encoding::Utf16Be, + 3, + 2, + StringSource::SectionData, + ), + ]; + let output = format_json(&strings, &make_metadata(4)).expect("Formatting should succeed"); + let lines: Vec<&str> = output.lines().collect(); + assert_eq!(parse_line(lines[0])["encoding"], "Ascii"); + assert_eq!(parse_line(lines[1])["encoding"], "Utf8"); + assert_eq!(parse_line(lines[2])["encoding"], "Utf16Le"); + assert_eq!(parse_line(lines[3])["encoding"], "Utf16Be"); + } + + #[test] + fn test_all_tag_types_serialize_correct_names() { + let tags = vec![ + Tag::Url, + Tag::Domain, + Tag::IPv4, + Tag::IPv6, + Tag::FilePath, + Tag::RegistryPath, + Tag::Guid, + Tag::Email, + Tag::Base64, + Tag::FormatString, + Tag::UserAgent, + Tag::DemangledSymbol, + Tag::Import, + Tag::Export, + Tag::Version, + Tag::Manifest, + Tag::Resource, + Tag::DylibPath, + Tag::Rpath, + Tag::RpathVariable, + Tag::FrameworkPath, + ]; + let strings = vec![make_string("tagged").with_tags(tags)]; + let output = format_json(&strings, &make_metadata(1)).expect("Formatting should succeed"); + let value = parse_line(&output); + let tag_values: Vec = value["tags"] + .as_array() + .expect("tags should be an array") + .iter() + .map(|item| item.as_str().expect("tag should be string").to_string()) + .collect(); + + let expected = vec![ + "Url", + "Domain", + "ipv4", + "ipv6", + "filepath", + "regpath", + "guid", + "Email", + "b64", + "fmt", + "user-agent-ish", + "demangled", + "Import", + "Export", + 
"Version", + "Manifest", + "Resource", + "dylib-path", + "rpath", + "rpath-var", + "framework-path", + ]; + + for name in expected { + assert!(tag_values.iter().any(|tag| tag == name)); + } + } + + #[test] + fn test_all_source_types_serialize_correctly() { + let strings = vec![ + FoundString::new( + "a".to_string(), + Encoding::Ascii, + 0, + 1, + StringSource::SectionData, + ), + FoundString::new( + "b".to_string(), + Encoding::Ascii, + 1, + 1, + StringSource::ImportName, + ), + FoundString::new( + "c".to_string(), + Encoding::Ascii, + 2, + 1, + StringSource::ExportName, + ), + FoundString::new( + "d".to_string(), + Encoding::Ascii, + 3, + 1, + StringSource::ResourceString, + ), + FoundString::new( + "e".to_string(), + Encoding::Ascii, + 4, + 1, + StringSource::LoadCommand, + ), + FoundString::new( + "f".to_string(), + Encoding::Ascii, + 5, + 1, + StringSource::DebugInfo, + ), + ]; + let output = format_json(&strings, &make_metadata(6)).expect("Formatting should succeed"); + let lines: Vec<&str> = output.lines().collect(); + assert_eq!(parse_line(lines[0])["source"], "SectionData"); + assert_eq!(parse_line(lines[1])["source"], "ImportName"); + assert_eq!(parse_line(lines[2])["source"], "ExportName"); + assert_eq!(parse_line(lines[3])["source"], "ResourceString"); + assert_eq!(parse_line(lines[4])["source"], "LoadCommand"); + assert_eq!(parse_line(lines[5])["source"], "DebugInfo"); + } + + #[test] + fn test_error_propagation_for_serialization_failures() { + let strings = vec![make_string("nan").with_confidence(f32::NAN)]; + let result = format_json(&strings, &make_metadata(1)); + match result { + Err(StringyError::ConfigError(_)) => {} + _ => panic!("Expected ConfigError on invalid JSON serialization"), + } + } } diff --git a/src/output/yara.rs b/src/output/yara.rs index 8e367b8..5e90d53 100644 --- a/src/output/yara.rs +++ b/src/output/yara.rs @@ -1,9 +1,301 @@ -use crate::types::{FoundString, Result}; +use crate::types::{Encoding, FoundString, Result}; use 
super::OutputMetadata; +use std::collections::{BTreeMap, HashMap}; +use std::time::{SystemTime, UNIX_EPOCH}; /// Format strings as YARA rule templates. pub fn format_yara(_strings: &[FoundString], _metadata: &OutputMetadata) -> Result { - // TODO: Implement YARA formatter in a subsequent phase. - Ok(String::new()) + let timestamp = current_timestamp(); + let base_rule_name = sanitize_rule_name(&_metadata.binary_name); + let rule_name = format!("{}_strings", base_rule_name); + + let mut output = String::new(); + output.push_str("// YARA rule generated by Stringy\n"); + output.push_str(&format!("// Binary: {}\n", _metadata.binary_name)); + output.push_str(&format!("// Generated: {}\n\n", timestamp)); + + output.push_str(&format!("rule {} {{\n", rule_name)); + output.push_str(" meta:\n"); + output.push_str(&format!( + " description = \"Strings extracted from {}\"\n", + escape_yara_string(&_metadata.binary_name) + )); + output.push_str(" generated_by = \"stringy\"\n"); + output.push_str(&format!(" generated_at = \"{}\"\n", timestamp)); + + if _strings.is_empty() { + output.push_str(" condition:\n"); + output.push_str(" true\n"); + output.push_str("}\n"); + return Ok(output); + } + + let grouped = group_strings_by_tag(_strings); + let mut strings_block = String::new(); + let mut counters: HashMap = HashMap::new(); + let mut included = 0usize; + + strings_block.push_str(" strings:\n"); + for (tag, items) in grouped { + strings_block.push_str(&format!(" // tag: {}\n", tag)); + let var_tag = sanitize_identifier(&tag); + for item in items { + let char_count = item.text.chars().count(); + if char_count > 200 { + strings_block.push_str(&format!( + " // skipped (length > 200 chars): {}\n", + char_count + )); + continue; + } + + let counter = counters.entry(var_tag.clone()).or_insert(0); + *counter += 1; + let var_name = format!("${}_{}", var_tag, *counter); + let escaped = escape_yara_string(&item.text); + let modifier = get_yara_modifier(item.encoding); + + 
strings_block.push_str(&format!(" // score: {}\n", item.score)); + strings_block.push_str(&format!( + " {} = \"{}\" {}\n", + var_name, escaped, modifier + )); + included += 1; + } + } + + output.push_str(&strings_block); + output.push_str(" condition:\n"); + if included == 0 { + output.push_str(" true\n"); + } else { + output.push_str(" any of them\n"); + } + output.push_str("}\n"); + + Ok(output) +} + +fn current_timestamp() -> String { + match SystemTime::now().duration_since(UNIX_EPOCH) { + Ok(duration) => duration.as_secs().to_string(), + Err(_) => "0".to_string(), + } +} + +fn sanitize_rule_name(binary_name: &str) -> String { + let mut sanitized = String::new(); + for ch in binary_name.chars() { + if ch.is_ascii_alphanumeric() { + sanitized.push(ch); + } else { + sanitized.push('_'); + } + } + + if sanitized.is_empty() { + sanitized.push('_'); + } + + let first = sanitized.chars().next().unwrap_or('_'); + if !first.is_ascii_alphabetic() && first != '_' { + sanitized.insert(0, '_'); + } + + sanitized +} + +fn sanitize_identifier(name: &str) -> String { + let mut sanitized = String::new(); + for ch in name.chars() { + if ch.is_ascii_alphanumeric() || ch == '_' { + sanitized.push(ch); + } else { + sanitized.push('_'); + } + } + + if sanitized.is_empty() { + "tag".to_string() + } else { + sanitized + } +} + +fn escape_yara_string(text: &str) -> String { + let mut escaped = String::new(); + for byte in text.as_bytes() { + match *byte { + b'\\' => escaped.push_str("\\\\"), + b'"' => escaped.push_str("\\\""), + b'\n' => escaped.push_str("\\n"), + b'\r' => escaped.push_str("\\r"), + b'\t' => escaped.push_str("\\t"), + 0x08 => escaped.push_str("\\b"), + 0x0b => escaped.push_str("\\x0b"), + 0x0c => escaped.push_str("\\x0c"), + 0x00..=0x1f | 0x7f..=0xff => { + escaped.push_str(&format!("\\x{:02x}", byte)); + } + _ => escaped.push(*byte as char), + } + } + escaped +} + +fn get_yara_modifier(encoding: Encoding) -> &'static str { + match encoding { + Encoding::Ascii | 
Encoding::Utf8 => "ascii", + Encoding::Utf16Le | Encoding::Utf16Be => "wide", + } +} + +fn tag_name(tag: &crate::types::Tag) -> &'static str { + match tag { + crate::types::Tag::Url => "Url", + crate::types::Tag::Domain => "Domain", + crate::types::Tag::IPv4 => "ipv4", + crate::types::Tag::IPv6 => "ipv6", + crate::types::Tag::FilePath => "filepath", + crate::types::Tag::RegistryPath => "regpath", + crate::types::Tag::Guid => "guid", + crate::types::Tag::Email => "Email", + crate::types::Tag::Base64 => "b64", + crate::types::Tag::FormatString => "fmt", + crate::types::Tag::UserAgent => "user-agent-ish", + crate::types::Tag::DemangledSymbol => "demangled", + crate::types::Tag::Import => "Import", + crate::types::Tag::Export => "Export", + crate::types::Tag::Version => "Version", + crate::types::Tag::Manifest => "Manifest", + crate::types::Tag::Resource => "Resource", + crate::types::Tag::DylibPath => "dylib-path", + crate::types::Tag::Rpath => "rpath", + crate::types::Tag::RpathVariable => "rpath-var", + crate::types::Tag::FrameworkPath => "framework-path", + } +} + +fn group_strings_by_tag(strings: &[FoundString]) -> BTreeMap> { + let mut grouped: BTreeMap> = BTreeMap::new(); + + for item in strings { + let tag = item + .tags + .first() + .map(|tag| tag_name(tag).to_string()) + .unwrap_or_else(|| "untagged".to_string()); + grouped.entry(tag).or_default().push(item); + } + + grouped +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::output::{OutputFormat, OutputMetadata}; + use crate::types::{FoundString, StringSource, Tag}; + + fn make_metadata() -> OutputMetadata { + OutputMetadata::new("sample.bin".to_string(), OutputFormat::Yara, 0, 0) + } + + fn make_string(text: &str) -> FoundString { + FoundString::new( + text.to_string(), + Encoding::Ascii, + 0, + text.len() as u32, + StringSource::SectionData, + ) + } + + #[test] + fn test_sanitize_rule_name() { + assert_eq!(sanitize_rule_name("sample.bin"), "sample_bin"); + 
assert_eq!(sanitize_rule_name("123name"), "_123name"); + assert_eq!(sanitize_rule_name("$weird#name"), "_weird_name"); + assert_eq!(sanitize_rule_name(""), "_"); + } + + #[test] + fn test_escape_yara_string() { + let input = "quote\" backslash\\ line\n tab\t"; + let escaped = escape_yara_string(input); + assert!(escaped.contains("\\\"")); + assert!(escaped.contains("\\\\")); + assert!(escaped.contains("\\n")); + assert!(escaped.contains("\\t")); + } + + #[test] + fn test_get_yara_modifier() { + assert_eq!(get_yara_modifier(Encoding::Ascii), "ascii"); + assert_eq!(get_yara_modifier(Encoding::Utf8), "ascii"); + assert_eq!(get_yara_modifier(Encoding::Utf16Le), "wide"); + assert_eq!(get_yara_modifier(Encoding::Utf16Be), "wide"); + } + + #[test] + fn test_group_strings_by_tag() { + let strings = vec![ + make_string("one").with_tags(vec![Tag::Url]), + make_string("two").with_tags(vec![Tag::Domain]), + make_string("three"), + ]; + let grouped = group_strings_by_tag(&strings); + assert!(grouped.contains_key("Url")); + assert!(grouped.contains_key("Domain")); + assert!(grouped.contains_key("untagged")); + } + + #[test] + fn test_empty_strings_produces_minimal_rule() { + let output = format_yara(&[], &make_metadata()).expect("Formatting should succeed"); + assert!(output.contains("condition:")); + assert!(output.contains("true")); + } + + #[test] + fn test_single_string_produces_rule() { + let strings = vec![make_string("alpha").with_tags(vec![Tag::Url])]; + let output = format_yara(&strings, &make_metadata()).expect("Formatting should succeed"); + assert!(output.contains("strings:")); + assert!(output.contains("$Url_1")); + assert!(output.contains("\"alpha\"")); + } + + #[test] + fn test_long_strings_are_skipped() { + let long_text = "a".repeat(201); + let strings = vec![make_string(&long_text).with_tags(vec![Tag::Url])]; + let output = format_yara(&strings, &make_metadata()).expect("Formatting should succeed"); + assert!(output.contains("skipped (length > 200 chars)")); + 
} + + #[test] + fn test_binary_name_sanitization_in_rule_name() { + let metadata = OutputMetadata::new("weird name.exe".to_string(), OutputFormat::Yara, 1, 1); + let strings = vec![make_string("alpha")]; + let output = format_yara(&strings, &metadata).expect("Formatting should succeed"); + assert!(output.contains("rule weird_name_exe_strings")); + } + + #[test] + fn test_encodings_apply_modifiers() { + let mut string = make_string("wide"); + string.encoding = Encoding::Utf16Le; + let output = format_yara(&[string], &make_metadata()).expect("Formatting should succeed"); + assert!(output.contains("wide")); + } + + #[test] + fn test_unicode_content_is_escaped() { + let unicode = "\u{4E2D}\u{6587}"; + let strings = vec![make_string(unicode).with_tags(vec![Tag::Domain])]; + let output = format_yara(&strings, &make_metadata()).expect("Formatting should succeed"); + assert!(output.contains("\\x")); + } } diff --git a/tests/output_json_integration.rs b/tests/output_json_integration.rs new file mode 100644 index 0000000..4590956 --- /dev/null +++ b/tests/output_json_integration.rs @@ -0,0 +1,266 @@ +//! Integration tests for JSON output formatter. +//! +//! Uses insta snapshots to verify output format consistency. 
+ +use insta::assert_snapshot; +use serde_json::Value; +use stringy::output::{OutputFormat, OutputMetadata, format_json}; +use stringy::types::{Encoding, FoundString, StringSource, Tag}; + +fn make_string(text: &str) -> FoundString { + FoundString::new( + text.to_string(), + Encoding::Ascii, + 0x1000, + text.len() as u32, + StringSource::SectionData, + ) +} + +fn make_metadata(count: usize) -> OutputMetadata { + OutputMetadata::new( + "test_binary.exe".to_string(), + OutputFormat::Json, + count, + count, + ) +} + +fn parse_line(line: &str) -> Value { + serde_json::from_str(line).expect("JSON should parse") +} + +#[test] +fn test_json_empty_strings() { + let output = format_json(&[], &make_metadata(0)).unwrap(); + assert_snapshot!(output); +} + +#[test] +fn test_json_single_string() { + let strings = vec![make_string("GetProcAddress")]; + let output = format_json(&strings, &make_metadata(1)).unwrap(); + assert_snapshot!(output); +} + +#[test] +fn test_json_multiple_strings() { + let strings = vec![make_string("one"), make_string("two"), make_string("three")]; + let output = format_json(&strings, &make_metadata(3)).unwrap(); + assert_snapshot!(output); +} + +#[test] +fn test_json_all_fields_populated() { + let strings = vec![ + make_string("fielded") + .with_original_text("original".to_string()) + .with_section(".rdata".to_string()) + .with_rva(0x2000) + .with_tags(vec![Tag::Url]) + .with_score(150) + .with_section_weight(20) + .with_semantic_boost(30) + .with_noise_penalty(-10) + .with_confidence(0.9), + ]; + let output = format_json(&strings, &make_metadata(1)).unwrap(); + assert_snapshot!(output); +} + +#[test] +fn test_json_optional_fields_none() { + let strings = vec![make_string("no-optional")]; + let output = format_json(&strings, &make_metadata(1)).unwrap(); + assert_snapshot!(output); +} + +#[test] +fn test_json_special_characters() { + let strings = vec![make_string("quote\" backslash\\ line\n tab\t")]; + let output = format_json(&strings, 
&make_metadata(1)).unwrap(); + assert_snapshot!(output); +} + +#[test] +fn test_json_all_encodings() { + let strings = vec![ + FoundString::new( + "ASCII".to_string(), + Encoding::Ascii, + 0, + 5, + StringSource::SectionData, + ), + FoundString::new( + "UTF8".to_string(), + Encoding::Utf8, + 1, + 4, + StringSource::SectionData, + ), + FoundString::new( + "UTF16LE".to_string(), + Encoding::Utf16Le, + 2, + 14, + StringSource::SectionData, + ), + FoundString::new( + "UTF16BE".to_string(), + Encoding::Utf16Be, + 3, + 14, + StringSource::SectionData, + ), + ]; + let output = format_json(&strings, &make_metadata(4)).unwrap(); + assert_snapshot!(output); +} + +#[test] +fn test_json_all_tags() { + let tags = vec![ + Tag::Url, + Tag::Domain, + Tag::IPv4, + Tag::IPv6, + Tag::FilePath, + Tag::RegistryPath, + Tag::Guid, + Tag::Email, + Tag::Base64, + Tag::FormatString, + Tag::UserAgent, + Tag::DemangledSymbol, + Tag::Import, + Tag::Export, + Tag::Version, + Tag::Manifest, + Tag::Resource, + Tag::DylibPath, + Tag::Rpath, + Tag::RpathVariable, + Tag::FrameworkPath, + ]; + let strings = vec![make_string("tagged").with_tags(tags)]; + let output = format_json(&strings, &make_metadata(1)).unwrap(); + assert_snapshot!(output); +} + +#[test] +fn test_json_all_sources() { + let strings = vec![ + FoundString::new( + "sec".to_string(), + Encoding::Ascii, + 0, + 3, + StringSource::SectionData, + ), + FoundString::new( + "imp".to_string(), + Encoding::Ascii, + 1, + 3, + StringSource::ImportName, + ), + FoundString::new( + "exp".to_string(), + Encoding::Ascii, + 2, + 3, + StringSource::ExportName, + ), + FoundString::new( + "res".to_string(), + Encoding::Ascii, + 3, + 3, + StringSource::ResourceString, + ), + FoundString::new( + "lc".to_string(), + Encoding::Ascii, + 4, + 2, + StringSource::LoadCommand, + ), + FoundString::new( + "dbg".to_string(), + Encoding::Ascii, + 5, + 3, + StringSource::DebugInfo, + ), + ]; + let output = format_json(&strings, &make_metadata(6)).unwrap(); + 
assert_snapshot!(output); +} + +#[test] +fn test_json_debug_fields() { + let strings = vec![ + make_string("debug") + .with_section_weight(10) + .with_semantic_boost(5) + .with_noise_penalty(-3), + ]; + let output = format_json(&strings, &make_metadata(1)).unwrap(); + assert_snapshot!(output); +} + +#[test] +fn test_json_original_text() { + let strings = vec![make_string("demangled").with_original_text("_ZN".to_string())]; + let output = format_json(&strings, &make_metadata(1)).unwrap(); + assert_snapshot!(output); +} + +#[test] +fn test_json_long_strings() { + let long_text = "a".repeat(300); + let strings = vec![make_string(&long_text).with_score(5)]; + let output = format_json(&strings, &make_metadata(1)).unwrap(); + assert_snapshot!(output); +} + +#[test] +fn test_json_unicode_content() { + let unicode = "\u{4E2D}\u{6587}\u{5B57}\u{7B26}"; + let strings = vec![make_string(unicode)]; + let output = format_json(&strings, &make_metadata(1)).unwrap(); + assert_snapshot!(output); +} + +#[test] +fn test_json_parse_roundtrip() { + let strings = vec![ + make_string("roundtrip") + .with_tags(vec![Tag::Url]) + .with_score(10), + make_string("another") + .with_tags(vec![Tag::Domain]) + .with_score(20), + ]; + let output = format_json(&strings, &make_metadata(2)).unwrap(); + let lines: Vec<&str> = output.lines().collect(); + assert_eq!(lines.len(), 2); + + let first: FoundString = serde_json::from_str(lines[0]).expect("should deserialize"); + let second: FoundString = serde_json::from_str(lines[1]).expect("should deserialize"); + + assert_eq!(first.text, "roundtrip"); + assert_eq!(second.text, "another"); +} + +#[test] +fn test_json_optional_fields_excluded() { + let strings = vec![make_string("no-optional")]; + let output = format_json(&strings, &make_metadata(1)).unwrap(); + let value = parse_line(&output); + assert!(value.get("original_text").is_none()); + assert!(value.get("section_weight").is_none()); + assert!(value.get("semantic_boost").is_none()); + 
assert!(value.get("noise_penalty").is_none()); +} diff --git a/tests/output_yara_integration.rs b/tests/output_yara_integration.rs new file mode 100644 index 0000000..9468a75 --- /dev/null +++ b/tests/output_yara_integration.rs @@ -0,0 +1,196 @@ +//! Integration tests for YARA output formatter. +//! +//! Uses insta snapshots to verify output format consistency. + +use insta::assert_snapshot; +use stringy::output::{OutputFormat, OutputMetadata, format_yara}; +use stringy::types::{Encoding, FoundString, StringSource, Tag}; + +fn make_string(text: &str) -> FoundString { + FoundString::new( + text.to_string(), + Encoding::Ascii, + 0x1000, + text.len() as u32, + StringSource::SectionData, + ) +} + +fn make_metadata(binary_name: &str, count: usize) -> OutputMetadata { + OutputMetadata::new(binary_name.to_string(), OutputFormat::Yara, count, count) +} + +#[test] +fn test_yara_empty_strings() { + let output = format_yara(&[], &make_metadata("empty.bin", 0)).unwrap(); + assert_snapshot!(output); +} + +#[test] +fn test_yara_single_string() { + let strings = vec![make_string("GetProcAddress").with_tags(vec![Tag::Import])]; + let output = format_yara(&strings, &make_metadata("single.exe", 1)).unwrap(); + assert_snapshot!(output); +} + +#[test] +fn test_yara_multiple_strings_same_tag() { + let strings = vec![ + make_string("alpha").with_tags(vec![Tag::Url]), + make_string("beta").with_tags(vec![Tag::Url]), + ]; + let output = format_yara(&strings, &make_metadata("same-tag.exe", 2)).unwrap(); + assert_snapshot!(output); +} + +#[test] +fn test_yara_multiple_strings_different_tags() { + let strings = vec![ + make_string("https://example.com").with_tags(vec![Tag::Url]), + make_string("example.com").with_tags(vec![Tag::Domain]), + make_string("192.168.1.1").with_tags(vec![Tag::IPv4]), + ]; + let output = format_yara(&strings, &make_metadata("diff-tag.exe", 3)).unwrap(); + assert_snapshot!(output); +} + +#[test] +fn test_yara_no_tags() { + let strings = vec![make_string("no-tag"), 
make_string("still-no-tag")]; + let output = format_yara(&strings, &make_metadata("untagged.exe", 2)).unwrap(); + assert_snapshot!(output); +} + +#[test] +fn test_yara_long_strings_skipped() { + let long_text = "a".repeat(201); + let strings = vec![make_string(&long_text).with_tags(vec![Tag::Url])]; + let output = format_yara(&strings, &make_metadata("long.exe", 1)).unwrap(); + assert_snapshot!(output); +} + +#[test] +fn test_yara_special_characters() { + let strings = vec![ + make_string("quote\" backslash\\ line\n tab\t") + .with_tags(vec![Tag::FilePath]) + .with_score(10), + ]; + let output = format_yara(&strings, &make_metadata("special.exe", 1)).unwrap(); + assert_snapshot!(output); +} + +#[test] +fn test_yara_binary_name_sanitization() { + let strings = vec![make_string("alpha")]; + let output = format_yara(&strings, &make_metadata("weird name.exe", 1)).unwrap(); + assert_snapshot!(output); +} + +#[test] +fn test_yara_encoding_modifiers() { + let ascii = make_string("ascii"); + let utf16 = FoundString::new( + "wide".to_string(), + Encoding::Utf16Le, + 0x2000, + 8, + StringSource::SectionData, + ) + .with_tags(vec![Tag::Resource]); + + let output = format_yara(&[ascii, utf16], &make_metadata("enc.exe", 2)).unwrap(); + assert_snapshot!(output); +} + +#[test] +fn test_yara_mixed_encodings() { + let strings = vec![ + FoundString::new( + "ascii".to_string(), + Encoding::Ascii, + 0x1000, + 5, + StringSource::SectionData, + ) + .with_tags(vec![Tag::Url]), + FoundString::new( + "utf8".to_string(), + Encoding::Utf8, + 0x2000, + 4, + StringSource::SectionData, + ) + .with_tags(vec![Tag::Domain]), + FoundString::new( + "utf16".to_string(), + Encoding::Utf16Be, + 0x3000, + 10, + StringSource::SectionData, + ) + .with_tags(vec![Tag::Resource]), + ]; + let output = format_yara(&strings, &make_metadata("mixed.exe", 3)).unwrap(); + assert_snapshot!(output); +} + +#[test] +fn test_yara_high_scores() { + let strings = vec![ + make_string("critical") + 
.with_tags(vec![Tag::Url]) + .with_score(9999), + make_string("low") + .with_tags(vec![Tag::Domain]) + .with_score(-10), + ]; + let output = format_yara(&strings, &make_metadata("scores.exe", 2)).unwrap(); + assert_snapshot!(output); +} + +#[test] +fn test_yara_all_tag_types() { + let strings = vec![ + make_string("url").with_tags(vec![Tag::Url]), + make_string("domain").with_tags(vec![Tag::Domain]), + make_string("ipv4").with_tags(vec![Tag::IPv4]), + make_string("ipv6").with_tags(vec![Tag::IPv6]), + make_string("path").with_tags(vec![Tag::FilePath]), + make_string("reg").with_tags(vec![Tag::RegistryPath]), + make_string("guid").with_tags(vec![Tag::Guid]), + make_string("email").with_tags(vec![Tag::Email]), + make_string("b64").with_tags(vec![Tag::Base64]), + make_string("fmt").with_tags(vec![Tag::FormatString]), + make_string("agent").with_tags(vec![Tag::UserAgent]), + make_string("demangled").with_tags(vec![Tag::DemangledSymbol]), + make_string("import").with_tags(vec![Tag::Import]), + make_string("export").with_tags(vec![Tag::Export]), + make_string("version").with_tags(vec![Tag::Version]), + make_string("manifest").with_tags(vec![Tag::Manifest]), + make_string("resource").with_tags(vec![Tag::Resource]), + make_string("dylib").with_tags(vec![Tag::DylibPath]), + make_string("rpath").with_tags(vec![Tag::Rpath]), + make_string("rpathvar").with_tags(vec![Tag::RpathVariable]), + make_string("framework").with_tags(vec![Tag::FrameworkPath]), + ]; + let output = format_yara(&strings, &make_metadata("tags.exe", strings.len())).unwrap(); + assert_snapshot!(output); +} + +#[test] +fn test_yara_unicode_in_strings() { + let unicode = "\u{4E2D}\u{6587}\u{5B57}\u{7B26}"; + let strings = vec![make_string(unicode).with_tags(vec![Tag::Domain])]; + let output = format_yara(&strings, &make_metadata("unicode.exe", 1)).unwrap(); + assert_snapshot!(output); +} + +#[test] +fn test_yara_edge_case_names() { + let strings = vec![make_string("alpha")]; + let output_numbers = 
format_yara(&strings, &make_metadata("12345", 1)).unwrap(); + let output_special = format_yara(&strings, &make_metadata("#$%", 1)).unwrap(); + assert_snapshot!(output_numbers); + assert_snapshot!(output_special); +} diff --git a/tests/snapshots/output_json_integration__json_all_encodings.snap b/tests/snapshots/output_json_integration__json_all_encodings.snap new file mode 100644 index 0000000..fac7e90 --- /dev/null +++ b/tests/snapshots/output_json_integration__json_all_encodings.snap @@ -0,0 +1,8 @@ +--- +source: tests/output_json_integration.rs +expression: output +--- +{"text":"ASCII","encoding":"Ascii","offset":0,"rva":null,"section":null,"length":5,"tags":[],"score":0,"source":"SectionData","confidence":1.0} +{"text":"UTF8","encoding":"Utf8","offset":1,"rva":null,"section":null,"length":4,"tags":[],"score":0,"source":"SectionData","confidence":1.0} +{"text":"UTF16LE","encoding":"Utf16Le","offset":2,"rva":null,"section":null,"length":14,"tags":[],"score":0,"source":"SectionData","confidence":1.0} +{"text":"UTF16BE","encoding":"Utf16Be","offset":3,"rva":null,"section":null,"length":14,"tags":[],"score":0,"source":"SectionData","confidence":1.0} diff --git a/tests/snapshots/output_json_integration__json_all_fields_populated.snap b/tests/snapshots/output_json_integration__json_all_fields_populated.snap new file mode 100644 index 0000000..3593900 --- /dev/null +++ b/tests/snapshots/output_json_integration__json_all_fields_populated.snap @@ -0,0 +1,5 @@ +--- +source: tests/output_json_integration.rs +expression: output +--- +{"text":"fielded","original_text":"original","encoding":"Ascii","offset":4096,"rva":8192,"section":".rdata","length":7,"tags":["Url"],"score":150,"section_weight":20,"semantic_boost":30,"noise_penalty":-10,"source":"SectionData","confidence":0.9} diff --git a/tests/snapshots/output_json_integration__json_all_sources.snap b/tests/snapshots/output_json_integration__json_all_sources.snap new file mode 100644 index 0000000..ab773f4 --- /dev/null +++ 
b/tests/snapshots/output_json_integration__json_all_sources.snap @@ -0,0 +1,10 @@ +--- +source: tests/output_json_integration.rs +expression: output +--- +{"text":"sec","encoding":"Ascii","offset":0,"rva":null,"section":null,"length":3,"tags":[],"score":0,"source":"SectionData","confidence":1.0} +{"text":"imp","encoding":"Ascii","offset":1,"rva":null,"section":null,"length":3,"tags":[],"score":0,"source":"ImportName","confidence":1.0} +{"text":"exp","encoding":"Ascii","offset":2,"rva":null,"section":null,"length":3,"tags":[],"score":0,"source":"ExportName","confidence":1.0} +{"text":"res","encoding":"Ascii","offset":3,"rva":null,"section":null,"length":3,"tags":[],"score":0,"source":"ResourceString","confidence":1.0} +{"text":"lc","encoding":"Ascii","offset":4,"rva":null,"section":null,"length":2,"tags":[],"score":0,"source":"LoadCommand","confidence":1.0} +{"text":"dbg","encoding":"Ascii","offset":5,"rva":null,"section":null,"length":3,"tags":[],"score":0,"source":"DebugInfo","confidence":1.0} diff --git a/tests/snapshots/output_json_integration__json_all_tags.snap b/tests/snapshots/output_json_integration__json_all_tags.snap new file mode 100644 index 0000000..f3a0b35 --- /dev/null +++ b/tests/snapshots/output_json_integration__json_all_tags.snap @@ -0,0 +1,5 @@ +--- +source: tests/output_json_integration.rs +expression: output +--- +{"text":"tagged","encoding":"Ascii","offset":4096,"rva":null,"section":null,"length":6,"tags":["Url","Domain","ipv4","ipv6","filepath","regpath","guid","Email","b64","fmt","user-agent-ish","demangled","Import","Export","Version","Manifest","Resource","dylib-path","rpath","rpath-var","framework-path"],"score":0,"source":"SectionData","confidence":1.0} diff --git a/tests/snapshots/output_json_integration__json_debug_fields.snap b/tests/snapshots/output_json_integration__json_debug_fields.snap new file mode 100644 index 0000000..be79024 --- /dev/null +++ b/tests/snapshots/output_json_integration__json_debug_fields.snap @@ -0,0 +1,5 @@ 
+--- +source: tests/output_json_integration.rs +expression: output +--- +{"text":"debug","encoding":"Ascii","offset":4096,"rva":null,"section":null,"length":5,"tags":[],"score":0,"section_weight":10,"semantic_boost":5,"noise_penalty":-3,"source":"SectionData","confidence":1.0} diff --git a/tests/snapshots/output_json_integration__json_empty_strings.snap b/tests/snapshots/output_json_integration__json_empty_strings.snap new file mode 100644 index 0000000..d7f4d70 --- /dev/null +++ b/tests/snapshots/output_json_integration__json_empty_strings.snap @@ -0,0 +1,5 @@ +--- +source: tests/output_json_integration.rs +expression: output +--- + diff --git a/tests/snapshots/output_json_integration__json_long_strings.snap b/tests/snapshots/output_json_integration__json_long_strings.snap new file mode 100644 index 0000000..6ff94ee --- /dev/null +++ b/tests/snapshots/output_json_integration__json_long_strings.snap @@ -0,0 +1,5 @@ +--- +source: tests/output_json_integration.rs +expression: output +--- +{"text":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","encoding":"Ascii","offset":4096,"rva":null,"section":null,"length":300,"tags":[],"score":5,"source":"SectionData","confidence":1.0} diff --git a/tests/snapshots/output_json_integration__json_multiple_strings.snap b/tests/snapshots/output_json_integration__json_multiple_strings.snap new file mode 100644 index 0000000..a71d29e --- /dev/null +++ b/tests/snapshots/output_json_integration__json_multiple_strings.snap @@ -0,0 +1,7 @@ +--- +source: tests/output_json_integration.rs +expression: output +--- +{"text":"one","encoding":"Ascii","offset":4096,"rva":null,"section":null,"length":3,"tags":[],"score":0,"source":"SectionData","confidence":1.0} 
+{"text":"two","encoding":"Ascii","offset":4096,"rva":null,"section":null,"length":3,"tags":[],"score":0,"source":"SectionData","confidence":1.0} +{"text":"three","encoding":"Ascii","offset":4096,"rva":null,"section":null,"length":5,"tags":[],"score":0,"source":"SectionData","confidence":1.0} diff --git a/tests/snapshots/output_json_integration__json_optional_fields_none.snap b/tests/snapshots/output_json_integration__json_optional_fields_none.snap new file mode 100644 index 0000000..c7cc4bb --- /dev/null +++ b/tests/snapshots/output_json_integration__json_optional_fields_none.snap @@ -0,0 +1,5 @@ +--- +source: tests/output_json_integration.rs +expression: output +--- +{"text":"no-optional","encoding":"Ascii","offset":4096,"rva":null,"section":null,"length":11,"tags":[],"score":0,"source":"SectionData","confidence":1.0} diff --git a/tests/snapshots/output_json_integration__json_original_text.snap b/tests/snapshots/output_json_integration__json_original_text.snap new file mode 100644 index 0000000..6e6c2b0 --- /dev/null +++ b/tests/snapshots/output_json_integration__json_original_text.snap @@ -0,0 +1,5 @@ +--- +source: tests/output_json_integration.rs +expression: output +--- +{"text":"demangled","original_text":"_ZN","encoding":"Ascii","offset":4096,"rva":null,"section":null,"length":9,"tags":[],"score":0,"source":"SectionData","confidence":1.0} diff --git a/tests/snapshots/output_json_integration__json_single_string.snap b/tests/snapshots/output_json_integration__json_single_string.snap new file mode 100644 index 0000000..2a3d52a --- /dev/null +++ b/tests/snapshots/output_json_integration__json_single_string.snap @@ -0,0 +1,5 @@ +--- +source: tests/output_json_integration.rs +expression: output +--- +{"text":"GetProcAddress","encoding":"Ascii","offset":4096,"rva":null,"section":null,"length":14,"tags":[],"score":0,"source":"SectionData","confidence":1.0} diff --git a/tests/snapshots/output_json_integration__json_special_characters.snap 
b/tests/snapshots/output_json_integration__json_special_characters.snap new file mode 100644 index 0000000..75d19f2 --- /dev/null +++ b/tests/snapshots/output_json_integration__json_special_characters.snap @@ -0,0 +1,5 @@ +--- +source: tests/output_json_integration.rs +expression: output +--- +{"text":"quote\" backslash\\ line\n tab\t","encoding":"Ascii","offset":4096,"rva":null,"section":null,"length":28,"tags":[],"score":0,"source":"SectionData","confidence":1.0} diff --git a/tests/snapshots/output_json_integration__json_unicode_content.snap b/tests/snapshots/output_json_integration__json_unicode_content.snap new file mode 100644 index 0000000..77c2d01 --- /dev/null +++ b/tests/snapshots/output_json_integration__json_unicode_content.snap @@ -0,0 +1,5 @@ +--- +source: tests/output_json_integration.rs +expression: output +--- +{"text":"中文字符","encoding":"Ascii","offset":4096,"rva":null,"section":null,"length":12,"tags":[],"score":0,"source":"SectionData","confidence":1.0} diff --git a/tests/snapshots/output_yara_integration__yara_all_tag_types.snap b/tests/snapshots/output_yara_integration__yara_all_tag_types.snap new file mode 100644 index 0000000..29c2418 --- /dev/null +++ b/tests/snapshots/output_yara_integration__yara_all_tag_types.snap @@ -0,0 +1,80 @@ +--- +source: tests/output_yara_integration.rs +expression: output +--- +// YARA rule generated by Stringy +// Binary: tags.exe +// Generated: 1768722692 + +rule tags_exe_strings { + meta: + description = "Strings extracted from tags.exe" + generated_by = "stringy" + generated_at = "1768722692" + strings: + // tag: Domain + // score: 0 + $Domain_1 = "domain" ascii + // tag: Email + // score: 0 + $Email_1 = "email" ascii + // tag: Export + // score: 0 + $Export_1 = "export" ascii + // tag: Import + // score: 0 + $Import_1 = "import" ascii + // tag: Manifest + // score: 0 + $Manifest_1 = "manifest" ascii + // tag: Resource + // score: 0 + $Resource_1 = "resource" ascii + // tag: Url + // score: 0 + $Url_1 = "url" 
ascii + // tag: Version + // score: 0 + $Version_1 = "version" ascii + // tag: b64 + // score: 0 + $b64_1 = "b64" ascii + // tag: demangled + // score: 0 + $demangled_1 = "demangled" ascii + // tag: dylib-path + // score: 0 + $dylib_path_1 = "dylib" ascii + // tag: filepath + // score: 0 + $filepath_1 = "path" ascii + // tag: fmt + // score: 0 + $fmt_1 = "fmt" ascii + // tag: framework-path + // score: 0 + $framework_path_1 = "framework" ascii + // tag: guid + // score: 0 + $guid_1 = "guid" ascii + // tag: ipv4 + // score: 0 + $ipv4_1 = "ipv4" ascii + // tag: ipv6 + // score: 0 + $ipv6_1 = "ipv6" ascii + // tag: regpath + // score: 0 + $regpath_1 = "reg" ascii + // tag: rpath + // score: 0 + $rpath_1 = "rpath" ascii + // tag: rpath-var + // score: 0 + $rpath_var_1 = "rpathvar" ascii + // tag: user-agent-ish + // score: 0 + $user_agent_ish_1 = "agent" ascii + condition: + any of them +} diff --git a/tests/snapshots/output_yara_integration__yara_binary_name_sanitization.snap b/tests/snapshots/output_yara_integration__yara_binary_name_sanitization.snap new file mode 100644 index 0000000..11e82b2 --- /dev/null +++ b/tests/snapshots/output_yara_integration__yara_binary_name_sanitization.snap @@ -0,0 +1,20 @@ +--- +source: tests/output_yara_integration.rs +expression: output +--- +// YARA rule generated by Stringy +// Binary: weird name.exe +// Generated: 1768722692 + +rule weird_name_exe_strings { + meta: + description = "Strings extracted from weird name.exe" + generated_by = "stringy" + generated_at = "1768722692" + strings: + // tag: untagged + // score: 0 + $untagged_1 = "alpha" ascii + condition: + any of them +} diff --git a/tests/snapshots/output_yara_integration__yara_edge_case_names-2.snap b/tests/snapshots/output_yara_integration__yara_edge_case_names-2.snap new file mode 100644 index 0000000..0427dfc --- /dev/null +++ b/tests/snapshots/output_yara_integration__yara_edge_case_names-2.snap @@ -0,0 +1,20 @@ +--- +source: tests/output_yara_integration.rs 
+expression: output_special +--- +// YARA rule generated by Stringy +// Binary: #$% +// Generated: 1768722692 + +rule ____strings { + meta: + description = "Strings extracted from #$%" + generated_by = "stringy" + generated_at = "1768722692" + strings: + // tag: untagged + // score: 0 + $untagged_1 = "alpha" ascii + condition: + any of them +} diff --git a/tests/snapshots/output_yara_integration__yara_edge_case_names.snap b/tests/snapshots/output_yara_integration__yara_edge_case_names.snap new file mode 100644 index 0000000..93718e1 --- /dev/null +++ b/tests/snapshots/output_yara_integration__yara_edge_case_names.snap @@ -0,0 +1,20 @@ +--- +source: tests/output_yara_integration.rs +expression: output_numbers +--- +// YARA rule generated by Stringy +// Binary: 12345 +// Generated: 1768722692 + +rule _12345_strings { + meta: + description = "Strings extracted from 12345" + generated_by = "stringy" + generated_at = "1768722692" + strings: + // tag: untagged + // score: 0 + $untagged_1 = "alpha" ascii + condition: + any of them +} diff --git a/tests/snapshots/output_yara_integration__yara_empty_strings.snap b/tests/snapshots/output_yara_integration__yara_empty_strings.snap new file mode 100644 index 0000000..895d38d --- /dev/null +++ b/tests/snapshots/output_yara_integration__yara_empty_strings.snap @@ -0,0 +1,16 @@ +--- +source: tests/output_yara_integration.rs +expression: output +--- +// YARA rule generated by Stringy +// Binary: empty.bin +// Generated: 1768722692 + +rule empty_bin_strings { + meta: + description = "Strings extracted from empty.bin" + generated_by = "stringy" + generated_at = "1768722692" + condition: + true +} diff --git a/tests/snapshots/output_yara_integration__yara_encoding_modifiers.snap b/tests/snapshots/output_yara_integration__yara_encoding_modifiers.snap new file mode 100644 index 0000000..b255b3f --- /dev/null +++ b/tests/snapshots/output_yara_integration__yara_encoding_modifiers.snap @@ -0,0 +1,23 @@ +--- +source: 
tests/output_yara_integration.rs +expression: output +--- +// YARA rule generated by Stringy +// Binary: enc.exe +// Generated: 1768722692 + +rule enc_exe_strings { + meta: + description = "Strings extracted from enc.exe" + generated_by = "stringy" + generated_at = "1768722692" + strings: + // tag: Resource + // score: 0 + $Resource_1 = "wide" wide + // tag: untagged + // score: 0 + $untagged_1 = "ascii" ascii + condition: + any of them +} diff --git a/tests/snapshots/output_yara_integration__yara_high_scores.snap b/tests/snapshots/output_yara_integration__yara_high_scores.snap new file mode 100644 index 0000000..1f86286 --- /dev/null +++ b/tests/snapshots/output_yara_integration__yara_high_scores.snap @@ -0,0 +1,23 @@ +--- +source: tests/output_yara_integration.rs +expression: output +--- +// YARA rule generated by Stringy +// Binary: scores.exe +// Generated: 1768722692 + +rule scores_exe_strings { + meta: + description = "Strings extracted from scores.exe" + generated_by = "stringy" + generated_at = "1768722692" + strings: + // tag: Domain + // score: -10 + $Domain_1 = "low" ascii + // tag: Url + // score: 9999 + $Url_1 = "critical" ascii + condition: + any of them +} diff --git a/tests/snapshots/output_yara_integration__yara_long_strings_skipped.snap b/tests/snapshots/output_yara_integration__yara_long_strings_skipped.snap new file mode 100644 index 0000000..97f125a --- /dev/null +++ b/tests/snapshots/output_yara_integration__yara_long_strings_skipped.snap @@ -0,0 +1,19 @@ +--- +source: tests/output_yara_integration.rs +expression: output +--- +// YARA rule generated by Stringy +// Binary: long.exe +// Generated: 1768722692 + +rule long_exe_strings { + meta: + description = "Strings extracted from long.exe" + generated_by = "stringy" + generated_at = "1768722692" + strings: + // tag: Url + // skipped (length > 200 chars): 201 + condition: + true +} diff --git a/tests/snapshots/output_yara_integration__yara_mixed_encodings.snap 
b/tests/snapshots/output_yara_integration__yara_mixed_encodings.snap new file mode 100644 index 0000000..2a54f81 --- /dev/null +++ b/tests/snapshots/output_yara_integration__yara_mixed_encodings.snap @@ -0,0 +1,26 @@ +--- +source: tests/output_yara_integration.rs +expression: output +--- +// YARA rule generated by Stringy +// Binary: mixed.exe +// Generated: 1768722692 + +rule mixed_exe_strings { + meta: + description = "Strings extracted from mixed.exe" + generated_by = "stringy" + generated_at = "1768722692" + strings: + // tag: Domain + // score: 0 + $Domain_1 = "utf8" ascii + // tag: Resource + // score: 0 + $Resource_1 = "utf16" wide + // tag: Url + // score: 0 + $Url_1 = "ascii" ascii + condition: + any of them +} diff --git a/tests/snapshots/output_yara_integration__yara_multiple_strings_different_tags.snap b/tests/snapshots/output_yara_integration__yara_multiple_strings_different_tags.snap new file mode 100644 index 0000000..b32a4a9 --- /dev/null +++ b/tests/snapshots/output_yara_integration__yara_multiple_strings_different_tags.snap @@ -0,0 +1,26 @@ +--- +source: tests/output_yara_integration.rs +expression: output +--- +// YARA rule generated by Stringy +// Binary: diff-tag.exe +// Generated: 1768722692 + +rule diff_tag_exe_strings { + meta: + description = "Strings extracted from diff-tag.exe" + generated_by = "stringy" + generated_at = "1768722692" + strings: + // tag: Domain + // score: 0 + $Domain_1 = "example.com" ascii + // tag: Url + // score: 0 + $Url_1 = "https://example.com" ascii + // tag: ipv4 + // score: 0 + $ipv4_1 = "192.168.1.1" ascii + condition: + any of them +} diff --git a/tests/snapshots/output_yara_integration__yara_multiple_strings_same_tag.snap b/tests/snapshots/output_yara_integration__yara_multiple_strings_same_tag.snap new file mode 100644 index 0000000..8bf2a3d --- /dev/null +++ b/tests/snapshots/output_yara_integration__yara_multiple_strings_same_tag.snap @@ -0,0 +1,22 @@ +--- +source: tests/output_yara_integration.rs 
+expression: output +--- +// YARA rule generated by Stringy +// Binary: same-tag.exe +// Generated: 1768722692 + +rule same_tag_exe_strings { + meta: + description = "Strings extracted from same-tag.exe" + generated_by = "stringy" + generated_at = "1768722692" + strings: + // tag: Url + // score: 0 + $Url_1 = "alpha" ascii + // score: 0 + $Url_2 = "beta" ascii + condition: + any of them +} diff --git a/tests/snapshots/output_yara_integration__yara_no_tags.snap b/tests/snapshots/output_yara_integration__yara_no_tags.snap new file mode 100644 index 0000000..1b53e20 --- /dev/null +++ b/tests/snapshots/output_yara_integration__yara_no_tags.snap @@ -0,0 +1,22 @@ +--- +source: tests/output_yara_integration.rs +expression: output +--- +// YARA rule generated by Stringy +// Binary: untagged.exe +// Generated: 1768722692 + +rule untagged_exe_strings { + meta: + description = "Strings extracted from untagged.exe" + generated_by = "stringy" + generated_at = "1768722692" + strings: + // tag: untagged + // score: 0 + $untagged_1 = "no-tag" ascii + // score: 0 + $untagged_2 = "still-no-tag" ascii + condition: + any of them +} diff --git a/tests/snapshots/output_yara_integration__yara_single_string.snap b/tests/snapshots/output_yara_integration__yara_single_string.snap new file mode 100644 index 0000000..bfe7b2f --- /dev/null +++ b/tests/snapshots/output_yara_integration__yara_single_string.snap @@ -0,0 +1,20 @@ +--- +source: tests/output_yara_integration.rs +expression: output +--- +// YARA rule generated by Stringy +// Binary: single.exe +// Generated: 1768722692 + +rule single_exe_strings { + meta: + description = "Strings extracted from single.exe" + generated_by = "stringy" + generated_at = "1768722692" + strings: + // tag: Import + // score: 0 + $Import_1 = "GetProcAddress" ascii + condition: + any of them +} diff --git a/tests/snapshots/output_yara_integration__yara_special_characters.snap b/tests/snapshots/output_yara_integration__yara_special_characters.snap new file 
mode 100644 index 0000000..5da9535 --- /dev/null +++ b/tests/snapshots/output_yara_integration__yara_special_characters.snap @@ -0,0 +1,20 @@ +--- +source: tests/output_yara_integration.rs +expression: output +--- +// YARA rule generated by Stringy +// Binary: special.exe +// Generated: 1768722692 + +rule special_exe_strings { + meta: + description = "Strings extracted from special.exe" + generated_by = "stringy" + generated_at = "1768722692" + strings: + // tag: filepath + // score: 10 + $filepath_1 = "quote\" backslash\\ line\n tab\t" ascii + condition: + any of them +} diff --git a/tests/snapshots/output_yara_integration__yara_unicode_in_strings.snap b/tests/snapshots/output_yara_integration__yara_unicode_in_strings.snap new file mode 100644 index 0000000..52b4910 --- /dev/null +++ b/tests/snapshots/output_yara_integration__yara_unicode_in_strings.snap @@ -0,0 +1,20 @@ +--- +source: tests/output_yara_integration.rs +expression: output +--- +// YARA rule generated by Stringy +// Binary: unicode.exe +// Generated: 1768722692 + +rule unicode_exe_strings { + meta: + description = "Strings extracted from unicode.exe" + generated_by = "stringy" + generated_at = "1768722692" + strings: + // tag: Domain + // score: 0 + $Domain_1 = "\xe4\xb8\xad\xe6\x96\x87\xe5\xad\x97\xe7\xac\xa6" ascii + condition: + any of them +} From d2710e87fff51e24163a991a8e5a422fe28b1c0d Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 18 Jan 2026 11:43:17 -0500 Subject: [PATCH 04/25] feat(output): add generated_at timestamp to output metadata - Introduced an optional `generated_at` field in `OutputMetadata` for deterministic outputs. - Updated YARA formatter to utilize the `generated_at` timestamp instead of runtime timestamps. - Adjusted integration tests and snapshots to reflect changes in generated timestamps. 
Signed-off-by: UncleSp1d3r --- src/output/mod.rs | 12 +++ src/output/yara.rs | 75 +++++++++++++++++-- tests/output_yara_integration.rs | 1 + ..._yara_integration__yara_all_tag_types.snap | 4 +- ...ration__yara_binary_name_sanitization.snap | 4 +- ...a_integration__yara_edge_case_names-2.snap | 4 +- ...ara_integration__yara_edge_case_names.snap | 4 +- ..._yara_integration__yara_empty_strings.snap | 4 +- ..._integration__yara_encoding_modifiers.snap | 4 +- ...ut_yara_integration__yara_high_scores.snap | 4 +- ...ntegration__yara_long_strings_skipped.snap | 4 +- ...ara_integration__yara_mixed_encodings.snap | 6 +- ..._yara_multiple_strings_different_tags.snap | 4 +- ...ation__yara_multiple_strings_same_tag.snap | 4 +- ...output_yara_integration__yara_no_tags.snap | 4 +- ..._yara_integration__yara_single_string.snap | 4 +- ..._integration__yara_special_characters.snap | 4 +- ..._integration__yara_unicode_in_strings.snap | 4 +- 18 files changed, 111 insertions(+), 39 deletions(-) diff --git a/src/output/mod.rs b/src/output/mod.rs index 419c129..767d25f 100644 --- a/src/output/mod.rs +++ b/src/output/mod.rs @@ -70,6 +70,10 @@ pub struct OutputMetadata { pub total_strings: usize, /// Number of strings after filtering. pub filtered_strings: usize, + /// Optional generated-at timestamp for deterministic outputs. + /// + /// When set, formatters may use this value instead of runtime timestamps. + pub generated_at: Option, } impl OutputMetadata { @@ -86,8 +90,16 @@ impl OutputMetadata { format, total_strings, filtered_strings, + generated_at: None, } } + + /// Set an explicit generated-at timestamp for deterministic outputs. + #[must_use] + pub fn with_generated_at(mut self, generated_at: String) -> Self { + self.generated_at = Some(generated_at); + self + } } /// Format output strings using the requested output format. 
diff --git a/src/output/yara.rs b/src/output/yara.rs index 5e90d53..cd197c6 100644 --- a/src/output/yara.rs +++ b/src/output/yara.rs @@ -6,7 +6,10 @@ use std::time::{SystemTime, UNIX_EPOCH}; /// Format strings as YARA rule templates. pub fn format_yara(_strings: &[FoundString], _metadata: &OutputMetadata) -> Result { - let timestamp = current_timestamp(); + let timestamp = _metadata + .generated_at + .clone() + .unwrap_or_else(current_timestamp); let base_rule_name = sanitize_rule_name(&_metadata.binary_name); let rule_name = format!("{}_strings", base_rule_name); @@ -53,14 +56,26 @@ pub fn format_yara(_strings: &[FoundString], _metadata: &OutputMetadata) -> Resu let counter = counters.entry(var_tag.clone()).or_insert(0); *counter += 1; let var_name = format!("${}_{}", var_tag, *counter); - let escaped = escape_yara_string(&item.text); - let modifier = get_yara_modifier(item.encoding); - strings_block.push_str(&format!(" // score: {}\n", item.score)); - strings_block.push_str(&format!( - " {} = \"{}\" {}\n", - var_name, escaped, modifier - )); + + match item.encoding { + Encoding::Utf16Be => { + let hex = utf16be_hex_string(&item.text); + strings_block.push_str(&format!(" {} = {}\n", var_name, hex)); + } + Encoding::Utf16Le => { + let escaped = escape_yara_unicode_literal(&item.text); + strings_block.push_str(&format!(" {} = \"{}\" wide\n", var_name, escaped)); + } + Encoding::Ascii | Encoding::Utf8 => { + let escaped = escape_yara_string(&item.text); + let modifier = get_yara_modifier(item.encoding); + strings_block.push_str(&format!( + " {} = \"{}\" {}\n", + var_name, escaped, modifier + )); + } + } included += 1; } } @@ -144,6 +159,50 @@ fn escape_yara_string(text: &str) -> String { escaped } +fn escape_yara_unicode_literal(text: &str) -> String { + let mut escaped = String::new(); + for ch in text.chars() { + match ch { + '\\' => escaped.push_str("\\\\"), + '"' => escaped.push_str("\\\""), + '\n' => escaped.push_str("\\n"), + '\r' => escaped.push_str("\\r"), + 
'\t' => escaped.push_str("\\t"), + _ if ch.is_control() => { + let mut buf = [0; 4]; + let encoded = ch.encode_utf8(&mut buf); + for byte in encoded.as_bytes() { + escaped.push_str(&format!("\\x{:02x}", byte)); + } + } + _ => escaped.push(ch), + } + } + escaped +} + +fn utf16be_hex_string(text: &str) -> String { + let mut bytes = Vec::new(); + for unit in text.encode_utf16() { + bytes.extend_from_slice(&unit.to_be_bytes()); + } + + if bytes.is_empty() { + return "{ }".to_string(); + } + + let mut hex = String::new(); + hex.push_str("{ "); + for (idx, byte) in bytes.iter().enumerate() { + if idx > 0 { + hex.push(' '); + } + hex.push_str(&format!("{:02x}", byte)); + } + hex.push_str(" }"); + hex +} + fn get_yara_modifier(encoding: Encoding) -> &'static str { match encoding { Encoding::Ascii | Encoding::Utf8 => "ascii", diff --git a/tests/output_yara_integration.rs b/tests/output_yara_integration.rs index 9468a75..e8cf416 100644 --- a/tests/output_yara_integration.rs +++ b/tests/output_yara_integration.rs @@ -18,6 +18,7 @@ fn make_string(text: &str) -> FoundString { fn make_metadata(binary_name: &str, count: usize) -> OutputMetadata { OutputMetadata::new(binary_name.to_string(), OutputFormat::Yara, count, count) + .with_generated_at("0".to_string()) } #[test] diff --git a/tests/snapshots/output_yara_integration__yara_all_tag_types.snap b/tests/snapshots/output_yara_integration__yara_all_tag_types.snap index 29c2418..6b5cb58 100644 --- a/tests/snapshots/output_yara_integration__yara_all_tag_types.snap +++ b/tests/snapshots/output_yara_integration__yara_all_tag_types.snap @@ -4,13 +4,13 @@ expression: output --- // YARA rule generated by Stringy // Binary: tags.exe -// Generated: 1768722692 +// Generated: 0 rule tags_exe_strings { meta: description = "Strings extracted from tags.exe" generated_by = "stringy" - generated_at = "1768722692" + generated_at = "0" strings: // tag: Domain // score: 0 diff --git 
a/tests/snapshots/output_yara_integration__yara_binary_name_sanitization.snap b/tests/snapshots/output_yara_integration__yara_binary_name_sanitization.snap index 11e82b2..e25f563 100644 --- a/tests/snapshots/output_yara_integration__yara_binary_name_sanitization.snap +++ b/tests/snapshots/output_yara_integration__yara_binary_name_sanitization.snap @@ -4,13 +4,13 @@ expression: output --- // YARA rule generated by Stringy // Binary: weird name.exe -// Generated: 1768722692 +// Generated: 0 rule weird_name_exe_strings { meta: description = "Strings extracted from weird name.exe" generated_by = "stringy" - generated_at = "1768722692" + generated_at = "0" strings: // tag: untagged // score: 0 diff --git a/tests/snapshots/output_yara_integration__yara_edge_case_names-2.snap b/tests/snapshots/output_yara_integration__yara_edge_case_names-2.snap index 0427dfc..960b4c7 100644 --- a/tests/snapshots/output_yara_integration__yara_edge_case_names-2.snap +++ b/tests/snapshots/output_yara_integration__yara_edge_case_names-2.snap @@ -4,13 +4,13 @@ expression: output_special --- // YARA rule generated by Stringy // Binary: #$% -// Generated: 1768722692 +// Generated: 0 rule ____strings { meta: description = "Strings extracted from #$%" generated_by = "stringy" - generated_at = "1768722692" + generated_at = "0" strings: // tag: untagged // score: 0 diff --git a/tests/snapshots/output_yara_integration__yara_edge_case_names.snap b/tests/snapshots/output_yara_integration__yara_edge_case_names.snap index 93718e1..f0553b0 100644 --- a/tests/snapshots/output_yara_integration__yara_edge_case_names.snap +++ b/tests/snapshots/output_yara_integration__yara_edge_case_names.snap @@ -4,13 +4,13 @@ expression: output_numbers --- // YARA rule generated by Stringy // Binary: 12345 -// Generated: 1768722692 +// Generated: 0 rule _12345_strings { meta: description = "Strings extracted from 12345" generated_by = "stringy" - generated_at = "1768722692" + generated_at = "0" strings: // tag: untagged // 
score: 0 diff --git a/tests/snapshots/output_yara_integration__yara_empty_strings.snap b/tests/snapshots/output_yara_integration__yara_empty_strings.snap index 895d38d..dc1e9cb 100644 --- a/tests/snapshots/output_yara_integration__yara_empty_strings.snap +++ b/tests/snapshots/output_yara_integration__yara_empty_strings.snap @@ -4,13 +4,13 @@ expression: output --- // YARA rule generated by Stringy // Binary: empty.bin -// Generated: 1768722692 +// Generated: 0 rule empty_bin_strings { meta: description = "Strings extracted from empty.bin" generated_by = "stringy" - generated_at = "1768722692" + generated_at = "0" condition: true } diff --git a/tests/snapshots/output_yara_integration__yara_encoding_modifiers.snap b/tests/snapshots/output_yara_integration__yara_encoding_modifiers.snap index b255b3f..3ae9427 100644 --- a/tests/snapshots/output_yara_integration__yara_encoding_modifiers.snap +++ b/tests/snapshots/output_yara_integration__yara_encoding_modifiers.snap @@ -4,13 +4,13 @@ expression: output --- // YARA rule generated by Stringy // Binary: enc.exe -// Generated: 1768722692 +// Generated: 0 rule enc_exe_strings { meta: description = "Strings extracted from enc.exe" generated_by = "stringy" - generated_at = "1768722692" + generated_at = "0" strings: // tag: Resource // score: 0 diff --git a/tests/snapshots/output_yara_integration__yara_high_scores.snap b/tests/snapshots/output_yara_integration__yara_high_scores.snap index 1f86286..9ce8eb9 100644 --- a/tests/snapshots/output_yara_integration__yara_high_scores.snap +++ b/tests/snapshots/output_yara_integration__yara_high_scores.snap @@ -4,13 +4,13 @@ expression: output --- // YARA rule generated by Stringy // Binary: scores.exe -// Generated: 1768722692 +// Generated: 0 rule scores_exe_strings { meta: description = "Strings extracted from scores.exe" generated_by = "stringy" - generated_at = "1768722692" + generated_at = "0" strings: // tag: Domain // score: -10 diff --git 
a/tests/snapshots/output_yara_integration__yara_long_strings_skipped.snap b/tests/snapshots/output_yara_integration__yara_long_strings_skipped.snap index 97f125a..4841282 100644 --- a/tests/snapshots/output_yara_integration__yara_long_strings_skipped.snap +++ b/tests/snapshots/output_yara_integration__yara_long_strings_skipped.snap @@ -4,13 +4,13 @@ expression: output --- // YARA rule generated by Stringy // Binary: long.exe -// Generated: 1768722692 +// Generated: 0 rule long_exe_strings { meta: description = "Strings extracted from long.exe" generated_by = "stringy" - generated_at = "1768722692" + generated_at = "0" strings: // tag: Url // skipped (length > 200 chars): 201 diff --git a/tests/snapshots/output_yara_integration__yara_mixed_encodings.snap b/tests/snapshots/output_yara_integration__yara_mixed_encodings.snap index 2a54f81..1c880d3 100644 --- a/tests/snapshots/output_yara_integration__yara_mixed_encodings.snap +++ b/tests/snapshots/output_yara_integration__yara_mixed_encodings.snap @@ -4,20 +4,20 @@ expression: output --- // YARA rule generated by Stringy // Binary: mixed.exe -// Generated: 1768722692 +// Generated: 0 rule mixed_exe_strings { meta: description = "Strings extracted from mixed.exe" generated_by = "stringy" - generated_at = "1768722692" + generated_at = "0" strings: // tag: Domain // score: 0 $Domain_1 = "utf8" ascii // tag: Resource // score: 0 - $Resource_1 = "utf16" wide + $Resource_1 = { 00 75 00 74 00 66 00 31 00 36 } // tag: Url // score: 0 $Url_1 = "ascii" ascii diff --git a/tests/snapshots/output_yara_integration__yara_multiple_strings_different_tags.snap b/tests/snapshots/output_yara_integration__yara_multiple_strings_different_tags.snap index b32a4a9..e5c2692 100644 --- a/tests/snapshots/output_yara_integration__yara_multiple_strings_different_tags.snap +++ b/tests/snapshots/output_yara_integration__yara_multiple_strings_different_tags.snap @@ -4,13 +4,13 @@ expression: output --- // YARA rule generated by Stringy // Binary: 
diff-tag.exe -// Generated: 1768722692 +// Generated: 0 rule diff_tag_exe_strings { meta: description = "Strings extracted from diff-tag.exe" generated_by = "stringy" - generated_at = "1768722692" + generated_at = "0" strings: // tag: Domain // score: 0 diff --git a/tests/snapshots/output_yara_integration__yara_multiple_strings_same_tag.snap b/tests/snapshots/output_yara_integration__yara_multiple_strings_same_tag.snap index 8bf2a3d..419dc58 100644 --- a/tests/snapshots/output_yara_integration__yara_multiple_strings_same_tag.snap +++ b/tests/snapshots/output_yara_integration__yara_multiple_strings_same_tag.snap @@ -4,13 +4,13 @@ expression: output --- // YARA rule generated by Stringy // Binary: same-tag.exe -// Generated: 1768722692 +// Generated: 0 rule same_tag_exe_strings { meta: description = "Strings extracted from same-tag.exe" generated_by = "stringy" - generated_at = "1768722692" + generated_at = "0" strings: // tag: Url // score: 0 diff --git a/tests/snapshots/output_yara_integration__yara_no_tags.snap b/tests/snapshots/output_yara_integration__yara_no_tags.snap index 1b53e20..cf88b43 100644 --- a/tests/snapshots/output_yara_integration__yara_no_tags.snap +++ b/tests/snapshots/output_yara_integration__yara_no_tags.snap @@ -4,13 +4,13 @@ expression: output --- // YARA rule generated by Stringy // Binary: untagged.exe -// Generated: 1768722692 +// Generated: 0 rule untagged_exe_strings { meta: description = "Strings extracted from untagged.exe" generated_by = "stringy" - generated_at = "1768722692" + generated_at = "0" strings: // tag: untagged // score: 0 diff --git a/tests/snapshots/output_yara_integration__yara_single_string.snap b/tests/snapshots/output_yara_integration__yara_single_string.snap index bfe7b2f..0501fb4 100644 --- a/tests/snapshots/output_yara_integration__yara_single_string.snap +++ b/tests/snapshots/output_yara_integration__yara_single_string.snap @@ -4,13 +4,13 @@ expression: output --- // YARA rule generated by Stringy // Binary: 
single.exe -// Generated: 1768722692 +// Generated: 0 rule single_exe_strings { meta: description = "Strings extracted from single.exe" generated_by = "stringy" - generated_at = "1768722692" + generated_at = "0" strings: // tag: Import // score: 0 diff --git a/tests/snapshots/output_yara_integration__yara_special_characters.snap b/tests/snapshots/output_yara_integration__yara_special_characters.snap index 5da9535..1e74269 100644 --- a/tests/snapshots/output_yara_integration__yara_special_characters.snap +++ b/tests/snapshots/output_yara_integration__yara_special_characters.snap @@ -4,13 +4,13 @@ expression: output --- // YARA rule generated by Stringy // Binary: special.exe -// Generated: 1768722692 +// Generated: 0 rule special_exe_strings { meta: description = "Strings extracted from special.exe" generated_by = "stringy" - generated_at = "1768722692" + generated_at = "0" strings: // tag: filepath // score: 10 diff --git a/tests/snapshots/output_yara_integration__yara_unicode_in_strings.snap b/tests/snapshots/output_yara_integration__yara_unicode_in_strings.snap index 52b4910..2d33457 100644 --- a/tests/snapshots/output_yara_integration__yara_unicode_in_strings.snap +++ b/tests/snapshots/output_yara_integration__yara_unicode_in_strings.snap @@ -4,13 +4,13 @@ expression: output --- // YARA rule generated by Stringy // Binary: unicode.exe -// Generated: 1768722692 +// Generated: 0 rule unicode_exe_strings { meta: description = "Strings extracted from unicode.exe" generated_by = "stringy" - generated_at = "1768722692" + generated_at = "0" strings: // tag: Domain // score: 0 From 14c3d822f19bf952c995606a391bd7c0775507fc Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 18 Jan 2026 11:51:18 -0500 Subject: [PATCH 05/25] Enable superpowers plugin in Claude settings Added 'superpowers@claude-plugins-official' to the enabledPlugins list in .claude/settings.json. 
--- .claude/settings.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.claude/settings.json b/.claude/settings.json index 37c2a2b..9faa2d4 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -1,5 +1,6 @@ { "enabledPlugins": { - "commit@cc-marketplace": true + "commit@cc-marketplace": true, + "superpowers@claude-plugins-official": true } } From de2e8d56b354caf59253caaeb40ef3c886b22888 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 18 Jan 2026 12:03:11 -0500 Subject: [PATCH 06/25] refactor(output): improve YARA formatter code quality and test coverage Address issues identified during PR review: - Rename misleading underscore-prefixed parameters (_strings, _metadata) to strings, metadata since they are actively used - Return "CLOCK_ERROR" instead of "0" on timestamp failure to make errors clearly distinguishable from valid timestamps - Remove dead code: get_yara_modifier() function and its tests after inlining "ascii" constant for ASCII/UTF-8 encodings - Simplify utf16be_hex_string() using iterator chains with flat_map() - Add Tag import and simplify tag_name() function Add comprehensive unit tests for UTF-16 encoding functions: - escape_yara_unicode_literal: basic escapes, control chars, unicode passthrough, empty string - utf16be_hex_string: basic ASCII, empty string, non-ASCII BMP chars, surrogate pairs - escape_yara_string: additional control character coverage - with_generated_at builder method validation - Default timestamp fallback behavior Co-Authored-By: Claude Opus 4.5 --- src/output/mod.rs | 9 +++ src/output/yara.rs | 193 +++++++++++++++++++++++++++++---------------- 2 files changed, 136 insertions(+), 66 deletions(-) diff --git a/src/output/mod.rs b/src/output/mod.rs index 767d25f..9bbdb4c 100644 --- a/src/output/mod.rs +++ b/src/output/mod.rs @@ -185,6 +185,15 @@ mod tests { assert_eq!(other.filtered_strings, 1); } + #[test] + fn test_with_generated_at_builder() { + let metadata = 
OutputMetadata::new("test.bin".to_string(), OutputFormat::Yara, 0, 0); + assert!(metadata.generated_at.is_none()); + + let with_timestamp = metadata.with_generated_at("12345".to_string()); + assert_eq!(with_timestamp.generated_at, Some("12345".to_string())); + } + #[test] fn test_dispatch_logic_for_each_format() { let strings = vec![build_found_string("alpha")]; diff --git a/src/output/yara.rs b/src/output/yara.rs index cd197c6..099fcbe 100644 --- a/src/output/yara.rs +++ b/src/output/yara.rs @@ -1,40 +1,40 @@ -use crate::types::{Encoding, FoundString, Result}; +use crate::types::{Encoding, FoundString, Result, Tag}; use super::OutputMetadata; use std::collections::{BTreeMap, HashMap}; use std::time::{SystemTime, UNIX_EPOCH}; /// Format strings as YARA rule templates. -pub fn format_yara(_strings: &[FoundString], _metadata: &OutputMetadata) -> Result { - let timestamp = _metadata +pub fn format_yara(strings: &[FoundString], metadata: &OutputMetadata) -> Result { + let timestamp = metadata .generated_at .clone() .unwrap_or_else(current_timestamp); - let base_rule_name = sanitize_rule_name(&_metadata.binary_name); + let base_rule_name = sanitize_rule_name(&metadata.binary_name); let rule_name = format!("{}_strings", base_rule_name); let mut output = String::new(); output.push_str("// YARA rule generated by Stringy\n"); - output.push_str(&format!("// Binary: {}\n", _metadata.binary_name)); + output.push_str(&format!("// Binary: {}\n", metadata.binary_name)); output.push_str(&format!("// Generated: {}\n\n", timestamp)); output.push_str(&format!("rule {} {{\n", rule_name)); output.push_str(" meta:\n"); output.push_str(&format!( " description = \"Strings extracted from {}\"\n", - escape_yara_string(&_metadata.binary_name) + escape_yara_string(&metadata.binary_name) )); output.push_str(" generated_by = \"stringy\"\n"); output.push_str(&format!(" generated_at = \"{}\"\n", timestamp)); - if _strings.is_empty() { + if strings.is_empty() { output.push_str(" condition:\n"); 
output.push_str(" true\n"); output.push_str("}\n"); return Ok(output); } - let grouped = group_strings_by_tag(_strings); + let grouped = group_strings_by_tag(strings); let mut strings_block = String::new(); let mut counters: HashMap = HashMap::new(); let mut included = 0usize; @@ -69,11 +69,7 @@ pub fn format_yara(_strings: &[FoundString], _metadata: &OutputMetadata) -> Resu } Encoding::Ascii | Encoding::Utf8 => { let escaped = escape_yara_string(&item.text); - let modifier = get_yara_modifier(item.encoding); - strings_block.push_str(&format!( - " {} = \"{}\" {}\n", - var_name, escaped, modifier - )); + strings_block.push_str(&format!(" {} = \"{}\" ascii\n", var_name, escaped)); } } included += 1; @@ -95,7 +91,9 @@ pub fn format_yara(_strings: &[FoundString], _metadata: &OutputMetadata) -> Resu fn current_timestamp() -> String { match SystemTime::now().duration_since(UNIX_EPOCH) { Ok(duration) => duration.as_secs().to_string(), - Err(_) => "0".to_string(), + // Return a clearly invalid timestamp if system clock is before Unix epoch. + // This avoids silently producing "0" which looks like a valid epoch timestamp. 
+ Err(_) => "CLOCK_ERROR".to_string(), } } @@ -182,57 +180,42 @@ fn escape_yara_unicode_literal(text: &str) -> String { } fn utf16be_hex_string(text: &str) -> String { - let mut bytes = Vec::new(); - for unit in text.encode_utf16() { - bytes.extend_from_slice(&unit.to_be_bytes()); - } + let hex_bytes: Vec = text + .encode_utf16() + .flat_map(|unit| unit.to_be_bytes()) + .map(|b| format!("{:02x}", b)) + .collect(); - if bytes.is_empty() { + if hex_bytes.is_empty() { return "{ }".to_string(); } - let mut hex = String::new(); - hex.push_str("{ "); - for (idx, byte) in bytes.iter().enumerate() { - if idx > 0 { - hex.push(' '); - } - hex.push_str(&format!("{:02x}", byte)); - } - hex.push_str(" }"); - hex -} - -fn get_yara_modifier(encoding: Encoding) -> &'static str { - match encoding { - Encoding::Ascii | Encoding::Utf8 => "ascii", - Encoding::Utf16Le | Encoding::Utf16Be => "wide", - } + format!("{{ {} }}", hex_bytes.join(" ")) } -fn tag_name(tag: &crate::types::Tag) -> &'static str { +fn tag_name(tag: &Tag) -> &'static str { match tag { - crate::types::Tag::Url => "Url", - crate::types::Tag::Domain => "Domain", - crate::types::Tag::IPv4 => "ipv4", - crate::types::Tag::IPv6 => "ipv6", - crate::types::Tag::FilePath => "filepath", - crate::types::Tag::RegistryPath => "regpath", - crate::types::Tag::Guid => "guid", - crate::types::Tag::Email => "Email", - crate::types::Tag::Base64 => "b64", - crate::types::Tag::FormatString => "fmt", - crate::types::Tag::UserAgent => "user-agent-ish", - crate::types::Tag::DemangledSymbol => "demangled", - crate::types::Tag::Import => "Import", - crate::types::Tag::Export => "Export", - crate::types::Tag::Version => "Version", - crate::types::Tag::Manifest => "Manifest", - crate::types::Tag::Resource => "Resource", - crate::types::Tag::DylibPath => "dylib-path", - crate::types::Tag::Rpath => "rpath", - crate::types::Tag::RpathVariable => "rpath-var", - crate::types::Tag::FrameworkPath => "framework-path", + Tag::Url => "Url", + Tag::Domain 
=> "Domain", + Tag::IPv4 => "ipv4", + Tag::IPv6 => "ipv6", + Tag::FilePath => "filepath", + Tag::RegistryPath => "regpath", + Tag::Guid => "guid", + Tag::Email => "Email", + Tag::Base64 => "b64", + Tag::FormatString => "fmt", + Tag::UserAgent => "user-agent-ish", + Tag::DemangledSymbol => "demangled", + Tag::Import => "Import", + Tag::Export => "Export", + Tag::Version => "Version", + Tag::Manifest => "Manifest", + Tag::Resource => "Resource", + Tag::DylibPath => "dylib-path", + Tag::Rpath => "rpath", + Tag::RpathVariable => "rpath-var", + Tag::FrameworkPath => "framework-path", } } @@ -289,14 +272,6 @@ mod tests { assert!(escaped.contains("\\t")); } - #[test] - fn test_get_yara_modifier() { - assert_eq!(get_yara_modifier(Encoding::Ascii), "ascii"); - assert_eq!(get_yara_modifier(Encoding::Utf8), "ascii"); - assert_eq!(get_yara_modifier(Encoding::Utf16Le), "wide"); - assert_eq!(get_yara_modifier(Encoding::Utf16Be), "wide"); - } - #[test] fn test_group_strings_by_tag() { let strings = vec![ @@ -357,4 +332,90 @@ mod tests { let output = format_yara(&strings, &make_metadata()).expect("Formatting should succeed"); assert!(output.contains("\\x")); } + + #[test] + fn test_escape_yara_unicode_literal_basic() { + // Basic escapes + assert_eq!(escape_yara_unicode_literal("quote\""), "quote\\\""); + assert_eq!(escape_yara_unicode_literal("back\\slash"), "back\\\\slash"); + assert_eq!(escape_yara_unicode_literal("line\nbreak"), "line\\nbreak"); + assert_eq!(escape_yara_unicode_literal("tab\there"), "tab\\there"); + assert_eq!(escape_yara_unicode_literal("return\rhere"), "return\\rhere"); + } + + #[test] + fn test_escape_yara_unicode_literal_control_chars() { + // Control characters should be hex-escaped + assert_eq!(escape_yara_unicode_literal("\x00"), "\\x00"); + assert_eq!(escape_yara_unicode_literal("\x1f"), "\\x1f"); + } + + #[test] + fn test_escape_yara_unicode_literal_unicode_passthrough() { + // Non-control Unicode should pass through unescaped + let result = 
escape_yara_unicode_literal("\u{4E2D}\u{6587}"); + assert!( + result.contains('\u{4E2D}'), + "Non-control Unicode should not be escaped" + ); + } + + #[test] + fn test_escape_yara_unicode_literal_empty() { + assert_eq!(escape_yara_unicode_literal(""), ""); + } + + #[test] + fn test_utf16be_hex_string_basic() { + // Basic ASCII - should be big-endian (00 followed by ASCII byte) + assert_eq!(utf16be_hex_string("A"), "{ 00 41 }"); + assert_eq!(utf16be_hex_string("AB"), "{ 00 41 00 42 }"); + } + + #[test] + fn test_utf16be_hex_string_empty() { + assert_eq!(utf16be_hex_string(""), "{ }"); + } + + #[test] + fn test_utf16be_hex_string_non_ascii() { + // Non-ASCII Unicode (BMP) - Chinese character U+4E2D + let chinese = utf16be_hex_string("\u{4E2D}"); + assert_eq!(chinese, "{ 4e 2d }"); + } + + #[test] + fn test_utf16be_hex_string_surrogate_pair() { + // Character requiring surrogate pair (outside BMP) - emoji U+1F600 + let emoji = utf16be_hex_string("\u{1F600}"); + // Should produce surrogate pair: D83D DE00 + assert_eq!(emoji, "{ d8 3d de 00 }"); + } + + #[test] + fn test_escape_yara_string_control_characters() { + assert_eq!(escape_yara_string("\r"), "\\r"); + assert_eq!(escape_yara_string("\x00"), "\\x00"); + assert_eq!(escape_yara_string("\x08"), "\\b"); + assert_eq!(escape_yara_string("\x0b"), "\\x0b"); + assert_eq!(escape_yara_string("\x0c"), "\\x0c"); + assert_eq!(escape_yara_string("\x7f"), "\\x7f"); + } + + #[test] + fn test_format_yara_uses_current_timestamp_when_not_set() { + // When generated_at is None, format_yara should use current_timestamp() + let metadata = OutputMetadata::new("test.bin".to_string(), OutputFormat::Yara, 0, 0); + // Note: generated_at is None + let output = format_yara(&[], &metadata).expect("Formatting should succeed"); + + // Should contain a timestamp in the generated_at field + assert!(output.contains("generated_at = \"")); + // Timestamp should be numeric (or CLOCK_ERROR in exceptional cases) + assert!( + 
output.contains("generated_at = \"1") + || output.contains("generated_at = \"CLOCK_ERROR"), + "Timestamp should be numeric or CLOCK_ERROR" + ); + } } From 6c1b531ea24bd708e6791a4f90e17f4131978e54 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 18 Jan 2026 12:11:03 -0500 Subject: [PATCH 07/25] fix(docs): clarify ASCII rule for Unicode handling Signed-off-by: UncleSp1d3r --- AGENTS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AGENTS.md b/AGENTS.md index b841448..2e71a4e 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -6,7 +6,7 @@ 1. **No `unsafe` code** - `#![forbid(unsafe_code)]` enforced 2. **Zero warnings** - `cargo clippy -- -D warnings` must pass -3. **ASCII only** - No emojis, em-dashes, smart quotes, or Unicode punctuation +3. **ASCII only** - No emojis, em-dashes, smart quotes, or Unicode punctuation (except when explicitly testing or working with Unicode strings or emojis) 4. **File size limit** - Keep files under 500 lines; split larger files 5. **No blanket `#[allow]`** - Any `allow` requires inline justification From a122d32613a66b363ffc8305f2488ef9df53650b Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 18 Jan 2026 12:12:11 -0500 Subject: [PATCH 08/25] fix(reviews): clarify ASCII rule for Unicode punctuation Signed-off-by: UncleSp1d3r --- .coderabbit.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.coderabbit.yml b/.coderabbit.yml index f44fd03..22f80b3 100644 --- a/.coderabbit.yml +++ b/.coderabbit.yml @@ -351,7 +351,7 @@ reviews: - mode: "warning" name: "ASCII Only" instructions: | - Verify that no Unicode punctuation is introduced: + Verify that no Unicode punctuation is introduced unless explicitly required: 1. No emojis in code or documentation 2. No em-dashes - use regular hyphens 3.
No smart quotes - use straight quotes From 3b9c618ab12a15d3441fd31f5d0734a2430c8b76 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 18 Jan 2026 12:40:25 -0500 Subject: [PATCH 09/25] chore(settings): remove enabled plugins from configuration Signed-off-by: UncleSp1d3r --- .claude/settings.json | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.claude/settings.json b/.claude/settings.json index 9faa2d4..c72c6b7 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -1,6 +1,3 @@ { - "enabledPlugins": { - "commit@cc-marketplace": true, - "superpowers@claude-plugins-official": true - } + "enabledPlugins": {} } From b6689ce1271eaae11a82c4469e43dd353139a5de Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 18 Jan 2026 12:53:09 -0500 Subject: [PATCH 10/25] chore(contributing): add contributing guidelines document Signed-off-by: UncleSp1d3r --- CONTRIBUTING.md | 86 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..869f001 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,86 @@ +# Contributing to Stringy + +Thanks for your interest in Stringy. This guide explains how to propose changes and what we expect for code quality. + +## Quick start + +1. Search existing issues and pull requests before filing a new one. +2. For bugs, open an issue with a clear reproduction and expected vs actual behavior. +3. For new features or larger changes, open an issue first to discuss scope. + +## Development setup + +Stringy uses Rust 2024 (MSRV 1.85+, see `rust-toolchain.toml`). We also use just for common tasks. 
+ +Recommended workflow: + +- `just setup` (to install tools) +- `just build` (compiles a debug build) +- `just test` (runs tests) +- `just lint` (runs linters) + +If you do not use just, the critical requirement is that: + +- `cargo clippy -- -D warnings` passes +- `cargo fmt` produces no changes + +## Coding standards + +These rules are enforced by CI: + +- No unsafe code +- Zero warnings (`clippy -D warnings`) +- ASCII only in code and documentation, unless explicitly working with Unicode handling +- Keep files under 500-600 lines; split when needed +- No blanket `#[allow]` on modules or files +- No async; this is a synchronous CLI tool + +Use thiserror for structured errors and include context (offsets, section names, file paths) when relevant. + +## Project-specific guidance + +Module layout: + +- `container/` handles format detection and section analysis +- `extraction/` handles string extraction, filtering, and deduplication +- `classification/` handles semantic tagging and ranking +- `output/` handles output formatters +- `types.rs` contains core data structures and error types + +Key patterns: + +- Section weights: add new section weights in `container/*.rs` using existing match patterns. Higher weight means more likely to contain useful strings. +- Semantic tags: add new Tag variants in `types.rs`, implement detection in `classification/semantic.rs`, and update any tag merging logic if needed. +- Deduplication: preserve all occurrences and merge tags across occurrences in `extraction/dedup.rs`. +- Public structs: keep public API structs non_exhaustive and provide explicit constructors. +- Imports: prefer `stringy::extraction` or `stringy::types`. Do not import locally-defined types inside `extraction/mod.rs`. + +## Tests + +- Add or update tests for behavior changes. +- Use insta snapshots for output verification when appropriate. +- Integration tests live in tests/ and fixtures in tests/fixtures/. 
+- Use insta snapshots for output verification when changing output formatters. + +Run: + +- `just test` + +## Pull requests + +- Keep PRs focused and small when possible. +- Include a clear description of the problem and the solution. +- Link related issues in the PR description. +- Update documentation when behavior changes. + +## Documentation + +Docs live under docs/ and project planning artifacts are in project_plan/. Update them when you change user-facing behavior. + +## Security + +If you believe you found a security issue, please do not open a public issue. Use GitHub Security Advisories if available, or contact the maintainers privately. + +## Questions + +If you are unsure where to start, open an issue with your question and we will point you in the right direction. From bec8192cc80acf690b5af63fc08d88c9d73b3ecb Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 18 Jan 2026 13:50:16 -0500 Subject: [PATCH 11/25] refactor: address code review findings and add project documentation Immediate fixes: - Fix failing doctests in extraction/mod.rs by wrapping in fn main() - Fix rustdoc warning in patterns/ip.rs by escaping [::1]:8080 - Fix O(n^2) algorithms in dedup.rs using HashSet for unique detection New features: - Add OutputFormatter trait for extensible output formatting - Add #[non_exhaustive] to OutputFormat enum for API stability - Add Hash derive to Encoding and StringSource enums Documentation: - Create CHANGELOG.md following Keep a Changelog format - Create ROADMAP.md documenting medium/long-term improvements - Create examples/ directory with basic_extraction, network_indicators, and output_formats examples The O(n^2) fixes include: - unique_sections detection now uses HashSet instead of Vec::contains - unique_sources detection now uses HashSet instead of Vec::contains - merge_tags now uses HashSet for deduplication - Grouping key simplified from format!("{:?}", encoding) to Encoding Co-Authored-By: Claude Opus 4.5 --- CHANGELOG.md | 53 +++++++ 
CONTRIBUTING.md | 2 +- ROADMAP.md | 239 ++++++++++++++++++++++++++++++ examples/basic_extraction.rs | 75 ++++++++++ examples/network_indicators.rs | 84 +++++++++++ examples/output_formats.rs | 72 +++++++++ src/classification/patterns/ip.rs | 2 +- src/extraction/dedup.rs | 27 +--- src/extraction/mod.rs | 66 +++++---- src/lib.rs | 3 +- src/output/mod.rs | 42 ++++++ src/types.rs | 4 +- 12 files changed, 615 insertions(+), 54 deletions(-) create mode 100644 CHANGELOG.md create mode 100644 ROADMAP.md create mode 100644 examples/basic_extraction.rs create mode 100644 examples/network_indicators.rs create mode 100644 examples/output_formats.rs diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..d82fa35 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,53 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+ +## [Unreleased] + +### Added +- Output formatters: JSON (JSONL), table (TTY-friendly), and YARA rule templates +- `generated_at` timestamp support in output metadata for deterministic outputs +- Ranking system for prioritizing extracted strings by relevance +- Symbol demangling support for Rust mangled names +- File path classification for POSIX, Windows, and registry paths +- Semantic classification for URLs, domains, and IP addresses (IPv4/IPv6) +- String deduplication with full occurrence metadata preservation +- `CanonicalString` type for deduplicated strings with occurrence tracking +- UTF-16 string extraction with confidence scoring +- Noise filtering framework with entropy, linguistic, and repetition filters +- Mach-O load command extraction with section weight normalization +- Comprehensive PE support: section classification, import/export parsing, resource extraction +- ELF symbol extraction with type support and visibility filtering +- `#[non_exhaustive]` and builder pattern for `FoundString` public API +- Contributing guidelines document + +### Changed +- Repository renamed from StringyMcStringFace to Stringy +- Improved YARA formatter code quality and test coverage +- Clarified ASCII rule for Unicode handling in documentation + +### Fixed +- Rustdoc warning for IPv6 address example in documentation + +### Dependencies +- Updated criterion to 0.8.1 +- Updated actions/checkout to v6 +- Updated actions/download-artifact to v7 +- Updated actions/attest-build-provenance to v3 +- Updated actions/upload-artifact to v5 +- Updated github/codeql-action to v4 +- Updated EmbarkStudios/cargo-deny-action to v2 + +## [0.1.0] - TBD + +Initial release with core functionality: + +### Added +- ELF, PE, and Mach-O binary format detection and parsing +- ASCII and UTF-8 string extraction from binary sections +- Section-aware extraction with weight-based prioritization +- Basic semantic tagging infrastructure +- Command-line interface (in development) diff --git 
a/CONTRIBUTING.md b/CONTRIBUTING.md index 869f001..04b90e9 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -46,7 +46,7 @@ Module layout: - `classification/` handles semantic tagging and ranking - `output/` handles output formatters - `types.rs` contains core data structures and error types - + Key patterns: - Section weights: add new section weights in `container/*.rs` using existing match patterns. Higher weight means more likely to contain useful strings. diff --git a/ROADMAP.md b/ROADMAP.md new file mode 100644 index 0000000..30c8c58 --- /dev/null +++ b/ROADMAP.md @@ -0,0 +1,239 @@ +# Stringy Development Roadmap + +This document tracks medium-term and long-term improvements identified during the comprehensive code review (2026-01-18). Issues are organized by priority and category. + +## Medium-Term Issues (Next 1-3 Releases) + +### Architecture Improvements + +#### 1. Split `extraction/mod.rs` into smaller modules +**Priority:** High +**Current state:** 1542 lines (exceeds 500-line project limit by 1042 lines) +**Files affected:** `src/extraction/mod.rs` + +Recommended split: +- `src/extraction/config.rs` - Move `ExtractionConfig` and validation logic +- `src/extraction/trait.rs` - Move `StringExtractor` trait definition +- `src/extraction/basic.rs` - Move `BasicExtractor` implementation +- `src/extraction/helpers.rs` - Move internal helper functions (`is_printable_text_byte`, `could_be_utf8_byte`, `extract_ascii_utf8_strings`) + +Other oversized files to address: +| File | Lines | Overage | +|------|-------|---------| +| `src/extraction/pe_resources.rs` | 1449 | +949 | +| `src/extraction/utf16.rs` | 1273 | +773 | +| `src/extraction/dedup.rs` | 849 | +349 | +| `src/extraction/ascii.rs` | 832 | +332 | +| `src/output/table.rs` | 708 | +208 | +| `src/extraction/filters.rs` | 702 | +202 | +| `src/container/pe.rs` | 661 | +161 | +| `src/container/elf.rs` | 627 | +127 | +| `src/container/macho.rs` | 574 | +74 | +| `src/types.rs` | 558 | +58 | + +#### 2. 
Move PE resources to container module +**Priority:** Medium +**Current state:** `src/extraction/pe_resources.rs` is in extraction but conceptually belongs in container +**Rationale:** PE resource parsing is part of container analysis, not string extraction + +#### 3. Decouple semantic enrichment from extraction +**Priority:** Medium +**Current state:** `extraction` module imports from `classification` creating bidirectional dependency +**Files affected:** `src/extraction/mod.rs:129` +**Recommendation:** Move semantic enrichment to an orchestration layer that callers control + +#### 4. Add `#[non_exhaustive]` to remaining public enums +**Priority:** Medium +**Files affected:** +- `src/types.rs:4-10` - `Encoding` enum +- `src/types.rs:130-136` - `BinaryFormat` enum + +### Error Handling + +#### 5. Add `SerializationError` variant to `StringyError` +**Priority:** Medium +**Current state:** `ConfigError` is incorrectly used for JSON serialization failures +**Files affected:** `src/output/json.rs:14-16`, `src/types.rs` + +#### 6. Add format-specific error variants +**Priority:** Low +**Recommendation:** Add `InvalidPeError`, `InvalidElfError`, `InvalidMachOError` instead of generic `ParseError(String)` + +### API Improvements + +#### 7. Add constructors to remaining public structs +**Priority:** Medium +**Files affected:** `src/types.rs` +**Structs needing constructors:** `ImportInfo`, `ExportInfo`, `SectionInfo` +**Rationale:** Required for `#[non_exhaustive]` compatibility + +#### 8. Add `#[allow]` justification comments +**Priority:** Low +**Files affected:** +- `src/extraction/utf16.rs:334` - `#[allow(clippy::result_unit_err)]` +- `src/extraction/utf16.rs:350` - `#[allow(dead_code)]` + +### Documentation + +#### 9. Update API documentation for accuracy +**Priority:** Medium +**Files affected:** `docs/src/api.md` +**Issues:** Some function signatures don't match actual implementation + +#### 10. 
Add security considerations to README +**Priority:** Medium +**Content to add:** Document malware analysis use case, safe handling of untrusted binaries + +#### 11. Document deduplication feature in user docs +**Priority:** Medium +**Files affected:** README.md, `docs/src/string-extraction.md` + +### Performance + +#### 12. Add memory mapping for large files +**Priority:** High +**Current state:** Entire file is loaded into memory +**Impact:** Processing 1GB+ binaries requires 1GB+ RAM +**Recommendation:** Use `memmap2` crate for memory-mapped file access + +```rust +// Recommended approach +use memmap2::Mmap; +use std::fs::File; + +let file = File::open(path)?; +let mmap = unsafe { Mmap::map(&file)? }; +let data: &[u8] = &mmap; +``` + +#### 13. Optimize redundant regex matching +**Priority:** Low +**Files affected:** `src/classification/patterns/network.rs:92-106` +**Issue:** URL_REGEX runs twice on URLs (in `classify_url` then `classify_domain`) + +### Testing + +#### 14. Set up code coverage metrics +**Priority:** Medium +**Tool:** `cargo-tarpaulin` +**Command:** `cargo tarpaulin --out Html` + +#### 15. Add performance benchmarks +**Priority:** Medium +**Tool:** `criterion` +**Focus areas:** Deduplication with large input sets, regex pattern matching + +#### 16. Add fuzzing for binary parsers +**Priority:** Medium +**Tool:** `cargo-fuzz` +**Targets:** `container/*.rs` parsers with malformed input + +--- + +## Long-Term Issues (Future Releases) + +### Performance Optimizations + +#### 17. Consider parallel extraction with rayon +**Priority:** Low +**Rationale:** Section-by-section extraction is embarrassingly parallel + +```rust +use rayon::prelude::*; + +let section_strings: Vec> = sections + .par_iter() + .map(|section| extractor.extract_from_section(data, section, config)) + .collect(); +``` + +#### 18. 
Consider `Cow` for hot paths +**Priority:** Low +**Files affected:** `src/types.rs:236-237` +**Benefit:** Avoid cloning when strings could be borrowed + +#### 19. Consider `SmallVec` for tags +**Priority:** Low +**Field:** `FoundString::tags` +**Rationale:** Typical 0-3 tags could use stack allocation with `SmallVec<[Tag; 4]>` + +### Dependency Management + +#### 20. Migrate to `std::sync::LazyLock` +**Priority:** Low +**Current state:** Uses `once_cell::sync::Lazy` +**Target:** `std::sync::LazyLock` (stabilized in Rust 1.80) +**Files affected:** All files in `src/classification/patterns/` + +### Feature Enhancements + +#### 21. Implement main CLI +**Priority:** High +**Current state:** `src/main.rs` is a stub with TODO +**File:** `src/main.rs:18` + +#### 22. Integrate Mach-O load command strings +**Priority:** Medium +**Current state:** Feature exists but not integrated into main pipeline +**File:** `src/container/macho.rs:198` + +#### 23. Parse all Mach-O architectures +**Priority:** Low +**Current state:** Only parses first architecture in fat binaries +**File:** `src/container/macho.rs:312` + +### Build Configuration + +#### 24. Add feature flags for output formats +**Priority:** Low +**File:** `Cargo.toml` + +```toml +[features] +default = ["json", "yara", "table"] +json = [] +yara = [] +table = [] +``` + +#### 25. 
Add `include` field to Cargo.toml +**Priority:** Low +**Purpose:** Control what gets published to crates.io + +```toml +[package] +include = ["src/**/*", "Cargo.toml", "LICENSE", "README.md"] +``` + +--- + +## Completed Items + +The following issues from the comprehensive review have been addressed: + +- [x] Fix failing doctests in `extraction/mod.rs` (2026-01-18) +- [x] Fix rustdoc warning in `patterns/ip.rs:107` (2026-01-18) +- [x] Create `CHANGELOG.md` (2026-01-18) +- [x] Fix O(n^2) algorithms in `dedup.rs` using HashSet (2026-01-18) +- [x] Add `OutputFormatter` trait for extensibility (2026-01-18) +- [x] Add `#[non_exhaustive]` to `OutputFormat` enum (2026-01-18) +- [x] Create `examples/` directory with usage examples (2026-01-18) +- [x] Add `Hash` derive to `Encoding` and `StringSource` enums (2026-01-18) + +--- + +## Review Summary + +**Overall Rating from Comprehensive Review: B+ (85/100)** + +| Dimension | Rating | +|-----------|--------| +| Code Quality | B+ | +| Architecture | B+ | +| Security | A | +| Performance | B | +| Testing | B+ | +| Documentation | B+ | +| Best Practices | A- | + +With the immediate issues addressed and medium-term improvements completed, this project would be ready for a stable 1.0 release. diff --git a/examples/basic_extraction.rs b/examples/basic_extraction.rs new file mode 100644 index 0000000..ce05ebb --- /dev/null +++ b/examples/basic_extraction.rs @@ -0,0 +1,75 @@ +//! Basic string extraction from a binary file. +//! +//! This example demonstrates the fundamental workflow for extracting strings +//! from a binary file using Stringy. +//! +//! 
Usage: cargo run --example basic_extraction <binary_file> + +use std::env; +use std::fs; +use stringy::container::{create_parser, detect_format}; +use stringy::extraction::{BasicExtractor, ExtractionConfig, StringExtractor}; + +fn main() -> Result<(), Box<dyn std::error::Error>> { + let args: Vec<String> = env::args().collect(); + if args.len() != 2 { + eprintln!("Usage: {} <binary_file>", args[0]); + std::process::exit(1); + } + + let path = &args[1]; + println!("Analyzing: {}", path); + + // Read the binary file + let data = fs::read(path)?; + println!("File size: {} bytes", data.len()); + + // Detect the binary format + let format = detect_format(&data); + println!("Detected format: {:?}", format); + + // Create a parser for the detected format + let parser = create_parser(format)?; + let container_info = parser.parse(&data)?; + + println!( + "Found {} sections, {} imports, {} exports", + container_info.sections.len(), + container_info.imports.len(), + container_info.exports.len() + ); + + // Extract strings using the basic extractor + let extractor = BasicExtractor::new(); + let config = ExtractionConfig::default(); + let strings = extractor.extract(&data, &container_info, &config)?; + + println!("\nExtracted {} strings\n", strings.len()); + + // Display the top 20 strings by score + let mut sorted_strings = strings.clone(); + sorted_strings.sort_by(|a, b| b.score.cmp(&a.score)); + + println!("Top strings by score:"); + println!("{:-<60}", ""); + for string in sorted_strings.iter().take(20) { + let tags: Vec<_> = string.tags.iter().map(|t| format!("{:?}", t)).collect(); + let tags_str = if tags.is_empty() { + String::new() + } else { + format!(" [{}]", tags.join(", ")) + }; + println!( + "{:4} | {:50}{}", + string.score, + if string.text.len() > 50 { + format!("{}...", &string.text[..47]) + } else { + string.text.clone() + }, + tags_str + ); + } + + Ok(()) +} diff --git a/examples/network_indicators.rs b/examples/network_indicators.rs new file mode 100644 index 0000000..f47d76d --- /dev/null +++ 
b/examples/network_indicators.rs @@ -0,0 +1,84 @@ +//! Extract network indicators (URLs, IPs, domains) from a binary. +//! +//! This example demonstrates how to extract and filter strings that contain +//! network-related indicators useful for threat intelligence. +//! +//! Usage: cargo run --example network_indicators <binary_file> + +use std::env; +use std::fs; +use stringy::container::{create_parser, detect_format}; +use stringy::extraction::{BasicExtractor, ExtractionConfig, StringExtractor}; +use stringy::types::Tag; + +fn main() -> Result<(), Box<dyn std::error::Error>> { + let args: Vec<String> = env::args().collect(); + if args.len() != 2 { + eprintln!("Usage: {} <binary_file>", args[0]); + std::process::exit(1); + } + + let path = &args[1]; + println!("Extracting network indicators from: {}\n", path); + + // Read and parse the binary + let data = fs::read(path)?; + let format = detect_format(&data); + let parser = create_parser(format)?; + let container_info = parser.parse(&data)?; + + // Extract strings with default configuration + let extractor = BasicExtractor::new(); + let config = ExtractionConfig::default(); + let strings = extractor.extract(&data, &container_info, &config)?; + + // Filter for network-related tags + let network_tags = [Tag::Url, Tag::Domain, Tag::IPv4, Tag::IPv6]; + + let network_strings: Vec<_> = strings + .iter() + .filter(|s| s.tags.iter().any(|t| network_tags.contains(t))) + .collect(); + + if network_strings.is_empty() { + println!("No network indicators found."); + return Ok(()); + } + + println!("Found {} network indicators:\n", network_strings.len()); + + // Group by tag type + println!("=== URLs ==="); + for s in network_strings + .iter() + .filter(|s| s.tags.contains(&Tag::Url)) + { + println!(" {}", s.text); + } + + println!("\n=== Domains ==="); + for s in network_strings + .iter() + .filter(|s| s.tags.contains(&Tag::Domain)) + { + println!(" {}", s.text); + } + + println!("\n=== IPv4 Addresses ==="); + for s in network_strings + .iter() + .filter(|s| 
s.tags.contains(&Tag::IPv4)) + { + println!(" {}", s.text); + } + + println!("\n=== IPv6 Addresses ==="); + for s in network_strings + .iter() + .filter(|s| s.tags.contains(&Tag::IPv6)) + { + println!(" {}", s.text); + } + + Ok(()) +} diff --git a/examples/output_formats.rs b/examples/output_formats.rs new file mode 100644 index 0000000..f52aaf0 --- /dev/null +++ b/examples/output_formats.rs @@ -0,0 +1,72 @@ +//! Demonstrate different output formats (JSON, Table, YARA). +//! +//! This example shows how to format extracted strings in different output +//! formats suitable for various use cases. +//! +//! Usage: cargo run --example output_formats <binary_file> [format] +//! +//! Formats: table (default), json, yara + +use std::env; +use std::fs; +use stringy::container::{create_parser, detect_format}; +use stringy::extraction::{BasicExtractor, ExtractionConfig, StringExtractor}; +use stringy::output::{OutputFormat, OutputMetadata, format_output}; + +fn main() -> Result<(), Box<dyn std::error::Error>> { + let args: Vec<String> = env::args().collect(); + if args.len() < 2 { + eprintln!("Usage: {} <binary_file> [format]", args[0]); + eprintln!("Formats: table (default), json, yara"); + std::process::exit(1); + } + + let path = &args[1]; + let format_arg = args.get(2).map(|s| s.as_str()).unwrap_or("table"); + + let output_format = match format_arg.to_lowercase().as_str() { + "table" => OutputFormat::Table, + "json" => OutputFormat::Json, + "yara" => OutputFormat::Yara, + _ => { + eprintln!("Unknown format: {}. 
Use table, json, or yara.", format_arg); + std::process::exit(1); + } + }; + + // Read and parse the binary + let data = fs::read(path)?; + let format = detect_format(&data); + let parser = create_parser(format)?; + let container_info = parser.parse(&data)?; + + // Extract strings + let extractor = BasicExtractor::new(); + let config = ExtractionConfig::default(); + let strings = extractor.extract(&data, &container_info, &config)?; + + // Limit to top 50 strings for demonstration + let mut sorted_strings = strings; + sorted_strings.sort_by(|a, b| b.score.cmp(&a.score)); + let top_strings: Vec<_> = sorted_strings.into_iter().take(50).collect(); + + // Create output metadata + let binary_name = std::path::Path::new(path) + .file_name() + .and_then(|n| n.to_str()) + .unwrap_or("unknown") + .to_string(); + + let metadata = OutputMetadata::new( + binary_name, + output_format, + top_strings.len(), + top_strings.len(), + ); + + // Format and print output + let output = format_output(&top_strings, &metadata)?; + println!("{}", output); + + Ok(()) +} diff --git a/src/classification/patterns/ip.rs b/src/classification/patterns/ip.rs index bb64164..98bed5e 100644 --- a/src/classification/patterns/ip.rs +++ b/src/classification/patterns/ip.rs @@ -104,7 +104,7 @@ pub fn is_ipv4_address(text: &str) -> bool { /// Checks if the given text is a valid IPv6 address /// -/// This method handles bracketed IPv6 addresses (e.g., [::1]:8080), +/// This method handles bracketed IPv6 addresses (e.g., `[::1]:8080`), /// strips any port suffix, and validates using both regex and standard library. 
/// /// # Arguments diff --git a/src/extraction/dedup.rs b/src/extraction/dedup.rs index b25bae0..53af1f7 100644 --- a/src/extraction/dedup.rs +++ b/src/extraction/dedup.rs @@ -7,7 +7,7 @@ use crate::types::{Encoding, FoundString, StringSource, Tag}; use serde::{Deserialize, Serialize}; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; /// A canonical string with all its occurrences /// @@ -90,18 +90,16 @@ pub fn deduplicate( } // Group strings by (text, encoding) key - // Use string representation of encoding as HashMap key since Encoding doesn't implement Hash - let mut groups: HashMap<(String, String), Vec> = HashMap::new(); + let mut groups: HashMap<(String, Encoding), Vec> = HashMap::new(); for string in strings { - let encoding_str = format!("{:?}", string.encoding); - let key = (string.text.clone(), encoding_str); + let key = (string.text.clone(), string.encoding); groups.entry(key).or_default().push(string); } // Convert each group to a CanonicalString let mut canonical_strings: Vec = groups .into_iter() - .map(|((text, _encoding_str), found_strings)| { + .map(|((text, _encoding), found_strings)| { // Check if group meets dedup_threshold let meets_threshold = if let Some(threshold) = dedup_threshold { found_strings.len() >= threshold @@ -180,21 +178,11 @@ fn calculate_combined_score(occurrences: &[StringOccurrence]) -> i32 { }; // Cross-section bonus: 10 points if string appears in different sections - let mut unique_sections = Vec::new(); - for occ in occurrences.iter() { - if !unique_sections.contains(&occ.section) { - unique_sections.push(occ.section.clone()); - } - } + let unique_sections: HashSet<_> = occurrences.iter().map(|occ| &occ.section).collect(); let cross_section_bonus = if unique_sections.len() > 1 { 10 } else { 0 }; // Multi-source bonus: 15 points if string appears from different sources - let mut unique_sources = Vec::new(); - for occ in occurrences.iter() { - if !unique_sources.contains(&occ.source) { - 
unique_sources.push(occ.source); - } - } + let unique_sources: HashSet<_> = occurrences.iter().map(|occ| occ.source).collect(); let multi_source_bonus = if unique_sources.len() > 1 { 15 } else { 0 }; // Confidence boost: max_confidence * 10 @@ -220,10 +208,11 @@ fn calculate_combined_score(occurrences: &[StringOccurrence]) -> i32 { /// /// Vector of unique tags (order may vary since Tag doesn't implement Ord) fn merge_tags(occurrences: &[StringOccurrence]) -> Vec { + let mut seen = HashSet::new(); let mut tags = Vec::new(); for occurrence in occurrences { for tag in &occurrence.original_tags { - if !tags.contains(tag) { + if seen.insert(tag.clone()) { tags.push(tag.clone()); } } diff --git a/src/extraction/mod.rs b/src/extraction/mod.rs index af814f8..ea11d32 100644 --- a/src/extraction/mod.rs +++ b/src/extraction/mod.rs @@ -315,18 +315,21 @@ impl ExtractionConfig { /// /// # Example /// -/// ```rust +/// ```rust,no_run /// use stringy::extraction::{BasicExtractor, ExtractionConfig, StringExtractor}; /// use stringy::container::{detect_format, create_parser}; /// -/// let data = std::fs::read("binary_file")?; -/// let format = detect_format(&data); -/// let parser = create_parser(format)?; -/// let container_info = parser.parse(&data)?; +/// fn main() -> Result<(), Box> { +/// let data = std::fs::read("binary_file")?; +/// let format = detect_format(&data); +/// let parser = create_parser(format)?; +/// let container_info = parser.parse(&data)?; /// -/// let extractor = BasicExtractor::new(); -/// let config = ExtractionConfig::default(); -/// let strings = extractor.extract(&data, &container_info, &config)?; +/// let extractor = BasicExtractor::new(); +/// let config = ExtractionConfig::default(); +/// let strings = extractor.extract(&data, &container_info, &config)?; +/// Ok(()) +/// } /// ``` pub trait StringExtractor { /// Extract strings from entire binary using container metadata @@ -409,31 +412,34 @@ pub trait StringExtractor { /// use 
stringy::extraction::{BasicExtractor, ExtractionConfig, StringExtractor}; /// use stringy::types::{ContainerInfo, SectionInfo, SectionType, BinaryFormat}; /// -/// let extractor = BasicExtractor::new(); -/// let config = ExtractionConfig::default(); +/// fn main() -> Result<(), Box> { +/// let extractor = BasicExtractor::new(); +/// let config = ExtractionConfig::default(); /// -/// // Create a simple container info for testing -/// let section = SectionInfo { -/// name: ".rodata".to_string(), -/// offset: 0, -/// size: 100, -/// rva: Some(0x1000), -/// section_type: SectionType::StringData, -/// is_executable: false, -/// is_writable: false, -/// weight: 1.0, -/// }; +/// // Create a simple container info for testing +/// let section = SectionInfo { +/// name: ".rodata".to_string(), +/// offset: 0, +/// size: 100, +/// rva: Some(0x1000), +/// section_type: SectionType::StringData, +/// is_executable: false, +/// is_writable: false, +/// weight: 1.0, +/// }; /// -/// let container_info = ContainerInfo::new( -/// BinaryFormat::Elf, -/// vec![section], -/// vec![], -/// vec![], -/// None, -/// ); +/// let container_info = ContainerInfo::new( +/// BinaryFormat::Elf, +/// vec![section], +/// vec![], +/// vec![], +/// None, +/// ); /// -/// let data = b"Hello World\0Test String\0"; -/// let strings = extractor.extract(data, &container_info, &config)?; +/// let data = b"Hello World\0Test String\0"; +/// let strings = extractor.extract(data, &container_info, &config)?; +/// Ok(()) +/// } /// ``` #[derive(Debug, Clone)] pub struct BasicExtractor; diff --git a/src/lib.rs b/src/lib.rs index d340897..d5b5047 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -87,5 +87,6 @@ pub use extraction::{ // Re-export output infrastructure types pub use output::{ - OutputFormat, OutputMetadata, format_json, format_output, format_table_with_mode, format_yara, + OutputFormat, OutputFormatter, OutputMetadata, format_json, format_output, + format_table_with_mode, format_yara, }; diff --git 
a/src/output/mod.rs b/src/output/mod.rs index 9bbdb4c..d7e0818 100644 --- a/src/output/mod.rs +++ b/src/output/mod.rs @@ -44,7 +44,49 @@ pub use json::format_json; pub use table::{format_table, format_table_with_mode}; pub use yara::format_yara; +/// Trait for output formatters. +/// +/// Implementations of this trait provide different output formats for extracted +/// strings. This trait enables extensibility by allowing custom formatters to be +/// added without modifying the core dispatch logic. +/// +/// # Example +/// +/// ```rust +/// use stringy::output::{OutputFormatter, OutputMetadata}; +/// use stringy::types::{FoundString, Result}; +/// +/// struct CustomFormatter; +/// +/// impl OutputFormatter for CustomFormatter { +/// fn format(&self, strings: &[FoundString], metadata: &OutputMetadata) -> Result { +/// Ok(format!("Custom: {} strings from {}", strings.len(), metadata.binary_name)) +/// } +/// +/// fn name(&self) -> &'static str { +/// "custom" +/// } +/// } +/// ``` +pub trait OutputFormatter { + /// Format the extracted strings into the output representation. + /// + /// # Arguments + /// + /// * `strings` - The extracted strings to format. + /// * `metadata` - Output context including binary name and format settings. + /// + /// # Returns + /// + /// A formatted string on success, or an error if formatting fails. + fn format(&self, strings: &[FoundString], metadata: &OutputMetadata) -> Result; + + /// Returns the name of this formatter for identification purposes. + fn name(&self) -> &'static str; +} + /// Output format selection for Stringy formatters. +#[non_exhaustive] #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum OutputFormat { /// Human-readable table format with TTY detection. 
diff --git a/src/types.rs b/src/types.rs index 745f1c6..69e253a 100644 --- a/src/types.rs +++ b/src/types.rs @@ -1,7 +1,7 @@ use serde::{Deserialize, Serialize}; /// Represents the encoding of an extracted string -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] pub enum Encoding { Ascii, Utf8, @@ -69,7 +69,7 @@ pub enum SectionType { } /// Source of a string within the binary -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] pub enum StringSource { /// String found in section data SectionData, From 0c2744e9b530528f34bef424f7878558c2f5beb5 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 18 Jan 2026 14:29:08 -0500 Subject: [PATCH 12/25] chore(devcontainer): add Rust devcontainer configuration chore(dependabot): update schedules to weekly for various ecosystems Signed-off-by: UncleSp1d3r --- .devcontainer/devcontainer.json | 62 +++++++++++++++++++++++++++++++++ .github/dependabot.yml | 8 +++-- 2 files changed, 68 insertions(+), 2 deletions(-) create mode 100644 .devcontainer/devcontainer.json diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..a935486 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,62 @@ +{ + "name": "Rust", + "image": "mcr.microsoft.com/devcontainers/rust:2-1-trixie", + "features": { + "ghcr.io/devcontainers/features/docker-outside-of-docker:1": { + "installDockerBuildx": true, + "version": "latest", + "dockerDashComposeVersion": "v2" + }, + "ghcr.io/devcontainers/features/github-cli:1": { + "installDirectlyFromGitHubRelease": true, + "version": "latest" + }, + "ghcr.io/eitsupi/devcontainer-features/mdbook:1": { + "version": "latest" + }, + "ghcr.io/jsburckhardt/devcontainer-features/bat:1": {}, + "ghcr.io/jsburckhardt/devcontainer-features/just:1": {}, + 
"ghcr.io/lee-orr/rusty-dev-containers/cargo-audit:0": {}, + "ghcr.io/lee-orr/rusty-dev-containers/cargo-binstall:0": {}, + "ghcr.io/lee-orr/rusty-dev-containers/cargo-deny:0": {}, + "ghcr.io/lee-orr/rusty-dev-containers/cargo-llvm-cov:0": {}, + "ghcr.io/lee-orr/rusty-dev-containers/cargo-nextest:0": {}, + "ghcr.io/marcozac/devcontainer-features/goreleaser:1": { + "version": "latest" + }, + "ghcr.io/devcontainers-extra/features/claude-code:1": { + "version": "latest" + }, + "ghcr.io/devcontainers-extra/features/mise:1": { + "version": "latest" + }, + "ghcr.io/devcontainers-extra/features/pre-commit:2": { + "version": "latest" + }, + "ghcr.io/roul/devcontainer-features/mise-node:1": {}, + "ghcr.io/roul/devcontainer-features/mise-python:1": {} + }, + "customizations": { + "vscode": { + "extensions": [ + "mikestead.dotenv", + "EditorConfig.EditorConfig", + "tamasfe.even-better-toml", + "github.vscode-github-actions", + "GitHub.vscode-pull-request-github", + "skellock.just", + "yzhang.markdown-all-in-one", + "bierner.markdown-checkbox", + "bierner.markdown-footnotes", + "bierner.markdown-mermaid", + "bierner.markdown-yaml-preamble", + "DavidAnson.vscode-markdownlint", + "1YiB.rust-bundle", + "rust-lang.rust-analyzer", + "foxundermoon.shell-format", + "redhat.vscode-yaml", + "ms-vscode-remote.remote-containers" + ] + } + } +} \ No newline at end of file diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 4d6f904..7b04c4a 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -13,8 +13,12 @@ updates: - package-ecosystem: "github-actions" directory: "/" schedule: - interval: "daily" + interval: "weekly" - package-ecosystem: "rust-toolchain" directory: "/" schedule: - interval: "daily" + interval: "weekly" + - package-ecosystem: "devcontainers" + directory: "/" + schedule: + interval: "weekly" From 6510b90d4fdb7039651253d43e408515b1a4626f Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 18 Jan 2026 14:49:29 -0500 Subject: [PATCH 13/25] 
refactor(output): split table.rs into module directory Split the oversized table.rs (708 lines) into a module directory with four focused files, all under the 500-line project limit: - table/mod.rs (120 lines): Public API, constants, entry points - table/tty.rs (220 lines): TTY table rendering with column alignment - table/plain.rs (96 lines): Plain text output for piping - table/formatting.rs (326 lines): Shared utilities (truncate, pad, format_tags) This addresses the ticket acceptance criterion requiring all files to be under 500 lines. Co-Authored-By: Claude Opus 4.5 --- src/output/table.rs | 708 --------------------------------- src/output/table/formatting.rs | 326 +++++++++++++++ src/output/table/mod.rs | 120 ++++++ src/output/table/plain.rs | 96 +++++ src/output/table/tty.rs | 226 +++++++++++ 5 files changed, 768 insertions(+), 708 deletions(-) delete mode 100644 src/output/table.rs create mode 100644 src/output/table/formatting.rs create mode 100644 src/output/table/mod.rs create mode 100644 src/output/table/plain.rs create mode 100644 src/output/table/tty.rs diff --git a/src/output/table.rs b/src/output/table.rs deleted file mode 100644 index 4844082..0000000 --- a/src/output/table.rs +++ /dev/null @@ -1,708 +0,0 @@ -//! Table output formatter for Stringy. -//! -//! This module provides human-readable table output with automatic TTY detection. -//! When output is directed to a terminal (TTY), strings are displayed in an aligned -//! table with headers showing String, Tags, Score, and Section columns. When output -//! is piped or redirected (non-TTY), only the raw string text is emitted, one per line, -//! for seamless integration with other command-line tools. -//! -//! # TTY Mode Example -//! -//! ```text -//! String | Tags | Score | Section -//! -------------------------------------------------------------|--------------|-------|-------- -//! https://malware.example.com/beacon | url | 150 | .rdata -//! 
C:\Windows\System32\cmd.exe | filepath | 120 | .data -//! GetProcAddress | import | 80 | -//! ``` -//! -//! # Non-TTY Mode Example -//! -//! ```text -//! https://malware.example.com/beacon -//! C:\Windows\System32\cmd.exe -//! GetProcAddress -//! ``` -//! -//! # Column Layout -//! -//! - **String**: Up to 60 characters, truncated with `...` if longer -//! - **Tags**: First 2-3 tags, comma-separated, max 20 characters -//! - **Score**: Right-aligned integer score -//! - **Section**: Section name where the string was found - -use std::io::IsTerminal; - -use crate::classification::ranking::RankingConfig; -use crate::types::{FoundString, Result, Tag}; - -use super::OutputMetadata; - -/// Maximum width for the string column before truncation. -const STRING_COLUMN_WIDTH: usize = 60; - -/// Maximum width for the tags column. -const TAGS_COLUMN_WIDTH: usize = 20; - -/// Maximum width for the score column. -const SCORE_COLUMN_WIDTH: usize = 6; - -/// Maximum width for the section column. -const SECTION_COLUMN_WIDTH: usize = 15; - -/// Format strings in a human-readable table format. -/// -/// Automatically detects whether output is going to a TTY (terminal) and adjusts -/// the format accordingly. In TTY mode, outputs an aligned table with headers. -/// In non-TTY mode (piped/redirected), outputs plain strings one per line. -/// -/// # Arguments -/// -/// * `strings` - The extracted strings to format -/// * `metadata` - Output context (currently unused but reserved for future features) -/// -/// # Returns -/// -/// A formatted string ready for output. -pub fn format_table(strings: &[FoundString], metadata: &OutputMetadata) -> Result { - let is_tty = std::io::stdout().is_terminal(); - format_table_with_mode(strings, metadata, is_tty) -} - -/// Format table with explicit TTY mode specification. -/// -/// This function allows explicit control over the output mode, useful for testing -/// and programmatic control over output format. 
-/// -/// # Arguments -/// -/// * `strings` - The extracted strings to format -/// * `metadata` - Output context -/// * `is_tty` - Whether to use TTY mode (true) or plain mode (false) -pub fn format_table_with_mode( - strings: &[FoundString], - metadata: &OutputMetadata, - is_tty: bool, -) -> Result { - if is_tty { - format_table_tty(strings, metadata) - } else { - format_table_plain(strings) - } -} - -/// Format strings as an aligned table for TTY output. -/// -/// Creates a table with headers and aligned columns showing: -/// - String text (truncated if necessary) -/// - Tags (comma-separated, limited count) -/// - Score (right-aligned) -/// - Section name -fn format_table_tty(strings: &[FoundString], _metadata: &OutputMetadata) -> Result { - if strings.is_empty() { - return Ok(String::new()); - } - - let mut output = String::new(); - - // Calculate dynamic column widths based on content - let section_width = calculate_section_width(strings); - let tags_width = calculate_tags_width(strings); - - // Build header - let header = format!( - "{} | {} | {} | {}", - pad_string("String", STRING_COLUMN_WIDTH, Alignment::Left), - pad_string("Tags", tags_width, Alignment::Left), - pad_string("Score", SCORE_COLUMN_WIDTH, Alignment::Right), - pad_string("Section", section_width, Alignment::Left), - ); - output.push_str(&header); - output.push('\n'); - - // Build separator line - let separator = format!( - "{}-|-{}-|-{}-|-{}", - "-".repeat(STRING_COLUMN_WIDTH), - "-".repeat(tags_width), - "-".repeat(SCORE_COLUMN_WIDTH), - "-".repeat(section_width), - ); - output.push_str(&separator); - output.push('\n'); - - // Build rows - for found_string in strings { - let truncated_text = truncate_string(&found_string.text, STRING_COLUMN_WIDTH); - let tags_display = format_tags(&found_string.tags); - let section_display = found_string.section.as_deref().unwrap_or(""); - - let row = format!( - "{} | {} | {} | {}", - pad_string(&truncated_text, STRING_COLUMN_WIDTH, Alignment::Left), - 
pad_string(&tags_display, tags_width, Alignment::Left), - pad_string( - &found_string.score.to_string(), - SCORE_COLUMN_WIDTH, - Alignment::Right - ), - pad_string(section_display, section_width, Alignment::Left), - ); - output.push_str(&row); - output.push('\n'); - } - - // Remove trailing newline for consistency - if output.ends_with('\n') { - output.pop(); - } - - Ok(output) -} - -/// Format strings as plain text for non-TTY output. -/// -/// Outputs only the string text, one per line, suitable for piping to other tools. -fn format_table_plain(strings: &[FoundString]) -> Result { - let lines: Vec = strings - .iter() - .map(|s| sanitize_plain_text(&s.text)) - .collect(); - Ok(lines.join("\n")) -} - -/// Calculate the optimal width for the section column based on content. -fn calculate_section_width(strings: &[FoundString]) -> usize { - let max_section_len = strings - .iter() - .filter_map(|s| s.section.as_ref()) - .map(|s| s.len()) - .max() - .unwrap_or(0); - - // Minimum width is "Section" header length, maximum is SECTION_COLUMN_WIDTH - max_section_len.clamp("Section".len(), SECTION_COLUMN_WIDTH) -} - -/// Calculate the optimal width for the tags column based on content. -fn calculate_tags_width(strings: &[FoundString]) -> usize { - let max_tags_len = strings - .iter() - .map(|s| format_tags(&s.tags).len()) - .max() - .unwrap_or(0); - - // Minimum width is "Tags" header length, maximum is TAGS_COLUMN_WIDTH - max_tags_len.clamp("Tags".len(), TAGS_COLUMN_WIDTH) -} - -/// Format tags for display in the table. -/// -/// Converts tags to their display format using serde rename values where applicable. -/// Limits output to `MAX_TAGS_DISPLAY` tags to prevent overflow. -/// -/// # Arguments -/// -/// * `tags` - Slice of tags to format -/// -/// # Returns -/// -/// Comma-separated string of tag names, or empty string if no tags. 
-/// -/// # Examples -/// -/// ```ignore -/// let tags = vec![Tag::IPv4, Tag::FilePath]; -/// assert_eq!(format_tags(&tags), "ipv4, filepath"); -/// ``` -pub fn format_tags(tags: &[Tag]) -> String { - if tags.is_empty() { - return String::new(); - } - - let config = RankingConfig::default(); - let max_boost = tags - .iter() - .map(|tag| tag_boost_value(tag, &config)) - .max() - .unwrap_or(0); - - let tag_strings: Vec = tags - .iter() - .filter(|tag| tag_boost_value(tag, &config) == max_boost) - .map(tag_to_display_string) - .collect(); - - let result = tag_strings.join(", "); - - // Truncate if still too long - if result.len() > TAGS_COLUMN_WIDTH { - truncate_string(&result, TAGS_COLUMN_WIDTH) - } else { - result - } -} - -/// Sanitize plain text output so each string renders as a single line. -/// -/// Replaces CRLF, LF, and CR with escaped sequences to preserve content -/// while keeping output line-based. -fn sanitize_plain_text(text: &str) -> String { - text.replace("\r\n", "\\r\\n") - .replace('\n', "\\n") - .replace('\r', "\\r") -} - -/// Get the ranking boost value for a tag using the provided config. -fn tag_boost_value(tag: &Tag, config: &RankingConfig) -> i32 { - config.tag_boosts.get(tag).copied().unwrap_or(0) -} - -/// Convert a single tag to its display string. -/// -/// Uses the serde rename value where defined, otherwise uses lowercase Debug format. 
-fn tag_to_display_string(tag: &Tag) -> String { - match tag { - Tag::Url => "url".to_string(), - Tag::Domain => "domain".to_string(), - Tag::IPv4 => "ipv4".to_string(), - Tag::IPv6 => "ipv6".to_string(), - Tag::FilePath => "filepath".to_string(), - Tag::RegistryPath => "regpath".to_string(), - Tag::Guid => "guid".to_string(), - Tag::Email => "email".to_string(), - Tag::Base64 => "b64".to_string(), - Tag::FormatString => "fmt".to_string(), - Tag::UserAgent => "user-agent-ish".to_string(), - Tag::DemangledSymbol => "demangled".to_string(), - Tag::Import => "import".to_string(), - Tag::Export => "export".to_string(), - Tag::Version => "version".to_string(), - Tag::Manifest => "manifest".to_string(), - Tag::Resource => "resource".to_string(), - Tag::DylibPath => "dylib-path".to_string(), - Tag::Rpath => "rpath".to_string(), - Tag::RpathVariable => "rpath-var".to_string(), - Tag::FrameworkPath => "framework-path".to_string(), - } -} - -/// Truncate a string to the specified maximum length. -/// -/// If the string exceeds the maximum length, it is truncated and `...` is appended. -/// Handles Unicode correctly by truncating at character boundaries. -/// -/// # Arguments -/// -/// * `s` - The string to truncate -/// * `max_len` - Maximum length including the ellipsis -/// -/// # Returns -/// -/// The original string if it fits, or a truncated version with `...` appended. 
-/// -/// # Examples -/// -/// ```ignore -/// assert_eq!(truncate_string("hello", 10), "hello"); -/// assert_eq!(truncate_string("hello world", 8), "hello..."); -/// ``` -pub fn truncate_string(s: &str, max_len: usize) -> String { - if s.len() <= max_len { - return s.to_string(); - } - - if max_len <= 3 { - return ".".repeat(max_len); - } - - // Find a valid character boundary for truncation - let truncate_at = max_len - 3; - let mut end_index = truncate_at; - - // Ensure we don't split a multi-byte character - for (idx, _) in s.char_indices() { - if idx <= truncate_at { - end_index = idx; - } else { - break; - } - } - - // Handle case where we need to include at least one character - if end_index == 0 && !s.is_empty() { - if let Some((idx, _)) = s.char_indices().nth(1) { - end_index = idx; - } else { - end_index = s.len(); - } - } - - format!("{}...", &s[..end_index]) -} - -/// Text alignment for padding. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum Alignment { - /// Left-align text (pad on right). - Left, - /// Right-align text (pad on left). - Right, -} - -/// Pad a string to a fixed width with the specified alignment. -/// -/// # Arguments -/// -/// * `s` - The string to pad -/// * `width` - Target width -/// * `alignment` - Left or right alignment -/// -/// # Returns -/// -/// The padded string. 
-pub fn pad_string(s: &str, width: usize, alignment: Alignment) -> String { - match alignment { - Alignment::Left => format!("{: format!("{:>width$}", s, width = width), - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::output::OutputFormat; - use crate::types::{Encoding, StringSource}; - - fn make_test_string(text: &str) -> FoundString { - FoundString::new( - text.to_string(), - Encoding::Ascii, - 0x1000, - text.len() as u32, - StringSource::SectionData, - ) - } - - fn make_metadata() -> OutputMetadata { - OutputMetadata::new("test.bin".to_string(), OutputFormat::Table, 10, 10) - } - - // Tests for format_tags - mod format_tags_tests { - use super::*; - - #[test] - fn empty_tags() { - assert_eq!(format_tags(&[]), ""); - } - - #[test] - fn single_tag() { - assert_eq!(format_tags(&[Tag::Url]), "url"); - assert_eq!(format_tags(&[Tag::IPv4]), "ipv4"); - assert_eq!(format_tags(&[Tag::FilePath]), "filepath"); - } - - #[test] - fn two_tags() { - assert_eq!(format_tags(&[Tag::Url, Tag::Domain]), "url"); - assert_eq!(format_tags(&[Tag::IPv4, Tag::FilePath]), "ipv4"); - } - - #[test] - fn three_tags() { - assert_eq!(format_tags(&[Tag::Url, Tag::Domain, Tag::IPv4]), "url"); - } - - #[test] - fn more_than_max_tags_truncated() { - let tags = vec![ - Tag::Url, - Tag::Domain, - Tag::IPv4, - Tag::FilePath, - Tag::RegistryPath, - ]; - assert_eq!(format_tags(&tags), "url"); - } - - #[test] - fn multiple_tags_same_priority() { - assert_eq!(format_tags(&[Tag::Import, Tag::Export]), "import, export"); - } - - #[test] - fn all_tag_variants_have_display() { - // Ensure all tag variants produce valid output - let all_tags = vec![ - Tag::Url, - Tag::Domain, - Tag::IPv4, - Tag::IPv6, - Tag::FilePath, - Tag::RegistryPath, - Tag::Guid, - Tag::Email, - Tag::Base64, - Tag::FormatString, - Tag::UserAgent, - Tag::DemangledSymbol, - Tag::Import, - Tag::Export, - Tag::Version, - Tag::Manifest, - Tag::Resource, - Tag::DylibPath, - Tag::Rpath, - Tag::RpathVariable, - 
Tag::FrameworkPath, - ]; - - for tag in all_tags { - let display = tag_to_display_string(&tag); - assert!(!display.is_empty(), "Tag {:?} should have display", tag); - assert!(display.is_ascii(), "Tag display should be ASCII"); - } - } - } - - // Tests for truncate_string - mod truncate_string_tests { - use super::*; - - #[test] - fn short_string_unchanged() { - assert_eq!(truncate_string("hello", 10), "hello"); - assert_eq!(truncate_string("", 10), ""); - } - - #[test] - fn exact_length_unchanged() { - assert_eq!(truncate_string("hello", 5), "hello"); - } - - #[test] - fn long_string_truncated() { - assert_eq!(truncate_string("hello world", 8), "hello..."); - } - - #[test] - fn very_short_max_length() { - assert_eq!(truncate_string("hello", 3), "..."); - assert_eq!(truncate_string("hello", 2), ".."); - assert_eq!(truncate_string("hello", 1), "."); - } - - #[test] - fn unicode_string_safe_truncation() { - // Ensure we don't split multi-byte characters - let unicode = "hello\u{1F600}world"; // emoji in the middle - let truncated = truncate_string(unicode, 8); - // Should truncate before the emoji to avoid splitting it - assert!(truncated.ends_with("...")); - assert!(truncated.len() <= 8); - } - - #[test] - fn unicode_at_boundary() { - let text = "\u{4E2D}\u{6587}\u{6D4B}\u{8BD5}"; // Chinese characters - let truncated = truncate_string(text, 6); - assert!(truncated.is_char_boundary(truncated.len() - 3)); - } - } - - // Tests for pad_string - mod pad_string_tests { - use super::*; - - #[test] - fn left_alignment() { - assert_eq!(pad_string("hi", 5, Alignment::Left), "hi "); - assert_eq!(pad_string("hello", 5, Alignment::Left), "hello"); - } - - #[test] - fn right_alignment() { - assert_eq!(pad_string("hi", 5, Alignment::Right), " hi"); - assert_eq!(pad_string("hello", 5, Alignment::Right), "hello"); - } - - #[test] - fn exact_width() { - assert_eq!(pad_string("exact", 5, Alignment::Left), "exact"); - assert_eq!(pad_string("exact", 5, Alignment::Right), "exact"); - } - 
- #[test] - fn empty_string() { - assert_eq!(pad_string("", 5, Alignment::Left), " "); - assert_eq!(pad_string("", 5, Alignment::Right), " "); - } - } - - // Tests for format_table - mod format_table_tests { - use super::*; - - #[test] - fn empty_strings_returns_empty() { - let result = format_table_with_mode(&[], &make_metadata(), true).unwrap(); - assert_eq!(result, ""); - } - - #[test] - fn single_string_tty_mode() { - let strings = vec![make_test_string("test string")]; - let result = format_table_with_mode(&strings, &make_metadata(), true).unwrap(); - - // Should have header, separator, and one data row - let lines: Vec<&str> = result.lines().collect(); - assert_eq!(lines.len(), 3); - assert!(lines[0].contains("String")); - assert!(lines[0].contains("Tags")); - assert!(lines[0].contains("Score")); - assert!(lines[0].contains("Section")); - assert!(lines[1].contains("---")); - assert!(lines[2].contains("test string")); - } - - #[test] - fn single_string_plain_mode() { - let strings = vec![make_test_string("test string")]; - let result = format_table_with_mode(&strings, &make_metadata(), false).unwrap(); - - assert_eq!(result, "test string"); - } - - #[test] - fn multiple_strings_plain_mode() { - let strings = vec![ - make_test_string("first"), - make_test_string("second"), - make_test_string("third"), - ]; - let result = format_table_with_mode(&strings, &make_metadata(), false).unwrap(); - - assert_eq!(result, "first\nsecond\nthird"); - } - - #[test] - fn string_with_tags_displayed() { - let mut found = make_test_string("http://example.com"); - found.tags = vec![Tag::Url, Tag::Domain]; - - let result = format_table_with_mode(&[found], &make_metadata(), true).unwrap(); - assert!(result.contains("url")); - } - - #[test] - fn string_with_section_displayed() { - let found = make_test_string("test").with_section(".rodata".to_string()); - - let result = format_table_with_mode(&[found], &make_metadata(), true).unwrap(); - assert!(result.contains(".rodata")); - } - - 
#[test] - fn string_with_score_displayed() { - let found = make_test_string("test").with_score(150); - - let result = format_table_with_mode(&[found], &make_metadata(), true).unwrap(); - assert!(result.contains("150")); - } - - #[test] - fn long_string_truncated_in_tty() { - let long_text = "a".repeat(100); - let strings = vec![make_test_string(&long_text)]; - let result = format_table_with_mode(&strings, &make_metadata(), true).unwrap(); - - // Should contain truncated version with ... - assert!(result.contains("...")); - // Should not contain the full 100 character string - assert!(!result.contains(&long_text)); - } - - #[test] - fn long_string_not_truncated_in_plain() { - let long_text = "a".repeat(100); - let strings = vec![make_test_string(&long_text)]; - let result = format_table_with_mode(&strings, &make_metadata(), false).unwrap(); - - // Plain mode should have full string - assert_eq!(result, long_text); - } - - #[test] - fn missing_optional_fields_handled() { - // String with no section, no tags, default score - let found = make_test_string("minimal"); - - let result = format_table_with_mode(&[found], &make_metadata(), true).unwrap(); - // Should not crash and should contain the string - assert!(result.contains("minimal")); - } - - #[test] - fn special_characters_in_string() { - let strings = vec![make_test_string("tab\there"), make_test_string("pipe|here")]; - let result = format_table_with_mode(&strings, &make_metadata(), false).unwrap(); - - // Each string should be on its own line in output - let lines: Vec<&str> = result.lines().collect(); - assert_eq!(lines.len(), 2); - assert!(lines[0].contains("tab\there")); - assert!(lines[1].contains("pipe|here")); - } - - #[test] - fn string_with_embedded_newline() { - let strings = vec![make_test_string("line1\nline2")]; - let result = format_table_with_mode(&strings, &make_metadata(), false).unwrap(); - assert_eq!(result, "line1\\nline2"); - } - } - - // Tests for column width calculation - mod 
column_width_tests { - use super::*; - - #[test] - fn section_width_minimum() { - let strings = vec![make_test_string("test")]; - let width = calculate_section_width(&strings); - assert_eq!(width, "Section".len()); - } - - #[test] - fn section_width_from_content() { - let strings = vec![make_test_string("test").with_section(".rodata.str1.1".to_string())]; - let width = calculate_section_width(&strings); - assert_eq!(width, ".rodata.str1.1".len()); - } - - #[test] - fn section_width_capped_at_max() { - let long_section = "a".repeat(50); - let strings = vec![make_test_string("test").with_section(long_section)]; - let width = calculate_section_width(&strings); - assert_eq!(width, SECTION_COLUMN_WIDTH); - } - - #[test] - fn tags_width_minimum() { - let strings = vec![make_test_string("test")]; - let width = calculate_tags_width(&strings); - assert_eq!(width, "Tags".len()); - } - - #[test] - fn tags_width_from_content() { - let mut found = make_test_string("test"); - found.tags = vec![Tag::Url, Tag::Domain]; - let width = calculate_tags_width(&[found]); - assert_eq!(width, "Tags".len()); - } - } -} diff --git a/src/output/table/formatting.rs b/src/output/table/formatting.rs new file mode 100644 index 0000000..02c8e4a --- /dev/null +++ b/src/output/table/formatting.rs @@ -0,0 +1,326 @@ +//! String formatting utilities for table output. +//! +//! This module provides shared utilities for formatting strings, tags, and +//! text alignment used by both TTY and plain output modes. + +use crate::classification::ranking::RankingConfig; +use crate::types::Tag; + +use super::TAGS_COLUMN_WIDTH; + +/// Text alignment for padding. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Alignment { + /// Left-align text (pad on right). + Left, + /// Right-align text (pad on left). + Right, +} + +/// Format tags for display in the table. +/// +/// Converts tags to their display format using serde rename values where applicable. 
+/// Shows only tags with the highest boost value to prioritize important tags. +/// +/// # Arguments +/// +/// * `tags` - Slice of tags to format +/// +/// # Returns +/// +/// Comma-separated string of tag names, or empty string if no tags. +/// +/// # Examples +/// +/// ```ignore +/// let tags = vec![Tag::IPv4, Tag::FilePath]; +/// assert_eq!(format_tags(&tags), "ipv4"); +/// ``` +pub fn format_tags(tags: &[Tag]) -> String { + if tags.is_empty() { + return String::new(); + } + + let config = RankingConfig::default(); + let max_boost = tags + .iter() + .map(|tag| tag_boost_value(tag, &config)) + .max() + .unwrap_or(0); + + let tag_strings: Vec = tags + .iter() + .filter(|tag| tag_boost_value(tag, &config) == max_boost) + .map(tag_to_display_string) + .collect(); + + let result = tag_strings.join(", "); + + // Truncate if still too long + if result.len() > TAGS_COLUMN_WIDTH { + truncate_string(&result, TAGS_COLUMN_WIDTH) + } else { + result + } +} + +/// Get the ranking boost value for a tag using the provided config. +fn tag_boost_value(tag: &Tag, config: &RankingConfig) -> i32 { + config.tag_boosts.get(tag).copied().unwrap_or(0) +} + +/// Convert a single tag to its display string. +/// +/// Uses the serde rename value where defined, otherwise uses lowercase Debug format. 
+pub(crate) fn tag_to_display_string(tag: &Tag) -> String { + match tag { + Tag::Url => "url".to_string(), + Tag::Domain => "domain".to_string(), + Tag::IPv4 => "ipv4".to_string(), + Tag::IPv6 => "ipv6".to_string(), + Tag::FilePath => "filepath".to_string(), + Tag::RegistryPath => "regpath".to_string(), + Tag::Guid => "guid".to_string(), + Tag::Email => "email".to_string(), + Tag::Base64 => "b64".to_string(), + Tag::FormatString => "fmt".to_string(), + Tag::UserAgent => "user-agent-ish".to_string(), + Tag::DemangledSymbol => "demangled".to_string(), + Tag::Import => "import".to_string(), + Tag::Export => "export".to_string(), + Tag::Version => "version".to_string(), + Tag::Manifest => "manifest".to_string(), + Tag::Resource => "resource".to_string(), + Tag::DylibPath => "dylib-path".to_string(), + Tag::Rpath => "rpath".to_string(), + Tag::RpathVariable => "rpath-var".to_string(), + Tag::FrameworkPath => "framework-path".to_string(), + } +} + +/// Truncate a string to the specified maximum length. +/// +/// If the string exceeds the maximum length, it is truncated and `...` is appended. +/// Handles Unicode correctly by truncating at character boundaries. +/// +/// # Arguments +/// +/// * `s` - The string to truncate +/// * `max_len` - Maximum length including the ellipsis +/// +/// # Returns +/// +/// The original string if it fits, or a truncated version with `...` appended. 
+/// +/// # Examples +/// +/// ```ignore +/// assert_eq!(truncate_string("hello", 10), "hello"); +/// assert_eq!(truncate_string("hello world", 8), "hello..."); +/// ``` +pub fn truncate_string(s: &str, max_len: usize) -> String { + if s.len() <= max_len { + return s.to_string(); + } + + if max_len <= 3 { + return ".".repeat(max_len); + } + + // Find a valid character boundary for truncation + let truncate_at = max_len - 3; + let mut end_index = truncate_at; + + // Ensure we don't split a multi-byte character + for (idx, _) in s.char_indices() { + if idx <= truncate_at { + end_index = idx; + } else { + break; + } + } + + // Handle case where we need to include at least one character + if end_index == 0 && !s.is_empty() { + if let Some((idx, _)) = s.char_indices().nth(1) { + end_index = idx; + } else { + end_index = s.len(); + } + } + + format!("{}...", &s[..end_index]) +} + +/// Pad a string to a fixed width with the specified alignment. +/// +/// # Arguments +/// +/// * `s` - The string to pad +/// * `width` - Target width +/// * `alignment` - Left or right alignment +/// +/// # Returns +/// +/// The padded string. 
+pub fn pad_string(s: &str, width: usize, alignment: Alignment) -> String { + match alignment { + Alignment::Left => format!("{: format!("{:>width$}", s, width = width), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + mod format_tags_tests { + use super::*; + + #[test] + fn empty_tags() { + assert_eq!(format_tags(&[]), ""); + } + + #[test] + fn single_tag() { + assert_eq!(format_tags(&[Tag::Url]), "url"); + assert_eq!(format_tags(&[Tag::IPv4]), "ipv4"); + assert_eq!(format_tags(&[Tag::FilePath]), "filepath"); + } + + #[test] + fn two_tags() { + assert_eq!(format_tags(&[Tag::Url, Tag::Domain]), "url"); + assert_eq!(format_tags(&[Tag::IPv4, Tag::FilePath]), "ipv4"); + } + + #[test] + fn three_tags() { + assert_eq!(format_tags(&[Tag::Url, Tag::Domain, Tag::IPv4]), "url"); + } + + #[test] + fn more_than_max_tags_truncated() { + let tags = vec![ + Tag::Url, + Tag::Domain, + Tag::IPv4, + Tag::FilePath, + Tag::RegistryPath, + ]; + assert_eq!(format_tags(&tags), "url"); + } + + #[test] + fn multiple_tags_same_priority() { + assert_eq!(format_tags(&[Tag::Import, Tag::Export]), "import, export"); + } + + #[test] + fn all_tag_variants_have_display() { + // Ensure all tag variants produce valid output + let all_tags = vec![ + Tag::Url, + Tag::Domain, + Tag::IPv4, + Tag::IPv6, + Tag::FilePath, + Tag::RegistryPath, + Tag::Guid, + Tag::Email, + Tag::Base64, + Tag::FormatString, + Tag::UserAgent, + Tag::DemangledSymbol, + Tag::Import, + Tag::Export, + Tag::Version, + Tag::Manifest, + Tag::Resource, + Tag::DylibPath, + Tag::Rpath, + Tag::RpathVariable, + Tag::FrameworkPath, + ]; + + for tag in all_tags { + let display = tag_to_display_string(&tag); + assert!(!display.is_empty(), "Tag {:?} should have display", tag); + assert!(display.is_ascii(), "Tag display should be ASCII"); + } + } + } + + mod truncate_string_tests { + use super::*; + + #[test] + fn short_string_unchanged() { + assert_eq!(truncate_string("hello", 10), "hello"); + assert_eq!(truncate_string("", 10), ""); 
+ } + + #[test] + fn exact_length_unchanged() { + assert_eq!(truncate_string("hello", 5), "hello"); + } + + #[test] + fn long_string_truncated() { + assert_eq!(truncate_string("hello world", 8), "hello..."); + } + + #[test] + fn very_short_max_length() { + assert_eq!(truncate_string("hello", 3), "..."); + assert_eq!(truncate_string("hello", 2), ".."); + assert_eq!(truncate_string("hello", 1), "."); + } + + #[test] + fn unicode_string_safe_truncation() { + // Ensure we don't split multi-byte characters + let unicode = "hello\u{1F600}world"; // emoji in the middle + let truncated = truncate_string(unicode, 8); + // Should truncate before the emoji to avoid splitting it + assert!(truncated.ends_with("...")); + assert!(truncated.len() <= 8); + } + + #[test] + fn unicode_at_boundary() { + let text = "\u{4E2D}\u{6587}\u{6D4B}\u{8BD5}"; // Chinese characters + let truncated = truncate_string(text, 6); + assert!(truncated.is_char_boundary(truncated.len() - 3)); + } + } + + mod pad_string_tests { + use super::*; + + #[test] + fn left_alignment() { + assert_eq!(pad_string("hi", 5, Alignment::Left), "hi "); + assert_eq!(pad_string("hello", 5, Alignment::Left), "hello"); + } + + #[test] + fn right_alignment() { + assert_eq!(pad_string("hi", 5, Alignment::Right), " hi"); + assert_eq!(pad_string("hello", 5, Alignment::Right), "hello"); + } + + #[test] + fn exact_width() { + assert_eq!(pad_string("exact", 5, Alignment::Left), "exact"); + assert_eq!(pad_string("exact", 5, Alignment::Right), "exact"); + } + + #[test] + fn empty_string() { + assert_eq!(pad_string("", 5, Alignment::Left), " "); + assert_eq!(pad_string("", 5, Alignment::Right), " "); + } + } +} diff --git a/src/output/table/mod.rs b/src/output/table/mod.rs new file mode 100644 index 0000000..9bcb95a --- /dev/null +++ b/src/output/table/mod.rs @@ -0,0 +1,120 @@ +//! Table output formatter for Stringy. +//! +//! This module provides human-readable table output with automatic TTY detection. +//! 
When output is directed to a terminal (TTY), strings are displayed in an aligned +//! table with headers showing String, Tags, Score, and Section columns. When output +//! is piped or redirected (non-TTY), only the raw string text is emitted, one per line, +//! for seamless integration with other command-line tools. +//! +//! # TTY Mode Example +//! +//! ```text +//! String | Tags | Score | Section +//! -------------------------------------------------------------|--------------|-------|-------- +//! https://malware.example.com/beacon | url | 150 | .rdata +//! C:\Windows\System32\cmd.exe | filepath | 120 | .data +//! GetProcAddress | import | 80 | +//! ``` +//! +//! # Non-TTY Mode Example +//! +//! ```text +//! https://malware.example.com/beacon +//! C:\Windows\System32\cmd.exe +//! GetProcAddress +//! ``` +//! +//! # Column Layout +//! +//! - **String**: Up to 60 characters, truncated with `...` if longer +//! - **Tags**: First 2-3 tags, comma-separated, max 20 characters +//! - **Score**: Right-aligned integer score +//! - **Section**: Section name where the string was found + +mod formatting; +mod plain; +mod tty; + +use std::io::IsTerminal; + +use crate::types::{FoundString, Result}; + +use super::OutputMetadata; + +// Re-export public items from submodules +pub use formatting::{Alignment, format_tags, pad_string, truncate_string}; + +/// Maximum width for the string column before truncation. +pub(crate) const STRING_COLUMN_WIDTH: usize = 60; + +/// Maximum width for the tags column. +pub(crate) const TAGS_COLUMN_WIDTH: usize = 20; + +/// Maximum width for the score column. +pub(crate) const SCORE_COLUMN_WIDTH: usize = 6; + +/// Maximum width for the section column. +pub(crate) const SECTION_COLUMN_WIDTH: usize = 15; + +/// Format strings in a human-readable table format. +/// +/// Automatically detects whether output is going to a TTY (terminal) and adjusts +/// the format accordingly. In TTY mode, outputs an aligned table with headers. 
+/// In non-TTY mode (piped/redirected), outputs plain strings one per line. +/// +/// # Arguments +/// +/// * `strings` - The extracted strings to format +/// * `metadata` - Output context (currently unused but reserved for future features) +/// +/// # Returns +/// +/// A formatted string ready for output. +pub fn format_table(strings: &[FoundString], metadata: &OutputMetadata) -> Result { + let is_tty = std::io::stdout().is_terminal(); + format_table_with_mode(strings, metadata, is_tty) +} + +/// Format table with explicit TTY mode specification. +/// +/// This function allows explicit control over the output mode, useful for testing +/// and programmatic control over output format. +/// +/// # Arguments +/// +/// * `strings` - The extracted strings to format +/// * `metadata` - Output context +/// * `is_tty` - Whether to use TTY mode (true) or plain mode (false) +pub fn format_table_with_mode( + strings: &[FoundString], + metadata: &OutputMetadata, + is_tty: bool, +) -> Result { + if is_tty { + tty::format_table_tty(strings, metadata) + } else { + plain::format_table_plain(strings) + } +} + +#[cfg(test)] +pub(crate) mod test_helpers { + use crate::output::OutputFormat; + use crate::types::{Encoding, FoundString, StringSource}; + + use super::OutputMetadata; + + pub fn make_test_string(text: &str) -> FoundString { + FoundString::new( + text.to_string(), + Encoding::Ascii, + 0x1000, + text.len() as u32, + StringSource::SectionData, + ) + } + + pub fn make_metadata() -> OutputMetadata { + OutputMetadata::new("test.bin".to_string(), OutputFormat::Table, 10, 10) + } +} diff --git a/src/output/table/plain.rs b/src/output/table/plain.rs new file mode 100644 index 0000000..edab83b --- /dev/null +++ b/src/output/table/plain.rs @@ -0,0 +1,96 @@ +//! Plain text output for non-TTY environments. +//! +//! This module provides simple one-string-per-line output suitable for piping +//! to other command-line tools like grep, awk, or sed. 
+ +use crate::types::{FoundString, Result}; + +/// Format strings as plain text for non-TTY output. +/// +/// Outputs only the string text, one per line, suitable for piping to other tools. +pub(super) fn format_table_plain(strings: &[FoundString]) -> Result { + let lines: Vec = strings + .iter() + .map(|s| sanitize_plain_text(&s.text)) + .collect(); + Ok(lines.join("\n")) +} + +/// Sanitize plain text output so each string renders as a single line. +/// +/// Replaces CRLF, LF, and CR with escaped sequences to preserve content +/// while keeping output line-based. +fn sanitize_plain_text(text: &str) -> String { + text.replace("\r\n", "\\r\\n") + .replace('\n', "\\n") + .replace('\r', "\\r") +} + +#[cfg(test)] +mod tests { + use crate::output::table::format_table_with_mode; + use crate::output::table::test_helpers::{make_metadata, make_test_string}; + + #[test] + fn single_string_plain_mode() { + let strings = vec![make_test_string("test string")]; + let result = format_table_with_mode(&strings, &make_metadata(), false).unwrap(); + + assert_eq!(result, "test string"); + } + + #[test] + fn multiple_strings_plain_mode() { + let strings = vec![ + make_test_string("first"), + make_test_string("second"), + make_test_string("third"), + ]; + let result = format_table_with_mode(&strings, &make_metadata(), false).unwrap(); + + assert_eq!(result, "first\nsecond\nthird"); + } + + #[test] + fn long_string_not_truncated_in_plain() { + let long_text = "a".repeat(100); + let strings = vec![make_test_string(&long_text)]; + let result = format_table_with_mode(&strings, &make_metadata(), false).unwrap(); + + // Plain mode should have full string + assert_eq!(result, long_text); + } + + #[test] + fn special_characters_in_string() { + let strings = vec![make_test_string("tab\there"), make_test_string("pipe|here")]; + let result = format_table_with_mode(&strings, &make_metadata(), false).unwrap(); + + // Each string should be on its own line in output + let lines: Vec<&str> = 
result.lines().collect(); + assert_eq!(lines.len(), 2); + assert!(lines[0].contains("tab\there")); + assert!(lines[1].contains("pipe|here")); + } + + #[test] + fn string_with_embedded_newline() { + let strings = vec![make_test_string("line1\nline2")]; + let result = format_table_with_mode(&strings, &make_metadata(), false).unwrap(); + assert_eq!(result, "line1\\nline2"); + } + + #[test] + fn string_with_crlf() { + let strings = vec![make_test_string("line1\r\nline2")]; + let result = format_table_with_mode(&strings, &make_metadata(), false).unwrap(); + assert_eq!(result, "line1\\r\\nline2"); + } + + #[test] + fn string_with_cr() { + let strings = vec![make_test_string("line1\rline2")]; + let result = format_table_with_mode(&strings, &make_metadata(), false).unwrap(); + assert_eq!(result, "line1\\rline2"); + } +} diff --git a/src/output/table/tty.rs b/src/output/table/tty.rs new file mode 100644 index 0000000..38ed658 --- /dev/null +++ b/src/output/table/tty.rs @@ -0,0 +1,226 @@ +//! TTY mode table output for Stringy. +//! +//! This module provides formatted table output with aligned columns for terminal display. + +use crate::types::{FoundString, Result}; + +use super::formatting::{Alignment, format_tags, pad_string, truncate_string}; +use super::{ + OutputMetadata, SCORE_COLUMN_WIDTH, SECTION_COLUMN_WIDTH, STRING_COLUMN_WIDTH, + TAGS_COLUMN_WIDTH, +}; + +/// Format strings as an aligned table for TTY output. 
+/// +/// Creates a table with headers and aligned columns showing: +/// - String text (truncated if necessary) +/// - Tags (comma-separated, limited count) +/// - Score (right-aligned) +/// - Section name +pub(super) fn format_table_tty( + strings: &[FoundString], + _metadata: &OutputMetadata, +) -> Result { + if strings.is_empty() { + return Ok(String::new()); + } + + let mut output = String::new(); + + // Calculate dynamic column widths based on content + let section_width = calculate_section_width(strings); + let tags_width = calculate_tags_width(strings); + + // Build header + let header = format!( + "{} | {} | {} | {}", + pad_string("String", STRING_COLUMN_WIDTH, Alignment::Left), + pad_string("Tags", tags_width, Alignment::Left), + pad_string("Score", SCORE_COLUMN_WIDTH, Alignment::Right), + pad_string("Section", section_width, Alignment::Left), + ); + output.push_str(&header); + output.push('\n'); + + // Build separator line + let separator = format!( + "{}-|-{}-|-{}-|-{}", + "-".repeat(STRING_COLUMN_WIDTH), + "-".repeat(tags_width), + "-".repeat(SCORE_COLUMN_WIDTH), + "-".repeat(section_width), + ); + output.push_str(&separator); + output.push('\n'); + + // Build rows + for found_string in strings { + let truncated_text = truncate_string(&found_string.text, STRING_COLUMN_WIDTH); + let tags_display = format_tags(&found_string.tags); + let section_display = found_string.section.as_deref().unwrap_or(""); + + let row = format!( + "{} | {} | {} | {}", + pad_string(&truncated_text, STRING_COLUMN_WIDTH, Alignment::Left), + pad_string(&tags_display, tags_width, Alignment::Left), + pad_string( + &found_string.score.to_string(), + SCORE_COLUMN_WIDTH, + Alignment::Right + ), + pad_string(section_display, section_width, Alignment::Left), + ); + output.push_str(&row); + output.push('\n'); + } + + // Remove trailing newline for consistency + if output.ends_with('\n') { + output.pop(); + } + + Ok(output) +} + +/// Calculate the optimal width for the section column based 
on content. +fn calculate_section_width(strings: &[FoundString]) -> usize { + let max_section_len = strings + .iter() + .filter_map(|s| s.section.as_ref()) + .map(|s| s.len()) + .max() + .unwrap_or(0); + + // Minimum width is "Section" header length, maximum is SECTION_COLUMN_WIDTH + max_section_len.clamp("Section".len(), SECTION_COLUMN_WIDTH) +} + +/// Calculate the optimal width for the tags column based on content. +fn calculate_tags_width(strings: &[FoundString]) -> usize { + let max_tags_len = strings + .iter() + .map(|s| format_tags(&s.tags).len()) + .max() + .unwrap_or(0); + + // Minimum width is "Tags" header length, maximum is TAGS_COLUMN_WIDTH + max_tags_len.clamp("Tags".len(), TAGS_COLUMN_WIDTH) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::output::table::format_table_with_mode; + use crate::output::table::test_helpers::{make_metadata, make_test_string}; + use crate::types::Tag; + + #[test] + fn empty_strings_returns_empty() { + let result = format_table_with_mode(&[], &make_metadata(), true).unwrap(); + assert_eq!(result, ""); + } + + #[test] + fn single_string_tty_mode() { + let strings = vec![make_test_string("test string")]; + let result = format_table_with_mode(&strings, &make_metadata(), true).unwrap(); + + // Should have header, separator, and one data row + let lines: Vec<&str> = result.lines().collect(); + assert_eq!(lines.len(), 3); + assert!(lines[0].contains("String")); + assert!(lines[0].contains("Tags")); + assert!(lines[0].contains("Score")); + assert!(lines[0].contains("Section")); + assert!(lines[1].contains("---")); + assert!(lines[2].contains("test string")); + } + + #[test] + fn string_with_tags_displayed() { + let mut found = make_test_string("http://example.com"); + found.tags = vec![Tag::Url, Tag::Domain]; + + let result = format_table_with_mode(&[found], &make_metadata(), true).unwrap(); + assert!(result.contains("url")); + } + + #[test] + fn string_with_section_displayed() { + let found = 
make_test_string("test").with_section(".rodata".to_string()); + + let result = format_table_with_mode(&[found], &make_metadata(), true).unwrap(); + assert!(result.contains(".rodata")); + } + + #[test] + fn string_with_score_displayed() { + let found = make_test_string("test").with_score(150); + + let result = format_table_with_mode(&[found], &make_metadata(), true).unwrap(); + assert!(result.contains("150")); + } + + #[test] + fn long_string_truncated_in_tty() { + let long_text = "a".repeat(100); + let strings = vec![make_test_string(&long_text)]; + let result = format_table_with_mode(&strings, &make_metadata(), true).unwrap(); + + // Should contain truncated version with ... + assert!(result.contains("...")); + // Should not contain the full 100 character string + assert!(!result.contains(&long_text)); + } + + #[test] + fn missing_optional_fields_handled() { + // String with no section, no tags, default score + let found = make_test_string("minimal"); + + let result = format_table_with_mode(&[found], &make_metadata(), true).unwrap(); + // Should not crash and should contain the string + assert!(result.contains("minimal")); + } + + mod column_width_tests { + use super::*; + + #[test] + fn section_width_minimum() { + let strings = vec![make_test_string("test")]; + let width = calculate_section_width(&strings); + assert_eq!(width, "Section".len()); + } + + #[test] + fn section_width_from_content() { + let strings = vec![make_test_string("test").with_section(".rodata.str1.1".to_string())]; + let width = calculate_section_width(&strings); + assert_eq!(width, ".rodata.str1.1".len()); + } + + #[test] + fn section_width_capped_at_max() { + let long_section = "a".repeat(50); + let strings = vec![make_test_string("test").with_section(long_section)]; + let width = calculate_section_width(&strings); + assert_eq!(width, SECTION_COLUMN_WIDTH); + } + + #[test] + fn tags_width_minimum() { + let strings = vec![make_test_string("test")]; + let width = 
calculate_tags_width(&strings); + assert_eq!(width, "Tags".len()); + } + + #[test] + fn tags_width_from_content() { + let mut found = make_test_string("test"); + found.tags = vec![Tag::Url, Tag::Domain]; + let width = calculate_tags_width(&[found]); + assert_eq!(width, "Tags".len()); + } + } +} From 5c53d91d4d711e1f9e3d9785fc95136216a12073 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 18 Jan 2026 15:14:26 -0500 Subject: [PATCH 14/25] fix(yara): correct UTF-16LE encoding and prevent injection attacks Fix two issues in YARA output formatter: 1. UTF-16LE non-ASCII handling: The `wide` modifier only works for ASCII characters. Non-ASCII UTF-16LE strings now use hex encoding instead of the incorrect `wide` modifier approach. - Add `utf16le_hex_string` helper (mirrors `utf16be_hex_string`) - Branch on `is_ascii()` to choose appropriate encoding 2. Injection prevention: Escape `binary_name` and `timestamp` in comment headers and meta fields to prevent newline/quote injection attacks. Add 8 new tests covering: - UTF-16LE hex string encoding (basic, empty, non-ASCII, surrogate pairs) - ASCII UTF-16LE still uses wide modifier - Non-ASCII UTF-16LE uses hex encoding - Binary name injection escaping - Timestamp injection escaping Co-Authored-By: Claude Opus 4.5 --- ROADMAP.md | 167 ++++++++++++++++++++------------------------- src/output/yara.rs | 125 +++++++++++++++++++++++++++++++-- 2 files changed, 195 insertions(+), 97 deletions(-) diff --git a/ROADMAP.md b/ROADMAP.md index 30c8c58..bc4676c 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -7,94 +7,88 @@ This document tracks medium-term and long-term improvements identified during th ### Architecture Improvements #### 1. 
Split `extraction/mod.rs` into smaller modules -**Priority:** High -**Current state:** 1542 lines (exceeds 500-line project limit by 1042 lines) -**Files affected:** `src/extraction/mod.rs` + +**Priority:** High **Current state:** 1542 lines (exceeds 500-line project limit by 1042 lines) **Files affected:** `src/extraction/mod.rs` Recommended split: + - `src/extraction/config.rs` - Move `ExtractionConfig` and validation logic - `src/extraction/trait.rs` - Move `StringExtractor` trait definition - `src/extraction/basic.rs` - Move `BasicExtractor` implementation - `src/extraction/helpers.rs` - Move internal helper functions (`is_printable_text_byte`, `could_be_utf8_byte`, `extract_ascii_utf8_strings`) Other oversized files to address: -| File | Lines | Overage | -|------|-------|---------| -| `src/extraction/pe_resources.rs` | 1449 | +949 | -| `src/extraction/utf16.rs` | 1273 | +773 | -| `src/extraction/dedup.rs` | 849 | +349 | -| `src/extraction/ascii.rs` | 832 | +332 | -| `src/output/table.rs` | 708 | +208 | -| `src/extraction/filters.rs` | 702 | +202 | -| `src/container/pe.rs` | 661 | +161 | -| `src/container/elf.rs` | 627 | +127 | -| `src/container/macho.rs` | 574 | +74 | -| `src/types.rs` | 558 | +58 | + +| File | Lines | Overage | +| -------------------------------- | ----- | ------- | +| `src/extraction/pe_resources.rs` | 1449 | +949 | +| `src/extraction/utf16.rs` | 1273 | +773 | +| `src/extraction/dedup.rs` | 849 | +349 | +| `src/extraction/ascii.rs` | 832 | +332 | +| `src/output/table.rs` | 708 | +208 | +| `src/extraction/filters.rs` | 702 | +202 | +| `src/container/pe.rs` | 661 | +161 | +| `src/container/elf.rs` | 627 | +127 | +| `src/container/macho.rs` | 574 | +74 | +| `src/types.rs` | 558 | +58 | #### 2. 
Move PE resources to container module -**Priority:** Medium -**Current state:** `src/extraction/pe_resources.rs` is in extraction but conceptually belongs in container -**Rationale:** PE resource parsing is part of container analysis, not string extraction + +**Priority:** Medium **Current state:** `src/extraction/pe_resources.rs` is in extraction but conceptually belongs in container **Rationale:** PE resource parsing is part of container analysis, not string extraction #### 3. Decouple semantic enrichment from extraction -**Priority:** Medium -**Current state:** `extraction` module imports from `classification` creating bidirectional dependency -**Files affected:** `src/extraction/mod.rs:129` -**Recommendation:** Move semantic enrichment to an orchestration layer that callers control + +**Priority:** Medium **Current state:** `extraction` module imports from `classification` creating bidirectional dependency **Files affected:** `src/extraction/mod.rs:129` **Recommendation:** Move semantic enrichment to an orchestration layer that callers control #### 4. Add `#[non_exhaustive]` to remaining public enums -**Priority:** Medium -**Files affected:** + +**Priority:** Medium **Files affected:** + - `src/types.rs:4-10` - `Encoding` enum - `src/types.rs:130-136` - `BinaryFormat` enum ### Error Handling #### 5. Add `SerializationError` variant to `StringyError` -**Priority:** Medium -**Current state:** `ConfigError` is incorrectly used for JSON serialization failures -**Files affected:** `src/output/json.rs:14-16`, `src/types.rs` + +**Priority:** Medium **Current state:** `ConfigError` is incorrectly used for JSON serialization failures **Files affected:** `src/output/json.rs:14-16`, `src/types.rs` #### 6. 
Add format-specific error variants -**Priority:** Low -**Recommendation:** Add `InvalidPeError`, `InvalidElfError`, `InvalidMachOError` instead of generic `ParseError(String)` + +**Priority:** Low **Recommendation:** Add `InvalidPeError`, `InvalidElfError`, `InvalidMachOError` instead of generic `ParseError(String)` ### API Improvements #### 7. Add constructors to remaining public structs -**Priority:** Medium -**Files affected:** `src/types.rs` -**Structs needing constructors:** `ImportInfo`, `ExportInfo`, `SectionInfo` -**Rationale:** Required for `#[non_exhaustive]` compatibility + +**Priority:** Medium **Files affected:** `src/types.rs` **Structs needing constructors:** `ImportInfo`, `ExportInfo`, `SectionInfo` **Rationale:** Required for `#[non_exhaustive]` compatibility #### 8. Add `#[allow]` justification comments -**Priority:** Low -**Files affected:** + +**Priority:** Low **Files affected:** + - `src/extraction/utf16.rs:334` - `#[allow(clippy::result_unit_err)]` - `src/extraction/utf16.rs:350` - `#[allow(dead_code)]` ### Documentation #### 9. Update API documentation for accuracy -**Priority:** Medium -**Files affected:** `docs/src/api.md` -**Issues:** Some function signatures don't match actual implementation + +**Priority:** Medium **Files affected:** `docs/src/api.md` **Issues:** Some function signatures don't match actual implementation #### 10. Add security considerations to README -**Priority:** Medium -**Content to add:** Document malware analysis use case, safe handling of untrusted binaries + +**Priority:** Medium **Content to add:** Document malware analysis use case, safe handling of untrusted binaries #### 11. Document deduplication feature in user docs -**Priority:** Medium -**Files affected:** README.md, `docs/src/string-extraction.md` + +**Priority:** Medium **Files affected:** README.md, `docs/src/string-extraction.md` ### Performance #### 12. 
Add memory mapping for large files -**Priority:** High -**Current state:** Entire file is loaded into memory -**Impact:** Processing 1GB+ binaries requires 1GB+ RAM -**Recommendation:** Use `memmap2` crate for memory-mapped file access + +**Priority:** High **Current state:** Entire file is loaded into memory **Impact:** Processing 1GB+ binaries requires 1GB+ RAM **Recommendation:** Use `memmap2` crate for memory-mapped file access ```rust // Recommended approach @@ -107,26 +101,22 @@ let data: &[u8] = &mmap; ``` #### 13. Optimize redundant regex matching -**Priority:** Low -**Files affected:** `src/classification/patterns/network.rs:92-106` -**Issue:** URL_REGEX runs twice on URLs (in `classify_url` then `classify_domain`) + +**Priority:** Low **Files affected:** `src/classification/patterns/network.rs:92-106` **Issue:** URL_REGEX runs twice on URLs (in `classify_url` then `classify_domain`) ### Testing #### 14. Set up code coverage metrics -**Priority:** Medium -**Tool:** `cargo-tarpaulin` -**Command:** `cargo tarpaulin --out Html` + +**Priority:** Medium **Tool:** `cargo-tarpaulin` **Command:** `cargo tarpaulin --out Html` #### 15. Add performance benchmarks -**Priority:** Medium -**Tool:** `criterion` -**Focus areas:** Deduplication with large input sets, regex pattern matching + +**Priority:** Medium **Tool:** `criterion` **Focus areas:** Deduplication with large input sets, regex pattern matching #### 16. Add fuzzing for binary parsers -**Priority:** Medium -**Tool:** `cargo-fuzz` -**Targets:** `container/*.rs` parsers with malformed input + +**Priority:** Medium **Tool:** `cargo-fuzz` **Targets:** `container/*.rs` parsers with malformed input --- @@ -135,8 +125,8 @@ let data: &[u8] = &mmap; ### Performance Optimizations #### 17. 
Consider parallel extraction with rayon -**Priority:** Low -**Rationale:** Section-by-section extraction is embarrassingly parallel + +**Priority:** Low **Rationale:** Section-by-section extraction is embarrassingly parallel ```rust use rayon::prelude::*; @@ -148,45 +138,38 @@ let section_strings: Vec> = sections ``` #### 18. Consider `Cow` for hot paths -**Priority:** Low -**Files affected:** `src/types.rs:236-237` -**Benefit:** Avoid cloning when strings could be borrowed + +**Priority:** Low **Files affected:** `src/types.rs:236-237` **Benefit:** Avoid cloning when strings could be borrowed #### 19. Consider `SmallVec` for tags -**Priority:** Low -**Field:** `FoundString::tags` -**Rationale:** Typical 0-3 tags could use stack allocation with `SmallVec<[Tag; 4]>` + +**Priority:** Low **Field:** `FoundString::tags` **Rationale:** Typical 0-3 tags could use stack allocation with `SmallVec<[Tag; 4]>` ### Dependency Management #### 20. Migrate to `std::sync::LazyLock` -**Priority:** Low -**Current state:** Uses `once_cell::sync::Lazy` -**Target:** `std::sync::LazyLock` (stabilized in Rust 1.80) -**Files affected:** All files in `src/classification/patterns/` + +**Priority:** Low **Current state:** Uses `once_cell::sync::Lazy` **Target:** `std::sync::LazyLock` (stabilized in Rust 1.80) **Files affected:** All files in `src/classification/patterns/` ### Feature Enhancements #### 21. Implement main CLI -**Priority:** High -**Current state:** `src/main.rs` is a stub with TODO -**File:** `src/main.rs:18` + +**Priority:** High **Current state:** `src/main.rs` is a stub with TODO **File:** `src/main.rs:18` #### 22. Integrate Mach-O load command strings -**Priority:** Medium -**Current state:** Feature exists but not integrated into main pipeline -**File:** `src/container/macho.rs:198` + +**Priority:** Medium **Current state:** Feature exists but not integrated into main pipeline **File:** `src/container/macho.rs:198` #### 23. 
Parse all Mach-O architectures -**Priority:** Low -**Current state:** Only parses first architecture in fat binaries -**File:** `src/container/macho.rs:312` + +**Priority:** Low **Current state:** Only parses first architecture in fat binaries **File:** `src/container/macho.rs:312` ### Build Configuration #### 24. Add feature flags for output formats -**Priority:** Low -**File:** `Cargo.toml` + +**Priority:** Low **File:** `Cargo.toml` ```toml [features] @@ -197,8 +180,8 @@ table = [] ``` #### 25. Add `include` field to Cargo.toml -**Priority:** Low -**Purpose:** Control what gets published to crates.io + +**Priority:** Low **Purpose:** Control what gets published to crates.io ```toml [package] @@ -226,14 +209,14 @@ The following issues from the comprehensive review have been addressed: **Overall Rating from Comprehensive Review: B+ (85/100)** -| Dimension | Rating | -|-----------|--------| -| Code Quality | B+ | -| Architecture | B+ | -| Security | A | -| Performance | B | -| Testing | B+ | -| Documentation | B+ | -| Best Practices | A- | +| Dimension | Rating | +| -------------- | ------ | +| Code Quality | B+ | +| Architecture | B+ | +| Security | A | +| Performance | B | +| Testing | B+ | +| Documentation | B+ | +| Best Practices | A- | With the immediate issues addressed and medium-term improvements completed, this project would be ready for a stable 1.0 release. 
diff --git a/src/output/yara.rs b/src/output/yara.rs index 099fcbe..ca2bfe7 100644 --- a/src/output/yara.rs +++ b/src/output/yara.rs @@ -15,8 +15,14 @@ pub fn format_yara(strings: &[FoundString], metadata: &OutputMetadata) -> Result let mut output = String::new(); output.push_str("// YARA rule generated by Stringy\n"); - output.push_str(&format!("// Binary: {}\n", metadata.binary_name)); - output.push_str(&format!("// Generated: {}\n\n", timestamp)); + output.push_str(&format!( + "// Binary: {}\n", + escape_yara_string(&metadata.binary_name) + )); + output.push_str(&format!( + "// Generated: {}\n\n", + escape_yara_string(×tamp) + )); output.push_str(&format!("rule {} {{\n", rule_name)); output.push_str(" meta:\n"); @@ -25,7 +31,10 @@ pub fn format_yara(strings: &[FoundString], metadata: &OutputMetadata) -> Result escape_yara_string(&metadata.binary_name) )); output.push_str(" generated_by = \"stringy\"\n"); - output.push_str(&format!(" generated_at = \"{}\"\n", timestamp)); + output.push_str(&format!( + " generated_at = \"{}\"\n", + escape_yara_string(×tamp) + )); if strings.is_empty() { output.push_str(" condition:\n"); @@ -64,8 +73,14 @@ pub fn format_yara(strings: &[FoundString], metadata: &OutputMetadata) -> Result strings_block.push_str(&format!(" {} = {}\n", var_name, hex)); } Encoding::Utf16Le => { - let escaped = escape_yara_unicode_literal(&item.text); - strings_block.push_str(&format!(" {} = \"{}\" wide\n", var_name, escaped)); + if item.text.is_ascii() { + let escaped = escape_yara_unicode_literal(&item.text); + strings_block + .push_str(&format!(" {} = \"{}\" wide\n", var_name, escaped)); + } else { + let hex = utf16le_hex_string(&item.text); + strings_block.push_str(&format!(" {} = {}\n", var_name, hex)); + } } Encoding::Ascii | Encoding::Utf8 => { let escaped = escape_yara_string(&item.text); @@ -193,6 +208,20 @@ fn utf16be_hex_string(text: &str) -> String { format!("{{ {} }}", hex_bytes.join(" ")) } +fn utf16le_hex_string(text: &str) -> String { + 
let hex_bytes: Vec = text + .encode_utf16() + .flat_map(|unit| unit.to_le_bytes()) + .map(|b| format!("{:02x}", b)) + .collect(); + + if hex_bytes.is_empty() { + return "{ }".to_string(); + } + + format!("{{ {} }}", hex_bytes.join(" ")) +} + fn tag_name(tag: &Tag) -> &'static str { match tag { Tag::Url => "Url", @@ -418,4 +447,90 @@ mod tests { "Timestamp should be numeric or CLOCK_ERROR" ); } + + #[test] + fn test_utf16le_hex_string_basic() { + // Basic ASCII - should be little-endian (ASCII byte followed by 00) + assert_eq!(utf16le_hex_string("A"), "{ 41 00 }"); + assert_eq!(utf16le_hex_string("AB"), "{ 41 00 42 00 }"); + } + + #[test] + fn test_utf16le_hex_string_empty() { + assert_eq!(utf16le_hex_string(""), "{ }"); + } + + #[test] + fn test_utf16le_hex_string_non_ascii() { + // Non-ASCII Unicode (BMP) - Chinese character U+4E2D + let chinese = utf16le_hex_string("\u{4E2D}"); + assert_eq!(chinese, "{ 2d 4e }"); + } + + #[test] + fn test_utf16le_hex_string_surrogate_pair() { + // Character requiring surrogate pair (outside BMP) - emoji U+1F600 + let emoji = utf16le_hex_string("\u{1F600}"); + // Should produce surrogate pair: 3D D8 00 DE (little-endian) + assert_eq!(emoji, "{ 3d d8 00 de }"); + } + + #[test] + fn test_utf16le_ascii_uses_wide_modifier() { + // ASCII UTF-16LE should use "wide" modifier + let mut string = make_string("test"); + string.encoding = Encoding::Utf16Le; + let output = format_yara(&[string], &make_metadata()).expect("Formatting should succeed"); + assert!( + output.contains("wide"), + "ASCII UTF-16LE should use wide modifier" + ); + assert!(output.contains("\"test\"")); + } + + #[test] + fn test_utf16le_non_ascii_uses_hex() { + // Non-ASCII UTF-16LE should use hex string, not wide modifier + let mut string = make_string("\u{4E2D}\u{6587}"); + string.encoding = Encoding::Utf16Le; + let output = format_yara(&[string], &make_metadata()).expect("Formatting should succeed"); + assert!( + !output.contains("wide"), + "Non-ASCII UTF-16LE should 
not use wide modifier" + ); + assert!( + output.contains("{ 2d 4e 87 65 }"), + "Non-ASCII UTF-16LE should use hex encoding" + ); + } + + #[test] + fn test_binary_name_injection_escaped_in_comments() { + // Binary name with newlines should be escaped in comments + let mut metadata = make_metadata(); + metadata.binary_name = "evil\nname".to_string(); + let output = format_yara(&[], &metadata).expect("Formatting should succeed"); + // Should contain escaped newline, not literal + assert!( + output.contains("evil\\nname"), + "Newlines in binary_name should be escaped" + ); + assert!( + !output.contains("evil\nname"), + "Literal newlines should not appear" + ); + } + + #[test] + fn test_timestamp_injection_escaped_in_meta() { + // Timestamp with special characters should be escaped + let mut metadata = make_metadata(); + metadata.generated_at = Some("2024\"\n//attack".to_string()); + let output = format_yara(&[], &metadata).expect("Formatting should succeed"); + // Should contain escaped characters + assert!( + output.contains("2024\\\"\\n//attack"), + "Special chars in timestamp should be escaped" + ); + } } From f4388bebf2f33fe92f9cc55c95a71960344f2cdb Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 18 Jan 2026 15:32:02 -0500 Subject: [PATCH 15/25] chore(tests): add comprehensive testing strategy analysis Signed-off-by: UncleSp1d3r --- TESTING_ANALYSIS.md | 499 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 499 insertions(+) create mode 100644 TESTING_ANALYSIS.md diff --git a/TESTING_ANALYSIS.md b/TESTING_ANALYSIS.md new file mode 100644 index 0000000..dee19ca --- /dev/null +++ b/TESTING_ANALYSIS.md @@ -0,0 +1,499 @@ +# Stringy Testing Strategy Analysis + +## Executive Summary + +### Overall Test Health: STRONG with Minor Gaps + +- **Total Tests**: 535 tests (280 unit + 219 integration + 36 ignored/doctest) +- **Test Pass Rate**: 98.9% (529 passed, 6 failed/ignored) +- **Test Coverage**: 6,106 test lines vs 14,138 source lines (43% ratio) +- **Test 
Modules**: 24 modules with unit tests +- **Fixtures**: 5 binary fixtures (ELF, Mach-O, PE with/without resources) + +## Test Distribution Analysis + +### Unit Tests (280 tests, 24 modules) + +**Coverage by Module**: + +- `classification/` - 70 tests (patterns, ranking, symbols, semantic) +- `container/` - 42 tests (ELF, PE, Mach-O parsers) +- `extraction/` - 95 tests (ASCII, UTF-16, dedup, filters, resources) +- `output/` - 51 tests (JSON, YARA, table formatters) +- `types.rs` - 4 tests (serialization/deserialization) + +### Integration Tests (219 tests, 13 test files) + +**Test Files**: + +1. `integration_elf.rs` (10 tests) - ELF parsing and extraction +2. `integration_extraction.rs` (9 tests) - End-to-end extraction +3. `integration_macho.rs` (15 tests) - Mach-O parsing and load commands +4. `integration_pe.rs` (22 tests) - PE parsing and resource extraction +5. `test_ascii_extraction.rs` (14 tests) - ASCII extraction scenarios +6. `test_ascii_integration.rs` (14 tests) - ASCII integration tests +7. `test_deduplication.rs` (5 tests) - Deduplication workflows +8. `test_noise_filters.rs` (9 tests) - Noise filtering heuristics +9. `test_utf16_extraction.rs` (5 tests) - UTF-16 extraction +10. `classification_integration.rs` (27 tests) - Semantic classification +11. `output_json_integration.rs` (41 tests) - JSON output format +12. `output_table_integration.rs` (27 tests) - Table output format +13. 
`output_yara_integration.rs` (41 tests) - YARA rule generation + +### Test Infrastructure + +**Snapshot Testing**: Using `insta` for output validation + +- JSON output snapshots +- YARA rule snapshots +- Table format snapshots + +**Test Fixtures**: Well-organized in `tests/fixtures/` + +- Source code (`test_binary.c`) +- ELF binary (`test_binary_elf`) +- Mach-O binary (`test_binary_macho`) +- PE binary (`test_binary_pe.exe`) +- PE with resources (`test_binary_with_resources.exe`) +- Resource definition files (`.rc`, `.res`) +- Comprehensive README with rebuild instructions + +## Critical Findings + +### 1. Doctest Failures (2 failures) + +**Issue**: Two doctests failing due to missing error handling in example code + +```text +src\extraction\mod.rs - extraction::StringExtractor (line 318) +src\extraction\mod.rs - extraction::BasicExtractor (line 408) +``` + +**Problem**: Doctests use `?` operator without proper return type: + +```rust +fn main() { // Should be: fn main() -> Result<(), Box> { + let data = std::fs::read("binary_file")?; // Error: can't use ? in fn returning () + ... +} +``` + +**Severity**: MEDIUM - Documentation examples don't compile, misleading users + +**Fix Required**: Add proper return types to doctest main functions + +### 2. 
Performance/Large Input Tests Missing + +**Critical Gap**: No tests for O(n^2) algorithms identified in previous phase + +**Affected Code**: + +- `src/extraction/dedup.rs:183-188` - Cross-section deduplication (vector contains) +- `src/extraction/dedup.rs:222-231` - Tag merging (vector contains) + +**Current Dedup Tests**: + +- `test_deduplication_with_basic_extractor` - Small input (6 strings) +- `test_deduplication_metadata_preservation` - Small input (2 strings) +- `test_deduplication_with_real_fixture` - Uses test fixture (unknown size) +- `test_deduplication_score_bonuses` - 2 strings +- `test_extract_canonical_preserves_occurrences` - Small input + +**Missing Coverage**: + +- No tests with 1,000+ duplicate strings +- No performance regression tests +- No benchmark for deduplication scalability + +**Severity**: HIGH - Performance bottlenecks not validated + +**Recommendation**: Add performance tests for large inputs + +### 3. Main Binary Untested + +**Issue**: `src/main.rs` has no tests (stub implementation) + +```rust +fn main() -> Result<(), Box> { + let _cli = Cli::parse(); + + // TODO: Implement main extraction pipeline + println!("Stringy - Binary string extraction tool"); + println!("Implementation coming soon..."); + + Ok(()) +} +``` + +**Severity**: LOW - Main is a stub, library is well-tested + +**Impact**: End-to-end CLI testing not possible until main is implemented + +### 4. Bounds Checking Coverage + +**Question from Previous Phase**: Are bounds checks in `extraction/mod.rs:688-699` tested? 
+ +```rust +if section_offset >= data.len() { + return Ok(Vec::new()); +} + +let end_offset = section_offset + .checked_add(section_size) + .unwrap_or(data.len()) + .min(data.len()); +``` + +**Test Coverage Analysis**: + +- `test_string_at_section_boundary` in `test_ascii_extraction.rs:76-100` - Tests section boundary extraction +- `test_extract_from_section_basic` in integration tests - Tests basic section extraction +- `integration_extraction.rs` - Multiple boundary tests + +**Verdict**: PARTIALLY COVERED + +- Boundary conditions tested +- Edge case: Section offset beyond data length - NEEDS EXPLICIT TEST +- Edge case: Section size overflow - NEEDS EXPLICIT TEST + +**Missing Test Cases**: + +```rust +#[test] +fn test_section_beyond_file_boundary() { + // Section offset > data.len() +} + +#[test] +fn test_section_size_overflow() { + // section_offset + section_size overflows +} +``` + +**Severity**: MEDIUM - Edge cases not explicitly validated + +## Test Quality Metrics + +### 1. Assertion Density + +**Good Examples**: + +- `classification/patterns/` - High density (multiple assertions per test) +- `output/yara.rs` tests - Comprehensive validation of output format +- `extraction/dedup.rs` tests - Multiple assertions for score calculation + +**Average Tests per Module**: + +- Classification: 2.9 tests per function +- Extraction: 2.1 tests per function +- Output: 3.5 tests per function + +**Verdict**: GOOD - Adequate test coverage per module + +### 2. 
Edge Case Coverage + +**Well-Tested Edge Cases**: + +- Empty input (`test_empty_input`, `test_empty_strings_produces_minimal_rule`) +- Null/zero values (`test_no_valid_strings`) +- Boundary conditions (`test_string_at_section_boundary`, `test_boundary_conditions`) +- Unicode edge cases (`test_truncate_string_unicode_at_boundary`, `test_escape_yara_unicode_literal_empty`) +- Threshold boundaries (`test_entropy_filter_edge_cases`) + +**Missing Edge Cases**: + +- Large input (1,000+ strings) - NO TESTS +- Malformed binaries - LIMITED TESTS +- Section size overflow - NO EXPLICIT TEST +- Memory exhaustion scenarios - NO TESTS + +**Verdict**: GOOD for typical cases, WEAK for extreme cases + +### 3. Test Isolation + +**Positive Findings**: + +- Each test creates its own test data +- No shared mutable state +- Fixtures are read-only +- Tests can run in parallel (proven by test suite execution) + +**Verdict**: EXCELLENT - Tests are properly isolated + +### 4. Regression Protection + +**Snapshot Testing**: + +- `insta` used for output format validation +- JSON, YARA, table outputs have snapshot tests +- Changes to output format require explicit snapshot updates + +**Verdict**: EXCELLENT - Good regression protection via snapshots + +## Coverage Gaps by Priority + +### HIGH Priority Gaps + +1. **Performance Tests for Deduplication** + - Test with 10,000+ duplicate strings + - Validate O(n^2) algorithms don't cause timeout + - File: `tests/test_deduplication_performance.rs` (MISSING) + +2. **Doctest Fixes** + - Fix `extraction::StringExtractor` doctest (line 318) + - Fix `extraction::BasicExtractor` doctest (line 408) + - Files: `src/extraction/mod.rs` + +3. **Bounds Checking Edge Cases** + - Section offset beyond file boundary + - Section size causing integer overflow + - File: `tests/test_extraction_edge_cases.rs` (MISSING) + +### MEDIUM Priority Gaps + +1. 
**Malformed Binary Handling** + - Truncated ELF headers + - Invalid PE signatures + - Corrupted Mach-O load commands + - File: `tests/test_malformed_binaries.rs` (MISSING) + +2. **Regex Pattern Edge Cases** + - URL regex with edge cases (IPv6 in URLs, Unicode domains) + - Email regex with uncommon formats + - Path regex with UNC paths edge cases + - Files: Pattern test modules (PARTIAL) + +3. **Resource Extraction Error Paths** + - PE resource directory corruption + - Version info parsing failures + - String table malformed data + - File: `src/extraction/pe_resources.rs` tests (PARTIAL) + +### LOW Priority Gaps + +1. **Main Binary CLI Testing** + - Integration tests for CLI argument parsing + - File: `tests/cli_integration.rs` (MISSING, but main is stub) + +2. **Memory Leak Tests** + - Large file processing without memory growth + - File: Performance test suite (MISSING) + +3. **Concurrency Tests** + - Parallel extraction from multiple files + - Thread safety validation + - File: Concurrency test suite (MISSING) + +## Test Infrastructure Assessment + +### Strengths + +1. **Excellent Fixture Management** + - Well-documented rebuild process + - Multiple binary formats covered + - Source code available for reproduction + +2. **Comprehensive Integration Tests** + - 219 integration tests covering end-to-end scenarios + - Real binary fixtures used + - All output formats tested + +3. **Snapshot Testing** + - `insta` framework well-utilized + - Output format changes tracked + - Easy to review snapshot diffs + +4. **Test Organization** + - Clear separation: unit vs integration + - Logical grouping by functionality + - Consistent naming conventions + +### Weaknesses + +1. **No Performance Benchmarks** + - No `criterion` benchmarks + - No performance regression detection + - Large input scenarios untested + +2. **No Fuzzing Tests** + - No `cargo-fuzz` integration + - Binary parsing not fuzz-tested + - String extraction not fuzz-tested + +3. 
**No Code Coverage Metrics** + - `cargo-tarpaulin` not installed + - No coverage reports in CI + - Unknown actual code coverage percentage + +4. **Limited Error Injection** + - Few tests for error paths + - Missing tests for resource failures + - I/O error handling not tested + +## Recommendations + +### Immediate Actions (Week 1) + +1. **Fix Doctest Failures** + + ```rust + // In src/extraction/mod.rs (line 318 and 408) + // Change: fn main() { + // To: fn main() -> Result<(), Box> { + // Add: Ok(()) at end of function + ``` + +2. **Add Performance Tests** + + ```rust + // tests/test_deduplication_performance.rs + #[test] + #[ignore] // Marked as ignored for normal runs + fn test_deduplication_large_input() { + // Test with 10,000 duplicate strings + } + ``` + +3. **Add Bounds Checking Tests** + + ```rust + // tests/test_extraction_edge_cases.rs + #[test] + fn test_section_beyond_boundary() { + // Section offset > data.len() + } + ``` + +### Short-term Improvements (Month 1) + +1. **Add Fuzzing** + - Install `cargo-fuzz` + - Fuzz container parsers (ELF, PE, Mach-O) + - Fuzz string extractors (ASCII, UTF-16) + +2. **Enable Code Coverage** + - Install `cargo-tarpaulin` + - Add coverage to CI pipeline + - Set coverage threshold (80% target) + +3. **Add Malformed Binary Tests** + - Create corrupted fixtures + - Test graceful error handling + - Verify no panics on invalid input + +### Long-term Enhancements (Quarter 1) + +1. **Performance Benchmarks** + - Add `criterion` benchmarks + - Track deduplication performance + - Track classification performance + - Add to CI for regression detection + +2. **Property-Based Testing** + - Add `proptest` or `quickcheck` + - Generate random binaries + - Verify invariants (no panics, valid output) + +3. **CLI Integration Tests** + - Implement main binary + - Add end-to-end CLI tests + - Test output redirection, error handling + +4. 
**Concurrency Tests** + - Test thread safety + - Test parallel file processing + - Validate no data races + +## Test Quality Score + +### Category Scores (0-10) + +- **Coverage Breadth**: 8/10 - Most code paths tested, some edge cases missing +- **Coverage Depth**: 7/10 - Good assertions, but performance/stress tests lacking +- **Test Isolation**: 10/10 - Excellent isolation, no shared state +- **Edge Case Coverage**: 6/10 - Common cases covered, extreme cases missing +- **Regression Protection**: 9/10 - Snapshot tests provide strong protection +- **Performance Testing**: 2/10 - No performance tests, benchmarks missing +- **Error Path Testing**: 6/10 - Some error paths tested, but incomplete +- **Documentation**: 7/10 - Good fixture docs, some doctests broken + +### Overall Score: 6.9/10 (GOOD) + +**Strengths**: + +- Strong unit and integration test coverage +- Excellent test isolation and organization +- Good snapshot testing for output formats +- Comprehensive fixture management + +**Critical Weaknesses**: + +- No performance/stress testing +- Missing large input validation +- No fuzzing or property-based testing +- Code coverage metrics unavailable + +## Comparison to Industry Standards + +### TDD Compliance + +**Current State**: PARTIAL TDD + +- Tests exist for all major features +- Good test-first evidence in git history +- Some features lack comprehensive edge case tests + +**TDD Cycle Metrics** (Not tracked): + +- Red-green-refactor cycle time: UNKNOWN +- Test-first compliance: ESTIMATED 60-70% +- Test growth rate: Not measured + +**Recommendation**: Add TDD metrics tracking + +### Test Pyramid Balance + +**Current Distribution**: + +- Unit Tests: 52% (280/535) - GOOD +- Integration Tests: 41% (219/535) - GOOD +- End-to-End Tests: 7% (36/535) - LOW (but main is stub) + +**Verdict**: BALANCED - Good unit/integration ratio + +### Industry Benchmarks + +- **Test-to-Code Ratio**: 43% (6,106 test lines / 14,138 src lines) - ACCEPTABLE (industry: 30-50%) +- **Test 
Count**: 535 tests for 14k LOC - GOOD (industry: ~1 test per 30 LOC) +- **Test Pass Rate**: 98.9% - EXCELLENT (industry: >95%) + +## Test Execution Performance + +**Test Suite Speed**: FAST + +- Unit tests: 0.04s (258 tests) +- Integration tests: ~1.5s (219 tests) +- Total execution: <20s including doctests + +**Verdict**: EXCELLENT - Fast feedback loop + +## Conclusion + +The Stringy project demonstrates **strong testing practices** with comprehensive unit and integration test coverage. The test suite provides good regression protection through snapshot testing and maintains excellent test isolation. + +**Key Strengths**: + +1. High test count (535 tests) +2. Well-organized test structure +3. Excellent fixture management +4. Fast test execution + +**Critical Improvements Needed**: + +1. Fix failing doctests (IMMEDIATE) +2. Add performance/stress tests (HIGH PRIORITY) +3. Add bounds checking edge case tests (MEDIUM PRIORITY) +4. Enable code coverage metrics (MEDIUM PRIORITY) +5. Add fuzzing for binary parsers (LONG-TERM) + +**Recommendation**: The test infrastructure is solid, but adding performance tests and fixing doctests should be immediate priorities before production release. 
From 7306f48295948290246868a64d79fd97e1b04322 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Mon, 19 Jan 2026 21:58:26 +0000 Subject: [PATCH 16/25] chore(devcontainer): update Docker features and remove unused ones - Added 'moby' configuration to Docker feature - Removed unused features from the devcontainer configuration - Added mise.toml for tool dependencies Signed-off-by: UncleSp1d3r --- .devcontainer/devcontainer.json | 20 +++----------------- mise.toml | 13 +++++++++++++ 2 files changed, 16 insertions(+), 17 deletions(-) create mode 100644 mise.toml diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index a935486..b145780 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -5,7 +5,8 @@ "ghcr.io/devcontainers/features/docker-outside-of-docker:1": { "installDockerBuildx": true, "version": "latest", - "dockerDashComposeVersion": "v2" + "dockerDashComposeVersion": "v2", + "moby": false }, "ghcr.io/devcontainers/features/github-cli:1": { "installDirectlyFromGitHubRelease": true, @@ -14,27 +15,12 @@ "ghcr.io/eitsupi/devcontainer-features/mdbook:1": { "version": "latest" }, - "ghcr.io/jsburckhardt/devcontainer-features/bat:1": {}, - "ghcr.io/jsburckhardt/devcontainer-features/just:1": {}, - "ghcr.io/lee-orr/rusty-dev-containers/cargo-audit:0": {}, - "ghcr.io/lee-orr/rusty-dev-containers/cargo-binstall:0": {}, - "ghcr.io/lee-orr/rusty-dev-containers/cargo-deny:0": {}, - "ghcr.io/lee-orr/rusty-dev-containers/cargo-llvm-cov:0": {}, - "ghcr.io/lee-orr/rusty-dev-containers/cargo-nextest:0": {}, - "ghcr.io/marcozac/devcontainer-features/goreleaser:1": { - "version": "latest" - }, "ghcr.io/devcontainers-extra/features/claude-code:1": { "version": "latest" }, "ghcr.io/devcontainers-extra/features/mise:1": { "version": "latest" - }, - "ghcr.io/devcontainers-extra/features/pre-commit:2": { - "version": "latest" - }, - "ghcr.io/roul/devcontainer-features/mise-node:1": {}, - 
"ghcr.io/roul/devcontainer-features/mise-python:1": {} + } }, "customizations": { "vscode": { diff --git a/mise.toml b/mise.toml new file mode 100644 index 0000000..58e2eee --- /dev/null +++ b/mise.toml @@ -0,0 +1,13 @@ +[tools] +actionlint = "1.7.10" +cargo-binstall = "1.16.7" +cargo-insta = "1.46.1" +claude = "latest" +goreleaser = "2.13.3" +just = "1.46.0" +markdownlint-cli2 = "0.20.0" +mdbook = "0.5.2" +pre-commit = "4.5.1" +prettier = "3.8.0" +python = "3.14.2" +rust = "1.92.0" From d52047a5ae73a3912b9c68506d382faa0143b8fe Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Mon, 19 Jan 2026 22:20:49 +0000 Subject: [PATCH 17/25] chore(setup): update setup commands and add mise installation Signed-off-by: UncleSp1d3r --- justfile | 41 +++++++++++++++++++++++++---------------- mise.toml | 2 ++ 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/justfile b/justfile index 96c4802..34a4d7a 100644 --- a/justfile +++ b/justfile @@ -53,37 +53,52 @@ rmrf path: # Development setup [windows] setup: - Set-Location "{{ root }}" + mise trust + mise install rustup component add rustfmt clippy llvm-tools-preview - cargo install cargo-binstall --locked @just mdformat-install Write-Host "Note: You may need to restart your shell for pipx PATH changes to take effect" [unix] setup: - cd "{{ root }}" + mise trust + mise install rustup component add rustfmt clippy llvm-tools-preview - cargo install cargo-binstall --locked @just mdformat-install echo "Note: You may need to restart your shell for pipx PATH changes to take effect" -# Install development tools (extended setup) +# Install tool versions defined in mise.toml +[windows] +mise-install: + mise trust + mise install + +[unix] +mise-install: + mise trust + mise install + +# Install development tools not managed by mise [windows] install-tools: + @just mise-install cargo binstall --disable-telemetry cargo-llvm-cov cargo-audit cargo-deny cargo-dist cargo-release cargo-cyclonedx cargo-auditable cargo-nextest --locked [unix] 
install-tools: + @just mise-install cargo binstall --disable-telemetry cargo-llvm-cov cargo-audit cargo-deny cargo-dist cargo-release cargo-cyclonedx cargo-auditable cargo-nextest --locked -# Install mdBook and plugins for documentation +# Install mdBook plugins for documentation [windows] docs-install: - cargo binstall mdbook mdbook-admonish mdbook-mermaid mdbook-linkcheck mdbook-toc mdbook-open-on-gh mdbook-tabs mdbook-i18n-helpers + @just mise-install + cargo binstall mdbook-admonish mdbook-mermaid mdbook-linkcheck mdbook-toc mdbook-open-on-gh mdbook-tabs mdbook-i18n-helpers [unix] docs-install: - cargo binstall mdbook mdbook-admonish mdbook-mermaid mdbook-linkcheck mdbook-toc mdbook-open-on-gh mdbook-tabs mdbook-i18n-helpers + @just mise-install + cargo binstall mdbook-admonish mdbook-mermaid mdbook-linkcheck mdbook-toc mdbook-open-on-gh mdbook-tabs mdbook-i18n-helpers # Install pipx for Python tool management [windows] @@ -132,7 +147,7 @@ format: fmt format-json-yaml format-docs fmt-justfile # Individual format recipes format-json-yaml: - npx prettier --write "**/*.{json,yaml,yml}" + prettier --write "**/*.{json,yaml,yml}" [windows] format-docs: @@ -140,7 +155,6 @@ format-docs: [unix] format-docs: - cd "{{ root }}" @if command -v mdformat >/dev/null 2>&1; then find . -type f -name "*.md" -not -path "./target/*" -not -path "./node_modules/*" -exec mdformat {} + ; else echo "mdformat not found. 
Run 'just mdformat-install' first."; fi fmt: @@ -191,10 +205,9 @@ pre-commit-run: # Format a single file (for pre-commit hooks) format-files +FILES: - npx prettier --write --config .prettierrc.json {{ FILES }} + prettier --write --config .prettierrc.json {{ FILES }} megalinter: - cd "{{ root }}" npx mega-linter-runner --flavor rust # ============================================================================= @@ -213,26 +226,22 @@ test: # Test justfile cross-platform functionality [windows] test-justfile: - Set-Location "{{ root }}" $p = (Get-Location).Path; Write-Host "Current directory: $p"; Write-Host "Expected directory: {{ root }}" [unix] test-justfile: - cd "{{ root }}" /bin/echo "Current directory: $(pwd -P)" /bin/echo "Expected directory: {{ root }}" # Test cross-platform file system helpers [windows] test-fs: - Set-Location "{{ root }}" @just rmrf tmp/xfstest @just ensure-dir tmp/xfstest/sub @just rmrf tmp/xfstest [unix] test-fs: - cd "{{ root }}" @just rmrf tmp/xfstest @just ensure-dir tmp/xfstest/sub @just rmrf tmp/xfstest diff --git a/mise.toml b/mise.toml index 58e2eee..94ebd4d 100644 --- a/mise.toml +++ b/mise.toml @@ -3,6 +3,8 @@ actionlint = "1.7.10" cargo-binstall = "1.16.7" cargo-insta = "1.46.1" claude = "latest" +cyclonedx = "0.29.2" +git-cliff = "2.11.0" goreleaser = "2.13.3" just = "1.46.0" markdownlint-cli2 = "0.20.0" From 1cb3744c38b784d50c5c4590273cef2dac3fe910 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Mon, 19 Jan 2026 22:24:13 +0000 Subject: [PATCH 18/25] chore(cleanup): remove megalinter configurations and references Signed-off-by: UncleSp1d3r --- .gitignore | 2 -- .mdformat.toml | 4 +--- .mega-linter.yml | 48 ---------------------------------------------- cspell.config.yaml | 1 - justfile | 3 --- 5 files changed, 1 insertion(+), 57 deletions(-) delete mode 100644 .mega-linter.yml diff --git a/.gitignore b/.gitignore index 4b8b60f..98b1e83 100644 --- a/.gitignore +++ b/.gitignore @@ -121,8 +121,6 @@ docs/book/ .envrc .direnv/ 
-megalinter-reports/ - # Override global gitignore !bin/ # Added by goreleaser init: diff --git a/.mdformat.toml b/.mdformat.toml index 8f1e01d..57f1a18 100644 --- a/.mdformat.toml +++ b/.mdformat.toml @@ -7,7 +7,6 @@ exclude = [ "**/*.tpl.md", "**/CHANGELOG.md", "target/**", - "megalinter-reports/**", ] validate = true number = true @@ -26,5 +25,4 @@ extensions = [ [plugin.mkdocs] align_semantic_breaks_in_lists = true -ignore_missing_references = true - +ignore_missing_references = true diff --git a/.mega-linter.yml b/.mega-linter.yml deleted file mode 100644 index 37a81da..0000000 --- a/.mega-linter.yml +++ /dev/null @@ -1,48 +0,0 @@ ---- -# MegaLinter configuration for Stringy -# This configuration minimizes false positives while maintaining code quality - -# Apply linter fixes where safe -APPLY_FIXES: all - -# File/directory exclusions -EXCLUDED_DIRECTORIES: - - target - - dist - - build - - node_modules - - .git - - .cache - - coverage - - docs/book - - docs/build - -# All linters now properly configured - -# ActionLint configuration - suppress shellcheck issues in generated cargo-dist file -ACTION_ACTIONLINT_ARGUMENTS: - - --ignore=SC2086:info - - --ignore=SC2129:style - - --ignore=SC2001:style - -# File-specific exclusions for generated content -FILTER_REGEX_EXCLUDE: | - \.github/workflows/release\.yml - -# Lychee configuration for link checking -SPELL_LYCHEE_ARGUMENTS: - - --no-progress - - --exclude-loopback - - --exclude-private - - --exclude-mail - - --timeout=10 - -# Markdown table formatting exclusions -MARKDOWN_MARKDOWN_TABLE_FORMATTER_FILTER_REGEX_EXCLUDE: | - README\.md - -# Prettier configuration - respect .prettierignore -JSON_PRETTIER_ARGUMENTS: - - --check -YAML_PRETTIER_ARGUMENTS: - - --check diff --git a/cspell.config.yaml b/cspell.config.yaml index 89200af..dad0288 100644 --- a/cspell.config.yaml +++ b/cspell.config.yaml @@ -100,7 +100,6 @@ words: - mdformat - actionlint - lychee - - megalinter - cspell - justfile diff --git a/justfile 
b/justfile index 34a4d7a..359c785 100644 --- a/justfile +++ b/justfile @@ -207,9 +207,6 @@ pre-commit-run: format-files +FILES: prettier --write --config .prettierrc.json {{ FILES }} -megalinter: - npx mega-linter-runner --flavor rust - # ============================================================================= # BUILDING AND TESTING # ============================================================================= From 3b821e5cc7cc89279c22989428786638bbc41e39 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Mon, 19 Jan 2026 23:05:25 +0000 Subject: [PATCH 19/25] chore(mise): add node version to tools configuration Signed-off-by: UncleSp1d3r --- mise.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/mise.toml b/mise.toml index 94ebd4d..c162dd4 100644 --- a/mise.toml +++ b/mise.toml @@ -9,6 +9,7 @@ goreleaser = "2.13.3" just = "1.46.0" markdownlint-cli2 = "0.20.0" mdbook = "0.5.2" +node = "25.4.0" pre-commit = "4.5.1" prettier = "3.8.0" python = "3.14.2" From 704e7c5bfd3ceaf87acfe35cbad610fb09c134f7 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Mon, 19 Jan 2026 18:10:10 -0500 Subject: [PATCH 20/25] refactor(yara): split module to stay under 500-line limit Extract YARA string escaping utilities to separate escaping.rs module: - escape_yara_string, escape_yara_unicode_literal - utf16be_hex_string, utf16le_hex_string - All associated tests The original yara.rs was 536 lines, now split into: - yara/mod.rs (357 lines) - main formatting logic - yara/escaping.rs (204 lines) - escaping utilities Co-Authored-By: Claude Opus 4.5 --- src/output/yara/escaping.rs | 204 ++++++++++++++++++++++++++++ src/output/{yara.rs => yara/mod.rs} | 197 ++------------------------- 2 files changed, 214 insertions(+), 187 deletions(-) create mode 100644 src/output/yara/escaping.rs rename src/output/{yara.rs => yara/mod.rs} (64%) diff --git a/src/output/yara/escaping.rs b/src/output/yara/escaping.rs new file mode 100644 index 0000000..ad30421 --- /dev/null +++ 
b/src/output/yara/escaping.rs @@ -0,0 +1,204 @@ +//! YARA string escaping and encoding utilities. +//! +//! Provides functions for escaping strings and encoding them to hex formats +//! suitable for YARA rule strings. + +/// Escape a string for use in YARA string literals (ASCII/UTF-8). +/// +/// Handles control characters, backslashes, quotes, and non-printable bytes. +pub fn escape_yara_string(text: &str) -> String { + let mut escaped = String::new(); + for byte in text.as_bytes() { + match *byte { + b'\\' => escaped.push_str("\\\\"), + b'"' => escaped.push_str("\\\""), + b'\n' => escaped.push_str("\\n"), + b'\r' => escaped.push_str("\\r"), + b'\t' => escaped.push_str("\\t"), + 0x08 => escaped.push_str("\\b"), + 0x0b => escaped.push_str("\\x0b"), + 0x0c => escaped.push_str("\\x0c"), + 0x00..=0x1f | 0x7f..=0xff => { + escaped.push_str(&format!("\\x{:02x}", byte)); + } + _ => escaped.push(*byte as char), + } + } + escaped +} + +/// Escape a Unicode string for use with YARA's `wide` modifier. +/// +/// This preserves non-control Unicode characters while escaping control characters +/// and special YARA syntax characters. +pub fn escape_yara_unicode_literal(text: &str) -> String { + let mut escaped = String::new(); + for ch in text.chars() { + match ch { + '\\' => escaped.push_str("\\\\"), + '"' => escaped.push_str("\\\""), + '\n' => escaped.push_str("\\n"), + '\r' => escaped.push_str("\\r"), + '\t' => escaped.push_str("\\t"), + _ if ch.is_control() => { + let mut buf = [0; 4]; + let encoded = ch.encode_utf8(&mut buf); + for byte in encoded.as_bytes() { + escaped.push_str(&format!("\\x{:02x}", byte)); + } + } + _ => escaped.push(ch), + } + } + escaped +} + +/// Convert a string to UTF-16 big-endian hex format for YARA. +/// +/// Returns a hex string like `{ 00 41 00 42 }` for "AB". 
+pub fn utf16be_hex_string(text: &str) -> String { + let hex_bytes: Vec = text + .encode_utf16() + .flat_map(|unit| unit.to_be_bytes()) + .map(|b| format!("{:02x}", b)) + .collect(); + + if hex_bytes.is_empty() { + return "{ }".to_string(); + } + + format!("{{ {} }}", hex_bytes.join(" ")) +} + +/// Convert a string to UTF-16 little-endian hex format for YARA. +/// +/// Returns a hex string like `{ 41 00 42 00 }` for "AB". +pub fn utf16le_hex_string(text: &str) -> String { + let hex_bytes: Vec = text + .encode_utf16() + .flat_map(|unit| unit.to_le_bytes()) + .map(|b| format!("{:02x}", b)) + .collect(); + + if hex_bytes.is_empty() { + return "{ }".to_string(); + } + + format!("{{ {} }}", hex_bytes.join(" ")) +} + +#[cfg(test)] +mod tests { + use super::*; + + mod escape_yara_string_tests { + use super::*; + + #[test] + fn basic_escapes() { + let input = "quote\" backslash\\ line\n tab\t"; + let escaped = escape_yara_string(input); + assert!(escaped.contains("\\\"")); + assert!(escaped.contains("\\\\")); + assert!(escaped.contains("\\n")); + assert!(escaped.contains("\\t")); + } + + #[test] + fn control_characters() { + assert_eq!(escape_yara_string("\r"), "\\r"); + assert_eq!(escape_yara_string("\x00"), "\\x00"); + assert_eq!(escape_yara_string("\x08"), "\\b"); + assert_eq!(escape_yara_string("\x0b"), "\\x0b"); + assert_eq!(escape_yara_string("\x0c"), "\\x0c"); + assert_eq!(escape_yara_string("\x7f"), "\\x7f"); + } + } + + mod escape_yara_unicode_literal_tests { + use super::*; + + #[test] + fn basic_escapes() { + assert_eq!(escape_yara_unicode_literal("quote\""), "quote\\\""); + assert_eq!(escape_yara_unicode_literal("back\\slash"), "back\\\\slash"); + assert_eq!(escape_yara_unicode_literal("line\nbreak"), "line\\nbreak"); + assert_eq!(escape_yara_unicode_literal("tab\there"), "tab\\there"); + assert_eq!(escape_yara_unicode_literal("return\rhere"), "return\\rhere"); + } + + #[test] + fn control_chars_hex_escaped() { + assert_eq!(escape_yara_unicode_literal("\x00"), 
"\\x00"); + assert_eq!(escape_yara_unicode_literal("\x1f"), "\\x1f"); + } + + #[test] + fn unicode_passthrough() { + let result = escape_yara_unicode_literal("\u{4E2D}\u{6587}"); + assert!( + result.contains('\u{4E2D}'), + "Non-control Unicode should not be escaped" + ); + } + + #[test] + fn empty_string() { + assert_eq!(escape_yara_unicode_literal(""), ""); + } + } + + mod utf16be_hex_string_tests { + use super::*; + + #[test] + fn basic_ascii() { + assert_eq!(utf16be_hex_string("A"), "{ 00 41 }"); + assert_eq!(utf16be_hex_string("AB"), "{ 00 41 00 42 }"); + } + + #[test] + fn empty_string() { + assert_eq!(utf16be_hex_string(""), "{ }"); + } + + #[test] + fn non_ascii_unicode() { + let chinese = utf16be_hex_string("\u{4E2D}"); + assert_eq!(chinese, "{ 4e 2d }"); + } + + #[test] + fn surrogate_pair() { + let emoji = utf16be_hex_string("\u{1F600}"); + assert_eq!(emoji, "{ d8 3d de 00 }"); + } + } + + mod utf16le_hex_string_tests { + use super::*; + + #[test] + fn basic_ascii() { + assert_eq!(utf16le_hex_string("A"), "{ 41 00 }"); + assert_eq!(utf16le_hex_string("AB"), "{ 41 00 42 00 }"); + } + + #[test] + fn empty_string() { + assert_eq!(utf16le_hex_string(""), "{ }"); + } + + #[test] + fn non_ascii_unicode() { + let chinese = utf16le_hex_string("\u{4E2D}"); + assert_eq!(chinese, "{ 2d 4e }"); + } + + #[test] + fn surrogate_pair() { + let emoji = utf16le_hex_string("\u{1F600}"); + assert_eq!(emoji, "{ 3d d8 00 de }"); + } + } +} diff --git a/src/output/yara.rs b/src/output/yara/mod.rs similarity index 64% rename from src/output/yara.rs rename to src/output/yara/mod.rs index ca2bfe7..5043278 100644 --- a/src/output/yara.rs +++ b/src/output/yara/mod.rs @@ -1,4 +1,14 @@ +//! YARA rule generation from extracted strings. +//! +//! Generates YARA rule templates suitable for malware analysis and detection. +//! Strings are grouped by tag and formatted with appropriate encoding modifiers. 
+ +mod escaping; + use crate::types::{Encoding, FoundString, Result, Tag}; +use escaping::{ + escape_yara_string, escape_yara_unicode_literal, utf16be_hex_string, utf16le_hex_string, +}; use super::OutputMetadata; use std::collections::{BTreeMap, HashMap}; @@ -151,77 +161,6 @@ fn sanitize_identifier(name: &str) -> String { } } -fn escape_yara_string(text: &str) -> String { - let mut escaped = String::new(); - for byte in text.as_bytes() { - match *byte { - b'\\' => escaped.push_str("\\\\"), - b'"' => escaped.push_str("\\\""), - b'\n' => escaped.push_str("\\n"), - b'\r' => escaped.push_str("\\r"), - b'\t' => escaped.push_str("\\t"), - 0x08 => escaped.push_str("\\b"), - 0x0b => escaped.push_str("\\x0b"), - 0x0c => escaped.push_str("\\x0c"), - 0x00..=0x1f | 0x7f..=0xff => { - escaped.push_str(&format!("\\x{:02x}", byte)); - } - _ => escaped.push(*byte as char), - } - } - escaped -} - -fn escape_yara_unicode_literal(text: &str) -> String { - let mut escaped = String::new(); - for ch in text.chars() { - match ch { - '\\' => escaped.push_str("\\\\"), - '"' => escaped.push_str("\\\""), - '\n' => escaped.push_str("\\n"), - '\r' => escaped.push_str("\\r"), - '\t' => escaped.push_str("\\t"), - _ if ch.is_control() => { - let mut buf = [0; 4]; - let encoded = ch.encode_utf8(&mut buf); - for byte in encoded.as_bytes() { - escaped.push_str(&format!("\\x{:02x}", byte)); - } - } - _ => escaped.push(ch), - } - } - escaped -} - -fn utf16be_hex_string(text: &str) -> String { - let hex_bytes: Vec = text - .encode_utf16() - .flat_map(|unit| unit.to_be_bytes()) - .map(|b| format!("{:02x}", b)) - .collect(); - - if hex_bytes.is_empty() { - return "{ }".to_string(); - } - - format!("{{ {} }}", hex_bytes.join(" ")) -} - -fn utf16le_hex_string(text: &str) -> String { - let hex_bytes: Vec = text - .encode_utf16() - .flat_map(|unit| unit.to_le_bytes()) - .map(|b| format!("{:02x}", b)) - .collect(); - - if hex_bytes.is_empty() { - return "{ }".to_string(); - } - - format!("{{ {} }}", 
hex_bytes.join(" ")) -} - fn tag_name(tag: &Tag) -> &'static str { match tag { Tag::Url => "Url", @@ -291,16 +230,6 @@ mod tests { assert_eq!(sanitize_rule_name(""), "_"); } - #[test] - fn test_escape_yara_string() { - let input = "quote\" backslash\\ line\n tab\t"; - let escaped = escape_yara_string(input); - assert!(escaped.contains("\\\"")); - assert!(escaped.contains("\\\\")); - assert!(escaped.contains("\\n")); - assert!(escaped.contains("\\t")); - } - #[test] fn test_group_strings_by_tag() { let strings = vec![ @@ -362,85 +291,12 @@ mod tests { assert!(output.contains("\\x")); } - #[test] - fn test_escape_yara_unicode_literal_basic() { - // Basic escapes - assert_eq!(escape_yara_unicode_literal("quote\""), "quote\\\""); - assert_eq!(escape_yara_unicode_literal("back\\slash"), "back\\\\slash"); - assert_eq!(escape_yara_unicode_literal("line\nbreak"), "line\\nbreak"); - assert_eq!(escape_yara_unicode_literal("tab\there"), "tab\\there"); - assert_eq!(escape_yara_unicode_literal("return\rhere"), "return\\rhere"); - } - - #[test] - fn test_escape_yara_unicode_literal_control_chars() { - // Control characters should be hex-escaped - assert_eq!(escape_yara_unicode_literal("\x00"), "\\x00"); - assert_eq!(escape_yara_unicode_literal("\x1f"), "\\x1f"); - } - - #[test] - fn test_escape_yara_unicode_literal_unicode_passthrough() { - // Non-control Unicode should pass through unescaped - let result = escape_yara_unicode_literal("\u{4E2D}\u{6587}"); - assert!( - result.contains('\u{4E2D}'), - "Non-control Unicode should not be escaped" - ); - } - - #[test] - fn test_escape_yara_unicode_literal_empty() { - assert_eq!(escape_yara_unicode_literal(""), ""); - } - - #[test] - fn test_utf16be_hex_string_basic() { - // Basic ASCII - should be big-endian (00 followed by ASCII byte) - assert_eq!(utf16be_hex_string("A"), "{ 00 41 }"); - assert_eq!(utf16be_hex_string("AB"), "{ 00 41 00 42 }"); - } - - #[test] - fn test_utf16be_hex_string_empty() { - assert_eq!(utf16be_hex_string(""), 
"{ }"); - } - - #[test] - fn test_utf16be_hex_string_non_ascii() { - // Non-ASCII Unicode (BMP) - Chinese character U+4E2D - let chinese = utf16be_hex_string("\u{4E2D}"); - assert_eq!(chinese, "{ 4e 2d }"); - } - - #[test] - fn test_utf16be_hex_string_surrogate_pair() { - // Character requiring surrogate pair (outside BMP) - emoji U+1F600 - let emoji = utf16be_hex_string("\u{1F600}"); - // Should produce surrogate pair: D83D DE00 - assert_eq!(emoji, "{ d8 3d de 00 }"); - } - - #[test] - fn test_escape_yara_string_control_characters() { - assert_eq!(escape_yara_string("\r"), "\\r"); - assert_eq!(escape_yara_string("\x00"), "\\x00"); - assert_eq!(escape_yara_string("\x08"), "\\b"); - assert_eq!(escape_yara_string("\x0b"), "\\x0b"); - assert_eq!(escape_yara_string("\x0c"), "\\x0c"); - assert_eq!(escape_yara_string("\x7f"), "\\x7f"); - } - #[test] fn test_format_yara_uses_current_timestamp_when_not_set() { - // When generated_at is None, format_yara should use current_timestamp() let metadata = OutputMetadata::new("test.bin".to_string(), OutputFormat::Yara, 0, 0); - // Note: generated_at is None let output = format_yara(&[], &metadata).expect("Formatting should succeed"); - // Should contain a timestamp in the generated_at field assert!(output.contains("generated_at = \"")); - // Timestamp should be numeric (or CLOCK_ERROR in exceptional cases) assert!( output.contains("generated_at = \"1") || output.contains("generated_at = \"CLOCK_ERROR"), @@ -448,36 +304,8 @@ mod tests { ); } - #[test] - fn test_utf16le_hex_string_basic() { - // Basic ASCII - should be little-endian (ASCII byte followed by 00) - assert_eq!(utf16le_hex_string("A"), "{ 41 00 }"); - assert_eq!(utf16le_hex_string("AB"), "{ 41 00 42 00 }"); - } - - #[test] - fn test_utf16le_hex_string_empty() { - assert_eq!(utf16le_hex_string(""), "{ }"); - } - - #[test] - fn test_utf16le_hex_string_non_ascii() { - // Non-ASCII Unicode (BMP) - Chinese character U+4E2D - let chinese = utf16le_hex_string("\u{4E2D}"); - 
assert_eq!(chinese, "{ 2d 4e }"); - } - - #[test] - fn test_utf16le_hex_string_surrogate_pair() { - // Character requiring surrogate pair (outside BMP) - emoji U+1F600 - let emoji = utf16le_hex_string("\u{1F600}"); - // Should produce surrogate pair: 3D D8 00 DE (little-endian) - assert_eq!(emoji, "{ 3d d8 00 de }"); - } - #[test] fn test_utf16le_ascii_uses_wide_modifier() { - // ASCII UTF-16LE should use "wide" modifier let mut string = make_string("test"); string.encoding = Encoding::Utf16Le; let output = format_yara(&[string], &make_metadata()).expect("Formatting should succeed"); @@ -490,7 +318,6 @@ mod tests { #[test] fn test_utf16le_non_ascii_uses_hex() { - // Non-ASCII UTF-16LE should use hex string, not wide modifier let mut string = make_string("\u{4E2D}\u{6587}"); string.encoding = Encoding::Utf16Le; let output = format_yara(&[string], &make_metadata()).expect("Formatting should succeed"); @@ -506,11 +333,9 @@ mod tests { #[test] fn test_binary_name_injection_escaped_in_comments() { - // Binary name with newlines should be escaped in comments let mut metadata = make_metadata(); metadata.binary_name = "evil\nname".to_string(); let output = format_yara(&[], &metadata).expect("Formatting should succeed"); - // Should contain escaped newline, not literal assert!( output.contains("evil\\nname"), "Newlines in binary_name should be escaped" @@ -523,11 +348,9 @@ mod tests { #[test] fn test_timestamp_injection_escaped_in_meta() { - // Timestamp with special characters should be escaped let mut metadata = make_metadata(); metadata.generated_at = Some("2024\"\n//attack".to_string()); let output = format_yara(&[], &metadata).expect("Formatting should succeed"); - // Should contain escaped characters assert!( output.contains("2024\\\"\\n//attack"), "Special chars in timestamp should be escaped" From c4ec73be8be98817df69f7e6c574a7769a1a2c4b Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Tue, 20 Jan 2026 00:23:20 +0000 Subject: [PATCH 21/25] chore(agents): fix 
formatting in critical rules section Signed-off-by: UncleSp1d3r --- AGENTS.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 2e71a4e..2dec2aa 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -6,8 +6,7 @@ 1. **No `unsafe` code** - `#![forbid(unsafe_code)]` enforced 2. **Zero warnings** - `cargo clippy -- -D warnings` must pass -3. **ASCII only** - No emojis, em-dashes, smart quotes, or Unicode punctuation (except when explicity testing or working with Unicode strings or emjois) -4. **File size limit** - Keep files under 500 lines; split larger files +3. **ASCII only** - No emojis, em-dashes, smart quotes, or Unicode punctuation (except when explicity testing or working with Unicode strings or emjois)4. **File size limit** - Keep files under 500 lines; split larger files 5. **No blanket `#[allow]`** - Any `allow` requires inline justification ## Project Summary From 74c71bc8945d1089f7f68fef5aba21e957039c7a Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 24 Jan 2026 13:31:00 -0500 Subject: [PATCH 22/25] docs(AGENTS): improve AI agent guidelines with fixes and additions Fix formatting bug where rules 3 and 4 were merged on one line, correct typos, add Rust version requirements, expand development commands, clarify module structure, and document key dependencies. 
Co-Authored-By: Claude Opus 4.5 --- .gitignore | 6 ++++++ .repomixignore | 4 ++++ .vscode/settings.json | 12 ++++++++++++ AGENTS.md | 21 ++++++++++++++++++--- 4 files changed, 40 insertions(+), 3 deletions(-) create mode 100644 .repomixignore create mode 100644 .vscode/settings.json diff --git a/.gitignore b/.gitignore index 98b1e83..6657408 100644 --- a/.gitignore +++ b/.gitignore @@ -125,3 +125,9 @@ docs/book/ !bin/ # Added by goreleaser init: .intentionally-empty-file.o + + +megalinter-reports/* +target/* +stringy-output/* +tests/fixtures/* diff --git a/.repomixignore b/.repomixignore new file mode 100644 index 0000000..cff354a --- /dev/null +++ b/.repomixignore @@ -0,0 +1,4 @@ +megalinter-reports/* +target/* +stringy-output/* +tests/fixtures/* diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..868f790 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,12 @@ +{ + "ruff.path": [ + "${workspaceFolder}/.vscode/mise-tools/ruff" + ], + "ruff.interpreter": [ + "${workspaceFolder}/.vscode/mise-tools/python" + ], + "python.defaultInterpreterPath": "${workspaceFolder}/.vscode/mise-tools/python", + "debug.javascript.defaultRuntimeExecutable": { + "pwa-node": "${workspaceFolder}/.vscode/mise-tools/node" + } +} \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md index 2dec2aa..e18adf4 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -6,13 +6,16 @@ 1. **No `unsafe` code** - `#![forbid(unsafe_code)]` enforced 2. **Zero warnings** - `cargo clippy -- -D warnings` must pass -3. **ASCII only** - No emojis, em-dashes, smart quotes, or Unicode punctuation (except when explicity testing or working with Unicode strings or emjois)4. **File size limit** - Keep files under 500 lines; split larger files +3. **ASCII only** - No emojis, em-dashes, smart quotes, or Unicode punctuation (except when explicitly testing or working with Unicode strings or emojis) +4. **File size limit** - Keep files under 500 lines; split larger files 5. 
**No blanket `#[allow]`** - Any `allow` requires inline justification ## Project Summary Stringy extracts meaningful strings from ELF, PE, and Mach-O binaries using format-specific knowledge and semantic classification. Unlike standard `strings`, it is section-aware and semantically intelligent. +**Rust**: Edition 2024, MSRV 1.91 + **Data flow**: Binary -> Format Detection -> Container Parsing -> String Extraction -> Deduplication -> Classification -> Ranking -> Output ## Module Structure @@ -21,8 +24,8 @@ Stringy extracts meaningful strings from ELF, PE, and Mach-O binaries using form | ----------------- | ---------------------------------------------------------------- | | `container/` | Format detection, section analysis, imports/exports via `goblin` | | `extraction/` | ASCII/UTF-8/UTF-16 extraction, deduplication, PE resources | -| `classification/` | Semantic tagging (URLs, IPs, domains, paths, GUIDs) | -| `output/` | Formatters (JSON, human-readable, YARA-friendly) | +| `classification/` | Semantic tagging (URLs, IPs, domains, paths, GUIDs), ranking | +| `output/` | Formatters: `json/`, `table/` (tty/plain), `yara/` | | `types/` | Core data structures, error handling with `thiserror` | ## Key Patterns @@ -47,6 +50,10 @@ just test # Run tests with nextest just lint # Full lint suite just fix # Auto-fix clippy warnings just ci-check # Full CI suite locally +just build # Debug build +just run # Run stringy with arguments +just bench # Run benchmarks +just format # Format all (Rust, JSON, YAML, Markdown, Justfile) ``` ## Testing @@ -59,6 +66,14 @@ just ci-check # Full CI suite locally Import from `stringy::extraction` or `stringy::types`, not deeply nested paths. Re-exports are in `lib.rs`. 
+## Key Dependencies + +- `goblin` - Binary format parsing (ELF, PE, Mach-O) +- `pelite` - PE resource extraction +- `thiserror` - Error type definitions +- `insta` - Snapshot testing (dev) +- `criterion` - Benchmarking (dev) + ## Adding Features **New semantic tag**: Add variant to `Tag` enum in `types.rs`, implement pattern in `classification/semantic.rs` From 96635563536ef92e61883545ba504af5fdfa9ecb Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 24 Jan 2026 18:14:51 -0500 Subject: [PATCH 23/25] fix: address PR review comments for output formatters - Fix test metadata count mismatches in output_table_integration.rs - Remove underscore prefix from used parameters in json.rs - Use crate re-export for RankingConfig import - Fix truncate_string edge case for wide multibyte characters - Update table docs to reflect actual tag selection behavior - Add control character sanitization for TTY output - Use UTF-8 encoding for Unicode test content - Add Claude Code AI assistance section to CONTRIBUTING.md - Update snapshots for encoding and sanitization changes Co-Authored-By: Claude Opus 4.5 --- CONTRIBUTING.md | 4 + TESTING_ANALYSIS.md | 192 ++++++++++-------- src/output/json.rs | 8 +- src/output/table/formatting.rs | 16 +- src/output/table/mod.rs | 2 +- src/output/table/tty.rs | 24 ++- tests/output_json_integration.rs | 9 +- tests/output_table_integration.rs | 4 +- ...son_integration__json_unicode_content.snap | 3 +- ...e_integration__tty_special_characters.snap | 3 +- 10 files changed, 160 insertions(+), 105 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 04b90e9..64a7366 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -81,6 +81,10 @@ Docs live under docs/ and project planning artifacts are in project_plan/. Updat If you believe you found a security issue, please do not open a public issue. Use GitHub Security Advisories if available, or contact the maintainers privately. 
+## AI-assisted development + +This project includes Claude Code configuration in `.claude/settings.json`. These settings enable plugins that help maintain code quality and follow project conventions. If you use Claude Code, the configuration will be applied automatically. + ## Questions If you are unsure where to start, open an issue with your question and we will point you in the right direction. diff --git a/TESTING_ANALYSIS.md b/TESTING_ANALYSIS.md index dee19ca..30b54bc 100644 --- a/TESTING_ANALYSIS.md +++ b/TESTING_ANALYSIS.md @@ -26,15 +26,15 @@ **Test Files**: -1. `integration_elf.rs` (10 tests) - ELF parsing and extraction -2. `integration_extraction.rs` (9 tests) - End-to-end extraction -3. `integration_macho.rs` (15 tests) - Mach-O parsing and load commands -4. `integration_pe.rs` (22 tests) - PE parsing and resource extraction -5. `test_ascii_extraction.rs` (14 tests) - ASCII extraction scenarios -6. `test_ascii_integration.rs` (14 tests) - ASCII integration tests -7. `test_deduplication.rs` (5 tests) - Deduplication workflows -8. `test_noise_filters.rs` (9 tests) - Noise filtering heuristics -9. `test_utf16_extraction.rs` (5 tests) - UTF-16 extraction +01. `integration_elf.rs` (10 tests) - ELF parsing and extraction +02. `integration_extraction.rs` (9 tests) - End-to-end extraction +03. `integration_macho.rs` (15 tests) - Mach-O parsing and load commands +04. `integration_pe.rs` (22 tests) - PE parsing and resource extraction +05. `test_ascii_extraction.rs` (14 tests) - ASCII extraction scenarios +06. `test_ascii_integration.rs` (14 tests) - ASCII integration tests +07. `test_deduplication.rs` (5 tests) - Deduplication workflows +08. `test_noise_filters.rs` (9 tests) - Noise filtering heuristics +09. `test_utf16_extraction.rs` (5 tests) - UTF-16 extraction 10. `classification_integration.rs` (27 tests) - Semantic classification 11. `output_json_integration.rs` (41 tests) - JSON output format 12. 
`output_table_integration.rs` (27 tests) - Table output format @@ -235,100 +235,117 @@ fn test_section_size_overflow() { ### HIGH Priority Gaps 1. **Performance Tests for Deduplication** - - Test with 10,000+ duplicate strings - - Validate O(n^2) algorithms don't cause timeout - - File: `tests/test_deduplication_performance.rs` (MISSING) + + - Test with 10,000+ duplicate strings + - Validate O(n^2) algorithms don't cause timeout + - File: `tests/test_deduplication_performance.rs` (MISSING) 2. **Doctest Fixes** - - Fix `extraction::StringExtractor` doctest (line 318) - - Fix `extraction::BasicExtractor` doctest (line 408) - - Files: `src/extraction/mod.rs` + + - Fix `extraction::StringExtractor` doctest (line 318) + - Fix `extraction::BasicExtractor` doctest (line 408) + - Files: `src/extraction/mod.rs` 3. **Bounds Checking Edge Cases** - - Section offset beyond file boundary - - Section size causing integer overflow - - File: `tests/test_extraction_edge_cases.rs` (MISSING) + + - Section offset beyond file boundary + - Section size causing integer overflow + - File: `tests/test_extraction_edge_cases.rs` (MISSING) ### MEDIUM Priority Gaps 1. **Malformed Binary Handling** - - Truncated ELF headers - - Invalid PE signatures - - Corrupted Mach-O load commands - - File: `tests/test_malformed_binaries.rs` (MISSING) + + - Truncated ELF headers + - Invalid PE signatures + - Corrupted Mach-O load commands + - File: `tests/test_malformed_binaries.rs` (MISSING) 2. **Regex Pattern Edge Cases** - - URL regex with edge cases (IPv6 in URLs, Unicode domains) - - Email regex with uncommon formats - - Path regex with UNC paths edge cases - - Files: Pattern test modules (PARTIAL) + + - URL regex with edge cases (IPv6 in URLs, Unicode domains) + - Email regex with uncommon formats + - Path regex with UNC paths edge cases + - Files: Pattern test modules (PARTIAL) 3. 
**Resource Extraction Error Paths** - - PE resource directory corruption - - Version info parsing failures - - String table malformed data - - File: `src/extraction/pe_resources.rs` tests (PARTIAL) + + - PE resource directory corruption + - Version info parsing failures + - String table malformed data + - File: `src/extraction/pe_resources.rs` tests (PARTIAL) ### LOW Priority Gaps 1. **Main Binary CLI Testing** - - Integration tests for CLI argument parsing - - File: `tests/cli_integration.rs` (MISSING, but main is stub) + + - Integration tests for CLI argument parsing + - File: `tests/cli_integration.rs` (MISSING, but main is stub) 2. **Memory Leak Tests** - - Large file processing without memory growth - - File: Performance test suite (MISSING) + + - Large file processing without memory growth + - File: Performance test suite (MISSING) 3. **Concurrency Tests** - - Parallel extraction from multiple files - - Thread safety validation - - File: Concurrency test suite (MISSING) + + - Parallel extraction from multiple files + - Thread safety validation + - File: Concurrency test suite (MISSING) ## Test Infrastructure Assessment ### Strengths 1. **Excellent Fixture Management** - - Well-documented rebuild process - - Multiple binary formats covered - - Source code available for reproduction + + - Well-documented rebuild process + - Multiple binary formats covered + - Source code available for reproduction 2. **Comprehensive Integration Tests** - - 219 integration tests covering end-to-end scenarios - - Real binary fixtures used - - All output formats tested + + - 219 integration tests covering end-to-end scenarios + - Real binary fixtures used + - All output formats tested 3. **Snapshot Testing** - - `insta` framework well-utilized - - Output format changes tracked - - Easy to review snapshot diffs + + - `insta` framework well-utilized + - Output format changes tracked + - Easy to review snapshot diffs 4. 
**Test Organization** - - Clear separation: unit vs integration - - Logical grouping by functionality - - Consistent naming conventions + + - Clear separation: unit vs integration + - Logical grouping by functionality + - Consistent naming conventions ### Weaknesses 1. **No Performance Benchmarks** - - No `criterion` benchmarks - - No performance regression detection - - Large input scenarios untested + + - No `criterion` benchmarks + - No performance regression detection + - Large input scenarios untested 2. **No Fuzzing Tests** - - No `cargo-fuzz` integration - - Binary parsing not fuzz-tested - - String extraction not fuzz-tested + + - No `cargo-fuzz` integration + - Binary parsing not fuzz-tested + - String extraction not fuzz-tested 3. **No Code Coverage Metrics** - - `cargo-tarpaulin` not installed - - No coverage reports in CI - - Unknown actual code coverage percentage + + - `cargo-tarpaulin` not installed + - No coverage reports in CI + - Unknown actual code coverage percentage 4. **Limited Error Injection** - - Few tests for error paths - - Missing tests for resource failures - - I/O error handling not tested + + - Few tests for error paths + - Missing tests for resource failures + - I/O error handling not tested ## Recommendations @@ -367,42 +384,49 @@ fn test_section_size_overflow() { ### Short-term Improvements (Month 1) 1. **Add Fuzzing** - - Install `cargo-fuzz` - - Fuzz container parsers (ELF, PE, Mach-O) - - Fuzz string extractors (ASCII, UTF-16) + + - Install `cargo-fuzz` + - Fuzz container parsers (ELF, PE, Mach-O) + - Fuzz string extractors (ASCII, UTF-16) 2. **Enable Code Coverage** - - Install `cargo-tarpaulin` - - Add coverage to CI pipeline - - Set coverage threshold (80% target) + + - Install `cargo-tarpaulin` + - Add coverage to CI pipeline + - Set coverage threshold (80% target) 3. 
**Add Malformed Binary Tests** - - Create corrupted fixtures - - Test graceful error handling - - Verify no panics on invalid input + + - Create corrupted fixtures + - Test graceful error handling + - Verify no panics on invalid input ### Long-term Enhancements (Quarter 1) 1. **Performance Benchmarks** - - Add `criterion` benchmarks - - Track deduplication performance - - Track classification performance - - Add to CI for regression detection + + - Add `criterion` benchmarks + - Track deduplication performance + - Track classification performance + - Add to CI for regression detection 2. **Property-Based Testing** - - Add `proptest` or `quickcheck` - - Generate random binaries - - Verify invariants (no panics, valid output) + + - Add `proptest` or `quickcheck` + - Generate random binaries + - Verify invariants (no panics, valid output) 3. **CLI Integration Tests** - - Implement main binary - - Add end-to-end CLI tests - - Test output redirection, error handling + + - Implement main binary + - Add end-to-end CLI tests + - Test output redirection, error handling 4. **Concurrency Tests** - - Test thread safety - - Test parallel file processing - - Validate no data races + + - Test thread safety + - Test parallel file processing + - Validate no data races ## Test Quality Score @@ -473,7 +497,7 @@ fn test_section_size_overflow() { - Unit tests: 0.04s (258 tests) - Integration tests: ~1.5s (219 tests) -- Total execution: <20s including doctests +- Total execution: \<20s including doctests **Verdict**: EXCELLENT - Fast feedback loop diff --git a/src/output/json.rs b/src/output/json.rs index 635d8aa..ce5c986 100644 --- a/src/output/json.rs +++ b/src/output/json.rs @@ -3,13 +3,13 @@ use crate::types::{FoundString, Result, StringyError}; use super::OutputMetadata; /// Format strings as JSONL output, one object per line. 
-pub fn format_json(_strings: &[FoundString], _metadata: &OutputMetadata) -> Result { - if _strings.is_empty() { +pub fn format_json(strings: &[FoundString], _metadata: &OutputMetadata) -> Result { + if strings.is_empty() { return Ok(String::new()); } - let mut lines = Vec::with_capacity(_strings.len()); - for item in _strings { + let mut lines = Vec::with_capacity(strings.len()); + for item in strings { if !item.confidence.is_finite() { return Err(StringyError::ConfigError( "JSON serialization failed: non-finite confidence".to_string(), diff --git a/src/output/table/formatting.rs b/src/output/table/formatting.rs index 02c8e4a..06fcbea 100644 --- a/src/output/table/formatting.rs +++ b/src/output/table/formatting.rs @@ -3,7 +3,7 @@ //! This module provides shared utilities for formatting strings, tags, and //! text alignment used by both TTY and plain output modes. -use crate::classification::ranking::RankingConfig; +use crate::classification::RankingConfig; use crate::types::Tag; use super::TAGS_COLUMN_WIDTH; @@ -129,9 +129,9 @@ pub fn truncate_string(s: &str, max_len: usize) -> String { // Find a valid character boundary for truncation let truncate_at = max_len - 3; - let mut end_index = truncate_at; + let mut end_index = 0; - // Ensure we don't split a multi-byte character + // Find the last char boundary that fits within truncate_at bytes for (idx, _) in s.char_indices() { if idx <= truncate_at { end_index = idx; @@ -140,13 +140,9 @@ pub fn truncate_string(s: &str, max_len: usize) -> String { } } - // Handle case where we need to include at least one character - if end_index == 0 && !s.is_empty() { - if let Some((idx, _)) = s.char_indices().nth(1) { - end_index = idx; - } else { - end_index = s.len(); - } + // If the first character is too wide to fit with "...", just return dots + if end_index == 0 { + return ".".repeat(max_len.min(3)); } format!("{}...", &s[..end_index]) diff --git a/src/output/table/mod.rs b/src/output/table/mod.rs index 9bcb95a..8c79829 
100644 --- a/src/output/table/mod.rs +++ b/src/output/table/mod.rs @@ -27,7 +27,7 @@ //! # Column Layout //! //! - **String**: Up to 60 characters, truncated with `...` if longer -//! - **Tags**: First 2-3 tags, comma-separated, max 20 characters +//! - **Tags**: Tags with highest boost value shown, max 20 characters //! - **Score**: Right-aligned integer score //! - **Section**: Section name where the string was found diff --git a/src/output/table/tty.rs b/src/output/table/tty.rs index 38ed658..918186a 100644 --- a/src/output/table/tty.rs +++ b/src/output/table/tty.rs @@ -5,6 +5,27 @@ use crate::types::{FoundString, Result}; use super::formatting::{Alignment, format_tags, pad_string, truncate_string}; + +/// Sanitize a string for TTY display by replacing control characters. +/// +/// Replaces newlines, tabs, and other control characters with visible escape sequences +/// to prevent broken table layout. +fn sanitize_for_display(s: &str) -> String { + let mut result = String::with_capacity(s.len()); + for c in s.chars() { + match c { + '\n' => result.push_str("\\n"), + '\r' => result.push_str("\\r"), + '\t' => result.push_str("\\t"), + '\x00'..='\x1f' | '\x7f' => { + // Other control characters shown as \xNN + result.push_str(&format!("\\x{:02x}", c as u8)); + } + _ => result.push(c), + } + } + result +} use super::{ OutputMetadata, SCORE_COLUMN_WIDTH, SECTION_COLUMN_WIDTH, STRING_COLUMN_WIDTH, TAGS_COLUMN_WIDTH, @@ -55,7 +76,8 @@ pub(super) fn format_table_tty( // Build rows for found_string in strings { - let truncated_text = truncate_string(&found_string.text, STRING_COLUMN_WIDTH); + let sanitized_text = sanitize_for_display(&found_string.text); + let truncated_text = truncate_string(&sanitized_text, STRING_COLUMN_WIDTH); let tags_display = format_tags(&found_string.tags); let section_display = found_string.section.as_deref().unwrap_or(""); diff --git a/tests/output_json_integration.rs b/tests/output_json_integration.rs index 4590956..7e74d88 100644 --- 
a/tests/output_json_integration.rs +++ b/tests/output_json_integration.rs @@ -227,8 +227,15 @@ fn test_json_long_strings() { #[test] fn test_json_unicode_content() { + // Use UTF-8 encoding for non-ASCII content let unicode = "\u{4E2D}\u{6587}\u{5B57}\u{7B26}"; - let strings = vec![make_string(unicode)]; + let strings = vec![FoundString::new( + unicode.to_string(), + Encoding::Utf8, + 0x1000, + unicode.len() as u32, + StringSource::SectionData, + )]; let output = format_json(&strings, &make_metadata(1)).unwrap(); assert_snapshot!(output); } diff --git a/tests/output_table_integration.rs b/tests/output_table_integration.rs index 3446464..e63a45e 100644 --- a/tests/output_table_integration.rs +++ b/tests/output_table_integration.rs @@ -274,7 +274,7 @@ fn test_plain_multiple_strings() { .with_tags(vec![Tag::Import]) .with_score(80), ]; - let result = format_table_with_mode(&strings, &make_metadata(4), false).unwrap(); + let result = format_table_with_mode(&strings, &make_metadata(3), false).unwrap(); assert_snapshot!(result); } @@ -294,7 +294,7 @@ fn test_plain_preserves_special_characters() { make_string("quote\"here"), make_string("line1\nline2"), ]; - let result = format_table_with_mode(&strings, &make_metadata(3), false).unwrap(); + let result = format_table_with_mode(&strings, &make_metadata(4), false).unwrap(); assert_snapshot!(result); } diff --git a/tests/snapshots/output_json_integration__json_unicode_content.snap b/tests/snapshots/output_json_integration__json_unicode_content.snap index 77c2d01..6f94b92 100644 --- a/tests/snapshots/output_json_integration__json_unicode_content.snap +++ b/tests/snapshots/output_json_integration__json_unicode_content.snap @@ -1,5 +1,6 @@ --- source: tests/output_json_integration.rs +assertion_line: 240 expression: output --- -{"text":"中文字符","encoding":"Ascii","offset":4096,"rva":null,"section":null,"length":12,"tags":[],"score":0,"source":"SectionData","confidence":1.0} 
+{"text":"中文字符","encoding":"Utf8","offset":4096,"rva":null,"section":null,"length":12,"tags":[],"score":0,"source":"SectionData","confidence":1.0} diff --git a/tests/snapshots/output_table_integration__tty_special_characters.snap b/tests/snapshots/output_table_integration__tty_special_characters.snap index 2ebce1e..2718e43 100644 --- a/tests/snapshots/output_table_integration__tty_special_characters.snap +++ b/tests/snapshots/output_table_integration__tty_special_characters.snap @@ -1,9 +1,10 @@ --- source: tests/output_table_integration.rs +assertion_line: 142 expression: result --- String | Tags | Score | Section -------------------------------------------------------------|----------|--------|-------- -string with tab | | 10 | .data +string with\ttab | | 10 | .data pipe|character | | 10 | .data backslash\here | filepath | 20 | .rdata From 5ccbff1e1f61fa9c87de3af82575f81628762287 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 24 Jan 2026 18:21:37 -0500 Subject: [PATCH 24/25] chore: cleanup devcontainer, justfile, and AGENTS.md - Remove duplicate rust-bundle extension, keep official rust-analyzer - Refactor setup recipes to reuse mise-install recipe - Fix markdown lint: convert emphasis-as-heading to list items Co-Authored-By: Claude Opus 4.5 --- .devcontainer/devcontainer.json | 1 - AGENTS.md | 5 ++--- justfile | 6 ++---- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index b145780..92c4a66 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -37,7 +37,6 @@ "bierner.markdown-mermaid", "bierner.markdown-yaml-preamble", "DavidAnson.vscode-markdownlint", - "1YiB.rust-bundle", "rust-lang.rust-analyzer", "foxundermoon.shell-format", "redhat.vscode-yaml", diff --git a/AGENTS.md b/AGENTS.md index e18adf4..c7baa6a 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -14,9 +14,8 @@ Stringy extracts meaningful strings from ELF, PE, and Mach-O binaries using 
format-specific knowledge and semantic classification. Unlike standard `strings`, it is section-aware and semantically intelligent. -**Rust**: Edition 2024, MSRV 1.91 - -**Data flow**: Binary -> Format Detection -> Container Parsing -> String Extraction -> Deduplication -> Classification -> Ranking -> Output +- **Rust**: Edition 2024, MSRV 1.91 +- **Data flow**: Binary -> Format Detection -> Container Parsing -> String Extraction -> Deduplication -> Classification -> Ranking -> Output ## Module Structure diff --git a/justfile b/justfile index 359c785..e2279f8 100644 --- a/justfile +++ b/justfile @@ -53,16 +53,14 @@ rmrf path: # Development setup [windows] setup: - mise trust - mise install + @just mise-install rustup component add rustfmt clippy llvm-tools-preview @just mdformat-install Write-Host "Note: You may need to restart your shell for pipx PATH changes to take effect" [unix] setup: - mise trust - mise install + @just mise-install rustup component add rustfmt clippy llvm-tools-preview @just mdformat-install echo "Note: You may need to restart your shell for pipx PATH changes to take effect" From e4d1e15628669d48adcf55ac09824354792043dc Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 24 Jan 2026 18:28:13 -0500 Subject: [PATCH 25/25] refactor(justfile): use mise exec for all tool commands Add mise_exec variable and prefix all tool commands (cargo, prettier, pre-commit, actionlint, cspell, markdownlint, lychee, dist, mdbook, goreleaser) with mise exec to ensure correct tool versions are used regardless of shell activation state. Also adds dotenv-load and ignore-comments settings for consistency with other projects. 
Co-Authored-By: Claude Opus 4.5 --- justfile | 104 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 55 insertions(+), 49 deletions(-) diff --git a/justfile b/justfile index e2279f8..278d9bc 100644 --- a/justfile +++ b/justfile @@ -1,9 +1,15 @@ # Cross-platform justfile using OS annotations # Windows uses PowerShell, Unix uses bash -set shell := ["bash", "-c"] +set shell := ["bash", "-cu"] set windows-shell := ["powershell", "-NoProfile", "-Command"] +set dotenv-load := true +set ignore-comments := true +# Use mise to manage all dev tools (cargo, node, pre-commit, etc.) +# See mise.toml for tool versions + +mise_exec := "mise exec --" root := justfile_dir() # ============================================================================= @@ -80,23 +86,23 @@ mise-install: [windows] install-tools: @just mise-install - cargo binstall --disable-telemetry cargo-llvm-cov cargo-audit cargo-deny cargo-dist cargo-release cargo-cyclonedx cargo-auditable cargo-nextest --locked + @{{ mise_exec }} cargo binstall --disable-telemetry cargo-llvm-cov cargo-audit cargo-deny cargo-dist cargo-release cargo-cyclonedx cargo-auditable cargo-nextest --locked [unix] install-tools: @just mise-install - cargo binstall --disable-telemetry cargo-llvm-cov cargo-audit cargo-deny cargo-dist cargo-release cargo-cyclonedx cargo-auditable cargo-nextest --locked + @{{ mise_exec }} cargo binstall --disable-telemetry cargo-llvm-cov cargo-audit cargo-deny cargo-dist cargo-release cargo-cyclonedx cargo-auditable cargo-nextest --locked # Install mdBook plugins for documentation [windows] docs-install: @just mise-install - cargo binstall mdbook-admonish mdbook-mermaid mdbook-linkcheck mdbook-toc mdbook-open-on-gh mdbook-tabs mdbook-i18n-helpers + @{{ mise_exec }} cargo binstall mdbook-admonish mdbook-mermaid mdbook-linkcheck mdbook-toc mdbook-open-on-gh mdbook-tabs mdbook-i18n-helpers [unix] docs-install: @just mise-install - cargo binstall mdbook-admonish mdbook-mermaid 
mdbook-linkcheck mdbook-toc mdbook-open-on-gh mdbook-tabs mdbook-i18n-helpers + @{{ mise_exec }} cargo binstall mdbook-admonish mdbook-mermaid mdbook-linkcheck mdbook-toc mdbook-open-on-gh mdbook-tabs mdbook-i18n-helpers # Install pipx for Python tool management [windows] @@ -145,7 +151,7 @@ format: fmt format-json-yaml format-docs fmt-justfile # Individual format recipes format-json-yaml: - prettier --write "**/*.{json,yaml,yml}" + @{{ mise_exec }} prettier --write "**/*.{json,yaml,yml}" [windows] format-docs: @@ -156,16 +162,16 @@ format-docs: @if command -v mdformat >/dev/null 2>&1; then find . -type f -name "*.md" -not -path "./target/*" -not -path "./node_modules/*" -exec mdformat {} + ; else echo "mdformat not found. Run 'just mdformat-install' first."; fi fmt: - @cargo fmt --all + @{{ mise_exec }} cargo fmt --all fmt-check: - @cargo fmt --all --check + @{{ mise_exec }} cargo fmt --all --check lint-rust: fmt-check - @cargo clippy --workspace --all-targets --all-features -- -D warnings + @{{ mise_exec }} cargo clippy --workspace --all-targets --all-features -- -D warnings lint-rust-min: - @cargo clippy --workspace --all-targets --no-default-features -- -D warnings + @{{ mise_exec }} cargo clippy --workspace --all-targets --no-default-features -- -D warnings # Format justfile fmt-justfile: @@ -180,43 +186,43 @@ lint: lint-rust lint-actions lint-spell lint-docs lint-justfile # Individual lint recipes lint-actions: - actionlint .github/workflows/*.yml + @{{ mise_exec }} actionlint .github/workflows/*.yml lint-spell: - cspell "**" --config cspell.config.yaml + @{{ mise_exec }} cspell "**" --config cspell.config.yaml lint-docs: - markdownlint docs/**/*.md README.md - lychee docs/**/*.md README.md + @{{ mise_exec }} markdownlint docs/**/*.md README.md + @{{ mise_exec }} lychee docs/**/*.md README.md alias lint-just := lint-justfile # Run clippy with fixes fix: - cargo clippy --fix --allow-dirty --allow-staged + @{{ mise_exec }} cargo clippy --fix --allow-dirty 
--allow-staged # Quick development check check: pre-commit-run lint pre-commit-run: - pre-commit run -a + @{{ mise_exec }} pre-commit run -a # Format a single file (for pre-commit hooks) format-files +FILES: - prettier --write --config .prettierrc.json {{ FILES }} + @{{ mise_exec }} prettier --write --config .prettierrc.json {{ FILES }} # ============================================================================= # BUILDING AND TESTING # ============================================================================= build: - @cargo build --workspace + @{{ mise_exec }} cargo build --workspace build-release: - @cargo build --workspace --release + @{{ mise_exec }} cargo build --workspace --release test: - @cargo nextest run --workspace --no-capture + @{{ mise_exec }} cargo nextest run --workspace --no-capture # Test justfile cross-platform functionality [windows] @@ -242,11 +248,11 @@ test-fs: @just rmrf tmp/xfstest test-ci: - cargo nextest run --workspace --no-capture + @{{ mise_exec }} cargo nextest run --workspace --no-capture # Run all tests including ignored/slow tests across workspace test-all: - cargo nextest run --workspace --no-capture -- --ignored + @{{ mise_exec }} cargo nextest run --workspace --no-capture -- --ignored # ============================================================================= # BENCHMARKING @@ -254,17 +260,17 @@ test-all: # Run all benchmarks bench: - @cargo bench --workspace + @{{ mise_exec }} cargo bench --workspace # ============================================================================= # SECURITY AND AUDITING # ============================================================================= audit: - cargo audit + @{{ mise_exec }} cargo audit deny: - cargo deny check + @{{ mise_exec }} cargo deny check # ============================================================================= # CI AND QUALITY ASSURANCE @@ -272,11 +278,11 @@ deny: # Generate coverage report coverage: - cargo llvm-cov --workspace --lcov --output-path 
lcov.info + @{{ mise_exec }} cargo llvm-cov --workspace --lcov --output-path lcov.info # Check coverage thresholds coverage-check: - cargo llvm-cov --workspace --lcov --output-path lcov.info --fail-under-lines 9.7 + @{{ mise_exec }} cargo llvm-cov --workspace --lcov --output-path lcov.info --fail-under-lines 9.7 # Full local CI parity check ci-check: pre-commit-run fmt-check lint-rust lint-rust-min test-ci build-release audit coverage-check dist-plan @@ -286,29 +292,29 @@ ci-check: pre-commit-run fmt-check lint-rust lint-rust-min test-ci build-release # ============================================================================= run *args: - @cargo run -p stringy -- {{ args }} + @{{ mise_exec }} cargo run -p stringy -- {{ args }} # ============================================================================= # DISTRIBUTION AND PACKAGING # ============================================================================= dist: - @dist build + @{{ mise_exec }} dist build dist-check: - @dist check + @{{ mise_exec }} dist check dist-plan: - @dist plan + @{{ mise_exec }} dist plan # Regenerate cargo-dist CI workflow safely dist-generate-ci: - dist generate --ci github + @{{ mise_exec }} dist generate --ci github @echo "Generated CI workflow. Remember to fix any expression errors if they exist." @echo "Run 'just lint:actions' to validate the generated workflow." install: - @cargo install --path . + @{{ mise_exec }} cargo install --path . 
# ============================================================================= # DOCUMENTATION @@ -320,18 +326,18 @@ docs-build: #!/usr/bin/env bash set -euo pipefail # Build rustdoc - cargo doc --no-deps --document-private-items --target-dir docs/book/api-temp + {{ mise_exec }} cargo doc --no-deps --document-private-items --target-dir docs/book/api-temp # Move rustdoc output to final location mkdir -p docs/book/api cp -r docs/book/api-temp/doc/* docs/book/api/ rm -rf docs/book/api-temp # Build mdBook - cd docs && mdbook build + cd docs && {{ mise_exec }} mdbook build # Serve documentation locally with live reload [unix] docs-serve: - cd docs && mdbook serve --open + cd docs && {{ mise_exec }} mdbook serve --open # Clean documentation artifacts [unix] @@ -341,7 +347,7 @@ docs-clean: # Check documentation (build + link validation + formatting) [unix] docs-check: - cd docs && mdbook build + cd docs && {{ mise_exec }} mdbook build @just fmt-check # Generate and serve documentation @@ -358,12 +364,12 @@ docs: # Test GoReleaser configuration goreleaser-check: - @goreleaser check + @{{ mise_exec }} goreleaser check # Build binaries locally with GoReleaser (test build process) [windows] goreleaser-build: - @goreleaser build --clean + @{{ mise_exec }} goreleaser build --clean [unix] goreleaser-build: @@ -379,12 +385,12 @@ goreleaser-build: # Ensure the system linker sees the correct syslibroot and frameworks export RUSTFLAGS="${RUSTFLAGS:-} -C link-arg=-Wl,-syslibroot,${SDKROOT_PATH} -C link-arg=-F${SDKROOT_PATH}/System/Library/Frameworks" fi - goreleaser build --clean + {{ mise_exec }} goreleaser build --clean # Run snapshot release (test full pipeline without publishing) [windows] goreleaser-snapshot: - @goreleaser release --snapshot --clean + @{{ mise_exec }} goreleaser release --snapshot --clean [unix] goreleaser-snapshot: @@ -400,12 +406,12 @@ goreleaser-snapshot: # Ensure the system linker sees the correct syslibroot and frameworks export RUSTFLAGS="${RUSTFLAGS:-} 
-C link-arg=-Wl,-syslibroot,${SDKROOT_PATH} -C link-arg=-F${SDKROOT_PATH}/System/Library/Frameworks" fi - goreleaser release --snapshot --clean + {{ mise_exec }} goreleaser release --snapshot --clean # Test GoReleaser with specific target [windows] goreleaser-build-target target: - @goreleaser build --clean --single-target {{ target }} + @{{ mise_exec }} goreleaser build --clean --single-target {{ target }} [unix] goreleaser-build-target target: @@ -421,7 +427,7 @@ goreleaser-build-target target: # Ensure the system linker sees the correct syslibroot and frameworks export RUSTFLAGS="${RUSTFLAGS:-} -C link-arg=-Wl,-syslibroot,${SDKROOT_PATH} -C link-arg=-F${SDKROOT_PATH}/System/Library/Frameworks" fi - goreleaser build --clean --single-target {{ target }} + {{ mise_exec }} goreleaser build --clean --single-target {{ target }} # Clean GoReleaser artifacts goreleaser-clean: @@ -432,16 +438,16 @@ goreleaser-clean: # ============================================================================= release: - @cargo release + @{{ mise_exec }} cargo release release-dry-run: - @cargo release --dry-run + @{{ mise_exec }} cargo release --dry-run release-patch: - @cargo release patch + @{{ mise_exec }} cargo release patch release-minor: - @cargo release minor + @{{ mise_exec }} cargo release minor release-major: - @cargo release major + @{{ mise_exec }} cargo release major