From d02e44f82dea8095d53293100944b0361ad2110f Mon Sep 17 00:00:00 2001 From: Anthony Vargas <90121982+Speedrunyourknowledge@users.noreply.github.com> Date: Wed, 26 Nov 2025 16:44:02 -0500 Subject: [PATCH 01/13] Add --summary flag to cli output --- rust/cli/src/main.rs | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/rust/cli/src/main.rs b/rust/cli/src/main.rs index 6f5c64bc..9d463c81 100644 --- a/rust/cli/src/main.rs +++ b/rust/cli/src/main.rs @@ -48,6 +48,10 @@ struct Flags { #[arg(long)] no_dereference: bool, + /// Prints a summary of file types at the end of the output. + #[arg(long)] + summary: bool, + #[clap(flatten)] colors: Colors, @@ -211,12 +215,26 @@ async fn main() -> Result<()> { if flags.format.json { print!("["); } + // Initialize counter for file types + let mut type_counts: HashMap = HashMap::new(); + // Reorder responses to match input order let mut reorder = Reorder::default(); let mut errors = false; while let Some(response) = result_receiver.recv().await { reorder.push(response?); while let Some(response) = reorder.pop() { errors |= response.result.is_err(); + // Count file type for summary + if flags.summary { + // If result is ok, extract the description (e.g., "Python source") + if let Ok(file_type) = &response.result { + // Can change .description to .label for shorter names like "python" + let label = file_type.info().description.to_string(); + // If the label exists, add 1. If not, insert 0 then add 1. + *type_counts.entry(label).or_insert(0) += 1; + } + } + // Print output if flags.format.json { if reorder.next != 1 { print!(","); @@ -236,6 +254,16 @@ async fn main() -> Result<()> { } println!("]"); } + // Print summary if requested + if flags.summary && !flags.format.json && !flags.format.jsonl { + println!("--- Summary ---"); + // Sort the results by count (descending) + let mut sorted_counts: Vec<_> = type_counts.into_iter().collect(); + sorted_counts.sort_by(|a, b| b.1.cmp(&a.1)); + for (label, count) in sorted_counts { + println!("{}: {}", label, count); + } + } if errors { std::process::exit(1); } From d735809723f90007f8c67fcda21fd4caa9a7dcf0 Mon Sep 17 00:00:00 2001 From: Anthony Vargas <90121982+Speedrunyourknowledge@users.noreply.github.com> Date: Wed, 26 Nov 2025 17:05:25 -0500 Subject: [PATCH 02/13] Update summary to only display if no errors --- rust/cli/src/main.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rust/cli/src/main.rs b/rust/cli/src/main.rs index 9d463c81..335bdeef 100644 --- a/rust/cli/src/main.rs +++ b/rust/cli/src/main.rs @@ -254,7 +254,10 @@ async fn main() -> Result<()> { } println!("]"); } - // Print summary if requested + if errors { + std::process::exit(1); + } + // Print summary if requested, only if there were no errors if flags.summary && !flags.format.json && !flags.format.jsonl { println!("--- Summary ---"); // Sort the results by count (descending) @@ -264,9 +267,6 @@ async fn main() -> Result<()> { println!("{}: {}", label, count); } } - if errors { - std::process::exit(1); - } Ok(()) } From 871e8fac1953fc365d1a148caabc0565593a3a40 Mon Sep 17 00:00:00 2001 From: Anthony Vargas <90121982+Speedrunyourknowledge@users.noreply.github.com> Date: Sat, 29 Nov 2025 06:09:31 -0500 Subject: [PATCH 03/13] Add color and alphabetical sorting to summary --- rust/cli/src/main.rs | 55 +++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/rust/cli/src/main.rs b/rust/cli/src/main.rs index 335bdeef..09481b72 100644 --- a/rust/cli/src/main.rs +++ b/rust/cli/src/main.rs @@ -22,6 +22,7 @@ use std::sync::Arc; use anyhow::{bail, ensure, Result}; use clap::{Args, Parser}; use colored::ColoredString; +use colored::Colorize; use magika_lib::{ self as magika, ContentType, Features, FeaturesOrRuled, FileType, InferredType, OverwriteReason, Session, TypeInfo, @@ -215,26 +216,26 @@ async fn main() -> Result<()> { if flags.format.json { print!("["); } - // Initialize counter for file types - let mut type_counts: HashMap = HashMap::new(); - // Reorder responses to match input order + // Initializes counter for file types. + let mut type_counts: HashMap<(String, String), usize> = HashMap::new(); let mut reorder = Reorder::default(); let mut errors = false; + // Prints results, reordering as needed to match input order. while let Some(response) = result_receiver.recv().await { reorder.push(response?); while let Some(response) = reorder.pop() { errors |= response.result.is_err(); - // Count file type for summary + // Counts file type for the final summary. if flags.summary { - // If result is ok, extract the description (e.g., "Python source") + // If result is ok, extracts the description (e.g., "Python source"). if let Ok(file_type) = &response.result { - // Can change .description to .label for shorter names like "python" - let label = file_type.info().description.to_string(); - // If the label exists, add 1. If not, insert 0 then add 1. - *type_counts.entry(label).or_insert(0) += 1; + let type_label = file_type.info().description.to_string(); + let group = file_type.info().group.to_string(); + // Increments the count in the HashMap, inserting 0 if the key does not exist. + *type_counts.entry((type_label, group)).or_insert(0) += 1; } } - // Print output + // Prints output. if flags.format.json { if reorder.next != 1 { print!(","); @@ -257,14 +258,23 @@ async fn main() -> Result<()> { if errors { std::process::exit(1); } - // Print summary if requested, only if there were no errors + // Prints summary if requested (only if there were no errors). if flags.summary && !flags.format.json && !flags.format.jsonl { println!("--- Summary ---"); - // Sort the results by count (descending) + // Sorts by count (descending). let mut sorted_counts: Vec<_> = type_counts.into_iter().collect(); - sorted_counts.sort_by(|a, b| b.1.cmp(&a.1)); - for (label, count) in sorted_counts { - println!("{}: {}", label, count); + sorted_counts.sort_by(|a, b| { + let count_cmp = b.1.cmp(&a.1); + if count_cmp == std::cmp::Ordering::Equal { + // Sorts alphabetically (case-insensitive). + // a.0 .0 accesses the type_label. + a.0 .0.to_lowercase().cmp(&b.0 .0.to_lowercase()) + } else { + count_cmp + } + }); + for ((type_label, group), count) in sorted_counts { + println!("{}: {}", color_type_label(&type_label, &group), count); } } Ok(()) @@ -582,7 +592,6 @@ impl Response { } fn color(&self, result: ColoredString) -> ColoredString { - use colored::Colorize as _; // We only use true colors (except for errors). If the terminal doesn't support true colors, // the colored crate will automatically choose the closest one. match &self.result { @@ -615,3 +624,17 @@ fn join>(xs: impl IntoIterator) -> String { result.push(']'); result } + +fn color_type_label(type_label: &str, group: &str) -> colored::ColoredString { + match group { + "application" => type_label.truecolor(0xf4, 0x3f, 0x5e), + "archive" => type_label.truecolor(0xf5, 0x9e, 0x0b), + "audio" => type_label.truecolor(0x84, 0xcc, 0x16), + "code" => type_label.truecolor(0x8b, 0x5c, 0xf6), + "document" => type_label.truecolor(0x3b, 0x82, 0xf6), + "executable" => type_label.truecolor(0xec, 0x48, 0x99), + "image" => type_label.truecolor(0x06, 0xb6, 0xd4), + "video" => type_label.truecolor(0x10, 0xb9, 0x81), + _ => type_label.bold().truecolor(0xcc, 0xcc, 0xcc), + } +} From 8ff868ddbb1d54951338b367d91fca7afd8c6d5e Mon Sep 17 00:00:00 2001 From: Ricardo Date: Sat, 29 Nov 2025 22:12:57 -0500 Subject: [PATCH 04/13] Implemented scan_directory and added necessary testing files --- python/src/magika/magika.py | 27 +++++++++++++++++ python/tests/test_magika_python_module.py | 6 ++++ python/tests/utils.py | 6 ++++ test_magika.py | 24 +++++++++++++++ tests_data/scan_directory/code2.py | 32 ++++++++++++++++++++ tests_data/scan_directory/test.json | 31 +++++++++++++++++++ tests_data/scan_directory/test.rs | 36 +++++++++++++++++++++++ tests_data/scan_directory/test.ts | 36 +++++++++++++++++++++++ 8 files changed, 198 insertions(+) create mode 100644 test_magika.py create mode 100644 tests_data/scan_directory/code2.py create mode 100644 tests_data/scan_directory/test.json create mode 100644 tests_data/scan_directory/test.rs create mode 100644 tests_data/scan_directory/test.ts diff --git a/python/src/magika/magika.py b/python/src/magika/magika.py index fa2152ea..ece293c1 100644 --- a/python/src/magika/magika.py +++ b/python/src/magika/magika.py @@ -167,6 +167,33 @@ def identify_paths( return self._get_results_from_paths(paths_) + def scan_directory(self, directory: Union[str, os.PathLike]) -> List[MagikaResult]: + """Identify the content type of a all files in a directory given its path.""" + path_obj = Path(directory) + + # Guard clause: check if directory exists + if not path_obj.exists() or not path_obj.is_dir(): + raise FileNotFoundError(f"The directory '{directory}' does not exist or is not a directory.") + + collected_paths: List[Union[str, os.PathLike]] = [] + + # rglob('*') recursively finds all files and directories + for item in path_obj.rglob('*'): + # We only want files, not sub-directories themselves + if item.is_file(): + collected_paths.append(item) + + paths_ = [] + for path in collected_paths: + if isinstance(path, str) or isinstance(path, os.PathLike): + paths_.append(Path(path)) + else: + raise TypeError( + f"Input '{path}' is invalid: input path should be of type `Union[str, os.PathLike]`" + ) + + return self._get_results_from_paths(paths_) + def identify_bytes(self, content: bytes) -> MagikaResult: """Identify the content type of raw bytes.""" if not isinstance(content, bytes): diff --git a/python/tests/test_magika_python_module.py b/python/tests/test_magika_python_module.py index e25bd40b..c7595e64 100644 --- a/python/tests/test_magika_python_module.py +++ b/python/tests/test_magika_python_module.py @@ -72,6 +72,12 @@ def test_magika_module_with_explicit_model_dir() -> None: with open(test_path, "rb") as f: _ = m.identify_stream(f) +def test_magika_module_with_basic_tests_by_directory() -> None: + tests_paths = utils.get_directory_test_dir() + + m = Magika() + results = m.scan_directory(tests_paths) + def test_magika_module_with_basic_tests_by_paths() -> None: tests_paths = utils.get_basic_test_files_paths() diff --git a/python/tests/utils.py b/python/tests/utils.py index 375dcd8c..8a38eac1 100644 --- a/python/tests/utils.py +++ b/python/tests/utils.py @@ -42,6 +42,12 @@ def get_basic_tests_files_dir() -> Path: return tests_files_dir +def get_directory_test_dir() -> Path: + tests_files_dir = get_tests_data_dir() / "scan_directory" + assert tests_files_dir.is_dir() + return tests_files_dir + + def get_mitra_tests_files_dir() -> Path: tests_files_dir = get_tests_data_dir() / "mitra" assert tests_files_dir.is_dir() diff --git a/test_magika.py b/test_magika.py new file mode 100644 index 00000000..a5272fd3 --- /dev/null +++ b/test_magika.py @@ -0,0 +1,24 @@ +#from magika import Magika +import sys +import os + +# Get the absolute path to the folder containing magika.py +# We go: current dir -> python -> src -> magika +module_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'python', 'src')) + +# --- THE FIX --- +# Use insert(0, path) instead of append(path) +# This puts your folder at the TOP of the search list +if module_path not in sys.path: + sys.path.insert(0, module_path) + +import magika +#m = Magika() +m = magika.Magika() +#res = m.identify_path('./tests_data/basic/python') +res = m.scan_directory('./tests_data/scan_directory') +print("------------------ PREDICTIONS ------------------") +for result in res: + print(result) + print() +#print(res) \ No newline at end of file diff --git a/tests_data/scan_directory/code2.py b/tests_data/scan_directory/code2.py new file mode 100644 index 00000000..a2931d6d --- /dev/null +++ b/tests_data/scan_directory/code2.py @@ -0,0 +1,32 @@ +import random +import time + +class NumberGuesser: + def __init__(self, limit=100): + self.limit = limit + self.target = random.randint(1, limit) + self.attempts = 0 + + def guess(self, user_input): + self.attempts += 1 + try: + val = int(user_input) + except ValueError: + return "Please enter a valid integer." + + if val < self.target: + return "Too low!" + elif val > self.target: + return "Too high!" + else: + return f"Correct! It took you {self.attempts} tries." + +def main(): + game = NumberGuesser() + print("I'm thinking of a number between 1 and 100.") + # Simulation of a game loop + for i in range(5): + print(f"Simulation guess {i}: {game.guess(i * 20)}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tests_data/scan_directory/test.json b/tests_data/scan_directory/test.json new file mode 100644 index 00000000..df29c36a --- /dev/null +++ b/tests_data/scan_directory/test.json @@ -0,0 +1,31 @@ +{ + "project": { + "id": 1024, + "name": "MagikaTest", + "isActive": true, + "tags": ["python", "cpp", "automation"], + "metadata": { + "created_at": "2023-10-27T10:00:00Z", + "author": "dev_user", + "version": 1.2 + } + }, + "employees": [ + { + "id": 1, + "name": "Alice Smith", + "role": "Lead Developer", + "skills": ["Rust", "C++"] + }, + { + "id": 2, + "name": "Bob Jones", + "role": "QA Engineer", + "skills": ["Python", "Selenium"] + } + ], + "settings": { + "retry_attempts": 5, + "timeout_ms": 3000 + } +} \ No newline at end of file diff --git a/tests_data/scan_directory/test.rs b/tests_data/scan_directory/test.rs new file mode 100644 index 00000000..3ea8704d --- /dev/null +++ b/tests_data/scan_directory/test.rs @@ -0,0 +1,36 @@ +use std::collections::HashMap; + +struct User { + username: String, + email: String, + sign_in_count: u64, + active: bool, +} + +impl User { + fn new(username: String, email: String) -> User { + User { + username, + email, + sign_in_count: 1, + active: true, + } + } + + fn deactivate(&mut self) { + self.active = false; + println!("User {} has been deactivated.", self.username); + } +} + +fn main() { + let mut users = HashMap::new(); + let user1 = User::new(String::from("rust_fan"), String::from("rust@example.com")); + + users.insert("u1", user1); + + match users.get_mut("u1") { + Some(u) => u.deactivate(), + None => println!("User not found"), + } +} \ No newline at end of file diff --git a/tests_data/scan_directory/test.ts b/tests_data/scan_directory/test.ts new file mode 100644 index 00000000..5773bc3a --- /dev/null +++ b/tests_data/scan_directory/test.ts @@ -0,0 +1,36 @@ +interface Task { + id: number; + title: string; + completed: boolean; + completedAt?: Date; // Optional property +} + +class TaskManager { + private tasks: Task[] = []; + + addTask(title: string): void { + const newTask: Task = { + id: this.tasks.length + 1, + title: title, + completed: false + }; + this.tasks.push(newTask); + } + + completeTask(id: number): void { + const task = this.tasks.find(t => t.id === id); + if (task) { + task.completed = true; + task.completedAt = new Date(); + console.log(`Task '${task.title}' marked as done.`); + } + } + + listTasks(): Task[] { + return this.tasks; + } +} + +const manager = new TaskManager(); +manager.addTask("Learn TypeScript"); +manager.completeTask(1); \ No newline at end of file From 57843b130a5671bdf6469a653d94e0626757f99e Mon Sep 17 00:00:00 2001 From: Ricardo Date: Sat, 29 Nov 2025 22:23:40 -0500 Subject: [PATCH 05/13] cleanup --- test_magika.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test_magika.py b/test_magika.py index a5272fd3..301779a0 100644 --- a/test_magika.py +++ b/test_magika.py @@ -2,13 +2,8 @@ import sys import os -# Get the absolute path to the folder containing magika.py -# We go: current dir -> python -> src -> magika module_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'python', 'src')) -# --- THE FIX --- -# Use insert(0, path) instead of append(path) -# This puts your folder at the TOP of the search list if module_path not in sys.path: sys.path.insert(0, module_path) From 23602718124dc4fcd56403c7278f34e32efc6822 Mon Sep 17 00:00:00 2001 From: Christian Lopez Date: Sun, 30 Nov 2025 17:23:01 -0400 Subject: [PATCH 06/13] added recursive scanning to scan_directory --- python/src/magika/magika.py | 12 +++++++----- test_magika.py | 19 ------------------- 2 files changed, 7 insertions(+), 24 deletions(-) delete mode 100644 test_magika.py diff --git a/python/src/magika/magika.py b/python/src/magika/magika.py index ece293c1..fb1ba574 100644 --- a/python/src/magika/magika.py +++ b/python/src/magika/magika.py @@ -166,9 +166,9 @@ def identify_paths( ) return self._get_results_from_paths(paths_) - - def scan_directory(self, directory: Union[str, os.PathLike]) -> List[MagikaResult]: - """Identify the content type of a all files in a directory given its path.""" + + def scan_directory(self, directory: Union[str, os.PathLike], recursive_scan=False) -> List[MagikaResult]: + """Identify the content type of all files in a directory given its path.""" path_obj = Path(directory) # Guard clause: check if directory exists @@ -177,8 +177,10 @@ def scan_directory(self, directory: Union[str, os.PathLike]) -> List[MagikaResul collected_paths: List[Union[str, os.PathLike]] = [] - # rglob('*') recursively finds all files and directories - for item in path_obj.rglob('*'): + # Use rglob('*') for recursive scan, glob('*') for single directory + glob_pattern = path_obj.rglob('*') if recursive_scan else path_obj.glob('*') + + for item in glob_pattern: # We only want files, not sub-directories themselves if item.is_file(): collected_paths.append(item) diff --git a/test_magika.py b/test_magika.py deleted file mode 100644 index 301779a0..00000000 --- a/test_magika.py +++ /dev/null @@ -1,19 +0,0 @@ -#from magika import Magika -import sys -import os - -module_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'python', 'src')) - -if module_path not in sys.path: - sys.path.insert(0, module_path) - -import magika -#m = Magika() -m = magika.Magika() -#res = m.identify_path('./tests_data/basic/python') -res = m.scan_directory('./tests_data/scan_directory') -print("------------------ PREDICTIONS ------------------") -for result in res: - print(result) - print() -#print(res) \ No newline at end of file From a5c0ab045788a49c7cbc89f330d08b8a77d363dd Mon Sep 17 00:00:00 2001 From: Anthony Vargas <90121982+Speedrunyourknowledge@users.noreply.github.com> Date: Mon, 1 Dec 2025 04:01:39 -0500 Subject: [PATCH 07/13] Fix format for nightly rust --- rust/cli/src/main.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rust/cli/src/main.rs b/rust/cli/src/main.rs index 09481b72..fdb1a9aa 100644 --- a/rust/cli/src/main.rs +++ b/rust/cli/src/main.rs @@ -21,8 +21,7 @@ use std::sync::Arc; use anyhow::{bail, ensure, Result}; use clap::{Args, Parser}; -use colored::ColoredString; -use colored::Colorize; +use colored::{ColoredString, Colorize}; use magika_lib::{ self as magika, ContentType, Features, FeaturesOrRuled, FileType, InferredType, OverwriteReason, Session, TypeInfo, From 11eef79234fa6808a6ca5bea48d7c108818060fd Mon Sep 17 00:00:00 2001 From: Anthony Vargas <90121982+Speedrunyourknowledge@users.noreply.github.com> Date: Mon, 1 Dec 2025 04:15:12 -0500 Subject: [PATCH 08/13] Sync generated rust file to fix CI error --- rust/cli/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rust/cli/README.md b/rust/cli/README.md index ebaf28e7..f0dcb3d1 100644 --- a/rust/cli/README.md +++ b/rust/cli/README.md @@ -142,6 +142,9 @@ Options: --no-dereference Identifies symbolic links as is instead of identifying their content by following them + --summary + Prints a summary of file types at the end of the output + --colors Prints with colors regardless of terminal support From 6b451b0872d11d5f4710076a549e1c20ef852ff4 Mon Sep 17 00:00:00 2001 From: Anthony Vargas <90121982+Speedrunyourknowledge@users.noreply.github.com> Date: Mon, 1 Dec 2025 16:53:31 -0500 Subject: [PATCH 09/13] Fix syntax of scan_directory test --- python/tests/test_magika_python_module.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/test_magika_python_module.py b/python/tests/test_magika_python_module.py index c7595e64..a1d6fd57 100644 --- a/python/tests/test_magika_python_module.py +++ b/python/tests/test_magika_python_module.py @@ -76,7 +76,7 @@ def test_magika_module_with_basic_tests_by_directory() -> None: tests_paths = utils.get_directory_test_dir() m = Magika() - results = m.scan_directory(tests_paths) + _ = m.scan_directory(tests_paths) def test_magika_module_with_basic_tests_by_paths() -> None: From b7978e205f862626c5212762af63f0e4d69daabc Mon Sep 17 00:00:00 2001 From: Anthony Vargas <90121982+Speedrunyourknowledge@users.noreply.github.com> Date: Mon, 1 Dec 2025 17:15:00 -0500 Subject: [PATCH 10/13] Fix format with ruff --- python/src/magika/magika.py | 16 ++++++++++------ python/tests/test_magika_python_module.py | 1 + 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/python/src/magika/magika.py b/python/src/magika/magika.py index fb1ba574..081ab4ce 100644 --- a/python/src/magika/magika.py +++ b/python/src/magika/magika.py @@ -166,20 +166,24 @@ def identify_paths( ) return self._get_results_from_paths(paths_) - - def scan_directory(self, directory: Union[str, os.PathLike], recursive_scan=False) -> List[MagikaResult]: + + def scan_directory( + self, directory: Union[str, os.PathLike], recursive_scan=False + ) -> List[MagikaResult]: """Identify the content type of all files in a directory given its path.""" path_obj = Path(directory) - + # Guard clause: check if directory exists if not path_obj.exists() or not path_obj.is_dir(): - raise FileNotFoundError(f"The directory '{directory}' does not exist or is not a directory.") + raise FileNotFoundError( + f"The directory '{directory}' does not exist or is not a directory." + ) collected_paths: List[Union[str, os.PathLike]] = [] # Use rglob('*') for recursive scan, glob('*') for single directory - glob_pattern = path_obj.rglob('*') if recursive_scan else path_obj.glob('*') - + glob_pattern = path_obj.rglob("*") if recursive_scan else path_obj.glob("*") + for item in glob_pattern: # We only want files, not sub-directories themselves if item.is_file(): diff --git a/python/tests/test_magika_python_module.py b/python/tests/test_magika_python_module.py index a1d6fd57..d31b7dfa 100644 --- a/python/tests/test_magika_python_module.py +++ b/python/tests/test_magika_python_module.py @@ -72,6 +72,7 @@ def test_magika_module_with_explicit_model_dir() -> None: with open(test_path, "rb") as f: _ = m.identify_stream(f) + def test_magika_module_with_basic_tests_by_directory() -> None: tests_paths = utils.get_directory_test_dir() From fc502c7be5293a7bc088c3ed2d87e1c511edc881 Mon Sep 17 00:00:00 2001 From: Anthony Vargas <90121982+Speedrunyourknowledge@users.noreply.github.com> Date: Mon, 1 Dec 2025 17:19:59 -0500 Subject: [PATCH 11/13] Fix missing type annotation --- python/src/magika/magika.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/src/magika/magika.py b/python/src/magika/magika.py index 081ab4ce..120efad4 100644 --- a/python/src/magika/magika.py +++ b/python/src/magika/magika.py @@ -168,7 +168,7 @@ def identify_paths( return self._get_results_from_paths(paths_) def scan_directory( - self, directory: Union[str, os.PathLike], recursive_scan=False + self, directory: Union[str, os.PathLike], recursive_scan: bool = False ) -> List[MagikaResult]: """Identify the content type of all files in a directory given its path.""" path_obj = Path(directory) From f8322bfea3445f44e4b30cc3eeddb5575fa8830e Mon Sep 17 00:00:00 2001 From: Anthony Vargas <90121982+Speedrunyourknowledge@users.noreply.github.com> Date: Tue, 2 Dec 2025 00:40:26 -0500 Subject: [PATCH 12/13] Fix incomplete unit tests for scan_directory --- python/src/magika/magika.py | 2 +- python/tests/test_magika_python_module.py | 14 ++++++-- python/tests/utils.py | 4 +-- tests_data/directory/txt/complex-sentence.txt | 1 + tests_data/scan_directory/code2.py | 32 ----------------- tests_data/scan_directory/test.json | 31 ---------------- tests_data/scan_directory/test.rs | 36 ------------------- tests_data/scan_directory/test.ts | 36 ------------------- 8 files changed, 16 insertions(+), 140 deletions(-) create mode 100644 tests_data/directory/txt/complex-sentence.txt delete mode 100644 tests_data/scan_directory/code2.py delete mode 100644 tests_data/scan_directory/test.json delete mode 100644 tests_data/scan_directory/test.rs delete mode 100644 tests_data/scan_directory/test.ts diff --git a/python/src/magika/magika.py b/python/src/magika/magika.py index 120efad4..0ee892c9 100644 --- a/python/src/magika/magika.py +++ b/python/src/magika/magika.py @@ -182,7 +182,7 @@ def scan_directory( collected_paths: List[Union[str, os.PathLike]] = [] # Use rglob('*') for recursive scan, glob('*') for single directory - glob_pattern = path_obj.rglob("*") if recursive_scan else path_obj.glob("*") + glob_pattern = sorted(path_obj.rglob("*")) if recursive_scan else sorted(path_obj.glob("*")) for item in glob_pattern: # We only want files, not sub-directories themselves diff --git a/python/tests/test_magika_python_module.py b/python/tests/test_magika_python_module.py index d31b7dfa..2ac1aa61 100644 --- a/python/tests/test_magika_python_module.py +++ b/python/tests/test_magika_python_module.py @@ -74,10 +74,20 @@ def test_magika_module_with_explicit_model_dir() -> None: def test_magika_module_with_basic_tests_by_directory() -> None: - tests_paths = utils.get_directory_test_dir() + tests_paths = utils.get_directory_tests_files_dir() m = Magika() - _ = m.scan_directory(tests_paths) + + # Only scan direct children of tests_data/directory. + # Expected output is "directory" content type. + results = m.scan_directory(tests_paths) + direct_children = sorted([p for p in tests_paths.glob("*")]) + check_results_vs_expected_results(direct_children, results) + + # Scan all files recursively. Expected output is content type of each file. + results = m.scan_directory(tests_paths, recursive_scan=True) + all_files = sorted([p for p in tests_paths.rglob("*") if p.is_file()]) + check_results_vs_expected_results(all_files, results) def test_magika_module_with_basic_tests_by_paths() -> None: diff --git a/python/tests/utils.py b/python/tests/utils.py index 8a38eac1..4796b20e 100644 --- a/python/tests/utils.py +++ b/python/tests/utils.py @@ -42,8 +42,8 @@ def get_basic_tests_files_dir() -> Path: return tests_files_dir -def get_directory_test_dir() -> Path: - tests_files_dir = get_tests_data_dir() / "scan_directory" +def get_directory_tests_files_dir() -> Path: + tests_files_dir = get_tests_data_dir() / "directory" assert tests_files_dir.is_dir() return tests_files_dir diff --git a/tests_data/directory/txt/complex-sentence.txt b/tests_data/directory/txt/complex-sentence.txt new file mode 100644 index 00000000..9b6a0c83 --- /dev/null +++ b/tests_data/directory/txt/complex-sentence.txt @@ -0,0 +1 @@ +This is yet another simple test, it includes one simple sentence, but it is not as trivial as other simpler tests. \ No newline at end of file diff --git a/tests_data/scan_directory/code2.py b/tests_data/scan_directory/code2.py deleted file mode 100644 index a2931d6d..00000000 --- a/tests_data/scan_directory/code2.py +++ /dev/null @@ -1,32 +0,0 @@ -import random -import time - -class NumberGuesser: - def __init__(self, limit=100): - self.limit = limit - self.target = random.randint(1, limit) - self.attempts = 0 - - def guess(self, user_input): - self.attempts += 1 - try: - val = int(user_input) - except ValueError: - return "Please enter a valid integer." - - if val < self.target: - return "Too low!" - elif val > self.target: - return "Too high!" - else: - return f"Correct! It took you {self.attempts} tries." - -def main(): - game = NumberGuesser() - print("I'm thinking of a number between 1 and 100.") - # Simulation of a game loop - for i in range(5): - print(f"Simulation guess {i}: {game.guess(i * 20)}") - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/tests_data/scan_directory/test.json b/tests_data/scan_directory/test.json deleted file mode 100644 index df29c36a..00000000 --- a/tests_data/scan_directory/test.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "project": { - "id": 1024, - "name": "MagikaTest", - "isActive": true, - "tags": ["python", "cpp", "automation"], - "metadata": { - "created_at": "2023-10-27T10:00:00Z", - "author": "dev_user", - "version": 1.2 - } - }, - "employees": [ - { - "id": 1, - "name": "Alice Smith", - "role": "Lead Developer", - "skills": ["Rust", "C++"] - }, - { - "id": 2, - "name": "Bob Jones", - "role": "QA Engineer", - "skills": ["Python", "Selenium"] - } - ], - "settings": { - "retry_attempts": 5, - "timeout_ms": 3000 - } -} \ No newline at end of file diff --git a/tests_data/scan_directory/test.rs b/tests_data/scan_directory/test.rs deleted file mode 100644 index 3ea8704d..00000000 --- a/tests_data/scan_directory/test.rs +++ /dev/null @@ -1,36 +0,0 @@ -use std::collections::HashMap; - -struct User { - username: String, - email: String, - sign_in_count: u64, - active: bool, -} - -impl User { - fn new(username: String, email: String) -> User { - User { - username, - email, - sign_in_count: 1, - active: true, - } - } - - fn deactivate(&mut self) { - self.active = false; - println!("User {} has been deactivated.", self.username); - } -} - -fn main() { - let mut users = HashMap::new(); - let user1 = User::new(String::from("rust_fan"), String::from("rust@example.com")); - - users.insert("u1", user1); - - match users.get_mut("u1") { - Some(u) => u.deactivate(), - None => println!("User not found"), - } -} \ No newline at end of file diff --git a/tests_data/scan_directory/test.ts b/tests_data/scan_directory/test.ts deleted file mode 100644 index 5773bc3a..00000000 --- a/tests_data/scan_directory/test.ts +++ /dev/null @@ -1,36 +0,0 @@ -interface Task { - id: number; - title: string; - completed: boolean; - completedAt?: Date; // Optional property -} - -class TaskManager { - private tasks: Task[] = []; - - addTask(title: string): void { - const newTask: Task = { - id: this.tasks.length + 1, - title: title, - completed: false - }; - this.tasks.push(newTask); - } - - completeTask(id: number): void { - const task = this.tasks.find(t => t.id === id); - if (task) { - task.completed = true; - task.completedAt = new Date(); - console.log(`Task '${task.title}' marked as done.`); - } - } - - listTasks(): Task[] { - return this.tasks; - } -} - -const manager = new TaskManager(); -manager.addTask("Learn TypeScript"); -manager.completeTask(1); \ No newline at end of file From 3c4b7aa3c4592c62216e0a87e564d1872300b962 Mon Sep 17 00:00:00 2001 From: Anthony Vargas <90121982+Speedrunyourknowledge@users.noreply.github.com> Date: Tue, 2 Dec 2025 01:18:26 -0500 Subject: [PATCH 13/13] Fix format with ruff --- python/src/magika/magika.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/src/magika/magika.py b/python/src/magika/magika.py index 0ee892c9..0c616f49 100644 --- a/python/src/magika/magika.py +++ b/python/src/magika/magika.py @@ -182,7 +182,11 @@ def scan_directory( collected_paths: List[Union[str, os.PathLike]] = [] # Use rglob('*') for recursive scan, glob('*') for single directory - glob_pattern = sorted(path_obj.rglob("*")) if recursive_scan else sorted(path_obj.glob("*")) + glob_pattern = ( + sorted(path_obj.rglob("*")) + if recursive_scan + else sorted(path_obj.glob("*")) + ) for item in glob_pattern: # We only want files, not sub-directories themselves