From 45feeb201ad54f3143dd7afe3b9364750741c36c Mon Sep 17 00:00:00 2001 From: Sunshine Date: Sun, 14 Jan 2024 10:56:44 -0500 Subject: [PATCH] implement automatic detection of media type --- src/files.rs | 49 +++++++++++++++++++++++ src/lib.rs | 89 +++++++++++++++++++++++++++++++++++++---- src/main.rs | 60 +++++++++++++++++---------- tests/_data_/pixel.png | Bin 0 -> 67 bytes tests/cli/basic.rs | 4 ++ tests/cli/decode.rs | 4 ++ tests/cli/encode.rs | 4 ++ tests/cli/files.rs | 78 ++++++++++++++++++++++++++++++++++++ tests/cli/mod.rs | 1 + tests/mod.rs | 2 + 10 files changed, 263 insertions(+), 28 deletions(-) create mode 100644 src/files.rs create mode 100644 tests/_data_/pixel.png create mode 100644 tests/cli/files.rs diff --git a/src/files.rs b/src/files.rs new file mode 100644 index 0000000..200bcd3 --- /dev/null +++ b/src/files.rs @@ -0,0 +1,49 @@ +const MEDIA_TYPE_HEADERS: [[&[u8]; 2]; 18] = [ + // Image + [b"GIF87a", b"image/gif"], + [b"GIF89a", b"image/gif"], + [b"\xFF\xD8\xFF", b"image/jpeg"], + [b"\x89PNG\x0D\x0A\x1A\x0A", b"image/png"], + [b" "video/avi", +// "bmp" => "image/bmp", +// "css" => "text/css", +// "flac" => "audio/flac", +// "gif" => "image/gif", +// "htm" | "html" => "text/html", +// "ico" => "image/x-icon", +// "jpeg" | "jpg" => "image/jpeg", +// "js" => "application/javascript", +// "json" => "application/json", +// "mp3" => "audio/mpeg", +// "mp4" | "m4v" => "video/mp4", +// "ogg" => "audio/ogg", +// "ogv" => "video/ogg", +// "pdf" => "application/pdf", +// "png" => "image/png", +// "svg" => "image/svg+xml", +// "swf" => "application/x-shockwave-flash", +// "tif" | "tiff" => "image/tiff", +// "txt" => "text/plain", +// "wav" => "audio/wav", +// "webp" => "image/webp", +// "woff" => "font/woff", +// "woff2" => "font/woff2", +// "xml" => "text/xml", diff --git a/src/lib.rs b/src/lib.rs index 0dad6d0..bd830c7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,7 +5,30 @@ use url::Url; const DEFAULT_MEDIA_TYPE: &'static str = "text/plain"; const 
DEFAULT_CHARSET: &'static str = "US-ASCII"; -const TEXTUAL_MEDIA_TYPES: &'static [&str] = &[ +const FILE_SIGNATURES: [[&[u8]; 2]; 18] = [ + // Image + [b"GIF87a", b"image/gif"], + [b"GIF89a", b"image/gif"], + [b"\xFF\xD8\xFF", b"image/jpeg"], + [b"\x89PNG\x0D\x0A\x1A\x0A", b"image/png"], + [b", // US-ASCII is default, according to the spec is_base64_encoded: bool, // Indicates if it's a base64-encoded data URL data: Vec, // Data, bytes, UTF-8 if text - fragment: Option, // #something-at-the-end, None by default + fragment: Option, // #something at the end, None by default } pub enum DataUrlParseError { @@ -49,6 +72,58 @@ impl fmt::Debug for DataUrlParseError { } } +pub fn detect_media_type(data: &[u8], filename: &str) -> String { + // At first attempt to read file's header + for file_signaure in FILE_SIGNATURES.iter() { + if data.starts_with(file_signaure[0]) { + return String::from_utf8(file_signaure[1].to_vec()).unwrap(); + } + } + + // If header didn't match any known magic signatures, + // try to guess media type from file name + detect_media_type_by_file_name(&filename) +} + +pub fn detect_media_type_by_file_name(filename: &str) -> String { + let filename_lowercased: &str = &filename.to_lowercase(); + let parts: Vec<&str> = filename_lowercased.split('.').collect(); + + let mime: &str = match parts.last() { + Some(v) => match *v { + "avi" => "video/avi", + "bmp" => "image/bmp", + "css" => "text/css", + "flac" => "audio/flac", + "gif" => "image/gif", + "htm" | "html" => "text/html", + "ico" => "image/x-icon", + "jpeg" | "jpg" => "image/jpeg", + "js" => "application/javascript", + "json" => "application/json", + "mp3" => "audio/mpeg", + "mp4" | "m4v" => "video/mp4", + "ogg" => "audio/ogg", + "ogv" => "video/ogg", + "pdf" => "application/pdf", + "png" => "image/png", + "svg" => "image/svg+xml", + "swf" => "application/x-shockwave-flash", + "tif" | "tiff" => "image/tiff", + "txt" => "text/plain", + "wav" => "audio/wav", + "webp" => "image/webp", + "woff" => 
"font/woff", + "woff2" => "font/woff2", + "xml" => "text/xml", + &_ => "", + }, + None => "", + }; + + mime.to_string() +} + pub(crate) fn parse_data_url_meta_data( meta_data_string: String, ) -> (Option, Option, bool) { @@ -85,7 +160,7 @@ pub(crate) fn parse_data_url_meta_data( } pub(crate) fn validate_media_type(media_type: &str) -> bool { - // Must contain one slash + // Must contain one forward slash media_type.split('/').collect::>().len() == 2 } @@ -161,18 +236,18 @@ impl DataUrl { } let current_media_type: &str = &self.media_type.as_ref().unwrap(); - let is_textual: bool = if current_media_type.split('/').collect::>()[0] + let is_plaintext: bool = if current_media_type.split('/').collect::>()[0] .eq_ignore_ascii_case("text") { true } else { - TEXTUAL_MEDIA_TYPES + PLAINTEXT_MEDIA_TYPES .iter() .find(|mt| current_media_type.eq_ignore_ascii_case(mt)) .is_some() }; - !is_textual + !is_plaintext } pub fn media_type(&self) -> &str { @@ -316,7 +391,7 @@ impl DataUrl { } if let Some(c) = &self.charset { - // windows-1252 is another name for US-ASCII, the default charset for data URLs + // NOTE: windows-1252 is another name for US-ASCII, the default charset for data URLs if c != "windows-1252" { result += ";charset="; result += &c; diff --git a/src/main.rs b/src/main.rs index 83e8e22..f5125c9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -80,48 +80,48 @@ fn main() { ////////////////////////////////////////////////////////////////////////// - let decode_mode_enabled: bool = app.is_present("decode"); - let string_input_set: bool = app.is_present("INPUT"); + let is_decode_mode_enabled: bool = app.is_present("decode"); + let is_string_input_set: bool = app.is_present("INPUT"); // let stdin_is_a_tty: bool = !io::stdio::stdin_raw().isatty(); - let stdout_is_a_tty: bool = atty::is(Stream::Stdout); - let mut file_input_set: bool = app.is_present("INPUT FILE"); - let mut file_output_set: bool = app.is_present("OUTPUT FILE"); - let input_file_path: &str = if file_input_set { 
+ let is_stdout_a_tty: bool = atty::is(Stream::Stdout); + let mut is_file_input_set: bool = app.is_present("INPUT FILE"); + let mut is_file_output_set: bool = app.is_present("OUTPUT FILE"); + let input_file_path: &str = if is_file_input_set { app.value_of("INPUT FILE").unwrap() } else { "-" }; - let output_file_path: &str = if file_output_set { + let output_file_path: &str = if is_file_output_set { app.value_of("OUTPUT FILE").unwrap() } else { "-" }; - if file_input_set && input_file_path == "-" { - file_input_set = false; + if is_file_input_set && input_file_path == "-" { + is_file_input_set = false; } - if file_output_set && output_file_path == "-" { - file_output_set = false; + if is_file_output_set && output_file_path == "-" { + is_file_output_set = false; } - let file_input_set = file_input_set; - let file_output_set = file_output_set; + let is_file_input_set = is_file_input_set; + let is_file_output_set = is_file_output_set; ////////////////////////////////////////////////////////////////////////// - if string_input_set && file_input_set { + if is_string_input_set && is_file_input_set { eprintln!("error: Both file and argument inputs provided"); std::process::exit(1); } - if !stdout_is_a_tty && file_output_set { + if !is_stdout_a_tty && is_file_output_set { eprintln!("error: Both stdout and argument output provided"); std::process::exit(1); } ////////////////////////////////////////////////////////////////////////// - let input: Vec = if string_input_set { + let input: Vec = if is_string_input_set { app.value_of("INPUT").unwrap().as_bytes().to_vec() - } else if file_input_set { + } else if is_file_input_set { match fs::read(input_file_path) { Ok(input_file_data) => input_file_data, Err(_) => { @@ -136,15 +136,19 @@ fn main() { ////////////////////////////////////////////////////////////////////////// - if decode_mode_enabled { + if is_decode_mode_enabled { + //////////// + // Decode // + //////////// + // TODO: ideally the program needs to check the current 
terminal locale (encoding), and not just assume it's UTF-8 let input_as_string: String = String::from_utf8_lossy(&input).to_string(); std::process::exit(match DataUrl::parse(&input_as_string) { Ok(data_url) => { - if !stdout_is_a_tty || file_output_set || data_url.is_binary() { + if !is_stdout_a_tty || is_file_output_set || data_url.is_binary() { // Write raw bytes if the output is a file, or if the contents of this data URL has binary format - if file_output_set { + if is_file_output_set { let mut handle = fs::File::create(output_file_path).unwrap(); handle.write_all(data_url.data()).unwrap(); } else { @@ -164,6 +168,10 @@ fn main() { } }); } else { + //////////// + // Encode // + //////////// + let mut data_url = DataUrl::new(); data_url.set_data(&input); @@ -186,7 +194,7 @@ fn main() { // TODO: ideally the program needs to check the current terminal locale (encoding), and not just assume it's UTF-8 // Automatically enforce ;charset=UTF-8 for non-ascii argument inputs - if string_input_set && !String::from_utf8_lossy(&input).to_string().is_ascii() { + if is_string_input_set && !String::from_utf8_lossy(&input).to_string().is_ascii() { data_url.set_charset(Some("UTF-8".to_string())); } } @@ -199,6 +207,16 @@ fn main() { eprintln!("error: Invalid media type '{}'", media_type); std::process::exit(1); } + } else { + if is_file_input_set { + if input_file_path.ends_with(".png") { + data_url.set_media_type(Some("image/png".to_string())); + } else { + data_url.set_media_type(Some("text/plain".to_string())); + } + } + // TODO: try to automatically detect file type from file name / header + // data_url.set_media_type(Some("text/TODO".to_string())); } if app.is_present("FRAGMENT") { diff --git a/tests/_data_/pixel.png b/tests/_data_/pixel.png new file mode 100644 index 0000000000000000000000000000000000000000..d0ff224445c97605c8277061eb4a72e1fa378389 GIT binary patch literal 67 zcmeAS@N?(olHy`uVBq!ia0vp^j3CUx1|;Q0k8}blE>9Q7kP60RkPIsWquGUyyFgI} MPgg&ebxsLQ083~K+W-In literal 
0 HcmV?d00001 diff --git a/tests/cli/basic.rs b/tests/cli/basic.rs index 0c765d4..4db1c48 100644 --- a/tests/cli/basic.rs +++ b/tests/cli/basic.rs @@ -1,9 +1,11 @@ +// // ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗ // ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝ // ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗ // ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║ // ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝ // ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ +// #[cfg(test)] mod passing { @@ -73,12 +75,14 @@ ARGS: } } +// // ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗ // ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝ // █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗ // ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║ // ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝ // ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ +// #[cfg(test)] mod failing { diff --git a/tests/cli/decode.rs b/tests/cli/decode.rs index ec8009f..46755d7 100644 --- a/tests/cli/decode.rs +++ b/tests/cli/decode.rs @@ -1,9 +1,11 @@ +// // ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗ // ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝ // ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗ // ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║ // ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝ // ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ +// #[cfg(test)] mod passing { @@ -47,12 +49,14 @@ mod passing { } } +// // ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗ // ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝ // █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗ // ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║ // ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝ // ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ +// #[cfg(test)] mod failing { diff --git a/tests/cli/encode.rs b/tests/cli/encode.rs index db121f9..c85221e 100644 --- a/tests/cli/encode.rs +++ b/tests/cli/encode.rs @@ -1,9 +1,11 @@ +// // ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗ // ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ 
██║██╔════╝ // ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗ // ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║ // ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝ // ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ +// #[cfg(test)] mod passing { @@ -138,12 +140,14 @@ mod passing { } } +// // ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗ // ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝ // █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗ // ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║ // ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝ // ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ +// #[cfg(test)] mod failing { diff --git a/tests/cli/files.rs b/tests/cli/files.rs new file mode 100644 index 0000000..36791f6 --- /dev/null +++ b/tests/cli/files.rs @@ -0,0 +1,78 @@ +// +// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗ +// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝ +// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗ +// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║ +// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝ +// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ +// + +#[cfg(test)] +mod passing { + use assert_cmd::prelude::*; + use std::process::Command; + + #[test] + fn must_properly_read_and_encode_basic_text_file() { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + let assert = cmd.arg("-i").arg("tests/_data_/text-file.txt").assert(); + + assert + // Exit code must be 0 + .success() + // STDERR must be empty + .stderr("") + // STDOUT must be empty + .stdout("data:text/plain,some%20content%0A\n"); + } + + #[test] + fn must_properly_read_and_base64_encode_basic_text_file() { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + let assert = cmd + .arg("-b") + .arg("-i") + .arg("tests/_data_/text-file.txt") + .assert(); + + assert + // Exit code must be 0 + .success() + // STDERR must be empty + .stderr("") + // STDOUT must be empty + .stdout("data:text/plain;base64,c29tZSBjb250ZW50Cg==\n"); + } + + 
#[test] + fn must_properly_read_and_encode_image_file() { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + let assert = cmd.arg("-i").arg("tests/_data_/pixel.png").assert(); + + assert + // Exit code must be 0 + .success() + // STDERR must be empty + .stderr("") + // STDOUT must match the expected output + .stdout("data:image/png,%89PNG%0D%0A%1A%0A%00%00%00%0DIHDR%00%00%00%01%00%00%00%01%08%06%00%00%00%1F%15%C4%89%00%00%00%0AIDATx%01c%00%01%00%00%05%00%016%D0%88%DD%00%00%00%00IEND%AEB%60%82\n"); + } + + #[test] + fn must_properly_read_and_base64_encode_image_file() { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + let assert = cmd + .arg("-b") + .arg("-i") + .arg("tests/_data_/pixel.png") + .assert(); + + assert + // Exit code must be 0 + .success() + // STDERR must be empty + .stderr("") + // STDOUT must match the expected output + .stdout("\n"); + } +} diff --git a/tests/cli/mod.rs b/tests/cli/mod.rs index 030b83a..48826ad 100644 --- a/tests/cli/mod.rs +++ b/tests/cli/mod.rs @@ -1,3 +1,4 @@ mod basic; mod decode; mod encode; +mod files; diff --git a/tests/mod.rs b/tests/mod.rs index 3e6b1f7..e6cfb8b 100644 --- a/tests/mod.rs +++ b/tests/mod.rs @@ -1,2 +1,4 @@ +#![allow(special_module_name)] + mod cli; mod lib;