diff --git a/Cargo.toml b/Cargo.toml
index f097286..100afa0 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,5 +1,5 @@
 [workspace]
-members = ["crates/office2pdf", "crates/office2pdf-cli"]
+members = ["crates/office2pdf", "crates/office2pdf-cli", "crates/office2typst"]
 resolver = "3"
 
 [workspace.package]
diff --git a/README.md b/README.md
index dcbaae2..6796a43 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,13 @@ No LibreOffice, no Chromium, no Docker — just a single binary powered by [Typs
 office2pdf = "0.3"
 ```
 
+For `OOXML -> IR -> Typst` (without PDF compilation), use the workspace crate:
+
+```toml
+[dependencies]
+office2typst = { path = "crates/office2typst" }
+```
+
 ### CLI
 
 ```sh
@@ -65,6 +72,22 @@ let result = office2pdf::convert_bytes(
 std::fs::write("report.pdf", &result.pdf).unwrap();
 ```
 
+### OOXML to Typst (no PDF compile)
+
+```rust
+use office2typst::config::{ConvertOptions, Format};
+
+let bytes = std::fs::read("report.docx").unwrap();
+let result = office2typst::convert_bytes(&bytes, Format::Docx, &ConvertOptions::default()).unwrap();
+
+// Parsed IR document
+let _doc = result.document;
+
+// Typst source string
+let typst_source = result.typst.source;
+println!("{}", typst_source);
+```
+
 ### CLI
 
 ```sh
diff --git a/crates/office2typst/Cargo.toml b/crates/office2typst/Cargo.toml
new file mode 100644
index 0000000..6bd30b1
--- /dev/null
+++ b/crates/office2typst/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+name = "office2typst"
+version = "0.1.0"
+edition.workspace = true
+rust-version.workspace = true
+license.workspace = true
+repository.workspace = true
+description = "Convert DOCX, XLSX, and PPTX to Typst via Office OOXML parsing"
+readme = "../../README.md"
+keywords = ["typst", "docx", "xlsx", "pptx", "converter"]
+categories = ["text-processing"]
+
+[dependencies]
+office2pdf = { path = "../office2pdf" }
+
+[dev-dependencies]
+docx-rs = "0.4"
diff --git a/crates/office2typst/src/lib.rs b/crates/office2typst/src/lib.rs
new file mode 100644
index 0000000..d3ce73b
--- /dev/null
+++ b/crates/office2typst/src/lib.rs
@@ -0,0 +1,209 @@
+//! Convert OOXML documents (DOCX, PPTX, XLSX) to Typst source without compiling a PDF.
+//!
+//! Thin facade over `office2pdf`: parse the input into the shared IR, then render that IR
+//! as Typst source.
+
+pub use office2pdf::config;
+pub use office2pdf::error;
+pub use office2pdf::ir;
+
+use std::collections::HashSet;
+
+use office2pdf::config::{ConvertOptions, Format};
+use office2pdf::error::{ConvertError, ConvertWarning};
+use office2pdf::ir::Document;
+use office2pdf::parser::{self, Parser};
+use office2pdf::render::typst_gen::{self, TypstOutput};
+
+/// Result of a conversion: the parsed IR, the generated Typst output, and parser warnings.
+#[derive(Debug)]
+pub struct TypstResult {
+    /// IR document produced by the parser.
+    pub document: Document,
+    /// Typst output generated from `document`.
+    pub typst: TypstOutput,
+    /// Parser warnings, de-duplicated by their display text.
+    pub warnings: Vec<ConvertWarning>,
+}
+
+/// Signature bytes at the start of an OLE2 compound file.
+const OLE2_MAGIC: [u8; 8] = [0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1];
+
+/// Returns `true` when `data` starts with [`OLE2_MAGIC`].
+fn is_ole2(data: &[u8]) -> bool {
+    data.starts_with(&OLE2_MAGIC)
+}
+
+/// Extracts a readable message from a panic payload.
+///
+/// `String` and `&str` payloads are returned as text; anything else yields `"unknown panic"`.
+fn extract_panic_message(payload: &(dyn std::any::Any + Send)) -> String {
+    if let Some(s) = payload.downcast_ref::<String>() {
+        s.clone()
+    } else if let Some(s) = payload.downcast_ref::<&str>() {
+        (*s).to_string()
+    } else {
+        "unknown panic".to_string()
+    }
+}
+
+/// Drops warnings whose display text was already seen, keeping the first occurrence.
+fn dedup_warnings(warnings: &mut Vec<ConvertWarning>) {
+    let mut seen: HashSet<String> = HashSet::new();
+    warnings.retain(|warning| seen.insert(warning.to_string()));
+}
+
+/// Returns the parser implementation for `format`.
+fn parser_for_format(format: Format) -> Box<dyn Parser> {
+    match format {
+        Format::Docx => Box::new(parser::docx::DocxParser),
+        Format::Pptx => Box::new(parser::pptx::PptxParser),
+        Format::Xlsx => Box::new(parser::xlsx::XlsxParser),
+    }
+}
+
+/// Parses in-memory `data` of the given `format` into the IR.
+///
+/// # Errors
+///
+/// - [`ConvertError::UnsupportedEncryption`] when `data` starts with the OLE2 signature.
+/// - [`ConvertError::Parse`] when the parser panics (the panic is caught, not propagated).
+/// - Any error returned by the parser itself.
+pub fn parse_bytes(
+    data: &[u8],
+    format: Format,
+    options: &ConvertOptions,
+) -> Result<(Document, Vec<ConvertWarning>), ConvertError> {
+    if is_ole2(data) {
+        return Err(ConvertError::UnsupportedEncryption);
+    }
+
+    let parser: Box<dyn Parser> = parser_for_format(format);
+    let parse_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
+        parser.parse(data, options)
+    }));
+
+    let (document, mut warnings) = match parse_result {
+        Ok(result) => result?,
+        Err(panic_info) => {
+            // Pass the payload itself (`&*`), not a reference to the `Box`, so the downcasts
+            // inspect the payload's type.
+            return Err(ConvertError::Parse(format!(
+                "upstream parser panicked: {}",
+                extract_panic_message(&*panic_info)
+            )))
+        }
+    };
+
+    dedup_warnings(&mut warnings);
+    Ok((document, warnings))
+}
+
+/// Parses the file at `path`, choosing the format from its extension.
+///
+/// # Errors
+///
+/// Returns [`ConvertError::UnsupportedFormat`] when the extension is missing, is not valid
+/// UTF-8, or is rejected by [`Format::from_extension`]. File read failures and
+/// [`parse_bytes`] errors are propagated.
+#[cfg(not(target_arch = "wasm32"))]
+pub fn parse_path(
+    path: impl AsRef<std::path::Path>,
+    options: &ConvertOptions,
+) -> Result<(Document, Vec<ConvertWarning>), ConvertError> {
+    let path = path.as_ref();
+    let ext = path
+        .extension()
+        .and_then(|e| e.to_str())
+        .ok_or_else(|| ConvertError::UnsupportedFormat("no file extension".to_string()))?;
+    let format =
+        Format::from_extension(ext).ok_or_else(|| ConvertError::UnsupportedFormat(ext.to_string()))?;
+    let data: Vec<u8> = std::fs::read(path)?;
+    parse_bytes(&data, format, options)
+}
+
+/// Renders `doc` as Typst source.
+pub fn generate_typst(doc: &Document) -> Result<TypstOutput, ConvertError> {
+    typst_gen::generate_typst(doc)
+}
+
+/// Renders `doc` as Typst source using `options`.
+pub fn generate_typst_with_options(
+    doc: &Document,
+    options: &ConvertOptions,
+) -> Result<TypstOutput, ConvertError> {
+    typst_gen::generate_typst_with_options(doc, options)
+}
+
+/// Parses `data` and renders the resulting IR to Typst in one call.
+pub fn convert_bytes(
+    data: &[u8],
+    format: Format,
+    options: &ConvertOptions,
+) -> Result<TypstResult, ConvertError> {
+    let (document, warnings) = parse_bytes(data, format, options)?;
+    let typst = generate_typst_with_options(&document, options)?;
+    Ok(TypstResult {
+        document,
+        typst,
+        warnings,
+    })
+}
+
+/// Parses the file at `path` and renders the resulting IR to Typst in one call.
+#[cfg(not(target_arch = "wasm32"))]
+pub fn convert_path(
+    path: impl AsRef<std::path::Path>,
+    options: &ConvertOptions,
+) -> Result<TypstResult, ConvertError> {
+    let (document, warnings) = parse_path(path, options)?;
+    let typst = generate_typst_with_options(&document, options)?;
+    Ok(TypstResult {
+        document,
+        typst,
+        warnings,
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn make_minimal_docx() -> Vec<u8> {
+        let doc = docx_rs::Docx::new().add_paragraph(
+            docx_rs::Paragraph::new().add_run(docx_rs::Run::new().add_text("Hello Typst")),
+        );
+        let mut cursor = std::io::Cursor::new(Vec::new());
+        doc.build()
+            .pack(&mut cursor)
+            .expect("DOCX pack should succeed");
+        cursor.into_inner()
+    }
+
+    #[test]
+    fn convert_bytes_docx_produces_typst_source() {
+        let docx = make_minimal_docx();
+        let result = convert_bytes(&docx, Format::Docx, &ConvertOptions::default())
+            .expect("DOCX should convert to Typst");
+        assert!(
+            result.typst.source.contains("Hello Typst"),
+            "Typst source should contain original text, got: {}",
+            result.typst.source
+        );
+        assert!(
+            !result.document.pages.is_empty(),
+            "Parsed document should contain at least one page"
+        );
+    }
+
+    #[cfg(not(target_arch = "wasm32"))]
+    #[test]
+    fn parse_path_rejects_unsupported_extension() {
+        let err = parse_path("/tmp/not-supported.txt", &ConvertOptions::default())
+            .expect_err("Unsupported extension should return an error");
+        match err {
+            ConvertError::UnsupportedFormat(ext) => assert_eq!(ext, "txt"),
+            other => panic!("Expected UnsupportedFormat, got {other:?}"),
+        }
+    }
+}