diff --git a/Cargo.lock b/Cargo.lock index dca0647..c4b3e00 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -124,7 +124,7 @@ checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "forgekit" -version = "0.0.3" +version = "0.0.5" dependencies = [ "anyhow", "clap", @@ -134,7 +134,7 @@ dependencies = [ [[package]] name = "forgekit-core" -version = "0.0.3" +version = "0.0.5" dependencies = [ "anyhow", "serde", diff --git a/Cargo.toml b/Cargo.toml index 1fe4333..ce30ff4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ members = ["crates/core", "crates/cli"] resolver = "2" [workspace.package] -version = "0.0.3" +version = "0.0.5" edition = "2021" authors = ["ForgeKit Contributors"] license = "MIT" diff --git a/ROADMAP.md b/ROADMAP.md index 939c6fb..9c00095 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -1,8 +1,8 @@ # ForgeKit Beta-MVP Roadmap (CLI-First) -**Last Updated**: December 2025 -**Status**: Core foundation complete, PDF merge/split/compress/extract implemented, dependency management complete -**Current Version**: v0.0.4 +**Last Updated**: January 2026 +**Status**: Core foundation complete, PDF merge/split/compress/extract/ocr/metadata implemented, dependency management complete +**Current Version**: v0.0.5 ## Versioning Strategy @@ -76,20 +76,22 @@ - ✅ Tests for new PDF operations - ✅ Integration with existing pages grammar parser -#### v0.0.5 - PDF OCR and Metadata 📋 +#### v0.0.5 - PDF OCR and Metadata ✅ -**Status**: Pending +**Status**: Completed **Deliverables**: -- PDF OCR command (`pdf ocr`) with language selection -- PDF metadata command (`pdf metadata`) with get/set operations -- ocrmypdf tool adapter (Python wrapper handling) -- exiftool adapter for metadata operations -- OCR progress reporting (parse ocrmypdf output) -- Metadata read/write operations -- Tests for OCR and metadata operations -- Error handling for OCR failures +- ✅ PDF OCR command (`pdf ocr`) with language selection +- ✅ PDF metadata command 
(`pdf metadata`) with get/set operations +- ✅ ocrmypdf tool adapter (Python wrapper handling) +- ✅ exiftool adapter for metadata operations +- ✅ OCR progress reporting (basic progress events) +- ✅ Metadata read/write operations (get all, get field, set fields) +- ✅ Tests for OCR and metadata operations (9 new tests) +- ✅ Error handling for OCR failures +- ✅ OCR options: --skip-text, --deskew, --force-ocr +- ✅ Integration with check-deps command #### v0.0.6 - Image Operations 📋 @@ -221,15 +223,14 @@ - **Testing**: 20 tests passing (13 unit, 2 integration, 5 doc tests) - **Dependency Checking**: `check-deps` command implemented -**Current Version**: v0.0.4 (completed) +**Current Version**: v0.0.5 (completed) ### 🔄 In Progress -- **v0.0.5**: PDF OCR and Metadata +- **v0.0.6**: Image Operations ### 📋 Pending Features -- PDF: OCR, metadata - Image: convert, resize, strip - Media: transcode, audio convert/normalize - Preset system (YAML) ✅ diff --git a/crates/cli/src/commands/check.rs b/crates/cli/src/commands/check.rs index 904520a..565adec 100644 --- a/crates/cli/src/commands/check.rs +++ b/crates/cli/src/commands/check.rs @@ -1,4 +1,6 @@ +use forgekit_core::tools::exiftool::ExiftoolTool; use forgekit_core::tools::gs::GsTool; +use forgekit_core::tools::ocrmypdf::OcrmypdfTool; use forgekit_core::tools::qpdf::QpdfTool; use forgekit_core::tools::{Tool, ToolConfig}; use forgekit_core::utils::error::Result; @@ -19,7 +21,9 @@ pub fn handle_check_deps() -> Result<()> { let tools: Vec<(&'static str, Box)> = vec![ ("qpdf", Box::new(QpdfTool)), ("gs", Box::new(GsTool)), - // TODO: Add other tools (ocrmypdf, ffmpeg, libvips, etc.) + ("ocrmypdf", Box::new(OcrmypdfTool)), + ("exiftool", Box::new(ExiftoolTool)), + // TODO: Add other tools (ffmpeg, libvips, etc.) 
]; let mut all_ok = true; diff --git a/crates/cli/src/commands/pdf.rs b/crates/cli/src/commands/pdf.rs index 70599ac..ba30111 100644 --- a/crates/cli/src/commands/pdf.rs +++ b/crates/cli/src/commands/pdf.rs @@ -1,4 +1,5 @@ use clap::{Args, Subcommand}; +use forgekit_core::job::spec::MetadataAction; use forgekit_core::job::JobSpec; use forgekit_core::utils::error::Result; use forgekit_core::utils::pages::PageSpec; @@ -55,6 +56,29 @@ pub enum PdfCommand { /// Page spec: numbers (1), ranges (1-5, 7-), keywords (odd, even), exclusions (!2) /// Formats: pdf (default), png, jpeg/jpg (image files per page) Extract(ExtractArgs), + /// Add OCR text layer to a scanned PDF + /// + /// Makes scanned PDFs searchable by adding an invisible text layer. + /// + /// Examples: + /// forgekit pdf ocr scan.pdf --output searchable.pdf + /// forgekit pdf ocr scan.pdf --output searchable.pdf --language deu + /// forgekit pdf ocr mixed.pdf --output searchable.pdf --skip-text + /// forgekit pdf ocr tilted.pdf --output fixed.pdf --deskew + /// + /// Language codes: eng (English), deu (German), fra (French), spa (Spanish), etc. + /// Use 'tesseract --list-langs' to see available languages. + Ocr(OcrArgs), + /// Read or write PDF metadata (title, author, etc.) 
+ /// + /// Examples: + /// forgekit pdf metadata doc.pdf # Show all metadata + /// forgekit pdf metadata doc.pdf --get title # Get specific field + /// forgekit pdf metadata doc.pdf --set title="My Doc" # Set a field + /// forgekit pdf metadata doc.pdf --set title="My Doc" --set author="John" --output updated.pdf + /// + /// Supported fields: title, author, subject, keywords, creator, producer + Metadata(MetadataArgs), } #[derive(Args, Clone)] @@ -169,6 +193,67 @@ pub struct ExtractArgs { pub format: String, } +#[derive(Args, Clone)] +pub struct OcrArgs { + /// Input PDF file + #[arg( + required = true, + help = "Input PDF file (typically a scanned document)" + )] + pub input: PathBuf, + + /// Output PDF file path + #[arg(short, long, required = true, help = "Output PDF file path")] + pub output: PathBuf, + + /// OCR language (e.g., "eng", "deu", "fra") + #[arg( + short, + long, + default_value = "eng", + help = "OCR language code (e.g., eng, deu, fra, spa)" + )] + pub language: String, + + /// Skip pages that already have text + #[arg( + long, + help = "Skip pages that already have text (faster for mixed documents)" + )] + pub skip_text: bool, + + /// Deskew pages before OCR (fixes tilted scans) + #[arg(long, help = "Deskew pages before OCR (corrects tilted scans)")] + pub deskew: bool, + + /// Force OCR even if text already exists + #[arg(long, help = "Force OCR even if text already exists (redo OCR)")] + pub force_ocr: bool, +} + +#[derive(Args, Clone)] +pub struct MetadataArgs { + /// Input PDF file + #[arg(required = true, help = "Input PDF file")] + pub input: PathBuf, + + /// Output PDF file path (only required for --set) + #[arg(short, long, help = "Output PDF file path (only required for --set)")] + pub output: Option, + + /// Get a specific metadata field + #[arg(long, help = "Get a specific metadata field (e.g., title, author)")] + pub get: Option, + + /// Set metadata fields (can be repeated). 
Format: field=value + #[arg( + long = "set", + value_name = "FIELD=VALUE", + help = "Set metadata field (e.g., --set title=\"My Doc\" --set author=\"John\")" + )] + pub set_fields: Vec, +} + pub fn handle_pdf_command(cmd: PdfCommand, plan_only: bool, json_output: bool) -> Result<()> { match cmd { PdfCommand::Merge(args) => handle_merge(args, plan_only, json_output), @@ -177,6 +262,8 @@ pub fn handle_pdf_command(cmd: PdfCommand, plan_only: bool, json_output: bool) - PdfCommand::Linearize(args) => handle_linearize(args, plan_only, json_output), PdfCommand::Reorder(args) => handle_reorder(args, plan_only, json_output), PdfCommand::Extract(args) => handle_extract(args, plan_only, json_output), + PdfCommand::Ocr(args) => handle_ocr(args, plan_only, json_output), + PdfCommand::Metadata(args) => handle_metadata(args, plan_only, json_output), } } @@ -474,3 +561,132 @@ fn handle_extract(args: ExtractArgs, plan_only: bool, json_output: bool) -> Resu Ok(()) } } + +fn handle_ocr(args: OcrArgs, plan_only: bool, json_output: bool) -> Result<()> { + let spec = JobSpec::PdfOcr { + input: args.input, + output: args.output, + language: args.language, + skip_text: args.skip_text, + deskew: args.deskew, + force_ocr: args.force_ocr, + }; + + if plan_only { + let plan = forgekit_core::job::executor::execute_job(&spec, true)?; + if json_output { + let event = forgekit_core::job::progress::ProgressEvent::Progress { + version: 1, + job_id: forgekit_core::job::progress::new_job_id(), + progress: forgekit_core::job::progress::ProgressInfo { + current: 0, + total: 1, + percent: 0, + stage: Some("plan".to_string()), + }, + message: plan.clone(), + }; + println!("{}", serde_json::to_string(&event).unwrap()); + } else { + println!("{}", plan); + } + Ok(()) + } else { + if json_output { + let reporter = forgekit_core::job::progress::JsonProgressReporter; + forgekit_core::job::executor::execute_job_with_progress(&spec, false, &reporter)?; + } else { + let result = 
forgekit_core::job::executor::execute_job(&spec, false)?; + println!("{}", result); + } + Ok(()) + } +} + +fn handle_metadata(args: MetadataArgs, plan_only: bool, json_output: bool) -> Result<()> { + // Determine the action based on arguments + let action = if !args.set_fields.is_empty() { + // Parse set fields (format: field=value) + let fields: Vec<(String, String)> = args + .set_fields + .iter() + .map(|s| { + let parts: Vec<&str> = s.splitn(2, '=').collect(); + if parts.len() == 2 { + Ok((parts[0].to_string(), parts[1].to_string())) + } else { + Err(forgekit_core::utils::error::ForgeKitError::InvalidInput { + path: PathBuf::new(), + reason: format!("Invalid --set format: '{}'. Expected 'field=value'", s), + }) + } + }) + .collect::, _>>()?; + MetadataAction::Set(fields) + } else if let Some(field) = args.get { + MetadataAction::Get(field) + } else { + MetadataAction::GetAll + }; + + // For set operations, output path is required + if matches!(&action, MetadataAction::Set(_)) && args.output.is_none() { + // Default to modifying in place + let spec = JobSpec::PdfMetadata { + input: args.input.clone(), + output: Some(args.input), + action, + }; + return execute_metadata_job(&spec, plan_only, json_output); + } + + let spec = JobSpec::PdfMetadata { + input: args.input, + output: args.output, + action, + }; + + execute_metadata_job(&spec, plan_only, json_output) +} + +fn execute_metadata_job(spec: &JobSpec, plan_only: bool, json_output: bool) -> Result<()> { + if plan_only { + let plan = forgekit_core::job::executor::execute_job(spec, true)?; + if json_output { + let event = forgekit_core::job::progress::ProgressEvent::Progress { + version: 1, + job_id: forgekit_core::job::progress::new_job_id(), + progress: forgekit_core::job::progress::ProgressInfo { + current: 0, + total: 1, + percent: 0, + stage: Some("plan".to_string()), + }, + message: plan.clone(), + }; + println!("{}", serde_json::to_string(&event).unwrap()); + } else { + println!("{}", plan); + } + Ok(()) + 
} else { + let result = forgekit_core::job::executor::execute_job(spec, false)?; + if json_output { + // For metadata get operations, the result is already JSON or a value + // Wrap it in a complete event + let event = forgekit_core::job::progress::ProgressEvent::Complete { + version: 1, + job_id: forgekit_core::job::progress::new_job_id(), + result: forgekit_core::job::progress::JobResult { + output: result.clone(), + size_bytes: 0, + duration_ms: 0, + }, + }; + println!("{}", serde_json::to_string(&event).unwrap()); + } else { + println!("{}", result); + } + Ok(()) + } +} diff --git a/crates/core/src/job/executor.rs b/crates/core/src/job/executor.rs index 2309215..625cd9c 100644 --- a/crates/core/src/job/executor.rs +++ b/crates/core/src/job/executor.rs @@ -17,15 +17,18 @@ //! When `plan_only` is true, we skip execution and just return the command that //! would be run. This is great for transparency and debugging. -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::process::Command; use crate::job::progress::{ new_job_id, ErrorInfo, JobResult, ProgressEvent, ProgressInfo, ProgressReporter, }; +use crate::job::spec::MetadataAction; use crate::job::JobSpec; use crate::presets::get_compression_strategy; +use crate::tools::exiftool::ExiftoolTool; use crate::tools::gs::GsTool; +use crate::tools::ocrmypdf::OcrmypdfTool; use crate::tools::qpdf::QpdfTool; use crate::tools::{Tool, ToolConfig}; use crate::utils::error::{ForgeKitError, Result}; @@ -100,6 +103,21 @@ pub fn execute_job_with_progress( format, plan_only, ), + JobSpec::PdfOcr { + input, + output, + language, + skip_text, + deskew, + force_ocr, + } => execute_pdf_ocr_with_progress( + input, output, language, *skip_text, *deskew, *force_ocr, plan_only, reporter, + ), + JobSpec::PdfMetadata { + input, + output, + action, + } => execute_pdf_metadata(input, output.as_deref(), action, plan_only), } } @@ -752,6 +770,277 @@ fn execute_pdf_extract( } } +#[allow(clippy::too_many_arguments)] +fn 
execute_pdf_ocr_with_progress( + input: &PathBuf, + output: &PathBuf, + language: &str, + skip_text: bool, + deskew: bool, + force_ocr: bool, + plan_only: bool, + reporter: &dyn ProgressReporter, +) -> Result { + let job_id = new_job_id(); + let start_time = Instant::now(); + + // Emit start progress + reporter.report(&ProgressEvent::Progress { + version: 1, + job_id: job_id.clone(), + progress: ProgressInfo { + current: 0, + total: 100, + percent: 0, + stage: Some("starting".to_string()), + }, + message: format!("Starting OCR with language '{}'", language), + }); + + // Build command parts for plan output + let mut cmd_parts = vec!["ocrmypdf".to_string()]; + + cmd_parts.push("-l".to_string()); + cmd_parts.push(language.to_string()); + + if skip_text { + cmd_parts.push("--skip-text".to_string()); + } + + if deskew { + cmd_parts.push("--deskew".to_string()); + } + + if force_ocr { + cmd_parts.push("--force-ocr".to_string()); + } + + // Add progress flag for real execution + if !plan_only { + cmd_parts.push("--progress".to_string()); + } + + cmd_parts.push(input.display().to_string()); + cmd_parts.push(output.display().to_string()); + + if plan_only { + return Ok(cmd_parts.join(" ")); + } + + if !input.exists() { + return Err(ForgeKitError::InvalidInput { + path: input.clone(), + reason: "Input file does not exist".to_string(), + }); + } + + // Probe for ocrmypdf + let tool = OcrmypdfTool; + let config = ToolConfig::default(); + let tool_info = tool.probe(&config)?; + + // Build actual command + let mut cmd = Command::new(&tool_info.path); + cmd.arg("-l").arg(language); + + if skip_text { + cmd.arg("--skip-text"); + } + + if deskew { + cmd.arg("--deskew"); + } + + if force_ocr { + cmd.arg("--force-ocr"); + } + + cmd.arg(input); + cmd.arg(output); + + // Report progress as OCR starts + reporter.report(&ProgressEvent::Progress { + version: 1, + job_id: job_id.clone(), + progress: ProgressInfo { + current: 10, + total: 100, + percent: 10, + stage: 
Some("ocr".to_string()), + }, + message: "Running OCR (this may take a while for large documents)...".to_string(), + }); + + // Execute + let output_result = cmd.output().map_err(|e| ForgeKitError::ProcessingFailed { + tool: "ocrmypdf".to_string(), + stderr: format!("Failed to execute: {}", e), + })?; + + if !output_result.status.success() { + let stderr = String::from_utf8_lossy(&output_result.stderr); + let error = ForgeKitError::ProcessingFailed { + tool: "ocrmypdf".to_string(), + stderr: stderr.to_string(), + }; + + // Emit error event + reporter.report(&ProgressEvent::Error { + version: 1, + job_id: job_id.clone(), + error: ErrorInfo { + code: "OCR_FAILED".to_string(), + message: error.to_string(), + hint: + "Check that the input PDF is valid and tesseract language packs are installed" + .to_string(), + }, + }); + + return Err(error); + } + + let duration_ms = start_time.elapsed().as_millis() as u64; + let size_bytes = std::fs::metadata(output).map(|m| m.len()).unwrap_or(0); + + // Emit complete event + reporter.report(&ProgressEvent::Complete { + version: 1, + job_id: job_id.clone(), + result: JobResult { + output: output.display().to_string(), + size_bytes, + duration_ms, + }, + }); + + Ok(format!( + "Successfully added OCR text layer to PDF: {}", + output.display() + )) +} + +fn execute_pdf_metadata( + input: &Path, + output: Option<&Path>, + action: &MetadataAction, + plan_only: bool, +) -> Result { + match action { + MetadataAction::GetAll => execute_pdf_metadata_get_all(input, plan_only), + MetadataAction::Get(field) => execute_pdf_metadata_get(input, field, plan_only), + MetadataAction::Set(fields) => { + let output_path = output.ok_or_else(|| ForgeKitError::InvalidInput { + path: PathBuf::new(), + reason: "Output file path required for set operations".to_string(), + })?; + execute_pdf_metadata_set(input, output_path, fields, plan_only) + } + } +} + +fn execute_pdf_metadata_get_all(input: &Path, plan_only: bool) -> Result { + if plan_only { + return 
Ok(format!( + "exiftool -json -PDF:all -XMP:all {}", + input.display() + )); + } + + if !input.exists() { + return Err(ForgeKitError::InvalidInput { + path: input.to_path_buf(), + reason: "Input file does not exist".to_string(), + }); + } + + // Probe for exiftool + let tool = ExiftoolTool; + let config = ToolConfig::default(); + let tool_info = tool.probe(&config)?; + + // Read metadata as JSON + let json = tool.read_metadata_json(&tool_info.path, input)?; + + Ok(json) +} + +fn execute_pdf_metadata_get(input: &Path, field: &str, plan_only: bool) -> Result { + if plan_only { + return Ok(format!("exiftool -s -s -s -{} {}", field, input.display())); + } + + if !input.exists() { + return Err(ForgeKitError::InvalidInput { + path: input.to_path_buf(), + reason: "Input file does not exist".to_string(), + }); + } + + // Probe for exiftool + let tool = ExiftoolTool; + let config = ToolConfig::default(); + let tool_info = tool.probe(&config)?; + + // Read specific field + let value = tool.read_field(&tool_info.path, input, field)?; + + if value.is_empty() { + Ok(format!("Field '{}' is not set", field)) + } else { + Ok(value) + } +} + +fn execute_pdf_metadata_set( + input: &Path, + output: &Path, + fields: &[(String, String)], + plan_only: bool, +) -> Result { + if plan_only { + let mut cmd_parts = vec!["exiftool".to_string()]; + for (field, value) in fields { + cmd_parts.push(format!("-{}={}", field, value)); + } + // If input != output, we need to copy first + if input != output { + cmd_parts.push("-o".to_string()); + cmd_parts.push(output.display().to_string()); + } else { + cmd_parts.push("-overwrite_original".to_string()); + } + cmd_parts.push(input.display().to_string()); + return Ok(cmd_parts.join(" ")); + } + + if !input.exists() { + return Err(ForgeKitError::InvalidInput { + path: input.to_path_buf(), + reason: "Input file does not exist".to_string(), + }); + } + + // Probe for exiftool + let tool = ExiftoolTool; + let config = ToolConfig::default(); + let tool_info 
= tool.probe(&config)?; + + // If output is different from input, copy first + if input != output { + std::fs::copy(input, output).map_err(ForgeKitError::Io)?; + } + + // Write metadata (to output file, overwrite original to avoid backup) + tool.write_metadata(&tool_info.path, output, fields, true)?; + + Ok(format!( + "Successfully set {} metadata field(s) in {}", + fields.len(), + output.display() + )) +} + #[cfg(test)] mod tests { use super::*; @@ -898,3 +1187,141 @@ mod image_tests { assert!(result.contains("page_%d.jpg")); } } + +#[cfg(test)] +mod ocr_tests { + use super::*; + use crate::job::progress::NoOpProgressReporter; + + #[test] + fn test_execute_pdf_ocr_plan() { + let input = PathBuf::from("scan.pdf"); + let output = PathBuf::from("searchable.pdf"); + let reporter = NoOpProgressReporter; + + let result = execute_pdf_ocr_with_progress( + &input, &output, "eng", true, false, false, true, &reporter, + ) + .unwrap(); + + assert!(result.contains("ocrmypdf")); + assert!(result.contains("-l eng")); + assert!(result.contains("--skip-text")); + assert!(result.contains("scan.pdf")); + assert!(result.contains("searchable.pdf")); + } + + #[test] + fn test_execute_pdf_ocr_plan_with_deskew() { + let input = PathBuf::from("tilted.pdf"); + let output = PathBuf::from("fixed.pdf"); + let reporter = NoOpProgressReporter; + + let result = execute_pdf_ocr_with_progress( + &input, &output, "deu", false, true, false, true, &reporter, + ) + .unwrap(); + + assert!(result.contains("ocrmypdf")); + assert!(result.contains("-l deu")); + assert!(result.contains("--deskew")); + assert!(!result.contains("--skip-text")); + } + + #[test] + fn test_execute_pdf_ocr_plan_force() { + let input = PathBuf::from("existing_text.pdf"); + let output = PathBuf::from("reocr.pdf"); + let reporter = NoOpProgressReporter; + + let result = execute_pdf_ocr_with_progress( + &input, &output, "eng", false, false, true, true, &reporter, + ) + .unwrap(); + + assert!(result.contains("ocrmypdf")); + 
assert!(result.contains("--force-ocr")); + } +} + +#[cfg(test)] +mod metadata_tests { + use super::*; + use crate::job::spec::MetadataAction; + + #[test] + fn test_execute_pdf_metadata_get_all_plan() { + let input = PathBuf::from("doc.pdf"); + + let result = execute_pdf_metadata(&input, None, &MetadataAction::GetAll, true).unwrap(); + + assert!(result.contains("exiftool")); + assert!(result.contains("-json")); + assert!(result.contains("-PDF:all")); + assert!(result.contains("-XMP:all")); + assert!(result.contains("doc.pdf")); + } + + #[test] + fn test_execute_pdf_metadata_get_field_plan() { + let input = PathBuf::from("doc.pdf"); + let action = MetadataAction::Get("title".to_string()); + + let result = execute_pdf_metadata(&input, None, &action, true).unwrap(); + + assert!(result.contains("exiftool")); + assert!(result.contains("-s -s -s")); + assert!(result.contains("-title")); + assert!(result.contains("doc.pdf")); + } + + #[test] + fn test_execute_pdf_metadata_set_plan() { + let input = PathBuf::from("doc.pdf"); + let output = PathBuf::from("updated.pdf"); + let fields = vec![ + ("title".to_string(), "My Document".to_string()), + ("author".to_string(), "John Doe".to_string()), + ]; + let action = MetadataAction::Set(fields); + + let result = execute_pdf_metadata(&input, Some(&output), &action, true).unwrap(); + + assert!(result.contains("exiftool")); + assert!(result.contains("-title=My Document")); + assert!(result.contains("-author=John Doe")); + assert!(result.contains("-o")); + assert!(result.contains("updated.pdf")); + } + + #[test] + fn test_execute_pdf_metadata_set_in_place_plan() { + let path = PathBuf::from("doc.pdf"); + let fields = vec![("title".to_string(), "Updated Title".to_string())]; + let action = MetadataAction::Set(fields); + + let result = execute_pdf_metadata(&path, Some(&path), &action, true).unwrap(); + + assert!(result.contains("exiftool")); + assert!(result.contains("-overwrite_original")); + // Check that "-o " (with space) is not 
present - don't confuse with "-overwrite_original" + assert!(!result.contains(" -o ")); + } + + #[test] + fn test_execute_pdf_metadata_set_requires_output() { + let input = PathBuf::from("doc.pdf"); + let fields = vec![("title".to_string(), "My Document".to_string())]; + let action = MetadataAction::Set(fields); + + let result = execute_pdf_metadata(&input, None, &action, false); + + assert!(result.is_err()); + match result { + Err(ForgeKitError::InvalidInput { reason, .. }) => { + assert!(reason.contains("Output file path required")); + } + _ => panic!("Expected InvalidInput error"), + } + } +} diff --git a/crates/core/src/job/mod.rs b/crates/core/src/job/mod.rs index c00ff3c..beacd8c 100644 --- a/crates/core/src/job/mod.rs +++ b/crates/core/src/job/mod.rs @@ -5,4 +5,4 @@ pub mod spec; pub use progress::{ new_job_id, JsonProgressReporter, NoOpProgressReporter, ProgressEvent, ProgressReporter, }; -pub use spec::JobSpec; +pub use spec::{JobSpec, MetadataAction}; diff --git a/crates/core/src/job/spec.rs b/crates/core/src/job/spec.rs index 998f759..0e32e80 100644 --- a/crates/core/src/job/spec.rs +++ b/crates/core/src/job/spec.rs @@ -14,6 +14,17 @@ use crate::utils::pages::PageSpec; use std::path::PathBuf; +/// Action to perform on PDF metadata +#[derive(Debug, Clone)] +pub enum MetadataAction { + /// Read all metadata as JSON + GetAll, + /// Read a specific field + Get(String), + /// Set one or more fields (field name, value) + Set(Vec<(String, String)>), +} + /// A job specification describing what operation to perform. /// /// This is pure data - it doesn't execute anything. The executor (`job::executor`) @@ -102,6 +113,35 @@ pub enum JobSpec { /// Output format: "pdf" or "images". format: String, }, + /// Add OCR text layer to a scanned PDF. + /// + /// Uses ocrmypdf to add a searchable text layer to scanned PDFs. + /// The original image quality is preserved. + PdfOcr { + /// Input PDF file to OCR. + input: PathBuf, + /// Output PDF file path. 
+ output: PathBuf, + /// OCR language (e.g., "eng", "deu", "fra"). Default is "eng". + language: String, + /// Skip pages that already have text (faster for mixed documents). + skip_text: bool, + /// Deskew pages before OCR (corrects tilted scans). + deskew: bool, + /// Force OCR even if text already exists (redo OCR). + force_ocr: bool, + }, + /// Read or write PDF metadata. + /// + /// Uses exiftool to get/set PDF metadata fields like title, author, subject, etc. + PdfMetadata { + /// Input PDF file. + input: PathBuf, + /// Output PDF file path (only needed for set operations, optional for get). + output: Option, + /// The metadata action to perform. + action: MetadataAction, + }, } impl JobSpec { @@ -133,6 +173,16 @@ impl JobSpec { format ) } + JobSpec::PdfOcr { language, .. } => { + format!("OCR PDF with language '{}'", language) + } + JobSpec::PdfMetadata { action, .. } => match action { + MetadataAction::GetAll => "Read all PDF metadata".to_string(), + MetadataAction::Get(field) => format!("Read PDF metadata field '{}'", field), + MetadataAction::Set(fields) => { + format!("Set {} PDF metadata field(s)", fields.len()) + } + }, } } } @@ -199,4 +249,50 @@ mod tests { }; assert_eq!(spec.description(), "Extract 2 page spec(s) from PDF as pdf"); } + + #[test] + fn test_pdf_ocr_description() { + let spec = JobSpec::PdfOcr { + input: PathBuf::from("scan.pdf"), + output: PathBuf::from("searchable.pdf"), + language: "eng".to_string(), + skip_text: true, + deskew: false, + force_ocr: false, + }; + assert_eq!(spec.description(), "OCR PDF with language 'eng'"); + } + + #[test] + fn test_pdf_metadata_get_all_description() { + let spec = JobSpec::PdfMetadata { + input: PathBuf::from("doc.pdf"), + output: None, + action: MetadataAction::GetAll, + }; + assert_eq!(spec.description(), "Read all PDF metadata"); + } + + #[test] + fn test_pdf_metadata_get_field_description() { + let spec = JobSpec::PdfMetadata { + input: PathBuf::from("doc.pdf"), + output: None, + action: 
MetadataAction::Get("title".to_string()), + }; + assert_eq!(spec.description(), "Read PDF metadata field 'title'"); + } + + #[test] + fn test_pdf_metadata_set_description() { + let spec = JobSpec::PdfMetadata { + input: PathBuf::from("doc.pdf"), + output: Some(PathBuf::from("updated.pdf")), + action: MetadataAction::Set(vec![ + ("title".to_string(), "My Document".to_string()), + ("author".to_string(), "John Doe".to_string()), + ]), + }; + assert_eq!(spec.description(), "Set 2 PDF metadata field(s)"); + } } diff --git a/crates/core/src/tools/exiftool.rs b/crates/core/src/tools/exiftool.rs new file mode 100644 index 0000000..82d394d --- /dev/null +++ b/crates/core/src/tools/exiftool.rs @@ -0,0 +1,329 @@ +//! # ExifTool Tool Adapter +//! +//! ExifTool is a Perl command-line tool for reading and writing metadata in files. +//! We use it for: +//! - Reading PDF metadata (title, author, subject, keywords, etc.) +//! - Writing/updating PDF metadata +//! - Bulk metadata operations +//! +//! ## Why ExifTool? +//! +//! ExifTool is the most comprehensive metadata tool available: +//! - Supports 400+ file formats including PDF +//! - Can read/write virtually any metadata field +//! - Cross-platform (Windows, macOS, Linux) +//! - Stable CLI interface +//! +//! ## Minimum Version +//! +//! ExifTool 12.0+ is recommended. Older versions may work but aren't tested. + +use std::path::{Path, PathBuf}; +use std::process::Command; + +use crate::tools::{Tool, ToolConfig, ToolInfo}; +use crate::utils::error::{ForgeKitError, Result}; +use crate::utils::platform::ToolInstallHints; + +/// ExifTool adapter. +/// +/// Implements the `Tool` trait for ExifTool, handling detection, version checking, +/// and command construction for metadata operations. 
+pub struct ExiftoolTool;
+
+impl Tool for ExiftoolTool {
+    /// Returns the tool identifier, `"exiftool"`.
+    fn name(&self) -> &'static str {
+        "exiftool"
+    }
+
+    /// Locate exiftool and report its path and version.
+    ///
+    /// `config.override_path` is used when it is set and exists on disk; an override that
+    /// does not exist is ignored. Otherwise the binary is looked up with `where` (Windows)
+    /// or `which` (everywhere else). When that lookup yields nothing, the bare name
+    /// `exiftool` is tried so the operating system can resolve it at spawn time.
+    ///
+    /// # Errors
+    ///
+    /// - The error from [`Tool::version`] when the override path exists but cannot be run.
+    /// - [`ForgeKitError::ToolNotFound`] when no working exiftool is reachable via `PATH`.
+    fn probe(&self, config: &ToolConfig) -> Result<ToolInfo> {
+        // An explicit override wins as long as the file is really there.
+        if let Some(path) = config.override_path.as_ref().filter(|p| p.exists()) {
+            let version = self.version(path)?;
+            return Ok(ToolInfo {
+                path: path.clone(),
+                version,
+                available: true,
+            });
+        }
+
+        let locator = if cfg!(target_os = "windows") {
+            "where"
+        } else {
+            "which"
+        };
+
+        // First non-empty line printed by the locator, or the bare command name.
+        let path = Command::new(locator)
+            .arg("exiftool")
+            .output()
+            .ok()
+            .filter(|found| found.status.success())
+            .and_then(|found| {
+                let stdout = String::from_utf8_lossy(&found.stdout);
+                let first = stdout.lines().next()?.trim();
+                (!first.is_empty()).then(|| PathBuf::from(first))
+            })
+            .unwrap_or_else(|| PathBuf::from("exiftool"));
+
+        // One `-ver` run both proves the binary works and yields the version, so the
+        // process is spawned once instead of twice.
+        let version = self
+            .version(&path)
+            .map_err(|_| ForgeKitError::ToolNotFound {
+                tool: "exiftool".to_string(),
+                hint: ToolInstallHints::for_tool("exiftool"),
+            })?;
+
+        Ok(ToolInfo {
+            path,
+            version,
+            available: true,
+        })
+    }
+
+    /// Run `<path> -ver` and return the first line of its output, trimmed.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`ForgeKitError::Other`] when the process cannot be started or exits with a
+    /// non-zero status.
+    fn version(&self, path: &Path) -> Result<String> {
+        let output = Command::new(path)
+            .arg("-ver")
+            .output()
+            .map_err(|e| ForgeKitError::Other(anyhow::anyhow!("Failed to run exiftool: {}", e)))?;
+
+        if !output.status.success() {
+            return Err(ForgeKitError::Other(anyhow::anyhow!(
+                "exiftool -ver failed"
+            )));
+        }
+
+        // exiftool outputs just the version number like "12.40"
+        let stdout = String::from_utf8_lossy(&output.stdout);
+        Ok(stdout.lines().next().unwrap_or("").trim().to_string())
+    }
+}
+
+/// PDF metadata fields supported for read/write operations
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum PdfMetadataField {
+    Title,
+    Author,
+    Subject,
+    Keywords,
+    Creator,
+    Producer,
+    /// Written to exiftool as `CreateDate`.
+    CreationDate,
+    ModifyDate,
+    /// Custom XMP or PDF metadata field; the name is passed to exiftool unchanged.
+    Custom(String),
+}
+
+impl PdfMetadataField {
+    /// Convert field name to exiftool tag name
+    pub fn to_exiftool_tag(&self) -> String {
+        let tag = match self {
+            PdfMetadataField::Title => "Title",
+            PdfMetadataField::Author => "Author",
+            PdfMetadataField::Subject => "Subject",
+            PdfMetadataField::Keywords => "Keywords",
+            PdfMetadataField::Creator => "Creator",
+            PdfMetadataField::Producer => "Producer",
+            PdfMetadataField::CreationDate => "CreateDate",
+            PdfMetadataField::ModifyDate => "ModifyDate",
+            PdfMetadataField::Custom(name) => name,
+        };
+        tag.to_string()
+    }
+}
+
+impl std::str::FromStr for PdfMetadataField {
+    type Err = std::convert::Infallible;
+
+    /// Parse a field name case-insensitively.
+    ///
+    /// Known names (and the date aliases listed below) map to their variant; anything else
+    /// becomes [`PdfMetadataField::Custom`] with the input kept verbatim, so parsing never
+    /// fails.
+    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
+        Ok(match s.to_lowercase().as_str() {
+            "title" => PdfMetadataField::Title,
+            "author" => PdfMetadataField::Author,
+            "subject" => PdfMetadataField::Subject,
+            "keywords" => PdfMetadataField::Keywords,
+            "creator" => PdfMetadataField::Creator,
+            "producer" => PdfMetadataField::Producer,
+            "creationdate" | "createdate" | "creation_date" => PdfMetadataField::CreationDate,
+            "modifydate" | "moddate" | "modify_date" => PdfMetadataField::ModifyDate,
+            _ => PdfMetadataField::Custom(s.to_string()),
+        })
+    }
+}
+
+impl ExiftoolTool {
+    /// Map a user-supplied field name to the exiftool tag it refers to.
+    ///
+    /// Shared by the read and write paths so both accept the same aliases.
+    fn tag_for(field: &str) -> String {
+        match field.parse::<PdfMetadataField>() {
+            Ok(parsed) => parsed.to_exiftool_tag(),
+            // The parse error type is `Infallible`, so this arm can never be reached.
+            Err(never) => match never {},
+        }
+    }
+
+    /// Run `cmd` to completion and return its output.
+    ///
+    /// Both a failure to start the process and a non-zero exit status become
+    /// [`ForgeKitError::ProcessingFailed`]; in the second case the error carries exiftool's
+    /// stderr.
+    fn run_checked(cmd: &mut Command) -> Result<std::process::Output> {
+        let output = cmd.output().map_err(|e| ForgeKitError::ProcessingFailed {
+            tool: "exiftool".to_string(),
+            stderr: format!("Failed to run exiftool: {}", e),
+        })?;
+
+        if !output.status.success() {
+            return Err(ForgeKitError::ProcessingFailed {
+                tool: "exiftool".to_string(),
+                stderr: String::from_utf8_lossy(&output.stderr).to_string(),
+            });
+        }
+
+        Ok(output)
+    }
+
+    /// Read all PDF metadata from a file as JSON.
+    ///
+    /// Returns raw JSON output from exiftool (`-json -PDF:all -XMP:all`) for maximum
+    /// flexibility.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`ForgeKitError::ProcessingFailed`] when exiftool cannot be started or exits
+    /// with a non-zero status.
+    pub fn read_metadata_json(&self, tool_path: &Path, pdf_path: &Path) -> Result<String> {
+        let output = Self::run_checked(
+            Command::new(tool_path)
+                .arg("-json")
+                .arg("-PDF:all")
+                .arg("-XMP:all")
+                .arg(pdf_path),
+        )?;
+
+        Ok(String::from_utf8_lossy(&output.stdout).to_string())
+    }
+
+    /// Read a specific metadata field from a PDF.
+    ///
+    /// `field` accepts the same names and aliases as [`ExiftoolTool::write_metadata`]. A
+    /// field that is absent from the file yields an empty string, because exiftool still
+    /// exits successfully in that case.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`ForgeKitError::ProcessingFailed`] when exiftool cannot be started or exits
+    /// with a non-zero status, instead of reporting the failure as an empty value.
+    pub fn read_field(&self, tool_path: &Path, pdf_path: &Path, field: &str) -> Result<String> {
+        let output = Self::run_checked(
+            Command::new(tool_path)
+                // `-s` given three times: print only the value, without the field name.
+                .args(["-s", "-s", "-s"])
+                .arg(format!("-{}", Self::tag_for(field)))
+                .arg(pdf_path),
+        )?;
+
+        Ok(String::from_utf8_lossy(&output.stdout).trim().to_string())
+    }
+
+    /// Write metadata to a PDF file.
+    ///
+    /// Takes a slice of (field, value) pairs; each field name is resolved through
+    /// [`PdfMetadataField`] before being handed to exiftool as `-Tag=value`.
+    /// By default, exiftool modifies the file in place and creates a backup.
+    /// Use `overwrite_original` to skip creating a backup.
+    ///
+    /// An empty `metadata` slice is a no-op: exiftool is not started at all.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`ForgeKitError::ProcessingFailed`] when exiftool cannot be started or exits
+    /// with a non-zero status.
+    pub fn write_metadata(
+        &self,
+        tool_path: &Path,
+        pdf_path: &Path,
+        metadata: &[(String, String)],
+        overwrite_original: bool,
+    ) -> Result<()> {
+        // Without any assignment exiftool would only print the file's tags.
+        if metadata.is_empty() {
+            return Ok(());
+        }
+
+        let mut cmd = Command::new(tool_path);
+
+        if overwrite_original {
+            cmd.arg("-overwrite_original");
+        }
+
+        cmd.args(
+            metadata
+                .iter()
+                .map(|(field, value)| format!("-{}={}", Self::tag_for(field), value)),
+        );
+        cmd.arg(pdf_path);
+
+        Self::run_checked(&mut cmd).map(|_| ())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_exiftool_name() {
+        let tool = ExiftoolTool;
+        assert_eq!(tool.name(), "exiftool");
+    }
+
+    #[test]
+    fn test_exiftool_probe() {
+        let tool = ExiftoolTool;
+        let config = ToolConfig::default();
+        // This will only pass if exiftool is installed
+        let result = tool.probe(&config);
+        // We don't assert success here since exiftool may not be installed in test environment
+        // Just verify it doesn't panic
+        match result {
+            Ok(info) => {
+                assert!(info.available);
+                assert!(!info.version.is_empty());
+            }
+            Err(ForgeKitError::ToolNotFound { .. }) => {
+                // Expected if exiftool is not installed
+            }
+            Err(e) => panic!("Unexpected error: {:?}", e),
+        }
+    }
+
+    #[test]
+    fn test_pdf_metadata_field_parsing() {
+        assert_eq!(
+            "title".parse::<PdfMetadataField>().unwrap(),
+            PdfMetadataField::Title
+        );
+        assert_eq!(
+            "Title".parse::<PdfMetadataField>().unwrap(),
+            PdfMetadataField::Title
+        );
+        assert_eq!(
+            "author".parse::<PdfMetadataField>().unwrap(),
+            PdfMetadataField::Author
+        );
+        assert_eq!(
+            "CustomField".parse::<PdfMetadataField>().unwrap(),
+            PdfMetadataField::Custom("CustomField".to_string())
+        );
+    }
+
+    #[test]
+    fn test_pdf_metadata_field_date_aliases() {
+        for alias in ["creationdate", "CreateDate", "creation_date"] {
+            assert_eq!(
+                alias.parse::<PdfMetadataField>().unwrap(),
+                PdfMetadataField::CreationDate
+            );
+        }
+        for alias in ["modifydate", "ModDate", "modify_date"] {
+            assert_eq!(
+                alias.parse::<PdfMetadataField>().unwrap(),
+                PdfMetadataField::ModifyDate
+            );
+        }
+    }
+
+    #[test]
+    fn test_pdf_metadata_field_to_exiftool_tag() {
+        assert_eq!(PdfMetadataField::Title.to_exiftool_tag(), "Title");
+        assert_eq!(PdfMetadataField::Author.to_exiftool_tag(), "Author");
+        assert_eq!(
+            PdfMetadataField::CreationDate.to_exiftool_tag(),
+            "CreateDate"
+        );
+        assert_eq!(
+            PdfMetadataField::Custom("MyField".to_string()).to_exiftool_tag(),
+            "MyField"
+        );
+    }
+
+    #[test]
+    fn test_tag_for_maps_aliases_and_keeps_custom_names() {
+        assert_eq!(ExiftoolTool::tag_for("creation_date"), "CreateDate");
+        assert_eq!(ExiftoolTool::tag_for("moddate"), "ModifyDate");
+        assert_eq!(ExiftoolTool::tag_for("XMP:Rights"), "XMP:Rights");
+    }
+
+    #[test]
+    fn test_write_metadata_without_fields_is_a_noop() {
+        // No process is started, so a path that does not exist is never touched.
+        let tool = ExiftoolTool;
+        let missing = Path::new("forgekit-missing-exiftool-binary");
+        assert!(tool
+            .write_metadata(missing, Path::new("missing.pdf"), &[], false)
+            .is_ok());
+    }
+}
diff --git a/crates/core/src/tools/mod.rs b/crates/core/src/tools/mod.rs
index 7ea9205..356c972 100644
--- a/crates/core/src/tools/mod.rs
+++ b/crates/core/src/tools/mod.rs
@@ -1,4 +1,6 @@
+pub mod exiftool;
 pub mod gs;
+pub mod ocrmypdf;
 pub mod qpdf;
 pub mod trait_def;
 
diff --git a/crates/core/src/tools/ocrmypdf.rs b/crates/core/src/tools/ocrmypdf.rs
new file mode 100644
index 0000000..574979d
--- /dev/null
+++ b/crates/core/src/tools/ocrmypdf.rs
@@ -0,0 +1,199 @@
+//! # ocrmypdf Tool Adapter
+//!
+//! ocrmypdf is a Python command-line tool that adds OCR text layers to PDFs.
+//! It wraps Tesseract OCR and handles PDF-specific complexities like:
+//! - Maintaining original PDF structure
+//! - Adding invisible text layer over scanned pages
+//! - Skipping pages that already have text
+//!
+//! ## Why ocrmypdf?
+//!
+//! While we could call Tesseract directly, ocrmypdf handles many edge cases:
+//! - PDF page extraction and reassembly
+//! - Preserving original quality
+//! - Handling encrypted PDFs
+//! - Multi-language support
+//! - Progress reporting
+//!
+//! ## Minimum Version
+//!
+//! ocrmypdf 14.0+ is recommended. Older versions may work but aren't tested.
+//!
+//! ## Dependencies
+//!
+//! ocrmypdf requires:
+//! - Python 3.8+
+//! - Tesseract OCR 4.0+ (with language packs)
+//! - Ghostscript (for PDF manipulation)
+
+use std::path::{Path, PathBuf};
+use std::process::Command;
+
+use crate::tools::{Tool, ToolConfig, ToolInfo};
+use crate::utils::error::{ForgeKitError, Result};
+use crate::utils::platform::ToolInstallHints;
+
+/// ocrmypdf tool adapter.
+///
+/// Implements the `Tool` trait for ocrmypdf (detection and version checking) and offers
+/// helpers for querying which OCR languages Tesseract has installed.
+pub struct OcrmypdfTool;
+
+impl Tool for OcrmypdfTool {
+    /// Returns the tool identifier, `"ocrmypdf"`.
+    fn name(&self) -> &'static str {
+        "ocrmypdf"
+    }
+
+    /// Locate ocrmypdf and report its path and version.
+    ///
+    /// `config.override_path` is used when it is set and exists on disk; an override that
+    /// does not exist is ignored. Otherwise the binary is looked up with `where` (Windows)
+    /// or `which` (everywhere else). When that lookup yields nothing, the bare name
+    /// `ocrmypdf` is tried so the operating system can resolve it at spawn time.
+    ///
+    /// # Errors
+    ///
+    /// - The error from [`Tool::version`] when the override path exists but cannot be run.
+    /// - [`ForgeKitError::ToolNotFound`] when no working ocrmypdf is reachable via `PATH`.
+    fn probe(&self, config: &ToolConfig) -> Result<ToolInfo> {
+        // An explicit override wins as long as the file is really there.
+        if let Some(path) = config.override_path.as_ref().filter(|p| p.exists()) {
+            let version = self.version(path)?;
+            return Ok(ToolInfo {
+                path: path.clone(),
+                version,
+                available: true,
+            });
+        }
+
+        let locator = if cfg!(target_os = "windows") {
+            "where"
+        } else {
+            "which"
+        };
+
+        // First non-empty line printed by the locator, or the bare command name.
+        let path = Command::new(locator)
+            .arg("ocrmypdf")
+            .output()
+            .ok()
+            .filter(|found| found.status.success())
+            .and_then(|found| {
+                let stdout = String::from_utf8_lossy(&found.stdout);
+                let first = stdout.lines().next()?.trim();
+                (!first.is_empty()).then(|| PathBuf::from(first))
+            })
+            .unwrap_or_else(|| PathBuf::from("ocrmypdf"));
+
+        // One `--version` run both proves the binary works and yields the version, so the
+        // Python wrapper is started once instead of twice.
+        let version = self
+            .version(&path)
+            .map_err(|_| ForgeKitError::ToolNotFound {
+                tool: "ocrmypdf".to_string(),
+                hint: ToolInstallHints::for_tool("ocrmypdf"),
+            })?;
+
+        Ok(ToolInfo {
+            path,
+            version,
+            available: true,
+        })
+    }
+
+    /// Run `<path> --version` and return the first line of its output, trimmed.
+    ///
+    /// The line is returned as printed; no program-name prefix is stripped.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`ForgeKitError::Other`] when the process cannot be started or exits with a
+    /// non-zero status.
+    fn version(&self, path: &Path) -> Result<String> {
+        let output = Command::new(path)
+            .arg("--version")
+            .output()
+            .map_err(|e| ForgeKitError::Other(anyhow::anyhow!("Failed to run ocrmypdf: {}", e)))?;
+
+        if !output.status.success() {
+            return Err(ForgeKitError::Other(anyhow::anyhow!(
+                "ocrmypdf --version failed"
+            )));
+        }
+
+        // ocrmypdf outputs version like "ocrmypdf 14.0.0" or just "14.0.0"
+        let stdout = String::from_utf8_lossy(&output.stdout);
+        Ok(stdout.lines().next().unwrap_or("").trim().to_string())
+    }
+}
+
+impl OcrmypdfTool {
+    /// Get available languages for OCR.
+    ///
+    /// Returns the language codes printed by `tesseract --list-langs`. `_tool_path` is
+    /// unused: ocrmypdf relies on Tesseract's language packs, so `tesseract` itself is
+    /// queried (resolved through `PATH`).
+    ///
+    /// Falls back to `["eng"]` when tesseract exits with a non-zero status or prints no
+    /// language codes.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`ForgeKitError::Other`] when the `tesseract` process cannot be started.
+    pub fn get_available_languages(&self, _tool_path: &Path) -> Result<Vec<String>> {
+        let listing = Command::new("tesseract")
+            .arg("--list-langs")
+            .output()
+            .map_err(|e| ForgeKitError::Other(anyhow::anyhow!("Failed to run tesseract: {}", e)))?;
+
+        let fallback = || vec!["eng".to_string()];
+
+        if !listing.status.success() {
+            // Fall back to just returning 'eng' if we can't list languages
+            return Ok(fallback());
+        }
+
+        let stdout = String::from_utf8_lossy(&listing.stdout);
+        let languages: Vec<String> = stdout
+            .lines()
+            .map(str::trim)
+            // Language codes contain no whitespace, while the header line
+            // "List of available languages ..." does; filtering on that drops the header
+            // wherever it appears instead of assuming it is the first line.
+            .filter(|line| !line.is_empty() && !line.contains(char::is_whitespace))
+            .map(str::to_string)
+            .collect();
+
+        Ok(if languages.is_empty() {
+            fallback()
+        } else {
+            languages
+        })
+    }
+
+    /// Check if a specific language is available.
+    ///
+    /// `lang` may be a single code (`"eng"`) or several codes joined with `+`
+    /// (`"eng+deu"`); a combined spec is available only when every part is installed.
+    /// Returns `false` when the language list cannot be obtained.
+    pub fn is_language_available(&self, tool_path: &Path, lang: &str) -> bool {
+        self.get_available_languages(tool_path)
+            .map(|installed| {
+                lang.split('+')
+                    .all(|code| installed.iter().any(|have| have == code))
+            })
+            .unwrap_or(false)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_ocrmypdf_name() {
+        let tool = OcrmypdfTool;
+        assert_eq!(tool.name(), "ocrmypdf");
+    }
+
+    #[test]
+    fn test_ocrmypdf_probe() {
+        let tool = OcrmypdfTool;
+        let config = ToolConfig::default();
+        // This will only pass if ocrmypdf is installed
+        let result = tool.probe(&config);
+        // We don't assert success here since ocrmypdf may not be installed in test environment
+        // Just verify it doesn't panic
+        match result {
+            Ok(info) => {
+                assert!(info.available);
+                assert!(!info.version.is_empty());
+            }
+            Err(ForgeKitError::ToolNotFound { .. }) => {
+                // Expected if ocrmypdf is not installed
+            }
+            Err(e) => panic!("Unexpected error: {:?}", e),
+        }
+    }
+}