From 028809bf833fe9610a611426a2b7140b80f8e615 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Thu, 12 Sep 2024 22:37:27 +0200 Subject: [PATCH 01/28] draft --- src/odr/file.hpp | 13 +++++++++++++ src/odr/internal/open_strategy.hpp | 15 ++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/src/odr/file.hpp b/src/odr/file.hpp index e3e28f4d..1070340f 100644 --- a/src/odr/file.hpp +++ b/src/odr/file.hpp @@ -97,6 +97,19 @@ enum class FileLocation { disk, }; +/// @brief Collection of decoder engines. +enum class DecoderEngine { + odr, + pdf2html_ex, + wv_ware, +}; + +/// @brief Preference for decoding files. +struct DecodePreference final { + std::vector file_type_priority; + std::vector engine_priority; +}; + /// @brief Collection of encryption states. enum class EncryptionState { unknown, diff --git a/src/odr/internal/open_strategy.hpp b/src/odr/internal/open_strategy.hpp index 1661cd0e..a18cef8a 100644 --- a/src/odr/internal/open_strategy.hpp +++ b/src/odr/internal/open_strategy.hpp @@ -6,7 +6,9 @@ namespace odr { enum class FileType; -} +enum class DecoderEngine; +struct DecodePreferences; +} // namespace odr namespace odr::internal::abstract { class File; @@ -20,11 +22,22 @@ class Path; namespace odr::internal::open_strategy { std::vector types(std::shared_ptr file); +std::vector +engines(std::shared_ptr file, FileType as); std::unique_ptr open_file(std::shared_ptr file); std::unique_ptr open_file(std::shared_ptr file, FileType as); +std::unique_ptr +open_file(std::shared_ptr file, FileType as, + DecoderEngine with); +std::unique_ptr +open_file(std::shared_ptr file, + const DecodePreferences &preferences); +std::unique_ptr +open_file(std::shared_ptr file, FileType as, + const DecodePreferences &preferences); std::unique_ptr open_document_file(std::shared_ptr file); From 32e3d5df68ed8b5a07e031db8a85f491b1810c9f Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Thu, 12 Sep 2024 22:56:36 +0200 Subject: [PATCH 02/28] draft --- src/odr/file.cpp | 4 ++++ src/odr/file.hpp | 1 + src/odr/internal/abstract/file.hpp | 1 + src/odr/internal/cfb/cfb_file.cpp | 4 ++++ src/odr/internal/cfb/cfb_file.hpp | 1 + src/odr/internal/common/image_file.cpp | 4 ++++ src/odr/internal/common/image_file.hpp | 1 + src/odr/internal/csv/csv_file.cpp | 4 ++++ src/odr/internal/csv/csv_file.hpp | 1 + src/odr/internal/json/json_file.cpp | 4 ++++ src/odr/internal/json/json_file.hpp | 1 + src/odr/internal/odf/odf_file.cpp | 4 ++++ src/odr/internal/odf/odf_file.hpp | 2 ++ src/odr/internal/oldms/oldms_file.cpp | 4 ++++ src/odr/internal/oldms/oldms_file.hpp | 2 ++ src/odr/internal/ooxml/ooxml_file.cpp | 4 ++++ src/odr/internal/ooxml/ooxml_file.hpp | 2 ++ src/odr/internal/open_strategy.cpp | 2 +- src/odr/internal/open_strategy.hpp | 5 +++-- src/odr/internal/pdf/pdf_file.cpp | 4 ++++ src/odr/internal/pdf/pdf_file.hpp | 1 + src/odr/internal/svm/svm_file.cpp | 4 ++++ src/odr/internal/svm/svm_file.hpp | 1 + src/odr/internal/text/text_file.cpp | 4 ++++ src/odr/internal/text/text_file.hpp | 1 + src/odr/internal/zip/zip_file.cpp | 4 ++++ src/odr/internal/zip/zip_file.hpp | 1 + 27 files changed, 68 insertions(+), 3 deletions(-) diff --git a/src/odr/file.cpp b/src/odr/file.cpp index 565fe8a5..52e8d6e9 100644 --- a/src/odr/file.cpp +++ b/src/odr/file.cpp @@ -98,6 +98,10 @@ FileCategory DecodedFile::file_category() const noexcept { FileMeta DecodedFile::file_meta() const noexcept { return m_impl->file_meta(); } +DecoderEngine DecodedFile::decoder_engine() const noexcept { + return m_impl->decoder_engine(); +} + File DecodedFile::file() const { return File(m_impl->file()); } bool DecodedFile::is_text_file() const { diff --git a/src/odr/file.hpp b/src/odr/file.hpp index 1070340f..0b9f4823 100644 --- a/src/odr/file.hpp +++ b/src/odr/file.hpp @@ -190,6 +190,7 @@ class DecodedFile { [[nodiscard]] FileType file_type() const noexcept; [[nodiscard]] FileCategory file_category() const noexcept; [[nodiscard]] FileMeta file_meta() const noexcept; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept; [[nodiscard]] File file() const; diff --git a/src/odr/internal/abstract/file.hpp b/src/odr/internal/abstract/file.hpp index cd86ed54..63cbbaa9 100644 --- a/src/odr/internal/abstract/file.hpp +++ b/src/odr/internal/abstract/file.hpp @@ -38,6 +38,7 @@ class DecodedFile { [[nodiscard]] virtual FileType file_type() const noexcept = 0; [[nodiscard]] virtual FileCategory file_category() const noexcept = 0; [[nodiscard]] virtual FileMeta file_meta() const noexcept = 0; + [[nodiscard]] virtual DecoderEngine decoder_engine() const noexcept = 0; }; class TextFile : public DecodedFile { diff --git a/src/odr/internal/cfb/cfb_file.cpp b/src/odr/internal/cfb/cfb_file.cpp index 7facfc25..e1e818a2 100644 --- a/src/odr/internal/cfb/cfb_file.cpp +++ b/src/odr/internal/cfb/cfb_file.cpp @@ -22,6 +22,10 @@ FileMeta CfbFile::file_meta() const noexcept { return meta; } +DecoderEngine CfbFile::decoder_engine() const noexcept { + return DecoderEngine::odr; +} + std::shared_ptr CfbFile::archive() const { return std::make_shared(m_cfb); } diff --git a/src/odr/internal/cfb/cfb_file.hpp b/src/odr/internal/cfb/cfb_file.hpp index 35a7ac12..f5696f0e 100644 --- a/src/odr/internal/cfb/cfb_file.hpp +++ b/src/odr/internal/cfb/cfb_file.hpp @@ -26,6 +26,7 @@ class CfbFile final : public abstract::ArchiveFile { [[nodiscard]] FileType file_type() const noexcept final; [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; [[nodiscard]] std::shared_ptr archive() const final; diff --git a/src/odr/internal/common/image_file.cpp b/src/odr/internal/common/image_file.cpp index 2f655e9e..43f14acf 100644 --- a/src/odr/internal/common/image_file.cpp +++ b/src/odr/internal/common/image_file.cpp @@ -14,6 +14,10 @@ FileType ImageFile::file_type() const noexcept { return m_file_type; } FileMeta ImageFile::file_meta() const noexcept { return {}; } +DecoderEngine ImageFile::decoder_engine() const noexcept { + return DecoderEngine::odr; +} + std::shared_ptr ImageFile::image() const { return {}; } } // namespace odr::internal::common diff --git a/src/odr/internal/common/image_file.hpp b/src/odr/internal/common/image_file.hpp index da61e735..81024754 100644 --- a/src/odr/internal/common/image_file.hpp +++ b/src/odr/internal/common/image_file.hpp @@ -13,6 +13,7 @@ class ImageFile : public abstract::ImageFile { [[nodiscard]] FileType file_type() const noexcept final; [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; [[nodiscard]] std::shared_ptr image() const final; diff --git a/src/odr/internal/csv/csv_file.cpp b/src/odr/internal/csv/csv_file.cpp index 59936046..0beaa2f1 100644 --- a/src/odr/internal/csv/csv_file.cpp +++ b/src/odr/internal/csv/csv_file.cpp @@ -24,4 +24,8 @@ FileMeta CsvFile::file_meta() const noexcept { return {FileType::comma_separated_values, false, {}}; } +DecoderEngine CsvFile::decoder_engine() const noexcept { + return DecoderEngine::odr; +} + } // namespace odr::internal::csv diff --git a/src/odr/internal/csv/csv_file.hpp b/src/odr/internal/csv/csv_file.hpp index b0fadebf..00d67dd4 100644 --- a/src/odr/internal/csv/csv_file.hpp +++ b/src/odr/internal/csv/csv_file.hpp @@ -17,6 +17,7 @@ class CsvFile final : public abstract::TextFile { [[nodiscard]] FileType file_type() const noexcept final; [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; private: std::shared_ptr m_file; diff --git a/src/odr/internal/json/json_file.cpp b/src/odr/internal/json/json_file.cpp index f19ffca1..71b35024 100644 --- a/src/odr/internal/json/json_file.cpp +++ b/src/odr/internal/json/json_file.cpp @@ -24,4 +24,8 @@ FileMeta JsonFile::file_meta() const noexcept { return {FileType::javascript_object_notation, false, {}}; } +DecoderEngine JsonFile::decoder_engine() const noexcept { + return DecoderEngine::odr; +} + } // namespace odr::internal::json diff --git a/src/odr/internal/json/json_file.hpp b/src/odr/internal/json/json_file.hpp index 29872e23..6640d2bc 100644 --- a/src/odr/internal/json/json_file.hpp +++ b/src/odr/internal/json/json_file.hpp @@ -17,6 +17,7 @@ class JsonFile final : public abstract::TextFile { [[nodiscard]] FileType file_type() const noexcept final; [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; private: std::shared_ptr m_file; diff --git a/src/odr/internal/odf/odf_file.cpp b/src/odr/internal/odf/odf_file.cpp index 12cf46f9..5e85d90a 100644 --- a/src/odr/internal/odf/odf_file.cpp +++ b/src/odr/internal/odf/odf_file.cpp @@ -43,6 +43,10 @@ FileType OpenDocumentFile::file_type() const noexcept { FileMeta OpenDocumentFile::file_meta() const noexcept { return m_file_meta; } +DecoderEngine OpenDocumentFile::decoder_engine() const noexcept { + return DecoderEngine::odr; +} + DocumentType OpenDocumentFile::document_type() const { return m_file_meta.document_meta->document_type; } diff --git a/src/odr/internal/odf/odf_file.hpp b/src/odr/internal/odf/odf_file.hpp index 684ac403..8d50525e 100644 --- a/src/odr/internal/odf/odf_file.hpp +++ b/src/odr/internal/odf/odf_file.hpp @@ -28,6 +28,8 @@ class OpenDocumentFile final : public virtual abstract::DocumentFile { [[nodiscard]] FileType file_type() const noexcept final; [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; + [[nodiscard]] DocumentType document_type() const final; [[nodiscard]] DocumentMeta document_meta() const final; diff --git a/src/odr/internal/oldms/oldms_file.cpp b/src/odr/internal/oldms/oldms_file.cpp index 9b5a9f0f..256912be 100644 --- a/src/odr/internal/oldms/oldms_file.cpp +++ b/src/odr/internal/oldms/oldms_file.cpp @@ -62,6 +62,10 @@ FileType LegacyMicrosoftFile::file_type() const noexcept { FileMeta LegacyMicrosoftFile::file_meta() const noexcept { return m_file_meta; } +DecoderEngine LegacyMicrosoftFile::decoder_engine() const noexcept { + return DecoderEngine::odr; +} + DocumentType LegacyMicrosoftFile::document_type() const { return m_file_meta.document_meta->document_type; } diff --git a/src/odr/internal/oldms/oldms_file.hpp b/src/odr/internal/oldms/oldms_file.hpp index f3651c7e..7950b3e3 100644 --- a/src/odr/internal/oldms/oldms_file.hpp +++ b/src/odr/internal/oldms/oldms_file.hpp @@ -25,6 +25,8 @@ class LegacyMicrosoftFile final : public abstract::DocumentFile { [[nodiscard]] FileType file_type() const noexcept final; [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; + [[nodiscard]] DocumentType document_type() const final; [[nodiscard]] DocumentMeta document_meta() const final; diff --git a/src/odr/internal/ooxml/ooxml_file.cpp b/src/odr/internal/ooxml/ooxml_file.cpp index 2cf85092..6a1404be 100644 --- a/src/odr/internal/ooxml/ooxml_file.cpp +++ b/src/odr/internal/ooxml/ooxml_file.cpp @@ -37,6 +37,10 @@ FileType OfficeOpenXmlFile::file_type() const noexcept { FileMeta OfficeOpenXmlFile::file_meta() const noexcept { return m_file_meta; } +DecoderEngine OfficeOpenXmlFile::decoder_engine() const noexcept { + return DecoderEngine::odr; +} + DocumentType OfficeOpenXmlFile::document_type() const { return m_file_meta.document_meta->document_type; } diff --git a/src/odr/internal/ooxml/ooxml_file.hpp b/src/odr/internal/ooxml/ooxml_file.hpp index b3ab97a1..e6b420fe 100644 --- a/src/odr/internal/ooxml/ooxml_file.hpp +++ b/src/odr/internal/ooxml/ooxml_file.hpp @@ -27,6 +27,8 @@ class OfficeOpenXmlFile final : public abstract::DocumentFile { [[nodiscard]] FileType file_type() const noexcept final; [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; + [[nodiscard]] DocumentType document_type() const final; [[nodiscard]] DocumentMeta document_meta() const final; diff --git a/src/odr/internal/open_strategy.cpp b/src/odr/internal/open_strategy.cpp index 37df3282..fe20de88 100644 --- a/src/odr/internal/open_strategy.cpp +++ b/src/odr/internal/open_strategy.cpp @@ -22,7 +22,7 @@ namespace odr::internal { std::vector -open_strategy::types(std::shared_ptr file) { +open_strategy::types(const std::shared_ptr &file) { std::vector result; auto file_type = magic::file_type(*file); diff --git a/src/odr/internal/open_strategy.hpp b/src/odr/internal/open_strategy.hpp index a18cef8a..b6c554db 100644 --- a/src/odr/internal/open_strategy.hpp +++ b/src/odr/internal/open_strategy.hpp @@ -21,9 +21,10 @@ class Path; } // namespace odr::internal::common namespace odr::internal::open_strategy { -std::vector types(std::shared_ptr file); +std::vector +types(const std::shared_ptr &file); std::vector -engines(std::shared_ptr file, FileType as); +engines(const std::shared_ptr &file, FileType as); std::unique_ptr open_file(std::shared_ptr file); diff --git a/src/odr/internal/pdf/pdf_file.cpp b/src/odr/internal/pdf/pdf_file.cpp index 00260c0f..4227e13e 100644 --- a/src/odr/internal/pdf/pdf_file.cpp +++ b/src/odr/internal/pdf/pdf_file.cpp @@ -19,4 +19,8 @@ FileType PdfFile::file_type() const noexcept { FileMeta PdfFile::file_meta() const noexcept { return {}; } +DecoderEngine PdfFile::decoder_engine() const noexcept { + return DecoderEngine::odr; +} + } // namespace odr::internal::pdf diff --git a/src/odr/internal/pdf/pdf_file.hpp b/src/odr/internal/pdf/pdf_file.hpp index 42b751be..8b72bd37 100644 --- a/src/odr/internal/pdf/pdf_file.hpp +++ b/src/odr/internal/pdf/pdf_file.hpp @@ -14,6 +14,7 @@ class PdfFile : public abstract::DecodedFile { [[nodiscard]] FileType file_type() const noexcept final; [[nodiscard]] FileCategory file_category() const noexcept final; [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; private: std::shared_ptr m_file; diff --git a/src/odr/internal/svm/svm_file.cpp b/src/odr/internal/svm/svm_file.cpp index fefc604f..47db2716 100644 --- a/src/odr/internal/svm/svm_file.cpp +++ b/src/odr/internal/svm/svm_file.cpp @@ -31,6 +31,10 @@ FileMeta SvmFile::file_meta() const noexcept { return result; } +DecoderEngine SvmFile::decoder_engine() const noexcept { + return DecoderEngine::odr; +} + std::shared_ptr SvmFile::image() const { return {}; } } // namespace odr::internal::svm diff --git a/src/odr/internal/svm/svm_file.hpp b/src/odr/internal/svm/svm_file.hpp index 60e1f806..dd28cd6d 100644 --- a/src/odr/internal/svm/svm_file.hpp +++ b/src/odr/internal/svm/svm_file.hpp @@ -20,6 +20,7 @@ class SvmFile final : public abstract::ImageFile { [[nodiscard]] FileType file_type() const noexcept final; [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; [[nodiscard]] std::shared_ptr image() const final; diff --git a/src/odr/internal/text/text_file.cpp b/src/odr/internal/text/text_file.cpp index a6908f5d..7fc489c1 100644 --- a/src/odr/internal/text/text_file.cpp +++ b/src/odr/internal/text/text_file.cpp @@ -22,4 +22,8 @@ FileMeta TextFile::file_meta() const noexcept { return {FileType::text_file, false, {}}; } +DecoderEngine TextFile::decoder_engine() const noexcept { + return DecoderEngine::odr; +} + } // namespace odr::internal::text diff --git a/src/odr/internal/text/text_file.hpp b/src/odr/internal/text/text_file.hpp index f0e84271..69bad36b 100644 --- a/src/odr/internal/text/text_file.hpp +++ b/src/odr/internal/text/text_file.hpp @@ -18,6 +18,7 @@ class TextFile final : public abstract::TextFile { [[nodiscard]] FileType file_type() const noexcept final; [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; private: std::shared_ptr m_file; diff --git a/src/odr/internal/zip/zip_file.cpp b/src/odr/internal/zip/zip_file.cpp index b0fb59a7..496a986a 100644 --- a/src/odr/internal/zip/zip_file.cpp +++ b/src/odr/internal/zip/zip_file.cpp @@ -23,6 +23,10 @@ FileMeta ZipFile::file_meta() const noexcept { return meta; } +DecoderEngine ZipFile::decoder_engine() const noexcept { + return DecoderEngine::odr; +} + std::shared_ptr ZipFile::archive() const { return std::make_shared(m_zip); } diff --git a/src/odr/internal/zip/zip_file.hpp b/src/odr/internal/zip/zip_file.hpp index 6d11c195..fde6f6db 100644 --- a/src/odr/internal/zip/zip_file.hpp +++ b/src/odr/internal/zip/zip_file.hpp @@ -27,6 +27,7 @@ class ZipFile final : public abstract::ArchiveFile { [[nodiscard]] FileType file_type() const noexcept final; [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; [[nodiscard]] std::shared_ptr archive() const final; From 04bb3d58a009af518bf6d0deec392984fad60fab Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Sun, 15 Sep 2024 12:38:54 +0200 Subject: [PATCH 03/28] clean up --- conanfile.py | 27 +++++++++----------- scripts/conan | 6 ++--- src/odr/internal/html/pdf2htmlEX_wrapper.cpp | 2 +- src/odr/internal/html/wvWare_wrapper.cpp | 9 +++++-- 4 files changed, 23 insertions(+), 21 deletions(-) diff --git a/conanfile.py b/conanfile.py index 54ad2160..16eab5f2 100644 --- a/conanfile.py +++ b/conanfile.py @@ -1,11 +1,7 @@ -import os - from conan import ConanFile from conan.tools.build import check_min_cppstd from conan.tools.cmake import CMakeToolchain, CMakeDeps, CMake from conan.tools.env import Environment -from conan.tools.env.environment import EnvVars -from conan.tools.files import copy class OpenDocumentCoreConan(ConanFile): @@ -27,8 +23,18 @@ class OpenDocumentCoreConan(ConanFile): default_options = { "shared": False, "fPIC": True, + "with_pdf2htmlEX": True, + "with_wvWare": True, } + exports_sources = ["cli/*", "cmake/*", "src/*", "CMakeLists.txt"] + + def config_options(self): + if self.settings.os == "Windows": + del self.options.fPIC + del self.options.with_pdf2htmlEX + del self.options.with_wvWare + def requirements(self): self.requires("pugixml/1.14") self.requires("cryptopp/8.9.0") @@ -37,9 +43,9 @@ def requirements(self): self.requires("vincentlaucsb-csv-parser/2.3.0") self.requires("uchardet/0.0.8") self.requires("utfcpp/4.0.4") - if self.options.get_safe("with_pdf2htmlEX"): + if self.options.get_safe("with_pdf2htmlEX", False): self.requires("pdf2htmlex/0.18.8.rc1-20240905-git") - if self.options.get_safe("with_wvWare"): + if self.options.get_safe("with_wvWare", False): self.requires("wvware/1.2.9") def build_requirements(self): @@ -49,15 +55,6 @@ def validate_build(self): if self.settings.get_safe("compiler.cppstd"): check_min_cppstd(self, 20) - exports_sources = ["cli/*", "cmake/*", "src/*", "CMakeLists.txt"] - - def config_options(self): - if self.settings.os == "Windows": - del self.options.fPIC - - self.options.with_pdf2htmlEX = self.settings.os not in ["Windows", "Macos"] - self.options.with_wvWare = self.settings.os not in ["Windows", "Macos"] - def configure(self): if self.options.shared: self.options.rm_safe("fPIC") diff --git a/scripts/conan b/scripts/conan index c40f4d8a..6107326f 100755 --- a/scripts/conan +++ b/scripts/conan @@ -1,5 +1,5 @@ #!/usr/bin/env bash -conan install . --output-folder=cmake-build-relwithdebinfo --build=missing -s build_type=Release -s "&:build_type=RelWithDebInfo" -conan install . --output-folder=cmake-build-debug --build=missing -s build_type=Release -s "&:build_type=Debug" -conan install . --output-folder=cmake-build-release --build=never -s build_type=Release -s "&:build_type=Release" +conan install . --output-folder=cmake-build-relwithdebinfo --build=missing -s build_type=RelWithDebInfo -s "&:build_type=RelWithDebInfo" +conan install . --output-folder=cmake-build-debug --build=missing -s build_type=RelWithDebInfo -s "&:build_type=Debug" +conan install . --output-folder=cmake-build-release --build=missing -s build_type=RelWithDebInfo -s "&:build_type=Release" diff --git a/src/odr/internal/html/pdf2htmlEX_wrapper.cpp b/src/odr/internal/html/pdf2htmlEX_wrapper.cpp index 169821f8..b3c1a010 100644 --- a/src/odr/internal/html/pdf2htmlEX_wrapper.cpp +++ b/src/odr/internal/html/pdf2htmlEX_wrapper.cpp @@ -5,11 +5,11 @@ #include #include +#include #include #include -#include namespace odr::internal { diff --git a/src/odr/internal/html/wvWare_wrapper.cpp b/src/odr/internal/html/wvWare_wrapper.cpp index 1908836f..6e2376f3 100644 --- a/src/odr/internal/html/wvWare_wrapper.cpp +++ b/src/odr/internal/html/wvWare_wrapper.cpp @@ -1,12 +1,17 @@ +#include + #include #include #include + #include -#include #include -#include + #include +// TODO remove this +#include + namespace odr::internal::html { Html wvWare_wrapper(const std::string &input_path, From 282ea94d9084e6a3fafdeef405fde2dd346cb2bf Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Mon, 16 Sep 2024 09:08:13 +0200 Subject: [PATCH 04/28] draft --- CMakeLists.txt | 33 ++++++++--- conanfile.py | 2 +- src/odr/file.hpp | 2 +- src/odr/internal/html/pdf_file.hpp | 6 +- src/odr/internal/html/pdf_poppler_file.cpp | 58 +++++++++++++++++++ src/odr/internal/html/pdf_poppler_file.hpp | 21 +++++++ .../internal/pdf_poppler/poppler_pdf_file.cpp | 36 ++++++++++++ .../internal/pdf_poppler/poppler_pdf_file.hpp | 30 ++++++++++ 8 files changed, 175 insertions(+), 13 deletions(-) create mode 100644 src/odr/internal/html/pdf_poppler_file.cpp create mode 100644 src/odr/internal/html/pdf_poppler_file.hpp create mode 100644 src/odr/internal/pdf_poppler/poppler_pdf_file.cpp create mode 100644 src/odr/internal/pdf_poppler/poppler_pdf_file.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index d804a522..bb414b75 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -108,6 +108,7 @@ set(ODR_SOURCE_FILES "src/odr/internal/html/html_writer.cpp" "src/odr/internal/html/image_file.cpp" "src/odr/internal/html/pdf_file.cpp" + "src/odr/internal/html/pdf_poppler_file.hpp" "src/odr/internal/html/text_file.cpp" "src/odr/internal/json/json_file.cpp" @@ -194,16 +195,32 @@ target_link_libraries(odr utf8::cpp ) -if(WITH_PDF2HTMLEX) - target_sources(odr PRIVATE "src/odr/internal/html/pdf2htmlEX_wrapper.cpp") +if (WITH_PDF2HTMLEX) find_package(pdf2htmlEX REQUIRED) - target_link_libraries(odr PRIVATE pdf2htmlex::pdf2htmlex) -endif(WITH_PDF2HTMLEX) -if(WITH_WVWARE) - target_sources(odr PRIVATE "src/odr/internal/html/wvWare_wrapper.cpp") + find_package(poppler REQUIRED) + target_sources(odr + PRIVATE + "src/odr/internal/html/pdf2htmlEX_wrapper.cpp" + "src/odr/internal/html/pdf_poppler_file.cpp" + "src/odr/internal/pdf_poppler/poppler_pdf_file.cpp" + ) + target_link_libraries(odr + PRIVATE + pdf2htmlex::pdf2htmlex + poppler::poppler + ) +endif () +if (WITH_WVWARE) find_package(wvware REQUIRED) - target_link_libraries(odr PRIVATE wvware::wvware) -endif(WITH_WVWARE) + target_sources(odr + PRIVATE + "src/odr/internal/html/wvWare_wrapper.cpp" + ) + target_link_libraries(odr + PRIVATE + wvware::wvware + ) +endif () if (EXISTS "${PROJECT_SOURCE_DIR}/.git") add_dependencies(odr check_git) diff --git a/conanfile.py b/conanfile.py index 16eab5f2..6b772b7a 100644 --- a/conanfile.py +++ b/conanfile.py @@ -44,7 +44,7 @@ def requirements(self): self.requires("uchardet/0.0.8") self.requires("utfcpp/4.0.4") if self.options.get_safe("with_pdf2htmlEX", False): - self.requires("pdf2htmlex/0.18.8.rc1-20240905-git") + self.requires("pdf2htmlex/0.18.8.rc1-20240905-git", transitive_headers=True, transitive_libs=True) if self.options.get_safe("with_wvWare", False): self.requires("wvware/1.2.9") diff --git a/src/odr/file.hpp b/src/odr/file.hpp index 0b9f4823..f3abe05d 100644 --- a/src/odr/file.hpp +++ b/src/odr/file.hpp @@ -100,7 +100,7 @@ enum class FileLocation { /// @brief Collection of decoder engines. enum class DecoderEngine { odr, - pdf2html_ex, + poppler, wv_ware, }; diff --git a/src/odr/internal/html/pdf_file.hpp b/src/odr/internal/html/pdf_file.hpp index 6df0f55a..068b822a 100644 --- a/src/odr/internal/html/pdf_file.hpp +++ b/src/odr/internal/html/pdf_file.hpp @@ -1,5 +1,5 @@ -#ifndef ODR_INTERNAL_PDF_FILE_HPP -#define ODR_INTERNAL_PDF_FILE_HPP +#ifndef ODR_INTERNAL_HTML_PDF_FILE_HPP +#define ODR_INTERNAL_HTML_PDF_FILE_HPP #include @@ -17,4 +17,4 @@ Html translate_pdf_file(const PdfFile &pdf_file, const std::string &output_path, } -#endif // ODR_INTERNAL_PDF_FILE_HPP +#endif // ODR_INTERNAL_HTML_PDF_FILE_HPP diff --git a/src/odr/internal/html/pdf_poppler_file.cpp b/src/odr/internal/html/pdf_poppler_file.cpp new file mode 100644 index 00000000..23c2ef62 --- /dev/null +++ b/src/odr/internal/html/pdf_poppler_file.cpp @@ -0,0 +1,58 @@ +#include + +#include +#include +#include + +#include +#include +#include + +#include + +#include + +namespace odr::internal { + +Html html::translate_pdf_poppler_file(const PopplerPdfFile &pdf_file, + const std::string &output_path, + const HtmlConfig &config) { + static const char *fontconfig_path = getenv("FONTCONFIG_PATH"); + if (nullptr == fontconfig_path) { + // Storage is allocated and after successful putenv, it will never be freed. + // This is the way of putenv. + char *storage = strdup("FONTCONFIG_PATH=" FONTCONFIG_PATH); + if (0 != putenv(storage)) { + free(storage); + } + fontconfig_path = getenv("FONTCONFIG_PATH"); + } + + pdf2htmlEX::pdf2htmlEX pdf2htmlEX; + pdf2htmlEX.setDataDir(PDF2HTMLEX_DATA_DIR); + pdf2htmlEX.setPopplerDataDir(POPPLER_DATA_DIR); + + pdf2htmlEX.setDestinationDir(output_path); + auto output_file_name = "document.html"; + pdf2htmlEX.setOutputFilename(output_file_name); + + pdf2htmlEX.setDRM(false); + pdf2htmlEX.setProcessOutline(false); + pdf2htmlEX.setProcessAnnotation(true); + + try { + pdf2htmlEX.convert(); + } catch (const pdf2htmlEX::EncryptionPasswordException &e) { + throw WrongPassword(); + } catch (const pdf2htmlEX::DocumentCopyProtectedException &e) { + throw std::runtime_error("document is copy protected"); + } catch (const pdf2htmlEX::ConversionFailedException &e) { + throw std::runtime_error(std::string("conversion error ") + e.what()); + } + + return {FileType::portable_document_format, + config, + {{"document", output_path + "/" + output_file_name}}}; +} + +} // namespace odr::internal diff --git a/src/odr/internal/html/pdf_poppler_file.hpp b/src/odr/internal/html/pdf_poppler_file.hpp new file mode 100644 index 00000000..1bad1ac2 --- /dev/null +++ b/src/odr/internal/html/pdf_poppler_file.hpp @@ -0,0 +1,21 @@ +#ifndef ODR_INTERNAL_HTML_PDF_POPPLER_FILE_HPP +#define ODR_INTERNAL_HTML_PDF_POPPLER_FILE_HPP + +#include + +namespace odr { +class PopplerPdfFile; + +struct HtmlConfig; +class Html; +} // namespace odr + +namespace odr::internal::html { + +Html translate_pdf_poppler_file(const PopplerPdfFile &pdf_file, + const std::string &output_path, + const HtmlConfig &config); + +} + +#endif // ODR_INTERNAL_HTML_PDF_POPPLER_FILE_HPP diff --git a/src/odr/internal/pdf_poppler/poppler_pdf_file.cpp b/src/odr/internal/pdf_poppler/poppler_pdf_file.cpp new file mode 100644 index 00000000..462776e4 --- /dev/null +++ b/src/odr/internal/pdf_poppler/poppler_pdf_file.cpp @@ -0,0 +1,36 @@ +#include + +#include + +#include +#include + +namespace odr::internal::poppler_pdf { + +PopplerPdfFile::PopplerPdfFile(std::shared_ptr file) + : m_file{std::move(file)} { + GooString file_path(file->disk_path()->string().c_str()); + m_pdf_doc = std::unique_ptr(PDFDocFactory().createPDFDoc(file_path)); +} + +FileCategory PopplerPdfFile::file_category() const noexcept { + return FileCategory::document; +} + +std::shared_ptr PopplerPdfFile::file() const noexcept { + return m_file; +} + +FileType PopplerPdfFile::file_type() const noexcept { + return FileType::portable_document_format; +} + +FileMeta PopplerPdfFile::file_meta() const noexcept { return {}; } + +DecoderEngine PopplerPdfFile::decoder_engine() const noexcept { + return DecoderEngine::poppler; +} + +const PDFDoc &PopplerPdfFile::pdf_doc() const { return *m_pdf_doc; } + +} // namespace odr::internal::poppler_pdf diff --git a/src/odr/internal/pdf_poppler/poppler_pdf_file.hpp b/src/odr/internal/pdf_poppler/poppler_pdf_file.hpp new file mode 100644 index 00000000..06daa7ea --- /dev/null +++ b/src/odr/internal/pdf_poppler/poppler_pdf_file.hpp @@ -0,0 +1,30 @@ +#ifndef ODR_INTERNAL_POPPLER_PDF_FILE_HPP +#define ODR_INTERNAL_POPPLER_PDF_FILE_HPP + +#include + +class PDFDoc; + +namespace odr::internal::poppler_pdf { + +class PopplerPdfFile : public abstract::DecodedFile { +public: + explicit PopplerPdfFile(std::shared_ptr file); + + [[nodiscard]] std::shared_ptr file() const noexcept final; + + [[nodiscard]] FileType file_type() const noexcept final; + [[nodiscard]] FileCategory file_category() const noexcept final; + [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; + + [[nodiscard]] const PDFDoc &pdf_doc() const; + +private: + std::shared_ptr m_file; + std::unique_ptr m_pdf_doc; +}; + +} // namespace odr::internal::poppler_pdf + +#endif // ODR_INTERNAL_POPPLER_PDF_FILE_HPP From 92c8e3dd78d5f83d9f0f8c1404641a347afc7220 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Mon, 16 Sep 2024 09:40:07 +0200 Subject: [PATCH 05/28] draft --- src/odr/internal/html/pdf_poppler_file.cpp | 29 +++++----------------- 1 file changed, 6 insertions(+), 23 deletions(-) diff --git a/src/odr/internal/html/pdf_poppler_file.cpp b/src/odr/internal/html/pdf_poppler_file.cpp index 23c2ef62..e196abf9 100644 --- a/src/odr/internal/html/pdf_poppler_file.cpp +++ b/src/odr/internal/html/pdf_poppler_file.cpp @@ -8,7 +8,8 @@ #include #include -#include +#include +#include #include @@ -28,31 +29,13 @@ Html html::translate_pdf_poppler_file(const PopplerPdfFile &pdf_file, fontconfig_path = getenv("FONTCONFIG_PATH"); } - pdf2htmlEX::pdf2htmlEX pdf2htmlEX; - pdf2htmlEX.setDataDir(PDF2HTMLEX_DATA_DIR); - pdf2htmlEX.setPopplerDataDir(POPPLER_DATA_DIR); - - pdf2htmlEX.setDestinationDir(output_path); - auto output_file_name = "document.html"; - pdf2htmlEX.setOutputFilename(output_file_name); - - pdf2htmlEX.setDRM(false); - pdf2htmlEX.setProcessOutline(false); - pdf2htmlEX.setProcessAnnotation(true); - - try { - pdf2htmlEX.convert(); - } catch (const pdf2htmlEX::EncryptionPasswordException &e) { - throw WrongPassword(); - } catch (const pdf2htmlEX::DocumentCopyProtectedException &e) { - throw std::runtime_error("document is copy protected"); - } catch (const pdf2htmlEX::ConversionFailedException &e) { - throw std::runtime_error(std::string("conversion error ") + e.what()); - } + Param param; + + HTMLRenderer(nullptr, param).process(&pdf_file.pdf_doc()); return {FileType::portable_document_format, config, - {{"document", output_path + "/" + output_file_name}}}; + {{"document", output_path + "/document.html"}}}; } } // namespace odr::internal From 52d0fe98b0e4b59f526b124b9a5c7230ac21cdc6 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Mon, 16 Sep 2024 15:11:04 +0200 Subject: [PATCH 06/28] draft --- CMakeLists.txt | 3 +- conanfile.py | 2 +- src/odr/file.cpp | 14 +- src/odr/file.hpp | 13 +- src/odr/internal/abstract/file.hpp | 14 ++ src/odr/internal/html/pdf2htmlEX_wrapper.cpp | 142 ++++++++++++++---- src/odr/internal/html/pdf2htmlEX_wrapper.hpp | 37 +++-- src/odr/internal/html/pdf_poppler_file.cpp | 41 ----- src/odr/internal/html/pdf_poppler_file.hpp | 21 --- src/odr/internal/open_strategy.cpp | 2 +- src/odr/internal/pdf/pdf_file.cpp | 4 +- src/odr/internal/pdf/pdf_file.hpp | 4 +- .../internal/pdf_poppler/poppler_pdf_file.cpp | 57 +++++-- .../internal/pdf_poppler/poppler_pdf_file.hpp | 22 ++- test/data/input/odr-private | 2 +- test/src/pdf2htmlEX_wrapper_test.cpp | 16 +- 16 files changed, 240 insertions(+), 154 deletions(-) delete mode 100644 src/odr/internal/html/pdf_poppler_file.cpp delete mode 100644 src/odr/internal/html/pdf_poppler_file.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index bb414b75..ea59d501 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -108,7 +108,7 @@ set(ODR_SOURCE_FILES "src/odr/internal/html/html_writer.cpp" "src/odr/internal/html/image_file.cpp" "src/odr/internal/html/pdf_file.cpp" - "src/odr/internal/html/pdf_poppler_file.hpp" + "src/odr/internal/html/pdf2htmlEX_wrapper.hpp" "src/odr/internal/html/text_file.cpp" "src/odr/internal/json/json_file.cpp" @@ -201,7 +201,6 @@ if (WITH_PDF2HTMLEX) target_sources(odr PRIVATE "src/odr/internal/html/pdf2htmlEX_wrapper.cpp" - "src/odr/internal/html/pdf_poppler_file.cpp" "src/odr/internal/pdf_poppler/poppler_pdf_file.cpp" ) target_link_libraries(odr diff --git a/conanfile.py b/conanfile.py index 6b772b7a..16eab5f2 100644 --- a/conanfile.py +++ b/conanfile.py @@ -44,7 +44,7 @@ def requirements(self): self.requires("uchardet/0.0.8") self.requires("utfcpp/4.0.4") if self.options.get_safe("with_pdf2htmlEX", False): - self.requires("pdf2htmlex/0.18.8.rc1-20240905-git", transitive_headers=True, transitive_libs=True) + self.requires("pdf2htmlex/0.18.8.rc1-20240905-git") if self.options.get_safe("with_wvWare", False): self.requires("wvware/1.2.9") diff --git a/src/odr/file.cpp b/src/odr/file.cpp index 52e8d6e9..739a4965 100644 --- a/src/odr/file.cpp +++ b/src/odr/file.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include @@ -69,7 +68,7 @@ FileMeta DecodedFile::meta(const std::string &path) { DecodedFile::DecodedFile(std::shared_ptr impl) : m_impl{std::move(impl)} { - if (!m_impl) { + if (m_impl == nullptr) { throw UnknownFileType(); } } @@ -125,7 +124,8 @@ bool DecodedFile::is_document_file() const { } bool DecodedFile::is_pdf_file() const { - return std::dynamic_pointer_cast(m_impl) != nullptr; + return std::dynamic_pointer_cast(m_impl) != + nullptr; } TextFile DecodedFile::text_file() const { @@ -162,7 +162,7 @@ DocumentFile DecodedFile::document_file() const { PdfFile DecodedFile::pdf_file() const { if (auto pdf_file = - std::dynamic_pointer_cast(m_impl)) { + std::dynamic_pointer_cast(m_impl)) { return PdfFile(pdf_file); } throw NoPdfFile(); @@ -233,7 +233,11 @@ DocumentMeta DocumentFile::document_meta() const { Document DocumentFile::document() const { return Document(m_impl->document()); } -PdfFile::PdfFile(std::shared_ptr impl) +PdfFile::PdfFile(std::shared_ptr impl) : DecodedFile(impl), m_impl{std::move(impl)} {} +std::shared_ptr PdfFile::impl() const { + return m_impl; +} + } // namespace odr diff --git a/src/odr/file.hpp b/src/odr/file.hpp index f3abe05d..ca630306 100644 --- a/src/odr/file.hpp +++ b/src/odr/file.hpp @@ -14,11 +14,8 @@ class TextFile; class ImageFile; class ArchiveFile; class DocumentFile; -} // namespace odr::internal::abstract - -namespace odr::internal::pdf { class PdfFile; -} +} // namespace odr::internal::abstract namespace odr { class TextFile; @@ -179,7 +176,7 @@ class DecodedFile { static FileType type(const std::string &path); static FileMeta meta(const std::string &path); - explicit DecodedFile(std::shared_ptr); + explicit DecodedFile(std::shared_ptr impl); explicit DecodedFile(const File &file); DecodedFile(const File &file, FileType as); explicit DecodedFile(const std::string &path); @@ -271,10 +268,12 @@ class DocumentFile final : public DecodedFile { /// @brief Represents a PDF file. class PdfFile final : public DecodedFile { public: - explicit PdfFile(std::shared_ptr); + explicit PdfFile(std::shared_ptr); + + [[nodiscard]] std::shared_ptr impl() const; private: - std::shared_ptr m_impl; + std::shared_ptr m_impl; }; } // namespace odr diff --git a/src/odr/internal/abstract/file.hpp b/src/odr/internal/abstract/file.hpp index 63cbbaa9..728e9d30 100644 --- a/src/odr/internal/abstract/file.hpp +++ b/src/odr/internal/abstract/file.hpp @@ -82,6 +82,20 @@ class DocumentFile : public DecodedFile { [[nodiscard]] virtual std::shared_ptr document() const = 0; }; +class PdfFile : public DecodedFile { +public: + [[nodiscard]] FileType file_type() const noexcept final { + return FileType::portable_document_format; + } + [[nodiscard]] FileCategory file_category() const noexcept final { + return FileCategory::document; + } + + [[nodiscard]] virtual bool password_encrypted() const noexcept = 0; + [[nodiscard]] virtual EncryptionState encryption_state() const noexcept = 0; + [[nodiscard]] virtual bool decrypt(const std::string &password) = 0; +}; + } // namespace odr::internal::abstract #endif // ODR_INTERNAL_ABSTRACT_FILE_HPP diff --git a/src/odr/internal/html/pdf2htmlEX_wrapper.cpp b/src/odr/internal/html/pdf2htmlEX_wrapper.cpp index b3c1a010..03d864ec 100644 --- a/src/odr/internal/html/pdf2htmlEX_wrapper.cpp +++ b/src/odr/internal/html/pdf2htmlEX_wrapper.cpp @@ -4,61 +4,139 @@ #include #include -#include +#include #include -#include +#include +#include + +#include +#include #include namespace odr::internal { -Html html::pdf2htmlEX_wrapper(const std::string &input_path, - const std::string &output_path, - const HtmlConfig &config, - std::optional &password) { - static const char *fontconfig_path = getenv("FONTCONFIG_PATH"); - if (nullptr == fontconfig_path) { +Html html::translate_pdf_poppler_file(const PopplerPdfFile &pdf_file, + const std::string &output_path, + const HtmlConfig &config) { + PDFDoc &pdf_doc = pdf_file.pdf_doc(); + + if (!pdf_doc.isOk()) { + int errCode = pdf_doc.getErrorCode(); + if (errCode == errEncrypted) { + throw EncryptionPasswordException(std::to_string(errCode)); + } else { + throw ConversionFailedException(std::to_string(errCode)); + } + } + + const char *fontconfig_path = std::getenv("FONTCONFIG_PATH"); + if (fontconfig_path == nullptr) { // Storage is allocated and after successful putenv, it will never be freed. // This is the way of putenv. char *storage = strdup("FONTCONFIG_PATH=" FONTCONFIG_PATH); if (0 != putenv(storage)) { free(storage); } - fontconfig_path = getenv("FONTCONFIG_PATH"); + fontconfig_path = std::getenv("FONTCONFIG_PATH"); } - pdf2htmlEX::pdf2htmlEX pdf2htmlEX; - pdf2htmlEX.setDataDir(PDF2HTMLEX_DATA_DIR); - pdf2htmlEX.setPopplerDataDir(POPPLER_DATA_DIR); + pdf2htmlEX::Param param; - pdf2htmlEX.setInputFilename(input_path); - pdf2htmlEX.setDestinationDir(output_path); - auto output_file_name = "document.html"; - pdf2htmlEX.setOutputFilename(output_file_name); + // pages + param.first_page = 1; + param.last_page = pdf_doc.getNumPages(); - pdf2htmlEX.setDRM(false); - pdf2htmlEX.setProcessOutline(false); - pdf2htmlEX.setProcessAnnotation(true); + // dimension + param.zoom = 0; + param.fit_width = 0; + param.fit_height = 0; + param.use_cropbox = 1; + param.desired_dpi = 144.0; - if (password.has_value()) { - pdf2htmlEX.setOwnerPassword(password.value()); - pdf2htmlEX.setUserPassword(password.value()); - } + // output + param.embed_css = 1; + param.embed_font = 1; + param.embed_image = 1; + param.embed_javascript = 1; + param.embed_outline = 1; + param.split_pages = 0; + param.dest_dir = output_path; + param.css_filename = ""; + param.page_filename = ""; + param.outline_filename = ""; + param.process_nontext = 1; + param.process_outline = 1; + param.process_annotation = 0; + param.process_form = 0; + param.printing = 1; + param.fallback = 0; + param.tmp_file_size_limit = -1; + + // font + param.embed_external_font = 0; // TODO 1 + param.font_format = "woff"; + param.decompose_ligature = 0; + param.turn_off_ligatures = 0; + param.auto_hint = 0; + param.external_hint_tool = ""; + param.stretch_narrow_glyph = 0; + param.squeeze_wide_glyph = 1; + param.override_fstype = 0; + param.process_type3 = 0; + + // text + param.h_eps = 1.0; + param.v_eps = 1.0; + param.space_threshold = 1.0 / 8; + param.font_size_multiplier = 4.0; + param.space_as_offset = 0; + param.tounicode = 0; + param.optimize_text = 0; + param.correct_text_visibility = 1; + param.text_dpi = 300; - try { - pdf2htmlEX.convert(); - } catch (const pdf2htmlEX::EncryptionPasswordException &e) { - throw WrongPassword(); - } catch (const pdf2htmlEX::DocumentCopyProtectedException &e) { - throw std::runtime_error("document is copy protected"); - } catch (const pdf2htmlEX::ConversionFailedException &e) { - throw std::runtime_error(std::string("conversion error ") + e.what()); + // background + param.bg_format = "png"; + param.svg_node_count_limit = -1; + param.svg_embed_bitmap = 1; + + // encryption + param.owner_password = ""; + param.user_password = ""; + param.no_drm = 0; + + // misc + param.clean_tmp = 1; + param.tmp_dir = "/tmp"; + param.data_dir = PDF2HTMLEX_DATA_DIR; + param.poppler_data_dir = POPPLER_DATA_DIR; + param.debug = 0; + param.proof = 0; + param.quiet = 1; + + // input, output + param.input_filename = ""; + param.output_filename = "document.html"; + + if (!pdf_doc.okToCopy()) { + if (param.no_drm == 0) { + throw DocumentCopyProtectedException(""); + } } + globalParams = std::make_unique( + !param.poppler_data_dir.empty() ? param.poppler_data_dir.c_str() + : nullptr); + + pdf2htmlEX::HTMLRenderer(nullptr, param).process(&pdf_doc); + + globalParams.reset(); + return {FileType::portable_document_format, config, - {{"document", output_path + "/" + output_file_name}}}; + {{"document", output_path + "/document.html"}}}; } } // namespace odr::internal diff --git a/src/odr/internal/html/pdf2htmlEX_wrapper.hpp b/src/odr/internal/html/pdf2htmlEX_wrapper.hpp index ace0e5ce..186a6fc0 100644 --- a/src/odr/internal/html/pdf2htmlEX_wrapper.hpp +++ b/src/odr/internal/html/pdf2htmlEX_wrapper.hpp @@ -1,23 +1,38 @@ -#ifndef ODR_INTERNAL_PDF2HTMLEX_WRAPPER_HPP -#define ODR_INTERNAL_PDF2HTMLEX_WRAPPER_HPP +#ifndef ODR_INTERNAL_HTML_PDF2HTMLEX_WRAPPER_HPP +#define ODR_INTERNAL_HTML_PDF2HTMLEX_WRAPPER_HPP -#include #include namespace odr { -class PdfFile; - struct HtmlConfig; class Html; } // namespace odr +namespace odr::internal { +class PopplerPdfFile; +} // namespace odr::internal + namespace odr::internal::html { -Html pdf2htmlEX_wrapper(const std::string &input_path, - const std::string &output_path, - const HtmlConfig &config, - std::optional &password); +Html translate_pdf_poppler_file(const PopplerPdfFile &pdf_file, + const std::string &output_path, + const HtmlConfig &config); + +class ConversionFailedException : public std::runtime_error { +public: + using std::runtime_error::runtime_error; +}; + +class DocumentCopyProtectedException : public ConversionFailedException { +public: + using ConversionFailedException::ConversionFailedException; +}; + +class EncryptionPasswordException : public ConversionFailedException { +public: + using ConversionFailedException::ConversionFailedException; +}; -} +} // namespace odr::internal::html -#endif // ODR_INTERNAL_PDF2HTMLEX_WRAPPER_HPP +#endif // ODR_INTERNAL_HTML_PDF2HTMLEX_WRAPPER_HPP diff --git a/src/odr/internal/html/pdf_poppler_file.cpp b/src/odr/internal/html/pdf_poppler_file.cpp deleted file mode 100644 index e196abf9..00000000 --- a/src/odr/internal/html/pdf_poppler_file.cpp +++ /dev/null @@ -1,41 +0,0 @@ -#include - -#include -#include -#include - -#include -#include -#include - -#include -#include - -#include - -namespace odr::internal { - -Html html::translate_pdf_poppler_file(const PopplerPdfFile &pdf_file, - const std::string &output_path, - const HtmlConfig &config) { - static const char *fontconfig_path = getenv("FONTCONFIG_PATH"); - if (nullptr == fontconfig_path) { - // Storage is allocated and after successful putenv, it will never be freed. - // This is the way of putenv. - char *storage = strdup("FONTCONFIG_PATH=" FONTCONFIG_PATH); - if (0 != putenv(storage)) { - free(storage); - } - fontconfig_path = getenv("FONTCONFIG_PATH"); - } - - Param param; - - HTMLRenderer(nullptr, param).process(&pdf_file.pdf_doc()); - - return {FileType::portable_document_format, - config, - {{"document", output_path + "/document.html"}}}; -} - -} // namespace odr::internal diff --git a/src/odr/internal/html/pdf_poppler_file.hpp b/src/odr/internal/html/pdf_poppler_file.hpp deleted file mode 100644 index 1bad1ac2..00000000 --- a/src/odr/internal/html/pdf_poppler_file.hpp +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef ODR_INTERNAL_HTML_PDF_POPPLER_FILE_HPP -#define ODR_INTERNAL_HTML_PDF_POPPLER_FILE_HPP - -#include - -namespace odr { -class PopplerPdfFile; - -struct HtmlConfig; -class Html; -} // namespace odr - -namespace odr::internal::html { - -Html translate_pdf_poppler_file(const PopplerPdfFile &pdf_file, - const std::string &output_path, - const HtmlConfig &config); - -} - -#endif // ODR_INTERNAL_HTML_PDF_POPPLER_FILE_HPP diff --git a/src/odr/internal/open_strategy.cpp b/src/odr/internal/open_strategy.cpp index fe20de88..2eef3d40 100644 --- a/src/odr/internal/open_strategy.cpp +++ b/src/odr/internal/open_strategy.cpp @@ -133,7 +133,7 @@ open_strategy::open_file(std::shared_ptr file) { return cfb_file; } else if (file_type == FileType::portable_document_format) { - return std::make_unique(file); + return std::make_unique(file); } else if (file_type == FileType::portable_network_graphics || file_type == FileType::graphics_interchange_format || file_type == FileType::jpeg || diff --git a/src/odr/internal/pdf/pdf_file.cpp b/src/odr/internal/pdf/pdf_file.cpp index 4227e13e..97903ada 100644 --- a/src/odr/internal/pdf/pdf_file.cpp +++ b/src/odr/internal/pdf/pdf_file.cpp @@ -1,6 +1,6 @@ #include -namespace odr::internal::pdf { +namespace odr::internal { PdfFile::PdfFile(std::shared_ptr file) : m_file{std::move(file)} {} @@ -23,4 +23,4 @@ DecoderEngine PdfFile::decoder_engine() const noexcept { return DecoderEngine::odr; } -} // namespace odr::internal::pdf +} // namespace odr::internal diff --git a/src/odr/internal/pdf/pdf_file.hpp b/src/odr/internal/pdf/pdf_file.hpp index 8b72bd37..ec1d1dd8 100644 --- a/src/odr/internal/pdf/pdf_file.hpp +++ b/src/odr/internal/pdf/pdf_file.hpp @@ -3,7 +3,7 @@ #include -namespace odr::internal::pdf { +namespace odr::internal { class PdfFile : public abstract::DecodedFile { public: @@ -20,6 +20,6 @@ class PdfFile : public abstract::DecodedFile { std::shared_ptr m_file; }; -} // namespace odr::internal::pdf +} // namespace odr::internal #endif // ODR_INTERNAL_PDF_FILE_HPP diff --git a/src/odr/internal/pdf_poppler/poppler_pdf_file.cpp b/src/odr/internal/pdf_poppler/poppler_pdf_file.cpp index 462776e4..77e94765 100644 --- a/src/odr/internal/pdf_poppler/poppler_pdf_file.cpp +++ b/src/odr/internal/pdf_poppler/poppler_pdf_file.cpp @@ -1,36 +1,67 @@ #include -#include - #include #include -namespace odr::internal::poppler_pdf { +namespace odr::internal { PopplerPdfFile::PopplerPdfFile(std::shared_ptr file) : m_file{std::move(file)} { - GooString file_path(file->disk_path()->string().c_str()); - m_pdf_doc = std::unique_ptr(PDFDocFactory().createPDFDoc(file_path)); + open(std::nullopt); } -FileCategory PopplerPdfFile::file_category() const noexcept { - return FileCategory::document; +void PopplerPdfFile::open(const std::optional &password) { + GooString file_path_goo(m_file->disk_path()->string()); + std::optional password_goo; + if (password.has_value()) { + password_goo = GooString(password.value().c_str()); + } + + m_pdf_doc = std::shared_ptr( + PDFDocFactory().createPDFDoc(file_path_goo, password_goo, password_goo)); + + if (!m_pdf_doc->isOk()) { + if (m_pdf_doc->getErrorCode() == errEncrypted) { + m_encryption_state = EncryptionState::encrypted; + } else { + throw std::runtime_error("Failed to open PDF file"); + } + } else { + m_encryption_state = m_pdf_doc->isEncrypted() + ? EncryptionState::decrypted + : EncryptionState::not_encrypted; + } } std::shared_ptr PopplerPdfFile::file() const noexcept { return m_file; } -FileType PopplerPdfFile::file_type() const noexcept { - return FileType::portable_document_format; -} - FileMeta PopplerPdfFile::file_meta() const noexcept { return {}; } DecoderEngine PopplerPdfFile::decoder_engine() const noexcept { return DecoderEngine::poppler; } -const PDFDoc &PopplerPdfFile::pdf_doc() const { return *m_pdf_doc; } +bool PopplerPdfFile::password_encrypted() const noexcept { + return m_encryption_state == EncryptionState::encrypted || + m_encryption_state == EncryptionState::decrypted; +} + +EncryptionState PopplerPdfFile::encryption_state() const noexcept { + return m_encryption_state; +} + +bool PopplerPdfFile::decrypt(const std::string &password) { + if (encryption_state() != EncryptionState::encrypted) { + return false; + } + + open(password); + + return encryption_state() == EncryptionState::decrypted; +} + +PDFDoc &PopplerPdfFile::pdf_doc() const { return *m_pdf_doc; } -} // namespace odr::internal::poppler_pdf +} // namespace odr::internal diff --git a/src/odr/internal/pdf_poppler/poppler_pdf_file.hpp b/src/odr/internal/pdf_poppler/poppler_pdf_file.hpp index 06daa7ea..8bf697bc 100644 --- a/src/odr/internal/pdf_poppler/poppler_pdf_file.hpp +++ b/src/odr/internal/pdf_poppler/poppler_pdf_file.hpp @@ -3,28 +3,36 @@ #include +#include + class PDFDoc; -namespace odr::internal::poppler_pdf { +namespace odr::internal { -class PopplerPdfFile : public abstract::DecodedFile { +class PopplerPdfFile final : public abstract::PdfFile { public: explicit PopplerPdfFile(std::shared_ptr file); [[nodiscard]] std::shared_ptr file() const noexcept final; - [[nodiscard]] FileType file_type() const noexcept final; - [[nodiscard]] FileCategory file_category() const noexcept final; [[nodiscard]] FileMeta file_meta() const noexcept final; [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; - [[nodiscard]] const PDFDoc &pdf_doc() const; + [[nodiscard]] bool password_encrypted() const noexcept final; + [[nodiscard]] EncryptionState encryption_state() const noexcept final; + [[nodiscard]] bool decrypt(const std::string &password) final; + + [[nodiscard]] PDFDoc &pdf_doc() const; private: std::shared_ptr m_file; - std::unique_ptr m_pdf_doc; + std::shared_ptr m_pdf_doc; + + EncryptionState m_encryption_state{EncryptionState::unknown}; + + void open(const std::optional &password); }; -} // namespace odr::internal::poppler_pdf +} // namespace odr::internal #endif // ODR_INTERNAL_POPPLER_PDF_FILE_HPP diff --git a/test/data/input/odr-private b/test/data/input/odr-private index a997171b..2e0f2f9a 160000 --- a/test/data/input/odr-private +++ b/test/data/input/odr-private @@ -1 +1 @@ -Subproject commit a997171b727f230c4a81421d43e2ed62f37b94ca +Subproject commit 2e0f2f9ac0af7b3fd11a3f808e0ac2cf479c6b25 diff --git a/test/src/pdf2htmlEX_wrapper_test.cpp b/test/src/pdf2htmlEX_wrapper_test.cpp index e29c82c3..02a2ae07 100644 --- a/test/src/pdf2htmlEX_wrapper_test.cpp +++ b/test/src/pdf2htmlEX_wrapper_test.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include using namespace odr; @@ -35,18 +36,17 @@ TEST_P(pdf2htmlEXWrapperTests, html) { fs::create_directories(output_path); HtmlConfig config; - std::optional password; + PopplerPdfFile pdf_file(std::make_shared(test_file.path)); + + EXPECT_EQ(test_file.password_encrypted, pdf_file.password_encrypted()); if (test_file.password_encrypted) { - password = test_file.password; - } - // @TODO: why does test_file.password_encrypted == false for this file?? - else if (test_file.path.ends_with("encrypted_fontfile3_opentype.pdf")) { - password = "sample-user-password"; + EXPECT_TRUE(pdf_file.decrypt(test_file.password)); + EXPECT_EQ(EncryptionState::decrypted, pdf_file.encryption_state()); } - Html html = odr::internal::html::pdf2htmlEX_wrapper( - test_file.path, output_path, config, password); + Html html = odr::internal::html::translate_pdf_poppler_file( + pdf_file, output_path, config); for (const HtmlPage &html_page : html.pages()) { EXPECT_TRUE(fs::is_regular_file(html_page.path)); EXPECT_LT(0, fs::file_size(html_page.path)); From 0cd4d8578efe3b898d4263e73c7a676e4d86378f Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Mon, 16 Sep 2024 17:21:30 +0200 Subject: [PATCH 07/28] draft --- src/odr/file.hpp | 3 + src/odr/html.cpp | 12 +- src/odr/internal/html/pdf2htmlEX_wrapper.cpp | 2 +- src/odr/internal/html/pdf2htmlEX_wrapper.hpp | 2 +- src/odr/internal/open_strategy.cpp | 227 +++++++++++++++++- src/odr/internal/open_strategy.hpp | 8 +- src/odr/internal/pdf/pdf_file.cpp | 16 +- src/odr/internal/pdf/pdf_file.hpp | 8 +- .../internal/pdf_poppler/poppler_pdf_file.cpp | 23 +- .../internal/pdf_poppler/poppler_pdf_file.hpp | 1 + test/src/pdf2htmlEX_wrapper_test.cpp | 2 +- 11 files changed, 277 insertions(+), 27 deletions(-) diff --git a/src/odr/file.hpp b/src/odr/file.hpp index ca630306..5ed0322e 100644 --- a/src/odr/file.hpp +++ b/src/odr/file.hpp @@ -103,6 +103,9 @@ enum class DecoderEngine { /// @brief Preference for decoding files. struct DecodePreference final { + std::optional as_file_type; + std::optional with_engine; + std::vector file_type_priority; std::vector engine_priority; }; diff --git a/src/odr/html.cpp b/src/odr/html.cpp index 9a7fd62b..b3dd6a78 100644 --- a/src/odr/html.cpp +++ b/src/odr/html.cpp @@ -9,8 +9,10 @@ #include #include #include +#include #include #include +#include #include @@ -112,7 +114,15 @@ Html html::translate(const Document &document, const std::string &output_path, Html html::translate(const PdfFile &pdf_file, const std::string &output_path, const HtmlConfig &config) { - fs::create_directories(output_path); + auto pdf_file_impl = pdf_file.impl(); + + if (auto poppler_pdf_file = + std::dynamic_pointer_cast(pdf_file_impl)) { + fs::create_directories(output_path); + return internal::html::translate_poppler_pdf_file(*poppler_pdf_file, + output_path, config); + } + return internal::html::translate_pdf_file(pdf_file, output_path, config); } diff --git a/src/odr/internal/html/pdf2htmlEX_wrapper.cpp b/src/odr/internal/html/pdf2htmlEX_wrapper.cpp index 03d864ec..10bb226a 100644 --- a/src/odr/internal/html/pdf2htmlEX_wrapper.cpp +++ b/src/odr/internal/html/pdf2htmlEX_wrapper.cpp @@ -17,7 +17,7 @@ namespace odr::internal { -Html html::translate_pdf_poppler_file(const PopplerPdfFile &pdf_file, +Html html::translate_poppler_pdf_file(const PopplerPdfFile &pdf_file, const std::string &output_path, const HtmlConfig &config) { PDFDoc &pdf_doc = pdf_file.pdf_doc(); diff --git a/src/odr/internal/html/pdf2htmlEX_wrapper.hpp b/src/odr/internal/html/pdf2htmlEX_wrapper.hpp index 186a6fc0..0fce0a70 100644 --- a/src/odr/internal/html/pdf2htmlEX_wrapper.hpp +++ b/src/odr/internal/html/pdf2htmlEX_wrapper.hpp @@ -14,7 +14,7 @@ class PopplerPdfFile; namespace odr::internal::html { -Html translate_pdf_poppler_file(const PopplerPdfFile &pdf_file, +Html translate_poppler_pdf_file(const PopplerPdfFile &pdf_file, const std::string &output_path, const HtmlConfig &config); diff --git a/src/odr/internal/open_strategy.cpp b/src/odr/internal/open_strategy.cpp index 2eef3d40..498a4a18 100644 --- a/src/odr/internal/open_strategy.cpp +++ b/src/odr/internal/open_strategy.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -93,6 +94,26 @@ open_strategy::types(const std::shared_ptr &file) { return result; } +std::vector +open_strategy::engines(const std::shared_ptr &file, + FileType as) { + std::vector result; + + result.push_back(DecoderEngine::odr); + + if (as == FileType::legacy_word_document || + as == FileType::legacy_powerpoint_presentation || + as == FileType::legacy_excel_worksheets) { + result.push_back(DecoderEngine::wv_ware); + } + + if (as == FileType::portable_document_format) { + result.push_back(DecoderEngine::poppler); + } + + return result; +} + std::unique_ptr open_strategy::open_file(std::shared_ptr file) { auto file_type = magic::file_type(*file); @@ -167,10 +188,208 @@ open_strategy::open_file(std::shared_ptr file) { } std::unique_ptr -open_strategy::open_file(std::shared_ptr /*file*/, - const FileType /*as*/) { - // TODO implement - throw UnknownFileType(); +open_strategy::open_file(std::shared_ptr file, FileType as) { + DecodePreference preference; + preference.as_file_type = as; + return open_file(file, preference); +} + +std::unique_ptr +open_strategy::open_file(std::shared_ptr file, FileType as, + DecoderEngine with) { + if (as == FileType::opendocument_text || + as == FileType::opendocument_presentation || + as == FileType::opendocument_spreadsheet || + as == FileType::opendocument_graphics) { + if (with == DecoderEngine::odr) { + try { + auto memory_file = std::make_shared(*file); + auto zip_file = std::make_unique(std::move(memory_file)); + auto filesystem = zip_file->archive()->filesystem(); + return std::make_unique(filesystem); + } catch (...) { + } + return nullptr; + } + return nullptr; + } + + if (as == FileType::office_open_xml_document || + as == FileType::office_open_xml_presentation || + as == FileType::office_open_xml_workbook || + as == FileType::office_open_xml_encrypted) { + if (with == DecoderEngine::odr) { + try { + auto memory_file = std::make_shared(*file); + auto cfb_file = std::make_unique(std::move(memory_file)); + auto filesystem = cfb_file->archive()->filesystem(); + return std::make_unique(filesystem); + } catch (...) { + } + return nullptr; + } + return nullptr; + } + + if (as == FileType::legacy_word_document || + as == FileType::legacy_excel_worksheets) { + if (with == DecoderEngine::odr) { + try { + auto memory_file = std::make_shared(*file); + auto cfb_file = std::make_unique(std::move(memory_file)); + auto filesystem = cfb_file->archive()->filesystem(); + return std::make_unique(filesystem); + } catch (...) { + } + return nullptr; + } + return nullptr; + } + + if (as == FileType::portable_document_format) { + if (with == DecoderEngine::odr) { + try { + return std::make_unique(file); + } catch (...) { + } + return nullptr; + } + if (with == DecoderEngine::poppler) { + try { + auto memory_file = std::make_shared(*file); + return std::make_unique(memory_file); + } catch (...) { + } + return nullptr; + } + return nullptr; + } + + if (as == FileType::portable_network_graphics || + as == FileType::graphics_interchange_format || as == FileType::jpeg || + as == FileType::bitmap_image_file) { + if (with == DecoderEngine::odr) { + try { + return std::make_unique(file, as); + } catch (...) { + } + return nullptr; + } + return nullptr; + } + + if (as == FileType::starview_metafile) { + if (with == DecoderEngine::odr) { + try { + auto memory_file = std::make_shared(*file); + return std::make_unique(memory_file); + } catch (...) { + } + return nullptr; + } + return nullptr; + } + + if (as == FileType::text_file) { + if (with == DecoderEngine::odr) { + try { + return std::make_unique(file); + } catch (...) { + } + return nullptr; + } + return nullptr; + } + + if (as == FileType::comma_separated_values) { + if (with == DecoderEngine::odr) { + try { + auto text = std::make_shared(file); + return std::make_unique(text); + } catch (...) { + } + return nullptr; + } + return nullptr; + } + + if (as == FileType::javascript_object_notation) { + if (with == DecoderEngine::odr) { + try { + auto text = std::make_shared(file); + return std::make_unique(text); + } catch (...) { + } + return nullptr; + } + return nullptr; + } + + if (as == FileType::zip) { + if (with == DecoderEngine::odr) { + try { + auto memory_file = std::make_shared(*file); + return std::make_unique(memory_file); + } catch (...) { + } + return nullptr; + } + return nullptr; + } + + if (as == FileType::compound_file_binary_format) { + if (with == DecoderEngine::odr) { + try { + auto memory_file = std::make_shared(*file); + return std::make_unique(memory_file); + } catch (...) { + } + return nullptr; + } + return nullptr; + } + + return nullptr; +} + +std::unique_ptr +open_strategy::open_file(std::shared_ptr file, + const DecodePreference &preference) { + std::vector probe_types = + preference.as_file_type.has_value() + ? std::vector{*preference.as_file_type} + : preference.file_type_priority; + { + std::vector detected_types = types(file); + probe_types.insert(probe_types.end(), detected_types.begin(), + detected_types.end()); + auto probe_types_end = std::unique(probe_types.begin(), probe_types.end()); + probe_types.erase(probe_types_end, probe_types.end()); + } + + for (FileType as : probe_types) { + std::vector probe_engines = + preference.with_engine.has_value() + ? std::vector{*preference.with_engine} + : preference.engine_priority; + { + std::vector detected_engines = engines(file, as); + probe_engines.insert(probe_engines.end(), detected_engines.begin(), + detected_engines.end()); + auto probe_engines_end = + std::unique(probe_engines.begin(), probe_engines.end()); + probe_engines.erase(probe_engines_end, probe_engines.end()); + } + + for (DecoderEngine with : probe_engines) { + auto decoded_file = open_file(file, as, with); + if (decoded_file != nullptr) { + return decoded_file; + } + } + } + + return nullptr; } std::unique_ptr diff --git a/src/odr/internal/open_strategy.hpp b/src/odr/internal/open_strategy.hpp index b6c554db..704c28e0 100644 --- a/src/odr/internal/open_strategy.hpp +++ b/src/odr/internal/open_strategy.hpp @@ -7,7 +7,7 @@ namespace odr { enum class FileType; enum class DecoderEngine; -struct DecodePreferences; +struct DecodePreference; } // namespace odr namespace odr::internal::abstract { @@ -30,15 +30,13 @@ std::unique_ptr open_file(std::shared_ptr file); std::unique_ptr open_file(std::shared_ptr file, FileType as); + std::unique_ptr open_file(std::shared_ptr file, FileType as, DecoderEngine with); std::unique_ptr open_file(std::shared_ptr file, - const DecodePreferences &preferences); -std::unique_ptr -open_file(std::shared_ptr file, FileType as, - const DecodePreferences &preferences); + const DecodePreference &preference); std::unique_ptr open_document_file(std::shared_ptr file); diff --git a/src/odr/internal/pdf/pdf_file.cpp b/src/odr/internal/pdf/pdf_file.cpp index 97903ada..a1ba8e56 100644 --- a/src/odr/internal/pdf/pdf_file.cpp +++ b/src/odr/internal/pdf/pdf_file.cpp @@ -5,22 +5,22 @@ namespace odr::internal { PdfFile::PdfFile(std::shared_ptr file) : m_file{std::move(file)} {} -FileCategory PdfFile::file_category() const noexcept { - return FileCategory::document; -} - std::shared_ptr PdfFile::file() const noexcept { return m_file; } -FileType PdfFile::file_type() const noexcept { - return FileType::portable_document_format; -} - FileMeta PdfFile::file_meta() const noexcept { return {}; } DecoderEngine PdfFile::decoder_engine() const noexcept { return DecoderEngine::odr; } +bool PdfFile::password_encrypted() const noexcept { return false; } + +EncryptionState PdfFile::encryption_state() const noexcept { + return EncryptionState::not_encrypted; +} + +bool PdfFile::decrypt(const std::string &) { return false; } + } // namespace odr::internal diff --git a/src/odr/internal/pdf/pdf_file.hpp b/src/odr/internal/pdf/pdf_file.hpp index ec1d1dd8..a19f89f1 100644 --- a/src/odr/internal/pdf/pdf_file.hpp +++ b/src/odr/internal/pdf/pdf_file.hpp @@ -5,17 +5,19 @@ namespace odr::internal { -class PdfFile : public abstract::DecodedFile { +class PdfFile final : public abstract::PdfFile { public: explicit PdfFile(std::shared_ptr file); [[nodiscard]] std::shared_ptr file() const noexcept final; - [[nodiscard]] FileType file_type() const noexcept final; - [[nodiscard]] FileCategory file_category() const noexcept final; [[nodiscard]] FileMeta file_meta() const noexcept final; [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; + [[nodiscard]] bool password_encrypted() const noexcept final; + [[nodiscard]] EncryptionState encryption_state() const noexcept final; + [[nodiscard]] bool decrypt(const std::string &password) final; + private: std::shared_ptr m_file; }; diff --git a/src/odr/internal/pdf_poppler/poppler_pdf_file.cpp b/src/odr/internal/pdf_poppler/poppler_pdf_file.cpp index 77e94765..fed671df 100644 --- a/src/odr/internal/pdf_poppler/poppler_pdf_file.cpp +++ b/src/odr/internal/pdf_poppler/poppler_pdf_file.cpp @@ -1,6 +1,7 @@ #include #include +#include #include namespace odr::internal { @@ -10,15 +11,31 @@ PopplerPdfFile::PopplerPdfFile(std::shared_ptr file) open(std::nullopt); } +PopplerPdfFile::PopplerPdfFile(std::shared_ptr file) + : m_file{std::move(file)} { + open(std::nullopt); +} + void PopplerPdfFile::open(const std::optional &password) { - GooString file_path_goo(m_file->disk_path()->string()); std::optional password_goo; if (password.has_value()) { password_goo = GooString(password.value().c_str()); } - m_pdf_doc = std::shared_ptr( - PDFDocFactory().createPDFDoc(file_path_goo, password_goo, password_goo)); + if (auto disk_file = std::dynamic_pointer_cast(m_file)) { + auto file_path_goo = + std::make_unique(disk_file->disk_path()->string().c_str()); + m_pdf_doc = std::make_shared(std::move(file_path_goo), password_goo, + password_goo); + } else if (auto memory_file = + std::dynamic_pointer_cast(m_file)) { + // `stream` is freed by `m_pdf_doc` + auto stream = new MemStream(memory_file->memory_data(), 0, + memory_file->size(), Object(objNull)); + m_pdf_doc = std::make_shared(stream, password_goo, password_goo); + } else { + throw std::runtime_error("Unsupported file type"); + } if (!m_pdf_doc->isOk()) { if (m_pdf_doc->getErrorCode() == errEncrypted) { diff --git a/src/odr/internal/pdf_poppler/poppler_pdf_file.hpp b/src/odr/internal/pdf_poppler/poppler_pdf_file.hpp index 8bf697bc..47f7866e 100644 --- a/src/odr/internal/pdf_poppler/poppler_pdf_file.hpp +++ b/src/odr/internal/pdf_poppler/poppler_pdf_file.hpp @@ -12,6 +12,7 @@ namespace odr::internal { class PopplerPdfFile final : public abstract::PdfFile { public: explicit PopplerPdfFile(std::shared_ptr file); + explicit PopplerPdfFile(std::shared_ptr file); [[nodiscard]] std::shared_ptr file() const noexcept final; diff --git a/test/src/pdf2htmlEX_wrapper_test.cpp b/test/src/pdf2htmlEX_wrapper_test.cpp index 02a2ae07..2d985e97 100644 --- a/test/src/pdf2htmlEX_wrapper_test.cpp +++ b/test/src/pdf2htmlEX_wrapper_test.cpp @@ -45,7 +45,7 @@ TEST_P(pdf2htmlEXWrapperTests, html) { EXPECT_EQ(EncryptionState::decrypted, pdf_file.encryption_state()); } - Html html = odr::internal::html::translate_pdf_poppler_file( + Html html = odr::internal::html::translate_poppler_pdf_file( pdf_file, output_path, config); for (const HtmlPage &html_page : html.pages()) { EXPECT_TRUE(fs::is_regular_file(html_page.path)); From 43be0ceb9e8747913f94a25519e0b3b984b3bf4f Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Thu, 19 Sep 2024 11:02:17 +0200 Subject: [PATCH 08/28] many stuff; much wvware --- CMakeLists.txt | 1 + src/odr/exceptions.cpp | 3 + src/odr/exceptions.hpp | 5 + src/odr/file.cpp | 23 + src/odr/file.hpp | 7 + src/odr/internal/html/pdf2htmlEX_wrapper.cpp | 9 - src/odr/internal/html/pdf2htmlEX_wrapper.hpp | 12 +- src/odr/internal/html/wvWare_wrapper.cpp | 509 ++++++++++++++++-- src/odr/internal/html/wvWare_wrapper.hpp | 12 +- src/odr/internal/oldms/oldms_file.cpp | 5 - .../oldms_wvware/wvware_oldms_file.cpp | 107 ++++ .../oldms_wvware/wvware_oldms_file.hpp | 53 ++ src/odr/internal/open_strategy.cpp | 8 + src/odr/open_document_reader.cpp | 40 +- src/odr/open_document_reader.hpp | 29 + test/CMakeLists.txt | 9 - test/data/input/odr-public | 2 +- test/data/reference-output/odr-private | 2 +- test/data/reference-output/odr-public | 2 +- test/src/document_test.cpp | 2 +- test/src/html_output_test.cpp | 131 ++++- test/src/pdf2htmlEX_wrapper_test.cpp | 68 --- test/src/test_util.cpp | 132 ++--- test/src/test_util.hpp | 26 +- test/src/wvWare_wrapper_test.cpp | 65 --- 25 files changed, 949 insertions(+), 313 deletions(-) create mode 100644 src/odr/internal/oldms_wvware/wvware_oldms_file.cpp create mode 100644 src/odr/internal/oldms_wvware/wvware_oldms_file.hpp delete mode 100644 test/src/pdf2htmlEX_wrapper_test.cpp delete mode 100644 test/src/wvWare_wrapper_test.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ea59d501..253c9484 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -214,6 +214,7 @@ if (WITH_WVWARE) target_sources(odr PRIVATE "src/odr/internal/html/wvWare_wrapper.cpp" + "src/odr/internal/oldms_wvware/wvware_oldms_file.cpp" ) target_link_libraries(odr PRIVATE diff --git a/src/odr/exceptions.cpp b/src/odr/exceptions.cpp index 65d97585..7b81c7ef 100644 --- a/src/odr/exceptions.cpp +++ b/src/odr/exceptions.cpp @@ -14,6 +14,9 @@ UnknownFileType::UnknownFileType() : std::runtime_error("unknown file type") {} UnsupportedFileType::UnsupportedFileType(const FileType file_type) : std::runtime_error("unknown file type"), file_type{file_type} {} +UnknownDecoderEngine::UnknownDecoderEngine() + : std::runtime_error("unknown decoder engine") {} + FileReadError::FileReadError() : std::runtime_error("file read error") {} FileWriteError::FileWriteError() : std::runtime_error("file write error") {} diff --git a/src/odr/exceptions.hpp b/src/odr/exceptions.hpp index 356d773a..eae36daf 100644 --- a/src/odr/exceptions.hpp +++ b/src/odr/exceptions.hpp @@ -28,6 +28,11 @@ struct UnsupportedFileType final : public std::runtime_error { explicit UnsupportedFileType(FileType file_type); }; +/// @brief Unknown decoder engine exception +struct UnknownDecoderEngine final : public std::runtime_error { + UnknownDecoderEngine(); +}; + /// @brief File read error struct FileReadError final : public std::runtime_error { FileReadError(); diff --git a/src/odr/file.cpp b/src/odr/file.cpp index 739a4965..7a189f18 100644 --- a/src/odr/file.cpp +++ b/src/odr/file.cpp @@ -58,6 +58,12 @@ std::vector DecodedFile::types(const std::string &path) { std::make_shared(path)); } +std::vector DecodedFile::engines(const std::string &path, + FileType as) { + return internal::open_strategy::engines( + std::make_shared(path), as); +} + FileType DecodedFile::type(const std::string &path) { return DecodedFile(path).file_type(); } @@ -87,6 +93,11 @@ DecodedFile::DecodedFile(const std::string &path, FileType as) : DecodedFile(internal::open_strategy::open_file( std::make_shared(path), as)) {} +DecodedFile::DecodedFile(const std::string &path, + const DecodePreference &preference) + : DecodedFile(internal::open_strategy::open_file( + std::make_shared(path), preference)) {} + DecodedFile::operator bool() const { return m_impl.operator bool(); } FileType DecodedFile::file_type() const noexcept { return m_impl->file_type(); } @@ -236,6 +247,18 @@ Document DocumentFile::document() const { return Document(m_impl->document()); } PdfFile::PdfFile(std::shared_ptr impl) : DecodedFile(impl), m_impl{std::move(impl)} {} +bool PdfFile::password_encrypted() const { + return m_impl->password_encrypted(); +} + +EncryptionState PdfFile::encryption_state() const { + return m_impl->encryption_state(); +} + +bool PdfFile::decrypt(const std::string &password) { + return m_impl->decrypt(password); +} + std::shared_ptr PdfFile::impl() const { return m_impl; } diff --git a/src/odr/file.hpp b/src/odr/file.hpp index 5ed0322e..72013e19 100644 --- a/src/odr/file.hpp +++ b/src/odr/file.hpp @@ -176,6 +176,8 @@ class File final { class DecodedFile { public: static std::vector types(const std::string &path); + static std::vector engines(const std::string &path, + FileType as); static FileType type(const std::string &path); static FileMeta meta(const std::string &path); @@ -184,6 +186,7 @@ class DecodedFile { DecodedFile(const File &file, FileType as); explicit DecodedFile(const std::string &path); DecodedFile(const std::string &path, FileType as); + DecodedFile(const std::string &path, const DecodePreference &preference); [[nodiscard]] explicit operator bool() const; @@ -273,6 +276,10 @@ class PdfFile final : public DecodedFile { public: explicit PdfFile(std::shared_ptr); + [[nodiscard]] bool password_encrypted() const; + [[nodiscard]] EncryptionState encryption_state() const; + bool decrypt(const std::string &password); + [[nodiscard]] std::shared_ptr impl() const; private: diff --git a/src/odr/internal/html/pdf2htmlEX_wrapper.cpp b/src/odr/internal/html/pdf2htmlEX_wrapper.cpp index 10bb226a..6a97897a 100644 --- a/src/odr/internal/html/pdf2htmlEX_wrapper.cpp +++ b/src/odr/internal/html/pdf2htmlEX_wrapper.cpp @@ -22,15 +22,6 @@ Html html::translate_poppler_pdf_file(const PopplerPdfFile &pdf_file, const HtmlConfig &config) { PDFDoc &pdf_doc = pdf_file.pdf_doc(); - if (!pdf_doc.isOk()) { - int errCode = pdf_doc.getErrorCode(); - if (errCode == errEncrypted) { - throw EncryptionPasswordException(std::to_string(errCode)); - } else { - throw ConversionFailedException(std::to_string(errCode)); - } - } - const char *fontconfig_path = std::getenv("FONTCONFIG_PATH"); if (fontconfig_path == nullptr) { // Storage is allocated and after successful putenv, it will never be freed. diff --git a/src/odr/internal/html/pdf2htmlEX_wrapper.hpp b/src/odr/internal/html/pdf2htmlEX_wrapper.hpp index 0fce0a70..ee492fee 100644 --- a/src/odr/internal/html/pdf2htmlEX_wrapper.hpp +++ b/src/odr/internal/html/pdf2htmlEX_wrapper.hpp @@ -18,21 +18,11 @@ Html translate_poppler_pdf_file(const PopplerPdfFile &pdf_file, const std::string &output_path, const HtmlConfig &config); -class ConversionFailedException : public std::runtime_error { +class DocumentCopyProtectedException : public std::runtime_error { public: using std::runtime_error::runtime_error; }; -class DocumentCopyProtectedException : public ConversionFailedException { -public: - using ConversionFailedException::ConversionFailedException; -}; - -class EncryptionPasswordException : public ConversionFailedException { -public: - using ConversionFailedException::ConversionFailedException; -}; - } // namespace odr::internal::html #endif // ODR_INTERNAL_HTML_PDF2HTMLEX_WRAPPER_HPP diff --git a/src/odr/internal/html/wvWare_wrapper.cpp b/src/odr/internal/html/wvWare_wrapper.cpp index 6e2376f3..825b3aac 100644 --- a/src/odr/internal/html/wvWare_wrapper.cpp +++ b/src/odr/internal/html/wvWare_wrapper.cpp @@ -1,57 +1,504 @@ #include -#include #include #include #include -#include +#include #include -// TODO remove this -#include +#include -namespace odr::internal::html { +namespace odr::internal { -Html wvWare_wrapper(const std::string &input_path, - const std::string &output_path, const HtmlConfig &config, - std::optional &password) { - if (nullptr == g_wvDataDir) { - g_wvDataDir = WVDATADIR; +/// A lot of this code is duplicated from wvWare, mostly from `wvWare.c` and +/// `wvHtml.c`. +/// +/// wvWare is writing to stdout, while we want to write to a file. Also, wvWare +/// is configurable to write not only HTML but also other formats. We only need +/// HTML. +/// +/// We decided to duplicate the code instead of changing upstream wvWare code +/// because it is rather an application not a library, it is quite outdated and +/// not actively developed, and written in C. Duplication allows for a clean +/// separation between wvWare and our code while also being able to write modern +/// C++ code. +/// +/// A copy of wvWare can be found here: +/// https://github.com/opendocument-app/wvWare +namespace { + +/// Extension of `expand_data` see +/// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wv.h#L2776-L2814 +/// to allow for more state variables. +struct TranslationState : public expand_data { + int i = 0; + char *charset = nullptr; + PAP *ppap = nullptr; +}; + +/// Originally from `wvWare.c` `wvStrangeNoGraphicData` +/// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wvWare.c#L661-L676 +/// simplified to HTML output +void strange_no_graphic_data(wvParseStruct *ps, int graphicstype) { + std::cerr << "Strange No Graphic Data in the 0x01/0x08 graphic\n"; + + // TODO print to output file + printf(R"(%#.2x graphic
)", graphicstype, + "StrangeNoGraphicData"); +} + +/// Originally from `wvWare.c` `name_to_url` +/// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wvWare.c#L1703-L1772 +char *name_to_url(char *name) { + // TODO get rid of static + // TODO use std::string + static char *url = 0; + static long max = 0; + char *ptr = 0; + long count = 0; + + ptr = name; + while (*ptr) { + switch (*ptr) { + case ' ': + count += 3; + break; + default: + count++; + break; + } + ptr++; } + count++; - auto output_file_path = output_path + "/document.html"; + if (count > max) { + char *more = nullptr; + if (url == nullptr) { + more = static_cast(malloc(count)); + } else { + more = static_cast(realloc(url, count)); + } + if (more != nullptr) { + url = more; + max = count; + } + } + + if (url != nullptr) { + count = 0; + ptr = name; + while ((*ptr != 0) && (count < max)) { + switch (*ptr) { + case ' ': + url[count++] = '%'; + if (count < max) + url[count++] = '2'; + if (count < max) + url[count++] = '0'; + break; + default: + url[count++] = *ptr; + break; + } + ptr++; + } + url[max - 1] = 0; + } else { + std::cerr << "failed to convert name to URL\n"; + return name; + } + + return url; +} + +/// Originally from `wvWare.c` `wvPrintGraphics` +/// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wvWare.c#L1239-L1287 +/// simplified to HTML output +void print_graphics(int graphicstype, int width, int height, char *source) { + // upstream converts to PNG, we just use the original format as the browser + // should support them + + // TODO export/embed image - char *input_file_path = strdup(input_path.c_str()); - char *output_dir = strdup(output_path.c_str()); + // TODO replace printf + printf(R"(%#.2x graphic
)", + width, height, graphicstype, name_to_url(source)); +} - g_htmlOutputFileHandle = fopen(output_file_path.c_str(), "w"); +/// Originally from `wvWare.c` `myelehandler` +/// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wvWare.c#L503-L599 +int element_handler(wvParseStruct *ps, wvTag tag, void *props, int dirty) { + auto *data = (TranslationState *)ps->userData; + data->anSttbfAssoc = &ps->anSttbfAssoc; + data->lfo = &ps->lfo; + data->lfolvl = ps->lfolvl; + data->lvl = ps->lvl; + data->nolfo = &ps->nolfo; + data->nooflvl = &ps->nooflvl; + data->stsh = &ps->stsh; + data->lst = &ps->lst; + data->noofLST = &ps->noofLST; + data->liststartnos = &ps->liststartnos; + data->listnfcs = &ps->listnfcs; + data->finallvl = &ps->finallvl; + data->fib = &ps->fib; + data->dop = &ps->dop; + data->intable = &ps->intable; + data->cellbounds = &ps->cellbounds; + data->nocellbounds = &ps->nocellbounds; + data->endcell = &ps->endcell; + data->vmerges = &ps->vmerges; + data->norows = &ps->norows; + data->nextpap = &ps->nextpap; + if (data->charset == nullptr) { + data->charset = wvAutoCharset(ps); + } + data->props = props; - std::string pw; - if (password.has_value()) { - pw = password.value(); + switch (tag) { + case PARABEGIN: { + S16 tilfo = 0; + /* test begin */ + if (*(data->endcell) != 0) { + tilfo = ((PAP *)(data->props))->ilfo; + ((PAP *)(data->props))->ilfo = 0; + } + /* test end */ + data->ppap = (PAP *)data->props; + wvBeginPara(data); + if (tilfo != 0) { + ((PAP *)(data->props))->ilfo = tilfo; + } + } break; + case PARAEND: { + S16 tilfo = 0; + /* test begin */ + if (*(data->endcell) != 0) { + tilfo = ((PAP *)(data->props))->ilfo; + ((PAP *)(data->props))->ilfo = 0; + } + /* test end */ + wvEndCharProp(data); /* danger will break in the future */ + wvEndPara(data); + if (tilfo != 0) { + ((PAP *)(data->props))->ilfo = tilfo; + } + wvCopyPAP(&data->lastpap, (PAP *)(data->props)); + } break; + case CHARPROPBEGIN: + wvBeginCharProp(data, data->ppap); + break; + case CHARPROPEND: + wvEndCharProp(data); + break; + case SECTIONBEGIN: + wvBeginSection(data); + break; + case SECTIONEND: + wvEndSection(data); + break; + case COMMENTBEGIN: + wvBeginComment(data); + break; + case COMMENTEND: + wvEndComment(data); + break; + default: + break; } - int retVal = wvHtml_convert(input_file_path, output_dir, pw.c_str()); - free(output_dir); - free(input_file_path); - fclose(g_htmlOutputFileHandle); - g_htmlOutputFileHandle = nullptr; + return 0; +} - if (0 != retVal) { - unlink(output_file_path.c_str()); +/// Originally from `wvWare.c` `mydochandler` +/// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wvWare.c#L601-L659 +int document_handler(wvParseStruct *ps, wvTag tag) { + auto *data = (TranslationState *)ps->userData; + data->anSttbfAssoc = &ps->anSttbfAssoc; + data->lfo = &ps->lfo; + data->lfolvl = ps->lfolvl; + data->lvl = ps->lvl; + data->nolfo = &ps->nolfo; + data->nooflvl = &ps->nooflvl; + data->stsh = &ps->stsh; + data->lst = &ps->lst; + data->noofLST = &ps->noofLST; + data->liststartnos = &ps->liststartnos; + data->listnfcs = &ps->listnfcs; + data->finallvl = &ps->finallvl; + data->fib = &ps->fib; + data->dop = &ps->dop; + data->intable = &ps->intable; + data->cellbounds = &ps->cellbounds; + data->nocellbounds = &ps->nocellbounds; + data->endcell = &ps->endcell; + data->vmerges = &ps->vmerges; + data->norows = &ps->norows; + if (data->i == 0) { + wvSetEntityConverter(data); + data->filename = ps->filename; + data->whichcell = 0; + data->whichrow = 0; + data->asep = nullptr; + data->i++; + wvInitPAP(&data->lastpap); + data->nextpap = nullptr; + data->ps = ps; + } - switch (retVal) { - case 100: // PasswordRequired - case 101: // Wrong Password - throw WrongPassword(); - default: - throw std::runtime_error("Conversion error"); + if (data->charset == nullptr) { + data->charset = wvAutoCharset(ps); + } + + switch (tag) { + case DOCBEGIN: + wvBeginDocument(data); + break; + case DOCEND: + wvEndDocument(data); + break; + default: + break; + } + + return 0; +} + +/// Originally from `wvWare.c` `myCharProc` +/// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wvWare.c#L1556-L1605 +int char_handler(wvParseStruct *ps, U16 eachchar, U8 chartype, U16 lid) { + auto *data = (TranslationState *)ps->userData; + + switch (eachchar) { + case 19: + ps->fieldstate++; + ps->fieldmiddle = 0; + fieldCharProc(ps, eachchar, chartype, lid); /* temp */ + return 0; + break; + case 20: + fieldCharProc(ps, eachchar, chartype, lid); + ps->fieldmiddle = 1; + return 0; + break; + case 21: + ps->fieldmiddle = 0; + ps->fieldstate--; + fieldCharProc(ps, eachchar, chartype, lid); /* temp */ + return 0; + break; + case 0x08: + std::cerr << "hmm did we loose the fSpec flag ?, this is possibly a bug\n"; + break; + default: + break; + } + + if (ps->fieldstate != 0) { + if (fieldCharProc(ps, eachchar, chartype, lid) != 0) { + return 0; + } + } + + if (data->charset != nullptr) { + wvOutputHtmlChar(eachchar, chartype, data->charset, lid); + } else { + wvOutputHtmlChar(eachchar, chartype, wvAutoCharset(ps), lid); + } + + return 0; +} + +/// Originally from `wvWare.c` `mySpecCharProc` +/// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wvWare.c#L1289-L1553 +int special_char_handler(wvParseStruct *ps, U16 eachchar, CHP *achp) { + static int message; + PICF picf; + FSPA *fspa; + auto *data = (TranslationState *)ps->userData; + + switch (eachchar) { + case 19: + std::cerr << "field began\n"; + ps->fieldstate++; + ps->fieldmiddle = 0; + fieldCharProc(ps, eachchar, 0, 0x400); /* temp */ + return 0; + case 20: + if (achp->fOle2) { + std::cerr << "this field has an associated embedded object of id " + << achp->fcPic_fcObj_lTagObj << "\n"; + } + fieldCharProc(ps, eachchar, 0, 0x400); /* temp */ + ps->fieldmiddle = 1; + return 0; + case 21: + ps->fieldstate--; + ps->fieldmiddle = 0; + fieldCharProc(ps, eachchar, 0, 0x400); /* temp */ + return 0; + break; + } + + if (ps->fieldstate) { + if (fieldCharProc(ps, eachchar, 0, 0x400)) + return 0; + } + + switch (eachchar) { + case 0x05: + /* this should be handled by the COMMENTBEGIN and COMMENTEND events */ + return 0; + case 0x01: { + wvStream *f; + Blip blip; + char *name; + long p = wvStream_tell(ps->data); + std::cerr << "picture 0x01 here, at offset " << achp->fcPic_fcObj_lTagObj + << " in Data Stream, obj is " << achp->fObj << ", ole is " + << achp->fOle2 << "\n"; + + if (achp->fOle2) { + return 0; + } + if (no_graphics != 0) { + wvStream_goto(ps->data, achp->fcPic_fcObj_lTagObj); + wvGetPICF(wvQuerySupported(&ps->fib, nullptr), &picf, ps->data); + f = picf.rgb; + if (wv0x01(&blip, f, picf.lcb - picf.cbHeader) != 0) { + name = wvHtmlGraphic(ps, &blip); + print_graphics(0x01, (int)wvTwipsToHPixels(picf.dxaGoal), + (int)wvTwipsToVPixels(picf.dyaGoal), name); + wvFree(name); + } else { + strange_no_graphic_data(ps, 0x01); + } + } + + wvStream_goto(ps->data, p); + return 0; + } + case 0x08: { + Blip blip; + char *name; + if (wvQuerySupported(&ps->fib, nullptr) == WORD8) { + if (!no_graphics) { + if (ps->nooffspa > 0) { + fspa = wvGetFSPAFromCP(ps->currentcp, ps->fspa, ps->fspapos, + ps->nooffspa); + + if (fspa == nullptr) { + std::cerr << "No fspa! Insanity abounds!\n"; + return 0; + } + + data->props = fspa; + if (wv0x08(&blip, fspa->spid, ps)) { + name = wvHtmlGraphic(ps, &blip); + print_graphics( + 0x08, (int)wvTwipsToHPixels(fspa->xaRight - fspa->xaLeft), + (int)wvTwipsToVPixels(fspa->yaBottom - fspa->yaTop), name); + wvFree(name); + } else + strange_no_graphic_data(ps, 0x08); + } else { + std::cerr << "nooffspa was <=0! Ignoring.\n"; + } + } + } else { + FDOA *fdoa; + std::cerr << "pre word8 0x08 graphic, unsupported at the moment\n"; + fdoa = + wvGetFDOAFromCP(ps->currentcp, ps->fdoa, ps->fdoapos, ps->nooffdoa); + data->props = fdoa; } + + // Potentially relevant disabled code section in `wvWare.c`? + // https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wvWare.c#L1443-L1459 + + return 0; + } + case 0x28: { + U16 symbol[6] = {'S', 'y', 'm', 'b', 'o', 'l'}; + U16 wingdings[9] = {'W', 'i', 'n', 'g', 'd', 'i', 'n', 'g', 's'}; + U16 mtextra[8] = {'M', 'T', ' ', 'E', 'x', 't', 'r', 'a'}; + + if (0 == memcmp(symbol, ps->fonts.ffn[achp->ftcSym].xszFfn, 12)) { + if ((!message) && (strcasecmp("UTF-8", data->charset) != 0)) { + std::cerr + << "Symbol font detected (too late sorry!), rerun wvHtml with option --charset utf-8\n\ +option to support correct symbol font conversion to a viewable format.\n"; + message++; + } + wvOutputFromUnicode(wvConvertSymbolToUnicode(achp->xchSym - 61440), + data->charset); + return 0; + } else if (0 == memcmp(mtextra, ps->fonts.ffn[achp->ftcSym].xszFfn, 16)) { + if ((message == 0) && (strcasecmp("UTF-8", data->charset) != 0)) { + std::cerr + << "MT Extra font detected (too late sorry!), rerun wvHtml with option --charset utf-8\n\ +option to support correct symbol font conversion to a viewable format.\n"; + message++; + } + wvOutputFromUnicode(wvConvertMTExtraToUnicode(achp->xchSym - 61440), + data->charset); + return 0; + } else if (0 == memcmp(wingdings, ps->fonts.ffn[achp->ftcSym].xszFfn, 18)) { + if (message == 0) { + std::cerr << "Wingdings font detected, i need a mapping table to " + "unicode for this\n"; + message++; + } + } else { + if (message == 0) { + char *fontname = wvWideStrToMB(ps->fonts.ffn[achp->ftcSym].xszFfn); + std::cerr << "Special font " << fontname + << ", i need a mapping table to unicode for this\n"; + wvFree(fontname); + // TODO replace printf + printf("*"); + } + return 0; + } + } + default: + return 0; + } + + return 0; +} + +} // namespace + +Html html::translate_wvware_oldms_file( + const WvWareLegacyMicrosoftFile &oldms_file, const std::string &output_path, + const HtmlConfig &config) { + auto output_file_path = output_path + "/document.html"; + + char *wv_config = nullptr; // TODO + + wvParseStruct &ps = oldms_file.parse_struct(); + + wvSetElementHandler(&ps, element_handler); + wvSetDocumentHandler(&ps, document_handler); + wvSetCharHandler(&ps, char_handler); + wvSetSpecialCharHandler(&ps, special_char_handler); + + state_data handle; + TranslationState translation_state; + + wvInitStateData(&handle); + + translation_state.sd = &handle; + ps.userData = &translation_state; + + if (wvHtml(&ps) != 0) { + throw std::runtime_error("wvHtml failed"); } return { FileType::legacy_word_document, config, {{"document", output_file_path}}}; } -} // namespace odr::internal::html +} // namespace odr::internal diff --git a/src/odr/internal/html/wvWare_wrapper.hpp b/src/odr/internal/html/wvWare_wrapper.hpp index e7000901..b548923d 100644 --- a/src/odr/internal/html/wvWare_wrapper.hpp +++ b/src/odr/internal/html/wvWare_wrapper.hpp @@ -5,17 +5,19 @@ #include namespace odr { -class File; - struct HtmlConfig; class Html; } // namespace odr +namespace odr::internal { +class WvWareLegacyMicrosoftFile; +} // namespace odr::internal + namespace odr::internal::html { -Html wvWare_wrapper(const std::string &input_path, - const std::string &output_path, const HtmlConfig &config, - std::optional &password); +Html translate_wvware_oldms_file(const WvWareLegacyMicrosoftFile &oldms_file, + const std::string &output_path, + const HtmlConfig &config); } diff --git a/src/odr/internal/oldms/oldms_file.cpp b/src/odr/internal/oldms/oldms_file.cpp index 256912be..a8433df0 100644 --- a/src/odr/internal/oldms/oldms_file.cpp +++ b/src/odr/internal/oldms/oldms_file.cpp @@ -8,11 +8,6 @@ #include #include -namespace odr::internal::abstract { -class Document; -class File; -} // namespace odr::internal::abstract - namespace odr::internal::oldms { namespace { diff --git a/src/odr/internal/oldms_wvware/wvware_oldms_file.cpp b/src/odr/internal/oldms_wvware/wvware_oldms_file.cpp new file mode 100644 index 00000000..d5177692 --- /dev/null +++ b/src/odr/internal/oldms_wvware/wvware_oldms_file.cpp @@ -0,0 +1,107 @@ +#include + +#include +#include + +#include +#include + +#include + +namespace odr::internal { + +WvWareLegacyMicrosoftFile::WvWareLegacyMicrosoftFile( + std::shared_ptr file) + : m_file{std::move(file)} { + wvInit(); + char *path = const_cast(m_file->disk_path()->string().c_str()); + int ret = wvInitParser(&m_ps, path); + + // check if password is required + if ((ret & 0x8000) != 0) { + m_encryption_state = EncryptionState::encrypted; + m_encryption_flag = ret & 0x7fff; + + if ((m_encryption_flag == WORD8) || (m_encryption_flag == WORD7) || + (m_encryption_flag == WORD6)) { + ret = 0; + } + } else { + m_encryption_state = EncryptionState::decrypted; + } + + if (ret != 0) { + wvOLEFree(&m_ps); + throw std::runtime_error("wvInitParser failed"); + } +} + +WvWareLegacyMicrosoftFile::~WvWareLegacyMicrosoftFile() { wvOLEFree(&m_ps); } + +std::shared_ptr +WvWareLegacyMicrosoftFile::file() const noexcept { + return m_file; +} + +FileType WvWareLegacyMicrosoftFile::file_type() const noexcept { + return {}; // TODO +} + +FileMeta WvWareLegacyMicrosoftFile::file_meta() const noexcept { + return {}; // TODO +} + +DecoderEngine WvWareLegacyMicrosoftFile::decoder_engine() const noexcept { + return DecoderEngine::wv_ware; +} + +DocumentType WvWareLegacyMicrosoftFile::document_type() const { + return {}; // TODO +} + +DocumentMeta WvWareLegacyMicrosoftFile::document_meta() const { + return {}; // TODO +} + +bool WvWareLegacyMicrosoftFile::password_encrypted() const noexcept { + return m_encryption_state == EncryptionState::encrypted || + m_encryption_state == EncryptionState::decrypted; +} + +EncryptionState WvWareLegacyMicrosoftFile::encryption_state() const noexcept { + return m_encryption_state; +} + +bool WvWareLegacyMicrosoftFile::decrypt(const std::string &password) { + if (m_encryption_state != EncryptionState::encrypted) { + return false; + } + + wvSetPassword(password.c_str(), &m_ps); + + bool success = false; + + if (m_encryption_flag == WORD8) { + success = wvDecrypt97(&m_ps); + } else if (m_encryption_flag == WORD7 || m_encryption_flag == WORD6) { + success = wvDecrypt95(&m_ps); + } + + if (!success) { + return false; + } + + m_encryption_state = EncryptionState::decrypted; + return true; +} + +std::shared_ptr +WvWareLegacyMicrosoftFile::document() const { + return {}; // TODO throw +} + +wvParseStruct &WvWareLegacyMicrosoftFile::parse_struct() const { + return const_cast(m_ps); +} + +} // namespace odr::internal diff --git a/src/odr/internal/oldms_wvware/wvware_oldms_file.hpp b/src/odr/internal/oldms_wvware/wvware_oldms_file.hpp new file mode 100644 index 00000000..3d80717f --- /dev/null +++ b/src/odr/internal/oldms_wvware/wvware_oldms_file.hpp @@ -0,0 +1,53 @@ +#ifndef ODR_INTERNAL_WVWARE_OLDMS_FILE_HPP +#define ODR_INTERNAL_WVWARE_OLDMS_FILE_HPP + +#include + +#include +#include + +#include +#include + +#include + +namespace odr::internal::common { +class DiskFile; +} // namespace odr::internal::common + +namespace odr::internal { + +class WvWareLegacyMicrosoftFile final : public abstract::DocumentFile { +public: + explicit WvWareLegacyMicrosoftFile(std::shared_ptr file); + ~WvWareLegacyMicrosoftFile() final; + + [[nodiscard]] std::shared_ptr file() const noexcept final; + + [[nodiscard]] FileType file_type() const noexcept final; + [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; + + [[nodiscard]] DocumentType document_type() const final; + [[nodiscard]] DocumentMeta document_meta() const final; + + [[nodiscard]] bool password_encrypted() const noexcept final; + [[nodiscard]] EncryptionState encryption_state() const noexcept final; + bool decrypt(const std::string &password) final; + + [[nodiscard]] std::shared_ptr document() const final; + + [[nodiscard]] wvParseStruct &parse_struct() const; + +private: + std::shared_ptr m_file; + + EncryptionState m_encryption_state{EncryptionState::unknown}; + + wvParseStruct m_ps{}; + int m_encryption_flag{}; +}; + +} // namespace odr::internal + +#endif // ODR_INTERNAL_WVWARE_OLDMS_FILE_HPP diff --git a/src/odr/internal/open_strategy.cpp b/src/odr/internal/open_strategy.cpp index 498a4a18..aab82396 100644 --- a/src/odr/internal/open_strategy.cpp +++ b/src/odr/internal/open_strategy.cpp @@ -219,6 +219,13 @@ open_strategy::open_file(std::shared_ptr file, FileType as, as == FileType::office_open_xml_workbook || as == FileType::office_open_xml_encrypted) { if (with == DecoderEngine::odr) { + try { + auto memory_file = std::make_shared(*file); + auto zip_file = std::make_unique(std::move(memory_file)); + auto filesystem = zip_file->archive()->filesystem(); + return std::make_unique(filesystem); + } catch (...) { + } try { auto memory_file = std::make_shared(*file); auto cfb_file = std::make_unique(std::move(memory_file)); @@ -232,6 +239,7 @@ open_strategy::open_file(std::shared_ptr file, FileType as, } if (as == FileType::legacy_word_document || + as == FileType::legacy_powerpoint_presentation || as == FileType::legacy_excel_worksheets) { if (with == DecoderEngine::odr) { try { diff --git a/src/odr/open_document_reader.cpp b/src/odr/open_document_reader.cpp index 04d8d865..52b35c3f 100644 --- a/src/odr/open_document_reader.cpp +++ b/src/odr/open_document_reader.cpp @@ -164,15 +164,51 @@ std::string OpenDocumentReader::type_to_string(const FileType type) noexcept { } } +std::string OpenDocumentReader::engine_to_string(const DecoderEngine engine) { + if (engine == DecoderEngine::odr) { + return "odr"; + } else if (engine == DecoderEngine::poppler) { + return "poppler"; + } else if (engine == DecoderEngine::wv_ware) { + return "wv_ware"; + } + throw UnknownDecoderEngine(); +} + +DecoderEngine OpenDocumentReader::engine_by_name(const std::string &name) { + if (name == "odr") { + return DecoderEngine::odr; + } else if (name == "poppler") { + return DecoderEngine::poppler; + } else if (name == "wv_ware") { + return DecoderEngine::wv_ware; + } + throw UnknownDecoderEngine(); +} + std::vector OpenDocumentReader::types(const std::string &path) { - File file(path); - return internal::open_strategy::types(file.impl()); + return DecodedFile::types(path); +} + +std::vector OpenDocumentReader::engines(const std::string &path, + const FileType as) { + return DecodedFile::engines(path, as); } DecodedFile OpenDocumentReader::open(const std::string &path) { return DecodedFile(path); } +DecodedFile OpenDocumentReader::open(const std::string &path, + const FileType as) { + return DecodedFile(path, as); +} + +DecodedFile OpenDocumentReader::open(const std::string &path, + const DecodePreference &preference) { + return DecodedFile(path, preference); +} + Html OpenDocumentReader::html(const std::string &path, const PasswordCallback &password_callback, const std::string &output_path, diff --git a/src/odr/open_document_reader.hpp b/src/odr/open_document_reader.hpp index e73c993e..a6893f65 100644 --- a/src/odr/open_document_reader.hpp +++ b/src/odr/open_document_reader.hpp @@ -8,6 +8,8 @@ namespace odr { enum class FileType; enum class FileCategory; +enum class DecoderEngine; +struct DecodePreference; class File; class DecodedFile; class TextFile; @@ -45,14 +47,41 @@ class OpenDocumentReader final { /// @return The file type as a string. [[nodiscard]] static std::string type_to_string(FileType type) noexcept; + /// @brief Get the decoder engine as a string. + /// @param engine The decoder engine. + /// @return The decoder engine as a string. + [[nodiscard]] static std::string engine_to_string(DecoderEngine engine); + /// @brief Get the decoder engine by the name. + /// @param engine The name of the decoder engine. + /// @return The decoder engine. + [[nodiscard]] static DecoderEngine engine_by_name(const std::string &engine); + /// @brief Get the file types by the file path. /// @param path The file path. /// @return The file types. [[nodiscard]] static std::vector types(const std::string &path); + /// @brief Get the decoder engines for a file path and file type. + /// @param path The file path. + /// @param as The file type. + /// @return The decoder engines. + [[nodiscard]] static std::vector + engines(const std::string &path, FileType as); + /// @brief Open a file. /// @param path The file path. /// @return The decoded file. [[nodiscard]] static DecodedFile open(const std::string &path); + /// @brief Open a file. + /// @param path The file path. + /// @param as The file type. + /// @return The decoded file. + [[nodiscard]] static DecodedFile open(const std::string &path, FileType as); + /// @brief Open a file. + /// @param path The file path. + /// @param preference The decode preference. + /// @return The decoded file. + [[nodiscard]] static DecodedFile open(const std::string &path, + const DecodePreference &preference); /// @brief Translates a file to HTML. /// diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e8f623e9..16c988cd 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -63,13 +63,4 @@ target_link_libraries(odr_test odr ) -if(WITH_PDF2HTMLEX) - target_sources(odr_test PRIVATE "src/pdf2htmlEX_wrapper_test.cpp") - target_link_libraries(odr_test PRIVATE pdf2htmlex::pdf2htmlex) -endif(WITH_PDF2HTMLEX) -if(WITH_WVWARE) - target_sources(odr_test PRIVATE "src/wvWare_wrapper_test.cpp") - target_link_libraries(odr_test PRIVATE wvware::wvware) -endif(WITH_WVWARE) - gtest_add_tests(TARGET odr_test) diff --git a/test/data/input/odr-public b/test/data/input/odr-public index c2cc81ba..99f85ddc 160000 --- a/test/data/input/odr-public +++ b/test/data/input/odr-public @@ -1 +1 @@ -Subproject commit c2cc81ba91b6145ff51801644169f4f01878556b +Subproject commit 99f85ddc0ab26c83759ab6de544fac82b85e5cc8 diff --git a/test/data/reference-output/odr-private b/test/data/reference-output/odr-private index b1d06179..118b6afa 160000 --- a/test/data/reference-output/odr-private +++ b/test/data/reference-output/odr-private @@ -1 +1 @@ -Subproject commit b1d061790ee59b5ded4c3b970dd0a5c453d65b96 +Subproject commit 118b6afae107a2326f5eb70e3536e209751eb079 diff --git a/test/data/reference-output/odr-public b/test/data/reference-output/odr-public index c3b3d0b1..76d0a13e 160000 --- a/test/data/reference-output/odr-public +++ b/test/data/reference-output/odr-public @@ -1 +1 @@ -Subproject commit c3b3d0b160c4bb34ee3ca9b7e61cff504335cbc5 +Subproject commit 76d0a13e5d69081fc41cf2cfc296fb1bd85156f8 diff --git a/test/src/document_test.cpp b/test/src/document_test.cpp index 5f56abac..983ffb8a 100644 --- a/test/src/document_test.cpp +++ b/test/src/document_test.cpp @@ -107,7 +107,7 @@ TEST(Document, edit_ods_diff) { DocumentFile document_file( TestData::test_file_path("odr-public/ods/pages.ods")); document_file.decrypt( - TestData::test_file("odr-public/ods/pages.ods").password); + TestData::test_file("odr-public/ods/pages.ods").password.value()); Document document = document_file.document(); html::edit(document, diff); diff --git a/test/src/html_output_test.cpp b/test/src/html_output_test.cpp index bea05802..80c033e3 100644 --- a/test/src/html_output_test.cpp +++ b/test/src/html_output_test.cpp @@ -21,26 +21,33 @@ using namespace odr::internal; using namespace odr::test; namespace fs = std::filesystem; -using HtmlOutputTests = ::testing::TestWithParam; +struct TestParams { + TestFile test_file; + std::string path; + DecoderEngine engine{DecoderEngine::odr}; + std::string test_repo; + std::string output_path; + std::string output_path_prefix; +}; -TEST_P(HtmlOutputTests, html_meta) { - const std::string test_file_path = GetParam(); - const TestFile test_file = TestData::test_file(test_file_path); +using HtmlOutputTests = ::testing::TestWithParam; - const std::string test_repo = *common::Path(test_file_path).begin(); - const std::string output_path_prefix = - common::Path("output").join(test_repo).join("output").string(); - const std::string output_path = - common::Path(output_path_prefix) - .join(common::Path(test_file_path).rebase(test_repo)) - .string(); +TEST_P(HtmlOutputTests, html_meta) { + const TestParams ¶ms = GetParam(); + const TestFile &test_file = params.test_file; + const std::string &test_file_path = params.path; + const DecoderEngine engine = params.engine; + const std::string &test_repo = params.test_repo; + const std::string &output_path = params.output_path; + const std::string &output_path_prefix = + common::Path(output_path).parent().string(); - std::cout << test_file.path << " to " << output_path << std::endl; + std::cout << test_file.short_path << " to " << output_path << std::endl; // TODO compare guessed file type VS actual file type // these files cannot be opened - if (util::string::ends_with(test_file.path, ".sxw") || + if (util::string::ends_with(test_file.short_path, ".sxw") || (test_file.type == FileType::legacy_word_document) || (test_file.type == FileType::legacy_powerpoint_presentation) || (test_file.type == FileType::legacy_excel_worksheets) || @@ -50,12 +57,17 @@ TEST_P(HtmlOutputTests, html_meta) { } // TODO fix - if ((test_file.type == FileType::portable_document_format) && + if ((engine == DecoderEngine::odr) && + (test_file.type == FileType::portable_document_format) && (test_repo != "odr-public")) { GTEST_SKIP(); } - const DecodedFile file{test_file.path}; + DecodePreference decode_preference; + decode_preference.as_file_type = test_file.type; + decode_preference.with_engine = engine; + DecodedFile file = + OpenDocumentReader::open(test_file.absolute_path, decode_preference); FileMeta file_meta = file.file_meta(); @@ -74,13 +86,23 @@ TEST_P(HtmlOutputTests, html_meta) { if (file.is_document_file()) { DocumentFile document_file = file.document_file(); - EXPECT_EQ(test_file.password_encrypted, document_file.password_encrypted()); - if (document_file.password_encrypted()) { - EXPECT_TRUE(document_file.decrypt(test_file.password)); + EXPECT_EQ(test_file.password.has_value(), + document_file.password_encrypted()); + if (test_file.password.has_value()) { + EXPECT_TRUE(document_file.decrypt(test_file.password.value())); } EXPECT_EQ(test_file.type, document_file.file_type()); } + if (file.is_pdf_file()) { + PdfFile pdf_file = file.pdf_file(); + + EXPECT_EQ(test_file.password.has_value(), pdf_file.password_encrypted()); + if (test_file.password.has_value()) { + EXPECT_TRUE(pdf_file.decrypt(test_file.password.value())); + } + } + fs::create_directories(output_path); file_meta = file.file_meta(); @@ -115,15 +137,68 @@ TEST_P(HtmlOutputTests, html_meta) { } } +namespace { + +std::string engine_suffix(const DecoderEngine engine) { + return engine == DecoderEngine::odr + ? "" + : "-" + OpenDocumentReader::engine_to_string(engine); +} + +std::string test_params_to_name(const TestParams ¶ms) { + std::string path = params.path + engine_suffix(params.engine); + internal::util::string::replace_all(path, "/", "_"); + internal::util::string::replace_all(path, "-", "_"); + internal::util::string::replace_all(path, "+", "_"); + internal::util::string::replace_all(path, ".", "_"); + internal::util::string::replace_all(path, " ", "_"); + internal::util::string::replace_all(path, "$", ""); + return path; +} + +TestParams create_test_params(const TestFile &test_file, + const DecoderEngine engine) { + const std::string test_file_path = test_file.short_path; + + const std::string test_repo = *common::Path(test_file_path).begin(); + const std::string output_path_prefix = + common::Path("output").join(test_repo).join("output").string(); + const std::string output_path_suffix = engine_suffix(engine); + const std::string output_path = + common::Path(output_path_prefix) + .join(common::Path(test_file_path).rebase(test_repo)) + .string() + + output_path_suffix; + + return { + .test_file = test_file, + .path = test_file_path, + .engine = engine, + .test_repo = test_repo, + .output_path = output_path, + .output_path_prefix = output_path_prefix, + }; +} + +std::vector list_test_params() { + std::vector params; + for (const TestFile &test_file : TestData::test_files()) { + std::vector engines = {DecoderEngine::odr}; + if (test_file.type == FileType::portable_document_format) { + engines.push_back(DecoderEngine::poppler); + } + + for (const DecoderEngine engine : engines) { + params.push_back(create_test_params(test_file, engine)); + } + } + return params; +} + +} // namespace + INSTANTIATE_TEST_SUITE_P(all_test_files, HtmlOutputTests, - testing::ValuesIn(TestData::test_file_paths()), - [](const ::testing::TestParamInfo &info) { - std::string path = info.param; - internal::util::string::replace_all(path, "/", "_"); - internal::util::string::replace_all(path, "-", "_"); - internal::util::string::replace_all(path, "+", "_"); - internal::util::string::replace_all(path, ".", "_"); - internal::util::string::replace_all(path, " ", "_"); - internal::util::string::replace_all(path, "$", ""); - return path; + testing::ValuesIn(list_test_params()), + [](const ::testing::TestParamInfo &info) { + return test_params_to_name(info.param); }); diff --git a/test/src/pdf2htmlEX_wrapper_test.cpp b/test/src/pdf2htmlEX_wrapper_test.cpp deleted file mode 100644 index 2d985e97..00000000 --- a/test/src/pdf2htmlEX_wrapper_test.cpp +++ /dev/null @@ -1,68 +0,0 @@ -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include - -using namespace odr; -using namespace odr::test; -using namespace odr::internal; -using namespace odr::test; -namespace fs = std::filesystem; - -using pdf2htmlEXWrapperTests = ::testing::TestWithParam; - -TEST_P(pdf2htmlEXWrapperTests, html) { - const std::string test_file_path = GetParam(); - const TestFile test_file = TestData::test_file(test_file_path); - - const std::string test_repo = *common::Path(test_file_path).begin(); - const std::string output_path_prefix = - common::Path("output").join(test_repo).join("output-pdf2htmlEX").string(); - const std::string output_path = - common::Path(output_path_prefix) - .join(common::Path(test_file_path).rebase(test_repo)) - .string(); - - std::cout << test_file.path << " to " << output_path << std::endl; - - fs::create_directories(output_path); - HtmlConfig config; - - PopplerPdfFile pdf_file(std::make_shared(test_file.path)); - - EXPECT_EQ(test_file.password_encrypted, pdf_file.password_encrypted()); - if (test_file.password_encrypted) { - EXPECT_TRUE(pdf_file.decrypt(test_file.password)); - EXPECT_EQ(EncryptionState::decrypted, pdf_file.encryption_state()); - } - - Html html = odr::internal::html::translate_poppler_pdf_file( - pdf_file, output_path, config); - for (const HtmlPage &html_page : html.pages()) { - EXPECT_TRUE(fs::is_regular_file(html_page.path)); - EXPECT_LT(0, fs::file_size(html_page.path)); - } -} - -INSTANTIATE_TEST_SUITE_P(pdf2htmlEX_test_files, pdf2htmlEXWrapperTests, - testing::ValuesIn(TestData::test_file_paths( - FileType::portable_document_format)), - [](const ::testing::TestParamInfo &info) { - std::string path = info.param; - internal::util::string::replace_all(path, "/", "_"); - internal::util::string::replace_all(path, "-", "_"); - internal::util::string::replace_all(path, "+", "_"); - internal::util::string::replace_all(path, ".", "_"); - internal::util::string::replace_all(path, " ", "_"); - internal::util::string::replace_all(path, "$", ""); - return path; - }); diff --git a/test/src/test_util.cpp b/test/src/test_util.cpp index 764ad4ea..0863a093 100644 --- a/test/src/test_util.cpp +++ b/test/src/test_util.cpp @@ -18,23 +18,29 @@ namespace fs = std::filesystem; namespace odr::test { namespace { -TestFile get_test_file(std::string input) { - const FileType type = - OpenDocumentReader::type_by_extension(common::Path(input).extension()); - const std::string file_name = fs::path(input).filename().string(); - std::string password; - if (const auto left = file_name.find('$'), right = file_name.rfind('$'); + +TestFile get_test_file(const std::string &root_path, + std::string absolute_path) { + const FileType type = OpenDocumentReader::type_by_extension( + common::Path(absolute_path).extension()); + + std::string short_path = absolute_path.substr(root_path.size() + 1); + + std::optional password; + const std::string filename = fs::path(absolute_path).filename().string(); + if (const auto left = filename.find('$'), right = filename.rfind('$'); (left != std::string::npos) && (left != right)) { - password = file_name.substr(left, right); + password = filename.substr(left, right); } - const bool encrypted = !password.empty(); - return {std::move(input), type, encrypted, std::move(password)}; + return {std::move(absolute_path), std::move(short_path), type, + std::move(password)}; } -std::vector get_test_files(const std::string &input_path) { +std::vector get_test_files(const std::string &root_path, + const std::string &input_path) { if (fs::is_regular_file(input_path)) { - return {get_test_file(input_path)}; + return {get_test_file(root_path, input_path)}; } if (!fs::is_directory(input_path)) { return {}; @@ -45,21 +51,22 @@ std::vector get_test_files(const std::string &input_path) { const std::string index_path = input_path + "/index.csv"; if (fs::is_regular_file(index_path)) { for (const auto &row : csv::CSVReader(index_path)) { - const std::string path = input_path + "/" + row["path"].get<>(); - const FileType type = + std::string absolute_path = input_path + "/" + row["path"].get<>(); + std::string short_path = absolute_path.substr(root_path.size() + 1); + FileType type = OpenDocumentReader::type_by_extension(row["type"].get<>()); - std::string password = row["password"].get<>(); - const bool encrypted = !password.empty(); - const std::string file_name = fs::path(path).filename().string(); + std::optional password = row["encrypted"].get<>() == "yes" + ? row["password"].get<>() + : std::optional(); if (type == FileType::unknown) { continue; } - result.emplace_back(path, type, encrypted, std::move(password)); + result.emplace_back(std::move(absolute_path), std::move(short_path), type, + std::move(password)); } } - // TODO this will also recurse `.git` for (auto &&p : fs::recursive_directory_iterator(input_path)) { if (!p.is_regular_file()) { continue; @@ -68,15 +75,17 @@ std::vector get_test_files(const std::string &input_path) { if (path == index_path) { continue; } - - if (const auto it = - std::find_if(std::begin(result), std::end(result), - [&](auto &&file) { return file.path == path; }); + if (p.path().filename().string().starts_with(".")) { + continue; + } + if (const auto it = std::find_if( + std::begin(result), std::end(result), + [&](auto &&file) { return file.absolute_path == path; }); it != std::end(result)) { continue; } - const auto file = get_test_file(path); + const auto file = get_test_file(root_path, path); if (file.type == FileType::unknown) { continue; @@ -87,26 +96,30 @@ std::vector get_test_files(const std::string &input_path) { return result; } -std::unordered_map get_test_files() { - std::unordered_map result; +std::vector get_test_files() { + std::vector result; - for (const auto &e : - fs::directory_iterator(test::TestData::data_input_directory())) { - const auto files = get_test_files(e.path().string()); - for (auto &&file : files) { - std::string testPath = - file.path.substr(TestData::data_input_directory().length() + 1); - result[testPath] = file; - } + std::string root = TestData::data_input_directory(); + + for (const auto &e : fs::directory_iterator(root)) { + const auto files = get_test_files(root, e.path().string()); + result.insert(std::end(result), std::begin(files), std::end(files)); } + std::sort(std::begin(result), std::end(result), + [](const auto &lhs, const auto &rhs) { + return lhs.short_path < rhs.short_path; + }); + return result; } + } // namespace -TestFile::TestFile(std::string path, const FileType type, - const bool password_encrypted, std::string password) - : path{std::move(path)}, type{type}, password_encrypted{password_encrypted}, +TestFile::TestFile(std::string absolute_path, std::string short_path, + const FileType type, std::optional password) + : absolute_path{std::move(absolute_path)}, + short_path{std::move(short_path)}, type{type}, password{std::move(password)} {} std::string TestData::data_input_directory() { @@ -118,46 +131,41 @@ TestData &TestData::instance_() { return instance; } -std::vector TestData::test_file_paths() { - return instance_().test_file_paths_(); +std::vector TestData::test_files() { + return instance_().m_test_files; } -std::vector TestData::test_file_paths(FileType fileType) { - return instance_().test_file_paths_(fileType); +std::vector TestData::test_files(FileType fileType) { + return instance_().test_files_(fileType); } -TestFile TestData::test_file(const std::string &path) { - return instance_().test_file_(path); +TestFile TestData::test_file(const std::string &short_path) { + const auto &files = instance_().m_test_files; + const auto it = + std::find_if(std::begin(files), std::end(files), [&](const auto &file) { + return file.short_path == short_path; + }); + if (it == std::end(files)) { + throw std::runtime_error("Test file not found: " + short_path); + } + return *it; } -std::string TestData::test_file_path(const std::string &path) { - return test_file(path).path; +std::string TestData::test_file_path(const std::string &short_path) { + return test_file(short_path).absolute_path; } TestData::TestData() : m_test_files{get_test_files()} {} -std::vector TestData::test_file_paths_() const { - std::vector result; - for (auto &&file : m_test_files) { - result.push_back(file.first); - } - std::sort(std::begin(result), std::end(result)); - return result; -} - -std::vector TestData::test_file_paths_(FileType fileType) const { - std::vector result; +std::vector TestData::test_files_(const FileType fileType) const { + std::vector result; + result.reserve(m_test_files.size()); for (auto &&file : m_test_files) { - if (file.second.type == fileType) { - result.push_back(file.first); + if (file.type == fileType) { + result.push_back(file); } } - std::sort(std::begin(result), std::end(result)); return result; } -TestFile TestData::test_file_(const std::string &path) const { - return m_test_files.at(path); -} - } // namespace odr::test diff --git a/test/src/test_util.hpp b/test/src/test_util.hpp index 324f76c4..c5a05e87 100644 --- a/test/src/test_util.hpp +++ b/test/src/test_util.hpp @@ -4,20 +4,19 @@ #include #include -#include #include namespace odr::test { struct TestFile { - std::string path; + std::string absolute_path; + std::string short_path; FileType type{FileType::unknown}; - bool password_encrypted{false}; - std::string password; + std::optional password; TestFile() = default; - TestFile(std::string path, FileType type, bool password_encrypted, - std::string password); + TestFile(std::string absolute_path, std::string short_path, FileType type, + std::optional password); }; class TestData { @@ -25,10 +24,11 @@ class TestData { static std::string data_directory(); static std::string data_input_directory(); - static std::vector test_file_paths(); - static std::vector test_file_paths(FileType); - static TestFile test_file(const std::string &path); - static std::string test_file_path(const std::string &path); + static std::vector test_files(); + static std::vector test_files(FileType); + + static TestFile test_file(const std::string &short_path); + static std::string test_file_path(const std::string &short_path); TestData(const TestData &) = delete; TestData &operator=(const TestData &) = delete; @@ -39,11 +39,9 @@ class TestData { TestData(); static TestData &instance_(); - std::vector test_file_paths_() const; - std::vector test_file_paths_(FileType) const; - TestFile test_file_(const std::string &path) const; + [[nodiscard]] std::vector test_files_(FileType) const; - std::unordered_map m_test_files; + std::vector m_test_files; }; } // namespace odr::test diff --git a/test/src/wvWare_wrapper_test.cpp b/test/src/wvWare_wrapper_test.cpp deleted file mode 100644 index d3d45252..00000000 --- a/test/src/wvWare_wrapper_test.cpp +++ /dev/null @@ -1,65 +0,0 @@ -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include - -using namespace odr; -using namespace odr::test; -using namespace odr::internal; -using namespace odr::test; -namespace fs = std::filesystem; - -using wvWareWrapperTests = ::testing::TestWithParam; - -TEST_P(wvWareWrapperTests, html) { - const std::string test_file_path = GetParam(); - const TestFile test_file = TestData::test_file(test_file_path); - - const std::string test_repo = *common::Path(test_file_path).begin(); - const std::string output_path_prefix = - common::Path("output").join(test_repo).join("output-wvWare").string(); - const std::string output_path = - common::Path(output_path_prefix) - .join(common::Path(test_file_path).rebase(test_repo)) - .string(); - - std::cout << test_file.path << " to " << output_path << std::endl; - - // Password protected files are problematic on wvWare - if (test_file.password_encrypted) { - GTEST_SKIP(); - } - - fs::create_directories(output_path); - HtmlConfig config; - std::optional password; - Html html = odr::internal::html::wvWare_wrapper(test_file.path, output_path, - config, password); - - for (const HtmlPage &html_page : html.pages()) { - EXPECT_TRUE(fs::is_regular_file(html_page.path)); - EXPECT_LT(0, fs::file_size(html_page.path)); - } -} - -INSTANTIATE_TEST_SUITE_P(wvWare_test_files, wvWareWrapperTests, - testing::ValuesIn(TestData::test_file_paths( - FileType::legacy_word_document)), - [](const ::testing::TestParamInfo &info) { - std::string path = info.param; - internal::util::string::replace_all(path, "/", "_"); - internal::util::string::replace_all(path, "-", "_"); - internal::util::string::replace_all(path, "+", "_"); - internal::util::string::replace_all(path, ".", "_"); - internal::util::string::replace_all(path, " ", "_"); - internal::util::string::replace_all(path, "$", ""); - return path; - }); From f683779ea174371c36cf771f928115471670c271 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Sat, 28 Sep 2024 15:23:09 +0200 Subject: [PATCH 09/28] wvware progress --- src/odr/file.cpp | 4 ++ src/odr/file.hpp | 4 +- src/odr/html.cpp | 46 +++++++++++++----- src/odr/html.hpp | 27 +++++++++-- .../oldms_wvware/wvware_oldms_file.cpp | 47 ++++++++++++++----- .../oldms_wvware/wvware_oldms_file.hpp | 7 ++- src/odr/internal/open_strategy.cpp | 34 ++++++++------ src/odr/open_document_reader.cpp | 8 ++-- test/src/html_output_test.cpp | 6 ++- 9 files changed, 131 insertions(+), 52 deletions(-) diff --git a/src/odr/file.cpp b/src/odr/file.cpp index 7a189f18..75b9f944 100644 --- a/src/odr/file.cpp +++ b/src/odr/file.cpp @@ -244,6 +244,10 @@ DocumentMeta DocumentFile::document_meta() const { Document DocumentFile::document() const { return Document(m_impl->document()); } +std::shared_ptr DocumentFile::impl() const { + return m_impl; +} + PdfFile::PdfFile(std::shared_ptr impl) : DecodedFile(impl), m_impl{std::move(impl)} {} diff --git a/src/odr/file.hpp b/src/odr/file.hpp index 72013e19..1506fb04 100644 --- a/src/odr/file.hpp +++ b/src/odr/file.hpp @@ -98,7 +98,7 @@ enum class FileLocation { enum class DecoderEngine { odr, poppler, - wv_ware, + wvware, }; /// @brief Preference for decoding files. @@ -267,6 +267,8 @@ class DocumentFile final : public DecodedFile { [[nodiscard]] Document document() const; + [[nodiscard]] std::shared_ptr impl() const; + private: std::shared_ptr m_impl; }; diff --git a/src/odr/html.cpp b/src/odr/html.cpp index b3dd6a78..5caeebf8 100644 --- a/src/odr/html.cpp +++ b/src/odr/html.cpp @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include @@ -75,11 +77,9 @@ Html html::translate(const DecodedFile &decoded_file, } else if (decoded_file.is_image_file()) { return translate(decoded_file.image_file(), output_path, config); } else if (decoded_file.is_archive_file()) { - return translate(decoded_file.archive_file().archive(), output_path, - config); + return translate(decoded_file.archive_file(), output_path, config); } else if (decoded_file.is_document_file()) { - return translate(decoded_file.document_file().document(), output_path, - config); + return translate(decoded_file.document_file(), output_path, config); } else if (decoded_file.is_pdf_file()) { return translate(decoded_file.pdf_file(), output_path, config); } @@ -99,17 +99,24 @@ Html html::translate(const ImageFile &image_file, return internal::html::translate_image_file(image_file, output_path, config); } -Html html::translate(const Archive &archive, const std::string &output_path, - const HtmlConfig &config) { - fs::create_directories(output_path); - return internal::html::translate_filesystem( - FileType::unknown, archive.filesystem(), output_path, config); +Html html::translate(const ArchiveFile &archive_file, + const std::string &output_path, const HtmlConfig &config) { + return translate(archive_file.archive(), output_path, config); } -Html html::translate(const Document &document, const std::string &output_path, - const HtmlConfig &config) { - fs::create_directories(output_path); - return internal::html::translate_document(document, output_path, config); +Html html::translate(const DocumentFile &document_file, + const std::string &output_path, const HtmlConfig &config) { + auto document_file_impl = document_file.impl(); + + if (auto wv_document_file = + std::dynamic_pointer_cast( + document_file_impl)) { + fs::create_directories(output_path); + return internal::html::translate_wvware_oldms_file(*wv_document_file, + output_path, config); + } + + return translate(document_file.document(), output_path, config); } Html html::translate(const PdfFile &pdf_file, const std::string &output_path, @@ -126,6 +133,19 @@ Html html::translate(const PdfFile &pdf_file, const std::string &output_path, return internal::html::translate_pdf_file(pdf_file, output_path, config); } +Html html::translate(const Archive &archive, const std::string &output_path, + const HtmlConfig &config) { + fs::create_directories(output_path); + return internal::html::translate_filesystem( + FileType::unknown, archive.filesystem(), output_path, config); +} + +Html html::translate(const Document &document, const std::string &output_path, + const HtmlConfig &config) { + fs::create_directories(output_path); + return internal::html::translate_document(document, output_path, config); +} + void html::edit(const Document &document, const char *diff) { auto json = nlohmann::json::parse(diff); for (const auto &[key, value] : json["modifiedText"].items()) { diff --git a/src/odr/html.hpp b/src/odr/html.hpp index ecf9199c..7996cb18 100644 --- a/src/odr/html.hpp +++ b/src/odr/html.hpp @@ -144,20 +144,20 @@ Html translate(const ImageFile &image_file, const std::string &output_path, const HtmlConfig &config); /// @brief Translates an archive to HTML. /// -/// @param archive Archive to translate. +/// @param archive Archive file to translate. /// @param output_path Path to save the HTML output. /// @param config Configuration for the HTML output. /// @return HTML output. -Html translate(const Archive &archive, const std::string &output_path, +Html translate(const ArchiveFile &archive_file, const std::string &output_path, const HtmlConfig &config); /// @brief Translates a document to HTML. /// -/// @param document Document to translate. +/// @param document_file Document file to translate. /// @param output_path Path to save the HTML output. /// @param config Configuration for the HTML output. /// @return HTML output. -Html translate(const Document &document, const std::string &output_path, - const HtmlConfig &config); +Html translate(const DocumentFile &document_file, + const std::string &output_path, const HtmlConfig &config); /// @brief Translates a PDF file to HTML. /// /// @param pdf_file PDF file to translate. @@ -167,6 +167,23 @@ Html translate(const Document &document, const std::string &output_path, Html translate(const PdfFile &pdf_file, const std::string &output_path, const HtmlConfig &config); +/// @brief Translates an archive to HTML. +/// +/// @param archive Archive to translate. +/// @param output_path Path to save the HTML output. +/// @param config Configuration for the HTML output. +/// @return HTML output. +Html translate(const Archive &archive, const std::string &output_path, + const HtmlConfig &config); +/// @brief Translates a document to HTML. +/// +/// @param document Document to translate. +/// @param output_path Path to save the HTML output. +/// @param config Configuration for the HTML output. +/// @return HTML output. +Html translate(const Document &document, const std::string &output_path, + const HtmlConfig &config); + /// @brief Edits a document with a diff. /// /// @note The diff is generated by our JavaScript code in the browser. diff --git a/src/odr/internal/oldms_wvware/wvware_oldms_file.cpp b/src/odr/internal/oldms_wvware/wvware_oldms_file.cpp index d5177692..6dcd6d1d 100644 --- a/src/odr/internal/oldms_wvware/wvware_oldms_file.cpp +++ b/src/odr/internal/oldms_wvware/wvware_oldms_file.cpp @@ -6,6 +6,8 @@ #include #include +#include +#include #include namespace odr::internal { @@ -13,9 +15,34 @@ namespace odr::internal { WvWareLegacyMicrosoftFile::WvWareLegacyMicrosoftFile( std::shared_ptr file) : m_file{std::move(file)} { + GError *error = nullptr; + + m_gsf_input = + gsf_input_stdio_new(m_file->disk_path()->string().c_str(), &error); + + if (m_gsf_input == nullptr) { + throw std::runtime_error("gsf_input_stdio_new failed"); + } + + open(); +} + +WvWareLegacyMicrosoftFile::WvWareLegacyMicrosoftFile( + std::shared_ptr file) + : m_file{std::move(file)} { + m_gsf_input = gsf_input_memory_new( + reinterpret_cast(m_file->memory_data()), + static_cast(m_file->size()), false); + + open(); +} + +WvWareLegacyMicrosoftFile::~WvWareLegacyMicrosoftFile() { wvOLEFree(&m_ps); } + +void WvWareLegacyMicrosoftFile::open() { wvInit(); - char *path = const_cast(m_file->disk_path()->string().c_str()); - int ret = wvInitParser(&m_ps, path); + + int ret = wvInitParser_gsf(&m_ps, m_gsf_input); // check if password is required if ((ret & 0x8000) != 0) { @@ -27,7 +54,7 @@ WvWareLegacyMicrosoftFile::WvWareLegacyMicrosoftFile( ret = 0; } } else { - m_encryption_state = EncryptionState::decrypted; + m_encryption_state = EncryptionState::not_encrypted; } if (ret != 0) { @@ -36,32 +63,28 @@ WvWareLegacyMicrosoftFile::WvWareLegacyMicrosoftFile( } } -WvWareLegacyMicrosoftFile::~WvWareLegacyMicrosoftFile() { wvOLEFree(&m_ps); } - std::shared_ptr WvWareLegacyMicrosoftFile::file() const noexcept { return m_file; } FileType WvWareLegacyMicrosoftFile::file_type() const noexcept { - return {}; // TODO + return FileType::legacy_word_document; } FileMeta WvWareLegacyMicrosoftFile::file_meta() const noexcept { - return {}; // TODO + return {file_type(), password_encrypted(), document_meta()}; } DecoderEngine WvWareLegacyMicrosoftFile::decoder_engine() const noexcept { - return DecoderEngine::wv_ware; + return DecoderEngine::wvware; } DocumentType WvWareLegacyMicrosoftFile::document_type() const { - return {}; // TODO + return DocumentType::text; } -DocumentMeta WvWareLegacyMicrosoftFile::document_meta() const { - return {}; // TODO -} +DocumentMeta WvWareLegacyMicrosoftFile::document_meta() const { return {}; } bool WvWareLegacyMicrosoftFile::password_encrypted() const noexcept { return m_encryption_state == EncryptionState::encrypted || diff --git a/src/odr/internal/oldms_wvware/wvware_oldms_file.hpp b/src/odr/internal/oldms_wvware/wvware_oldms_file.hpp index 3d80717f..6b553bf5 100644 --- a/src/odr/internal/oldms_wvware/wvware_oldms_file.hpp +++ b/src/odr/internal/oldms_wvware/wvware_oldms_file.hpp @@ -13,6 +13,7 @@ namespace odr::internal::common { class DiskFile; +class MemoryFile; } // namespace odr::internal::common namespace odr::internal { @@ -20,6 +21,7 @@ namespace odr::internal { class WvWareLegacyMicrosoftFile final : public abstract::DocumentFile { public: explicit WvWareLegacyMicrosoftFile(std::shared_ptr file); + explicit WvWareLegacyMicrosoftFile(std::shared_ptr file); ~WvWareLegacyMicrosoftFile() final; [[nodiscard]] std::shared_ptr file() const noexcept final; @@ -40,12 +42,15 @@ class WvWareLegacyMicrosoftFile final : public abstract::DocumentFile { [[nodiscard]] wvParseStruct &parse_struct() const; private: - std::shared_ptr m_file; + std::shared_ptr m_file; + GsfInput *m_gsf_input{}; EncryptionState m_encryption_state{EncryptionState::unknown}; wvParseStruct m_ps{}; int m_encryption_flag{}; + + void open(); }; } // namespace odr::internal diff --git a/src/odr/internal/open_strategy.cpp b/src/odr/internal/open_strategy.cpp index aab82396..aa50941f 100644 --- a/src/odr/internal/open_strategy.cpp +++ b/src/odr/internal/open_strategy.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -101,10 +102,8 @@ open_strategy::engines(const std::shared_ptr &file, result.push_back(DecoderEngine::odr); - if (as == FileType::legacy_word_document || - as == FileType::legacy_powerpoint_presentation || - as == FileType::legacy_excel_worksheets) { - result.push_back(DecoderEngine::wv_ware); + if (as == FileType::legacy_word_document) { + result.push_back(DecoderEngine::wvware); } if (as == FileType::portable_document_format) { @@ -251,6 +250,15 @@ open_strategy::open_file(std::shared_ptr file, FileType as, } return nullptr; } + if (with == DecoderEngine::wvware) { + try { + auto memory_file = std::make_shared(*file); + return std::make_unique( + std::move(memory_file)); + } catch (...) { + } + return nullptr; + } return nullptr; } @@ -363,11 +371,10 @@ open_strategy::open_file(std::shared_ptr file, FileType as, std::unique_ptr open_strategy::open_file(std::shared_ptr file, const DecodePreference &preference) { - std::vector probe_types = - preference.as_file_type.has_value() - ? std::vector{*preference.as_file_type} - : preference.file_type_priority; - { + std::vector probe_types; + if (preference.as_file_type.has_value()) { + probe_types.push_back(*preference.as_file_type); + } else { std::vector detected_types = types(file); probe_types.insert(probe_types.end(), detected_types.begin(), detected_types.end()); @@ -376,11 +383,10 @@ open_strategy::open_file(std::shared_ptr file, } for (FileType as : probe_types) { - std::vector probe_engines = - preference.with_engine.has_value() - ? std::vector{*preference.with_engine} - : preference.engine_priority; - { + std::vector probe_engines; + if (preference.with_engine.has_value()) { + probe_engines.push_back(*preference.with_engine); + } else { std::vector detected_engines = engines(file, as); probe_engines.insert(probe_engines.end(), detected_engines.begin(), detected_engines.end()); diff --git a/src/odr/open_document_reader.cpp b/src/odr/open_document_reader.cpp index 52b35c3f..d2da7fa6 100644 --- a/src/odr/open_document_reader.cpp +++ b/src/odr/open_document_reader.cpp @@ -169,8 +169,8 @@ std::string OpenDocumentReader::engine_to_string(const DecoderEngine engine) { return "odr"; } else if (engine == DecoderEngine::poppler) { return "poppler"; - } else if (engine == DecoderEngine::wv_ware) { - return "wv_ware"; + } else if (engine == DecoderEngine::wvware) { + return "wvware"; } throw UnknownDecoderEngine(); } @@ -180,8 +180,8 @@ DecoderEngine OpenDocumentReader::engine_by_name(const std::string &name) { return DecoderEngine::odr; } else if (name == "poppler") { return DecoderEngine::poppler; - } else if (name == "wv_ware") { - return DecoderEngine::wv_ware; + } else if (name == "wvware") { + return DecoderEngine::wvware; } throw UnknownDecoderEngine(); } diff --git a/test/src/html_output_test.cpp b/test/src/html_output_test.cpp index 80c033e3..6430f0be 100644 --- a/test/src/html_output_test.cpp +++ b/test/src/html_output_test.cpp @@ -35,7 +35,6 @@ using HtmlOutputTests = ::testing::TestWithParam; TEST_P(HtmlOutputTests, html_meta) { const TestParams ¶ms = GetParam(); const TestFile &test_file = params.test_file; - const std::string &test_file_path = params.path; const DecoderEngine engine = params.engine; const std::string &test_repo = params.test_repo; const std::string &output_path = params.output_path; @@ -48,7 +47,6 @@ TEST_P(HtmlOutputTests, html_meta) { // these files cannot be opened if (util::string::ends_with(test_file.short_path, ".sxw") || - (test_file.type == FileType::legacy_word_document) || (test_file.type == FileType::legacy_powerpoint_presentation) || (test_file.type == FileType::legacy_excel_worksheets) || (test_file.type == FileType::word_perfect) || @@ -187,6 +185,10 @@ std::vector list_test_params() { if (test_file.type == FileType::portable_document_format) { engines.push_back(DecoderEngine::poppler); } + if (test_file.type == FileType::legacy_word_document) { + engines.clear(); + engines.push_back(DecoderEngine::wvware); + } for (const DecoderEngine engine : engines) { params.push_back(create_test_params(test_file, engine)); From 91423fd4d3b0dd4fceb556df3f60c75b7c2b58a4 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Sat, 28 Sep 2024 15:54:18 +0200 Subject: [PATCH 10/28] rename --- CMakeLists.txt | 6 +++--- src/odr/html.cpp | 4 ++-- ...mlEX_wrapper.cpp => pdf2htmlex_wrapper.cpp} | 2 +- ...mlEX_wrapper.hpp => pdf2htmlex_wrapper.hpp} | 0 .../{wvWare_wrapper.cpp => wvware_wrapper.cpp} | 18 +++++++++++++++--- .../{wvWare_wrapper.hpp => wvware_wrapper.hpp} | 0 6 files changed, 21 insertions(+), 9 deletions(-) rename src/odr/internal/html/{pdf2htmlEX_wrapper.cpp => pdf2htmlex_wrapper.cpp} (98%) rename src/odr/internal/html/{pdf2htmlEX_wrapper.hpp => pdf2htmlex_wrapper.hpp} (100%) rename src/odr/internal/html/{wvWare_wrapper.cpp => wvware_wrapper.cpp} (96%) rename src/odr/internal/html/{wvWare_wrapper.hpp => wvware_wrapper.hpp} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 253c9484..e292e4db 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -108,7 +108,7 @@ set(ODR_SOURCE_FILES "src/odr/internal/html/html_writer.cpp" "src/odr/internal/html/image_file.cpp" "src/odr/internal/html/pdf_file.cpp" - "src/odr/internal/html/pdf2htmlEX_wrapper.hpp" + "src/odr/internal/html/pdf2htmlex_wrapper.hpp" "src/odr/internal/html/text_file.cpp" "src/odr/internal/json/json_file.cpp" @@ -200,7 +200,7 @@ if (WITH_PDF2HTMLEX) find_package(poppler REQUIRED) target_sources(odr PRIVATE - "src/odr/internal/html/pdf2htmlEX_wrapper.cpp" + "src/odr/internal/html/pdf2htmlex_wrapper.cpp" "src/odr/internal/pdf_poppler/poppler_pdf_file.cpp" ) target_link_libraries(odr @@ -213,7 +213,7 @@ if (WITH_WVWARE) find_package(wvware REQUIRED) target_sources(odr PRIVATE - "src/odr/internal/html/wvWare_wrapper.cpp" + "src/odr/internal/html/wvware_wrapper.cpp" "src/odr/internal/oldms_wvware/wvware_oldms_file.cpp" ) target_link_libraries(odr diff --git a/src/odr/html.cpp b/src/odr/html.cpp index 5caeebf8..e769f53c 100644 --- a/src/odr/html.cpp +++ b/src/odr/html.cpp @@ -9,10 +9,10 @@ #include #include #include -#include +#include #include #include -#include +#include #include #include diff --git a/src/odr/internal/html/pdf2htmlEX_wrapper.cpp b/src/odr/internal/html/pdf2htmlex_wrapper.cpp similarity index 98% rename from src/odr/internal/html/pdf2htmlEX_wrapper.cpp rename to src/odr/internal/html/pdf2htmlex_wrapper.cpp index 6a97897a..3a65e7ca 100644 --- a/src/odr/internal/html/pdf2htmlEX_wrapper.cpp +++ b/src/odr/internal/html/pdf2htmlex_wrapper.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include diff --git a/src/odr/internal/html/pdf2htmlEX_wrapper.hpp b/src/odr/internal/html/pdf2htmlex_wrapper.hpp similarity index 100% rename from src/odr/internal/html/pdf2htmlEX_wrapper.hpp rename to src/odr/internal/html/pdf2htmlex_wrapper.hpp diff --git a/src/odr/internal/html/wvWare_wrapper.cpp b/src/odr/internal/html/wvware_wrapper.cpp similarity index 96% rename from src/odr/internal/html/wvWare_wrapper.cpp rename to src/odr/internal/html/wvware_wrapper.cpp index 825b3aac..2e469bc6 100644 --- a/src/odr/internal/html/wvWare_wrapper.cpp +++ b/src/odr/internal/html/wvware_wrapper.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include @@ -8,6 +8,7 @@ #include +#include #include namespace odr::internal { @@ -36,6 +37,8 @@ struct TranslationState : public expand_data { int i = 0; char *charset = nullptr; PAP *ppap = nullptr; + + std::unique_ptr output_stream; }; /// Originally from `wvWare.c` `wvStrangeNoGraphicData` @@ -476,8 +479,6 @@ Html html::translate_wvware_oldms_file( const HtmlConfig &config) { auto output_file_path = output_path + "/document.html"; - char *wv_config = nullptr; // TODO - wvParseStruct &ps = oldms_file.parse_struct(); wvSetElementHandler(&ps, element_handler); @@ -487,16 +488,27 @@ Html html::translate_wvware_oldms_file( state_data handle; TranslationState translation_state; + translation_state.output_stream = + std::make_unique(output_file_path, std::ios::out); wvInitStateData(&handle); translation_state.sd = &handle; ps.userData = &translation_state; + *translation_state.output_stream << "\n\n\n" + << "\n" + << "Document\n" + << "\n\n"; + if (wvHtml(&ps) != 0) { throw std::runtime_error("wvHtml failed"); } + *translation_state.output_stream << "\n\n"; + + translation_state.output_stream->flush(); + return { FileType::legacy_word_document, config, {{"document", output_file_path}}}; } diff --git a/src/odr/internal/html/wvWare_wrapper.hpp b/src/odr/internal/html/wvware_wrapper.hpp similarity index 100% rename from src/odr/internal/html/wvWare_wrapper.hpp rename to src/odr/internal/html/wvware_wrapper.hpp From ab954e1ebb17a4de4310057000a277f6b818cba2 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Sun, 29 Sep 2024 11:29:40 +0200 Subject: [PATCH 11/28] pull more code in --- src/odr/internal/html/document.cpp | 15 +- src/odr/internal/html/html_writer.cpp | 74 ++-- src/odr/internal/html/html_writer.hpp | 2 +- src/odr/internal/html/wvware_wrapper.cpp | 516 ++++++++++++++++------- 4 files changed, 405 insertions(+), 202 deletions(-) diff --git a/src/odr/internal/html/document.cpp b/src/odr/internal/html/document.cpp index cfa14e9d..ec20726e 100644 --- a/src/odr/internal/html/document.cpp +++ b/src/odr/internal/html/document.cpp @@ -80,7 +80,7 @@ void front(const Document &document, HtmlWriter &out, const HtmlConfig &config, out.write_body_begin(HtmlElementOptions().set_class(body_clazz)); } -void back(const Document &document, internal::html::HtmlWriter &out, +void back(const Document &document, html::HtmlWriter &out, const HtmlConfig &config, const HtmlResourceLocator &resourceLocator) { (void)document; @@ -104,8 +104,8 @@ void back(const Document &document, internal::html::HtmlWriter &out, std::string fill_path_variables(const std::string &path, std::optional index = {}) { std::string result = path; - internal::util::string::replace_all(result, "{index}", - index ? std::to_string(*index) : ""); + util::string::replace_all(result, "{index}", + index ? std::to_string(*index) : ""); return result; } @@ -219,7 +219,7 @@ class SlideHtmlFragment final : public HtmlFragmentBase { void write_html_fragment(HtmlWriter &out, const HtmlConfig &config, const HtmlResourceLocator &resourceLocator) const final { - internal::html::translate_slide(m_slide, out, config, resourceLocator); + html::translate_slide(m_slide, out, config, resourceLocator); } private: @@ -253,7 +253,7 @@ class PageHtmlFragment final : public HtmlFragmentBase { void write_html_fragment(HtmlWriter &out, const HtmlConfig &config, const HtmlResourceLocator &resourceLocator) const final { - internal::html::translate_page(m_page, out, config, resourceLocator); + html::translate_page(m_page, out, config, resourceLocator); } private: @@ -304,12 +304,11 @@ Html html::translate_document(const odr::Document &document, std::uint32_t i = 0; for (const auto &fragment : service.fragments()) { std::string filled_path = get_output_path(document, i, output_path, config); - std::ofstream ostream(filled_path); + std::ofstream ostream(filled_path, std::ios::out); if (!ostream.is_open()) { throw FileWriteError(); } - internal::html::HtmlWriter out(ostream, config.format_html, - config.html_indent); + html::HtmlWriter out(ostream, config.format_html, config.html_indent); fragment.write_html_document(out.out(), config, resourceLocator); diff --git a/src/odr/internal/html/html_writer.cpp b/src/odr/internal/html/html_writer.cpp index 5fc3ee92..540bb1fd 100644 --- a/src/odr/internal/html/html_writer.cpp +++ b/src/odr/internal/html/html_writer.cpp @@ -121,123 +121,123 @@ HtmlElementOptions::set_extra(std::optional _extra) { HtmlWriter::HtmlWriter(std::ostream &out, bool format, std::uint8_t indent, std::uint32_t current_indent) - : m_out{out}, m_format{format}, m_indent(indent, ' '), + : m_out{&out}, m_format{format}, m_indent(indent, ' '), m_current_indent{current_indent} {} HtmlWriter::HtmlWriter(std::ostream &out, const HtmlConfig &config) : HtmlWriter{out, config.format_html, config.html_indent} {} void HtmlWriter::write_begin() { - m_out << "\n"; - m_out << ""; + out() << "\n"; + out() << ""; } void HtmlWriter::write_end() { write_new_line(); - m_out << ""; + out() << ""; } void HtmlWriter::write_header_begin() { write_new_line(); ++m_current_indent; - m_out << ""; + out() << ""; } void HtmlWriter::write_header_end() { --m_current_indent; write_new_line(); - m_out << ""; + out() << ""; } void HtmlWriter::write_header_title(const std::string &title) { write_new_line(); - m_out << "" << title << ""; + out() << "" << title << ""; } void HtmlWriter::write_header_viewport(const std::string &viewport) { write_new_line(); - m_out << R"("; + out() << R"("; } void HtmlWriter::write_header_target(const std::string &target) { write_new_line(); - m_out << ""; + out() << ""; } void HtmlWriter::write_header_charset(const std::string &charset) { write_new_line(); - m_out << ""; + out() << ""; } void HtmlWriter::write_header_style(const std::string &href) { write_new_line(); - m_out << R"("; + out() << R"("; } void HtmlWriter::write_header_style_begin() { write_new_line(); ++m_current_indent; - m_out << ""; + out() << ""; } void HtmlWriter::write_script(const std::string &src) { write_new_line(); - m_out << R"("; + out() << R"("; } void HtmlWriter::write_script_begin() { write_new_line(); ++m_current_indent; - m_out << ""; + out() << ""; } void HtmlWriter::write_body_begin(const HtmlElementOptions &options) { write_new_line(); ++m_current_indent; - m_out << ""; + out() << ""; } void HtmlWriter::write_body_end() { --m_current_indent; write_new_line(); - m_out << ""; + out() << ""; } void HtmlWriter::write_element_begin(const std::string &name, @@ -248,12 +248,12 @@ void HtmlWriter::write_element_begin(const std::string &name, m_stack.push_back({name, options.inline_element}); } - m_out << "<" << name; - write_element_options(m_out, options); + out() << "<" << name; + write_element_options(out(), options); if (options.close_type == HtmlCloseType::trailing) { - m_out << "/>"; + out() << "/>"; } else { - m_out << ">"; + out() << ">"; } } @@ -269,7 +269,7 @@ void HtmlWriter::write_element_end(const std::string &name) { } m_stack.pop_back(); - m_out << ""; + out() << ""; } bool HtmlWriter::is_inline_mode() const { @@ -286,9 +286,9 @@ void HtmlWriter::write_new_line() { return; } - m_out << '\n'; + out() << '\n'; for (std::uint32_t i = 0; i < m_current_indent; ++i) { - m_out << m_indent; + out() << m_indent; } } @@ -297,9 +297,9 @@ void HtmlWriter::write_raw(const HtmlWritable &writable, bool new_line) { write_new_line(); } - write_writable(m_out, writable); + write_writable(out(), writable); } -std::ostream &HtmlWriter::out() { return m_out; } +std::ostream &HtmlWriter::out() { return *m_out; } } // namespace odr::internal::html diff --git a/src/odr/internal/html/html_writer.hpp b/src/odr/internal/html/html_writer.hpp index 2f8158ab..85776796 100644 --- a/src/odr/internal/html/html_writer.hpp +++ b/src/odr/internal/html/html_writer.hpp @@ -88,7 +88,7 @@ class HtmlWriter { bool inline_element{false}; }; - std::ostream &m_out; + std::ostream *m_out{nullptr}; bool m_format{false}; std::string m_indent; std::uint32_t m_current_indent{0}; diff --git a/src/odr/internal/html/wvware_wrapper.cpp b/src/odr/internal/html/wvware_wrapper.cpp index 2e469bc6..fe40b8d5 100644 --- a/src/odr/internal/html/wvware_wrapper.cpp +++ b/src/odr/internal/html/wvware_wrapper.cpp @@ -1,9 +1,12 @@ #include +#include #include #include #include +#include +#include #include #include @@ -34,105 +37,299 @@ namespace { /// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wv.h#L2776-L2814 /// to allow for more state variables. struct TranslationState : public expand_data { - int i = 0; + explicit TranslationState(html::HtmlWriter _out) + : expand_data{}, out(std::move(_out)) {} + char *charset = nullptr; PAP *ppap = nullptr; - std::unique_ptr output_stream; -}; + struct { + int message = 0; + } special_char_handler_state = {}; -/// Originally from `wvWare.c` `wvStrangeNoGraphicData` -/// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wvWare.c#L661-L676 -/// simplified to HTML output -void strange_no_graphic_data(wvParseStruct *ps, int graphicstype) { - std::cerr << "Strange No Graphic Data in the 0x01/0x08 graphic\n"; + html::HtmlWriter out; +}; - // TODO print to output file - printf(R"(%#.2x graphic
)", graphicstype, - "StrangeNoGraphicData"); +/// Originally from `text.c` `wvConvertUnicodeToHtml` +/// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/text.c#L1999-L2154 +int convert_unicode_to_html(wvParseStruct *ps, U16 char16) { + auto *data = (TranslationState *)ps->userData; + auto &out = data->out; + + switch (char16) { + case 11: + out.out() << "
"; + return 1; + case 31: /* non-required hyphen */ + out.out() << "­"; /*vladimir@lukianov.name HTML 4.01 spec*/ + return 1; + case 30: + case 45: + case 0x2013: + out.out() << "-"; /* en-dash */ + return 1; + case 12: + case 13: + case 14: + case 7: + return 1; + case 34: + out.out() << """; + return 1; + case 38: + out.out() << "&"; + return 1; + case 60: + out.out() << "<"; + return 1; + case 62: + out.out() << ">"; + return 1; + /* + german characters, im assured that this is the right way to handle them + by Markus Schulte + + As the output encoding for HTML was chosen as UTF-8, + we don't need Ä etc. etc. I removed all but sz + -- MV 6.4.2000 + */ + + case 0xdf: + out.out() << "ß"; + return 1; + /* end german characters */ + case 0x2026: +#if 0 +/* +this just looks awful in netscape 4.5, so im going to do a very foolish +thing and just put ... instead of this +*/ + printf ("…"); +/*is there a proper html name for ... &ellipse;? Yes, … -- MV */ +#endif + out.out() << "…"; + return 1; + case 0x2019: + out.out() << "'"; + return 1; + case 0x2215: + out.out() << "/"; + return 1; + case 0xF8E7: /* without this, things should work in theory, but not for me */ + out.out() << "_"; + return 1; + case 0x2018: + out.out() << "`"; + return 1; + + /* Windows specials (MV): */ + case 0x0160: + out.out() << "Š"; + return 1; + case 0x0161: + out.out() << "š"; + return 1; + case 0x2014: + out.out() << "—"; + return 1; + case 0x201c: + out.out() << "“"; /* inverted double quotation mark */ + return 1; + case 0x201d: + out.out() << "”"; /* double q.m. */ + return 1; + case 0x201e: + out.out() << "„"; /* below double q.m. */ + return 1; + case 0x2020: + out.out() << "†"; + return 1; + case 0x2021: + out.out() << "‡"; + return 1; + case 0x2022: + out.out() << "•"; + return 1; + case 0x0152: + out.out() << "Œ"; + return 1; + case 0x0153: + out.out() << "œ"; + return 1; + case 0x0178: + out.out() << "Ÿ"; + return 1; + case 0x2030: + out.out() << "‰"; + return 1; + case 0x20ac: + out.out() << "€"; + return 1; + + /* Mac specials (MV): */ + + case 0xf020: + out.out() << " "; + return 1; + case 0xf02c: + out.out() << ","; + return 1; + case 0xf028: + out.out() << "("; + return 1; + + case 0xf03e: + out.out() << ">"; + return 1; + case 0xf067: + out.out() << "γ"; + return 1; + case 0xf064: + out.out() << "δ"; + return 1; + case 0xf072: + out.out() << "ρ"; + return 1; + case 0xf073: + out.out() << "σ"; + return 1; + case 0xf0ae: + out.out() << "→"; /* right arrow */ + return 1; + case 0xf0b6: + out.out() << "∂"; /* partial deriv. */ + return 1; + case 0xf0b3: + out.out() << "≥"; + return 1; + default: + break; + } + /* Debugging aid: */ + /* if (char16 >= 0x100) printf("[%x]", char16); */ + return 0; } -/// Originally from `wvWare.c` `name_to_url` -/// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wvWare.c#L1703-L1772 -char *name_to_url(char *name) { - // TODO get rid of static - // TODO use std::string - static char *url = 0; - static long max = 0; - char *ptr = 0; - long count = 0; - - ptr = name; - while (*ptr) { - switch (*ptr) { - case ' ': - count += 3; - break; - default: - count++; - break; - } - ptr++; +/// Originally from `text.c` `wvOutputFromUnicode` +/// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/text.c#L757-L840 +void output_from_unicode(wvParseStruct *ps, U16 eachchar, char *outputtype) { + auto *data = (TranslationState *)ps->userData; + auto &out = data->out; + + // TODO static + static char cached_outputtype[33]; /* Last outputtype */ + static GIConv g_iconv_handle = (GIConv)-1; /* Cached iconv descriptor */ + static int need_swapping; + gchar *ibuf, *obuf; + size_t ibuflen, obuflen, len, count, i; + U8 buffer[2], buffer2[5]; + + if (convert_unicode_to_html(ps, eachchar) != 0) { + return; } - count++; - if (count > max) { - char *more = nullptr; - if (url == nullptr) { - more = static_cast(malloc(count)); - } else { - more = static_cast(realloc(url, count)); + if ((g_iconv_handle == (GIConv)-1) || + strcmp(cached_outputtype, outputtype) != 0) { + if ((g_iconv_handle != (GIConv)-1)) { + g_iconv_close(g_iconv_handle); } - if (more != nullptr) { - url = more; - max = count; + + g_iconv_handle = g_iconv_open(outputtype, "UCS-2"); + if (g_iconv_handle == (GIConv)-1) { + std::cerr << "g_iconv_open fail: " << errno + << ", cannot convert UCS-2 to " << outputtype << "\n"; + out.out() << "?"; + return; } - } - if (url != nullptr) { - count = 0; - ptr = name; - while ((*ptr != 0) && (count < max)) { - switch (*ptr) { - case ' ': - url[count++] = '%'; - if (count < max) - url[count++] = '2'; - if (count < max) - url[count++] = '0'; - break; - default: - url[count++] = *ptr; - break; - } - ptr++; + /* safe to cache the output type here */ + strcpy(cached_outputtype, outputtype); + + /* Determining if unicode biteorder is swapped (glibc < 2.2) */ + need_swapping = 1; + + buffer[0] = 0x20; + buffer[1] = 0; + ibuf = reinterpret_cast(buffer); + obuf = reinterpret_cast(buffer2); + ibuflen = 2; + obuflen = 5; + + count = g_iconv(g_iconv_handle, &ibuf, &ibuflen, &obuf, &obuflen); + if (count >= 0) { + need_swapping = buffer2[0] != 0x20; } - url[max - 1] = 0; + } + + if (need_swapping) { + buffer[0] = (eachchar >> 8) & 0x00ff; + buffer[1] = eachchar & 0x00ff; } else { - std::cerr << "failed to convert name to URL\n"; - return name; + buffer[0] = eachchar & 0x00ff; + buffer[1] = (eachchar >> 8) & 0x00ff; } - return url; + ibuf = reinterpret_cast(buffer); + obuf = reinterpret_cast(buffer2); + + ibuflen = 2; + len = obuflen = 5; + + count = g_iconv(g_iconv_handle, &ibuf, &ibuflen, &obuf, &obuflen); + if (count == (size_t)-1) { + std::cerr << "iconv failed, errno: " << errno << ", char: 0x" << std::hex + << eachchar << ", UCS-2 -> " << outputtype << "\n"; + + /* I'm torn here - do i just announce the failure, continue, or copy over to + * the other buffer? */ + + /* errno is usually 84 (illegal byte sequence) + should i reverse the bytes and try again? */ + out.out() << ibuf[1]; + } else { + len = len - obuflen; + + for (i = 0; i < len; i++) { + out.out() << buffer2[i]; + } + } +} + +/// Originally from `wvWare.c` `wvStrangeNoGraphicData` +/// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wvWare.c#L661-L676 +/// simplified to HTML output +void strange_no_graphic_data(wvParseStruct *ps, int graphicstype) { + auto *data = (TranslationState *)ps->userData; + auto &out = data->out; + + std::cerr << "Strange No Graphic Data in the 0x01/0x08 graphic\n"; + + // TODO + out.out() << R"()
)"; } /// Originally from `wvWare.c` `wvPrintGraphics` /// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wvWare.c#L1239-L1287 /// simplified to HTML output -void print_graphics(int graphicstype, int width, int height, char *source) { +void print_graphics(wvParseStruct *ps, int graphicstype, int width, int height, + char *source) { // upstream converts to PNG, we just use the original format as the browser // should support them + auto *data = (TranslationState *)ps->userData; + auto &out = data->out; + // TODO export/embed image - // TODO replace printf - printf(R"(%#.2x graphic
)", - width, height, graphicstype, name_to_url(source)); + out.out() << R"()
)"; } /// Originally from `wvWare.c` `myelehandler` /// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wvWare.c#L503-L599 -int element_handler(wvParseStruct *ps, wvTag tag, void *props, int dirty) { +int element_handler(wvParseStruct *ps, wvTag tag, void *props, int /*dirty*/) { auto *data = (TranslationState *)ps->userData; data->anSttbfAssoc = &ps->anSttbfAssoc; data->lfo = &ps->lfo; @@ -238,17 +435,15 @@ int document_handler(wvParseStruct *ps, wvTag tag) { data->endcell = &ps->endcell; data->vmerges = &ps->vmerges; data->norows = &ps->norows; - if (data->i == 0) { - wvSetEntityConverter(data); - data->filename = ps->filename; - data->whichcell = 0; - data->whichrow = 0; - data->asep = nullptr; - data->i++; - wvInitPAP(&data->lastpap); - data->nextpap = nullptr; - data->ps = ps; - } + + wvSetEntityConverter(data); + data->filename = ps->filename; + data->whichcell = 0; + data->whichrow = 0; + data->asep = nullptr; + wvInitPAP(&data->lastpap); + data->nextpap = nullptr; + data->ps = ps; if (data->charset == nullptr) { data->charset = wvAutoCharset(ps); @@ -279,18 +474,15 @@ int char_handler(wvParseStruct *ps, U16 eachchar, U8 chartype, U16 lid) { ps->fieldmiddle = 0; fieldCharProc(ps, eachchar, chartype, lid); /* temp */ return 0; - break; case 20: fieldCharProc(ps, eachchar, chartype, lid); ps->fieldmiddle = 1; return 0; - break; case 21: ps->fieldmiddle = 0; ps->fieldstate--; fieldCharProc(ps, eachchar, chartype, lid); /* temp */ return 0; - break; case 0x08: std::cerr << "hmm did we loose the fSpec flag ?, this is possibly a bug\n"; break; @@ -298,16 +490,18 @@ int char_handler(wvParseStruct *ps, U16 eachchar, U8 chartype, U16 lid) { break; } - if (ps->fieldstate != 0) { - if (fieldCharProc(ps, eachchar, chartype, lid) != 0) { - return 0; - } + if (ps->fieldstate != 0 && fieldCharProc(ps, eachchar, chartype, lid) != 0) { + return 0; } - if (data->charset != nullptr) { - wvOutputHtmlChar(eachchar, chartype, data->charset, lid); - } else { - wvOutputHtmlChar(eachchar, chartype, wvAutoCharset(ps), lid); + // from `wvOutputHtmlChar` + { + char *outputtype = + data->charset != nullptr ? data->charset : wvAutoCharset(ps); + if (chartype != 0) { + eachchar = wvHandleCodePage(eachchar, lid); + } + output_from_unicode(ps, eachchar, outputtype); } return 0; @@ -316,20 +510,22 @@ int char_handler(wvParseStruct *ps, U16 eachchar, U8 chartype, U16 lid) { /// Originally from `wvWare.c` `mySpecCharProc` /// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wvWare.c#L1289-L1553 int special_char_handler(wvParseStruct *ps, U16 eachchar, CHP *achp) { - static int message; - PICF picf; - FSPA *fspa; auto *data = (TranslationState *)ps->userData; + auto &state = data->special_char_handler_state; + auto &out = data->out; + + PICF picf; + FSPA *fspa = nullptr; switch (eachchar) { case 19: - std::cerr << "field began\n"; + // field began ps->fieldstate++; ps->fieldmiddle = 0; fieldCharProc(ps, eachchar, 0, 0x400); /* temp */ return 0; case 20: - if (achp->fOle2) { + if (achp->fOle2 != 0) { std::cerr << "this field has an associated embedded object of id " << achp->fcPic_fcObj_lTagObj << "\n"; } @@ -341,6 +537,7 @@ int special_char_handler(wvParseStruct *ps, U16 eachchar, CHP *achp) { ps->fieldmiddle = 0; fieldCharProc(ps, eachchar, 0, 0x400); /* temp */ return 0; + default: break; } @@ -358,25 +555,21 @@ int special_char_handler(wvParseStruct *ps, U16 eachchar, CHP *achp) { Blip blip; char *name; long p = wvStream_tell(ps->data); - std::cerr << "picture 0x01 here, at offset " << achp->fcPic_fcObj_lTagObj - << " in Data Stream, obj is " << achp->fObj << ", ole is " - << achp->fOle2 << "\n"; - if (achp->fOle2) { + if (achp->fOle2 != 0) { return 0; } - if (no_graphics != 0) { - wvStream_goto(ps->data, achp->fcPic_fcObj_lTagObj); - wvGetPICF(wvQuerySupported(&ps->fib, nullptr), &picf, ps->data); - f = picf.rgb; - if (wv0x01(&blip, f, picf.lcb - picf.cbHeader) != 0) { - name = wvHtmlGraphic(ps, &blip); - print_graphics(0x01, (int)wvTwipsToHPixels(picf.dxaGoal), - (int)wvTwipsToVPixels(picf.dyaGoal), name); - wvFree(name); - } else { - strange_no_graphic_data(ps, 0x01); - } + + wvStream_goto(ps->data, achp->fcPic_fcObj_lTagObj); + wvGetPICF(wvQuerySupported(&ps->fib, nullptr), &picf, ps->data); + f = picf.rgb; + if (wv0x01(&blip, f, picf.lcb - picf.cbHeader) != 0) { + name = wvHtmlGraphic(ps, &blip); + print_graphics(ps, 0x01, (int)wvTwipsToHPixels(picf.dxaGoal), + (int)wvTwipsToVPixels(picf.dyaGoal), name); + wvFree(name); + } else { + strange_no_graphic_data(ps, 0x01); } wvStream_goto(ps->data, p); @@ -386,28 +579,28 @@ int special_char_handler(wvParseStruct *ps, U16 eachchar, CHP *achp) { Blip blip; char *name; if (wvQuerySupported(&ps->fib, nullptr) == WORD8) { - if (!no_graphics) { - if (ps->nooffspa > 0) { - fspa = wvGetFSPAFromCP(ps->currentcp, ps->fspa, ps->fspapos, - ps->nooffspa); - - if (fspa == nullptr) { - std::cerr << "No fspa! Insanity abounds!\n"; - return 0; - } - - data->props = fspa; - if (wv0x08(&blip, fspa->spid, ps)) { - name = wvHtmlGraphic(ps, &blip); - print_graphics( - 0x08, (int)wvTwipsToHPixels(fspa->xaRight - fspa->xaLeft), - (int)wvTwipsToVPixels(fspa->yaBottom - fspa->yaTop), name); - wvFree(name); - } else - strange_no_graphic_data(ps, 0x08); - } else { - std::cerr << "nooffspa was <=0! Ignoring.\n"; + if (ps->nooffspa > 0) { + fspa = + wvGetFSPAFromCP(ps->currentcp, ps->fspa, ps->fspapos, ps->nooffspa); + + if (fspa == nullptr) { + std::cerr << "No fspa! Insanity abounds!\n"; + return 0; } + + data->props = fspa; + if (wv0x08(&blip, (int)fspa->spid, ps) != 0) { + name = wvHtmlGraphic(ps, &blip); + print_graphics( + ps, 0x08, + (int)wvTwipsToHPixels((short)(fspa->xaRight - fspa->xaLeft)), + (int)wvTwipsToVPixels((short)(fspa->yaBottom - fspa->yaTop)), + name); + wvFree(name); + } else + strange_no_graphic_data(ps, 0x08); + } else { + std::cerr << "nooffspa was <=0! Ignoring.\n"; } } else { FDOA *fdoa; @@ -428,45 +621,45 @@ int special_char_handler(wvParseStruct *ps, U16 eachchar, CHP *achp) { U16 mtextra[8] = {'M', 'T', ' ', 'E', 'x', 't', 'r', 'a'}; if (0 == memcmp(symbol, ps->fonts.ffn[achp->ftcSym].xszFfn, 12)) { - if ((!message) && (strcasecmp("UTF-8", data->charset) != 0)) { + if ((state.message == 0) && (strcasecmp("UTF-8", data->charset) != 0)) { std::cerr << "Symbol font detected (too late sorry!), rerun wvHtml with option --charset utf-8\n\ option to support correct symbol font conversion to a viewable format.\n"; - message++; + state.message++; } - wvOutputFromUnicode(wvConvertSymbolToUnicode(achp->xchSym - 61440), + output_from_unicode(ps, wvConvertSymbolToUnicode(achp->xchSym - 61440), data->charset); return 0; } else if (0 == memcmp(mtextra, ps->fonts.ffn[achp->ftcSym].xszFfn, 16)) { - if ((message == 0) && (strcasecmp("UTF-8", data->charset) != 0)) { + if ((state.message == 0) && (strcasecmp("UTF-8", data->charset) != 0)) { std::cerr << "MT Extra font detected (too late sorry!), rerun wvHtml with option --charset utf-8\n\ option to support correct symbol font conversion to a viewable format.\n"; - message++; + state.message++; } - wvOutputFromUnicode(wvConvertMTExtraToUnicode(achp->xchSym - 61440), + output_from_unicode(ps, wvConvertMTExtraToUnicode(achp->xchSym - 61440), data->charset); return 0; } else if (0 == memcmp(wingdings, ps->fonts.ffn[achp->ftcSym].xszFfn, 18)) { - if (message == 0) { + if (state.message == 0) { std::cerr << "Wingdings font detected, i need a mapping table to " "unicode for this\n"; - message++; + state.message++; } } else { - if (message == 0) { + if (state.message == 0) { char *fontname = wvWideStrToMB(ps->fonts.ffn[achp->ftcSym].xszFfn); std::cerr << "Special font " << fontname - << ", i need a mapping table to unicode for this\n"; + << ", I need a mapping table to unicode for this\n"; wvFree(fontname); - // TODO replace printf - printf("*"); + out.out() << "*"; + state.message++; } return 0; } } default: - return 0; + break; } return 0; @@ -477,8 +670,17 @@ option to support correct symbol font conversion to a viewable format.\n"; Html html::translate_wvware_oldms_file( const WvWareLegacyMicrosoftFile &oldms_file, const std::string &output_path, const HtmlConfig &config) { + HtmlResourceLocator resourceLocator = + local_resource_locator(output_path, config); + auto output_file_path = output_path + "/document.html"; + std::ofstream ostream(output_file_path, std::ios::out); + if (!ostream.is_open()) { + throw FileWriteError(); + } + html::HtmlWriter out(ostream, config.format_html, config.html_indent); + wvParseStruct &ps = oldms_file.parse_struct(); wvSetElementHandler(&ps, element_handler); @@ -487,27 +689,29 @@ Html html::translate_wvware_oldms_file( wvSetSpecialCharHandler(&ps, special_char_handler); state_data handle; - TranslationState translation_state; - translation_state.output_stream = - std::make_unique(output_file_path, std::ios::out); + TranslationState translation_state(out); wvInitStateData(&handle); translation_state.sd = &handle; ps.userData = &translation_state; - *translation_state.output_stream << "\n\n\n" - << "\n" - << "Document\n" - << "\n\n"; + out.write_begin(); + out.write_header_begin(); + out.write_header_charset("UTF-8"); + out.write_header_target("_blank"); + out.write_header_title("odr"); + out.write_header_viewport( + "width=device-width,initial-scale=1.0,user-scalable=yes"); + out.write_header_end(); + out.write_body_begin(); if (wvHtml(&ps) != 0) { throw std::runtime_error("wvHtml failed"); } - *translation_state.output_stream << "\n\n"; - - translation_state.output_stream->flush(); + out.write_body_end(); + out.write_end(); return { FileType::legacy_word_document, config, {{"document", output_file_path}}}; From 6b4d035526e1319f58f8ff58aed7d88f41874945 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Wed, 25 Dec 2024 13:33:16 +0100 Subject: [PATCH 12/28] fix pdf2htmlex version --- conanfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index 16eab5f2..9cb1dca4 100644 --- a/conanfile.py +++ b/conanfile.py @@ -44,7 +44,7 @@ def requirements(self): self.requires("uchardet/0.0.8") self.requires("utfcpp/4.0.4") if self.options.get_safe("with_pdf2htmlEX", False): - self.requires("pdf2htmlex/0.18.8.rc1-20240905-git") + self.requires("pdf2htmlex/0.18.8.rc1-git-6f85c88") if self.options.get_safe("with_wvWare", False): self.requires("wvware/1.2.9") From d57bc3c9b1751625ea5b76f33a46884e28d610a4 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Wed, 25 Dec 2024 13:37:45 +0100 Subject: [PATCH 13/28] fixes after ci --- src/odr/internal/open_strategy.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/odr/internal/open_strategy.cpp b/src/odr/internal/open_strategy.cpp index aa50941f..de265920 100644 --- a/src/odr/internal/open_strategy.cpp +++ b/src/odr/internal/open_strategy.cpp @@ -19,7 +19,7 @@ #include #include -#include +#include namespace odr::internal { @@ -96,7 +96,7 @@ open_strategy::types(const std::shared_ptr &file) { } std::vector -open_strategy::engines(const std::shared_ptr &file, +open_strategy::engines(const std::shared_ptr & /*file*/, FileType as) { std::vector result; From 67d7522030b0c512077eeb623ac2810bac4b7d37 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Wed, 25 Dec 2024 14:23:38 +0100 Subject: [PATCH 14/28] avoid linking issue --- src/odr/internal/html/pdf2htmlex_wrapper.cpp | 2 -- src/odr/internal/html/wvware_wrapper.cpp | 6 ++++-- src/odr/internal/html/wvware_wrapper.hpp | 1 - src/odr/internal/oldms_wvware/wvware_oldms_file.cpp | 3 --- src/odr/internal/oldms_wvware/wvware_oldms_file.hpp | 1 - 5 files changed, 4 insertions(+), 9 deletions(-) diff --git a/src/odr/internal/html/pdf2htmlex_wrapper.cpp b/src/odr/internal/html/pdf2htmlex_wrapper.cpp index 3a65e7ca..24abb8f8 100644 --- a/src/odr/internal/html/pdf2htmlex_wrapper.cpp +++ b/src/odr/internal/html/pdf2htmlex_wrapper.cpp @@ -13,8 +13,6 @@ #include #include -#include - namespace odr::internal { Html html::translate_poppler_pdf_file(const PopplerPdfFile &pdf_file, diff --git a/src/odr/internal/html/wvware_wrapper.cpp b/src/odr/internal/html/wvware_wrapper.cpp index fe40b8d5..102b0479 100644 --- a/src/odr/internal/html/wvware_wrapper.cpp +++ b/src/odr/internal/html/wvware_wrapper.cpp @@ -564,7 +564,8 @@ int special_char_handler(wvParseStruct *ps, U16 eachchar, CHP *achp) { wvGetPICF(wvQuerySupported(&ps->fib, nullptr), &picf, ps->data); f = picf.rgb; if (wv0x01(&blip, f, picf.lcb - picf.cbHeader) != 0) { - name = wvHtmlGraphic(ps, &blip); + // TODO port + // name = wvHtmlGraphic(ps, &blip); print_graphics(ps, 0x01, (int)wvTwipsToHPixels(picf.dxaGoal), (int)wvTwipsToVPixels(picf.dyaGoal), name); wvFree(name); @@ -590,7 +591,8 @@ int special_char_handler(wvParseStruct *ps, U16 eachchar, CHP *achp) { data->props = fspa; if (wv0x08(&blip, (int)fspa->spid, ps) != 0) { - name = wvHtmlGraphic(ps, &blip); + // TODO port + // name = wvHtmlGraphic(ps, &blip); print_graphics( ps, 0x08, (int)wvTwipsToHPixels((short)(fspa->xaRight - fspa->xaLeft)), diff --git a/src/odr/internal/html/wvware_wrapper.hpp b/src/odr/internal/html/wvware_wrapper.hpp index b548923d..b7061ea4 100644 --- a/src/odr/internal/html/wvware_wrapper.hpp +++ b/src/odr/internal/html/wvware_wrapper.hpp @@ -1,7 +1,6 @@ #ifndef ODR_INTERNAL_WVWARE_WRAPPER_HPP #define ODR_INTERNAL_WVWARE_WRAPPER_HPP -#include #include namespace odr { diff --git a/src/odr/internal/oldms_wvware/wvware_oldms_file.cpp b/src/odr/internal/oldms_wvware/wvware_oldms_file.cpp index 6dcd6d1d..f9d29def 100644 --- a/src/odr/internal/oldms_wvware/wvware_oldms_file.cpp +++ b/src/odr/internal/oldms_wvware/wvware_oldms_file.cpp @@ -3,9 +3,6 @@ #include #include -#include -#include - #include #include #include diff --git a/src/odr/internal/oldms_wvware/wvware_oldms_file.hpp b/src/odr/internal/oldms_wvware/wvware_oldms_file.hpp index 6b553bf5..259f66d0 100644 --- a/src/odr/internal/oldms_wvware/wvware_oldms_file.hpp +++ b/src/odr/internal/oldms_wvware/wvware_oldms_file.hpp @@ -7,7 +7,6 @@ #include #include -#include #include From 15980d9d30e9cf9badf11e8cb6d96e10a5299109 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Wed, 25 Dec 2024 14:51:16 +0100 Subject: [PATCH 15/28] some wvware port; fix includes --- src/odr/internal/html/pdf2htmlex_wrapper.hpp | 1 + src/odr/internal/html/wvware_wrapper.cpp | 140 +++++++++++++++++++ 2 files changed, 141 insertions(+) diff --git a/src/odr/internal/html/pdf2htmlex_wrapper.hpp b/src/odr/internal/html/pdf2htmlex_wrapper.hpp index ee492fee..6d8589f9 100644 --- a/src/odr/internal/html/pdf2htmlex_wrapper.hpp +++ b/src/odr/internal/html/pdf2htmlex_wrapper.hpp @@ -1,6 +1,7 @@ #ifndef ODR_INTERNAL_HTML_PDF2HTMLEX_WRAPPER_HPP #define ODR_INTERNAL_HTML_PDF2HTMLEX_WRAPPER_HPP +#include #include namespace odr { diff --git a/src/odr/internal/html/wvware_wrapper.cpp b/src/odr/internal/html/wvware_wrapper.cpp index 102b0479..641b4068 100644 --- a/src/odr/internal/html/wvware_wrapper.cpp +++ b/src/odr/internal/html/wvware_wrapper.cpp @@ -13,6 +13,7 @@ #include #include +#include namespace odr::internal { @@ -327,6 +328,145 @@ void print_graphics(wvParseStruct *ps, int graphicstype, int width, int height, << source << R"("/>
)"; } +int handle_bitmap(wvParseStruct *ps, char *name, BitmapBlip *bitmap) { + wvStream *pwv = bitmap->m_pvBits; + FILE *fd = nullptr; + size_t size = 0, i; + + fd = fopen(name, "wb"); + if (fd == nullptr) { + fprintf(stderr, "\nCannot open %s for writing\n", name); + exit(1); + } + size = wvStream_size(pwv); + wvStream_rewind(pwv); + + for (i = 0; i < size; i++) { + fputc(read_8ubit(pwv), fd); + } + fclose(fd); + wvTrace(("Name is %s\n", name)); + return 0; +} + +int handle_metafile(wvParseStruct *ps, char *name, MetaFileBlip *bitmap) { + wvStream *pwv = bitmap->m_pvBits; + FILE *fd = nullptr; + size_t size = 0, i; + U8 decompressf = 0; + + fd = fopen(name, "wb"); + if (fd == nullptr) { + fprintf(stderr, "\nCannot open %s for writing\n", name); + exit(1); + } + size = wvStream_size(pwv); + wvStream_rewind(pwv); + + if (bitmap->m_fCompression == msocompressionDeflate) + decompressf = setdecom(); + + if (!decompressf) { + for (i = 0; i < size; i++) + fputc(read_8ubit(pwv), fd); + } else /* decompress here */ + { + FILE *tmp = tmpfile(); + FILE *out = tmpfile(); + + for (i = 0; i < size; i++) { + fputc(read_8ubit(pwv), tmp); + } + + rewind(tmp); + decompress(tmp, out, bitmap->m_cbSave, bitmap->m_cb); + fclose(tmp); + + rewind(out); + + for (i = 0; i < bitmap->m_cb; i++) + fputc(fgetc(out), fd); + + fclose(out); + } + + fclose(fd); + wvTrace(("Name is %s\n", name)); + return 0; +} + +char *html_graphic(wvParseStruct *ps, Blip *blip) { + char *name; + wvStream *fd; + char test[3]; + + // TODO handle figure name + name = "figure"; + if (name == nullptr) { + return nullptr; + } + + /* + temp hack to test older included bmps in word 6 and 7, + should be wrapped in a modern escher strucure before getting + to here, and then handled as normal + */ + wvTrace(("type is %d\n", blip->type)); + switch (blip->type) { + case msoblipJPEG: + case msoblipDIB: + case msoblipPNG: + fd = (blip->blip.bitmap.m_pvBits); + test[2] = '\0'; + test[0] = read_8ubit(fd); + + test[1] = read_8ubit(fd); + wvStream_rewind(fd); + if (!(strcmp(test, "BM"))) { + wvAppendStr(&name, ".bmp"); + if (0 != handle_bitmap(ps, name, &blip->blip.bitmap)) + return nullptr; + return name; + } + default: + break; + } + + switch (blip->type) { + case msoblipWMF: + wvAppendStr(&name, ".wmf"); + if (0 != handle_metafile(ps, name, &blip->blip.metafile)) + return nullptr; + break; + case msoblipEMF: + wvAppendStr(&name, ".emf"); + if (0 != handle_metafile(ps, name, &blip->blip.metafile)) + return nullptr; + break; + case msoblipPICT: + wvAppendStr(&name, ".pict"); + if (0 != handle_metafile(ps, name, &blip->blip.metafile)) + return nullptr; + break; + case msoblipJPEG: + wvAppendStr(&name, ".jpg"); + if (0 != handle_bitmap(ps, name, &blip->blip.bitmap)) + return nullptr; + break; + case msoblipDIB: + wvAppendStr(&name, ".dib"); + if (0 != handle_bitmap(ps, name, &blip->blip.bitmap)) + return nullptr; + break; + case msoblipPNG: + wvAppendStr(&name, ".png"); + if (0 != handle_bitmap(ps, name, &blip->blip.bitmap)) + return nullptr; + break; + } + return name; +} + /// Originally from `wvWare.c` `myelehandler` /// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wvWare.c#L503-L599 int element_handler(wvParseStruct *ps, wvTag tag, void *props, int /*dirty*/) { From 31dfcc5db06f3837876ffbce08f5944f519ba814 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Wed, 25 Dec 2024 15:00:00 +0100 Subject: [PATCH 16/28] try quickfix warning --- src/odr/internal/html/wvware_wrapper.cpp | 32 ++++++++++++++---------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/src/odr/internal/html/wvware_wrapper.cpp b/src/odr/internal/html/wvware_wrapper.cpp index 641b4068..99dea546 100644 --- a/src/odr/internal/html/wvware_wrapper.cpp +++ b/src/odr/internal/html/wvware_wrapper.cpp @@ -401,7 +401,7 @@ char *html_graphic(wvParseStruct *ps, Blip *blip) { char test[3]; // TODO handle figure name - name = "figure"; + name = nullptr; if (name == nullptr) { return nullptr; } @@ -424,8 +424,9 @@ char *html_graphic(wvParseStruct *ps, Blip *blip) { wvStream_rewind(fd); if (!(strcmp(test, "BM"))) { wvAppendStr(&name, ".bmp"); - if (0 != handle_bitmap(ps, name, &blip->blip.bitmap)) + if (0 != handle_bitmap(ps, name, &blip->blip.bitmap)) { return nullptr; + } return name; } default: @@ -435,33 +436,39 @@ char *html_graphic(wvParseStruct *ps, Blip *blip) { switch (blip->type) { case msoblipWMF: wvAppendStr(&name, ".wmf"); - if (0 != handle_metafile(ps, name, &blip->blip.metafile)) + if (0 != handle_metafile(ps, name, &blip->blip.metafile)) { return nullptr; + } break; case msoblipEMF: wvAppendStr(&name, ".emf"); - if (0 != handle_metafile(ps, name, &blip->blip.metafile)) + if (0 != handle_metafile(ps, name, &blip->blip.metafile)) { return nullptr; + } break; case msoblipPICT: wvAppendStr(&name, ".pict"); - if (0 != handle_metafile(ps, name, &blip->blip.metafile)) + if (0 != handle_metafile(ps, name, &blip->blip.metafile)) { return nullptr; + } break; case msoblipJPEG: wvAppendStr(&name, ".jpg"); - if (0 != handle_bitmap(ps, name, &blip->blip.bitmap)) + if (0 != handle_bitmap(ps, name, &blip->blip.bitmap)) { return nullptr; + } break; case msoblipDIB: wvAppendStr(&name, ".dib"); - if (0 != handle_bitmap(ps, name, &blip->blip.bitmap)) + if (0 != handle_bitmap(ps, name, &blip->blip.bitmap)) { return nullptr; + } break; case msoblipPNG: wvAppendStr(&name, ".png"); - if (0 != handle_bitmap(ps, name, &blip->blip.bitmap)) + if (0 != handle_bitmap(ps, name, &blip->blip.bitmap)) { return nullptr; + } break; } return name; @@ -704,8 +711,7 @@ int special_char_handler(wvParseStruct *ps, U16 eachchar, CHP *achp) { wvGetPICF(wvQuerySupported(&ps->fib, nullptr), &picf, ps->data); f = picf.rgb; if (wv0x01(&blip, f, picf.lcb - picf.cbHeader) != 0) { - // TODO port - // name = wvHtmlGraphic(ps, &blip); + name = html_graphic(ps, &blip); print_graphics(ps, 0x01, (int)wvTwipsToHPixels(picf.dxaGoal), (int)wvTwipsToVPixels(picf.dyaGoal), name); wvFree(name); @@ -731,16 +737,16 @@ int special_char_handler(wvParseStruct *ps, U16 eachchar, CHP *achp) { data->props = fspa; if (wv0x08(&blip, (int)fspa->spid, ps) != 0) { - // TODO port - // name = wvHtmlGraphic(ps, &blip); + name = html_graphic(ps, &blip); print_graphics( ps, 0x08, (int)wvTwipsToHPixels((short)(fspa->xaRight - fspa->xaLeft)), (int)wvTwipsToVPixels((short)(fspa->yaBottom - fspa->yaTop)), name); wvFree(name); - } else + } else { strange_no_graphic_data(ps, 0x08); + } } else { std::cerr << "nooffspa was <=0! Ignoring.\n"; } From 431a45d31987ab0a5f2ec88fa991b0a78ab9ce00 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Wed, 25 Dec 2024 15:24:09 +0100 Subject: [PATCH 17/28] fix unused var --- src/odr/internal/html/wvware_wrapper.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/odr/internal/html/wvware_wrapper.cpp b/src/odr/internal/html/wvware_wrapper.cpp index 99dea546..1c8ac138 100644 --- a/src/odr/internal/html/wvware_wrapper.cpp +++ b/src/odr/internal/html/wvware_wrapper.cpp @@ -328,7 +328,7 @@ void print_graphics(wvParseStruct *ps, int graphicstype, int width, int height, << source << R"("/>
)"; } -int handle_bitmap(wvParseStruct *ps, char *name, BitmapBlip *bitmap) { +int handle_bitmap(wvParseStruct * /*ps*/, char *name, BitmapBlip *bitmap) { wvStream *pwv = bitmap->m_pvBits; FILE *fd = nullptr; size_t size = 0, i; @@ -349,7 +349,7 @@ int handle_bitmap(wvParseStruct *ps, char *name, BitmapBlip *bitmap) { return 0; } -int handle_metafile(wvParseStruct *ps, char *name, MetaFileBlip *bitmap) { +int handle_metafile(wvParseStruct * /*ps*/, char *name, MetaFileBlip *bitmap) { wvStream *pwv = bitmap->m_pvBits; FILE *fd = nullptr; size_t size = 0, i; From 8ce836c37ca69834c0e02be1b08bc6aaecfe9b61 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Wed, 25 Dec 2024 15:39:30 +0100 Subject: [PATCH 18/28] fix warning --- src/odr/internal/html/wvware_wrapper.cpp | 30 +++++++++++++----------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/odr/internal/html/wvware_wrapper.cpp b/src/odr/internal/html/wvware_wrapper.cpp index 1c8ac138..cf66c702 100644 --- a/src/odr/internal/html/wvware_wrapper.cpp +++ b/src/odr/internal/html/wvware_wrapper.cpp @@ -53,7 +53,7 @@ struct TranslationState : public expand_data { /// Originally from `text.c` `wvConvertUnicodeToHtml` /// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/text.c#L1999-L2154 -int convert_unicode_to_html(wvParseStruct *ps, U16 char16) { +int convert_unicode_to_html(wvParseStruct *ps, std::uint16_t char16) { auto *data = (TranslationState *)ps->userData; auto &out = data->out; @@ -213,7 +213,8 @@ thing and just put ... instead of this /// Originally from `text.c` `wvOutputFromUnicode` /// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/text.c#L757-L840 -void output_from_unicode(wvParseStruct *ps, U16 eachchar, char *outputtype) { +void output_from_unicode(wvParseStruct *ps, std::uint16_t eachchar, + char *outputtype) { auto *data = (TranslationState *)ps->userData; auto &out = data->out; @@ -222,8 +223,8 @@ void output_from_unicode(wvParseStruct *ps, U16 eachchar, char *outputtype) { static GIConv g_iconv_handle = (GIConv)-1; /* Cached iconv descriptor */ static int need_swapping; gchar *ibuf, *obuf; - size_t ibuflen, obuflen, len, count, i; - U8 buffer[2], buffer2[5]; + std::size_t ibuflen, obuflen, len, count, i; + std::uint8_t buffer[2], buffer2[5]; if (convert_unicode_to_html(ps, eachchar) != 0) { return; @@ -257,7 +258,7 @@ void output_from_unicode(wvParseStruct *ps, U16 eachchar, char *outputtype) { obuflen = 5; count = g_iconv(g_iconv_handle, &ibuf, &ibuflen, &obuf, &obuflen); - if (count >= 0) { + if (count != (std::size_t)-1) { need_swapping = buffer2[0] != 0x20; } } @@ -277,7 +278,7 @@ void output_from_unicode(wvParseStruct *ps, U16 eachchar, char *outputtype) { len = obuflen = 5; count = g_iconv(g_iconv_handle, &ibuf, &ibuflen, &obuf, &obuflen); - if (count == (size_t)-1) { + if (count == (std::size_t)-1) { std::cerr << "iconv failed, errno: " << errno << ", char: 0x" << std::hex << eachchar << ", UCS-2 -> " << outputtype << "\n"; @@ -331,7 +332,7 @@ void print_graphics(wvParseStruct *ps, int graphicstype, int width, int height, int handle_bitmap(wvParseStruct * /*ps*/, char *name, BitmapBlip *bitmap) { wvStream *pwv = bitmap->m_pvBits; FILE *fd = nullptr; - size_t size = 0, i; + std::size_t size = 0, i; fd = fopen(name, "wb"); if (fd == nullptr) { @@ -352,8 +353,8 @@ int handle_bitmap(wvParseStruct * /*ps*/, char *name, BitmapBlip *bitmap) { int handle_metafile(wvParseStruct * /*ps*/, char *name, MetaFileBlip *bitmap) { wvStream *pwv = bitmap->m_pvBits; FILE *fd = nullptr; - size_t size = 0, i; - U8 decompressf = 0; + std::size_t size = 0, i; + std::uint8_t decompressf = 0; fd = fopen(name, "wb"); if (fd == nullptr) { @@ -612,7 +613,8 @@ int document_handler(wvParseStruct *ps, wvTag tag) { /// Originally from `wvWare.c` `myCharProc` /// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wvWare.c#L1556-L1605 -int char_handler(wvParseStruct *ps, U16 eachchar, U8 chartype, U16 lid) { +int char_handler(wvParseStruct *ps, std::uint16_t eachchar, + std::uint8_t chartype, std::uint16_t lid) { auto *data = (TranslationState *)ps->userData; switch (eachchar) { @@ -656,7 +658,7 @@ int char_handler(wvParseStruct *ps, U16 eachchar, U8 chartype, U16 lid) { /// Originally from `wvWare.c` `mySpecCharProc` /// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wvWare.c#L1289-L1553 -int special_char_handler(wvParseStruct *ps, U16 eachchar, CHP *achp) { +int special_char_handler(wvParseStruct *ps, std::uint16_t eachchar, CHP *achp) { auto *data = (TranslationState *)ps->userData; auto &state = data->special_char_handler_state; auto &out = data->out; @@ -764,9 +766,9 @@ int special_char_handler(wvParseStruct *ps, U16 eachchar, CHP *achp) { return 0; } case 0x28: { - U16 symbol[6] = {'S', 'y', 'm', 'b', 'o', 'l'}; - U16 wingdings[9] = {'W', 'i', 'n', 'g', 'd', 'i', 'n', 'g', 's'}; - U16 mtextra[8] = {'M', 'T', ' ', 'E', 'x', 't', 'r', 'a'}; + std::uint16_t symbol[6] = {'S', 'y', 'm', 'b', 'o', 'l'}; + std::uint16_t wingdings[9] = {'W', 'i', 'n', 'g', 'd', 'i', 'n', 'g', 's'}; + std::uint16_t mtextra[8] = {'M', 'T', ' ', 'E', 'x', 't', 'r', 'a'}; if (0 == memcmp(symbol, ps->fonts.ffn[achp->ftcSym].xszFfn, 12)) { if ((state.message == 0) && (strcasecmp("UTF-8", data->charset) != 0)) { From 65c94c7c4888c357eb4c233d3a3a2e7aef6574e3 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Wed, 25 Dec 2024 17:00:48 +0100 Subject: [PATCH 19/28] fix temporary --- test/src/html_output_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/src/html_output_test.cpp b/test/src/html_output_test.cpp index 6430f0be..5a813e85 100644 --- a/test/src/html_output_test.cpp +++ b/test/src/html_output_test.cpp @@ -38,7 +38,7 @@ TEST_P(HtmlOutputTests, html_meta) { const DecoderEngine engine = params.engine; const std::string &test_repo = params.test_repo; const std::string &output_path = params.output_path; - const std::string &output_path_prefix = + const std::string output_path_prefix = common::Path(output_path).parent().string(); std::cout << test_file.short_path << " to " << output_path << std::endl; From c526d71a05a0f6fcd1f82cc2aa924eba38f4d521 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Wed, 25 Dec 2024 17:13:50 +0100 Subject: [PATCH 20/28] remove wvware from global scope --- .../oldms_wvware/wvware_oldms_file.cpp | 46 ++++++++++++------- .../oldms_wvware/wvware_oldms_file.hpp | 10 ++-- 2 files changed, 35 insertions(+), 21 deletions(-) diff --git a/src/odr/internal/oldms_wvware/wvware_oldms_file.cpp b/src/odr/internal/oldms_wvware/wvware_oldms_file.cpp index f9d29def..711d16f2 100644 --- a/src/odr/internal/oldms_wvware/wvware_oldms_file.cpp +++ b/src/odr/internal/oldms_wvware/wvware_oldms_file.cpp @@ -1,7 +1,6 @@ #include #include -#include #include #include @@ -9,15 +8,24 @@ namespace odr::internal { +struct WvWareLegacyMicrosoftFile::ParserState { + GsfInput *gsf_input{}; + + wvParseStruct ps{}; + int encryption_flag{}; +}; + WvWareLegacyMicrosoftFile::WvWareLegacyMicrosoftFile( std::shared_ptr file) : m_file{std::move(file)} { GError *error = nullptr; - m_gsf_input = + m_parser_state = std::make_shared(); + + m_parser_state->gsf_input = gsf_input_stdio_new(m_file->disk_path()->string().c_str(), &error); - if (m_gsf_input == nullptr) { + if (m_parser_state->gsf_input == nullptr) { throw std::runtime_error("gsf_input_stdio_new failed"); } @@ -27,27 +35,32 @@ WvWareLegacyMicrosoftFile::WvWareLegacyMicrosoftFile( WvWareLegacyMicrosoftFile::WvWareLegacyMicrosoftFile( std::shared_ptr file) : m_file{std::move(file)} { - m_gsf_input = gsf_input_memory_new( + m_parser_state = std::make_shared(); + + m_parser_state->gsf_input = gsf_input_memory_new( reinterpret_cast(m_file->memory_data()), static_cast(m_file->size()), false); open(); } -WvWareLegacyMicrosoftFile::~WvWareLegacyMicrosoftFile() { wvOLEFree(&m_ps); } +WvWareLegacyMicrosoftFile::~WvWareLegacyMicrosoftFile() { + wvOLEFree(&m_parser_state->ps); +} void WvWareLegacyMicrosoftFile::open() { wvInit(); - int ret = wvInitParser_gsf(&m_ps, m_gsf_input); + int ret = wvInitParser_gsf(&m_parser_state->ps, m_parser_state->gsf_input); // check if password is required if ((ret & 0x8000) != 0) { m_encryption_state = EncryptionState::encrypted; - m_encryption_flag = ret & 0x7fff; + m_parser_state->encryption_flag = ret & 0x7fff; - if ((m_encryption_flag == WORD8) || (m_encryption_flag == WORD7) || - (m_encryption_flag == WORD6)) { + if ((m_parser_state->encryption_flag == WORD8) || + (m_parser_state->encryption_flag == WORD7) || + (m_parser_state->encryption_flag == WORD6)) { ret = 0; } } else { @@ -55,7 +68,7 @@ void WvWareLegacyMicrosoftFile::open() { } if (ret != 0) { - wvOLEFree(&m_ps); + wvOLEFree(&m_parser_state->ps); throw std::runtime_error("wvInitParser failed"); } } @@ -97,14 +110,15 @@ bool WvWareLegacyMicrosoftFile::decrypt(const std::string &password) { return false; } - wvSetPassword(password.c_str(), &m_ps); + wvSetPassword(password.c_str(), &m_parser_state->ps); bool success = false; - if (m_encryption_flag == WORD8) { - success = wvDecrypt97(&m_ps); - } else if (m_encryption_flag == WORD7 || m_encryption_flag == WORD6) { - success = wvDecrypt95(&m_ps); + if (m_parser_state->encryption_flag == WORD8) { + success = wvDecrypt97(&m_parser_state->ps); + } else if (m_parser_state->encryption_flag == WORD7 || + m_parser_state->encryption_flag == WORD6) { + success = wvDecrypt95(&m_parser_state->ps); } if (!success) { @@ -121,7 +135,7 @@ WvWareLegacyMicrosoftFile::document() const { } wvParseStruct &WvWareLegacyMicrosoftFile::parse_struct() const { - return const_cast(m_ps); + return const_cast(m_parser_state->ps); } } // namespace odr::internal diff --git a/src/odr/internal/oldms_wvware/wvware_oldms_file.hpp b/src/odr/internal/oldms_wvware/wvware_oldms_file.hpp index 259f66d0..efc7a1fc 100644 --- a/src/odr/internal/oldms_wvware/wvware_oldms_file.hpp +++ b/src/odr/internal/oldms_wvware/wvware_oldms_file.hpp @@ -8,7 +8,8 @@ #include -#include +struct _wvParseStruct; +using wvParseStruct = struct _wvParseStruct; namespace odr::internal::common { class DiskFile; @@ -41,14 +42,13 @@ class WvWareLegacyMicrosoftFile final : public abstract::DocumentFile { [[nodiscard]] wvParseStruct &parse_struct() const; private: + struct ParserState; + std::shared_ptr m_file; - GsfInput *m_gsf_input{}; + std::shared_ptr m_parser_state; EncryptionState m_encryption_state{EncryptionState::unknown}; - wvParseStruct m_ps{}; - int m_encryption_flag{}; - void open(); }; From ca0650f48a90905fb8e8c0916cea268901bf4741 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Wed, 25 Dec 2024 18:49:33 +0100 Subject: [PATCH 21/28] ifdef feature switches --- .github/workflows/build_test.yml | 4 ++++ CMakeLists.txt | 8 ++++++++ src/odr/html.cpp | 4 ++++ 3 files changed, 16 insertions(+) diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml index 2bc94e40..3208f2be 100644 --- a/.github/workflows/build_test.yml +++ b/.github/workflows/build_test.yml @@ -72,6 +72,8 @@ jobs: -DCMAKE_CXX_FLAGS="-Werror" -DCMAKE_INSTALL_PREFIX=install -DODR_TEST=ON + -DWITH_PDF2HTMLEX=ON + -DWITH_WVWARE=ON - name: cmake if: runner.os == 'Windows' @@ -82,6 +84,8 @@ jobs: -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DODR_TEST=ON + -DWITH_PDF2HTMLEX=OFF + -DWITH_WVWARE=OFF - name: build run: cmake --build build --config Release diff --git a/CMakeLists.txt b/CMakeLists.txt index e292e4db..c869e538 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -208,6 +208,10 @@ if (WITH_PDF2HTMLEX) pdf2htmlex::pdf2htmlex poppler::poppler ) + target_compile_definitions(odr + PRIVATE + ODR_WITH_PDF2HTMLEX + ) endif () if (WITH_WVWARE) find_package(wvware REQUIRED) @@ -220,6 +224,10 @@ if (WITH_WVWARE) PRIVATE wvware::wvware ) + target_compile_definitions(odr + PRIVATE + ODR_WITH_WVWARE + ) endif () if (EXISTS "${PROJECT_SOURCE_DIR}/.git") diff --git a/src/odr/html.cpp b/src/odr/html.cpp index e769f53c..c619d33a 100644 --- a/src/odr/html.cpp +++ b/src/odr/html.cpp @@ -108,6 +108,7 @@ Html html::translate(const DocumentFile &document_file, const std::string &output_path, const HtmlConfig &config) { auto document_file_impl = document_file.impl(); +#ifdef ODR_WITH_WVWARE if (auto wv_document_file = std::dynamic_pointer_cast( document_file_impl)) { @@ -115,6 +116,7 @@ Html html::translate(const DocumentFile &document_file, return internal::html::translate_wvware_oldms_file(*wv_document_file, output_path, config); } +#endif return translate(document_file.document(), output_path, config); } @@ -123,12 +125,14 @@ Html html::translate(const PdfFile &pdf_file, const std::string &output_path, const HtmlConfig &config) { auto pdf_file_impl = pdf_file.impl(); +#ifdef ODR_WITH_PDF2HTMLEX if (auto poppler_pdf_file = std::dynamic_pointer_cast(pdf_file_impl)) { fs::create_directories(output_path); return internal::html::translate_poppler_pdf_file(*poppler_pdf_file, output_path, config); } +#endif return internal::html::translate_pdf_file(pdf_file, output_path, config); } From ec1f3b919b74378c902c9f128f3f4576c8ee4c2a Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Wed, 25 Dec 2024 19:08:33 +0100 Subject: [PATCH 22/28] more guards --- src/odr/internal/open_strategy.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/odr/internal/open_strategy.cpp b/src/odr/internal/open_strategy.cpp index de265920..335efd43 100644 --- a/src/odr/internal/open_strategy.cpp +++ b/src/odr/internal/open_strategy.cpp @@ -250,6 +250,7 @@ open_strategy::open_file(std::shared_ptr file, FileType as, } return nullptr; } +#ifdef ODR_WITH_WVWARE if (with == DecoderEngine::wvware) { try { auto memory_file = std::make_shared(*file); @@ -259,6 +260,7 @@ open_strategy::open_file(std::shared_ptr file, FileType as, } return nullptr; } +#endif return nullptr; } @@ -270,6 +272,7 @@ open_strategy::open_file(std::shared_ptr file, FileType as, } return nullptr; } +#ifdef ODR_WITH_PDF2HTMLEX if (with == DecoderEngine::poppler) { try { auto memory_file = std::make_shared(*file); @@ -278,6 +281,7 @@ open_strategy::open_file(std::shared_ptr file, FileType as, } return nullptr; } +#endif return nullptr; } From fa9a87ed469cbefe395d9ed9994341bf5e2a3528 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Wed, 25 Dec 2024 19:52:11 +0100 Subject: [PATCH 23/28] fix nullptr dereference --- src/odr/internal/html/pdf2htmlex_wrapper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/odr/internal/html/pdf2htmlex_wrapper.cpp b/src/odr/internal/html/pdf2htmlex_wrapper.cpp index 24abb8f8..2aca8c1e 100644 --- a/src/odr/internal/html/pdf2htmlex_wrapper.cpp +++ b/src/odr/internal/html/pdf2htmlex_wrapper.cpp @@ -119,7 +119,7 @@ Html html::translate_poppler_pdf_file(const PopplerPdfFile &pdf_file, !param.poppler_data_dir.empty() ? param.poppler_data_dir.c_str() : nullptr); - pdf2htmlEX::HTMLRenderer(nullptr, param).process(&pdf_doc); + pdf2htmlEX::HTMLRenderer(fontconfig_path, param).process(&pdf_doc); globalParams.reset(); From 38017180c979ee9bb4f59a0767cdaa1ae5efa405 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Thu, 26 Dec 2024 11:19:20 +0100 Subject: [PATCH 24/28] wvware figure name handling --- src/odr/internal/html/pdf2htmlex_wrapper.cpp | 2 + src/odr/internal/html/wvware_wrapper.cpp | 126 ++++++++----------- 2 files changed, 55 insertions(+), 73 deletions(-) diff --git a/src/odr/internal/html/pdf2htmlex_wrapper.cpp b/src/odr/internal/html/pdf2htmlex_wrapper.cpp index 2aca8c1e..e8856b15 100644 --- a/src/odr/internal/html/pdf2htmlex_wrapper.cpp +++ b/src/odr/internal/html/pdf2htmlex_wrapper.cpp @@ -119,6 +119,8 @@ Html html::translate_poppler_pdf_file(const PopplerPdfFile &pdf_file, !param.poppler_data_dir.empty() ? param.poppler_data_dir.c_str() : nullptr); + // TODO not sure what the `progPath` is used for. it cannot be `nullptr` + // TODO potentially just a cache dir? pdf2htmlEX::HTMLRenderer(fontconfig_path, param).process(&pdf_doc); globalParams.reset(); diff --git a/src/odr/internal/html/wvware_wrapper.cpp b/src/odr/internal/html/wvware_wrapper.cpp index cf66c702..4821c259 100644 --- a/src/odr/internal/html/wvware_wrapper.cpp +++ b/src/odr/internal/html/wvware_wrapper.cpp @@ -48,6 +48,8 @@ struct TranslationState : public expand_data { int message = 0; } special_char_handler_state = {}; + std::size_t figure_number = 0; + html::HtmlWriter out; }; @@ -168,7 +170,6 @@ thing and just put ... instead of this return 1; /* Mac specials (MV): */ - case 0xf020: out.out() << " "; return 1; @@ -218,10 +219,8 @@ void output_from_unicode(wvParseStruct *ps, std::uint16_t eachchar, auto *data = (TranslationState *)ps->userData; auto &out = data->out; - // TODO static - static char cached_outputtype[33]; /* Last outputtype */ - static GIConv g_iconv_handle = (GIConv)-1; /* Cached iconv descriptor */ - static int need_swapping; + GIConv g_iconv_handle = (GIConv)-1; + int need_swapping; gchar *ibuf, *obuf; std::size_t ibuflen, obuflen, len, count, i; std::uint8_t buffer[2], buffer2[5]; @@ -230,12 +229,7 @@ void output_from_unicode(wvParseStruct *ps, std::uint16_t eachchar, return; } - if ((g_iconv_handle == (GIConv)-1) || - strcmp(cached_outputtype, outputtype) != 0) { - if ((g_iconv_handle != (GIConv)-1)) { - g_iconv_close(g_iconv_handle); - } - + { g_iconv_handle = g_iconv_open(outputtype, "UCS-2"); if (g_iconv_handle == (GIConv)-1) { std::cerr << "g_iconv_open fail: " << errno @@ -244,9 +238,6 @@ void output_from_unicode(wvParseStruct *ps, std::uint16_t eachchar, return; } - /* safe to cache the output type here */ - strcpy(cached_outputtype, outputtype); - /* Determining if unicode biteorder is swapped (glibc < 2.2) */ need_swapping = 1; @@ -295,6 +286,9 @@ void output_from_unicode(wvParseStruct *ps, std::uint16_t eachchar, out.out() << buffer2[i]; } } + + // TODO iconv could be cached + { g_iconv_close(g_iconv_handle); } } /// Originally from `wvWare.c` `wvStrangeNoGraphicData` @@ -315,7 +309,7 @@ void strange_no_graphic_data(wvParseStruct *ps, int graphicstype) { /// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wvWare.c#L1239-L1287 /// simplified to HTML output void print_graphics(wvParseStruct *ps, int graphicstype, int width, int height, - char *source) { + const std::string &source) { // upstream converts to PNG, we just use the original format as the browser // should support them @@ -329,15 +323,15 @@ void print_graphics(wvParseStruct *ps, int graphicstype, int width, int height, << source << R"("/>
)"; } -int handle_bitmap(wvParseStruct * /*ps*/, char *name, BitmapBlip *bitmap) { +void handle_bitmap(wvParseStruct * /*ps*/, const std::string &name, + BitmapBlip *bitmap) { wvStream *pwv = bitmap->m_pvBits; FILE *fd = nullptr; std::size_t size = 0, i; - fd = fopen(name, "wb"); + fd = fopen(name.c_str(), "wb"); if (fd == nullptr) { - fprintf(stderr, "\nCannot open %s for writing\n", name); - exit(1); + throw std::runtime_error("Cannot open " + name + " file for writing"); } size = wvStream_size(pwv); wvStream_rewind(pwv); @@ -346,11 +340,10 @@ int handle_bitmap(wvParseStruct * /*ps*/, char *name, BitmapBlip *bitmap) { fputc(read_8ubit(pwv), fd); } fclose(fd); - wvTrace(("Name is %s\n", name)); - return 0; } -int handle_metafile(wvParseStruct * /*ps*/, char *name, MetaFileBlip *bitmap) { +int handle_metafile(wvParseStruct * /*ps*/, const char *name, + MetaFileBlip *bitmap) { wvStream *pwv = bitmap->m_pvBits; FILE *fd = nullptr; std::size_t size = 0, i; @@ -364,12 +357,14 @@ int handle_metafile(wvParseStruct * /*ps*/, char *name, MetaFileBlip *bitmap) { size = wvStream_size(pwv); wvStream_rewind(pwv); - if (bitmap->m_fCompression == msocompressionDeflate) + if (bitmap->m_fCompression == msocompressionDeflate) { decompressf = setdecom(); + } if (!decompressf) { - for (i = 0; i < size; i++) + for (i = 0; i < size; i++) { fputc(read_8ubit(pwv), fd); + } } else /* decompress here */ { FILE *tmp = tmpfile(); @@ -385,49 +380,51 @@ int handle_metafile(wvParseStruct * /*ps*/, char *name, MetaFileBlip *bitmap) { rewind(out); - for (i = 0; i < bitmap->m_cb; i++) + for (i = 0; i < bitmap->m_cb; i++) { fputc(fgetc(out), fd); + } fclose(out); } fclose(fd); - wvTrace(("Name is %s\n", name)); return 0; } -char *html_graphic(wvParseStruct *ps, Blip *blip) { - char *name; +std::string figure_name(wvParseStruct *ps) { + auto *data = (TranslationState *)ps->userData; + + std::size_t number = data->figure_number++; + std::string name = "figure" + std::to_string(number); + + return name; +} + +std::string html_graphic(wvParseStruct *ps, Blip *blip) { + std::string name; wvStream *fd; char test[3]; - // TODO handle figure name - name = nullptr; - if (name == nullptr) { - return nullptr; - } + name = figure_name(ps); /* temp hack to test older included bmps in word 6 and 7, should be wrapped in a modern escher strucure before getting to here, and then handled as normal */ - wvTrace(("type is %d\n", blip->type)); switch (blip->type) { case msoblipJPEG: case msoblipDIB: case msoblipPNG: fd = (blip->blip.bitmap.m_pvBits); test[2] = '\0'; - test[0] = read_8ubit(fd); + test[0] = (char)read_8ubit(fd); - test[1] = read_8ubit(fd); + test[1] = (char)read_8ubit(fd); wvStream_rewind(fd); if (!(strcmp(test, "BM"))) { - wvAppendStr(&name, ".bmp"); - if (0 != handle_bitmap(ps, name, &blip->blip.bitmap)) { - return nullptr; - } + name += ".bmp"; + handle_bitmap(ps, name, &blip->blip.bitmap); return name; } default: @@ -436,40 +433,28 @@ char *html_graphic(wvParseStruct *ps, Blip *blip) { switch (blip->type) { case msoblipWMF: - wvAppendStr(&name, ".wmf"); - if (0 != handle_metafile(ps, name, &blip->blip.metafile)) { - return nullptr; - } + name += ".wmf"; + handle_metafile(ps, name.c_str(), &blip->blip.metafile); break; case msoblipEMF: - wvAppendStr(&name, ".emf"); - if (0 != handle_metafile(ps, name, &blip->blip.metafile)) { - return nullptr; - } + name += ".emf"; + handle_metafile(ps, name.c_str(), &blip->blip.metafile); break; case msoblipPICT: - wvAppendStr(&name, ".pict"); - if (0 != handle_metafile(ps, name, &blip->blip.metafile)) { - return nullptr; - } + name += ".pict"; + handle_metafile(ps, name.c_str(), &blip->blip.metafile); break; case msoblipJPEG: - wvAppendStr(&name, ".jpg"); - if (0 != handle_bitmap(ps, name, &blip->blip.bitmap)) { - return nullptr; - } + name += ".jpg"; + handle_bitmap(ps, name.c_str(), &blip->blip.bitmap); break; case msoblipDIB: - wvAppendStr(&name, ".dib"); - if (0 != handle_bitmap(ps, name, &blip->blip.bitmap)) { - return nullptr; - } + name += ".dib"; + handle_bitmap(ps, name.c_str(), &blip->blip.bitmap); break; case msoblipPNG: - wvAppendStr(&name, ".png"); - if (0 != handle_bitmap(ps, name, &blip->blip.bitmap)) { - return nullptr; - } + name += ".png"; + handle_bitmap(ps, name.c_str(), &blip->blip.bitmap); break; } return name; @@ -702,7 +687,6 @@ int special_char_handler(wvParseStruct *ps, std::uint16_t eachchar, CHP *achp) { case 0x01: { wvStream *f; Blip blip; - char *name; long p = wvStream_tell(ps->data); if (achp->fOle2 != 0) { @@ -713,10 +697,9 @@ int special_char_handler(wvParseStruct *ps, std::uint16_t eachchar, CHP *achp) { wvGetPICF(wvQuerySupported(&ps->fib, nullptr), &picf, ps->data); f = picf.rgb; if (wv0x01(&blip, f, picf.lcb - picf.cbHeader) != 0) { - name = html_graphic(ps, &blip); + std::string name = html_graphic(ps, &blip); print_graphics(ps, 0x01, (int)wvTwipsToHPixels(picf.dxaGoal), (int)wvTwipsToVPixels(picf.dyaGoal), name); - wvFree(name); } else { strange_no_graphic_data(ps, 0x01); } @@ -726,7 +709,6 @@ int special_char_handler(wvParseStruct *ps, std::uint16_t eachchar, CHP *achp) { } case 0x08: { Blip blip; - char *name; if (wvQuerySupported(&ps->fib, nullptr) == WORD8) { if (ps->nooffspa > 0) { fspa = @@ -739,13 +721,12 @@ int special_char_handler(wvParseStruct *ps, std::uint16_t eachchar, CHP *achp) { data->props = fspa; if (wv0x08(&blip, (int)fspa->spid, ps) != 0) { - name = html_graphic(ps, &blip); + std::string name = html_graphic(ps, &blip); print_graphics( ps, 0x08, (int)wvTwipsToHPixels((short)(fspa->xaRight - fspa->xaLeft)), (int)wvTwipsToVPixels((short)(fspa->yaBottom - fspa->yaTop)), name); - wvFree(name); } else { strange_no_graphic_data(ps, 0x08); } @@ -753,9 +734,8 @@ int special_char_handler(wvParseStruct *ps, std::uint16_t eachchar, CHP *achp) { std::cerr << "nooffspa was <=0! Ignoring.\n"; } } else { - FDOA *fdoa; std::cerr << "pre word8 0x08 graphic, unsupported at the moment\n"; - fdoa = + FDOA *fdoa = wvGetFDOAFromCP(ps->currentcp, ps->fdoa, ps->fdoapos, ps->nooffdoa); data->props = fdoa; } @@ -823,7 +803,7 @@ Html html::translate_wvware_oldms_file( HtmlResourceLocator resourceLocator = local_resource_locator(output_path, config); - auto output_file_path = output_path + "/document.html"; + std::string output_file_path = output_path + "/document.html"; std::ofstream ostream(output_file_path, std::ios::out); if (!ostream.is_open()) { From 6e3eb56d0e73689c83abfdf3e4c09d193a269cf1 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Thu, 26 Dec 2024 12:46:12 +0100 Subject: [PATCH 25/28] fix wvware decryption; skip wvware decryption --- src/odr/internal/oldms_wvware/wvware_oldms_file.cpp | 6 +++--- test/src/html_output_test.cpp | 6 ++++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/odr/internal/oldms_wvware/wvware_oldms_file.cpp b/src/odr/internal/oldms_wvware/wvware_oldms_file.cpp index 711d16f2..a7bbdd55 100644 --- a/src/odr/internal/oldms_wvware/wvware_oldms_file.cpp +++ b/src/odr/internal/oldms_wvware/wvware_oldms_file.cpp @@ -115,10 +115,10 @@ bool WvWareLegacyMicrosoftFile::decrypt(const std::string &password) { bool success = false; if (m_parser_state->encryption_flag == WORD8) { - success = wvDecrypt97(&m_parser_state->ps); + success = wvDecrypt97(&m_parser_state->ps) == 0; } else if (m_parser_state->encryption_flag == WORD7 || m_parser_state->encryption_flag == WORD6) { - success = wvDecrypt95(&m_parser_state->ps); + success = wvDecrypt95(&m_parser_state->ps) == 0; } if (!success) { @@ -135,7 +135,7 @@ WvWareLegacyMicrosoftFile::document() const { } wvParseStruct &WvWareLegacyMicrosoftFile::parse_struct() const { - return const_cast(m_parser_state->ps); + return m_parser_state->ps; } } // namespace odr::internal diff --git a/test/src/html_output_test.cpp b/test/src/html_output_test.cpp index 5a813e85..75ca2bde 100644 --- a/test/src/html_output_test.cpp +++ b/test/src/html_output_test.cpp @@ -81,6 +81,12 @@ TEST_P(HtmlOutputTests, html_meta) { GTEST_SKIP(); } + // TODO check wvware decryption + if ((test_file.type == FileType::legacy_word_document) && + (engine == DecoderEngine::wvware)) { + GTEST_SKIP(); + } + if (file.is_document_file()) { DocumentFile document_file = file.document_file(); From c293feb3b0282fa4695a44da6138a00e8294d66c Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Thu, 26 Dec 2024 14:38:59 +0100 Subject: [PATCH 26/28] apply fixes --- test/docker/compare_output_server.sh | 2 +- test/scripts/html_render_diff.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/docker/compare_output_server.sh b/test/docker/compare_output_server.sh index 6a4651ee..bb08cd78 100755 --- a/test/docker/compare_output_server.sh +++ b/test/docker/compare_output_server.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash REF="test/data/reference-output/" -OBS="cmake-build-debug/test/output/" +OBS="cmake-build-relwithdebinfo/test/output/" DRIVER="firefox" # manually build the image diff --git a/test/scripts/html_render_diff.py b/test/scripts/html_render_diff.py index a945e78f..4879883f 100755 --- a/test/scripts/html_render_diff.py +++ b/test/scripts/html_render_diff.py @@ -29,7 +29,7 @@ def screenshot(browser, url): loaded_page_settling_time = 0 # Selenium doesn't like when we try to screenshot element of documents generated by pdf2htmlEX - if 'output-pdf2htmlEX' in url: + if 'poppler' in url: target_find_by = By.ID target = 'page-container' loaded_page_settling_time = 1 From 4e7f37da9bdfe8cd5d019ab8e06f3b475b8faf09 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Thu, 26 Dec 2024 14:53:32 +0100 Subject: [PATCH 27/28] doc; error checking --- test/docker/README.md | 5 +++++ test/docker/compare_output_server.sh | 10 ++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) create mode 100644 test/docker/README.md diff --git a/test/docker/README.md b/test/docker/README.md new file mode 100644 index 00000000..bc41a717 --- /dev/null +++ b/test/docker/README.md @@ -0,0 +1,5 @@ +# Manually build the image + +```bash +docker build --tag odr_core_test test/docker +``` diff --git a/test/docker/compare_output_server.sh b/test/docker/compare_output_server.sh index bb08cd78..57b2f2b0 100755 --- a/test/docker/compare_output_server.sh +++ b/test/docker/compare_output_server.sh @@ -4,8 +4,14 @@ REF="test/data/reference-output/" OBS="cmake-build-relwithdebinfo/test/output/" DRIVER="firefox" -# manually build the image -#docker build --tag odr_core_test test/docker +if [ ! -d "$REF" ]; then + echo "Reference output directory does not exist: $REF" + exit 1 +fi +if [ ! -d "$OBS" ]; then + echo "Observed output directory does not exist: $OBS" + exit 1 +fi docker run -ti \ -v $(pwd):/repo \ From ee09f78d764c0d21856bb264c7c0179a594d9931 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Thu, 26 Dec 2024 15:02:13 +0100 Subject: [PATCH 28/28] fix resource path in tests --- test/src/html_output_test.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/src/html_output_test.cpp b/test/src/html_output_test.cpp index 75ca2bde..7fc2f3bf 100644 --- a/test/src/html_output_test.cpp +++ b/test/src/html_output_test.cpp @@ -120,8 +120,11 @@ TEST_P(HtmlOutputTests, html_meta) { EXPECT_LT(0, fs::file_size(meta_output)); } - const std::string resource_path = - common::Path(output_path_prefix).parent().join("resources").string(); + const std::string resource_path = common::Path(output_path_prefix) + .parent() + .parent() + .join("resources") + .string(); OpenDocumentReader::copy_resources(resource_path); HtmlConfig config;