diff --git a/README.md b/README.md index 0d506bd..a8741bb 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Una descrizione del tool di confronto con il dataset FUNSD, il report delle diff ## Pipeline ``` -PDF -> PdfPig text extraction -> (optional) PDFtoImage rasterisation -> Tesseract OCR +PDF -> UglyToad.PdfPig text extraction -> (optional) PDFtoImage rasterisation -> Tesseract OCR Image -> Tesseract OCR | v diff --git a/src/MarkItDownNet/MarkItDownConverter.cs b/src/MarkItDownNet/MarkItDownConverter.cs index 53a57ae..6bb964f 100644 --- a/src/MarkItDownNet/MarkItDownConverter.cs +++ b/src/MarkItDownNet/MarkItDownConverter.cs @@ -6,7 +6,9 @@ using System.Threading.Tasks; using Markdig; using Serilog; -using Tesseract; +using TesseractOCR; +using TesseractOCR.Enums; +using TesseractOCR.Pix; using UglyToad.PdfPig; using UglyToad.PdfPig.Content; using PDFtoImage; @@ -99,7 +101,7 @@ private MarkItDownResult ProcessPdfWithOcr(string path, CancellationToken ct) pages.Add(new Page(pages.Count + 1, bitmap.Width, bitmap.Height)); using var image = SKImage.FromBitmap(bitmap); using var data = image.Encode(SKEncodedImageFormat.Png, 100); - using var pix = Pix.LoadFromMemory(data.ToArray()); + using var pix = Image.LoadFromMemory(data.ToArray()); var result = ProcessPix(pix, pages.Count, ct); lines.AddRange(result.lines); words.AddRange(result.words); @@ -112,50 +114,45 @@ private MarkItDownResult ProcessPdfWithOcr(string path, CancellationToken ct) private MarkItDownResult ProcessImage(string path, CancellationToken ct) { - using var pix = Pix.LoadFromFile(path); + using var pix = Image.LoadFromFile(path); var (lines, words) = ProcessPix(pix, 1, ct); var pages = new List { new Page(1, pix.Width, pix.Height) }; var markdown = BuildMarkdown(lines); return new MarkItDownResult(markdown, pages, lines, words); } - private (List lines, List words) ProcessPix(Pix pix, int pageNumber, CancellationToken ct) + private (List lines, List words) ProcessPix(Image pix, int pageNumber, CancellationToken ct) { var lines = new List(); var words = new List(); - using var engine = new TesseractEngine(_options.OcrDataPath ?? string.Empty, _options.OcrLanguages, EngineMode.Default); + using var engine = new Engine(_options.OcrDataPath ?? string.Empty, _options.OcrLanguages, EngineMode.Default); using var page = engine.Process(pix); - var iterator = page.GetIterator(); - iterator.Begin(); - do + foreach (var block in page.Layout) { - ct.ThrowIfCancellationRequested(); - if (iterator.TryGetBoundingBox(PageIteratorLevel.TextLine, out var rect)) - { - var text = iterator.GetText(PageIteratorLevel.TextLine)?.Trim() ?? string.Empty; - if (!string.IsNullOrEmpty(text)) - { - lines.Add(new Line(pageNumber, text, Normalize(rect, pix.Width, pix.Height))); - } - } - } - while (iterator.Next(PageIteratorLevel.TextLine)); - - iterator.Begin(); - do - { - ct.ThrowIfCancellationRequested(); - if (iterator.TryGetBoundingBox(PageIteratorLevel.Word, out var rect)) + foreach (var paragraph in block.Paragraphs) { - var text = iterator.GetText(PageIteratorLevel.Word)?.Trim() ?? string.Empty; - if (!string.IsNullOrEmpty(text)) + foreach (var textLine in paragraph.TextLines) { - words.Add(new Word(pageNumber, text, Normalize(rect, pix.Width, pix.Height))); + ct.ThrowIfCancellationRequested(); + var text = textLine.Text?.Trim() ?? string.Empty; + if (!string.IsNullOrEmpty(text) && textLine.BoundingBox is Rect lineRect) + { + lines.Add(new Line(pageNumber, text, Normalize(lineRect, pix.Width, pix.Height))); + } + + foreach (var word in textLine.Words) + { + ct.ThrowIfCancellationRequested(); + var wText = word.Text?.Trim() ?? string.Empty; + if (!string.IsNullOrEmpty(wText) && word.BoundingBox is Rect wordRect) + { + words.Add(new Word(pageNumber, wText, Normalize(wordRect, pix.Width, pix.Height))); + } + } } } } - while (iterator.Next(PageIteratorLevel.Word)); return (lines, words); } diff --git a/src/MarkItDownNet/MarkItDownNet.csproj b/src/MarkItDownNet/MarkItDownNet.csproj index c4f1536..d721deb 100644 --- a/src/MarkItDownNet/MarkItDownNet.csproj +++ b/src/MarkItDownNet/MarkItDownNet.csproj @@ -22,10 +22,18 @@ libtesseract.so.5 PreserveNewest + + libtesseract.so + PreserveNewest + liblept.so.5 PreserveNewest + + libleptonica.so + PreserveNewest + libdl.so PreserveNewest diff --git a/src/MarkItDownNet/TesseractOCR/x64/libleptonica.so b/src/MarkItDownNet/TesseractOCR/x64/libleptonica.so new file mode 120000 index 0000000..b962e75 --- /dev/null +++ b/src/MarkItDownNet/TesseractOCR/x64/libleptonica.so @@ -0,0 +1 @@ +liblept.so.5 \ No newline at end of file diff --git a/src/MarkItDownNet/TesseractOCR/x64/libtesseract.so b/src/MarkItDownNet/TesseractOCR/x64/libtesseract.so new file mode 120000 index 0000000..68a9149 --- /dev/null +++ b/src/MarkItDownNet/TesseractOCR/x64/libtesseract.so @@ -0,0 +1 @@ +libtesseract.so.5 \ No newline at end of file