diff --git a/README.md b/README.md index 47429cc..26f3474 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,8 @@ var options = new MarkItDownOptions { OcrDataPath = "/usr/share/tesseract-ocr/5/tessdata", OcrLanguages = "eng", - PdfRasterDpi = 300 + OcrPsm = 6, + OcrUserDpi = 300 }; var converter = new MarkItDownConverter(options); var result = await converter.ConvertAsync("sample.pdf", "application/pdf"); @@ -77,7 +78,8 @@ Console.WriteLine(result.Markdown); * `OcrDataPath` – location of Tesseract language data (`TESSDATA_PREFIX`) * `OcrLanguages` – languages passed to Tesseract (e.g. `ita+eng`) -* `PdfRasterDpi` – DPI for rasterising PDFs during OCR fallback +* `OcrUserDpi` – DPI used for rasterisation and metadata +* `OcrPsm` – Tesseract page segmentation mode * `MinimumNativeWordThreshold` – minimum words before OCR is triggered * `NormalizeMarkdown` – toggle Markdig normalisation diff --git a/artifacts/validation/OCR/bench-ocr.json b/artifacts/validation/OCR/bench-ocr.json index 9e619df..20948d6 100644 --- a/artifacts/validation/OCR/bench-ocr.json +++ b/artifacts/validation/OCR/bench-ocr.json @@ -13,69 +13,69 @@ }, "files": [ { - "dataset": "FUNSD", - "file": "82250337_0338", - "cer_char": 0.011834319526627219, - "token_precision": 1, - "token_recall": 1, - "token_f1": 1, - "line_count_ref": 27, - "line_count_hyp": 27, - "line_f1": 1, - "timing_markitdownnet": 511, - "timing_pytesseract": 1845 + "dataset": "MARMOT", + "file": "10.1.1.1.2013_64", + "cer_char": 0.11729323308270677, + "token_precision": 0.7592592592592593, + "token_recall": 0.7522935779816514, + "token_f1": 0.7557603686635944, + "line_count_ref": 21, + "line_count_hyp": 21, + "line_f1": 0.47619047619047616, + "timing_markitdownnet": 521, + "timing_pytesseract": 0 }, { - "dataset": "FUNSD", - "file": "82200067_0069", - "cer_char": 0.020602218700475437, - "token_precision": 1, - "token_recall": 1, - "token_f1": 1, - "line_count_ref": 28, - "line_count_hyp": 28, - "line_f1": 1, - "timing_markitdownnet": 917, - "timing_pytesseract": 2055 + "dataset": "MARMOT", + "file": "10.1.1.1.2013_63", + "cer_char": 0.04133545310015898, + "token_precision": 0.8972332015810277, + "token_recall": 0.8937007874015748, + "token_f1": 0.8954635108481263, + "line_count_ref": 38, + "line_count_hyp": 38, + "line_f1": 0.5789473684210527, + "timing_markitdownnet": 861, + "timing_pytesseract": 0 }, { - "dataset": "FUNSD", - "file": "82092117", - "cer_char": 0.01079734219269103, - "token_precision": 1, - "token_recall": 1, - "token_f1": 1, - "line_count_ref": 32, - "line_count_hyp": 32, - "line_f1": 1, - "timing_markitdownnet": 681, - "timing_pytesseract": 1963 + "dataset": "MARMOT", + "file": "10.1.1.1.2006_3", + "cer_char": 0.04483471074380165, + "token_precision": 0.8641975308641975, + "token_recall": 0.8782936010037641, + "token_f1": 0.8711885500933415, + "line_count_ref": 60, + "line_count_hyp": 61, + "line_f1": 0.28099173553719003, + "timing_markitdownnet": 1524, + "timing_pytesseract": 0 }, { - "dataset": "FUNSD", - "file": "82251504", - "cer_char": 0.015776699029126214, - "token_precision": 1, - "token_recall": 1, - "token_f1": 1, - "line_count_ref": 28, - "line_count_hyp": 28, - "line_f1": 1, - "timing_markitdownnet": 680, - "timing_pytesseract": 2042 + "dataset": "MARMOT", + "file": "10.1.1.1.2014_4", + "cer_char": 0.11923556294142086, + "token_precision": 0.8087557603686636, + "token_recall": 0.8013698630136986, + "token_f1": 0.805045871559633, + "line_count_ref": 37, + "line_count_hyp": 36, + "line_f1": 0.273972602739726, + "timing_markitdownnet": 955, + "timing_pytesseract": 0 }, { "dataset": "SROIE2019", - "file": "X00016469670", - "cer_char": 0.09898107714701601, - "token_precision": 0.8468468468468469, - "token_recall": 0.8103448275862069, - "token_f1": 0.8281938325991189, - "line_count_ref": 29, - "line_count_hyp": 28, - "line_f1": 0.6315789473684211, - "timing_markitdownnet": 464, - "timing_pytesseract": 1928 + "file": "X51005200931", + "cer_char": 0.03384279475982533, + "token_precision": 0.891566265060241, + "token_recall": 0.8862275449101796, + "token_f1": 0.888888888888889, + "line_count_ref": 41, + "line_count_hyp": 41, + "line_f1": 0.6585365853658537, + "timing_markitdownnet": 535, + "timing_pytesseract": 0 }, { "dataset": "SROIE2019", @@ -87,99 +87,86 @@ "line_count_ref": 26, "line_count_hyp": 26, "line_f1": 0.7307692307692306, - "timing_markitdownnet": 408, - "timing_pytesseract": 1755 + "timing_markitdownnet": 380, + "timing_pytesseract": 0 }, { "dataset": "SROIE2019", - "file": "X51005230605", - "cer_char": 0.03929273084479371, - "token_precision": 0.9574468085106383, - "token_recall": 0.967741935483871, - "token_f1": 0.9625668449197862, - "line_count_ref": 25, - "line_count_hyp": 25, - "line_f1": 0.92, - "timing_markitdownnet": 458, - "timing_pytesseract": 1849 + "file": "X00016469670", + "cer_char": 0.09898107714701601, + "token_precision": 0.8468468468468469, + "token_recall": 0.8103448275862069, + "token_f1": 0.8281938325991189, + "line_count_ref": 29, + "line_count_hyp": 28, + "line_f1": 0.6315789473684211, + "timing_markitdownnet": 393, + "timing_pytesseract": 0 }, { "dataset": "SROIE2019", - "file": "X51005200931", - "cer_char": 0.03384279475982533, - "token_precision": 0.891566265060241, - "token_recall": 0.8862275449101796, - "token_f1": 0.888888888888889, - "line_count_ref": 41, - "line_count_hyp": 41, - "line_f1": 0.6585365853658537, - "timing_markitdownnet": 642, - "timing_pytesseract": 2056 - }, - { - "dataset": "ICDAR", - "file": "cTDaR_t00016", - "cer_char": 0.707796852646638, - "token_precision": 0.46397694524495675, - "token_recall": 0.3950920245398773, - "token_f1": 0.4267726971504307, - "line_count_ref": 51, - "line_count_hyp": 44, - "line_f1": 0, - "timing_markitdownnet": 3818, - "timing_pytesseract": 5629 + "file": "X51005230605", + "cer_char": 0.023575638506876228, + "token_precision": 0.978494623655914, + "token_recall": 0.978494623655914, + "token_f1": 0.978494623655914, + "line_count_ref": 25, + "line_count_hyp": 25, + "line_f1": 0.96, + "timing_markitdownnet": 344, + "timing_pytesseract": 0 }, { - "dataset": "ICDAR", - "file": "cTDaR_t00014", - "cer_char": 0.6630648330058939, - "token_precision": 0.36752136752136755, - "token_recall": 0.3173431734317343, - "token_f1": 0.3405940594059406, - "line_count_ref": 40, - "line_count_hyp": 36, - "line_f1": 0, - "timing_markitdownnet": 2935, - "timing_pytesseract": 4447 + "dataset": "PUBTABLES", + "file": "PMC1064078_table_6", + "cer_char": 0.3125, + "token_precision": 0.47058823529411764, + "token_recall": 0.4528301886792453, + "token_f1": 0.4615384615384615, + "line_count_ref": 7, + "line_count_hyp": 7, + "line_f1": 0.14285714285714285, + "timing_markitdownnet": 251, + "timing_pytesseract": 0 }, { - "dataset": "ICDAR", - "file": "cTDaR_t00080", - "cer_char": 0.7515723270440252, - "token_precision": 0.3641732283464567, - "token_recall": 0.3798767967145791, - "token_f1": 0.37185929648241206, - "line_count_ref": 50, - "line_count_hyp": 48, - "line_f1": 0, - "timing_markitdownnet": 3762, - "timing_pytesseract": 5772 + "dataset": "PUBTABLES", + "file": "PMC1064078_table_4", + "cer_char": 0.3125, + "token_precision": 0.4267515923566879, + "token_recall": 0.44666666666666666, + "token_f1": 0.4364820846905537, + "line_count_ref": 20, + "line_count_hyp": 21, + "line_f1": 0.04878048780487805, + "timing_markitdownnet": 472, + "timing_pytesseract": 0 }, { - "dataset": "ICDAR", - "file": "cTDaR_t00015", - "cer_char": 0.7768157768157768, - "token_precision": 0.3161904761904762, - "token_recall": 0.30018083182640143, - "token_f1": 0.30797773654916516, - "line_count_ref": 36, - "line_count_hyp": 36, - "line_f1": 0, - "timing_markitdownnet": 3387, - "timing_pytesseract": 4978 + "dataset": "PUBTABLES", + "file": "PMC1064082_table_1", + "cer_char": 0.06351183063511831, + "token_precision": 0.7819548872180451, + "token_recall": 0.7938931297709924, + "token_f1": 0.7878787878787878, + "line_count_ref": 26, + "line_count_hyp": 26, + "line_f1": 0.3076923076923077, + "timing_markitdownnet": 328, + "timing_pytesseract": 0 }, { "dataset": "PUBTABLES", - "file": "PMC1064078_table_0", - "cer_char": 0.03770739064856712, - "token_precision": 0.8282828282828283, - "token_recall": 0.8367346938775511, - "token_f1": 0.8324873096446701, - "line_count_ref": 10, - "line_count_hyp": 10, - "line_f1": 0.10000000000000002, - "timing_markitdownnet": 350, - "timing_pytesseract": 1813 + "file": "PMC1064078_table_2", + "cer_char": 0.1217008797653959, + "token_precision": 0.6788321167883211, + "token_recall": 0.6992481203007519, + "token_f1": 0.6888888888888889, + "line_count_ref": 18, + "line_count_hyp": 18, + "line_f1": 0.05555555555555555, + "timing_markitdownnet": 472, + "timing_pytesseract": 0 }, { "dataset": "PUBTABLES", @@ -191,21 +178,8 @@ "line_count_ref": 14, "line_count_hyp": 13, "line_f1": 0, - "timing_markitdownnet": 331, - "timing_pytesseract": 1844 - }, - { - "dataset": "PUBTABLES", - "file": "PMC1064078_table_2", - "cer_char": 0.1217008797653959, - "token_precision": 0.6788321167883211, - "token_recall": 0.6992481203007519, - "token_f1": 0.6888888888888889, - "line_count_ref": 18, - "line_count_hyp": 18, - "line_f1": 0.05555555555555555, - "timing_markitdownnet": 390, - "timing_pytesseract": 1791 + "timing_markitdownnet": 363, + "timing_pytesseract": 0 }, { "dataset": "PUBTABLES", @@ -217,8 +191,8 @@ "line_count_ref": 16, "line_count_hyp": 15, "line_f1": 0.06451612903225808, - "timing_markitdownnet": 522, - "timing_pytesseract": 2071 + "timing_markitdownnet": 502, + "timing_pytesseract": 0 }, { "dataset": "PUBTABLES", @@ -230,119 +204,139 @@ "line_count_ref": 5, "line_count_hyp": 3, "line_f1": 0, - "timing_markitdownnet": 116, - "timing_pytesseract": 1539 + "timing_markitdownnet": 131, + "timing_pytesseract": 0 }, { "dataset": "PUBTABLES", - "file": "PMC1064078_table_4", - "cer_char": 0.3125, - "token_precision": 0.4267515923566879, - "token_recall": 0.44666666666666666, - "token_f1": 0.4364820846905537, - "line_count_ref": 20, - "line_count_hyp": 21, - "line_f1": 0.04878048780487805, - "timing_markitdownnet": 489, - "timing_pytesseract": 1842 + "file": "PMC1064078_table_0", + "cer_char": 0.03770739064856712, + "token_precision": 0.8282828282828283, + "token_recall": 0.8367346938775511, + "token_f1": 0.8324873096446701, + "line_count_ref": 10, + "line_count_hyp": 10, + "line_f1": 0.10000000000000002, + "timing_markitdownnet": 316, + "timing_pytesseract": 0 }, { - "dataset": "PUBTABLES", - "file": "PMC1064078_table_6", - "cer_char": 0.3125, - "token_precision": 0.47058823529411764, - "token_recall": 0.4528301886792453, - "token_f1": 0.4615384615384615, - "line_count_ref": 7, - "line_count_hyp": 7, - "line_f1": 0.14285714285714285, - "timing_markitdownnet": 304, - "timing_pytesseract": 1864 + "dataset": "FUNSD", + "file": "82092117", + "cer_char": 0.01079734219269103, + "token_precision": 1, + "token_recall": 1, + "token_f1": 1, + "line_count_ref": 32, + "line_count_hyp": 32, + "line_f1": 1, + "timing_markitdownnet": 610, + "timing_pytesseract": 0 }, { - "dataset": "PUBTABLES", - "file": "PMC1064082_table_1", - "cer_char": 0.06351183063511831, - "token_precision": 0.7819548872180451, - "token_recall": 0.7938931297709924, - "token_f1": 0.7878787878787878, - "line_count_ref": 26, - "line_count_hyp": 26, - "line_f1": 0.3076923076923077, - "timing_markitdownnet": 385, - "timing_pytesseract": 1721 + "dataset": "FUNSD", + "file": "82250337_0338", + "cer_char": 0.011834319526627219, + "token_precision": 1, + "token_recall": 1, + "token_f1": 1, + "line_count_ref": 27, + "line_count_hyp": 27, + "line_f1": 1, + "timing_markitdownnet": 456, + "timing_pytesseract": 0 }, { - "dataset": "MARMOT", - "file": "10.1.1.1.2013_63", - "cer_char": 0.04133545310015898, - "token_precision": 0.8972332015810277, - "token_recall": 0.8937007874015748, - "token_f1": 0.8954635108481263, - "line_count_ref": 38, - "line_count_hyp": 38, - "line_f1": 0.5789473684210527, - "timing_markitdownnet": 3631, - "timing_pytesseract": 2309 + "dataset": "FUNSD", + "file": "82251504", + "cer_char": 0.015776699029126214, + "token_precision": 1, + "token_recall": 1, + "token_f1": 1, + "line_count_ref": 28, + "line_count_hyp": 28, + "line_f1": 1, + "timing_markitdownnet": 590, + "timing_pytesseract": 0 }, { - "dataset": "MARMOT", - "file": "10.1.1.1.2013_64", - "cer_char": 0.11729323308270677, - "token_precision": 0.7592592592592593, - "token_recall": 0.7522935779816514, - "token_f1": 0.7557603686635944, - "line_count_ref": 21, - "line_count_hyp": 21, - "line_f1": 0.47619047619047616, - "timing_markitdownnet": 626, - "timing_pytesseract": 1912 + "dataset": "FUNSD", + "file": "82200067_0069", + "cer_char": 0.020602218700475437, + "token_precision": 1, + "token_recall": 1, + "token_f1": 1, + "line_count_ref": 28, + "line_count_hyp": 28, + "line_f1": 1, + "timing_markitdownnet": 623, + "timing_pytesseract": 0 }, { - "dataset": "MARMOT", - "file": "10.1.1.1.2006_3", - "cer_char": 0.03347107438016529, - "token_precision": 0.8811013767209012, - "token_recall": 0.8833124215809285, - "token_f1": 0.8822055137844611, - "line_count_ref": 60, - "line_count_hyp": 60, - "line_f1": 0.31666666666666665, - "timing_markitdownnet": 8684, - "timing_pytesseract": 3309 + "dataset": "ICDAR", + "file": "cTDaR_t00015", + "cer_char": 0.7768157768157768, + "token_precision": 0.3161904761904762, + "token_recall": 0.30018083182640143, + "token_f1": 0.30797773654916516, + "line_count_ref": 36, + "line_count_hyp": 36, + "line_f1": 0, + "timing_markitdownnet": 3270, + "timing_pytesseract": 0 }, { - "dataset": "MARMOT", - "file": "10.1.1.1.2014_4", - "cer_char": 0.11923556294142086, - "token_precision": 0.8087557603686636, - "token_recall": 0.8013698630136986, - "token_f1": 0.805045871559633, - "line_count_ref": 37, + "dataset": "ICDAR", + "file": "cTDaR_t00014", + "cer_char": 0.6630648330058939, + "token_precision": 0.36752136752136755, + "token_recall": 0.3173431734317343, + "token_f1": 0.3405940594059406, + "line_count_ref": 40, "line_count_hyp": 36, - "line_f1": 0.273972602739726, - "timing_markitdownnet": 991, - "timing_pytesseract": 2616 + "line_f1": 0, + "timing_markitdownnet": 2821, + "timing_pytesseract": 0 + }, + { + "dataset": "ICDAR", + "file": "cTDaR_t00080", + "cer_char": 0.7647798742138365, + "token_precision": 0.3570057581573896, + "token_recall": 0.38193018480492813, + "token_f1": 0.36904761904761907, + "line_count_ref": 50, + "line_count_hyp": 48, + "line_f1": 0, + "timing_markitdownnet": 2448, + "timing_pytesseract": 0 + }, + { + "dataset": "ICDAR", + "file": "cTDaR_t00016", + "cer_char": 0.707796852646638, + "token_precision": 0.46397694524495675, + "token_recall": 0.3950920245398773, + "token_f1": 0.4267726971504307, + "line_count_ref": 51, + "line_count_hyp": 44, + "line_f1": 0, + "timing_markitdownnet": 3532, + "timing_pytesseract": 0 } ], "aggregate": { "by_dataset": { - "FUNSD": { - "cer_avg": 0.014752644862229975, - "token_f1_avg": 1, - "line_f1_avg": 1, + "MARMOT": { + "cer_avg": 0.08067473996702207, + "token_f1_avg": 0.8318645752911739, + "line_f1_avg": 0.40252554572211124, "n_files": 4 }, "SROIE2019": { - "cer_avg": 0.06309604032001578, - "token_f1_avg": 0.8816470854794994, - "line_f1_avg": 0.7352211908758763, - "n_files": 4 - }, - "ICDAR": { - "cer_avg": 0.7248124473780835, - "token_f1_avg": 0.36180094739698715, - "line_f1_avg": 0, + "cer_avg": 0.05916676723553641, + "token_f1_avg": 0.8856290301635314, + "line_f1_avg": 0.7452211908758763, "n_files": 4 }, "PUBTABLES": { @@ -351,17 +345,23 @@ "line_f1_avg": 0.08992520286776778, "n_files": 8 }, - "MARMOT": { - "cer_avg": 0.07783383087611297, - "token_f1_avg": 0.8346188162139537, - "line_f1_avg": 0.4114442785044804, + "FUNSD": { + "cer_avg": 0.014752644862229974, + "token_f1_avg": 1, + "line_f1_avg": 1, + "n_files": 4 + }, + "ICDAR": { + "cer_avg": 0.7281143341705363, + "token_f1_avg": 0.3610980280382889, + "line_f1_avg": 0, "n_files": 4 } }, "global": { - "cer_avg": 0.23638613783393658, - "token_f1_avg": 0.7035058050110908, - "line_f1_avg": 0.3877526458526486, + "cer_avg": 0.236755058300417, + "token_f1_avg": 0.7035932690781831, + "line_f1_avg": 0.38793285705558717, "n_files": 24 } } diff --git a/artifacts/validation/OCR/summary-ocr.md b/artifacts/validation/OCR/summary-ocr.md index c42a76e..9910f9d 100644 --- a/artifacts/validation/OCR/summary-ocr.md +++ b/artifacts/validation/OCR/summary-ocr.md @@ -2,20 +2,20 @@ ## Global | scope | CER | Token-F1 | line_F1 | n_files | -| Global | 0.2364 | 0.7035 | 0.3878 | 24 | +| Global | 0.2368 | 0.7036 | 0.3879 | 24 | ## By dataset | scope | CER | Token-F1 | line_F1 | n_files | -| FUNSD | 0.0148 | 1.0000 | 1.0000 | 4 | -| SROIE2019 | 0.0631 | 0.8816 | 0.7352 | 4 | -| ICDAR | 0.7248 | 0.3618 | 0.0000 | 4 | +| MARMOT | 0.0807 | 0.8319 | 0.4025 | 4 | +| SROIE2019 | 0.0592 | 0.8856 | 0.7452 | 4 | | PUBTABLES | 0.2689 | 0.5715 | 0.0899 | 8 | -| MARMOT | 0.0778 | 0.8346 | 0.4114 | 4 | +| FUNSD | 0.0148 | 1.0000 | 1.0000 | 4 | +| ICDAR | 0.7281 | 0.3611 | 0.0000 | 4 | ## Top-5 worst files | dataset/file | cer_char | token_f1 | line_f1 | note | | ICDAR/cTDaR_t00015 | 0.7768 | 0.3080 | 0.0000 | | -| ICDAR/cTDaR_t00080 | 0.7516 | 0.3719 | 0.0000 | | +| ICDAR/cTDaR_t00080 | 0.7648 | 0.3690 | 0.0000 | | | ICDAR/cTDaR_t00016 | 0.7078 | 0.4268 | 0.0000 | | | PUBTABLES/PMC1064082_table_0 | 0.6912 | 0.2000 | 0.0000 | | | ICDAR/cTDaR_t00014 | 0.6631 | 0.3406 | 0.0000 | | diff --git a/dataset/validation/_ocr/markitdownnet/ICDAR/cTDaR_t00080.txt b/dataset/validation/_ocr/markitdownnet/ICDAR/cTDaR_t00080.txt index 19d11c4..f1bdb7c 100644 --- a/dataset/validation/_ocr/markitdownnet/ICDAR/cTDaR_t00080.txt +++ b/dataset/validation/_ocr/markitdownnet/ICDAR/cTDaR_t00080.txt @@ -1,50 +1,52 @@ -Ba otr alorrire, mn Cathe L ares ; Fg 4 es Page Le - -LE as 3 SORE ST Sera Syprr stack aite Heya, -= oe ee 9 iy A ah ; , 2A: -fy |Peoee 2 7 : : rinse a -Too i en = base GLE, : / -pth Aye — ssf -eae ON I % ia. eee -nfs] sp ak f- OY te OO, ee MPEP — —— a see. -ye WO en |e eaten | faa! ae 1 Tie -om GA f i oe ae ) i ' Bas 7 -AU | v AYN tL LIP ru oY 2 spl 2m g 7 pe Sofi Hob ’ -hod} mins i/ Bf. V4 Gdns ie M Al ff // : -# | Lietigtia a. é wr di iS Luh if / J <7} . ff vy 4 a -five | Lu Tap, Lom ling Mlseelors thn ym ) Faas H -na PEW tan | Peep lil, » eae -Te os > 44 ref 2 » WF Fe ile ae ; ( ots e A q i -0 seal Dp on Pkeanaby Sai. | -5 eat ey IE ADR | WY, 24 . @ -: : [= | * a . oe BE bd sar jee v pod A a -be Xh > f Oy r VLnMNALBY wr REEL. “le a -| Gogh. Wy Pre! AT Ae anv pam -ay 4 prov. heb a Yow kath s ge fv br : 7 U/ zs : ;, -cl Lv" | Aix wher. _ 4 sey of ) | | — -oes Pi 8 de oe ee —— 4a -Vig ee by’ 4 pages ES oat | iB SS : : -tc 9 9t AS Z it padlox voa- fire tin soe a Wy eh -TEBE : eae a | -ee 4 a re hy is Z bp if A. Di i -Fees at = ade Let ooh flor ft |B ae -f { f j Jie f- . e 4 -ipcion aig eter | ae ae . £L,. ey -4 sub. by. “LYS Px : Lt 7 : : ae : i -re ae eee) >, -| ee acs Pflomnn heed yt nto ae i — - a ‘4 -ut 6 Us 4 Z Py 4 3 ef -YP hi oy , bey sit a “Ay s6Uy Sack ase: & a | -eae ter flay ay 2be / A -De nh ; 90 —— ! a -fa Hafli Rabat Bint SyhegYouyinem Be isto? @ -4 SES eo, ale 2 bm «fe 7 ee, .. i +Si! eee Se ee +cig 77zZ Cache Ja 2a rocaala Igy ie hae by > A387. ; +5 ayy : al > 7 < . -ff : Ree. -b2 1G. a Aid v7 = Me -i eithe Aero ee | oe -if La 3° VA , of " : J OH 3 Rese -fe te pee [tig | ~ a Aha AP IF7 : y PLD) sa cf y ’ ¥ Bo gh Z P PES. -| spilt Bisping nt) | oe -Za ak ae Fas 4 FA EB “ age Lb, : ane ee oe =e +soy iene a Ny AOP i +Tuy é eee | ope Ke . SiN aay oy i +Tie 0 ‘ one s Ai : Pua¢t7sz2t , * g thy ad —<— FF +i. es 4 fi : BPCLLS q +rin Pose f | . 7 5 ce eee ome oe +Pov". aunaber ss A Gls Moa pi pice TN CHa fanf Vit. ee > oe +Uhm Bry. eas On / 7, é : ry Ss oe haisdd 4 , f SU rine Gy § +J Wi | v ALY sess ALAS 14 esasdit act gay, nl $f i} Y) fe ‘ gp | : ; +{wd | eh Es / il ping “4 VA a rarer / h| '/ é /, 5 i f +mas TiO» Ie Ce. , 7 — ‘it —~Mlawbei. than yr ) a i ' +its pil csobetip bs, é par Arig Balbo 4’ | VG Baa) | +tay { ene —— A Serssta ot ppg ya a ae i - VA y &* ‘. +pvr. / 7 Urng~ if ay i +eas By oe TE PW GY yo . +if lon MAMA PAO. WA ‘ t, 24 fr a +Fie Porshe y gilyorud~ a. +IG BN eee eee He +bicNh > (Og {$+ W aap haee . A 1 = +LW ~ | 8: sess / bi Ba Mbt Dorn a! } “6 . Ts EBE AS Py dt i +/; | PO 7 wh o/s oe f fh) Pf, y 7 } fh i | +66 Bon i, Whine eg) prey A Tle +Us Sane io 0 a Sa ne ce a = +ee PANG 4 iil ee ee +fee on ULROS . | tind. j i oe Hi 2 | +fee 9h a LN testy p “2 ae, . oA st ieee en q +la Ii f ees gh ost a : (Poor 4 tl ; +be 4 sg i) * y he. : s - ae -poe col (eee xe ee Cy ag x ae es \ No newline at end of file +EH bef. rare fbdais fF? he > an +eee LOO ff i| +F on Mason | Ghar ghee Wn gle | | Diy og I + +pit Tied pele ue isto +pas Mave PT I pat on~ Gi BE a | +er ee Bee ree 9, Mg a +Feet ne AL. fed G8 fo + +oe af 4 +{ ne AF, ae : : E | +$= Gollah Phase) Slaw 8) F, 7 : | +tls pe §> Cof Oh 9.8 77 fj Vis A Pr ae ‘i be oe x +fe gs 1 aye ae? a 4 / ‘ © 4 : A (7 ; Ps of 58 are ; Sy 1 +be, qd. We Hla i” by (Oe ) ike +: : th cnl GRA Pe pe «ft {/. » See +MS yg Set OS ae) a a +D: Ritcarlefer fod en MR ne +* hs gee flor tae YU Cc ae eens MORSE 5 Nope eg +Eee See co ee ae + A eee Se eee Se eee ee Sei i \ No newline at end of file diff --git a/dataset/validation/_ocr/markitdownnet/MARMOT/10.1.1.1.2006_3.txt b/dataset/validation/_ocr/markitdownnet/MARMOT/10.1.1.1.2006_3.txt index 6cfe12e..d13ffc4 100644 --- a/dataset/validation/_ocr/markitdownnet/MARMOT/10.1.1.1.2006_3.txt +++ b/dataset/validation/_ocr/markitdownnet/MARMOT/10.1.1.1.2006_3.txt @@ -1,17 +1,17 @@ 09 these consume 0,99 CPU. That is, task3- misses -deadlines and the inverted pendulum! falls down, This -an explains the fact that the cost function in Figure 2 goes +a isaaise deadlines and the inverted pendulum! falls down, This +og EF c explains the fact thatthe cost function in Figure 2 goes . to infinite. Note that under RM, the task is not Bod schedulable -a. + EDP: EDF is « dynamic alorihm in open bop -Which assigns priorities to tasks according to their - +i + EDP: EDF is « dynamic alorihm in open bop +§ Which assigns priorities to tasks according to their +i, Beavis dedine, Une BG’ holler te schedulabiity condition is given by U = 1. For our -wg simulations, since U <1, the task set is schedulable +4 simulations, since U <1, the task set is schedulable and the three pendulums can be controlled as it can be A seen in Figure 2: the accumulated cost reaches a finite -a a oe a! value, which means that the deviation caused by each -ii Ail sal aed perturbation thet affected each of the three pendulums +a a oe i value, which means that the deviation caused by each +sig cic al ad perturbation thet affected each of the three pendulums could be adequately comected. ‘The performance with the difference in performance due 10 the we of sehieved by EDF is also given in Figure 2 in tems of different scheduling polices, which is the objective of our tie coat BocHonsreadting a valus oF W2754 at tho diff --git a/dataset/validation/_ocr/markitdownnet/SROIE2019/X51005230605.txt b/dataset/validation/_ocr/markitdownnet/SROIE2019/X51005230605.txt index 844908e..4a00d8c 100644 --- a/dataset/validation/_ocr/markitdownnet/SROIE2019/X51005230605.txt +++ b/dataset/validation/_ocr/markitdownnet/SROIE2019/X51005230605.txt @@ -1,7 +1,7 @@ PETRON BKT LANJAN SB ALSERKAM ENTERPRISE Tel: 03-6156 8757 Co No: 001083069-M -KM 458.4 BKT LANJAN UTARA, +KM 456.4 BKT LANJAN UTARA, L/RAYA UTARA SELATAN,SG BULOH 47000 SUNGAI BUL @@ -32,4 +32,4 @@ Use 3000 Petron Miles points to pay for RM45 Fuel -* Boo bia aes H \ No newline at end of file +* F eee H \ No newline at end of file diff --git a/src/MarkItDownNet/MarkItDownConverter.cs b/src/MarkItDownNet/MarkItDownConverter.cs index 011cb2a..7a43272 100644 --- a/src/MarkItDownNet/MarkItDownConverter.cs +++ b/src/MarkItDownNet/MarkItDownConverter.cs @@ -65,6 +65,9 @@ private MarkItDownResult ProcessPdf(string path, CancellationToken ct) var lines = new List(); var words = new List(); + if (_options.OcrForceRaster) + return ProcessPdfWithOcr(path, ct); + foreach (var page in document.GetPages()) { ct.ThrowIfCancellationRequested(); @@ -101,46 +104,105 @@ private MarkItDownResult ProcessPdfWithOcr(string path, CancellationToken ct) var lines = new List(); var words = new List(); + double? deskew = null; + int dpi = 0, depth = 0; + // Rasterize PDF into images using PDFtoImage - var renderOptions = new RenderOptions { Dpi = _options.PdfRasterDpi }; + var renderOptions = new RenderOptions { Dpi = _options.OcrUserDpi }; using var stream = File.OpenRead(path); foreach (var bitmap in Conversion.ToImages(stream, leaveOpen: false, password: null, renderOptions)) { ct.ThrowIfCancellationRequested(); using (bitmap) { - pages.Add(new Page(pages.Count + 1, bitmap.Width, bitmap.Height)); using var image = SKImage.FromBitmap(bitmap); using var data = image.Encode(SKEncodedImageFormat.Png, 100); - using var pix = Pix.LoadFromMemory(data.ToArray()); - var result = ProcessPix(pix, pages.Count, ct); - lines.AddRange(result.lines); - words.AddRange(result.words); + using var rawPix = Pix.LoadFromMemory(data.ToArray()); + var prep = PreparePix(rawPix, out var angle); + pages.Add(new Page(pages.Count + 1, prep.Width, prep.Height)); + var res = RunOcr(prep, pages.Count, ct); + prep.Dispose(); + lines.AddRange(res.lines); + words.AddRange(res.words); + deskew ??= angle; + dpi = prep.XRes; + depth = prep.Depth; } } var markdown = BuildMarkdown(lines); - return new MarkItDownResult(markdown, pages, lines, words); + return new MarkItDownResult(markdown, pages, lines, words, deskew, dpi, depth); } private MarkItDownResult ProcessImage(string path, CancellationToken ct) { - using var pix = Pix.LoadFromFile(path); - var (lines, words) = ProcessPix(pix, 1, ct); - var pages = new List { new Page(1, pix.Width, pix.Height) }; + using var rawPix = Pix.LoadFromFile(path); + var prep = PreparePix(rawPix, out var angle); + var (lines, words) = RunOcr(prep, 1, ct); + var pages = new List { new Page(1, prep.Width, prep.Height) }; var markdown = BuildMarkdown(lines); - return new MarkItDownResult(markdown, pages, lines, words); + var result = new MarkItDownResult(markdown, pages, lines, words, angle, prep.XRes, prep.Depth); + prep.Dispose(); + return result; + } + + private Pix PreparePix(Pix pix, out double? deskewAngle) + { + deskewAngle = null; + + if (_options.OcrSetDpiMetadata) + { + pix.XRes = _options.OcrUserDpi; + pix.YRes = _options.OcrUserDpi; + } + + if (pix.XRes < 220 || pix.YRes < 220) + { + float scale = (float)_options.OcrUserDpi / Math.Max(1, Math.Min(pix.XRes, pix.YRes)); + var scaled = pix.Scale(scale, scale); + pix.Dispose(); + pix = scaled; + pix.XRes = _options.OcrUserDpi; + pix.YRes = _options.OcrUserDpi; + } + + if (_options.OcrColorDepth == OcrColorDepth.Grayscale8bpp && pix.Depth != 8) + { + var gray = pix.ConvertRGBToGray(); + pix.Dispose(); + pix = gray; + } + + if (_options.OcrPreBinarize) + { + // Binarization disabled by default; placeholder for future use + } + + var deskewed = pix.Deskew(out var skew); + if (Math.Abs(skew.Angle) >= _options.OcrDeskewMinAngleDeg) + { + pix.Dispose(); + pix = deskewed; + deskewAngle = skew.Angle; + } + else + { + deskewed.Dispose(); + } + + return pix; } - private (List lines, List words) ProcessPix(Pix pix, int pageNumber, CancellationToken ct) + private (List lines, List words) RunOcr(Pix pix, int pageNumber, CancellationToken ct) { var lines = new List(); var words = new List(); using var engine = new TesseractEngine( _options.OcrDataPath ?? string.Empty, _options.OcrLanguages, - EngineMode.LstmOnly); - engine.DefaultPageSegMode = _options.PageSegMode; + _options.OcrOem); + engine.DefaultPageSegMode = (PageSegMode)_options.OcrPsm; + engine.SetVariable("num_threads", _options.OcrThreads.ToString()); using var page = engine.Process(pix); using var iter = page.GetIterator(); iter.Begin(); diff --git a/src/MarkItDownNet/MarkItDownOptions.cs b/src/MarkItDownNet/MarkItDownOptions.cs index e1b86d7..299666b 100644 --- a/src/MarkItDownNet/MarkItDownOptions.cs +++ b/src/MarkItDownNet/MarkItDownOptions.cs @@ -2,6 +2,12 @@ namespace MarkItDownNet; using Tesseract; +public enum OcrColorDepth +{ + Grayscale8bpp, + Color32bpp +} + /// Runtime options for conversion. public class MarkItDownOptions { @@ -11,11 +17,32 @@ public class MarkItDownOptions /// Languages for OCR, e.g. "eng" or "ita+eng". public string OcrLanguages { get; set; } = "eng"; - /// Page segmentation mode used by Tesseract. - public PageSegMode PageSegMode { get; set; } = PageSegMode.SingleBlock; + /// User provided DPI metadata and rendering target. + public int OcrUserDpi { get; set; } = 300; + + /// Tesseract page segmentation mode (PSM). + public int OcrPsm { get; set; } = 6; + + /// Tesseract engine mode (OEM). + public EngineMode OcrOem { get; set; } = EngineMode.LstmOnly; + + /// Number of threads to use for Tesseract OCR. + public int OcrThreads { get; set; } = 1; + + /// Force PDF rasterization even when native text is available. + public bool OcrForceRaster { get; set; } = true; + + /// Apply binarization before OCR. + public bool OcrPreBinarize { get; set; } = false; + + /// Minimum deskew angle in degrees to trigger rotation. + public double OcrDeskewMinAngleDeg { get; set; } = 2.0; + + /// Color depth for OCR input. + public OcrColorDepth OcrColorDepth { get; set; } = OcrColorDepth.Grayscale8bpp; - /// DPI used when rasterizing PDFs for OCR fallback. - public int PdfRasterDpi { get; set; } = 300; + /// Set DPI metadata on images passed to Tesseract. + public bool OcrSetDpiMetadata { get; set; } = true; /// Minimum number of native words required before falling back to OCR. public int MinimumNativeWordThreshold { get; set; } = 1; diff --git a/src/MarkItDownNet/Models.cs b/src/MarkItDownNet/Models.cs index 0c75ac7..b636ddc 100644 --- a/src/MarkItDownNet/Models.cs +++ b/src/MarkItDownNet/Models.cs @@ -28,4 +28,7 @@ public record MarkItDownResult( string Markdown, IReadOnlyList Pages, IReadOnlyList Lines, - IReadOnlyList Words); + IReadOnlyList Words, + double? DeskewAngleDeg = null, + int Dpi = 0, + int ColorDepth = 0); diff --git a/tests/MarkItDownNet.Tests/ConversionTests.cs b/tests/MarkItDownNet.Tests/ConversionTests.cs index 35e0bf8..be3bfe1 100644 --- a/tests/MarkItDownNet.Tests/ConversionTests.cs +++ b/tests/MarkItDownNet.Tests/ConversionTests.cs @@ -18,7 +18,7 @@ public async Task PdfWithDigitalTextProducesMarkdownAndWords() page.AddText("Hello world", 12, new PdfPoint(10, 150), font); await File.WriteAllBytesAsync(tmp, builder.Build()); - var converter = new MarkItDownConverter(new MarkItDownOptions { NormalizeMarkdown = false }); + var converter = new MarkItDownConverter(new MarkItDownOptions { NormalizeMarkdown = false, OcrForceRaster = false }); var result = await converter.ConvertAsync(tmp, "application/pdf"); Assert.False(string.IsNullOrWhiteSpace(result.Markdown)); diff --git a/tools/OcrBench/Program.cs b/tools/OcrBench/Program.cs index be3f5e2..d65a246 100644 --- a/tools/OcrBench/Program.cs +++ b/tools/OcrBench/Program.cs @@ -38,26 +38,41 @@ static void Extract(Dictionary o) var inputDir = o["--input-dir"]; var outDir = o["--out-dir"]; var langs = o["--langs"]; - var psm = o["--psm"]; - var threads = o["--threads"]; - var python = o["--python-exe"]; - - Environment.SetEnvironmentVariable("OMP_THREAD_LIMIT", threads); - - if (Directory.Exists(outDir)) Directory.Delete(outDir, true); - Directory.CreateDirectory(Path.Combine(outDir, "markitdownnet")); - Directory.CreateDirectory(Path.Combine(outDir, "pytesseract")); + var psm = int.Parse(o["--psm"]); + var threads = int.Parse(o["--threads"]); + var python = o.GetValueOrDefault("--python-exe", "python3"); + var refresh = o.ContainsKey("--refresh") ? o["--refresh"] : "markitdownnet"; + + Environment.SetEnvironmentVariable("OMP_THREAD_LIMIT", threads.ToString()); + + Directory.CreateDirectory(outDir); + var refreshSet = new HashSet(refresh.Split(',', StringSplitOptions.RemoveEmptyEntries)); + var refreshMark = refreshSet.Contains("markitdownnet"); + var refreshPy = refreshSet.Contains("pytesseract"); + if (refreshMark) + { + var mdDir = Path.Combine(outDir, "markitdownnet"); + if (Directory.Exists(mdDir)) Directory.Delete(mdDir, true); + Directory.CreateDirectory(mdDir); + } + if (refreshPy) + { + var pyDir = Path.Combine(outDir, "pytesseract"); + if (Directory.Exists(pyDir)) Directory.Delete(pyDir, true); + Directory.CreateDirectory(pyDir); + } var options = new MarkItDownOptions { OcrLanguages = langs, OcrDataPath = "/usr/share/tesseract-ocr/5/tessdata", - PageSegMode = (Tesseract.PageSegMode)6, + OcrPsm = psm, + OcrThreads = threads, NormalizeMarkdown = false, DetectBulletLists = false, MergeLines = false, MinimumNativeWordThreshold = int.MaxValue, - PdfRasterDpi = 300 + OcrForceRaster = true }; var converter = new MarkItDownConverter(options); @@ -77,33 +92,44 @@ static void Extract(Dictionary o) var rel = dataset + "/" + name; var images = GetImages(file).ToList(); - var sw = Stopwatch.StartNew(); - var textMark = OcrMark(converter, images); - sw.Stop(); - var tMark = sw.ElapsedMilliseconds; - var outMarkDir = Path.Combine(outDir, "markitdownnet", dataset); - Directory.CreateDirectory(outMarkDir); - File.WriteAllText(Path.Combine(outMarkDir, name), textMark); - totalMark += tMark; - Console.WriteLine($"{dataset} | {Path.GetFileName(file)} | markitdownnet | {tMark} ms"); - - sw.Restart(); - var textPy = OcrPy(images, python, langs, psm); - sw.Stop(); - var tPy = sw.ElapsedMilliseconds; - var outPyDir = Path.Combine(outDir, "pytesseract", dataset); - Directory.CreateDirectory(outPyDir); - File.WriteAllText(Path.Combine(outPyDir, name), textPy); - totalPy += tPy; - Console.WriteLine($"{dataset} | {Path.GetFileName(file)} | pytesseract | {tPy} ms"); - - timings[rel] = new Dictionary { { "markitdownnet", tMark }, { "pytesseract", tPy } }; + long tMark = 0, tPy = 0; + if (refreshMark) + { + var sw = Stopwatch.StartNew(); + var (textMark, dpiEff, depth, angle) = OcrMark(converter, images); + sw.Stop(); + tMark = sw.ElapsedMilliseconds; + var outMarkDir = Path.Combine(outDir, "markitdownnet", dataset); + Directory.CreateDirectory(outMarkDir); + File.WriteAllText(Path.Combine(outMarkDir, name), textMark); + totalMark += tMark; + var angleStr = angle.HasValue ? angle.Value.ToString("F2") : "skipped"; + Console.WriteLine($"{dataset}/{Path.GetFileName(file)} | dpi {dpiEff} | depth {depth} | deskew {angleStr} | psm {psm} | oem {options.OcrOem} | {tMark} ms"); + } + + if (refreshPy) + { + var sw = Stopwatch.StartNew(); + var textPy = OcrPy(images, python, langs, psm.ToString()); + sw.Stop(); + tPy = sw.ElapsedMilliseconds; + var outPyDir = Path.Combine(outDir, "pytesseract", dataset); + Directory.CreateDirectory(outPyDir); + File.WriteAllText(Path.Combine(outPyDir, name), textPy); + totalPy += tPy; + Console.WriteLine($"{dataset} | {Path.GetFileName(file)} | pytesseract | {tPy} ms"); + } + + var dict = new Dictionary(); + if (refreshMark) dict["markitdownnet"] = tMark; + if (refreshPy) dict["pytesseract"] = tPy; + timings[rel] = dict; } } File.WriteAllText(Path.Combine(outDir, "timings.json"), JsonSerializer.Serialize(timings, new JsonSerializerOptions { WriteIndented = true })); - Console.WriteLine($"TOTAL markitdownnet {totalMark} ms"); - Console.WriteLine($"TOTAL pytesseract {totalPy} ms"); + if (refreshMark) Console.WriteLine($"TOTAL markitdownnet {totalMark} ms"); + if (refreshPy) Console.WriteLine($"TOTAL pytesseract {totalPy} ms"); } static IEnumerable GetImages(string path) @@ -129,15 +155,19 @@ static IEnumerable GetImages(string path) } } -static string OcrMark(MarkItDownConverter conv, IEnumerable images) +static (string text, int dpi, int depth, double? angle) OcrMark(MarkItDownConverter conv, IEnumerable images) { var sb = new StringBuilder(); + int dpi = 0, depth = 0; double? angle = null; foreach (var img in images) { var res = conv.ConvertAsync(img, GetMime(img)).Result; sb.AppendLine(res.Markdown.Trim()); + angle ??= res.DeskewAngleDeg; + if (dpi == 0) dpi = res.Dpi; + if (depth == 0) depth = res.ColorDepth; } - return sb.ToString().Trim(); + return (sb.ToString().Trim(), dpi, depth, angle); } static string OcrPy(IEnumerable images, string py, string lang, string psm) @@ -216,8 +246,8 @@ static void Compare(Dictionary o) line_count_ref = lcRef, line_count_hyp = lcHyp, line_f1 = lf, - timing_markitdownnet = t?["markitdownnet"] ?? 0, - timing_pytesseract = t?["pytesseract"] ?? 0 + timing_markitdownnet = t != null && t.TryGetValue("markitdownnet", out var tm) ? tm : 0, + timing_pytesseract = t != null && t.TryGetValue("pytesseract", out var tpVal) ? tpVal : 0 }); } } @@ -281,6 +311,20 @@ static void Compare(Dictionary o) sb.AppendLine($"- {kv.Key}: {kv.Value}"); Directory.CreateDirectory(Path.GetDirectoryName(outMd)!); File.WriteAllText(outMd, sb.ToString()); + + Console.WriteLine($"GLOBAL Token-F1 {global.token_f1_avg:F4} line_F1 {global.line_f1_avg:F4}"); + foreach (var kv in byDataset) + Console.WriteLine($"{kv.Key} Token-F1 {kv.Value.token_f1_avg:F4} line_F1 {kv.Value.line_f1_avg:F4}"); + + int exitCode = 0; + if (global.token_f1_avg < 0.80 || global.line_f1_avg < 0.50) + exitCode = 1; + foreach (var kv in byDataset) + if ((kv.Key.Equals("ICDAR", StringComparison.OrdinalIgnoreCase) || kv.Key.Equals("PUBTABLES", StringComparison.OrdinalIgnoreCase)) && + (kv.Value.token_f1_avg < 0.80 || kv.Value.line_f1_avg < 0.50)) + exitCode = 1; + + Environment.Exit(exitCode); } static string Normalize(string text)