-
Notifications
You must be signed in to change notification settings - Fork 45
perf: pdfminer to fitz in convert#630 #685
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
120800c
09327be
b6bc3cf
52815fd
9a38365
2b3e77a
c792c09
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,7 @@ | ||
| from pathlib import Path | ||
| from typing import List, Optional, Tuple, Union | ||
| from typing import List, Optional | ||
|
|
||
| import fitz | ||
| from pdfminer.high_level import extract_pages | ||
| from pdfminer.layout import ( | ||
| LTAnno, | ||
|
|
@@ -101,3 +102,53 @@ def _create_neighbor_bbox(self, current_bbox: Bbox) -> Bbox: | |
| new_x0, new_y0 = x1, y0 | ||
| new_x1, new_y1 = (x1 - x0) + x1, y1 | ||
| return new_x0, new_y0, new_x1, new_y1 | ||
|
|
||
|
|
||
| class PlainPDFToBadgerdocTokensConverterPytz: | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. As I understand it, the PlainPDFToBadgerdocTokensConverter class needs to be replaced by this one. |
||
| def __init__(self) -> None: | ||
| self.offset = 0 | ||
| self.page_size: Optional[PageSize] = None | ||
|
|
||
| def _convert_span(self, span): # type: ignore | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It is necessary to remove all type ignores. To make determining the types easier, you can add a breakpoint here and, in debug mode, find out the type of this variable. |
||
| tokens = [] | ||
| for char in span.get("chars"): | ||
| token = BadgerdocToken( | ||
| bbox=char.get("bbox"), | ||
| text=char.get("c"), | ||
| offset=Offset(begin=self.offset, end=self.offset + 1), | ||
| ) | ||
| tokens.append(token) | ||
| self.offset += 1 | ||
| return tokens | ||
|
|
||
| def _convert_line(self, line): # type: ignore | ||
| tokens = [] | ||
| for span in line.get("spans"): | ||
| tokens.extend(self._convert_span(span)) # type: ignore | ||
| return tokens | ||
|
|
||
| def _convert_element(self, element): # type: ignore | ||
| tokens = [] | ||
| for line in element.get("lines"): | ||
| tokens.extend(self._convert_line(line)) # type: ignore | ||
| return tokens | ||
|
|
||
def _convert_page(self, page):  # type: ignore
    """Convert one page into tokens and record the page size.

    Args:
        page: An object exposing ``get_textpage()``; presumably a
            ``fitz.Page`` as yielded by ``doc.pages()`` in ``convert``.

    Returns:
        The tokens of every block on the page, in block order.

    Side effects:
        Overwrites ``self.page_size`` with the width and height reported
        by the extracted page dictionary.
    """
    # Build the TextPage and extract once: the raw dictionary carries the
    # page "width" and "height" next to its "blocks", so a separate
    # extractDICT() pass just for the size is redundant work.
    raw_page = page.get_textpage().extractRAWDICT()
    self.page_size = PageSize(
        width=raw_page.get("width"), height=raw_page.get("height")
    )
    tokens = []
    # Tolerate a missing "blocks" entry instead of iterating over None.
    for block in raw_page.get("blocks") or ():
        tokens.extend(self._convert_element(block))  # type: ignore
    return tokens
|
|
||
def convert(self, plain_pdf: Path) -> List[Page]:
    """Convert every page of a PDF file into a ``Page`` of tokens.

    Args:
        plain_pdf: Path of the PDF document to open with ``fitz``.

    Returns:
        One ``Page`` per converted page, numbered from 1 in document
        order. A page is left out when ``self.page_size`` is falsy after
        it has been converted.
    """
    # NOTE(review): self.offset is not reset in this method, so offsets
    # keep counting across pages and across repeated calls on the same
    # instance -- confirm that this is intended.
    converted = []
    with fitz.Document(plain_pdf) as document:
        for page_num, pdf_page in enumerate(document.pages(), start=1):
            # _convert_page also refreshes self.page_size, so it must run
            # before the size is checked and used below.
            page_tokens = self._convert_page(pdf_page)  # type: ignore
            if self.page_size:
                converted.append(
                    Page(
                        page_num=page_num,
                        objs=page_tokens,
                        size=self.page_size,
                    )
                )
    return converted
Uh oh!
There was an error while loading. Please reload this page.