Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 52 additions & 1 deletion convert/convert/converters/pdf/pdf_converter.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pathlib import Path
from typing import List, Optional, Tuple, Union
from typing import List, Optional

import fitz
from pdfminer.high_level import extract_pages
from pdfminer.layout import (
LTAnno,
Expand Down Expand Up @@ -101,3 +102,53 @@ def _create_neighbor_bbox(self, current_bbox: Bbox) -> Bbox:
new_x0, new_y0 = x1, y0
new_x1, new_y1 = (x1 - x0) + x1, y1
return new_x0, new_y0, new_x1, new_y1


class PlainPDFToBadgerdocTokensConverterPytz:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I understand it, the PlainPDFToBadgerdocTokensConverter class needs to be replaced by this one.

def __init__(self) -> None:
self.offset = 0
self.page_size: Optional[PageSize] = None

# Convert one span dictionary into a list of per-character tokens.
#
# For every entry in the span's "chars" list a BadgerdocToken is built whose
# bbox comes from the entry's "bbox" key and whose text comes from its "c"
# key. Each token gets a one-character-wide Offset that starts at the
# converter's running self.offset. The list of tokens is returned.
def _convert_span(self, span): # type: ignore
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is necessary to remove all type ignores. In order to simplify the process of determining types, you can add a breakpoint here and in debug mode, find out the type of this variable.

tokens = []
for char in span.get("chars"):
token = BadgerdocToken(
bbox=char.get("bbox"),
text=char.get("c"),
offset=Offset(begin=self.offset, end=self.offset + 1),
)
tokens.append(token)
# Advance the running offset so the next character starts where this one ended.
self.offset += 1
return tokens

def _convert_line(self, line): # type: ignore
tokens = []
for span in line.get("spans"):
tokens.extend(self._convert_span(span)) # type: ignore
return tokens

def _convert_element(self, element): # type: ignore
tokens = []
for line in element.get("lines"):
tokens.extend(self._convert_line(line)) # type: ignore
return tokens

def _convert_page(self, page):  # type: ignore
    """Convert one page into tokens and record its size.

    The page text is extracted once as a raw dictionary. Its ``"width"``
    and ``"height"`` entries are stored in ``self.page_size`` and every
    entry of ``"blocks"`` is converted with ``_convert_element``.

    Returns the tokens of all blocks, in block order.
    """
    # One extraction provides both the page dimensions and the blocks, so
    # a second text page does not have to be built just for width/height.
    raw_text = page.get_textpage().extractRAWDICT()
    self.page_size = PageSize(
        width=raw_text.get("width"), height=raw_text.get("height")
    )
    tokens = []
    for block in raw_text.get("blocks"):
        tokens.extend(self._convert_element(block))  # type: ignore
    return tokens

def convert(self, plain_pdf: Path) -> List[Page]:
    """Convert every page of the PDF at ``plain_pdf`` into a ``Page``.

    Pages are numbered from 1. Each page is converted with
    ``_convert_page``; a ``Page`` is only added when ``self.page_size``
    is truthy after that conversion.
    """
    converted = []
    with fitz.Document(plain_pdf) as document:
        for number, pdf_page in enumerate(document.pages(), start=1):
            page_tokens = self._convert_page(pdf_page)  # type: ignore
            if self.page_size:
                converted.append(
                    Page(
                        page_num=number,
                        objs=page_tokens,
                        size=self.page_size,
                    )
                )
    return converted
4 changes: 2 additions & 2 deletions convert/convert/converters/pdf/pdf_to_badgerdoc_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from convert.converters.base_format.badgerdoc import Badgerdoc
from convert.converters.pdf.pdf_converter import (
PlainPDFToBadgerdocTokensConverter,
PlainPDFToBadgerdocTokensConverterPytz,
)
from convert.models.common import S3Path

Expand Down Expand Up @@ -35,7 +35,7 @@ def download_pdf_from_s3(self, s3_input_pdf: S3Path) -> None:
s3_input_pdf.bucket, s3_input_pdf.path, input_file
)
self.badgerdoc_format.tokens_pages = (
PlainPDFToBadgerdocTokensConverter().convert(input_file)
PlainPDFToBadgerdocTokensConverterPytz().convert(input_file)
)

def upload_badgerdoc_to_s3(self, s3_output_tokens: S3Path) -> None:
Expand Down
Loading