diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index b24e0eba..9770cc57 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -59,7 +59,7 @@ jobs: # - app-tag.zip # - app-tag-info.whl - name: Archive package - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: dist path: ./dist @@ -79,7 +79,7 @@ jobs: # download artifacts from job: build - name: Download artifacts from build - uses: actions/download-artifact@v2 + uses: actions/download-artifact@v4 with: name: dist path: dist @@ -110,7 +110,7 @@ jobs: uses: actions/checkout@v2 - name: Download artifacts from release - uses: actions/download-artifact@v2 + uses: actions/download-artifact@v4 with: name: dist path: dist diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 27eadcf6..668565bc 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -27,7 +27,7 @@ jobs: runs-on: ubuntu-latest container: - image: python:3.8 + image: python:3.10 steps: - name: Check out code @@ -36,6 +36,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip + pip install setuptools wheel pip install -r requirements.txt pip install pytest python setup.py develop @@ -53,7 +54,7 @@ jobs: strategy: matrix: - python-version: ["3.8", "3.9", "3.10"] + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] steps: - name: Check out code @@ -67,6 +68,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip + pip install setuptools wheel pip install -r requirements.txt pip install pytest pytest-cov python setup.py develop @@ -82,7 +84,7 @@ jobs: # upload docx for further job - name: Archive package - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: - name: outputs + name: outputs-py${{ matrix.python-version }} path: ./test/outputs \ No newline at end of file diff --git a/pdf2docx/table/Cell.py b/pdf2docx/table/Cell.py index 1b7d421a..bd01ea87 100644 --- a/pdf2docx/table/Cell.py +++ b/pdf2docx/table/Cell.py @@ -19,6 +19,16 @@ def __init__(self, raw:dict=None): self.border_width = raw.get('border_width', (0,0,0,0)) # type: tuple [float] self.merged_cells = raw.get('merged_cells', (1,1)) # type: tuple [int] + def _block_text(self, block): + '''Get text from a block, always returning a str (for join).''' + if not hasattr(block, 'text'): + return '' + t = block.text + if t is None: + return '' + if isinstance(t, list): + return '\n'.join(str(x) for x in t) + return str(t) @property def text(self): @@ -28,8 +38,9 @@ def text(self): # fixme: prev code did `if block.is_text_block`, but sometimes # there is no `is_text_block` member; would be good to ensure # this member is always present and avoid use of `hasattr()`. - return '\n'.join([block.text if hasattr(block, 'text') else '' - for block in self.blocks]) + return '\n'.join([self._block_text(block) for block in self.blocks]) + # return '\n'.join([block.text if hasattr(block, 'text') else '' + # for block in self.blocks]) @property diff --git a/requirements.txt b/requirements.txt index 87df5f80..da2eee0a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -PyMuPDF>=1.19.0 +PyMuPDF>=1.26.7 python-docx>=0.8.10 fonttools>=4.24.0 numpy>=1.17.2 diff --git a/setup.py b/setup.py index 01b2076e..b16f1441 100644 --- a/setup.py +++ b/setup.py @@ -50,7 +50,7 @@ def load_requirements(fname): include_package_data=True, zip_safe=False, install_requires=load_requirements("requirements.txt"), - python_requires=">=3.6", + python_requires=">=3.10", entry_points={ "console_scripts": [ "pdf2docx=pdf2docx.main:main" diff --git a/test/samples/demo-table-empty-cell.pdf b/test/samples/demo-table-empty-cell.pdf new file mode 100644 index 00000000..f7f19b3a Binary files /dev/null and b/test/samples/demo-table-empty-cell.pdf differ diff --git a/test/test.py b/test/test.py index 5b0a4397..4fcb6ef5 100644 --- a/test/test.py +++ b/test/test.py @@ -247,6 +247,21 @@ def test_extracting_table(self): ['Description F', '1.00', '0.86', '0.37', '0.78', '0.01'] ] assert table==sample + + # ------------------------------------------ + # extract tables with empty/none cells + # ------------------------------------------ + def test_extract_tables_empty_cell(self): + '''Test extracting tables from demo-table-empty-cell.pdf (no crash, returns list).''' + # Test that extract_tables() handles tables with empty cells correctly. + filename = 'demo-table-empty-cell' + pdf_file = os.path.join(sample_path, f'{filename}.pdf') + cv = Converter(pdf_file) + tables = cv.extract_tables() + cv.close() + assert isinstance(tables, list), 'extract_tables() should return a list' + # At least one table expected from this sample + assert len(tables) >= 1, f'expected at least 1 table, got {len(tables)}' # ------------------------------------------