# Table support for the HTML-to-DOCX parser in html2docx/html2docx.py, taken
# from the patch "add basic table support" (Mark Dufour, 2020-07-31).
#
# The functions below are methods of the parser class.  They rely on state that
# the same patch initialises in ``_reset``:
#
#     self.table_cell: Optional[Any] = None          # cell currently being filled
#     self.tables: List[Tuple[Any, int, int]] = []   # stack of (table, row, col)
#
# and on the tag dispatch that the patch adds to the two tag handlers:
#
#     handle_starttag:  "table"      -> self.init_table(attrs)
#                       "tr"         -> self.init_tr()
#                       "td" / "th"  -> self.init_tdth()
#     handle_endtag:    "table"      -> self.finish_table()
#                       "td" / "th"  -> self.table_cell = None
#                                       self.p = None
#                                       self.r = None
#
# NOTE(review): the surrounding patch context shows ``finish_p`` calling
# ``self._reset()``, which would also clear ``self.tables`` while a table is
# still open - confirm against the full file.  The guards below keep stray or
# orphaned table tags from raising IndexError in that situation.


def init_table(self, attrs: List[Tuple[str, Optional[str]]]) -> None:
    """Start a new, empty table and push it on the table stack.

    A table opened while a cell is active is nested inside that cell;
    otherwise it is added to the document itself.  The table starts with no
    rows and no columns; they are added as row and cell tags arrive.

    ``attrs`` is the attribute list of the ``<table>`` tag; it is accepted
    for the caller's convenience but currently unused.
    """
    parent = self.table_cell if self.table_cell is not None else self.doc
    table = parent.add_table(rows=0, cols=0)
    # Stack entry: (table, current row index, current column index).
    # -1 means "no row / no cell opened yet".
    self.tables.append((table, -1, -1))


def finish_table(self) -> None:
    """Close the innermost open table and give its columns equal widths.

    The width shared between the columns is the first section's page width
    minus its left and right margins, halved once for every table that is
    still open (i.e. once per nesting level).  A ``</table>`` without a
    matching open table is ignored.
    """
    if not self.tables:
        return
    table = self.tables.pop()[0]
    column_count = len(table.columns)
    if not column_count:
        # Nothing to size (and nothing to divide by).
        return
    section = self.doc.sections[0]
    available = section.page_width - section.left_margin - section.right_margin
    # len(self.tables) is now the number of enclosing tables.
    available = int(available * (0.5 ** len(self.tables)))
    column_width = available // column_count
    for col in table.columns:
        col.width = column_width


def init_tr(self) -> None:
    """Append a row to the innermost open table and make it the current row.

    The column index is reset to -1 so the next cell tag starts at column 0.
    A ``<tr>`` outside any open table is ignored.
    """
    if not self.tables:
        return
    table, row, _ = self.tables[-1]
    self.tables[-1] = (table, row + 1, -1)
    table.add_row()


def init_tdth(self) -> None:
    """Advance to the next cell of the current row and direct output into it.

    A column is added to the table when the row grows past the existing
    columns.  Afterwards ``self.table_cell`` is the new cell, ``self.p`` its
    first paragraph and ``self.r`` is cleared so the next text starts a new
    run.  A cell tag outside any open table is ignored; a cell that appears
    before any ``<tr>`` implicitly opens the first row.
    """
    if not self.tables:
        return
    table, row, col = self.tables[-1]
    if row < 0:
        # No <tr> seen yet: open a row so the cell lookup below has a valid
        # row index instead of -1.
        table.add_row()
        row = 0
    col += 1
    self.tables[-1] = (table, row, col)
    if col >= len(table.columns):
        # Width 0 is a placeholder; finish_table assigns the real widths.
        table.add_column(0)
    self.table_cell = table.cell(row, col)
    self.p = self.table_cell.paragraphs[0]
    self.r = None
12
3
diff --git a/tests/table_nested.html b/tests/table_nested.html new file mode 100644 index 0000000..224611a --- /dev/null +++ b/tests/table_nested.html @@ -0,0 +1,7 @@ + + + +
12
3 + + +
45
67
# tests/test_table.py - tests added by the patch "add basic table support".
import os

import docx

from html2docx import html2docx

from .utils import TEST_DIR


def _convert_fixture(filename):
    """Convert the HTML fixture *filename* from TEST_DIR and return the parsed DOCX."""
    html_path = os.path.join(TEST_DIR, filename)
    # Context manager so the fixture's file handle is closed deterministically.
    with open(html_path) as html_file:
        html = html_file.read()
    buf = html2docx(html, title="table")
    return docx.Document(buf)


def _assert_cell_texts(table, contents):
    """Assert that ``table.cell(r, c).text`` equals ``contents[r][c]`` for every listed cell."""
    for r, row in enumerate(contents):
        for c, text in enumerate(row):
            assert table.cell(r, c).text == text


def test_table():
    """A simple 2x2 table is converted, including bold formatting inside a cell."""
    doc = _convert_fixture("table.html")

    assert len(doc.tables) == 1
    table = doc.tables[0]

    assert len(table.rows) == 2
    assert len(table.columns) == 2

    # The second row only defines its first cell.
    _assert_cell_texts(
        table,
        [
            ["1", "2"],
            ["3"],
        ],
    )

    assert table.cell(0, 0).paragraphs[0].runs[0].font.bold is None
    assert table.cell(0, 1).paragraphs[0].runs[0].font.bold is True


def test_table_nested():
    """A table placed inside a cell becomes a nested table of that cell."""
    doc = _convert_fixture("table_nested.html")

    # Only the outer table is a direct child of the document.
    assert len(doc.tables) == 1
    table = doc.tables[0]

    assert len(table.rows) == 2
    assert len(table.columns) == 2

    _assert_cell_texts(
        table,
        [
            ["1", "2"],
            ["3"],
        ],
    )

    cell = table.cell(1, 1)
    assert len(cell.tables) == 1

    _assert_cell_texts(
        cell.tables[0],
        [
            ["4", "5"],
            ["6", "7"],
        ],
    )