diff --git a/.editorconfig b/.editorconfig deleted file mode 100644 index d66cdedd..00000000 --- a/.editorconfig +++ /dev/null @@ -1,11 +0,0 @@ -; top-most EditorConfig file -root = true - -; Unix-style newlines -[*] -end_of_line = CRLF - -; 4 space indentation -[*.py] -indent_style = space -indent_size = 4 diff --git a/.gitignore b/.gitignore index f6059d29..047a2c10 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,14 @@ *.py[co] *.bak -test/*output.md build dist *.egg-info +.idea +.coverage +.coverage.* +env/ +.c9/ +.vscode +.tox/ +htmlcov/ +.mypy_cache/ diff --git a/.travis.yml b/.travis.yml index e077f58d..26c3b4c4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,33 @@ language: python -python: - - "2.5" - - "2.6" - - "2.7" - - "pypy" -script: cd test/; python run_tests.py +cache: pip + +matrix: + include: + - python: "3.7" + env: TOXENV=black + - python: "3.7" + env: TOXENV=flake8 + - python: "3.7" + env: TOXENV=mypy + - python: "3.7" + env: TOXENV=isort + - python: "3.5" + env: TOXENV=py35 + - python: "3.6" + env: TOXENV=py36 + - python: "3.7" + env: TOXENV=py37 + - python: 3.8 + env: TOXENV=py38 + - python: 3.9 + env: TOXENV=py39 + - python: "pypy3" + env: TOXENV=pypy3 + +install: + - pip install tox +script: + - tox +after_success: + - pip install coveralls + - coveralls diff --git a/AUTHORS.rst b/AUTHORS.rst new file mode 100644 index 00000000..989a3655 --- /dev/null +++ b/AUTHORS.rst @@ -0,0 +1,41 @@ +``html2text`` was originally written by Aaron Swartz. + +The AUTHORS/Contributors are (and/or have been): + +* Aaron Swartz +* Yariv Barkan +* Alex Musayev +* Matěj Cepl +* Stefano Rivera +* Alireza Savand +* Ivan Gromov +* Jocelyn Delalande +* Matt Dorn +* Miguel Tavares +* Scott Blackburn +* Peter Wu +* Arjoonn Sharma +* Ali Mohammad +* Albert Berger +* Etienne Millon +* John C F +* Mikhail Melnik +* Andres Rey +* Ciprian Miclaus +* Toshihiro Kamiya +* Matt Dennewitz +* Jonathan Sundqvist +* Simon Meers +* Kurt McKee +* Germain Z. 
+* Jacek Kołodziej +* Jonathan Vanasco +* Jon Dufresne +* Edward Ross +* Mike Borsetti +* Gregory Anders +* Maciej Podolski https://github.com/mpodolsk + +Maintainer: + +* Alireza Savand diff --git a/ChangeLog.rst b/ChangeLog.rst new file mode 100644 index 00000000..3c642f96 --- /dev/null +++ b/ChangeLog.rst @@ -0,0 +1,274 @@ +UNRELEASED +========== +---- + +* Fix #332: Insert at most one space for multiple emphasis +* Feature #318: Make padded tables more similar to pandoc's pipe_tables. +* Add support for Python 3.9. +* Fix extra line breaks inside html link text (between '[' and ']') +* Fix #344: indent ``
    `` inside ``
      `` three spaces instead of two to comply with CommonMark, GFM, etc. +* Fix #324: unnecessary spaces around ````, ````, and ``strike`` tags. +* Don't wrap tables by default and add a ``--wrap-tables`` config option +* Fix #320 padding empty tables and tables with no tags. +* Add ``ignore_mailto_links`` config option to ignore ``mailto:`` style links. + + +2020.1.16 +========= +---- + +* Add type annotations. +* Add support for Python 3.8. +* Performance improvements when ``wrap_links`` is ``False`` (the default). +* Configure setuptools using setup.cfg. + + +2019.9.26 +========= +---- + +* Fix long blockquotes wrapping. +* Remove the trailing whitespaces that were added after wrapping list items & blockquotes. +* Remove support for Python ≤ 3.4. Now requires Python 3.5+. +* Fix memory leak when processing a document containing a ```` tag. +* Fix ``AttributeError`` when reading text from stdin. +* Fix ``UnicodeEncodeError`` when writing output to stdout. + + +2019.8.11 +========= +---- + +* Add support for wrapping list items. +* Fix #201: handle ‎/‏ marks mid-text within stressed tags or right after stressed tags. +* Feature #213: ``images_as_html`` config option to always generate an ``img`` html tag. preserves "height", "width" and "alt" if possible. +* Remove support for end-of-life Pythons. Now requires Python 2.7 or 3.4+. +* Remove support for retrieving HTML over the network. +* Add ``__main__.py`` module to allow running the CLI using ``python -m html2text ...``. +* Fix #238: correct spacing when a HTML entity follows a non-stressed tags which follow a stressed tag. +* Remove unused or deprecated: + + * ``html2text.compat.escape()`` + * ``html2text.config.RE_UNESCAPE`` + * ``html2text.HTML2Text.replaceEntities()`` + * ``html2text.HTML2Text.unescape()`` + * ``html2text.unescape()`` + +* Fix #208: handle LEFT-TO-RIGHT MARK after a stressed tag. + + +2018.1.9 +======== +---- + +* Fix #188: Non-ASCII in title attribute causes encode error. 
+* Feature #194: Add support for the tag. +* Feature #193: Add support for the tag. + + +2017.10.4 +========== +---- + +* Fix #157: Fix images link with div wrap +* Fix #55: Fix error when empty title tags +* Fix #160: The html2text tests are failing on Windows and on Cygwin due to differences in eol handling between windows/*nix +* Feature #164: Housekeeping: Add flake8 to the travis build, cleanup existing flake8 violations, add py3.6 and pypy3 to the travis build +* Fix #109: Fix for unexpanded < > & +* Fix #143: Fix line wrapping for the lines starting with bold +* Adds support for numeric bold text indication in ``font-weight``, + as used by Google (and presumably others.) +* Fix #173 and #142: Stripping whitespace in crucial markdown and adding whitespace as necessary +* Don't drop any cell data on tables uneven row lengths (e.g. colspan in use) + + +2016.9.19 +========= +---- + +* Default image alt text option created and set to a default of empty string "" to maintain backward compatibility +* Fix #136: --default-image-alt now takes a string as argument +* Fix #113: Stop changing quiet levels on \/script tags. +* Merge #126: Fix deprecation warning on py3 due to html.escape +* Fix #145: Running test suite on Travis CI for Python 2.6. + + +2016.5.29 +========= +---- + +* Fix #125: --pad_tables now pads table cells to make them look nice. +* Fix #114: Break does not interrupt blockquotes +* Deprecation warnings for URL retrieval. + + +2016.4.2 +========= +---- + +* Fix #106: encoding by stdin +* Fix #89: Python 3.5 support. +* Fix #113: inplace baseurl substitution for and tags. +* Feature #118: Update the badges to badge.kloud51.com +* Fix #119: new-line after a list is inserted + + +2016.1.8 +========= +---- + +* Feature #99: Removed duplicated initialisation. +* Fix #100: Get element style key error. +* Fix #101: Fix error end tag pop exception +* , , now rendered as ~~text~~. 
+ + +2015.11.4 +========= +---- + +* Fix #38: Long links wrapping controlled by ``--no-wrap-links``. +* Note: ``--no-wrap-links`` implies ``--reference-links`` +* Feature #83: Add callback-on-tag. +* Fix #87: Decode errors can be handled via command line. +* Feature #95: Docs, decode errors spelling mistake. +* Fix #84: Make bodywidth kwarg overridable using config. + + +2015.6.21 +========= +---- + +* Fix #31: HTML entities stay inside link. +* Fix #71: Coverage detects command line tests. +* Fix #39: Documentation update. +* Fix #61: Functionality added for optional use of automatic links. +* Feature #80: ``title`` attribute is preserved in both inline and reference links. +* Feature #82: More command line options. See docs. + + +2015.6.12 +========= +---- + +* Feature #76: Making ``pre`` blocks clearer for further automatic formatting. +* Fix #71: Coverage detects tests carried out in ``subprocesses`` + + +2015.6.6 +======== +---- + +* Fix #24: ``3.200.3`` vs ``2014.7.3`` output quirks. +* Fix #61. Malformed links in markdown output. +* Feature #62: Automatic version number. +* Fix #63: Nested code, anchor bug. +* Fix #64: Proper handling of anchors with content that starts with tags. +* Feature #67: Documentation all over the module. +* Feature #70: Adding tests for the module. +* Fix #73: Typo in config documentation. + + +2015.4.14 +========= +---- + + +* Feature #59: Write image tags with height and width attrs as raw html to retain dimensions + + +2015.4.13 +========= +---- + + +* Feature #56: Treat '-' file parameter as stdin. +* Feature #57: Retain escaping of html except within code or pre tags. + + +2015.2.18 +========= +---- + +* Fix #38: Anchor tags with empty text or with ```` tags inside are no longer stripped. + + +2014.12.29 +========== +---- + +* Feature #51: Add single line break option. + This feature is useful for ensuring that lots of extra line breaks do not + end up in the resulting Markdown file in situations like Evernote .enex + exports. 
Note that this only works properly if ``body-width`` is set + to ``0``. + + +2014.12.24 +========== +---- + +* Feature #49: Added an images_to_alt option to discard images and keep only their alt. +* Feature #50: Protect links, surrounding them with angle brackets to avoid breaking... +* Feature: Add ``setup.cfg`` file. + + +2014.12.5 +========= +---- + +* Feature: Update ``README.md`` with usage examples. +* Fix #35: Remove ``py_modules`` from ``setup.py``. +* Fix #36: Excludes tests from being installed as a separate module. +* Fix #37: Don't hardcode the path to the installed binary. +* Fix: Readme typo in running cli. +* Feature #40: Extract cli part to ``cli`` module. +* Feature #42: Bring python version compatibility to ``compat.py`` module. +* Feature #41: Extract utility/helper methods to ``utils`` module. +* Fix #45: Does not accept standard input when running under Python 3. +* Feature: Clean up ``ChangeLog.rst`` for version and date numbers. + + +2014.9.25 +========= +---- + +* Feature #29, #27: Add simple table support with bypass option. +* Fix #20: Replace project website with: https://alir3z4.github.io/html2text/ . + + +2014.9.8 +======== +---- + +* Fix #28: missing ``html2text`` package in installation. + + +2014.9.7 +======== +---- + +* Fix ``unicode``/``type`` error in memory leak unit-test. +* Feature #16: Remove ``install_deps.py``. +* Feature #17: Add status badges via pypin. +* Feature #18: Add ``Python`` ``3.4`` to travis config file. +* Feature #19: Bring ``html2text`` to a separate module and take out the ``conf``/``constant`` variables. +* Feature #21: Remove meta vars from ``html2text.py`` file header. +* Fix: Fix TypeError when parsing tags like . Fixed in #25. + + +2014.7.3 +======== +---- + +* Fix #8: Remove ``How to do a release`` section from README.md. +* Fix #11: Include test directory markdown, html files. +* Fix #13: memory leak in using ``handle`` while keeping the old instance of ``html2text``. 
+ + +2014.4.5 +======== +---- + +* Fix #1: Add ``ChangeLog.rst`` file. +* Fix #2: Add ``AUTHORS.rst`` file. diff --git a/ISSUE_TEMPLATE b/ISSUE_TEMPLATE new file mode 100644 index 00000000..f2d46a02 --- /dev/null +++ b/ISSUE_TEMPLATE @@ -0,0 +1,4 @@ + +- Version by `html2text --version` +- Test script +- Python version `python --version` diff --git a/MANIFEST.in b/MANIFEST.in index 2301d362..28e28387 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,6 @@ include COPYING include README.md +include ChangeLog.rst +include AUTHORS.rst +include tox.ini +recursive-include test *.html *.md *.py diff --git a/README.md b/README.md index e9b44605..201ad560 100644 --- a/README.md +++ b/README.md @@ -1,49 +1,103 @@ -# [html2text](http://www.aaronsw.com/2002/html2text/) +# html2text + +[![Build Status](https://secure.travis-ci.org/Alir3z4/html2text.png)](https://travis-ci.org/Alir3z4/html2text) +[![Coverage Status](https://coveralls.io/repos/Alir3z4/html2text/badge.png)](https://coveralls.io/r/Alir3z4/html2text) +[![Downloads](http://badge.kloud51.com/pypi/d/html2text.png)](https://pypi.org/project/html2text/) +[![Version](http://badge.kloud51.com/pypi/v/html2text.png)](https://pypi.org/project/html2text/) +[![Wheel?](http://badge.kloud51.com/pypi/wheel/html2text.png)](https://pypi.org/project/html2text/) +[![Format](http://badge.kloud51.com/pypi/format/html2text.png)](https://pypi.org/project/html2text/) +[![License](http://badge.kloud51.com/pypi/license/html2text.png)](https://pypi.org/project/html2text/) + html2text is a Python script that converts a page of HTML into clean, easy-to-read plain ASCII text. Better yet, that ASCII also happens to be valid Markdown (a text-to-HTML format). 
-Usage: `html2text.py [(filename|url) [encoding]]` - - Options: - --version show program's version number and exit - -h, --help show this help message and exit - --ignore-links don't include any formatting for links - --ignore-images don't include any formatting for images - -g, --google-doc convert an html-exported Google Document - -d, --dash-unordered-list - use a dash rather than a star for unordered list items - -b BODY_WIDTH, --body-width=BODY_WIDTH - number of characters per output line, 0 for no wrap - -i LIST_INDENT, --google-list-indent=LIST_INDENT - number of pixels Google indents nested lists - -s, --hide-strikethrough - hide strike-through text. only relevent when -g is - specified as well - -Or you can use it from within Python: - - import html2text - print html2text.html2text("

      Hello, world.

      ") + +Usage: `html2text [filename [encoding]]` + +| Option | Description +|--------------------------------------------------------|--------------------------------------------------- +| `--version` | Show program's version number and exit +| `-h`, `--help` | Show this help message and exit +| `--ignore-links` | Don't include any formatting for links +|`--escape-all` | Escape all special characters. Output is less readable, but avoids corner case formatting issues. +| `--reference-links` | Use reference links instead of links to create markdown +| `--mark-code` | Mark preformatted and code blocks with [code]...[/code] + +For a complete list of options see the [docs](https://github.com/Alir3z4/html2text/blob/master/docs/usage.md) + + +Or you can use it from within `Python`: + +``` +>>> import html2text +>>> +>>> print(html2text.html2text("

      Zed's dead baby, Zed's dead.

      ")) +**Zed's** dead baby, _Zed's_ dead. + +``` +Using it on a web link - + +``` +import html2text +from urllib.request import urlopen + +h = html2text.HTML2Text() + +h.ignore_links = True + + +url = "https://en.wikipedia.org/wiki/Aaron_Swartz" +html = urlopen(url).read() + +html = str(html) +print(h.handle(html)) + +``` Or with some configuration options: +``` +>>> import html2text +>>> +>>> h = html2text.HTML2Text() +>>> # Ignore converting links from HTML +>>> h.ignore_links = True +>>> print(h.handle("

      Hello, world!")) +Hello, world! + +>>> print(h.handle("

      Hello, world!")) + +Hello, world! + +>>> # Don't Ignore links anymore, I like links +>>> h.ignore_links = False +>>> print(h.handle("

      Hello, world!")) +Hello, [world](https://www.google.com/earth/)! - import html2text - h = html2text.HTML2Text() - h.ignore_links = True - print h.handle("

      Hello, world!") +``` -_Originally written by Aaron Swartz. This code is distributed under the GPLv3._ +*Originally written by Aaron Swartz. This code is distributed under the GPLv3.* -## How to do a release +## How to install + +`html2text` is available on pypi +https://pypi.org/project/html2text/ + +``` +$ pip install html2text +``` -1. Update the version in `html2text.py` -2. Update the version in `setup.py` -3. Run `python setup.py sdist upload` ## How to run unit tests - cd test/ - python run_tests.py + tox + +To see the coverage results: + + coverage html + +then open the `./htmlcov/index.html` file in your browser. + +## Documentation -[![Build Status](https://secure.travis-ci.org/aaronsw/html2text.png)](http://travis-ci.org/aaronsw/html2text) +Documentation lives [here](https://github.com/Alir3z4/html2text/blob/master/docs/usage.md) diff --git a/docs/about.md b/docs/about.md new file mode 100644 index 00000000..04092dfb --- /dev/null +++ b/docs/about.md @@ -0,0 +1,13 @@ +About +----- + +html2text is a python package which converts a page of HTML into clean, +easy-to-read plain ASCII text. Better yet, that ASCII also happens to be +valid Markdown (a text-to-HTML format). + +It was originally written by Aaron Swartz. + +The code is under GPL v3. + +The module is based on the html parser in the python standard library +and so any valid input for the parser is valid input for the library. diff --git a/docs/authors.md b/docs/authors.md new file mode 120000 index 00000000..5b54fb58 --- /dev/null +++ b/docs/authors.md @@ -0,0 +1 @@ +../AUTHORS.rst \ No newline at end of file diff --git a/docs/contributing.md b/docs/contributing.md new file mode 100644 index 00000000..41a2ce07 --- /dev/null +++ b/docs/contributing.md @@ -0,0 +1,12 @@ +Pull requests are welcome. + +The package is developed [here](https://github.com/Alir3z4/html2text) + +Pull guidelines +--------------- + +- Make the changes modular. The usual method is one change per commit. +- Add tests. 
(We love tests). +- Update the ``ChangeLog.rst``. +- Add yourself to ``AUTHORS.rst`` if you're not listed. +- That is all diff --git a/docs/how_it_works.md b/docs/how_it_works.md new file mode 100644 index 00000000..3602610d --- /dev/null +++ b/docs/how_it_works.md @@ -0,0 +1,141 @@ +Introduction +============ + + +There are 5 components to the code. They are kept as separate files in the +html2text directory. This part of the documentation explains them bit by bit. + + +compat.py +--------- + +This part exists only to test compatibility with the available python standard libraries. Python3 relocated some libraries and so this file makes sure that everything has a common interface. + +config.py +--------- + +Used to provide various configuration settings to the converter. They are as follows: + + - UNICODE_SNOB for using unicode + - ESCAPE_SNOB for escaping every special character + - LINKS_EACH_PARAGRAPH for putting links after every paragraph + - BODY_WIDTH for wrapping long lines + - SKIP_INTERNAL_LINKS to skip #local-anchor things + - INLINE_LINKS for formatting images and links + - PROTECT_LINKS protect from line breaks + - GOOGLE_LIST_INDENT no of pixels to indent nested lists + - IGNORE_ANCHORS + - IGNORE_IMAGES + - IMAGES_AS_HTML always generate HTML tags for images; preserves `height`, `width`, `alt` if possible. + - IMAGES_TO_ALT + - IMAGES_WITH_SIZE + - IGNORE_EMPHASIS + - BYPASS_TABLES format tables in HTML rather than Markdown + - IGNORE_TABLES ignore table-related tags (table, th, td, tr) while keeping rows + - SINGLE_LINE_BREAK to use a single line break rather than two + - UNIFIABLE is a dictionary which maps unicode abbreviations to ASCII + values + - RE_SPACE for finding space-only lines + - RE_ORDERED_LIST_MATCHER for matching ordered lists in MD + - RE_UNORDERED_LIST_MATCHER for matching unordered list matcher in MD + - RE_MD_CHARS_MATCHER for matching Md \,[,],( and ) + - RE_MD_CHARS_MATCHER_ALL for matching `,*,_,{,},[,],(,),#,! 
+ - RE_MD_DOT_MATCHER for matching lines starting with 1. + - RE_MD_PLUS_MATCHER for matching lines starting with + + - RE_MD_DASH_MATCHER for matching lines starting with (-) + - RE_SLASH_CHARS a string of slash escapable characters + - RE_MD_BACKSLASH_MATCHER to match \char + - USE_AUTOMATIC_LINKS to convert http://xyz to `<http://xyz>` + +utils.py +-------- + +Used to provide utility functions to html2text +Some functions are: + + - name2cp :name to code point + - hn :headings + - dumb_property_dict :hash of css attrs + - dumb_css_parser :returns a hash of css selectors, each + containing a hash of css attrs + - element_style :hash of final style of element + - google_list_style :find out whether a list is ordered or unordered + - google_has_height :does element have height? + - google_text_emphasis :a list of all emphasis modifiers + - google_fixed_width_font :check for fixed width font + - list_numbering_start :extract numbering from list elem attrs + - skipwrap :skip wrap for the given para or not? + - escape_md :escape md sensitive within other md + - escape_md_section :escape md sensitive across whole doc + + +cli.py +------ + +Command line interface for the code. + + +| Option | Description +|--------------------------------------------------------|--------------------------------------------------- +| `--version` | Show program version number and exit +| `-h`, `--help` | Show this help message and exit +| `--ignore-links` | Do not include any formatting for links +|`--protect-links` | Protect links from line breaks by surrounding them with angle brackets +|`--ignore-images` | Do not include any formatting for images +|`--images-to-alt` | Discard image data, only keep alt text +|`--images-with-size` | Write image tags with height and width attrs as raw html to retain dimensions +|`--images-as-html` | Always write image tags as raw html; preserves "height", "width" and "alt" if possible. 
+|`-g`, `--google-doc` | Convert an html-exported Google Document +|`-d`, `--dash-unordered-list` | Use a dash rather than a star for unordered list items +|`-b` `BODY_WIDTH`, `--body-width`=`BODY_WIDTH` | Number of characters per output line, `0` for no wrap +|`-i` `LIST_INDENT`, `--google-list-indent`=`LIST_INDENT`| Number of pixels Google indents nested lists +|`-s`, `--hide-strikethrough` | Hide strike-through text. only relevant when `-g` is specified as well +|`--escape-all` | Escape all special characters. Output is less readable, but avoids corner case formatting issues. +| `--bypass-tables` | Format tables in HTML rather than Markdown syntax. +| `--ignore-tables` | Ignore table-related tags (table, th, td, tr) while keeping rows. +| `--single-line-break` | Use a single line break after a block element rather than two. +| `--reference-links` | Use reference links instead of inline links to create markdown + +*A complete list is available [here](usage.md)* + +__init__.py +----------- + +This is where everything comes together. This is the glue for all the +things we have described above. + +This file describes a single HTML2Text class which is itself a subclass of the HTMLParser in python + +Upon initialization it sets various config variables necessary for +processing the given html in a certain manner necessary to create valid +markdown text. +The class defines methods: + + - feed + - handle + - outtextf + - close + - handle_charref + - handle_entityref + - handle_starttag + - handle_endtag + - previousIndex + - handle_emphasis + - handle_tag + - pbr + - p + - soft_br + - o + - handle_data + - charref + - entityref + - google_nest_count + - optwrap + +Besides this there are 2 more methods defined: + + - html2text :calls the HTML2Text class with .handle() method + - unescape :calls the HTML2Text class with .unescape() method + +What they do is provide methods to make the HTML parser in python +parse the HTML and convert to markdown. 
diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 00000000..5e299d8d --- /dev/null +++ b/docs/index.md @@ -0,0 +1,10 @@ +Html2Text +========= + +1. [About](about.md) +2. [Authors](../AUTHORS.rst) +3. [What is markdown](https://daringfireball.net/projects/markdown/) +4. [How it works](how_it_works.md) +5. [Usage](usage.md) +6. [Contributing](contributing.md) +7. [Tests](test.md) diff --git a/docs/test.md b/docs/test.md new file mode 100644 index 00000000..22c64111 --- /dev/null +++ b/docs/test.md @@ -0,0 +1,19 @@ +Tests +===== + +Testing is essential. + +Run the tests +------------- + +`tox` + +Coverage results can be seen with + +`coverage html` + +and then opening the `./htmlcov/index.html` file with your browser. + +New tests +--------- +New tests are always welcome; see [contributing](contributing.md) for guidelines. diff --git a/docs/usage.md b/docs/usage.md new file mode 100644 index 00000000..2a5b78cc --- /dev/null +++ b/docs/usage.md @@ -0,0 +1,152 @@ +Usage +===== + +The module is simple enough to use. This tutorial will get you started. + +Installing +---------- + +These are the ways to install the module: + +### PIP + +For those who have pip, we got your back. + +``` +$ pip install html2text +``` + +### Clone from Git Repository + +Clone the repository from https://github.com/Alir3z4/html2text + +``` +$ git clone --depth 1 https://github.com/Alir3z4/html2text.git +$ python setup.py build +$ python setup.py install +``` + + + +Basic Usage +----------- + +Once installed the module can be used as follows. + + import html2text + html = function_to_get_some_html() + text = html2text.html2text(html) + print(text) + +This converts the provided html to text (Markdown text) with all the +options set to default. 
+ +Using Options +-------------- + +To customize the options provided by the module the usage is as follows: + + import html2text + text_maker = html2text.HTML2Text() + text_maker.ignore_links = True + text_maker.bypass_tables = False + html = function_to_get_some_html() + text = text_maker.handle(html) + print(text) + + +Available Options +----------------- + +All options exist in the config.py file. A list is provided here with +simple indications of their function. + + + - UNICODE_SNOB for using unicode + - ESCAPE_SNOB for escaping every special character + - LINKS_EACH_PARAGRAPH for putting links after every paragraph + - BODY_WIDTH for wrapping long lines + - SKIP_INTERNAL_LINKS to skip #local-anchor things + - INLINE_LINKS for formatting images and links + - PROTECT_LINKS protect from line breaks + - GOOGLE_LIST_INDENT no of pixels to indent nested lists + - IGNORE_ANCHORS + - IGNORE_IMAGES + - IMAGES_AS_HTML always generate HTML tags for images; preserves `height`, `width`, `alt` if possible. + - IMAGES_TO_ALT + - IMAGES_WITH_SIZE + - IGNORE_EMPHASIS + - BYPASS_TABLES format tables in HTML rather than Markdown + - IGNORE_TABLES ignore table-related tags (table, th, td, tr) while keeping rows + - SINGLE_LINE_BREAK to use a single line break rather than two + - UNIFIABLE is a dictionary which maps unicode abbreviations to ASCII + values + - RE_SPACE for finding space-only lines + - RE_ORDERED_LIST_MATCHER for matching ordered lists in MD + - RE_UNORDERED_LIST_MATCHER for matching unordered list matcher in MD + - RE_MD_CHARS_MATCHER for matching Md \,[,],( and ) + - RE_MD_CHARS_MATCHER_ALL for matching `,*,_,{,},[,],(,),#,! + - RE_MD_DOT_MATCHER for matching lines starting with 1. 
+ - RE_MD_PLUS_MATCHER for matching lines starting with + + - RE_MD_DASH_MATCHER for matching lines starting with (-) + - RE_SLASH_CHARS a string of slash escapable characters + - RE_MD_BACKSLASH_MATCHER to match \char + - USE_AUTOMATIC_LINKS to convert http://xyz to `<http://xyz>` + - MARK_CODE to wrap 'pre' blocks with [code]...[/code] tags + - WRAP_LINKS to decide if links have to be wrapped during text wrapping (implies INLINE_LINKS = False) + - WRAP_LIST_ITEMS to decide if list items have to be wrapped during text wrapping + - WRAP_TABLES to decide if tables have to be wrapped during text wrapping + - DECODE_ERRORS to handle decoding errors. 'strict', 'ignore', 'replace' are the acceptable values. + - DEFAULT_IMAGE_ALT takes a string as value and is used whenever an image tag is missing an `alt` value. The default for this is an empty string '' to avoid backward breakage + - OPEN_QUOTE is the character used to open a quote when replacing the `<q>` tag. It defaults to `"`. + - CLOSE_QUOTE is the character used to close a quote when replacing the `<q>` tag. It defaults to `"`. + +Options that are not in the config.py file: + + - emphasis_mark is the character used when replacing the `<em>` tag. It defaults to `_`. + - strong_mark is the character used when replacing the `<strong>` tag. It defaults to `**`. + +To alter any option the procedure is to create a parser with +`parser = html2text.HTML2Text()` and to set the option on the parser. +example: `parser.unicode_snob = True` to set the UNICODE_SNOB option. 
+ + +Command line options +-------------------- + + +| Option | Description +|--------------------------------------------------------|--------------------------------------------------- +| `--version` | Show program version number and exit +| `-h`, `--help` | Show this help message and exit +| `--ignore-links` | Do not include any formatting for links +|`--protect-links` | Protect links from line breaks by surrounding them with angle brackets +|`--ignore-images` | Do not include any formatting for images +|`--images-as-html` | Always write image tags as raw html; preserves "height", "width" and "alt" if possible. +|`--images-to-alt` | Discard image data, only keep alt text +|`--images-with-size` | Write image tags with height and width attrs as raw html to retain dimensions +|`-g`, `--google-doc` | Convert an html-exported Google Document +|`-d`, `--dash-unordered-list` | Use a dash rather than a star for unordered list items +|`-b` `BODY_WIDTH`, `--body-width`=`BODY_WIDTH` | Number of characters per output line, `0` for no wrap +|`-i` `LIST_INDENT`, `--google-list-indent`=`LIST_INDENT`| Number of pixels Google indents nested lists +|`-s`, `--hide-strikethrough` | Hide strike-through text. Only relevant when `-g` is specified as well +|`--escape-all` | Escape all special characters. Output is less readable, but avoids corner case formatting issues. +| `--bypass-tables` | Format tables in HTML rather than Markdown syntax. +| `--ignore-tables` | Ignore table-related tags (table, th, td, tr) while keeping rows. +| `--single-line-break` | Use a single line break after a block element rather than two. +| `--reference-links` | Use reference links instead of inline links to create markdown +| `--ignore-emphasis` | Ignore all emphasis formatting in the html. 
+| `-e`, `--asterisk-emphasis` | Use asterisk rather than underscore to emphasize text +| `--unicode-snob` | Use unicode throughout instead of ASCII +| `--no-automatic-links` | Do not use automatic links like +| `--no-skip-internal-links` | Turn off skipping of internal links +| `--links-after-para` | Put the links after the paragraph and not at end of document +| `--mark-code` | Mark code with [code]...[/code] blocks +| `--no-wrap-links` | Do not wrap links during text wrapping. Implies `--reference-links` +| `--wrap-list-items` | Wrap list items during text wrapping. +| `--wrap-tables` | Wrap tables during text wrapping. +| `--decode-errors`=`HANDLER` | What to do in case an error is encountered. `ignore`, `strict`, `replace` etc. +| `--pad-tables` | Use padding to make tables look good. +| `--default-image-alt`=`Image_Here` | Inserts the given `alt` text whenever images are missing `alt` values. +| `--open-quote`=`"` | Inserts the given text when opening a quote. Defaults to `"`. +| `--close-quote`=`"` | Inserts the given text when closing a quote. Defaults to `"`. diff --git a/html2text.py b/html2text.py deleted file mode 100755 index 17528901..00000000 --- a/html2text.py +++ /dev/null @@ -1,914 +0,0 @@ -#!/usr/bin/env python -"""html2text: Turn HTML into equivalent Markdown-structured text.""" -__version__ = "3.200.3" -__author__ = "Aaron Swartz (me@aaronsw.com)" -__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3." -__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"] - -# TODO: -# Support decoded entities with unifiable. 
- -try: - True -except NameError: - setattr(__builtins__, 'True', 1) - setattr(__builtins__, 'False', 0) - -def has_key(x, y): - if hasattr(x, 'has_key'): return x.has_key(y) - else: return y in x - -try: - import htmlentitydefs - import urlparse - import HTMLParser -except ImportError: #Python3 - import html.entities as htmlentitydefs - import urllib.parse as urlparse - import html.parser as HTMLParser -try: #Python3 - import urllib.request as urllib -except: - import urllib -import optparse, re, sys, codecs, types - -try: from textwrap import wrap -except: pass - -# Use Unicode characters instead of their ascii psuedo-replacements -UNICODE_SNOB = 0 - -# Escape all special characters. Output is less readable, but avoids corner case formatting issues. -ESCAPE_SNOB = 0 - -# Put the links after each paragraph instead of at the end. -LINKS_EACH_PARAGRAPH = 0 - -# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.) -BODY_WIDTH = 78 - -# Don't show internal links (href="#local-anchor") -- corresponding link targets -# won't be visible in the plain text file anyway. 
-SKIP_INTERNAL_LINKS = True - -# Use inline, rather than reference, formatting for images and links -INLINE_LINKS = True - -# Number of pixels Google indents nested lists -GOOGLE_LIST_INDENT = 36 - -IGNORE_ANCHORS = False -IGNORE_IMAGES = False -IGNORE_EMPHASIS = False - -### Entity Nonsense ### - -def name2cp(k): - if k == 'apos': return ord("'") - if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3 - return htmlentitydefs.name2codepoint[k] - else: - k = htmlentitydefs.entitydefs[k] - if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1 - return ord(codecs.latin_1_decode(k)[0]) - -unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"', -'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*', -'ndash':'-', 'oelig':'oe', 'aelig':'ae', -'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a', -'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e', -'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i', -'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o', -'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u', -'lrm':'', 'rlm':''} - -unifiable_n = {} - -for k in unifiable.keys(): - unifiable_n[name2cp(k)] = unifiable[k] - -### End Entity Nonsense ### - -def onlywhite(line): - """Return true if the line does only consist of whitespace characters.""" - for c in line: - if c is not ' ' and c is not ' ': - return c is ' ' - return line - -def hn(tag): - if tag[0] == 'h' and len(tag) == 2: - try: - n = int(tag[1]) - if n in range(1, 10): return n - except ValueError: return 0 - -def dumb_property_dict(style): - """returns a hash of css attributes""" - return dict([(x.strip(), y.strip()) for x, y in [z.split(':', 1) for z in style.split(';') if ':' in z]]); - -def dumb_css_parser(data): - """returns a hash of css selectors, each of which contains a hash of css attributes""" - # remove @import sentences - data += ';' - importIndex = data.find('@import') - while 
importIndex != -1: - data = data[0:importIndex] + data[data.find(';', importIndex) + 1:] - importIndex = data.find('@import') - - # parse the css. reverted from dictionary compehension in order to support older pythons - elements = [x.split('{') for x in data.split('}') if '{' in x.strip()] - try: - elements = dict([(a.strip(), dumb_property_dict(b)) for a, b in elements]) - except ValueError: - elements = {} # not that important - - return elements - -def element_style(attrs, style_def, parent_style): - """returns a hash of the 'final' style attributes of the element""" - style = parent_style.copy() - if 'class' in attrs: - for css_class in attrs['class'].split(): - css_style = style_def['.' + css_class] - style.update(css_style) - if 'style' in attrs: - immediate_style = dumb_property_dict(attrs['style']) - style.update(immediate_style) - return style - -def google_list_style(style): - """finds out whether this is an ordered or unordered list""" - if 'list-style-type' in style: - list_style = style['list-style-type'] - if list_style in ['disc', 'circle', 'square', 'none']: - return 'ul' - return 'ol' - -def google_has_height(style): - """check if the style of the element has the 'height' attribute explicitly defined""" - if 'height' in style: - return True - return False - -def google_text_emphasis(style): - """return a list of all emphasis modifiers of the element""" - emphasis = [] - if 'text-decoration' in style: - emphasis.append(style['text-decoration']) - if 'font-style' in style: - emphasis.append(style['font-style']) - if 'font-weight' in style: - emphasis.append(style['font-weight']) - return emphasis - -def google_fixed_width_font(style): - """check if the css of the current element defines a fixed width font""" - font_family = '' - if 'font-family' in style: - font_family = style['font-family'] - if 'Courier New' == font_family or 'Consolas' == font_family: - return True - return False - -def list_numbering_start(attrs): - """extract numbering from 
list element attributes""" - if 'start' in attrs: - return int(attrs['start']) - 1 - else: - return 0 - -class HTML2Text(HTMLParser.HTMLParser): - def __init__(self, out=None, baseurl=''): - HTMLParser.HTMLParser.__init__(self) - - # Config options - self.unicode_snob = UNICODE_SNOB - self.escape_snob = ESCAPE_SNOB - self.links_each_paragraph = LINKS_EACH_PARAGRAPH - self.body_width = BODY_WIDTH - self.skip_internal_links = SKIP_INTERNAL_LINKS - self.inline_links = INLINE_LINKS - self.google_list_indent = GOOGLE_LIST_INDENT - self.ignore_links = IGNORE_ANCHORS - self.ignore_images = IGNORE_IMAGES - self.ignore_emphasis = IGNORE_EMPHASIS - self.google_doc = False - self.ul_item_mark = '*' - self.emphasis_mark = '_' - self.strong_mark = '**' - - if out is None: - self.out = self.outtextf - else: - self.out = out - - self.outtextlist = [] # empty list to store output characters before they are "joined" - - try: - self.outtext = unicode() - except NameError: # Python3 - self.outtext = str() - - self.quiet = 0 - self.p_p = 0 # number of newline character to print before next output - self.outcount = 0 - self.start = 1 - self.space = 0 - self.a = [] - self.astack = [] - self.maybe_automatic_link = None - self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://') - self.acount = 0 - self.list = [] - self.blockquote = 0 - self.pre = 0 - self.startpre = 0 - self.code = False - self.br_toggle = '' - self.lastWasNL = 0 - self.lastWasList = False - self.style = 0 - self.style_def = {} - self.tag_stack = [] - self.emphasis = 0 - self.drop_white_space = 0 - self.inheader = False - self.abbr_title = None # current abbreviation definition - self.abbr_data = None # last inner HTML (for abbr being defined) - self.abbr_list = {} # stack of abbreviations to write later - self.baseurl = baseurl - - try: del unifiable_n[name2cp('nbsp')] - except KeyError: pass - unifiable['nbsp'] = ' _place_holder;' - - - def feed(self, data): - data = data.replace("", "") - 
HTMLParser.HTMLParser.feed(self, data) - - def handle(self, data): - self.feed(data) - self.feed("") - return self.optwrap(self.close()) - - def outtextf(self, s): - self.outtextlist.append(s) - if s: self.lastWasNL = s[-1] == '\n' - - def close(self): - HTMLParser.HTMLParser.close(self) - - self.pbr() - self.o('', 0, 'end') - - self.outtext = self.outtext.join(self.outtextlist) - if self.unicode_snob: - nbsp = unichr(name2cp('nbsp')) - else: - nbsp = u' ' - self.outtext = self.outtext.replace(u' _place_holder;', nbsp) - - return self.outtext - - def handle_charref(self, c): - self.o(self.charref(c), 1) - - def handle_entityref(self, c): - self.o(self.entityref(c), 1) - - def handle_starttag(self, tag, attrs): - self.handle_tag(tag, attrs, 1) - - def handle_endtag(self, tag): - self.handle_tag(tag, None, 0) - - def previousIndex(self, attrs): - """ returns the index of certain set of attributes (of a link) in the - self.a list - - If the set of attributes is not found, returns None - """ - if not has_key(attrs, 'href'): return None - - i = -1 - for a in self.a: - i += 1 - match = 0 - - if has_key(a, 'href') and a['href'] == attrs['href']: - if has_key(a, 'title') or has_key(attrs, 'title'): - if (has_key(a, 'title') and has_key(attrs, 'title') and - a['title'] == attrs['title']): - match = True - else: - match = True - - if match: return i - - def drop_last(self, nLetters): - if not self.quiet: - self.outtext = self.outtext[:-nLetters] - - def handle_emphasis(self, start, tag_style, parent_style): - """handles various text emphases""" - tag_emphasis = google_text_emphasis(tag_style) - parent_emphasis = google_text_emphasis(parent_style) - - # handle Google's text emphasis - strikethrough = 'line-through' in tag_emphasis and self.hide_strikethrough - bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis - italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis - fixed = google_fixed_width_font(tag_style) and not \ - 
google_fixed_width_font(parent_style) and not self.pre - - if start: - # crossed-out text must be handled before other attributes - # in order not to output qualifiers unnecessarily - if bold or italic or fixed: - self.emphasis += 1 - if strikethrough: - self.quiet += 1 - if italic: - self.o(self.emphasis_mark) - self.drop_white_space += 1 - if bold: - self.o(self.strong_mark) - self.drop_white_space += 1 - if fixed: - self.o('`') - self.drop_white_space += 1 - self.code = True - else: - if bold or italic or fixed: - # there must not be whitespace before closing emphasis mark - self.emphasis -= 1 - self.space = 0 - self.outtext = self.outtext.rstrip() - if fixed: - if self.drop_white_space: - # empty emphasis, drop it - self.drop_last(1) - self.drop_white_space -= 1 - else: - self.o('`') - self.code = False - if bold: - if self.drop_white_space: - # empty emphasis, drop it - self.drop_last(2) - self.drop_white_space -= 1 - else: - self.o(self.strong_mark) - if italic: - if self.drop_white_space: - # empty emphasis, drop it - self.drop_last(1) - self.drop_white_space -= 1 - else: - self.o(self.emphasis_mark) - # space is only allowed after *all* emphasis marks - if (bold or italic) and not self.emphasis: - self.o(" ") - if strikethrough: - self.quiet -= 1 - - def handle_tag(self, tag, attrs, start): - #attrs = fixattrs(attrs) - if attrs is None: - attrs = {} - else: - attrs = dict(attrs) - - if self.google_doc: - # the attrs parameter is empty for a closing tag. in addition, we - # need the attributes of the parent nodes in order to get a - # complete style description for the current element. we assume - # that google docs export well formed html. 
- parent_style = {} - if start: - if self.tag_stack: - parent_style = self.tag_stack[-1][2] - tag_style = element_style(attrs, self.style_def, parent_style) - self.tag_stack.append((tag, attrs, tag_style)) - else: - dummy, attrs, tag_style = self.tag_stack.pop() - if self.tag_stack: - parent_style = self.tag_stack[-1][2] - - if hn(tag): - self.p() - if start: - self.inheader = True - self.o(hn(tag)*"#" + ' ') - else: - self.inheader = False - return # prevent redundant emphasis marks on headers - - if tag in ['p', 'div']: - if self.google_doc: - if start and google_has_height(tag_style): - self.p() - else: - self.soft_br() - else: - self.p() - - if tag == "br" and start: self.o(" \n") - - if tag == "hr" and start: - self.p() - self.o("* * *") - self.p() - - if tag in ["head", "style", 'script']: - if start: self.quiet += 1 - else: self.quiet -= 1 - - if tag == "style": - if start: self.style += 1 - else: self.style -= 1 - - if tag in ["body"]: - self.quiet = 0 # sites like 9rules.com never close - - if tag == "blockquote": - if start: - self.p(); self.o('> ', 0, 1); self.start = 1 - self.blockquote += 1 - else: - self.blockquote -= 1 - self.p() - - if tag in ['em', 'i', 'u'] and not self.ignore_emphasis: self.o(self.emphasis_mark) - if tag in ['strong', 'b'] and not self.ignore_emphasis: self.o(self.strong_mark) - if tag in ['del', 'strike', 's']: - if start: - self.o("<"+tag+">") - else: - self.o("") - - if self.google_doc: - if not self.inheader: - # handle some font attributes, but leave headers clean - self.handle_emphasis(start, tag_style, parent_style) - - if tag in ["code", "tt"] and not self.pre: self.o('`') #TODO: `` `this` `` - if tag == "abbr": - if start: - self.abbr_title = None - self.abbr_data = '' - if has_key(attrs, 'title'): - self.abbr_title = attrs['title'] - else: - if self.abbr_title != None: - self.abbr_list[self.abbr_data] = self.abbr_title - self.abbr_title = None - self.abbr_data = '' - - if tag == "a" and not self.ignore_links: - if 
start: - if has_key(attrs, 'href') and not (self.skip_internal_links and attrs['href'].startswith('#')): - self.astack.append(attrs) - self.maybe_automatic_link = attrs['href'] - else: - self.astack.append(None) - else: - if self.astack: - a = self.astack.pop() - if self.maybe_automatic_link: - self.maybe_automatic_link = None - elif a: - if self.inline_links: - self.o("](" + escape_md(a['href']) + ")") - else: - i = self.previousIndex(a) - if i is not None: - a = self.a[i] - else: - self.acount += 1 - a['count'] = self.acount - a['outcount'] = self.outcount - self.a.append(a) - self.o("][" + str(a['count']) + "]") - - if tag == "img" and start and not self.ignore_images: - if has_key(attrs, 'src'): - attrs['href'] = attrs['src'] - alt = attrs.get('alt', '') - self.o("![" + escape_md(alt) + "]") - - if self.inline_links: - self.o("(" + escape_md(attrs['href']) + ")") - else: - i = self.previousIndex(attrs) - if i is not None: - attrs = self.a[i] - else: - self.acount += 1 - attrs['count'] = self.acount - attrs['outcount'] = self.outcount - self.a.append(attrs) - self.o("[" + str(attrs['count']) + "]") - - if tag == 'dl' and start: self.p() - if tag == 'dt' and not start: self.pbr() - if tag == 'dd' and start: self.o(' ') - if tag == 'dd' and not start: self.pbr() - - if tag in ["ol", "ul"]: - # Google Docs create sub lists as top level lists - if (not self.list) and (not self.lastWasList): - self.p() - if start: - if self.google_doc: - list_style = google_list_style(tag_style) - else: - list_style = tag - numbering_start = list_numbering_start(attrs) - self.list.append({'name':list_style, 'num':numbering_start}) - else: - if self.list: self.list.pop() - self.lastWasList = True - else: - self.lastWasList = False - - if tag == 'li': - self.pbr() - if start: - if self.list: li = self.list[-1] - else: li = {'name':'ul', 'num':0} - if self.google_doc: - nest_count = self.google_nest_count(tag_style) - else: - nest_count = len(self.list) - self.o(" " * nest_count) #TODO: 
line up

      1. s > 9 correctly. - if li['name'] == "ul": self.o(self.ul_item_mark + " ") - elif li['name'] == "ol": - li['num'] += 1 - self.o(str(li['num'])+". ") - self.start = 1 - - if tag in ["table", "tr"] and start: self.p() - if tag == 'td': self.pbr() - - if tag == "pre": - if start: - self.startpre = 1 - self.pre = 1 - else: - self.pre = 0 - self.p() - - def pbr(self): - if self.p_p == 0: - self.p_p = 1 - - def p(self): - self.p_p = 2 - - def soft_br(self): - self.pbr() - self.br_toggle = ' ' - - def o(self, data, puredata=0, force=0): - if self.abbr_data is not None: - self.abbr_data += data - - if not self.quiet: - if self.google_doc: - # prevent white space immediately after 'begin emphasis' marks ('**' and '_') - lstripped_data = data.lstrip() - if self.drop_white_space and not (self.pre or self.code): - data = lstripped_data - if lstripped_data != '': - self.drop_white_space = 0 - - if puredata and not self.pre: - data = re.sub('\s+', ' ', data) - if data and data[0] == ' ': - self.space = 1 - data = data[1:] - if not data and not force: return - - if self.startpre: - #self.out(" :") #TODO: not output when already one there - if not data.startswith("\n"): #
        stuff...
        -                    data = "\n" + data
        -
        -            bq = (">" * self.blockquote)
        -            if not (force and data and data[0] == ">") and self.blockquote: bq += " "
        -
        -            if self.pre:
        -                if not self.list:
        -                    bq += "    "
        -                #else: list content is already partially indented
        -                for i in xrange(len(self.list)):
        -                    bq += "    "
        -                data = data.replace("\n", "\n"+bq)
        -
        -            if self.startpre:
        -                self.startpre = 0
        -                if self.list:
        -                    data = data.lstrip("\n") # use existing initial indentation
        -
        -            if self.start:
        -                self.space = 0
        -                self.p_p = 0
        -                self.start = 0
        -
        -            if force == 'end':
        -                # It's the end.
        -                self.p_p = 0
        -                self.out("\n")
        -                self.space = 0
        -
        -            if self.p_p:
        -                self.out((self.br_toggle+'\n'+bq)*self.p_p)
        -                self.space = 0
        -                self.br_toggle = ''
        -
        -            if self.space:
        -                if not self.lastWasNL: self.out(' ')
        -                self.space = 0
        -
        -            if self.a and ((self.p_p == 2 and self.links_each_paragraph) or force == "end"):
        -                if force == "end": self.out("\n")
        -
        -                newa = []
        -                for link in self.a:
        -                    if self.outcount > link['outcount']:
        -                        self.out("   ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
        -                        if has_key(link, 'title'): self.out(" ("+link['title']+")")
        -                        self.out("\n")
        -                    else:
        -                        newa.append(link)
        -
        -                if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
        -
        -                self.a = newa
        -
        -            if self.abbr_list and force == "end":
        -                for abbr, definition in self.abbr_list.items():
        -                    self.out("  *[" + abbr + "]: " + definition + "\n")
        -
        -            self.p_p = 0
        -            self.out(data)
        -            self.outcount += 1
        -
        -    def handle_data(self, data):
        -        if r'\/script>' in data: self.quiet -= 1
        -
        -        if self.style:
        -            self.style_def.update(dumb_css_parser(data))
        -
        -        if not self.maybe_automatic_link is None:
        -            href = self.maybe_automatic_link
        -            if href == data and self.absolute_url_matcher.match(href):
        -                self.o("<" + data + ">")
        -                return
        -            else:
        -                self.o("[")
        -                self.maybe_automatic_link = None
        -
        -        if not self.code and not self.pre:
        -            data = escape_md_section(data, snob=self.escape_snob)
        -        self.o(data, 1)
        -
        -    def unknown_decl(self, data): pass
        -
        -    def charref(self, name):
        -        if name[0] in ['x','X']:
        -            c = int(name[1:], 16)
        -        else:
        -            c = int(name)
        -
        -        if not self.unicode_snob and c in unifiable_n.keys():
        -            return unifiable_n[c]
        -        else:
        -            try:
        -                return unichr(c)
        -            except NameError: #Python3
        -                return chr(c)
        -
        -    def entityref(self, c):
        -        if not self.unicode_snob and c in unifiable.keys():
        -            return unifiable[c]
        -        else:
        -            try: name2cp(c)
        -            except KeyError: return "&" + c + ';'
        -            else:
        -                try:
        -                    return unichr(name2cp(c))
        -                except NameError: #Python3
        -                    return chr(name2cp(c))
        -
        -    def replaceEntities(self, s):
        -        s = s.group(1)
        -        if s[0] == "#":
        -            return self.charref(s[1:])
        -        else: return self.entityref(s)
        -
        -    r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
        -    def unescape(self, s):
        -        return self.r_unescape.sub(self.replaceEntities, s)
        -
        -    def google_nest_count(self, style):
        -        """calculate the nesting count of google doc lists"""
        -        nest_count = 0
        -        if 'margin-left' in style:
        -            nest_count = int(style['margin-left'][:-2]) / self.google_list_indent
        -        return nest_count
        -
        -
        -    def optwrap(self, text):
        -        """Wrap all paragraphs in the provided text."""
        -        if not self.body_width:
        -            return text
        -
        -        assert wrap, "Requires Python 2.3."
        -        result = ''
        -        newlines = 0
        -        for para in text.split("\n"):
        -            if len(para) > 0:
        -                if not skipwrap(para):
        -                    result += "\n".join(wrap(para, self.body_width))
        -                    if para.endswith('  '):
        -                        result += "  \n"
        -                        newlines = 1
        -                    else:
        -                        result += "\n\n"
        -                        newlines = 2
        -                else:
        -                    if not onlywhite(para):
        -                        result += para + "\n"
        -                        newlines = 1
        -            else:
        -                if newlines < 2:
        -                    result += "\n"
        -                    newlines += 1
        -        return result
        -
        -ordered_list_matcher = re.compile(r'\d+\.\s')
        -unordered_list_matcher = re.compile(r'[-\*\+]\s')
        -md_chars_matcher = re.compile(r"([\\\[\]\(\)])")
        -md_chars_matcher_all = re.compile(r"([`\*_{}\[\]\(\)#!])")
        -md_dot_matcher = re.compile(r"""
        -    ^             # start of line
        -    (\s*\d+)      # optional whitespace and a number
        -    (\.)          # dot
        -    (?=\s)        # lookahead assert whitespace
        -    """, re.MULTILINE | re.VERBOSE)
        -md_plus_matcher = re.compile(r"""
        -    ^
        -    (\s*)
        -    (\+)
        -    (?=\s)
        -    """, flags=re.MULTILINE | re.VERBOSE)
        -md_dash_matcher = re.compile(r"""
        -    ^
        -    (\s*)
        -    (-)
        -    (?=\s|\-)     # followed by whitespace (bullet list, or spaced out hr)
        -                  # or another dash (header or hr)
        -    """, flags=re.MULTILINE | re.VERBOSE)
        -slash_chars = r'\`*_{}[]()#+-.!'
        -md_backslash_matcher = re.compile(r'''
        -    (\\)          # match one slash
        -    (?=[%s])      # followed by a char that requires escaping
        -    ''' % re.escape(slash_chars),
        -    flags=re.VERBOSE)
        -
        -def skipwrap(para):
        -    # If the text begins with four spaces or one tab, it's a code block; don't wrap
        -    if para[0:4] == '    ' or para[0] == '\t':
        -        return True
        -    # If the text begins with only two "--", possibly preceded by whitespace, that's
        -    # an emdash; so wrap.
        -    stripped = para.lstrip()
        -    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        -        return False
        -    # I'm not sure what this is for; I thought it was to detect lists, but there's
        -    # a 
        -inside- case in one of the tests that also depends upon it. - if stripped[0:1] == '-' or stripped[0:1] == '*': - return True - # If the text begins with a single -, *, or +, followed by a space, or an integer, - # followed by a ., followed by a space (in either case optionally preceeded by - # whitespace), it's a list; don't wrap. - if ordered_list_matcher.match(stripped) or unordered_list_matcher.match(stripped): - return True - return False - -def wrapwrite(text): - text = text.encode('utf-8') - try: #Python3 - sys.stdout.buffer.write(text) - except AttributeError: - sys.stdout.write(text) - -def html2text(html, baseurl=''): - h = HTML2Text(baseurl=baseurl) - return h.handle(html) - -def unescape(s, unicode_snob=False): - h = HTML2Text() - h.unicode_snob = unicode_snob - return h.unescape(s) - -def escape_md(text): - """Escapes markdown-sensitive characters within other markdown constructs.""" - return md_chars_matcher.sub(r"\\\1", text) - -def escape_md_section(text, snob=False): - """Escapes markdown-sensitive characters across whole document sections.""" - text = md_backslash_matcher.sub(r"\\\1", text) - if snob: - text = md_chars_matcher_all.sub(r"\\\1", text) - text = md_dot_matcher.sub(r"\1\\\2", text) - text = md_plus_matcher.sub(r"\1\\\2", text) - text = md_dash_matcher.sub(r"\1\\\2", text) - return text - - -def main(): - baseurl = '' - - p = optparse.OptionParser('%prog [(filename|url) [encoding]]', - version='%prog ' + __version__) - p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true", - default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis") - p.add_option("--ignore-links", dest="ignore_links", action="store_true", - default=IGNORE_ANCHORS, help="don't include any formatting for links") - p.add_option("--ignore-images", dest="ignore_images", action="store_true", - default=IGNORE_IMAGES, help="don't include any formatting for images") - p.add_option("-g", "--google-doc", action="store_true", 
dest="google_doc", - default=False, help="convert an html-exported Google Document") - p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash", - default=False, help="use a dash rather than a star for unordered list items") - p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk", - default=False, help="use an asterisk rather than an underscore for emphasized text") - p.add_option("-b", "--body-width", dest="body_width", action="store", type="int", - default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap") - p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int", - default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists") - p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough", - default=False, help="hide strike-through text. only relevant when -g is specified as well") - p.add_option("--escape-all", action="store_true", dest="escape_snob", - default=False, help="Escape all special characters. 
Output is less readable, but avoids corner case formatting issues.") - (options, args) = p.parse_args() - - # process input - encoding = "utf-8" - if len(args) > 0: - file_ = args[0] - if len(args) == 2: - encoding = args[1] - if len(args) > 2: - p.error('Too many arguments') - - if file_.startswith('http://') or file_.startswith('https://'): - baseurl = file_ - j = urllib.urlopen(baseurl) - data = j.read() - if encoding is None: - try: - from feedparser import _getCharacterEncoding as enc - except ImportError: - enc = lambda x, y: ('utf-8', 1) - encoding = enc(j.headers, data)[0] - if encoding == 'us-ascii': - encoding = 'utf-8' - else: - data = open(file_, 'rb').read() - if encoding is None: - try: - from chardet import detect - except ImportError: - detect = lambda x: {'encoding': 'utf-8'} - encoding = detect(data)['encoding'] - else: - data = sys.stdin.read() - - data = data.decode(encoding) - h = HTML2Text(baseurl=baseurl) - # handle options - if options.ul_style_dash: h.ul_item_mark = '-' - if options.em_style_asterisk: - h.emphasis_mark = '*' - h.strong_mark = '__' - - h.body_width = options.body_width - h.list_indent = options.list_indent - h.ignore_emphasis = options.ignore_emphasis - h.ignore_links = options.ignore_links - h.ignore_images = options.ignore_images - h.google_doc = options.google_doc - h.hide_strikethrough = options.hide_strikethrough - h.escape_snob = options.escape_snob - - wrapwrite(h.handle(data)) - - -if __name__ == "__main__": - main() diff --git a/html2text/__init__.py b/html2text/__init__.py new file mode 100644 index 00000000..7e1a279b --- /dev/null +++ b/html2text/__init__.py @@ -0,0 +1,997 @@ +"""html2text: Turn HTML into equivalent Markdown-structured text.""" + +import html.entities +import html.parser +import re +import string +import urllib.parse as urlparse +from textwrap import wrap +from typing import Dict, List, Optional, Tuple, Union + +from . 
import config +from .elements import AnchorElement, ListElement +from .typing import OutCallback +from .utils import ( + dumb_css_parser, + element_style, + escape_md, + escape_md_section, + google_fixed_width_font, + google_has_height, + google_list_style, + google_text_emphasis, + hn, + list_numbering_start, + pad_tables_in_text, + skipwrap, + unifiable_n, +) + +__version__ = (2020, 1, 16) + + +# TODO: +# Support decoded entities with UNIFIABLE. + + +class HTML2Text(html.parser.HTMLParser): + def __init__( + self, + out: Optional[OutCallback] = None, + baseurl: str = "", + bodywidth: int = config.BODY_WIDTH, + ) -> None: + """ + Input parameters: + out: possible custom replacement for self.outtextf (which + appends lines of text). + baseurl: base URL of the document we process + """ + super().__init__(convert_charrefs=False) + + # Config options + self.split_next_td = False + self.td_count = 0 + self.table_start = False + self.unicode_snob = config.UNICODE_SNOB # covered in cli + self.escape_snob = config.ESCAPE_SNOB # covered in cli + self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH + self.body_width = bodywidth # covered in cli + self.skip_internal_links = config.SKIP_INTERNAL_LINKS # covered in cli + self.inline_links = config.INLINE_LINKS # covered in cli + self.protect_links = config.PROTECT_LINKS # covered in cli + self.google_list_indent = config.GOOGLE_LIST_INDENT # covered in cli + self.ignore_links = config.IGNORE_ANCHORS # covered in cli + self.ignore_mailto_links = config.IGNORE_MAILTO_LINKS # covered in cli + self.ignore_images = config.IGNORE_IMAGES # covered in cli + self.images_as_html = config.IMAGES_AS_HTML # covered in cli + self.images_to_alt = config.IMAGES_TO_ALT # covered in cli + self.images_with_size = config.IMAGES_WITH_SIZE # covered in cli + self.ignore_emphasis = config.IGNORE_EMPHASIS # covered in cli + self.bypass_tables = config.BYPASS_TABLES # covered in cli + self.ignore_tables = config.IGNORE_TABLES # covered in cli + 
self.google_doc = False # covered in cli + self.ul_item_mark = "*" # covered in cli + self.emphasis_mark = "_" # covered in cli + self.strong_mark = "**" + self.single_line_break = config.SINGLE_LINE_BREAK # covered in cli + self.use_automatic_links = config.USE_AUTOMATIC_LINKS # covered in cli + self.hide_strikethrough = False # covered in cli + self.mark_code = config.MARK_CODE + self.wrap_list_items = config.WRAP_LIST_ITEMS # covered in cli + self.wrap_links = config.WRAP_LINKS # covered in cli + self.wrap_tables = config.WRAP_TABLES + self.pad_tables = config.PAD_TABLES # covered in cli + self.default_image_alt = config.DEFAULT_IMAGE_ALT # covered in cli + self.tag_callback = None + self.open_quote = config.OPEN_QUOTE # covered in cli + self.close_quote = config.CLOSE_QUOTE # covered in cli + + if out is None: + self.out = self.outtextf + else: + self.out = out + + # empty list to store output characters before they are "joined" + self.outtextlist = [] # type: List[str] + + self.quiet = 0 + self.p_p = 0 # number of newline character to print before next output + self.outcount = 0 + self.start = True + self.space = False + self.a = [] # type: List[AnchorElement] + self.astack = [] # type: List[Optional[Dict[str, Optional[str]]]] + self.maybe_automatic_link = None # type: Optional[str] + self.empty_link = False + self.absolute_url_matcher = re.compile(r"^[a-zA-Z+]+://") + self.acount = 0 + self.list = [] # type: List[ListElement] + self.blockquote = 0 + self.pre = False + self.startpre = False + self.code = False + self.quote = False + self.br_toggle = "" + self.lastWasNL = False + self.lastWasList = False + self.style = 0 + self.style_def = {} # type: Dict[str, Dict[str, str]] + self.tag_stack = ( + [] + ) # type: List[Tuple[str, Dict[str, Optional[str]], Dict[str, str]]] + self.emphasis = 0 + self.drop_white_space = 0 + self.inheader = False + # Current abbreviation definition + self.abbr_title = None # type: Optional[str] + # Last inner HTML (for abbr being 
defined) + self.abbr_data = None # type: Optional[str] + # Stack of abbreviations to write later + self.abbr_list = {} # type: Dict[str, str] + self.baseurl = baseurl + self.stressed = False + self.preceding_stressed = False + self.preceding_data = "" + self.current_tag = "" + + config.UNIFIABLE["nbsp"] = " _place_holder;" + + def feed(self, data: str) -> None: + data = data.replace("", "") + super().feed(data) + + def handle(self, data: str) -> str: + self.feed(data) + self.feed("") + markdown = self.optwrap(self.finish()) + if self.pad_tables: + return pad_tables_in_text(markdown) + else: + return markdown + + def outtextf(self, s: str) -> None: + self.outtextlist.append(s) + if s: + self.lastWasNL = s[-1] == "\n" + + def finish(self) -> str: + self.close() + + self.pbr() + self.o("", force="end") + + outtext = "".join(self.outtextlist) + + if self.unicode_snob: + nbsp = html.entities.html5["nbsp;"] + else: + nbsp = " " + outtext = outtext.replace(" _place_holder;", nbsp) + + # Clear self.outtextlist to avoid memory leak of its content to + # the next handling. + self.outtextlist = [] + + return outtext + + def handle_charref(self, c: str) -> None: + self.handle_data(self.charref(c), True) + + def handle_entityref(self, c: str) -> None: + ref = self.entityref(c) + + # ref may be an empty string (e.g. for ‎/‏ markers that should + # not contribute to the final output). + # self.handle_data cannot handle a zero-length string right after a + # stressed tag or mid-text within a stressed tag (text get split and + # self.stressed/self.preceding_stressed gets switched after the first + # part of that text). 
+ if ref: + self.handle_data(ref, True) + + def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None: + self.handle_tag(tag, dict(attrs), start=True) + + def handle_endtag(self, tag: str) -> None: + self.handle_tag(tag, {}, start=False) + + def previousIndex(self, attrs: Dict[str, Optional[str]]) -> Optional[int]: + """ + :type attrs: dict + + :returns: The index of certain set of attributes (of a link) in the + self.a list. If the set of attributes is not found, returns None + :rtype: int + """ + if "href" not in attrs: + return None + + match = False + for i, a in enumerate(self.a): + if "href" in a.attrs and a.attrs["href"] == attrs["href"]: + if "title" in a.attrs or "title" in attrs: + if ( + "title" in a.attrs + and "title" in attrs + and a.attrs["title"] == attrs["title"] + ): + match = True + else: + match = True + + if match: + return i + return None + + def handle_emphasis( + self, start: bool, tag_style: Dict[str, str], parent_style: Dict[str, str] + ) -> None: + """ + Handles various text emphases + """ + tag_emphasis = google_text_emphasis(tag_style) + parent_emphasis = google_text_emphasis(parent_style) + + # handle Google's text emphasis + strikethrough = "line-through" in tag_emphasis and self.hide_strikethrough + + # google and others may mark a font's weight as `bold` or `700` + bold = False + for bold_marker in config.BOLD_TEXT_STYLE_VALUES: + bold = bold_marker in tag_emphasis and bold_marker not in parent_emphasis + if bold: + break + + italic = "italic" in tag_emphasis and "italic" not in parent_emphasis + fixed = ( + google_fixed_width_font(tag_style) + and not google_fixed_width_font(parent_style) + and not self.pre + ) + + if start: + # crossed-out text must be handled before other attributes + # in order not to output qualifiers unnecessarily + if bold or italic or fixed: + self.emphasis += 1 + if strikethrough: + self.quiet += 1 + if italic: + self.o(self.emphasis_mark) + self.drop_white_space += 1 + if bold: + 
self.o(self.strong_mark) + self.drop_white_space += 1 + if fixed: + self.o("`") + self.drop_white_space += 1 + self.code = True + else: + if bold or italic or fixed: + # there must not be whitespace before closing emphasis mark + self.emphasis -= 1 + self.space = False + if fixed: + if self.drop_white_space: + # empty emphasis, drop it + self.drop_white_space -= 1 + else: + self.o("`") + self.code = False + if bold: + if self.drop_white_space: + # empty emphasis, drop it + self.drop_white_space -= 1 + else: + self.o(self.strong_mark) + if italic: + if self.drop_white_space: + # empty emphasis, drop it + self.drop_white_space -= 1 + else: + self.o(self.emphasis_mark) + # space is only allowed after *all* emphasis marks + if (bold or italic) and not self.emphasis: + self.o(" ") + if strikethrough: + self.quiet -= 1 + + def handle_tag( + self, tag: str, attrs: Dict[str, Optional[str]], start: bool + ) -> None: + self.current_tag = tag + + if self.tag_callback is not None: + if self.tag_callback(self, tag, attrs, start) is True: + return + + # first thing inside the anchor tag is another tag + # that produces some output + if ( + start + and self.maybe_automatic_link is not None + and tag not in ["p", "div", "style", "dl", "dt"] + and (tag != "img" or self.ignore_images) + ): + self.o("[") + self.maybe_automatic_link = None + self.empty_link = False + + if self.google_doc: + # the attrs parameter is empty for a closing tag. in addition, we + # need the attributes of the parent nodes in order to get a + # complete style description for the current element. we assume + # that google docs export well formed html. 
+ parent_style = {} # type: Dict[str, str] + if start: + if self.tag_stack: + parent_style = self.tag_stack[-1][2] + tag_style = element_style(attrs, self.style_def, parent_style) + self.tag_stack.append((tag, attrs, tag_style)) + else: + dummy, attrs, tag_style = ( + self.tag_stack.pop() if self.tag_stack else (None, {}, {}) + ) + if self.tag_stack: + parent_style = self.tag_stack[-1][2] + + if hn(tag): + # check if nh is inside of an 'a' tag (incorrect but found in the wild) + if self.astack: + if start: + self.inheader = True + # are inside link name, so only add '#' if it can appear before '[' + if self.outtextlist and self.outtextlist[-1] == "[": + self.outtextlist.pop() + self.space = False + self.o(hn(tag) * "#" + " ") + self.o("[") + else: + self.p_p = 0 # don't break up link name + self.inheader = False + return # prevent redundant emphasis marks on headers + else: + self.p() + if start: + self.inheader = True + self.o(hn(tag) * "#" + " ") + else: + self.inheader = False + return # prevent redundant emphasis marks on headers + + if tag in ["p", "div"]: + if self.google_doc: + if start and google_has_height(tag_style): + self.p() + else: + self.soft_br() + elif self.astack: + pass + else: + self.p() + + if tag == "br" and start: + if self.blockquote > 0: + self.o(" \n> ") + else: + self.o(" \n") + + if tag == "hr" and start: + self.p() + self.o("* * *") + self.p() + + if tag in ["head", "style", "script"]: + if start: + self.quiet += 1 + else: + self.quiet -= 1 + + if tag == "style": + if start: + self.style += 1 + else: + self.style -= 1 + + if tag in ["body"]: + self.quiet = 0 # sites like 9rules.com never close + + if tag == "blockquote": + if start: + self.p() + self.o("> ", force=True) + self.start = True + self.blockquote += 1 + else: + self.blockquote -= 1 + self.p() + + if tag in ["em", "i", "u"] and not self.ignore_emphasis: + # Separate with a space if we immediately follow an alphanumeric + # character, since otherwise Markdown won't render the 
emphasis + # marks, and we'll be left with eg 'foo_bar_' visible. + # (Don't add a space otherwise, though, since there isn't one in the + # original HTML.) + if ( + start + and self.preceding_data + and self.preceding_data[-1] not in string.whitespace + and self.preceding_data[-1] not in string.punctuation + ): + emphasis = " " + self.emphasis_mark + self.preceding_data += " " + else: + emphasis = self.emphasis_mark + + self.o(emphasis) + if start: + self.stressed = True + + if tag in ["strong", "b"] and not self.ignore_emphasis: + # Separate with space if we immediately follow an * character, since + # without it, Markdown won't render the resulting *** correctly. + # (Don't add a space otherwise, though, since there isn't one in the + # original HTML.) + if ( + start + and self.preceding_data + and self.preceding_data[-1] == self.strong_mark[0] + ): + strong = " " + self.strong_mark + self.preceding_data += " " + else: + strong = self.strong_mark + + self.o(strong) + if start: + self.stressed = True + + if tag in ["del", "strike", "s"]: + if start and self.preceding_data and self.preceding_data[-1] == "~": + strike = " ~~" + self.preceding_data += " " + else: + strike = "~~" + + self.o(strike) + if start: + self.stressed = True + + if self.google_doc: + if not self.inheader: + # handle some font attributes, but leave headers clean + self.handle_emphasis(start, tag_style, parent_style) + + if tag in ["kbd", "code", "tt"] and not self.pre: + self.o("`") # TODO: `` `this` `` + self.code = not self.code + + if tag == "abbr": + if start: + self.abbr_title = None + self.abbr_data = "" + if "title" in attrs: + self.abbr_title = attrs["title"] + else: + if self.abbr_title is not None: + assert self.abbr_data is not None + self.abbr_list[self.abbr_data] = self.abbr_title + self.abbr_title = None + self.abbr_data = None + + if tag == "q": + if not self.quote: + self.o(self.open_quote) + else: + self.o(self.close_quote) + self.quote = not self.quote + + def link_url(self: 
HTML2Text, link: str, title: str = "") -> None: + url = urlparse.urljoin(self.baseurl, link) + title = ' "{}"'.format(title) if title.strip() else "" + self.o("]({url}{title})".format(url=escape_md(url), title=title)) + + if tag == "a" and not self.ignore_links: + if start: + if ( + "href" in attrs + and attrs["href"] is not None + and not (self.skip_internal_links and attrs["href"].startswith("#")) + and not ( + self.ignore_mailto_links and attrs["href"].startswith("mailto:") + ) + ): + self.astack.append(attrs) + self.maybe_automatic_link = attrs["href"] + self.empty_link = True + if self.protect_links: + attrs["href"] = "<" + attrs["href"] + ">" + else: + self.astack.append(None) + else: + if self.astack: + a = self.astack.pop() + if self.maybe_automatic_link and not self.empty_link: + self.maybe_automatic_link = None + elif a: + assert a["href"] is not None + if self.empty_link: + self.o("[") + self.empty_link = False + self.maybe_automatic_link = None + if self.inline_links: + self.p_p = 0 + title = a.get("title") or "" + title = escape_md(title) + link_url(self, a["href"], title) + else: + i = self.previousIndex(a) + if i is not None: + a_props = self.a[i] + else: + self.acount += 1 + a_props = AnchorElement(a, self.acount, self.outcount) + self.a.append(a_props) + self.o("][" + str(a_props.count) + "]") + + if tag == "img" and start and not self.ignore_images: + if "src" in attrs: + assert attrs["src"] is not None + if not self.images_to_alt: + attrs["href"] = attrs["src"] + alt = attrs.get("alt") or self.default_image_alt + + # If we have images_with_size, write raw html including width, + # height, and alt attributes + if self.images_as_html or ( + self.images_with_size and ("width" in attrs or "height" in attrs) + ): + self.o("") + return + + # If we have a link to create, output the start + if self.maybe_automatic_link is not None: + href = self.maybe_automatic_link + if ( + self.images_to_alt + and escape_md(alt) == href + and 
self.absolute_url_matcher.match(href) + ): + self.o("<" + escape_md(alt) + ">") + self.empty_link = False + return + else: + self.o("[") + self.maybe_automatic_link = None + self.empty_link = False + + # If we have images_to_alt, we discard the image itself, + # considering only the alt text. + if self.images_to_alt: + self.o(escape_md(alt)) + else: + self.o("![" + escape_md(alt) + "]") + if self.inline_links: + href = attrs.get("href") or "" + self.o( + "(" + escape_md(urlparse.urljoin(self.baseurl, href)) + ")" + ) + else: + i = self.previousIndex(attrs) + if i is not None: + a_props = self.a[i] + else: + self.acount += 1 + a_props = AnchorElement(attrs, self.acount, self.outcount) + self.a.append(a_props) + self.o("[" + str(a_props.count) + "]") + + if tag == "dl" and start: + self.p() + if tag == "dt" and not start: + self.pbr() + if tag == "dd" and start: + self.o(" ") + if tag == "dd" and not start: + self.pbr() + + if tag in ["ol", "ul"]: + # Google Docs create sub lists as top level lists + if not self.list and not self.lastWasList: + self.p() + if start: + if self.google_doc: + list_style = google_list_style(tag_style) + else: + list_style = tag + numbering_start = list_numbering_start(attrs) + self.list.append(ListElement(list_style, numbering_start)) + else: + if self.list: + self.list.pop() + if not self.google_doc and not self.list: + self.o("\n") + self.lastWasList = True + else: + self.lastWasList = False + + if tag == "li": + self.pbr() + if start: + if self.list: + li = self.list[-1] + else: + li = ListElement("ul", 0) + if self.google_doc: + self.o(" " * self.google_nest_count(tag_style)) + else: + # Indent two spaces per list, except use three spaces for an + # unordered list inside an ordered list. + # https://spec.commonmark.org/0.28/#motivation + # TODO: line up
        1. s > 9 correctly. + parent_list = None + for list in self.list: + self.o( + " " if parent_list == "ol" and list.name == "ul" else " " + ) + parent_list = list.name + + if li.name == "ul": + self.o(self.ul_item_mark + " ") + elif li.name == "ol": + li.num += 1 + self.o(str(li.num) + ". ") + self.start = True + + if tag in ["table", "tr", "td", "th"]: + if self.ignore_tables: + if tag == "tr": + if start: + pass + else: + self.soft_br() + else: + pass + + elif self.bypass_tables: + if start: + self.soft_br() + if tag in ["td", "th"]: + if start: + self.o("<{}>\n\n".format(tag)) + else: + self.o("\n".format(tag)) + else: + if start: + self.o("<{}>".format(tag)) + else: + self.o("".format(tag)) + + else: + if tag == "table": + if start: + self.table_start = True + if self.pad_tables: + self.o("<" + config.TABLE_MARKER_FOR_PAD + ">") + self.o(" \n") + else: + if self.pad_tables: + # add break in case the table is empty or its 1 row table + self.soft_br() + self.o("") + self.o(" \n") + if tag in ["td", "th"] and start: + if self.split_next_td: + self.o("| ") + self.split_next_td = True + + if tag == "tr" and start: + self.td_count = 0 + if tag == "tr" and not start: + self.split_next_td = False + self.soft_br() + if tag == "tr" and not start and self.table_start: + # Underline table header + self.o("|".join(["---"] * self.td_count)) + self.soft_br() + self.table_start = False + if tag in ["td", "th"] and start: + self.td_count += 1 + + if tag == "pre": + if start: + self.startpre = True + self.pre = True + else: + self.pre = False + if self.mark_code: + self.out("\n[/code]") + self.p() + + # TODO: Add docstring for these one letter functions + def pbr(self) -> None: + "Pretty print has a line break" + if self.p_p == 0: + self.p_p = 1 + + def p(self) -> None: + "Set pretty print to 1 or 2 lines" + self.p_p = 1 if self.single_line_break else 2 + + def soft_br(self) -> None: + "Soft breaks" + self.pbr() + self.br_toggle = " " + + def o( + self, data: str, 
puredata: bool = False, force: Union[bool, str] = False + ) -> None: + """ + Deal with indentation and whitespace + """ + if self.abbr_data is not None: + self.abbr_data += data + + if not self.quiet: + if self.google_doc: + # prevent white space immediately after 'begin emphasis' + # marks ('**' and '_') + lstripped_data = data.lstrip() + if self.drop_white_space and not (self.pre or self.code): + data = lstripped_data + if lstripped_data != "": + self.drop_white_space = 0 + + if puredata and not self.pre: + # This is a very dangerous call ... it could mess up + # all handling of   when not handled properly + # (see entityref) + data = re.sub(r"\s+", r" ", data) + if data and data[0] == " ": + self.space = True + data = data[1:] + if not data and not force: + return + + if self.startpre: + # self.out(" :") #TODO: not output when already one there + if not data.startswith("\n") and not data.startswith("\r\n"): + #
          stuff...
          +                    data = "\n" + data
          +                if self.mark_code:
          +                    self.out("\n[code]")
          +                    self.p_p = 0
          +
          +            bq = ">" * self.blockquote
          +            if not (force and data and data[0] == ">") and self.blockquote:
          +                bq += " "
          +
          +            if self.pre:
          +                if not self.list:
          +                    bq += "    "
          +                # else: list content is already partially indented
          +                bq += "    " * len(self.list)
          +                data = data.replace("\n", "\n" + bq)
          +
          +            if self.startpre:
          +                self.startpre = False
          +                if self.list:
          +                    # use existing initial indentation
          +                    data = data.lstrip("\n")
          +
          +            if self.start:
          +                self.space = False
          +                self.p_p = 0
          +                self.start = False
          +
          +            if force == "end":
          +                # It's the end.
          +                self.p_p = 0
          +                self.out("\n")
          +                self.space = False
          +
          +            if self.p_p:
          +                self.out((self.br_toggle + "\n" + bq) * self.p_p)
          +                self.space = False
          +                self.br_toggle = ""
          +
          +            if self.space:
          +                if not self.lastWasNL:
          +                    self.out(" ")
          +                self.space = False
          +
          +            if self.a and (
          +                (self.p_p == 2 and self.links_each_paragraph) or force == "end"
          +            ):
          +                if force == "end":
          +                    self.out("\n")
          +
          +                newa = []
          +                for link in self.a:
          +                    if self.outcount > link.outcount:
          +                        self.out(
          +                            "   ["
          +                            + str(link.count)
          +                            + "]: "
          +                            + urlparse.urljoin(self.baseurl, link.attrs["href"])
          +                        )
          +                        if "title" in link.attrs:
          +                            assert link.attrs["title"] is not None
          +                            self.out(" (" + link.attrs["title"] + ")")
          +                        self.out("\n")
          +                    else:
          +                        newa.append(link)
          +
          +                # Don't need an extra line when nothing was done.
          +                if self.a != newa:
          +                    self.out("\n")
          +
          +                self.a = newa
          +
          +            if self.abbr_list and force == "end":
          +                for abbr, definition in self.abbr_list.items():
          +                    self.out("  *[" + abbr + "]: " + definition + "\n")
          +
          +            self.p_p = 0
          +            self.out(data)
          +            self.outcount += 1
          +
          +    def handle_data(self, data: str, entity_char: bool = False) -> None:
          +        """
          +        Process a run of character data and hand it to ``self.o``.
          +
          +        :param data: the text to emit; an empty string is ignored
          +        :param entity_char: True when ``data`` was produced from a character
          +            or entity reference, in which case it is not Markdown-escaped
          +        """
          +        if not data:
          +            # Data may be empty for some HTML entities. For example,
          +            # LEFT-TO-RIGHT MARK.
          +            return
          +
          +        if self.stressed:
          +            # First text after an opening emphasis/strong/strike tag
          +            # (handle_tag sets self.stressed): strip surrounding whitespace.
          +            data = data.strip()
          +            self.stressed = False
          +            self.preceding_stressed = True
          +        elif self.preceding_stressed:
          +            # Text that follows stressed text: separate it with a space
          +            # unless it starts with whitespace, a bracket or . ! ?, or the
          +            # current tag is a header, "a", "code" or "pre".
          +            if (
          +                re.match(r"[^][(){}\s.!?]", data[0])
          +                and not hn(self.current_tag)
          +                and self.current_tag not in ["a", "code", "pre"]
          +            ):
          +                # should match a letter or common punctuation
          +                data = " " + data
          +            self.preceding_stressed = False
          +
          +        if self.style:
          +            # self.style is non-zero while inside a <style> tag: merge the
          +            # result of dumb_css_parser into the known style definitions.
          +            self.style_def.update(dumb_css_parser(data))
          +
          +        if self.maybe_automatic_link is not None:
          +            href = self.maybe_automatic_link
          +            if (
          +                href == data
          +                and self.absolute_url_matcher.match(href)
          +                and self.use_automatic_links
          +            ):
          +                # Link text equals an absolute href: emit the <url> form and
          +                # stop, nothing else is written for this text.
          +                self.o("<" + data + ">")
          +                self.empty_link = False
          +                return
          +            else:
          +                # Ordinary link: open the bracketed link text.
          +                self.o("[")
          +                self.maybe_automatic_link = None
          +                self.empty_link = False
          +
          +        # Escape Markdown only for plain text: not inside code or pre, and
          +        # not for text that came from a character/entity reference.
          +        if not self.code and not self.pre and not entity_char:
          +            data = escape_md_section(data, snob=self.escape_snob)
          +        self.preceding_data = data
          +        self.o(data, puredata=True)
          +
          +    def charref(self, name: str) -> str:
          +        """
          +        Convert the body of a numeric character reference to text.
          +
          +        :param name: decimal digits, or ``x``/``X`` followed by hex digits
          +        :type name: str
          +
          +        :returns: the ``unifiable_n`` replacement when ``unicode_snob`` is
          +            off and the code point is listed there, otherwise the character
          +            itself, or an empty string when the number is not a valid
          +            code point
          +        :rtype: str
          +        """
          +        if name[0] in ["x", "X"]:
          +            c = int(name[1:], 16)
          +        else:
          +            c = int(name)
          +
          +        if not self.unicode_snob and c in unifiable_n:
          +            return unifiable_n[c]
          +        else:
          +            try:
          +                return chr(c)
          +            except (ValueError, OverflowError):  # invalid unicode
          +                # chr() raises ValueError just above 0x10FFFF but
          +                # OverflowError once the number no longer fits a C int,
          +                # e.g. for "&#99999999999;".
          +                return ""
          +
          +    def entityref(self, c: str) -> str:
          +        """
          +        Resolve a named entity reference to its output text.
          +
          +        :param c: entity name without the surrounding ``&`` and ``;``
          +        :type c: str
          +
          +        :returns: the ``config.UNIFIABLE`` replacement when ``unicode_snob``
          +            is off and the name is listed there; the original ``&name;``
          +            text for names missing from the HTML5 table; otherwise the
          +            HTML5 character (``nbsp`` always uses its UNIFIABLE entry)
          +        :rtype: str
          +        """
          +        if not self.unicode_snob and c in config.UNIFIABLE:
          +            return config.UNIFIABLE[c]
          +
          +        key = c + ";"
          +        if key not in html.entities.html5:
          +            # Unknown entity: leave it in the output untouched.
          +            return "&" + key
          +
          +        if c == "nbsp":
          +            return config.UNIFIABLE[c]
          +        return html.entities.html5[key]
          +
          +    def google_nest_count(self, style: Dict[str, str]) -> int:
          +        """
          +        Work out how deeply a google doc list is nested.
          +
          +        The ``margin-left`` value, minus its last two characters, is
          +        integer-divided by ``self.google_list_indent``.
          +
          +        :type style: dict
          +
          +        :returns: the nesting level, 0 when ``margin-left`` is absent
          +        :rtype: int
          +        """
          +        if "margin-left" not in style:
          +            return 0
          +
          +        margin = int(style["margin-left"][:-2])
          +        return margin // self.google_list_indent
          +
          +    def optwrap(self, text: str) -> str:
          +        """
          +        Re-wrap every paragraph of ``text`` to ``self.body_width`` columns.
          +
          +        The text is returned unchanged when ``self.body_width`` is falsy.
          +
          +        :type text: str
          +
          +        :rtype: str
          +        """
          +        if not self.body_width:
          +            return text
          +
          +        # No better solution known for now: this avoids leaving entire
          +        # paragraphs unwrapped just because they contain a link.
          +        if not self.wrap_links:
          +            self.inline_links = False
          +
          +        pieces = []  # type: List[str]
          +        trailing_newlines = 0
          +        for line in text.split("\n"):
          +            if not line:
          +                # Blank input line: never emit more than two newlines in a row.
          +                if trailing_newlines < 2:
          +                    pieces.append("\n")
          +                    trailing_newlines += 1
          +                continue
          +
          +            if skipwrap(
          +                line, self.wrap_links, self.wrap_list_items, self.wrap_tables
          +            ):
          +                # Warning for the tempted!!!
          +                # The obvious replacement of this test with line.isspace()
          +                # DOES NOT work! Explanations are welcome.
          +                if not config.RE_SPACE.match(line):
          +                    pieces.append(line + "\n")
          +                    trailing_newlines = 1
          +                continue
          +
          +            if line.startswith("  " + self.ul_item_mark):
          +                # list item continuation: double indent on the new lines
          +                hanging = "    "
          +            elif line.startswith("> "):
          +                # blockquote continuation: repeat the greater than symbol
          +                hanging = "> "
          +            else:
          +                hanging = ""
          +
          +            wrapped_lines = wrap(
          +                line,
          +                self.body_width,
          +                break_long_words=False,
          +                subsequent_indent=hanging,
          +            )
          +            pieces.append("\n".join(wrapped_lines))
          +
          +            if line.endswith("  "):
          +                pieces.append("  \n")
          +                trailing_newlines = 1
          +            elif hanging:
          +                pieces.append("\n")
          +                trailing_newlines = 1
          +            else:
          +                pieces.append("\n\n")
          +                trailing_newlines = 2
          +
          +        return "".join(pieces)
          +
          +
          +def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = None) -> str:
          +    """
          +    Convert ``html`` with a freshly created ``HTML2Text`` instance.
          +
          +    :param html: the HTML source to convert
          +    :param baseurl: passed to ``HTML2Text`` as ``baseurl``
          +    :param bodywidth: passed to ``HTML2Text`` as ``bodywidth``;
          +        ``config.BODY_WIDTH`` is used when None
          +
          +    :returns: the result of ``HTML2Text.handle``
          +    """
          +    width = config.BODY_WIDTH if bodywidth is None else bodywidth
          +    return HTML2Text(baseurl=baseurl, bodywidth=width).handle(html)
          diff --git a/html2text/__main__.py b/html2text/__main__.py
          new file mode 100644
          index 00000000..4e28416e
          --- /dev/null
          +++ b/html2text/__main__.py
          @@ -0,0 +1,3 @@
          +# Module executed for ``python -m html2text``: run the CLI's main().
          +from .cli import main
          +
          +main()
          diff --git a/html2text/cli.py b/html2text/cli.py
          new file mode 100644
          index 00000000..d0c62c97
          --- /dev/null
          +++ b/html2text/cli.py
          @@ -0,0 +1,322 @@
          +import argparse
          +import sys
          +
          +from . import HTML2Text, __version__, config
          +
          +
          +def main() -> None:
          +    """
          +    Command-line entry point: convert an HTML document to text.
          +
          +    Parses the command line, reads the HTML as bytes from the ``filename``
          +    argument (or from standard input when it is omitted or ``-``), decodes
          +    it with the ``encoding`` argument (``utf-8`` by default), copies the
          +    parsed options onto an ``HTML2Text`` instance and writes the result of
          +    its ``handle`` method to standard output.
          +
          +    :raises UnicodeDecodeError: re-raised, after printing a hint about
          +        ``--decode-errors=ignore``, when the input cannot be decoded.
          +    """
          +    baseurl = ""
          +
          +    # Terminal escape sequences; only WARNING, OKGREEN and ENDC are used
          +    # below, to colour the decode-error hint.
          +    class bcolors:
          +        HEADER = "\033[95m"
          +        OKBLUE = "\033[94m"
          +        OKGREEN = "\033[92m"
          +        WARNING = "\033[93m"
          +        FAIL = "\033[91m"
          +        ENDC = "\033[0m"
          +        BOLD = "\033[1m"
          +        UNDERLINE = "\033[4m"
          +
          +    # Option defaults are taken from the config module where one exists.
          +    # Note that the "--no-*" and "--reference-links" flags use store_false:
          +    # passing them turns the corresponding setting off.
          +    p = argparse.ArgumentParser()
          +    p.add_argument(
          +        "--default-image-alt",
          +        dest="default_image_alt",
          +        default=config.DEFAULT_IMAGE_ALT,
          +        help="The default alt string for images with missing ones",
          +    )
          +    p.add_argument(
          +        "--pad-tables",
          +        dest="pad_tables",
          +        action="store_true",
          +        default=config.PAD_TABLES,
          +        help="pad the cells to equal column width in tables",
          +    )
          +    p.add_argument(
          +        "--no-wrap-links",
          +        dest="wrap_links",
          +        action="store_false",
          +        default=config.WRAP_LINKS,
          +        help="don't wrap links during conversion",
          +    )
          +    p.add_argument(
          +        "--wrap-list-items",
          +        dest="wrap_list_items",
          +        action="store_true",
          +        default=config.WRAP_LIST_ITEMS,
          +        help="wrap list items during conversion",
          +    )
          +    p.add_argument(
          +        "--wrap-tables",
          +        dest="wrap_tables",
          +        action="store_true",
          +        default=config.WRAP_TABLES,
          +        help="wrap tables",
          +    )
          +    p.add_argument(
          +        "--ignore-emphasis",
          +        dest="ignore_emphasis",
          +        action="store_true",
          +        default=config.IGNORE_EMPHASIS,
          +        help="don't include any formatting for emphasis",
          +    )
          +    p.add_argument(
          +        "--reference-links",
          +        dest="inline_links",
          +        action="store_false",
          +        default=config.INLINE_LINKS,
          +        help="use reference style links instead of inline links",
          +    )
          +    p.add_argument(
          +        "--ignore-links",
          +        dest="ignore_links",
          +        action="store_true",
          +        default=config.IGNORE_ANCHORS,
          +        help="don't include any formatting for links",
          +    )
          +    p.add_argument(
          +        "--ignore-mailto-links",
          +        action="store_true",
          +        dest="ignore_mailto_links",
          +        default=config.IGNORE_MAILTO_LINKS,
          +        help="don't include mailto: links",
          +    )
          +    p.add_argument(
          +        "--protect-links",
          +        dest="protect_links",
          +        action="store_true",
          +        default=config.PROTECT_LINKS,
          +        help="protect links from line breaks surrounding them with angle brackets",
          +    )
          +    p.add_argument(
          +        "--ignore-images",
          +        dest="ignore_images",
          +        action="store_true",
          +        default=config.IGNORE_IMAGES,
          +        help="don't include any formatting for images",
          +    )
          +    p.add_argument(
          +        "--images-as-html",
          +        dest="images_as_html",
          +        action="store_true",
          +        default=config.IMAGES_AS_HTML,
          +        help=(
          +            "Always write image tags as raw html; preserves `height`, `width` and "
          +            "`alt` if possible."
          +        ),
          +    )
          +    p.add_argument(
          +        "--images-to-alt",
          +        dest="images_to_alt",
          +        action="store_true",
          +        default=config.IMAGES_TO_ALT,
          +        help="Discard image data, only keep alt text",
          +    )
          +    p.add_argument(
          +        "--images-with-size",
          +        dest="images_with_size",
          +        action="store_true",
          +        default=config.IMAGES_WITH_SIZE,
          +        help=(
          +            "Write image tags with height and width attrs as raw html to retain "
          +            "dimensions"
          +        ),
          +    )
          +    p.add_argument(
          +        "-g",
          +        "--google-doc",
          +        action="store_true",
          +        dest="google_doc",
          +        default=False,
          +        help="convert an html-exported Google Document",
          +    )
          +    p.add_argument(
          +        "-d",
          +        "--dash-unordered-list",
          +        action="store_true",
          +        dest="ul_style_dash",
          +        default=False,
          +        help="use a dash rather than a star for unordered list items",
          +    )
          +    p.add_argument(
          +        "-e",
          +        "--asterisk-emphasis",
          +        action="store_true",
          +        dest="em_style_asterisk",
          +        default=False,
          +        help="use an asterisk rather than an underscore for emphasized text",
          +    )
          +    p.add_argument(
          +        "-b",
          +        "--body-width",
          +        dest="body_width",
          +        type=int,
          +        default=config.BODY_WIDTH,
          +        help="number of characters per output line, 0 for no wrap",
          +    )
          +    p.add_argument(
          +        "-i",
          +        "--google-list-indent",
          +        dest="list_indent",
          +        type=int,
          +        default=config.GOOGLE_LIST_INDENT,
          +        help="number of pixels Google indents nested lists",
          +    )
          +    p.add_argument(
          +        "-s",
          +        "--hide-strikethrough",
          +        action="store_true",
          +        dest="hide_strikethrough",
          +        default=False,
          +        help="hide strike-through text. only relevant when -g is " "specified as well",
          +    )
          +    p.add_argument(
          +        "--escape-all",
          +        action="store_true",
          +        dest="escape_snob",
          +        default=False,
          +        help=(
          +            "Escape all special characters.  Output is less readable, but avoids "
          +            "corner case formatting issues."
          +        ),
          +    )
          +    p.add_argument(
          +        "--bypass-tables",
          +        action="store_true",
          +        dest="bypass_tables",
          +        default=config.BYPASS_TABLES,
          +        help="Format tables in HTML rather than Markdown syntax.",
          +    )
          +    p.add_argument(
          +        "--ignore-tables",
          +        action="store_true",
          +        dest="ignore_tables",
          +        default=config.IGNORE_TABLES,
          +        help="Ignore table-related tags (table, th, td, tr) " "while keeping rows.",
          +    )
          +    p.add_argument(
          +        "--single-line-break",
          +        action="store_true",
          +        dest="single_line_break",
          +        default=config.SINGLE_LINE_BREAK,
          +        help=(
          +            "Use a single line break after a block element rather than two line "
          +            "breaks. NOTE: Requires --body-width=0"
          +        ),
          +    )
          +    p.add_argument(
          +        "--unicode-snob",
          +        action="store_true",
          +        dest="unicode_snob",
          +        default=config.UNICODE_SNOB,
          +        help="Use unicode throughout document",
          +    )
          +    p.add_argument(
          +        "--no-automatic-links",
          +        action="store_false",
          +        dest="use_automatic_links",
          +        default=config.USE_AUTOMATIC_LINKS,
          +        help="Do not use automatic links wherever applicable",
          +    )
          +    p.add_argument(
          +        "--no-skip-internal-links",
          +        action="store_false",
          +        dest="skip_internal_links",
          +        default=config.SKIP_INTERNAL_LINKS,
          +        help="Do not skip internal links",
          +    )
          +    p.add_argument(
          +        "--links-after-para",
          +        action="store_true",
          +        dest="links_each_paragraph",
          +        default=config.LINKS_EACH_PARAGRAPH,
          +        help="Put links after each paragraph instead of document",
          +    )
          +    p.add_argument(
          +        "--mark-code",
          +        action="store_true",
          +        dest="mark_code",
          +        default=config.MARK_CODE,
          +        help="Mark program code blocks with [code]...[/code]",
          +    )
          +    p.add_argument(
          +        "--decode-errors",
          +        dest="decode_errors",
          +        default=config.DECODE_ERRORS,
          +        help=(
          +            "What to do in case of decode errors.'ignore', 'strict' and 'replace' are "
          +            "acceptable values"
          +        ),
          +    )
          +    p.add_argument(
          +        "--open-quote",
          +        dest="open_quote",
          +        default=config.OPEN_QUOTE,
          +        help="The character used to open quotes",
          +    )
          +    p.add_argument(
          +        "--close-quote",
          +        dest="close_quote",
          +        default=config.CLOSE_QUOTE,
          +        help="The character used to close quotes",
          +    )
          +    p.add_argument(
          +        "--version", action="version", version=".".join(map(str, __version__))
          +    )
          +    # Both positionals are optional: input file (or "-") and its encoding.
          +    p.add_argument("filename", nargs="?")
          +    p.add_argument("encoding", nargs="?", default="utf-8")
          +    args = p.parse_args()
          +
          +    # Read raw bytes in both cases; decoding happens below so that the
          +    # requested encoding and error policy apply to files and stdin alike.
          +    if args.filename and args.filename != "-":
          +        with open(args.filename, "rb") as fp:
          +            data = fp.read()
          +    else:
          +        data = sys.stdin.buffer.read()
          +
          +    try:
          +        html = data.decode(args.encoding, args.decode_errors)
          +    except UnicodeDecodeError as err:
          +        # Print a coloured hint, then let the original error propagate.
          +        warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
          +        warning += " Use the " + bcolors.OKGREEN
          +        warning += "--decode-errors=ignore" + bcolors.ENDC + " flag."
          +        print(warning)
          +        raise err
          +
          +    h = HTML2Text(baseurl=baseurl)
          +    # handle options
          +    if args.ul_style_dash:
          +        h.ul_item_mark = "-"
          +    if args.em_style_asterisk:
          +        # -e sets the emphasis mark to "*" and the strong mark to "__".
          +        h.emphasis_mark = "*"
          +        h.strong_mark = "__"
          +
          +    # Copy every remaining parsed option onto the converter instance.
          +    h.body_width = args.body_width
          +    h.google_list_indent = args.list_indent
          +    h.ignore_emphasis = args.ignore_emphasis
          +    h.ignore_links = args.ignore_links
          +    h.ignore_mailto_links = args.ignore_mailto_links
          +    h.protect_links = args.protect_links
          +    h.ignore_images = args.ignore_images
          +    h.images_as_html = args.images_as_html
          +    h.images_to_alt = args.images_to_alt
          +    h.images_with_size = args.images_with_size
          +    h.google_doc = args.google_doc
          +    h.hide_strikethrough = args.hide_strikethrough
          +    h.escape_snob = args.escape_snob
          +    h.bypass_tables = args.bypass_tables
          +    h.ignore_tables = args.ignore_tables
          +    h.single_line_break = args.single_line_break
          +    h.inline_links = args.inline_links
          +    h.unicode_snob = args.unicode_snob
          +    h.use_automatic_links = args.use_automatic_links
          +    h.skip_internal_links = args.skip_internal_links
          +    h.links_each_paragraph = args.links_each_paragraph
          +    h.mark_code = args.mark_code
          +    h.wrap_links = args.wrap_links
          +    h.wrap_list_items = args.wrap_list_items
          +    h.wrap_tables = args.wrap_tables
          +    h.pad_tables = args.pad_tables
          +    h.default_image_alt = args.default_image_alt
          +    h.open_quote = args.open_quote
          +    h.close_quote = args.close_quote
          +
          +    sys.stdout.write(h.handle(html))
          diff --git a/html2text/config.py b/html2text/config.py
          new file mode 100644
          index 00000000..88d3f912
          --- /dev/null
          +++ b/html2text/config.py
          @@ -0,0 +1,165 @@
          +import re
          +
          +# Use Unicode characters instead of their ascii pseudo-replacements
          +UNICODE_SNOB = False
          +
          +# Marker to use for marking tables for padding post processing
          +TABLE_MARKER_FOR_PAD = "special_marker_for_table_padding"
          +# Escape all special characters.  Output is less readable, but avoids
          +# corner case formatting issues.
          +ESCAPE_SNOB = False
          +
          +# Put the links after each paragraph instead of at the end.
          +LINKS_EACH_PARAGRAPH = False
          +
          +# Wrap long lines at position. 0 for no wrapping.
          +BODY_WIDTH = 78
          +
          +# Don't show internal links (href="#local-anchor") -- corresponding link
          +# targets won't be visible in the plain text file anyway.
          +SKIP_INTERNAL_LINKS = True
          +
          +# Use inline, rather than reference, formatting for images and links
          +INLINE_LINKS = True
          +
          +# Protect links from line breaks surrounding them with angle brackets (in
          +# addition to their square brackets)
          +PROTECT_LINKS = False
          +# Wrap links during conversion (the CLI flag --no-wrap-links turns this off)
          +WRAP_LINKS = True
          +
          +# Wrap list items.
          +WRAP_LIST_ITEMS = False
          +
          +# Wrap tables
          +WRAP_TABLES = False
          +
          +# Number of pixels Google indents nested lists
          +GOOGLE_LIST_INDENT = 36
          +
          +# Values Google and others may use to indicate bold text
          +BOLD_TEXT_STYLE_VALUES = ("bold", "700", "800", "900")
          +
          +# Don't include any formatting for links (CLI: --ignore-links)
          +IGNORE_ANCHORS = False
          +# Don't include mailto: links
          +IGNORE_MAILTO_LINKS = False
          +# Don't include any formatting for images
          +IGNORE_IMAGES = False
          +# Always write image tags as raw html
          +IMAGES_AS_HTML = False
          +# Discard image data, only keep alt text
          +IMAGES_TO_ALT = False
          +# Write image tags with height and width attrs as raw html to retain
          +# dimensions
          +IMAGES_WITH_SIZE = False
          +# Don't include any formatting for emphasis
          +IGNORE_EMPHASIS = False
          +# Mark program code blocks with [code]...[/code]
          +MARK_CODE = False
          +# Error handler passed to bytes.decode() by the CLI: 'ignore', 'strict'
          +# or 'replace'
          +DECODE_ERRORS = "strict"
          +# The default alt string for images with missing ones
          +DEFAULT_IMAGE_ALT = ""
          +# Pad the cells to equal column width in tables
          +PAD_TABLES = False
          +
          +# Convert links with same href and text to <href> format
          +# if they are absolute links
          +USE_AUTOMATIC_LINKS = True
          +
          +# For checking space-only lines: matches a run of one or more whitespace
          +# characters. (The "+" must stay unescaped; r"\s\+" would instead require
          +# a literal plus sign after a single whitespace character.)
          +RE_SPACE = re.compile(r"\s+")
          +
          +# Ordered list item prefix: one or more digits, a dot, one whitespace char
          +RE_ORDERED_LIST_MATCHER = re.compile(r"\d+\.\s")
          +# Unordered list item prefix: "-", "*" or "+", then one whitespace char
          +RE_UNORDERED_LIST_MATCHER = re.compile(r"[-\*\+]\s")
          +# A backslash, square bracket or parenthesis, captured as group 1
          +RE_MD_CHARS_MATCHER = re.compile(r"([\\\[\]\(\)])")
          +# The wider set of markdown characters, captured as group 1
          +RE_MD_CHARS_MATCHER_ALL = re.compile(r"([`\*_{}\[\]\(\)#!])")
          +
          +# to find links in the text: either "[text](target)" (an optional space
          +# is allowed before the parenthesis) or a "[text]:" reference definition
          +RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)")
          +
          +# to find table separators
          +RE_TABLE = re.compile(r" \| ")
          +
          +# Line-leading number and dot followed by whitespace; group 1 is the
          +# (optionally indented) number, group 2 the dot.
          +RE_MD_DOT_MATCHER = re.compile(
          +    r"""
          +    ^             # start of line
          +    (\s*\d+)      # optional whitespace and a number
          +    (\.)          # dot
          +    (?=\s)        # lookahead assert whitespace
          +    """,
          +    re.MULTILINE | re.VERBOSE,
          +)
          +# Line-leading "+" followed by whitespace; group 1 is the indentation,
          +# group 2 the plus sign.
          +RE_MD_PLUS_MATCHER = re.compile(
          +    r"""
          +    ^
          +    (\s*)
          +    (\+)
          +    (?=\s)
          +    """,
          +    flags=re.MULTILINE | re.VERBOSE,
          +)
          +# Line-leading "-" followed by whitespace or another dash; group 1 is the
          +# indentation, group 2 the dash.
          +RE_MD_DASH_MATCHER = re.compile(
          +    r"""
          +    ^
          +    (\s*)
          +    (-)
          +    (?=\s|\-)     # followed by whitespace (bullet list, or spaced out hr)
          +                  # or another dash (header or hr)
          +    """,
          +    flags=re.MULTILINE | re.VERBOSE,
          +)
          +# Characters that make a preceding backslash significant; the string is
          +# passed through re.escape() and used as a character class below.
          +RE_SLASH_CHARS = r"\`*_{}[]()#+-.!"
          +# A backslash (group 1) that is immediately followed by one of
          +# RE_SLASH_CHARS.
          +RE_MD_BACKSLASH_MATCHER = re.compile(
          +    r"""
          +    (\\)          # match one slash
          +    (?=[%s])      # followed by a char that requires escaping
          +    """
          +    % re.escape(RE_SLASH_CHARS),
          +    flags=re.VERBOSE,
          +)
          +
          +# Maps HTML entity names to plain-ASCII stand-ins ("lrm" and "rlm" map to
          +# the empty string). Presumably applied when UNICODE_SNOB is off --
          +# confirm against the caller.
          +UNIFIABLE = {
          +    "rsquo": "'",
          +    "lsquo": "'",
          +    "rdquo": '"',
          +    "ldquo": '"',
          +    "copy": "(C)",
          +    "mdash": "--",
          +    "nbsp": " ",
          +    "rarr": "->",
          +    "larr": "<-",
          +    "middot": "*",
          +    "ndash": "-",
          +    "oelig": "oe",
          +    "aelig": "ae",
          +    "agrave": "a",
          +    "aacute": "a",
          +    "acirc": "a",
          +    "atilde": "a",
          +    "auml": "a",
          +    "aring": "a",
          +    "egrave": "e",
          +    "eacute": "e",
          +    "ecirc": "e",
          +    "euml": "e",
          +    "igrave": "i",
          +    "iacute": "i",
          +    "icirc": "i",
          +    "iuml": "i",
          +    "ograve": "o",
          +    "oacute": "o",
          +    "ocirc": "o",
          +    "otilde": "o",
          +    "ouml": "o",
          +    "ugrave": "u",
          +    "uacute": "u",
          +    "ucirc": "u",
          +    "uuml": "u",
          +    "lrm": "",
          +    "rlm": "",
          +}
          +
          +# Format tables in HTML rather than Markdown syntax
          +BYPASS_TABLES = False
          +# Ignore table-related tags (table, th, td, tr) while keeping rows
          +IGNORE_TABLES = False
          +
          +
          +# Use a single line break after a block element rather than two line breaks.
          +# NOTE: Requires body width setting to be 0.
          +SINGLE_LINE_BREAK = False
          +
          +
          +# Use double quotation marks when converting the <q> tag.
          +OPEN_QUOTE = '"'
          +CLOSE_QUOTE = '"'
          diff --git a/html2text/elements.py b/html2text/elements.py
          new file mode 100644
          index 00000000..2533ec08
          --- /dev/null
          +++ b/html2text/elements.py
          @@ -0,0 +1,18 @@
          +from typing import Dict, Optional
          +
          +
          +class AnchorElement:
          +    """
          +    Slotted record holding an anchor's attribute dict together with two
          +    integer counters, ``count`` and ``outcount``.
          +    """
          +
          +    __slots__ = ["attrs", "count", "outcount"]
          +
          +    def __init__(self, attrs: Dict[str, Optional[str]], count: int, outcount: int):
          +        # Store the three values unchanged; no copying or validation.
          +        self.attrs, self.count, self.outcount = attrs, count, outcount
          +
          +
          +class ListElement:
          +    """
          +    Slotted record pairing a list's ``name`` (a string) with ``num`` (an
          +    integer).
          +    """
          +
          +    __slots__ = ["name", "num"]
          +
          +    def __init__(self, name: str, num: int):
          +        # Store both values unchanged.
          +        self.name, self.num = name, num
          diff --git a/html2text/py.typed b/html2text/py.typed
          new file mode 100644
          index 00000000..e69de29b
          diff --git a/html2text/typing.py b/html2text/typing.py
          new file mode 100644
          index 00000000..6e17fed2
          --- /dev/null
          +++ b/html2text/typing.py
          @@ -0,0 +1,3 @@
          +class OutCallback:
          +    """Typing helper: a callable taking one ``str`` and returning ``None``."""
          +
          +    def __call__(self, s: str) -> None:
          +        # Placeholder body: does nothing with ``s``.
          +        return None
          diff --git a/html2text/utils.py b/html2text/utils.py
          new file mode 100644
          index 00000000..366748b6
          --- /dev/null
          +++ b/html2text/utils.py
          @@ -0,0 +1,290 @@
          +import html.entities
          +from typing import Dict, List, Optional
          +
          +from . import config
          +
          +# The replacements of config.UNIFIABLE keyed by Unicode code point rather
          +# than by entity name; the "nbsp" entry is left out.
          +unifiable_n = dict(
          +    (html.entities.name2codepoint[name], replacement)
          +    for name, replacement in config.UNIFIABLE.items()
          +    if name != "nbsp"
          +)
          +
          +
          +def hn(tag: str) -> int:
          +    """
          +    Return the level of an HTML heading tag.
          +
          +    :param tag: tag name, e.g. ``"h2"``
          +
          +    :returns: 1-9 for ``h1`` .. ``h9``; 0 for any other value, including
          +        the empty string
          +    :rtype: int
          +    """
          +    # Test the length first so an empty tag name cannot raise IndexError.
          +    if len(tag) == 2 and tag[0] == "h":
          +        n = tag[1]
          +        # Single-character string comparison: true only for "1" .. "9".
          +        if "0" < n <= "9":
          +            return int(n)
          +    return 0
          +
          +
          +def dumb_property_dict(style: str) -> Dict[str, str]:
          +    """
          +    Parse a ``name: value; name: value`` declaration string.
          +
          +    Pieces without a colon are ignored, names and values are stripped and
          +    lower-cased, and a later duplicate name overrides an earlier one.
          +
          +    :returns: A hash of css attributes
          +    """
          +    properties = {}  # type: Dict[str, str]
          +    for declaration in style.split(";"):
          +        # Split on the first colon only, so values may contain colons.
          +        name, colon, value = declaration.partition(":")
          +        if not colon:
          +            continue
          +        properties[name.strip().lower()] = value.strip().lower()
          +    return properties
          +
          +
          +def dumb_css_parser(data: str) -> Dict[str, Dict[str, str]]:
          +    """
          +    Parse a stylesheet into selectors and their declarations.
          +
          +    :type data: str
          +
          +    :returns: A hash of css selectors, each of which contains a hash of
          +    css attributes. Empty when any rule contains more than one ``{``.
          +    :rtype: dict
          +    """
          +    # Strip every "@import ...;" statement. The appended ";" guarantees
          +    # that the search for the terminating semicolon always succeeds.
          +    data += ";"
          +    start = data.find("@import")
          +    while start != -1:
          +        end = data.find(";", start)
          +        data = data[:start] + data[end + 1 :]
          +        start = data.find("@import")
          +
          +    # Each "}"-terminated chunk that contains "{" is one rule.
          +    rules = [chunk.split("{") for chunk in data.split("}") if "{" in chunk.strip()]
          +    try:
          +        return {
          +            selector.strip(): dumb_property_dict(body) for selector, body in rules
          +        }
          +    except ValueError:
          +        # A rule did not split into exactly selector and body.
          +        return {}  # not that important
          +
          +
          +def element_style(
          +    attrs: Dict[str, Optional[str]],
          +    style_def: Dict[str, Dict[str, str]],
          +    parent_style: Dict[str, str],
          +) -> Dict[str, str]:
          +    """
          +    Compute an element's style from its parent's style, the stylesheet
          +    entries for its classes and its inline ``style`` attribute, applied in
          +    that order (later sources override earlier ones).
          +
          +    :type attrs: dict
          +    :type style_def: dict
          +    :type parent_style: dict
          +
          +    :returns: A hash of the 'final' style attributes of the element
          +    :rtype: dict
          +    """
          +    style = parent_style.copy()
          +
          +    # Attribute values may be None (the annotation allows it, e.g. for an
          +    # attribute written without a value); treat that like an empty value
          +    # instead of failing an assertion.
          +    css_classes = attrs.get("class")
          +    if css_classes:
          +        for css_class in css_classes.split():
          +            style.update(style_def.get("." + css_class, {}))
          +
          +    inline_style = attrs.get("style")
          +    if inline_style:
          +        style.update(dumb_property_dict(inline_style))
          +
          +    return style
          +
          +
          +def google_list_style(style: Dict[str, str]) -> str:
          +    """
          +    Decide from ``list-style-type`` whether a list is ordered or unordered.
          +
          +    :type style: dict
          +
          +    :returns: ``"ul"`` for disc, circle, square or none; ``"ol"`` for any
          +        other value or when the property is absent
          +    :rtype: str
          +    """
          +    unordered_types = ("disc", "circle", "square", "none")
          +    if style.get("list-style-type") in unordered_types:
          +        return "ul"
          +    return "ol"
          +
          +
          +def google_has_height(style: Dict[str, str]) -> bool:
          +    """
          +    Tell whether the element's style defines ``height`` explicitly.
          +
          +    :type style: dict
          +
          +    :returns: True when ``"height"`` is a key of ``style``, whatever its
          +        value
          +    :rtype: bool
          +    """
          +    has_height = "height" in style
          +    return has_height
          +
          +
          +def google_text_emphasis(style: Dict[str, str]) -> List[str]:
          +    """
          +    Collect the values of the emphasis-related properties that are set.
          +
          +    :type style: dict
          +
          +    :returns: A list of all emphasis modifiers of the element, in the order
          +        text-decoration, font-style, font-weight
          +    :rtype: list
          +    """
          +    emphasis_properties = ("text-decoration", "font-style", "font-weight")
          +    return [style[prop] for prop in emphasis_properties if prop in style]
          +
          +
          +def google_fixed_width_font(style: Dict[str, str]) -> bool:
          +    """
          +    Tell whether the element's ``font-family`` is one of the two
          +    recognised fixed width fonts.
          +
          +    :type style: dict
          +
          +    :returns: True only when ``font-family`` is exactly ``"courier new"``
          +        or ``"consolas"``
          +    :rtype: bool
          +    """
          +    fixed_width_fonts = ("courier new", "consolas")
          +    return style.get("font-family", "") in fixed_width_fonts
          +
          +
          +def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
          +    """
          +    Extract numbering from list element attributes
          +
          +    :type attrs: dict
          +
          +    :returns: the integer value of the ``start`` attribute minus one, or 0
          +        when the attribute is missing, has no value or is not an integer
          +    :rtype: int
          +    """
          +    start = attrs.get("start")
          +    # A None value (allowed by the annotation) is handled like a missing
          +    # attribute rather than failing an assertion.
          +    if start is None:
          +        return 0
          +    try:
          +        return int(start) - 1
          +    except ValueError:
          +        return 0
          +
          +
          +def skipwrap(
          +    para: str, wrap_links: bool, wrap_list_items: bool, wrap_tables: bool
          +) -> bool:
          +    # If it appears to contain a link
          +    # don't wrap
          +    if not wrap_links and config.RE_LINK.search(para):
          +        return True
          +    # If the text begins with four spaces or one tab, it's a code block;
          +    # don't wrap
          +    if para[0:4] == "    " or para[0] == "\t":
          +        return True
          +
          +    # If the text begins with only two "--", possibly preceded by
          +    # whitespace, that's an emdash; so wrap.
          +    stripped = para.lstrip()
          +    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
          +        return False
          +
          +    # I'm not sure what this is for; I thought it was to detect lists,
          +    # but there's a 
          -inside- case in one of the tests that + # also depends upon it. + if stripped[0:1] in ("-", "*") and not stripped[0:2] == "**": + return not wrap_list_items + + # If text contains a pipe character it is likely a table + if not wrap_tables and config.RE_TABLE.search(para): + return True + + # If the text begins with a single -, *, or +, followed by a space, + # or an integer, followed by a ., followed by a space (in either + # case optionally proceeded by whitespace), it's a list; don't wrap. + return bool( + config.RE_ORDERED_LIST_MATCHER.match(stripped) + or config.RE_UNORDERED_LIST_MATCHER.match(stripped) + ) + + +def escape_md(text: str) -> str: + """ + Escapes markdown-sensitive characters within other markdown + constructs. + """ + return config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text) + + +def escape_md_section(text: str, snob: bool = False) -> str: + """ + Escapes markdown-sensitive characters across whole document sections. + """ + text = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text) + + if snob: + text = config.RE_MD_CHARS_MATCHER_ALL.sub(r"\\\1", text) + + text = config.RE_MD_DOT_MATCHER.sub(r"\1\\\2", text) + text = config.RE_MD_PLUS_MATCHER.sub(r"\1\\\2", text) + text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text) + + return text + + +def reformat_table(lines: List[str], right_margin: int) -> List[str]: + """ + Given the lines of a table + padds the cells and returns the new lines + """ + # find the maximum width of the columns + max_width = [len(x.rstrip()) + right_margin for x in lines[0].split("|")] + max_cols = len(max_width) + for line in lines: + cols = [x.rstrip() for x in line.split("|")] + num_cols = len(cols) + + # don't drop any data if colspan attributes result in unequal lengths + if num_cols < max_cols: + cols += [""] * (max_cols - num_cols) + elif max_cols < num_cols: + max_width += [len(x) + right_margin for x in cols[-(num_cols - max_cols) :]] + max_cols = num_cols + + max_width = [ + max(len(x) + right_margin, old_len) 
for x, old_len in zip(cols, max_width) + ] + + # reformat + new_lines = [] + for line in lines: + cols = [x.rstrip() for x in line.split("|")] + if set(line.strip()) == set("-|"): + filler = "-" + new_cols = [ + x.rstrip() + (filler * (M - len(x.rstrip()))) + for x, M in zip(cols, max_width) + ] + new_lines.append("|-" + "|".join(new_cols) + "|") + else: + filler = " " + new_cols = [ + x.rstrip() + (filler * (M - len(x.rstrip()))) + for x, M in zip(cols, max_width) + ] + new_lines.append("| " + "|".join(new_cols) + "|") + return new_lines + + +def pad_tables_in_text(text: str, right_margin: int = 1) -> str: + """ + Provide padding for tables in the text + """ + lines = text.split("\n") + table_buffer = [] # type: List[str] + table_started = False + new_lines = [] + for line in lines: + # Toggle table started + if config.TABLE_MARKER_FOR_PAD in line: + table_started = not table_started + if not table_started: + table = reformat_table(table_buffer, right_margin) + new_lines.extend(table) + table_buffer = [] + new_lines.append("") + continue + # Process lines + if table_started: + table_buffer.append(line) + else: + new_lines.append(line) + return "\n".join(new_lines) diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 00000000..6ba62eb7 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,51 @@ +[metadata] +name = html2text +version = attr: html2text.__version__ +description = Turn HTML into equivalent Markdown-structured text. 
+long_description = file: README.md +long_description_content_type = text/markdown +url = https://github.com/Alir3z4/html2text/ +author = Aaron Swartz +author_email = me@aaronsw.com +maintainer = Alireza Savand +maintainer_email = alireza.savand@gmail.com +license = GNU GPL 3 +classifiers = + Development Status :: 5 - Production/Stable + Intended Audience :: Developers + License :: OSI Approved :: GNU General Public License (GPL) + Operating System :: OS Independent + Programming Language :: Python + Programming Language :: Python :: 3 + Programming Language :: Python :: 3.5 + Programming Language :: Python :: 3.6 + Programming Language :: Python :: 3.7 + Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3.9 + Programming Language :: Python :: 3 :: Only + Programming Language :: Python :: Implementation :: CPython + Programming Language :: Python :: Implementation :: PyPy +platform = OS Independent + +[options] +zip_safe = False +packages = html2text +python_requires = >=3.5 + +[options.entry_points] +console_scripts = + html2text = html2text.cli:main + +[options.package_data] +html2text = py.typed + +[flake8] +max_line_length = 88 +extend-ignore = E203 + +[isort] +combine_as_imports = True +profile = black + +[mypy] +python_version = 3.5 diff --git a/setup.py b/setup.py index dd3d9bc2..60684932 100644 --- a/setup.py +++ b/setup.py @@ -1,36 +1,3 @@ -import sys -from setuptools import setup, find_packages +from setuptools import setup -setup( - name = "html2text", - version = "3.200.3", - description = "Turn HTML into equivalent Markdown-structured text.", - author = "Aaron Swartz", - author_email = "me@aaronsw.com", - url='http://www.aaronsw.com/2002/html2text/', - classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: GNU General Public License (GPL)', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 
2.3', - 'Programming Language :: Python :: 2.4', - 'Programming Language :: Python :: 2.5', - 'Programming Language :: Python :: 2.6', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.0', - 'Programming Language :: Python :: 3.1', - 'Programming Language :: Python :: 3.2' - ], - entry_points=""" - [console_scripts] - html2text=html2text:main - """, - license='GNU GPL 3', - packages=find_packages(), - py_modules=['html2text'], - include_package_data=True, - zip_safe=False, -) +setup() diff --git a/test/GoogleDocMassDownload.html b/test/GoogleDocMassDownload.html index f31e90e7..784cf142 100644 --- a/test/GoogleDocMassDownload.html +++ b/test/GoogleDocMassDownload.html @@ -6,7 +6,7 @@ @import url(https://themes.googleusercontent.com/fonts/css?kit=lhDjYqiy3mZ0x6ROQEUoUw); -

          +

          diff --git a/test/GoogleDocSaved.html b/test/GoogleDocSaved.html index a0cd19c9..10d80cfb 100644 --- a/test/GoogleDocSaved.html +++ b/test/GoogleDocSaved.html @@ -3,7 +3,7 @@ Sandbox + @import url('https://themes.googleusercontent.com/fonts/css?kit=lhDjYqiy3mZ0x6ROQEUoUw');ol{margin:0;padding:0}p{margin:0}.c12{list-style-type:disc;margin:0;padding:0;text-decoration:none;}.c8{width:468pt;background-color:#ffffff;padding:72pt 72pt 72pt 72pt}.c2{padding-left:0pt;direction:ltr;margin-left:36pt}.c11{list-style-type:lower-latin;margin:0;padding:0}.c4{list-style-type:circle;margin:0;padding:0}.c1{padding-left:0pt;direction:ltr;margin-left:72pt}.c7{;margin:0;padding:0}.c3{font-style:italic;font-family:Courier New}.c0{height:11pt;direction:ltr}.c5{font-weight:bold}.c9{font-family:Consolas}.c13{font-family:Courier New}.c6{direction:ltr}.c10{font-style:italic}body{color:#000000;font-size:11pt;font-family:Arial}h1{padding-top:24pt;color:#000000;font-size:24pt;font-family:Arial;font-weight:bold;padding-bottom:6pt}h2{padding-top:18pt;color:#000000;font-size:18pt;font-family:Arial;font-weight:bold;padding-bottom:4pt}h3{padding-top:14pt;color:#000000;font-size:14pt;font-family:Arial;font-weight:bold;padding-bottom:4pt}h4{padding-top:12pt;color:#000000;font-size:12pt;font-family:Arial;font-weight:bold;padding-bottom:2pt}h5{padding-top:11pt;color:#000000;font-size:11pt;font-family:Arial;font-weight:bold;padding-bottom:2pt}h6{padding-top:10pt;color:#000000;font-size:10pt;font-family:Arial;font-weight:bold;padding-bottom:2pt}

          diff --git a/test/GoogleDocSaved_two.html b/test/GoogleDocSaved_two.html new file mode 100644 index 00000000..4b30f492 --- /dev/null +++ b/test/GoogleDocSaved_two.html @@ -0,0 +1,147 @@ + + + + Sandbox + + + +

          + + + + test doc +

          +

          + + first issue +

          +

          + + +

          +
            +
          1. + + bit +
          2. +
          3. + + bold italic +
          4. +
          +
            +
          1. + + orange +
          2. +
          3. + + apple +
          4. +
          +
            +
          1. + + final +
          2. +
          +

          + + +

          +

          + + text to separate lists +

          +

          + + +

          +
            +
          1. + + now with numbers +
          2. +
          3. + + the prisoner +
          4. +
          +
            +
          1. + + not an + + italic number +
          2. +
          3. + + a + + bold human + +  being +
          4. +
          +
            +
          1. + + end +
          2. +
          +

          + + +

          +

          + + bold +

          +

          + + italic +

          +

          + + +

          +

          + + def func(x): +

          +

          + +   if x < 1: +

          +

          + +     return 'a' +

          +

          + +   return 'b' +

          +

          + + +

          +

          + + Some + + fixed width text + +  here +

          +

          + + italic fixed width text +

          +

          + + +

          + + + diff --git a/test/GoogleDocSaved_two.md b/test/GoogleDocSaved_two.md new file mode 100644 index 00000000..e69de29b diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/test/abbr_tag.html b/test/abbr_tag.html new file mode 100644 index 00000000..b523d6bc --- /dev/null +++ b/test/abbr_tag.html @@ -0,0 +1,2 @@ +TLA +xyz diff --git a/test/abbr_tag.md b/test/abbr_tag.md new file mode 100644 index 00000000..bffbcf21 --- /dev/null +++ b/test/abbr_tag.md @@ -0,0 +1,4 @@ +TLA xyz + + *[TLA]: Three Letter Acronym + diff --git a/test/anchors.html b/test/anchors.html new file mode 100644 index 00000000..2a00d637 --- /dev/null +++ b/test/anchors.html @@ -0,0 +1,7 @@ +

          Processing hyperlinks

          + +

          Additional hyperlink tests!

          + +Bold Link +filename.py +The source code is called magic.py diff --git a/test/anchors.md b/test/anchors.md new file mode 100644 index 00000000..bd9fda5c --- /dev/null +++ b/test/anchors.md @@ -0,0 +1,8 @@ +# Processing hyperlinks + +Additional hyperlink tests! + +[**Bold Link**](http://some.link) +[`filename.py`](http://some.link/filename.py) [The source code is called +`magic.py`](http://some.link/magicsources.py) + diff --git a/test/apos_element.html b/test/apos_element.html new file mode 100644 index 00000000..bce374e2 --- /dev/null +++ b/test/apos_element.html @@ -0,0 +1,5 @@ + + + ' + + diff --git a/test/apos_element.md b/test/apos_element.md new file mode 100644 index 00000000..12d94f4f --- /dev/null +++ b/test/apos_element.md @@ -0,0 +1,2 @@ +' + diff --git a/test/blockquote_example.html b/test/blockquote_example.html new file mode 100644 index 00000000..ccb3e449 --- /dev/null +++ b/test/blockquote_example.html @@ -0,0 +1,3 @@ +
          +"The time has come", the Walrus said, "To talk of many things: Of shoes - and ships - and sealing wax - Of cabbages - and kings- And why the sea is boiling hot - And whether pigs have wings." +
          diff --git a/test/blockquote_example.md b/test/blockquote_example.md new file mode 100644 index 00000000..ae264797 --- /dev/null +++ b/test/blockquote_example.md @@ -0,0 +1,4 @@ +> "The time has come", the Walrus said, "To talk of many things: Of shoes - +> and ships - and sealing wax - Of cabbages - and kings- And why the sea is +> boiling hot - And whether pigs have wings." + diff --git a/test/bodywidth_newline.html b/test/bodywidth_newline.html new file mode 100644 index 00000000..cd5f614b --- /dev/null +++ b/test/bodywidth_newline.html @@ -0,0 +1,6 @@ +

          Another theory is that magician and occultist Aliester Crowley created the beast while + attempting to summon evil spirits at his house on the edge of the + lake in the early 1900′s. I met a local woman who + prefers this explanation.

          diff --git a/test/bodywidth_newline.md b/test/bodywidth_newline.md new file mode 100644 index 00000000..e495c3ee --- /dev/null +++ b/test/bodywidth_newline.md @@ -0,0 +1 @@ +Another theory is that magician and occultist [Aliester Crowley](http://en.wikipedia.org/wiki/Aleister_Crowley) created the beast while attempting to summon evil spirits at his house on the edge of the lake in the early 1900′s. **I met a local woman who prefers this explanation.** diff --git a/test/bold_inside_link.html b/test/bold_inside_link.html new file mode 100644 index 00000000..278603e7 --- /dev/null +++ b/test/bold_inside_link.html @@ -0,0 +1,2 @@ +Text +sample diff --git a/test/bold_inside_link.md b/test/bold_inside_link.md new file mode 100644 index 00000000..d2738ccf --- /dev/null +++ b/test/bold_inside_link.md @@ -0,0 +1,2 @@ +[**Text**](link.htm) [**sample**](/nothing/) + diff --git a/test/bold_long_line.html b/test/bold_long_line.html new file mode 100644 index 00000000..2aba478d --- /dev/null +++ b/test/bold_long_line.html @@ -0,0 +1,3 @@ +

          +text and a very long long long long long long long long long long long long long long long long long long long long line +

          diff --git a/test/bold_long_line.md b/test/bold_long_line.md new file mode 100644 index 00000000..fc1119ed --- /dev/null +++ b/test/bold_long_line.md @@ -0,0 +1,3 @@ +**text** and a very long long long long long long long long long long long +long long long long long long long long long line + diff --git a/test/break_preserved_in_blockquote.html b/test/break_preserved_in_blockquote.html new file mode 100644 index 00000000..2436ed0e --- /dev/null +++ b/test/break_preserved_in_blockquote.html @@ -0,0 +1 @@ +a
          b
          c
          diff --git a/test/break_preserved_in_blockquote.md b/test/break_preserved_in_blockquote.md new file mode 100644 index 00000000..f3b94c2a --- /dev/null +++ b/test/break_preserved_in_blockquote.md @@ -0,0 +1,5 @@ +a + +> b +> c + diff --git a/test/decript_tage.html b/test/decript_tage.html new file mode 100644 index 00000000..c8826d40 --- /dev/null +++ b/test/decript_tage.html @@ -0,0 +1,3 @@ +something +something +something diff --git a/test/decript_tage.md b/test/decript_tage.md new file mode 100644 index 00000000..eba0adac --- /dev/null +++ b/test/decript_tage.md @@ -0,0 +1,2 @@ +~~something~~ ~~something~~ ~~something~~ + diff --git a/test/default_image_alt.html b/test/default_image_alt.html new file mode 100644 index 00000000..b1f04def --- /dev/null +++ b/test/default_image_alt.html @@ -0,0 +1 @@ + diff --git a/test/default_image_alt.md b/test/default_image_alt.md new file mode 100644 index 00000000..5c4f3c80 --- /dev/null +++ b/test/default_image_alt.md @@ -0,0 +1,2 @@ +[![Image](images/google.png)](http://google.com) + diff --git a/test/doc_with_table.html b/test/doc_with_table.html new file mode 100644 index 00000000..96b91931 --- /dev/null +++ b/test/doc_with_table.html @@ -0,0 +1,36 @@ + + + + + + + + +

          This is a test document

          + +With some text, code, bolds and italics. + +

          This is second header

          + +

          Displaynone text

          + + + + + + + + + + + + + + + + + +
          Header 1Header 2Header 3
          Content 1Content 2200 Image!
          Content 1Content 2200 Image!
          + + + \ No newline at end of file diff --git a/test/doc_with_table.md b/test/doc_with_table.md new file mode 100644 index 00000000..5cb1986e --- /dev/null +++ b/test/doc_with_table.md @@ -0,0 +1,13 @@ +# This is a test document + +With some text, `code`, **bolds** and _italics_. + +## This is second header + +Displaynone text + +Header 1 | Header 2 | Header 3 +---|---|--- +Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! +Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! + diff --git a/test/doc_with_table_bypass.html b/test/doc_with_table_bypass.html new file mode 100644 index 00000000..fa64b370 --- /dev/null +++ b/test/doc_with_table_bypass.html @@ -0,0 +1,32 @@ + + + + + + + + +

          This is a test document

          + +With some text, code, bolds and italics. + +

          This is second header

          + +

          Displaynone text

          + + + + + + + + + + + + + +
          Header 1Header 2Header 3
          Content 1Content 2200 Image!
          + + + \ No newline at end of file diff --git a/test/doc_with_table_bypass.md b/test/doc_with_table_bypass.md new file mode 100644 index 00000000..d3255a79 --- /dev/null +++ b/test/doc_with_table_bypass.md @@ -0,0 +1,42 @@ +# This is a test document + +With some text, `code`, **bolds** and _italics_. + +## This is second header + +Displaynone text + + + + + + + + + +
          + +Header 1 + + + +Header 2 + + + +Header 3 + +
          + +Content 1 + + + +Content 2 + + + +![200](http://lorempixel.com/200/200) Image! + +
          + diff --git a/test/emphasis_preserved_whitespace.html b/test/emphasis_preserved_whitespace.html new file mode 100644 index 00000000..4af44b8f --- /dev/null +++ b/test/emphasis_preserved_whitespace.html @@ -0,0 +1,27 @@ +

          emphasis

          +

          emphasis: some text

          +

          repeat: again

          + +

          bold

          +

          bold: some text

          +

          repeat: again

          + +

          strike

          +

          strike: some text

          +

          strike: again

          + +

          separate emphasis some more text

          + + +

          emphasis.

          +

          emphasis?

          +

          emphasis!

          +

          (emphasis)

          +

          [bold}

          +

          (strike]

          + + +

          *bold

          +

          ~strike

          + +

          em1em2

          diff --git a/test/emphasis_preserved_whitespace.md b/test/emphasis_preserved_whitespace.md new file mode 100644 index 00000000..0842c566 --- /dev/null +++ b/test/emphasis_preserved_whitespace.md @@ -0,0 +1,38 @@ +_emphasis_ + +_emphasis:_ some text + +_repeat:_ again + +**bold** + +**bold:** some text + +**repeat:** again + +~~strike~~ + +~~strike:~~ some text + +~~strike:~~ again + +separate _emphasis_ some more text + +_emphasis_. + +_emphasis_? + +_emphasis_! + +(_emphasis_) + +[**bold**} + +(~~strike~~] + +* **bold** + +~ ~~strike~~ + +_em1_ _em2_ + diff --git a/test/emphasis_whitespace.html b/test/emphasis_whitespace.html new file mode 100644 index 00000000..e1a2d628 --- /dev/null +++ b/test/emphasis_whitespace.html @@ -0,0 +1,31 @@ +

          ib

          + +

          .ib

          + +

          bi

          + +

          .bi

          + +

          is

          + +

          .is

          + +

          si

          + +

          .si

          + +

          bs

          + +

          .bs

          + +

          sb

          + +

          .sb

          + +

          sbi

          + +

          .sbi

          + +

          bis

          + +

          .bis

          diff --git a/test/emphasis_whitespace.md b/test/emphasis_whitespace.md new file mode 100644 index 00000000..06a48133 --- /dev/null +++ b/test/emphasis_whitespace.md @@ -0,0 +1,32 @@ +_**ib**_ + +._**ib**_ + +**_bi_** + +.**_bi_** + +_~~is~~_ + +._~~is~~_ + +_~~si~~_ + +._~~si~~_ + +**~~bs~~** + +.**~~bs~~** + +~~**sb**~~ + +.~~**sb**~~ + +~~**_sbi_**~~ + +.~~**_sbi_**~~ + +**_~~bis~~_** + +.**_~~bis~~_** + diff --git a/test/empty-link.html b/test/empty-link.html new file mode 100644 index 00000000..dad41d09 --- /dev/null +++ b/test/empty-link.html @@ -0,0 +1,6 @@ +

          Processing empty hyperlinks

          + +

          This test checks whether empty hyperlinks still appear in the markdown result.

          + + +

          \ No newline at end of file diff --git a/test/empty-link.md b/test/empty-link.md new file mode 100644 index 00000000..77bada00 --- /dev/null +++ b/test/empty-link.md @@ -0,0 +1,6 @@ +# Processing empty hyperlinks + +This test checks whether empty hyperlinks still appear in the markdown result. + +[](http://some.link) [](http://some.link) + diff --git a/test/empty-title-tag.html b/test/empty-title-tag.html new file mode 100644 index 00000000..bc8d3c57 --- /dev/null +++ b/test/empty-title-tag.html @@ -0,0 +1 @@ +This is an A tag with an empty title property \ No newline at end of file diff --git a/test/empty-title-tag.md b/test/empty-title-tag.md new file mode 100644 index 00000000..ce342288 --- /dev/null +++ b/test/empty-title-tag.md @@ -0,0 +1,2 @@ +[This is an A tag with an empty title property](test.html) + diff --git a/test/flip_emphasis.html b/test/flip_emphasis.html new file mode 100644 index 00000000..9152f43a --- /dev/null +++ b/test/flip_emphasis.html @@ -0,0 +1,2 @@ +Something +else diff --git a/test/flip_emphasis.md b/test/flip_emphasis.md new file mode 100644 index 00000000..9c436c53 --- /dev/null +++ b/test/flip_emphasis.md @@ -0,0 +1,2 @@ +*Something* __else__ + diff --git a/test/google-like_font-properties.html b/test/google-like_font-properties.html new file mode 100644 index 00000000..6f330b6b --- /dev/null +++ b/test/google-like_font-properties.html @@ -0,0 +1,21 @@ + + + CAPS-LOCK TEST + + +

          font-weight: bold

          +

          FONT-WEIGHT: BOLD

          +

          font-weight: 700

          +

          FONT-WEIGHT: 700

          +

          font-weight: 800

          +

          FONT-WEIGHT: 800

          +

          font-weight: 900

          +

          FONT-WEIGHT: 900

          +

          font-style: italic

          +

          FONT-STYLE: ITALIC

          +

          + font-weight: bold;font-style: italic

          +

          + FONT-WEIGHT: BOLD;FONT-STYLE: ITALIC

          + + diff --git a/test/google-like_font-properties.md b/test/google-like_font-properties.md new file mode 100644 index 00000000..c8a2e9c6 --- /dev/null +++ b/test/google-like_font-properties.md @@ -0,0 +1,12 @@ +**font-weight: bold** +**FONT-WEIGHT: BOLD** +**font-weight: 700** +**FONT-WEIGHT: 700** +**font-weight: 800** +**FONT-WEIGHT: 800** +**font-weight: 900** +**FONT-WEIGHT: 900** +_font-style: italic_ +_FONT-STYLE: ITALIC_ +_**font-weight: bold;font-style: italic**_ +_**FONT-WEIGHT: BOLD;FONT-STYLE: ITALIC**_ diff --git a/test/header_tags.html b/test/header_tags.html new file mode 100644 index 00000000..0e257936 --- /dev/null +++ b/test/header_tags.html @@ -0,0 +1,17 @@ + + +

          H1

          +

          H2

          +

          H3

          +

          H4

          +
          H5
          +
          H6
          + H7 + H8 + H9 + H10 + H11 + H12 + NO number + + diff --git a/test/header_tags.md b/test/header_tags.md new file mode 100644 index 00000000..cb1fd31e --- /dev/null +++ b/test/header_tags.md @@ -0,0 +1,20 @@ +# H1 + +## H2 + +### H3 + +#### H4 + +##### H5 + +###### H6 + +####### H7 + +######## H8 + +######### H9 + +H10 H11 H12 NO number + diff --git a/test/horizontal_rule.html b/test/horizontal_rule.html new file mode 100644 index 00000000..f159f52c --- /dev/null +++ b/test/horizontal_rule.html @@ -0,0 +1,5 @@ + + +
          + + diff --git a/test/horizontal_rule.md b/test/horizontal_rule.md new file mode 100644 index 00000000..8ccef4e8 --- /dev/null +++ b/test/horizontal_rule.md @@ -0,0 +1,2 @@ +* * * + diff --git a/test/html-escaping.html b/test/html-escaping.html new file mode 100644 index 00000000..9d805b03 --- /dev/null +++ b/test/html-escaping.html @@ -0,0 +1,3 @@ +

          Escaped HTML like <div> or & should NOT remain escaped on output

          +
          ...even when that escaped HTML is in a <pre> tag
          +...or a <code> tag \ No newline at end of file diff --git a/test/html-escaping.md b/test/html-escaping.md new file mode 100644 index 00000000..41a318b7 --- /dev/null +++ b/test/html-escaping.md @@ -0,0 +1,8 @@ +Escaped HTML like
          or & should NOT remain escaped on output + + + + ...even when that escaped HTML is in a
           tag
          +
          +`...or a  tag`
          +
          diff --git a/test/html_entities_out_of_text.html b/test/html_entities_out_of_text.html
          new file mode 100644
          index 00000000..1b062c9e
          --- /dev/null
          +++ b/test/html_entities_out_of_text.html
          @@ -0,0 +1 @@
          +állás: Country Manager
          diff --git a/test/html_entities_out_of_text.md b/test/html_entities_out_of_text.md
          new file mode 100644
          index 00000000..ee4b0a79
          --- /dev/null
          +++ b/test/html_entities_out_of_text.md
          @@ -0,0 +1,2 @@
          +[allas: Country Manager](http://thth)
          +
          diff --git a/test/images_as_html.html b/test/images_as_html.html
          new file mode 100644
          index 00000000..de52f953
          --- /dev/null
          +++ b/test/images_as_html.html
          @@ -0,0 +1,10 @@
          +
          +
          +An image with a width attr
          +
          +An image with a height attr
          +
          +An image with width and height
          +
          +
          +
          diff --git a/test/images_as_html.md b/test/images_as_html.md
          new file mode 100644
          index 00000000..91f1cc18
          --- /dev/null
          +++ b/test/images_as_html.md
          @@ -0,0 +1,7 @@
          + An image with a width attr An image with a height attr
          +An
          +image with width and height  
          +
          diff --git a/test/images_to_alt.html b/test/images_to_alt.html
          new file mode 100644
          index 00000000..9d135458
          --- /dev/null
          +++ b/test/images_to_alt.html
          @@ -0,0 +1,7 @@
          +
          +ALT TEXT
          +
          +
          +ALT TEXT +
          +http://example.com \ No newline at end of file diff --git a/test/images_to_alt.md b/test/images_to_alt.md new file mode 100644 index 00000000..c07f1f1d --- /dev/null +++ b/test/images_to_alt.md @@ -0,0 +1,4 @@ +[ ALT TEXT ](http://example.com) +[ALT TEXT](http://example.com) + + diff --git a/test/images_with_div_wrap.html b/test/images_with_div_wrap.html new file mode 100644 index 00000000..6fa6678a --- /dev/null +++ b/test/images_with_div_wrap.html @@ -0,0 +1 @@ +
          diff --git a/test/images_with_div_wrap.md b/test/images_with_div_wrap.md new file mode 100644 index 00000000..92c37228 --- /dev/null +++ b/test/images_with_div_wrap.md @@ -0,0 +1,2 @@ +[![](http://example.com/img.png)](http://example.com) + diff --git a/test/images_with_size.html b/test/images_with_size.html new file mode 100644 index 00000000..fcda9b67 --- /dev/null +++ b/test/images_with_size.html @@ -0,0 +1,10 @@ +An image without dimensions + +An image with a width attr + +An image with a height attr + +An image with width and height + + + diff --git a/test/images_with_size.md b/test/images_with_size.md new file mode 100644 index 00000000..8cf35a8b --- /dev/null +++ b/test/images_with_size.md @@ -0,0 +1,7 @@ +![An image without dimensions](image_without_dimensions.jpg) An image with a width attr +An image with a height attr An
+image with width and height ![](image_with_width_and_height.jpg) + diff --git a/test/img-tag-with-link.html b/test/img-tag-with-link.html new file mode 100644 index 00000000..92d3a96e --- /dev/null +++ b/test/img-tag-with-link.html @@ -0,0 +1,9 @@ +

          Processing images with links

          + +

          This test checks images with associated links.

          + +(banana) +[banana] +{banana} +([{}]) + \ No newline at end of file diff --git a/test/img-tag-with-link.md b/test/img-tag-with-link.md new file mode 100644 index 00000000..3025423e --- /dev/null +++ b/test/img-tag-with-link.md @@ -0,0 +1,10 @@ +# Processing images with links + +This test checks images with associated links. + +[![\(banana\)](http://placehold.it/350x150#\(banana\))](http://some.link) +[![\[banana\]](http://placehold.it/350x150#\[banana\])](http://some.link) +[![{banana}](http://placehold.it/350x150#{banana})](http://some.link) +[![\(\[{}\]\)](http://placehold.it/350x150#\(\[{}\]\))](http://some.link) +[![](http://placehold.it/350x150#\(\[{}\]\))](http://some.link) + diff --git a/test/inplace_baseurl_substitution.html b/test/inplace_baseurl_substitution.html new file mode 100644 index 00000000..cb55345d --- /dev/null +++ b/test/inplace_baseurl_substitution.html @@ -0,0 +1,11 @@ + + + +

          +read2text header image +

          +

          +BrettTerpstra.com +

          + + diff --git a/test/inplace_baseurl_substitution.md b/test/inplace_baseurl_substitution.md new file mode 100644 index 00000000..bc73c977 --- /dev/null +++ b/test/inplace_baseurl_substitution.md @@ -0,0 +1,3 @@ +![read2text header image](http://brettterpstra.com/uploads/2012/01/read2textheader.jpg) + +[BrettTerpstra.com](http://brettterpstra.com/) diff --git a/test/invalid_start.html b/test/invalid_start.html new file mode 100644 index 00000000..0e49fe53 --- /dev/null +++ b/test/invalid_start.html @@ -0,0 +1,8 @@ + + +
            +
          1. The ol has an invalid start
          2. +
          3. This should just be ignored
          4. +
          + + \ No newline at end of file diff --git a/test/invalid_start.md b/test/invalid_start.md new file mode 100644 index 00000000..bed039c4 --- /dev/null +++ b/test/invalid_start.md @@ -0,0 +1,3 @@ + 1. The ol has an invalid start + 2. This should just be ignored + diff --git a/test/invalid_unicode.html b/test/invalid_unicode.html new file mode 100644 index 00000000..3dd8b18a --- /dev/null +++ b/test/invalid_unicode.html @@ -0,0 +1 @@ +B�r diff --git a/test/invalid_unicode.md b/test/invalid_unicode.md new file mode 100644 index 00000000..865cc856 --- /dev/null +++ b/test/invalid_unicode.md @@ -0,0 +1,2 @@ +Br + diff --git a/test/kbd_tag.html b/test/kbd_tag.html new file mode 100644 index 00000000..4c2b92a2 --- /dev/null +++ b/test/kbd_tag.html @@ -0,0 +1 @@ +Press [CTRL]+c to copy. diff --git a/test/kbd_tag.md b/test/kbd_tag.md new file mode 100644 index 00000000..7fd30339 --- /dev/null +++ b/test/kbd_tag.md @@ -0,0 +1,2 @@ +Press `[CTRL]+c` to copy. + diff --git a/test/link_titles.html b/test/link_titles.html new file mode 100644 index 00000000..17b9f538 --- /dev/null +++ b/test/link_titles.html @@ -0,0 +1,3 @@ + first example +
          +

          second example

          diff --git a/test/link_titles.md b/test/link_titles.md new file mode 100644 index 00000000..930c691d --- /dev/null +++ b/test/link_titles.md @@ -0,0 +1,3 @@ +[ first example](http://example.com "MyTitle") +[ second example](http://example.com) + diff --git a/test/list_tags_example.html b/test/list_tags_example.html new file mode 100644 index 00000000..9a5ba7ab --- /dev/null +++ b/test/list_tags_example.html @@ -0,0 +1,39 @@ +
          +
          Definition List
          +
          A list of terms and their definitions/descriptions.
          +
          Ordered List
          +
          A numbered list.
          +
          Unordered List
          +
          An unnumbered list.
          +
          + +

          Example 2

          +
          +
          Vocals
          +
          Bruce Dickinson
          +
          Guitar
          +
          Adrian Smith
          +
          Dave Murray
          +
          Janick Gers
          +
          Bass
          +
          Steve Harris
          +
          Drums
          +
          Nicko McBrain
          +
          + +
            +
          • some item
          • +
          • Some other item
          • +
          • some item
          • +
          + +
            +
          1. Some other item
          2. +
          3. some item
          4. +
          5. some item
          6. +
          + +
            +
          • something else here
          • +
          • some item
          • +
          diff --git a/test/list_tags_example.md b/test/list_tags_example.md new file mode 100644 index 00000000..71568ef7 --- /dev/null +++ b/test/list_tags_example.md @@ -0,0 +1,38 @@ +Definition List + + A list of terms and their definitions/descriptions. +Ordered List + + A numbered list. +Unordered List + + An unnumbered list. + +#### Example 2 + +Vocals + + Bruce Dickinson +Guitar + + Adrian Smith + Dave Murray + Janick Gers +Bass + + Steve Harris +Drums + + Nicko McBrain + + * some item + * Some other item + * some item + + 1. Some other item + 2. some item + 3. some item + + * something else here + * some item + diff --git a/test/long_lines.html b/test/long_lines.html new file mode 100644 index 00000000..7389de91 --- /dev/null +++ b/test/long_lines.html @@ -0,0 +1 @@ +asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd diff --git a/test/long_lines.md b/test/long_lines.md new file mode 100644 index 00000000..65363ae7 --- /dev/null +++ b/test/long_lines.md @@ -0,0 +1,14 @@ +asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd +asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd +asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd +asd asd asd asd asd asd 
asd asd asd asd asd asd asd asd asd asd asd asd asd +asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd +asd asd asd asd asd +![](http://www.foooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo.com) +asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd +asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd +asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd +asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd +asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd +asd asd asd asd asd + diff --git a/test/lrm_after_b.html b/test/lrm_after_b.html new file mode 100644 index 00000000..89932aa3 --- /dev/null +++ b/test/lrm_after_b.html @@ -0,0 +1 @@ +b‎ diff --git a/test/lrm_after_b.md b/test/lrm_after_b.md new file mode 100644 index 00000000..9c875b21 --- /dev/null +++ b/test/lrm_after_b.md @@ -0,0 +1,2 @@ +**b** + diff --git a/test/lrm_after_i.html b/test/lrm_after_i.html new file mode 100644 index 00000000..ab65a679 --- /dev/null +++ b/test/lrm_after_i.html @@ -0,0 +1 @@ +Foo‎ \ No newline at end of file diff --git a/test/lrm_after_i.md b/test/lrm_after_i.md new file mode 100644 index 00000000..46c9efa1 --- /dev/null +++ b/test/lrm_after_i.md @@ -0,0 +1,2 @@ +_Foo_ + diff --git a/test/lrm_inside_i.html b/test/lrm_inside_i.html new file mode 100644 index 00000000..1e51164d --- /dev/null +++ b/test/lrm_inside_i.html @@ -0,0 +1 @@ +Foo‎bar \ No newline at end of file diff --git a/test/lrm_inside_i.md b/test/lrm_inside_i.md new file mode 100644 index 00000000..48926295 --- /dev/null +++ b/test/lrm_inside_i.md @@ -0,0 +1,2 @@ +_Foo bar_ + diff --git a/test/mark_code.html b/test/mark_code.html new file mode 100644 index 00000000..eed53c5b --- /dev/null +++ b/test/mark_code.html @@ -0,0 +1,12 @@ + + +

          Normal text with 'pre' code block.

          +
          +import os
          +
          +def function():
          +    a = 1
          +
          +

          Normal text continues.

          + + diff --git a/test/mark_code.md b/test/mark_code.md new file mode 100644 index 00000000..40ca81e1 --- /dev/null +++ b/test/mark_code.md @@ -0,0 +1,13 @@ +Normal text with 'pre' code block. + +[code] + + import os + + def function(): + a = 1 + +[/code] + +Normal text continues. + diff --git a/test/mixed_nested_lists.html b/test/mixed_nested_lists.html new file mode 100644 index 00000000..c7ed28d2 --- /dev/null +++ b/test/mixed_nested_lists.html @@ -0,0 +1,20 @@ +
            +
          1. ordered
          2. +
          3. ...
          4. +
              +
            • unordered
            • +
            • ...
            • +
            +
          5. end
          6. +
          + +
            +
          • unordered
          • +
          • ...
          • +
              +
            1. ordered
            2. +
            3. ...
            4. +
            +
          • end
          • +
          + diff --git a/test/mixed_nested_lists.md b/test/mixed_nested_lists.md new file mode 100644 index 00000000..d9934ffd --- /dev/null +++ b/test/mixed_nested_lists.md @@ -0,0 +1,12 @@ + 1. ordered + 2. ... + * unordered + * ... + 3. end + + * unordered + * ... + 1. ordered + 2. ... + * end + diff --git a/test/no_inline_links_example.html b/test/no_inline_links_example.html new file mode 100644 index 00000000..5e4c45c8 --- /dev/null +++ b/test/no_inline_links_example.html @@ -0,0 +1,9 @@ +Googler + No href + No href but title available + Example + + + +link text + diff --git a/test/no_inline_links_example.md b/test/no_inline_links_example.md new file mode 100644 index 00000000..dedea641 --- /dev/null +++ b/test/no_inline_links_example.md @@ -0,0 +1,9 @@ +[Googler][1] No href No href but title available [ Example][2] [ [ [ link text +][3]][3]][3] + + [1]: http://google.com + + [2]: http://example.com (Example title) + + [3]: http://example.com (abc) + diff --git a/test/no_inline_links_images_to_alt.html b/test/no_inline_links_images_to_alt.html new file mode 100644 index 00000000..83c395d9 --- /dev/null +++ b/test/no_inline_links_images_to_alt.html @@ -0,0 +1,7 @@ + +ALT TEXT + +
          +ALT TEXT +
          +http://example.com diff --git a/test/no_inline_links_images_to_alt.md b/test/no_inline_links_images_to_alt.md new file mode 100644 index 00000000..83266dfe --- /dev/null +++ b/test/no_inline_links_images_to_alt.md @@ -0,0 +1,8 @@ +[ ![ALT TEXT][1] ][2] +[![ALT TEXT][1]][2] +[![http://example.com][1]][2] + + [1]: http://example.com/img.png + + [2]: http://example.com + diff --git a/test/no_inline_links_nested.html b/test/no_inline_links_nested.html new file mode 100644 index 00000000..2dc9b075 --- /dev/null +++ b/test/no_inline_links_nested.html @@ -0,0 +1 @@ +thisthat diff --git a/test/no_inline_links_nested.md b/test/no_inline_links_nested.md new file mode 100644 index 00000000..40d5abb2 --- /dev/null +++ b/test/no_inline_links_nested.md @@ -0,0 +1,6 @@ +[[this][1]that][2] + + [1]: /test2/ + + [2]: http://google.com + diff --git a/test/no_mailto_links.html b/test/no_mailto_links.html new file mode 100644 index 00000000..e905e925 --- /dev/null +++ b/test/no_mailto_links.html @@ -0,0 +1 @@ +Send an email to me@example.com. diff --git a/test/no_mailto_links.md b/test/no_mailto_links.md new file mode 100644 index 00000000..dbffdced --- /dev/null +++ b/test/no_mailto_links.md @@ -0,0 +1,2 @@ +Send an email to me@example.com. + diff --git a/test/no_wrap_links.html b/test/no_wrap_links.html new file mode 100644 index 00000000..650ad814 --- /dev/null +++ b/test/no_wrap_links.html @@ -0,0 +1 @@ +And here is a long link I had at hand.

          diff --git a/test/no_wrap_links.md b/test/no_wrap_links.md new file mode 100644 index 00000000..34fbfbca --- /dev/null +++ b/test/no_wrap_links.md @@ -0,0 +1,2 @@ +And [here](http://bugs.debian.org/cgi-bin/pkgreport.cgi?tag=multiarch;users=debian-dpkg@lists.debian.org) is a long link I had at hand. + diff --git a/test/no_wrap_links_no_inline_links.html b/test/no_wrap_links_no_inline_links.html new file mode 100644 index 00000000..c0c44eac --- /dev/null +++ b/test/no_wrap_links_no_inline_links.html @@ -0,0 +1 @@ +And here is a long link I had at hand. diff --git a/test/no_wrap_links_no_inline_links.md b/test/no_wrap_links_no_inline_links.md new file mode 100644 index 00000000..2e7f3e17 --- /dev/null +++ b/test/no_wrap_links_no_inline_links.md @@ -0,0 +1,2 @@ +And [here](http://bugs.debian.org/cgi-bin/pkgreport.cgi?tag=multiarch;users=debian-dpkg@lists.debian.org) is a long link I had at hand. + diff --git a/test/pad_table.html b/test/pad_table.html new file mode 100644 index 00000000..3fdd40c0 --- /dev/null +++ b/test/pad_table.html @@ -0,0 +1,51 @@ + + +

          This is a test document

          With some text, code, bolds and italics.

          This is second header

          Displaynone text

          + + + + + + +
          Header 1 Header 2 Header 3
          Content 1 2 200 Image!
          Content 1 longer Content 2 blah
          Content Content 2 blah
          t Content 2 blah blah blah
          + + + + + +
          H1 H2 H3
          C1 Content 2 x
          C123 Content 2 xyz
          + +some content between the tables
          + + + + +
          Header 1 Header 2 Header 3
          Content 1 Content 2 200 Image!
          Content 1 Content 2 longer 200 Image!
          + +something else entirely
          + + + + + + + + + + + +
          OneTwoThree
          ABC
          AB+C
          A+BC
          A+B+C
          + + + + + + + + + + + +
          One+TwoThree
          ABC
          AB+C
          A+BC
          A+B+C
          + + diff --git a/test/pad_table.md b/test/pad_table.md new file mode 100644 index 00000000..81e8fcb0 --- /dev/null +++ b/test/pad_table.md @@ -0,0 +1,42 @@ +# This is a test document + +With some text, `code`, **bolds** and _italics_. + +## This is second header + +Displaynone text + +| Header 1 | Header 2 | Header 3 | +|------------------|-----------|----------------------------------------------| +| Content 1 | 2 | ![200](http://lorempixel.com/200/200) Image! | +| Content 1 longer | Content 2 | blah | +| Content | Content 2 | blah | +| t | Content 2 | blah blah blah | + +| H1 | H2 | H3 | +|------|-----------|-----| +| C1 | Content 2 | x | +| C123 | Content 2 | xyz | + +some content between the tables +| Header 1 | Header 2 | Header 3 | +|-----------|------------------|----------------------------------------------| +| Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! | +| Content 1 | Content 2 longer | ![200](http://lorempixel.com/200/200) Image! | + +something else entirely +| One | Two | Three | +|-------|-----|-------| +| A | B | C | +| A | B+C | +| A+B | C | +| A+B+C | + +| One+Two | Three | +|---------|-------| +| A | B | C | +| A | B+C | +| A+B | C | +| A+B+C | + + diff --git a/test/pad_table_empty.html b/test/pad_table_empty.html new file mode 100644 index 00000000..35123fd5 --- /dev/null +++ b/test/pad_table_empty.html @@ -0,0 +1 @@ +
          diff --git a/test/pad_table_empty.md b/test/pad_table_empty.md new file mode 100644 index 00000000..392d455f --- /dev/null +++ b/test/pad_table_empty.md @@ -0,0 +1,3 @@ +| | + + diff --git a/test/pad_table_no_closed_tr.html b/test/pad_table_no_closed_tr.html new file mode 100644 index 00000000..8292d74b --- /dev/null +++ b/test/pad_table_no_closed_tr.html @@ -0,0 +1,10 @@ + + + + + + + + + +
          xxx xxxxxxx
          xxxx xxxxxxxx
          diff --git a/test/pad_table_no_closed_tr.md b/test/pad_table_no_closed_tr.md new file mode 100644 index 00000000..2f168ea7 --- /dev/null +++ b/test/pad_table_no_closed_tr.md @@ -0,0 +1,3 @@ +| **xxx** | **xxx** | xxxx | **xxxx** | **xxxx** | xxxx | + + diff --git a/test/pre.html b/test/pre.html index 171074b7..9872fc24 100644 --- a/test/pre.html +++ b/test/pre.html @@ -1,6 +1,6 @@ - initial crowsed pre handling test #1 + initial crowded pre handling test #1
          a
          diff --git a/test/protect_links.html b/test/protect_links.html
          new file mode 100644
          index 00000000..b248d1ee
          --- /dev/null
          +++ b/test/protect_links.html
          @@ -0,0 +1 @@
          +foo
          \ No newline at end of file
          diff --git a/test/protect_links.md b/test/protect_links.md
          new file mode 100644
          index 00000000..23e153e9
          --- /dev/null
          +++ b/test/protect_links.md
          @@ -0,0 +1,3 @@
          +[foo]()
          +
          diff --git a/test/q_tag.html b/test/q_tag.html
          new file mode 100644
          index 00000000..63e03608
          --- /dev/null
          +++ b/test/q_tag.html
          @@ -0,0 +1 @@
          +If this is a test, he said, then it should pass.
          diff --git a/test/q_tag.md b/test/q_tag.md
          new file mode 100644
          index 00000000..be2109bc
          --- /dev/null
          +++ b/test/q_tag.md
          @@ -0,0 +1,2 @@
          +"If this is a test," he said, "then it should pass".
          +
          diff --git a/test/rlm_inside_strong.html b/test/rlm_inside_strong.html
          new file mode 100644
          index 00000000..34b58f72
          --- /dev/null
          +++ b/test/rlm_inside_strong.html
          @@ -0,0 +1 @@
          +Foo‏bar
          \ No newline at end of file
          diff --git a/test/rlm_inside_strong.md b/test/rlm_inside_strong.md
          new file mode 100644
          index 00000000..a172d71a
          --- /dev/null
          +++ b/test/rlm_inside_strong.md
          @@ -0,0 +1,2 @@
          +**Foo bar**
          +
          diff --git a/test/run_tests.py b/test/run_tests.py
          deleted file mode 100644
          index 7ebfd394..00000000
          --- a/test/run_tests.py
          +++ /dev/null
          @@ -1,124 +0,0 @@
          -import codecs
          -import glob
          -import os
          -import re
          -import subprocess
          -import sys
          -
          -sys.path.insert(0, '..')
          -import html2text
          -
          -
          -def test_module(fn, google_doc=False, **kwargs):
          -    print_conditions('module', google_doc=google_doc, **kwargs)
          -
          -    h = html2text.HTML2Text()
          -
          -    if google_doc:
          -        h.google_doc = True
          -        h.ul_item_mark = '-'
          -        h.body_width = 0
          -        h.hide_strikethrough = True
          -
          -    for k, v in kwargs.iteritems():
          -        setattr(h, k, v)
          -
          -    result = get_baseline(fn)
          -    actual = h.handle(file(fn).read())
          -    return print_result(fn, 'module', result, actual)
          -
          -def test_command(fn, *args):
          -    print_conditions('command', *args)
          -    args = list(args)
          -
          -    cmd = [sys.executable or 'python', '../html2text.py']
          -
          -    if '--googledoc' in args:
          -        args.remove('--googledoc')
          -        cmd += ['-g', '-d', '-b', '0', '-s']
          -
          -    if args:
          -        cmd.extend(args)
          -
          -    cmd += [fn]
          -
          -    result = get_baseline(fn)
          -    actual = subprocess.Popen(cmd, stdout=subprocess.PIPE).stdout.read()
          -
          -    if os.name == 'nt':
          -        # Fix the unwanted CR to CRCRLF replacement
          -        # during text pipelining on Windows/cygwin
          -        actual = re.sub(r'\r+', '\r', actual)
          -        actual = actual.replace('\r\n', '\n')
          -
          -    return print_result(fn, 'command', result, actual)
          -
          -def print_conditions(mode, *args, **kwargs):
          -    format = " * %s %s, %s: "
          -    sys.stdout.write(format % (mode, args, kwargs))
          -
          -def print_result(fn, mode, result, actual):
          -    if result == actual:
          -        print('PASS')
          -        return True
          -    else:
          -        print('FAIL')
          -
          -        if mode == 'command':
          -            print(len(result), len(actual))
          -
          -        dump_name = get_dump_name(fn, mode)
          -
          -        f = codecs.open(dump_name, encoding='utf-8', mode='w+')
          -        f.write(actual)
          -
          -        print("  Use: diff -u %s %s" % (get_baseline_name(fn), dump_name))
          -        return False
          -
          -def get_dump_name(fn, suffix):
          -    return '%s-%s_output.md' % (os.path.splitext(fn)[0], suffix)
          -
          -def get_baseline_name(fn):
          -    return os.path.splitext(fn)[0] + '.md'
          -
          -def get_baseline(fn):
          -    name = get_baseline_name(fn)
          -    f = codecs.open(name, mode='r', encoding='utf8')
          -    return f.read()
          -
          -def run_all_tests():
          -    html_files = glob.glob("*.html")
          -    passing = True
          -    for fn in html_files:
          -        module_args = {}
          -        cmdline_args = []
          -
          -        if fn.lower().startswith('google'):
          -            module_args['google_doc'] = True
          -            cmdline_args.append('--googledoc')
          -
          -        if fn.lower().find('unicode') >= 0:
          -            module_args['unicode_snob'] = True
          -
          -        if fn.lower().find('flip_emphasis') >= 0:
          -            module_args['emphasis_mark'] = '*'
          -            module_args['strong_mark'] = '__'
          -            cmdline_args.append('-e')
          -
          -        if fn.lower().find('escape_snob') >= 0:
          -            module_args['escape_snob'] = True
          -            cmdline_args.append('--escape-all')
          -
          -        print('\n' + fn + ':')
          -        passing = passing and test_module(fn, **module_args)
          -
          -        if not 'unicode_snob' in module_args: # Because there is no command-line option to control unicode_snob
          -            passing = passing and test_command(fn, *cmdline_args)
          -    if passing:
          -        print("ALL TESTS PASSED")
          -    else:
          -        print("Fail.")
          -        sys.exit(1)
          -
          -if __name__ == "__main__":
          -    run_all_tests()
          diff --git a/test/single_line_break.html b/test/single_line_break.html
          new file mode 100644
          index 00000000..ec3598e1
          --- /dev/null
          +++ b/test/single_line_break.html
          @@ -0,0 +1,6 @@
          +
          +
          +
          +
          Hello world.
          +
          And hello html2text.
          +
          diff --git a/test/single_line_break.md b/test/single_line_break.md new file mode 100644 index 00000000..31fde4fc --- /dev/null +++ b/test/single_line_break.md @@ -0,0 +1,2 @@ +Hello world. +And hello html2text. diff --git a/test/stressed_with_html_entities.html b/test/stressed_with_html_entities.html new file mode 100644 index 00000000..de925e2b --- /dev/null +++ b/test/stressed_with_html_entities.html @@ -0,0 +1 @@ +

          hello world ><

          diff --git a/test/stressed_with_html_entities.md b/test/stressed_with_html_entities.md new file mode 100644 index 00000000..6c59c1e9 --- /dev/null +++ b/test/stressed_with_html_entities.md @@ -0,0 +1,2 @@ +**hello** world >< + diff --git a/test/table_ignore.html b/test/table_ignore.html new file mode 100644 index 00000000..d2966aae --- /dev/null +++ b/test/table_ignore.html @@ -0,0 +1,26 @@ + + +

          This is a test document

          With some text, code, bolds and italics.

          This is second header

          Displaynone text

          + + + + + + +
          Header 1 Header 2 Header 3
          Content 1 2 200 Image!
          Content 1 longer Content 2 blah
          Content Content 2 blah
          t Content 2 blah blah blah
          + + + + + +
          H1 H2 H3
          C1 Content 2 x
          C123 Content 2 xyz
          + +some content between the tables
          + + + + +
          Header 1 Header 2 Header 3
          Content 1 Content 2 200 Image!
          Content 1 Content 2 longer 200 Image!
          + +something else entirely + diff --git a/test/table_ignore.md b/test/table_ignore.md new file mode 100644 index 00000000..ce2dedac --- /dev/null +++ b/test/table_ignore.md @@ -0,0 +1,22 @@ +# This is a test document + +With some text, `code`, **bolds** and _italics_. + +## This is second header + +Displaynone text + +Header 1 Header 2 Header 3 +Content 1 2 ![200](http://lorempixel.com/200/200) Image! +Content 1 longer Content 2 blah +Content Content 2 blah +t Content 2 blah blah blah +H1 H2 H3 +C1 Content 2 x +C123 Content 2 xyz +some content between the tables +Header 1 Header 2 Header 3 +Content 1 Content 2 ![200](http://lorempixel.com/200/200) Image! +Content 1 Content 2 longer ![200](http://lorempixel.com/200/200) Image! +something else entirely + diff --git a/test/test_html2text.py b/test/test_html2text.py new file mode 100644 index 00000000..eebe34f3 --- /dev/null +++ b/test/test_html2text.py @@ -0,0 +1,238 @@ +import glob +import os +import re +import subprocess +import sys + +import pytest + +import html2text + +skip = object() + + +def cleanup_eol(clean_str): + if os.name == "nt" or sys.platform == "cygwin": + # Fix the unwanted CR to CRCRLF replacement + # during text pipelining on Windows/cygwin + # on cygwin, os.name == 'posix', not nt + clean_str = re.sub(r"\r+", "\r", clean_str) + clean_str = clean_str.replace("\r\n", "\n") + return clean_str + + +def generate_testdata(): + test_dir_name = os.path.dirname(os.path.realpath(__file__)) + for fn in glob.glob("%s/*.html" % test_dir_name): + module_args = {} + cmdline_args = [] + func_args = {} + base_fn = os.path.basename(fn).lower() + + if base_fn.startswith("default_image_alt"): + module_args["default_image_alt"] = "Image" + cmdline_args.append("--default-image-alt=Image") + func_args = skip + + if base_fn.startswith("google"): + module_args["google_doc"] = True + cmdline_args.append("--googledoc") + func_args = skip + + if base_fn.find("unicode") >= 0: + module_args["unicode_snob"] = True + 
cmdline_args.append("--unicode-snob") + func_args = skip + + if base_fn.find("flip_emphasis") >= 0: + module_args["emphasis_mark"] = "*" + module_args["strong_mark"] = "__" + cmdline_args.append("-e") + func_args = skip + + if base_fn.find("escape_snob") >= 0: + module_args["escape_snob"] = True + cmdline_args.append("--escape-all") + func_args = skip + + if base_fn.find("table_bypass") >= 0: + module_args["bypass_tables"] = True + cmdline_args.append("--bypass-tables") + func_args = skip + + if base_fn.startswith("table_ignore"): + module_args["ignore_tables"] = True + cmdline_args.append("--ignore-tables") + func_args = skip + + if base_fn.startswith("bodywidth"): + module_args["body_width"] = 0 + cmdline_args.append("--body-width=0") + func_args["bodywidth"] = 0 + + if base_fn.startswith("protect_links"): + module_args["protect_links"] = True + cmdline_args.append("--protect-links") + func_args = skip + + if base_fn.startswith("images_as_html"): + module_args["images_as_html"] = True + cmdline_args.append("--images-as-html") + func_args = skip + + if base_fn.startswith("images_to_alt"): + module_args["images_to_alt"] = True + cmdline_args.append("--images-to-alt") + func_args = skip + + if base_fn.startswith("images_with_size"): + module_args["images_with_size"] = True + cmdline_args.append("--images-with-size") + func_args = skip + + if base_fn.startswith("single_line_break"): + module_args["body_width"] = 0 + cmdline_args.append("--body-width=0") + module_args["single_line_break"] = True + cmdline_args.append("--single-line-break") + func_args = skip + + if base_fn.startswith("no_inline_links"): + module_args["inline_links"] = False + cmdline_args.append("--reference-links") + func_args = skip + + if base_fn.startswith("no_mailto_links"): + module_args["ignore_mailto_links"] = True + cmdline_args.append("--ignore-mailto-links") + func_args = skip + + if base_fn.startswith("no_wrap_links"): + module_args["wrap_links"] = False + 
cmdline_args.append("--no-wrap-links") + func_args = skip + + if base_fn.startswith("mark_code"): + module_args["mark_code"] = True + cmdline_args.append("--mark-code") + func_args = skip + + if base_fn.startswith("pad_table"): + module_args["pad_tables"] = True + cmdline_args.append("--pad-tables") + func_args = skip + + if base_fn.startswith("wrap_list_items"): + module_args["wrap_list_items"] = True + cmdline_args.append("--wrap-list-items") + func_args = skip + + if base_fn.startswith("wrap_tables"): + module_args["wrap_tables"] = True + cmdline_args.append("--wrap-tables") + func_args = skip + + if base_fn == "inplace_baseurl_substitution.html": + module_args["baseurl"] = "http://brettterpstra.com" + module_args["body_width"] = 0 + func_args["baseurl"] = "http://brettterpstra.com" + func_args["bodywidth"] = 0 + # CLI doesn't support baseurl. + cmdline_args = skip + + yield fn, module_args, cmdline_args, func_args + + +def generate_module_testdata(): + for fn, module_args, cmdline_args, func_args in generate_testdata(): + yield fn, module_args + + +def generate_command_testdata(): + for fn, module_args, cmdline_args, func_args in generate_testdata(): + if cmdline_args is not skip: + yield fn, cmdline_args + + +def generate_function_testdata(): + for fn, module_args, cmdline_args, func_args in generate_testdata(): + if func_args is not skip: + yield fn, func_args + + +@pytest.mark.parametrize("fn,module_args", generate_module_testdata()) +def test_module(fn, module_args): + h = html2text.HTML2Text() + h.fn = fn + + if module_args.pop("google_doc", False): + h.google_doc = True + h.ul_item_mark = "-" + h.body_width = 0 + h.hide_strikethrough = True + + for k, v in module_args.items(): + setattr(h, k, v) + + result = get_baseline(fn) + with open(fn) as inf: + actual = cleanup_eol(inf.read()) + actual = h.handle(actual) + assert result == actual + + +@pytest.mark.parametrize("fn,cmdline_args", generate_command_testdata()) +def test_command(fn, cmdline_args): + args 
= list(cmdline_args) + cmd = [sys.executable, "-m", "html2text"] + + if "--googledoc" in args: + args.remove("--googledoc") + cmd += ["-g", "-d", "-b", "0", "-s"] + + if args: + cmd.extend(args) + + cmd += [fn] + + result = get_baseline(fn) + out = subprocess.check_output(cmd) + + actual = out.decode() + + actual = cleanup_eol(actual) + + assert result == actual + + +@pytest.mark.parametrize("fn,func_args", generate_function_testdata()) +def test_function(fn, func_args): + with open(fn) as inf: + actual = html2text.html2text(inf.read(), **func_args) + result = get_baseline(fn) + assert result == actual + + +def get_baseline_name(fn): + return os.path.splitext(fn)[0] + ".md" + + +def get_baseline(fn): + name = get_baseline_name(fn) + with open(name, encoding="utf-8") as f: + out = f.read() + return cleanup_eol(out) + + +def test_tag_callback(): + def _skip_certain_tags(h2t, tag, attrs, start): + if tag == "b": + return True + + h = html2text.HTML2Text() + h.tag_callback = _skip_certain_tags + ret = h.handle( + 'this is a txt and this is a with text and ' + "some italics too." + ) + assert ret == ("this is a txt and this is a with text and some _italics_ too.\n\n") diff --git a/test/test_memleak.py b/test/test_memleak.py new file mode 100644 index 00000000..45a3ff68 --- /dev/null +++ b/test/test_memleak.py @@ -0,0 +1,26 @@ +import html2text + +# See https://github.com/Alir3z4/html2text/issues/13 for more information. + +INSTR = "miow " + + +def test_same_string(): + h2t = html2text.HTML2Text() + result = h2t.handle(INSTR) + # Now, we shouldn't get leak of the previous run to the new one. + assert h2t.handle(INSTR) == result + + +def test_empty_string(): + h2t = html2text.HTML2Text() + h2t.handle(INSTR) + # And even less when the input is empty. + assert h2t.handle("") == "\n\n" + + +def test_abbr_data(): + h2t = html2text.HTML2Text() + result = h2t.handle('

          foo TLA bar

          ') + assert result == "foo TLA bar\n\n *[TLA]: Three Letter Acronym\n\n" + assert h2t.abbr_data is None diff --git a/test/text_after_list.html b/test/text_after_list.html new file mode 100644 index 00000000..8691b4ff --- /dev/null +++ b/test/text_after_list.html @@ -0,0 +1,2 @@ +
          • item
          +text diff --git a/test/text_after_list.md b/test/text_after_list.md new file mode 100644 index 00000000..66d9d984 --- /dev/null +++ b/test/text_after_list.md @@ -0,0 +1,4 @@ + * item + +text + diff --git a/test/url-escaping.html b/test/url-escaping.html index 5c5693ce..5b03e697 100644 --- a/test/url-escaping.html +++ b/test/url-escaping.html @@ -6,8 +6,8 @@

          Markdown-sensible characters processing

        2. Some MSDN link using parenthesis
        3. Google search result URL with unescaped brackets
        4. Yet another test for [brackets], {curly braces} and (parenthesis) processing inside the anchor
        5. -
        6. Use automatic links like http://example.com/ when the URL is the label -
        7. Exempt non-absolute_URIs from automatic link detection +
        8. Use automatic links like http://example.com/ when the URL is the label
        9. +
        10. Exempt non-absolute_URIs from automatic link detection

And here are images with tricky attribute values:

@@ -16,3 +16,4 @@

Markdown-sensible characters processing

[banana]
{banana}
([{}]) + diff --git a/test/url-escaping.md b/test/url-escaping.md index ffb2bc87..f409d406 100644 --- a/test/url-escaping.md +++ b/test/url-escaping.md @@ -16,4 +16,5 @@ And here are images with tricky attribute values: ![\[banana\]](http://placehold.it/350x150#\[banana\]) ![{banana}](http://placehold.it/350x150#{banana}) ![\(\[{}\]\)](http://placehold.it/350x150#\(\[{}\]\)) +![](http://placehold.it/350x150#\(\[{}\]\)) diff --git a/test/wrap_list_items_example.html b/test/wrap_list_items_example.html new file mode 100644 index 00000000..26d5d9de --- /dev/null +++ b/test/wrap_list_items_example.html @@ -0,0 +1,13 @@ +
    +
  • One two three four five six seven eight nine ten eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen twenty.
  • +
  • One two three four five six seven eight nine ten eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen twenty.
  • +
+ +Text between lists. + +
    +
  • One two three four five six seven eight nine ten eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen twenty.
  • +
  • One two three four five six seven eight nine ten eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen twenty.
  • +
+ +Text after list. diff --git a/test/wrap_list_items_example.md b/test/wrap_list_items_example.md new file mode 100644 index 00000000..cf80e161 --- /dev/null +++ b/test/wrap_list_items_example.md @@ -0,0 +1,14 @@ + * One two three four five six seven eight nine ten eleven twelve thirteen + fourteen fifteen sixteen seventeen eighteen nineteen twenty. + * One two three four five six seven eight nine ten eleven twelve thirteen + fourteen fifteen sixteen seventeen eighteen nineteen twenty. + +Text between lists. + + * One two three four five six seven eight nine ten eleven twelve thirteen + fourteen fifteen sixteen seventeen eighteen nineteen twenty. + * One two three four five six seven eight nine ten eleven twelve thirteen + fourteen fifteen sixteen seventeen eighteen nineteen twenty. + +Text after list. + diff --git a/test/wrap_tables.html b/test/wrap_tables.html new file mode 100644 index 00000000..d7788007 --- /dev/null +++ b/test/wrap_tables.html @@ -0,0 +1,12 @@ + + +

This is a test document

With some text, code, bolds and italics.

This is second header

Displaynone text

+ + + + + + +
Header 1 Header 2 Header 3
Content 1 2 200 Image!
Content 1 longer Content 2 Here is some really long text that will wrap to the next line. Because it's so long.
Content Content 2 blah
t Content 2 blah blah blah
+ + diff --git a/test/wrap_tables.md b/test/wrap_tables.md new file mode 100644 index 00000000..37decf29 --- /dev/null +++ b/test/wrap_tables.md @@ -0,0 +1,16 @@ +# This is a test document + +With some text, `code`, **bolds** and _italics_. + +## This is second header + +Displaynone text + +Header 1 | Header 2 | Header 3 +---|---|--- +Content 1 | 2 | ![200](http://lorempixel.com/200/200) Image! +Content 1 longer | Content 2 | Here is some really long text that will wrap to +the next line. Because it's so long. +Content | Content 2 | blah +t | Content 2 | blah blah blah + diff --git a/tox.ini b/tox.ini new file mode 100644 index 00000000..baaf18f0 --- /dev/null +++ b/tox.ini @@ -0,0 +1,44 @@ +[tox] +envlist = + black + flake8 + isort + mypy + py{35,36,37,38,py3} +minversion = 1.9 + +[testenv] +commands = + pytest --cov=html2text {posargs} +deps = + pytest + pytest-cov + +[testenv:black] +basepython = python3 +commands = + black --target-version py35 --check --diff . +deps = + black +skip_install = true + +[testenv:flake8] +basepython = python3 +commands = + flake8 +deps = + flake8 +skip_install = true + +[testenv:isort] +basepython = python3 +commands = + isort --check-only --diff . +deps = + isort >= 5.0.1 +skip_install = true + +[testenv:mypy] +commands = mypy --strict html2text +deps = mypy +skip_install = true